From 54ee46d38def617477c1daaf13e9ecce955c8f5e Mon Sep 17 00:00:00 2001 From: rcorreia Date: Fri, 28 Nov 2008 18:51:48 +0000 Subject: [PATCH] Merge b_hd_kdmu from HEAD (20081128_1836) --- lustre/ChangeLog | 138 +- lustre/Makefile.in | 2 +- lustre/autoMakefile.am | 4 +- lustre/autoconf/lustre-core.m4 | 197 +- lustre/autoconf/lustre-version.ac | 2 +- lustre/cmm/cmm_device.c | 306 +- lustre/cmm/cmm_object.c | 28 +- lustre/cmm/cmm_split.c | 9 +- lustre/cmm/mdc_device.c | 21 +- lustre/cmm/mdc_object.c | 14 +- lustre/contrib/packet-lnet.c | 783 + lustre/contrib/packet-lustre.c | 10146 ++++++++++ lustre/doc/lfs.1 | 49 +- lustre/doc/lock-ordering | 309 + lustre/fid/fid_handler.c | 133 +- lustre/fid/fid_internal.h | 18 +- lustre/fid/fid_lib.c | 52 +- lustre/fid/fid_request.c | 31 +- lustre/fid/fid_store.c | 80 +- lustre/fid/lproc_fid.c | 12 +- lustre/fld/fld_cache.c | 540 +- lustre/fld/fld_handler.c | 253 +- lustre/fld/fld_index.c | 203 +- lustre/fld/fld_internal.h | 122 +- lustre/fld/fld_request.c | 161 +- lustre/include/Makefile.am | 4 +- lustre/include/cl_object.h | 3033 +++ lustre/include/class_hash.h | 95 +- lustre/include/dt_object.h | 111 +- lustre/include/interval_tree.h | 11 +- lustre/include/lclient.h | 375 + lustre/include/liblustre.h | 9 +- lustre/include/linux/lustre_acl.h | 6 +- lustre/include/linux/lustre_compat25.h | 78 +- lustre/include/linux/lustre_fsfilt.h | 31 +- lustre/include/linux/lustre_user.h | 3 + lustre/include/linux/lvfs.h | 2 +- lustre/include/linux/obd_support.h | 3 + lustre/include/lprocfs_status.h | 78 +- lustre/include/lu_object.h | 149 +- lustre/include/lustre/liblustreapi.h | 5 +- lustre/include/lustre/lustre_idl.h | 347 +- lustre/include/lustre/lustre_user.h | 33 +- lustre/include/lustre_cache.h | 87 - lustre/include/lustre_capa.h | 113 +- lustre/include/lustre_disk.h | 72 +- lustre/include/lustre_dlm.h | 88 +- lustre/include/lustre_export.h | 16 +- lustre/include/lustre_fid.h | 90 +- lustre/include/lustre_fld.h | 162 +- lustre/include/lustre_import.h | 5 +- lustre/include/lustre_lib.h | 3 +- lustre/include/lustre_lite.h | 8 +- lustre/include/lustre_log.h | 16 +- lustre/include/lustre_mds.h | 3 +- lustre/include/lustre_net.h | 256 +- lustre/include/lustre_param.h | 1 + lustre/include/lustre_quota.h | 595 +- lustre/include/lustre_req_layout.h | 2 + lustre/include/lustre_sec.h | 69 +- lustre/include/md_object.h | 172 +- lustre/include/obd.h | 205 +- lustre/include/obd_class.h | 392 +- lustre/include/obd_lov.h | 1 - lustre/include/obd_ost.h | 22 +- lustre/include/obd_support.h | 11 +- .../kernel_patches/patches/2.6-rhel4-kgdb-ga.patch | 44 +- .../kernel_patches/patches/2.6-rhel5-kgdb-ga.patch | 19200 ++++++++++++++++++ lustre/kernel_patches/patches/8kstack-2.6.12.patch | 3 + .../patches/dev_read_only-2.6.22-vanilla.patch | 8 + .../patches/export-2.6.18-vanilla.patch | 17 +- .../patches/export-show_task-2.6.18-vanilla.patch | 5 + lustre/kernel_patches/patches/i_filter_data.patch | 4 + .../patches/kgdb-2.6.18-vanilla.patch | 19778 +++++++++++++++++++ .../patches/lockdep_chains-2.6.18-vanilla.patch | 269 + .../kernel_patches/patches/md-rebuild-policy.patch | 7 +- .../kernel_patches/patches/md-soft-lockups.patch | 13 + .../quota-fix-oops-in-invalidate_dquots.patch | 127 + .../patches/quota-large-limits-rhel5.patch | 616 + .../patches/quota-large-limits-sles10.patch | 616 + .../patches/raid5-merge-ios-rhel5.patch | 9 +- .../patches/raid5-zerocopy-rhel5.patch | 2 +- .../patches/sd_iostats-2.6.22-vanilla.patch | 4 + lustre/kernel_patches/series/2.6-rhel4.series | 1 + 
lustre/kernel_patches/series/2.6-rhel5.series | 2 + lustre/kernel_patches/series/2.6-sles10.series | 2 + lustre/kernel_patches/series/2.6.18-vanilla.series | 1 + lustre/kernel_patches/series/2.6.22-vanilla.series | 1 + lustre/kernel_patches/targets/2.6-rhel5.target.in | 2 +- lustre/kernel_patches/targets/2.6-sles10.target.in | 2 +- lustre/kernel_patches/which_patch | 4 +- lustre/lclient/Makefile.am | 1 + lustre/lclient/glimpse.c | 253 + lustre/lclient/lcommon_cl.c | 1188 ++ lustre/ldlm/interval_tree.c | 4 + lustre/ldlm/ldlm_extent.c | 29 +- lustre/ldlm/ldlm_flock.c | 7 +- lustre/ldlm/ldlm_inodebits.c | 17 +- lustre/ldlm/ldlm_internal.h | 12 +- lustre/ldlm/ldlm_lib.c | 234 +- lustre/ldlm/ldlm_lock.c | 183 +- lustre/ldlm/ldlm_lockd.c | 104 +- lustre/ldlm/ldlm_pool.c | 284 +- lustre/ldlm/ldlm_request.c | 248 +- lustre/ldlm/ldlm_resource.c | 7 + lustre/liblustre/Makefile.am | 7 +- lustre/liblustre/dir.c | 8 +- lustre/liblustre/file.c | 88 +- lustre/liblustre/llite_cl.c | 835 + lustre/liblustre/llite_lib.c | 5 +- lustre/liblustre/llite_lib.h | 110 +- lustre/liblustre/lutil.c | 8 +- lustre/liblustre/namei.c | 4 +- lustre/liblustre/rw.c | 613 +- lustre/liblustre/super.c | 139 +- lustre/liblustre/tests/sanity.c | 30 +- lustre/llite/Makefile.in | 2 + lustre/llite/autoMakefile.am | 1 + lustre/llite/dcache.c | 10 +- lustre/llite/dir.c | 166 +- lustre/llite/file.c | 1473 +- lustre/llite/llite_capa.c | 89 +- lustre/llite/llite_close.c | 52 +- lustre/llite/llite_fid.c | 12 + lustre/llite/llite_internal.h | 424 +- lustre/llite/llite_lib.c | 414 +- lustre/llite/llite_mmap.c | 475 +- lustre/llite/lloop.c | 30 +- lustre/llite/lproc_llite.c | 367 +- lustre/llite/namei.c | 44 +- lustre/llite/rw.c | 2050 +- lustre/llite/rw26.c | 284 +- lustre/llite/statahead.c | 41 +- lustre/llite/super25.c | 29 +- lustre/llite/symlink.c | 32 +- lustre/llite/vvp_dev.c | 559 + lustre/llite/vvp_internal.h | 68 + lustre/llite/vvp_io.c | 996 + lustre/llite/vvp_lock.c | 89 + lustre/llite/vvp_object.c | 153 + lustre/llite/vvp_page.c | 556 + lustre/llite/xattr.c | 4 +- lustre/lmv/lmv_fld.c | 4 +- lustre/lmv/lmv_intent.c | 2 +- lustre/lmv/lmv_internal.h | 30 +- lustre/lmv/lmv_obd.c | 247 +- lustre/lmv/lproc_lmv.c | 6 +- lustre/lov/Makefile.in | 2 +- lustre/lov/autoMakefile.am | 16 +- lustre/lov/lov_cl_internal.h | 798 + lustre/lov/lov_dev.c | 540 + lustre/lov/lov_ea.c | 4 +- lustre/lov/lov_internal.h | 38 +- lustre/lov/lov_io.c | 894 + lustre/lov/lov_lock.c | 935 + lustre/lov/lov_log.c | 6 + lustre/lov/lov_merge.c | 53 +- lustre/lov/lov_obd.c | 734 +- lustre/lov/lov_object.c | 700 + lustre/lov/lov_page.c | 227 + lustre/lov/lov_pool.c | 12 +- lustre/lov/lov_qos.c | 38 +- lustre/lov/lov_request.c | 127 +- lustre/lov/lovsub_dev.c | 212 + lustre/{include/obd_echo.h => lov/lovsub_io.c} | 40 +- lustre/lov/lovsub_lock.c | 430 + lustre/lov/lovsub_object.c | 160 + lustre/lov/lovsub_page.c | 83 + lustre/lvfs/autoMakefile.am | 5 +- lustre/lvfs/fsfilt_ext3.c | 186 +- lustre/lvfs/fsfilt_reiserfs.c | 2 - lustre/lvfs/lustre_quota_fmt.c | 483 +- lustre/lvfs/lustre_quota_fmt.h | 109 +- lustre/lvfs/lvfs_linux.c | 15 +- lustre/lvfs/quotafmt_test.c | 16 +- lustre/mdc/mdc_internal.h | 19 - lustre/mdc/mdc_locks.c | 14 +- lustre/mdc/mdc_reint.c | 12 +- lustre/mdc/mdc_request.c | 79 +- lustre/mdd/Makefile.in | 2 +- lustre/mdd/mdd_device.c | 120 +- lustre/mdd/mdd_dir.c | 440 +- lustre/mdd/mdd_internal.h | 94 +- lustre/mdd/mdd_lov.c | 191 +- lustre/mdd/mdd_lproc.c | 21 + lustre/mdd/mdd_object.c | 237 +- lustre/mdd/mdd_orphans.c | 422 +- 
lustre/mdd/mdd_permission.c | 26 +- lustre/mdd/mdd_quota.c | 274 + lustre/mdd/mdd_trans.c | 42 +- lustre/mds/handler.c | 35 +- lustre/mds/lproc_mds.c | 175 +- lustre/mds/mds_fs.c | 9 +- lustre/mds/mds_internal.h | 4 +- lustre/mds/mds_lov.c | 325 +- lustre/mdt/mdt_capa.c | 11 +- lustre/mdt/mdt_handler.c | 861 +- lustre/mdt/mdt_identity.c | 4 +- lustre/mdt/mdt_idmap.c | 148 +- lustre/mdt/mdt_internal.h | 59 +- lustre/mdt/mdt_lib.c | 118 +- lustre/mdt/mdt_lproc.c | 229 + lustre/mdt/mdt_open.c | 25 +- lustre/mdt/mdt_recovery.c | 96 +- lustre/mdt/mdt_reint.c | 16 +- lustre/mdt/mdt_xattr.c | 29 +- lustre/mgc/mgc_internal.h | 2 + lustre/mgc/mgc_request.c | 348 +- lustre/mgs/lproc_mgs.c | 86 +- lustre/mgs/mgs_handler.c | 69 +- lustre/mgs/mgs_internal.h | 8 +- lustre/mgs/mgs_llog.c | 1417 +- lustre/obdclass/Makefile.in | 3 +- lustre/obdclass/autoMakefile.am | 5 +- lustre/obdclass/capa.c | 147 +- lustre/obdclass/cl_internal.h | 97 + lustre/obdclass/cl_io.c | 1625 ++ lustre/obdclass/cl_lock.c | 2076 ++ lustre/obdclass/cl_object.c | 1077 + lustre/obdclass/cl_page.c | 1519 ++ lustre/obdclass/class_hash.c | 167 +- lustre/obdclass/class_obd.c | 5 +- lustre/obdclass/dt_object.c | 228 +- lustre/obdclass/genops.c | 240 +- lustre/obdclass/linux/linux-module.c | 2 +- lustre/obdclass/linux/linux-obdo.c | 8 +- lustre/obdclass/llog_cat.c | 8 +- lustre/obdclass/llog_lvfs.c | 22 +- lustre/obdclass/llog_obd.c | 8 +- lustre/obdclass/llog_swab.c | 20 +- lustre/obdclass/lprocfs_status.c | 20 +- lustre/obdclass/lu_object.c | 154 +- lustre/obdclass/lu_time.c | 2 +- lustre/obdclass/md_local_object.c | 464 + lustre/obdclass/obd_config.c | 129 +- lustre/obdclass/obd_mount.c | 92 +- lustre/obdecho/autoMakefile.am | 2 +- lustre/obdecho/echo.c | 7 +- lustre/obdecho/echo_client.c | 1993 +- lustre/obdecho/echo_internal.h | 30 + lustre/obdfilter/filter.c | 290 +- lustre/obdfilter/filter_capa.c | 69 +- lustre/obdfilter/filter_internal.h | 10 +- lustre/obdfilter/filter_io.c | 26 +- lustre/obdfilter/filter_io_26.c | 46 +- lustre/obdfilter/filter_log.c | 25 +- lustre/obdfilter/lproc_obdfilter.c | 49 +- lustre/osc/Makefile.in | 2 +- lustre/osc/autoMakefile.am | 10 +- lustre/osc/cache.c | 445 - lustre/osc/lproc_osc.c | 84 +- lustre/osc/osc_cl_internal.h | 420 + lustre/osc/osc_create.c | 20 +- lustre/osc/osc_dev.c | 253 + lustre/osc/osc_internal.h | 108 +- lustre/osc/osc_io.c | 653 + lustre/osc/osc_lock.c | 1623 ++ lustre/osc/osc_object.c | 243 + lustre/osc/osc_page.c | 532 + lustre/osc/osc_request.c | 1232 +- lustre/osd/osd_handler.c | 2257 ++- lustre/osd/osd_internal.h | 96 +- lustre/osd/osd_oi.c | 120 +- lustre/osd/osd_oi.h | 10 +- lustre/ost/ost_handler.c | 708 +- lustre/ptlrpc/client.c | 400 +- lustre/ptlrpc/connection.c | 2 +- lustre/ptlrpc/events.c | 23 +- lustre/ptlrpc/gss/gss_cli_upcall.c | 39 +- lustre/ptlrpc/gss/gss_internal.h | 8 +- lustre/ptlrpc/gss/gss_keyring.c | 8 +- lustre/ptlrpc/gss/gss_krb5.h | 2 - lustre/ptlrpc/gss/gss_krb5_mech.c | 4 +- lustre/ptlrpc/gss/gss_pipefs.c | 2 +- lustre/ptlrpc/gss/gss_svc_upcall.c | 18 +- lustre/ptlrpc/gss/lproc_gss.c | 3 +- lustre/ptlrpc/gss/sec_gss.c | 2 +- lustre/ptlrpc/import.c | 174 +- lustre/ptlrpc/layout.c | 62 +- lustre/ptlrpc/lproc_ptlrpc.c | 56 +- lustre/ptlrpc/niobuf.c | 91 +- lustre/ptlrpc/pack_generic.c | 150 +- lustre/ptlrpc/pinger.c | 31 +- lustre/ptlrpc/ptlrpc_internal.h | 15 +- lustre/ptlrpc/ptlrpc_module.c | 4 +- lustre/ptlrpc/ptlrpcd.c | 237 +- lustre/ptlrpc/recov_thread.c | 268 +- lustre/ptlrpc/recover.c | 20 +- lustre/ptlrpc/sec.c | 206 +- lustre/ptlrpc/sec_config.c 
| 977 +- lustre/ptlrpc/sec_gc.c | 28 +- lustre/ptlrpc/sec_null.c | 2 +- lustre/ptlrpc/sec_plain.c | 4 +- lustre/ptlrpc/service.c | 340 +- lustre/ptlrpc/wiretest.c | 207 +- lustre/quota/Makefile.in | 2 +- lustre/quota/autoMakefile.am | 4 +- lustre/quota/lproc_quota.c | 667 + lustre/quota/quota_adjust_qunit.c | 419 + lustre/quota/quota_check.c | 67 +- lustre/quota/quota_context.c | 1082 +- lustre/quota/quota_ctl.c | 146 +- lustre/quota/quota_interface.c | 765 +- lustre/quota/quota_internal.h | 125 +- lustre/quota/quota_master.c | 766 +- lustre/tests/Makefile.am | 1 + lustre/tests/acceptance-small.sh | 37 +- lustre/tests/cfg/insanity-lmv.sh | 8 + lustre/tests/cfg/lmv.sh | 9 +- lustre/tests/cfg/local.sh | 9 +- lustre/tests/conf-sanity.sh | 70 +- lustre/tests/createmany.c | 149 +- lustre/tests/createtest.c | 4 +- lustre/tests/disk1_8.tgz | Bin 0 -> 10506 bytes lustre/tests/fsx.c | 18 +- lustre/tests/insanity.sh | 13 +- lustre/tests/it_test.c | 24 +- lustre/tests/kbuild | 312 + lustre/tests/lockorder.sh | 2 +- lustre/tests/multifstat.c | 2 +- lustre/tests/performance-sanity.sh | 1 + lustre/tests/racer/racer.sh | 8 +- lustre/tests/recovery-small.sh | 31 +- lustre/tests/replay-dual.sh | 145 +- lustre/tests/replay-ost-single.sh | 63 +- lustre/tests/replay-single.sh | 20 +- lustre/tests/runracer | 113 + lustre/tests/sanity-gss.sh | 100 +- lustre/tests/sanity-nano.sh | 29 + lustre/tests/sanity-quota.sh | 1507 +- lustre/tests/sanity-sec.sh | 197 +- lustre/tests/sanity.sh | 301 +- lustre/tests/sanityN.sh | 174 +- lustre/tests/sendfile.c | 211 +- lustre/tests/test-framework.sh | 158 +- lustre/utils/gss/gss_util.c | 26 +- lustre/utils/gss/gssd.h | 1 + lustre/utils/gss/lgss_utils.c | 2 +- lustre/utils/gss/lgss_utils.h | 8 +- lustre/utils/gss/lsupport.c | 3 +- lustre/utils/gss/lsupport.h | 5 +- lustre/utils/gss/svcgssd.c | 15 +- lustre/utils/gss/svcgssd.h | 3 +- lustre/utils/gss/svcgssd_proc.c | 5 +- lustre/utils/l_getidentity.c | 2 + lustre/utils/lfs.c | 599 +- lustre/utils/liblustreapi.c | 33 +- lustre/utils/llog_reader.c | 5 + lustre/utils/lmc | 4 +- lustre/utils/mkfs_lustre.c | 321 +- lustre/utils/mount_lustre.c | 1 + lustre/utils/obd.c | 3 - lustre/utils/obdiolib.c | 2 + lustre/utils/req-layout.c | 3 +- lustre/utils/wirecheck.c | 47 +- lustre/utils/wiretest.c | 220 +- 356 files changed, 101934 insertions(+), 17168 deletions(-) create mode 100644 lustre/contrib/packet-lnet.c create mode 100644 lustre/contrib/packet-lustre.c create mode 100644 lustre/doc/lock-ordering create mode 100644 lustre/include/cl_object.h create mode 100644 lustre/include/lclient.h delete mode 100644 lustre/include/lustre_cache.h create mode 100644 lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch create mode 100644 lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch create mode 100644 lustre/kernel_patches/patches/md-soft-lockups.patch create mode 100644 lustre/kernel_patches/patches/quota-fix-oops-in-invalidate_dquots.patch create mode 100644 lustre/kernel_patches/patches/quota-large-limits-rhel5.patch create mode 100644 lustre/kernel_patches/patches/quota-large-limits-sles10.patch create mode 100644 lustre/lclient/Makefile.am create mode 100644 lustre/lclient/glimpse.c create mode 100644 lustre/lclient/lcommon_cl.c create mode 100644 lustre/liblustre/llite_cl.c create mode 100644 lustre/llite/vvp_dev.c create mode 100644 lustre/llite/vvp_internal.h create mode 100644 lustre/llite/vvp_io.c create mode 100644 lustre/llite/vvp_lock.c 
create mode 100644 lustre/llite/vvp_object.c
create mode 100644 lustre/llite/vvp_page.c
create mode 100644 lustre/lov/lov_cl_internal.h
create mode 100644 lustre/lov/lov_dev.c
create mode 100644 lustre/lov/lov_io.c
create mode 100644 lustre/lov/lov_lock.c
create mode 100644 lustre/lov/lov_object.c
create mode 100644 lustre/lov/lov_page.c
create mode 100644 lustre/lov/lovsub_dev.c
rename lustre/{include/obd_echo.h => lov/lovsub_io.c} (58%)
create mode 100644 lustre/lov/lovsub_lock.c
create mode 100644 lustre/lov/lovsub_object.c
create mode 100644 lustre/lov/lovsub_page.c
create mode 100644 lustre/mdd/mdd_quota.c
create mode 100644 lustre/obdclass/cl_internal.h
create mode 100644 lustre/obdclass/cl_io.c
create mode 100644 lustre/obdclass/cl_lock.c
create mode 100644 lustre/obdclass/cl_object.c
create mode 100644 lustre/obdclass/cl_page.c
create mode 100644 lustre/obdclass/md_local_object.c
create mode 100644 lustre/obdecho/echo_internal.h
delete mode 100644 lustre/osc/cache.c
create mode 100644 lustre/osc/osc_cl_internal.h
create mode 100644 lustre/osc/osc_dev.c
create mode 100644 lustre/osc/osc_io.c
create mode 100644 lustre/osc/osc_lock.c
create mode 100644 lustre/osc/osc_object.c
create mode 100644 lustre/osc/osc_page.c
create mode 100644 lustre/quota/lproc_quota.c
create mode 100644 lustre/quota/quota_adjust_qunit.c
create mode 100644 lustre/tests/disk1_8.tgz
create mode 100755 lustre/tests/kbuild
create mode 100644 lustre/tests/runracer
create mode 100755 lustre/tests/sanity-nano.sh

diff --git a/lustre/ChangeLog b/lustre/ChangeLog
index 8cc4fcb..3df013c 100644
--- a/lustre/ChangeLog
+++ b/lustre/ChangeLog
@@ -1,8 +1,8 @@
 tbd Sun Microsystems, Inc.
  * version 2.0.0
  * Support for kernels:
-  2.6.16.60-0.27 (SLES 10),
-  2.6.18-92.1.10.el5 (RHEL 5),
+  2.6.16.60-0.31 (SLES 10),
+  2.6.18-92.1.17.el5 (RHEL 5),
   2.6.22.14 vanilla (kernel.org).
  * Client support for unpatched kernels:
   (see http://wiki.lustre.org/index.php?title=Patchless_Client)
@@ -14,6 +14,59 @@ tbd Sun Microsystems, Inc.
  * File join has been disabled in this release, refer to Bugzilla 16929.

 Severity : enhancement
+Bugzilla : 15957
+Description: compact FLD format with extents
+Details : Store ranges of sequences rather than every sequence in the FLD.
+           The sequence controller updates the FLD rather than the clients.
+           In the CMD case, mdt0 holds the FLD; all other metadata servers
+           act as non-persistent proxies for FLD queries and cache entries
+           in the FLD cache.
+
+Severity : normal
+Frequency : rare
+Bugzilla : 16081
+Description: don't skip an OST target if it is assigned to the file
+Details : Drop slow OSCs if we can, but never the requested start index.
+           That is: if an OSC is slow and it is not the requested start
+           OST, it can be skipped; otherwise skip it only if it is
+           inactive, recovering, or out of space.
+
+Severity : normal
+Bugzilla : 16080
+Description: more cleanup in mds_lov
+Details : Do not send the LOV EA under replay; we cannot know its size at
+           that time. Do not allow clients to connect to the MDS before any
+           OST has connected, to avoid problems with the LOV EA size and
+           returning EIO to the client.
+
+Severity : enhancement
+Bugzilla : 11826
+Description: Interoperability at server side (Disk interoperability)
+
+Severity : enhancement
+Bugzilla : 17201
+Description: Update to RHEL5 kernel-2.6.18-92.1.17.el5.
+
+Severity : enhancement
+Bugzilla : 17458
+Description: Update to SLES10 SP2 kernel-2.6.16.60-0.31.
+
+Severity : enhancement
+Bugzilla : 14166
+Description: New client IO stack (CLIO).
+
+Severity : enhancement
+Bugzilla : 15393
+Description: Commit on sharing. Eliminate inter-client dependencies between
+           uncommitted transactions by forcing transaction commits, so that
+           clients can recover independently.
+
+Severity : normal
+Frequency : creating a symlink with a very long name
+Bugzilla : 16578
+Description: ldlm_cancel_pack() ASSERTION(max >= dlm->lock_count + count)
+Details : If there is no extra space in the request for early cancels,
+           ldlm_req_handles_avail() returns 0 instead of a negative value.
+
+Severity : enhancement
 Bugzilla : 1819
 Description: Add /proc entry for import status
 Details : The mdc, osc, and mgc import directories now have
@@ -45,6 +98,15 @@ Description: Hitting mdc_commit_close() ASSERTION
 Details : Properly handle request reference release in
            ll_release_openhandle().

+Severity : major
+Bugzilla : 14840
+Description: quota recovery deadlock during mds failover
+Details : This patch includes att18982, att18236, att18237 in bz14840.
+           It solves the following problems:
+           1. fix OSTs hanging when the MDS fails over with quota on
+           2. prevent a watchdog storm when OST threads wait for the
+              recovery of the MDS
+
 Severity : normal
 Bugzilla : 15975
 Frequency : only patchless client
@@ -114,6 +176,7 @@ Details : When connection is reused this not moved from
 CONN_UNUSED_HASH into CONN_USED_HASH and this prodice warning when
 put connection again in unused hash.

+
 Severity : enhancement
 Bugzilla : 15899
 Description: File striping can now be set to use an arbitrary pool of OSTs.
@@ -129,6 +192,23 @@ Details : Apply the MGS_CONNECT_SUPPORTED mask at reconnect time so the
 connect flags are properly negotiated.

 Severity : normal
+Frequency : often
+Bugzilla : 16125
+Description: quotas are not honored with O_DIRECT
+Details : All writes with the O_DIRECT flag used grants, which led to this
+           problem. OBD_BRW_SYNC is now used to guard against it.
+
+Severity : normal
+Bugzilla : 15058
+Description: add quota statistics
+Details : 1. sort out quota /proc entries and /proc code
+           2. add quota statistics
+
+Severity : enhancement
+Bugzilla : 13058
+Description: enable quota support for HEAD.
+
+Severity : normal
 Bugzilla : 16006
 Description: Properly propagate oinfo flags from lov to osc for statfs
 Details : restore missing copy oi_flags to lov requests.
@@ -1549,9 +1629,9 @@ Details : Kill unused ldlm_handle2lock_ns() function.
 Severity : normal
 Bugzilla : 16450
 Description: Add lu_ref support to ldlm_lock
-Details : lu_ref support for ldlm_lock and ldlm_resource. See lu_ref patch.
-           lu_ref fields ->l_reference and ->lr_reference are added to ldlm_lock
-           and ldlm_resource. LDLM interface has to be changed, because code that
+Details : lu_ref support for ldlm_lock and ldlm_resource. See lu_ref patch.
+           lu_ref fields ->l_reference and ->lr_reference are added to ldlm_lock
+           and ldlm_resource. LDLM interface has to be changed, because code that
 releases a reference on a lock, has to "know" what reference this is.

 In the most frequent case
@@ -1559,12 +1639,12 @@ Details : lu_ref support for ldlm_lock and ldlm_resource. See lu_ref patch.
 ...
 LDLM_LOCK_PUT(lock);

-           no changes are required. When any other reference (received _not_ from
-           ldlm_handle2lock()) is released, LDLM_LOCK_RELEASE() has to be called
+           no changes are required. When any other reference (received _not_ from
+           ldlm_handle2lock()) is released, LDLM_LOCK_RELEASE() has to be called
 instead of LDLM_LOCK_PUT().

 Arguably, changes are pervasive, and interface requires some discipline
-           for proper use. On the other hand, it was very instrumental in finding
+           for proper use. On the other hand, it was very instrumental in finding
 a few leaked lock references.

 Severity : normal
@@ -1577,7 +1657,7 @@ Details : Introduce ldlm_lock_addref_try() function (used by CLIO) that
 Severity : normal
 Bugzilla : 16450
 Description: Add ldlm_weigh_callback().
-Details : Add new ->l_weigh_ast() call-back to ldlm_lock. It is called
+Details : Add new ->l_weigh_ast() call-back to ldlm_lock. It is called
 by ldlm_cancel_shrink_policy() to estimate lock "value", instead of
 hard-coded `number of pages' logic.
@@ -1617,8 +1697,8 @@ Details : Introduce new lu_context functions that are needed on the client
 Severity : normal
 Bugzilla : 16450
 Description: Add start and stop methods to lu_device_type_operations.
-Details : Introduce two new methods in lu_device_type_operations, that are
-           invoked when first instance of a given type is created and last one
+Details : Introduce two new methods in lu_device_type_operations, that are
+           invoked when first instance of a given type is created and last one
 is destroyed respectively. This is need by CLIO.

 Severity : normal
@@ -1663,7 +1743,7 @@ Severity : normal
 Bugzilla : 16450
 Description: Introduce struct md_site and move meta-data specific parts of
 struct lu_site here.
-Details : Move md-specific fields out of struct lu_site into special struct
+Details : Move md-specific fields out of struct lu_site into special struct
 md_site, so that lu_site can be used on a client.

 Severity : minor
@@ -1747,10 +1827,40 @@ Severity : normal
 Bugzilla : 17197
 Description: (rw.c:1323:ll_read_ahead_pages()) ASSERTION(page_idx > ria->ria_stoff) failed
 Details : Once the unmatched stride IO mode is detected, shrink the stride-ahead
-           window to 0. If it does hit cache miss, and read-pattern is still
-           stride-io mode, does not reset the stride window, but also does not
+           window to 0. If it does hit cache miss, and read-pattern is still
+           stride-io mode, does not reset the stride window, but also does not
 increase the stride window length in this case.

+Severity : normal
+Bugzilla : 16438
+Frequency : only for big-endian servers
+Description: Check if the system is big-endian while mounting a fs with the extents feature
+Details : Mounting a filesystem with the extents feature will fail on
+           big-endian systems, since ext3-based ldiskfs is not supported on
+           big-endian systems. This can be overridden with the
+           "bigendian_extents" mount option.
+
+Severity : enhancement
+Bugzilla : 12749
+Description: The root squash functionality
+Details : A security feature that prevents users from mounting Lustre on
+           their desktop, running as root, and deleting all of the files in
+           the filesystem. This is accomplished by remapping the user id
+           (UID) and group id (GID) of the root user to a UID and GID
+           specified by the system administrator via the Lustre
+           configuration management server (MGS). The feature also allows
+           specifying sets of clients for which the remapping does not
+           apply (see the illustrative sketch after this ChangeLog diff).
+
+Severity : normal
+Bugzilla : 16860
+Description: Excessive recovery window
+Details : With AT enabled, the recovery window can be excessively long
+           (6000+ seconds). To address this problem, we no longer use
+           OBD_RECOVERY_FACTOR when extending the recovery window (the
+           connect timeout no longer depends on the service time; it is now
+           set to INITIAL_CONNECT_TIMEOUT) and clients report the old
+           service time via pb_service_time.
+
 --------------------------------------------------------------------------------
 2007-08-10 Cluster File Systems, Inc.
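As a rough illustration of the root squash entry above (Bugzilla 12749), the remapping can be sketched in a few lines of C. This is a minimal, hypothetical sketch, not code from this patch: every name in it (root_squash_info, rsi_uid, rsi_nosquash, squash_root) is invented for illustration, and the real implementation is driven by MGS configuration rather than a callback.

#include <stdbool.h>
#include <sys/types.h>

/* Hypothetical per-filesystem squash settings (illustrative only). */
struct root_squash_info {
        uid_t rsi_uid;                                /* UID root maps to */
        gid_t rsi_gid;                                /* GID root maps to */
        bool (*rsi_nosquash)(const char *client_nid); /* exemption check */
};

/* Remap root credentials unless the client NID is on the nosquash list. */
static void squash_root(const struct root_squash_info *rsi,
                        const char *client_nid, uid_t *uid, gid_t *gid)
{
        if (*uid != 0 && *gid != 0)
                return;         /* only root credentials are squashed */
        if (rsi->rsi_nosquash != NULL && rsi->rsi_nosquash(client_nid))
                return;         /* client exempt from squashing */
        if (*uid == 0)
                *uid = rsi->rsi_uid;
        if (*gid == 0)
                *gid = rsi->rsi_gid;
}

The design point is that only credentials presenting UID 0 or GID 0 are touched, and clients on the administrator's nosquash list bypass the remapping entirely.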
diff --git a/lustre/Makefile.in b/lustre/Makefile.in index 82c5433..f1c44fa 100644 --- a/lustre/Makefile.in +++ b/lustre/Makefile.in @@ -6,9 +6,9 @@ subdir-m += ptlrpc subdir-m += osc subdir-m += obdecho subdir-m += mgc +subdir-m += quota @SERVER_TRUE@subdir-m += mds obdfilter ost mgs mdt cmm mdd osd @CLIENT_TRUE@subdir-m += mdc lmv llite fld -@QUOTA_TRUE@subdir-m += quota @INCLUDE_RULES@ diff --git a/lustre/autoMakefile.am b/lustre/autoMakefile.am index 55aa1b6..51658ae 100644 --- a/lustre/autoMakefile.am +++ b/lustre/autoMakefile.am @@ -42,7 +42,7 @@ ALWAYS_SUBDIRS := include lvfs obdclass ldlm ptlrpc osc lov obdecho \ SERVER_SUBDIRS := obdfilter ost mds mgs mdt cmm mdd osd -CLIENT_SUBDIRS := mdc lmv llite +CLIENT_SUBDIRS := mdc lmv llite lclient QUOTA_SUBDIRS := quota @@ -58,9 +58,7 @@ if CLIENT SUBDIRS += $(CLIENT_SUBDIRS) endif -if QUOTA SUBDIRS += $(QUOTA_SUBDIRS) -endif # this needs to be after the client subdirs if LIBLUSTRE diff --git a/lustre/autoconf/lustre-core.m4 b/lustre/autoconf/lustre-core.m4 index 4899310..66284e8 100644 --- a/lustre/autoconf/lustre-core.m4 +++ b/lustre/autoconf/lustre-core.m4 @@ -258,9 +258,9 @@ LB_LINUX_TRY_COMPILE([ # LC_FUNC_REGISTER_CACHE # # if register_cache() is defined by kernel -# +# # There are two ways to shrink one customized cache in linux kernels. For the -# kernels are prior than 2.6.5(?), register_cache() is used, and for latest +# kernels are prior than 2.6.5(?), register_cache() is used, and for latest # kernels, set_shrinker() is used instead. # AC_DEFUN([LC_FUNC_REGISTER_CACHE], @@ -342,7 +342,7 @@ LB_LINUX_TRY_COMPILE([ AC_MSG_RESULT([yes]) AC_DEFINE(HAVE_DEV_SET_RDONLY, 1, [kernel has new dev_set_rdonly]) ],[ - AC_MSG_RESULT([no, Linux kernel source needs to be patches by lustre + AC_MSG_RESULT([no, Linux kernel source needs to be patches by lustre kernel patches from Lustre version 1.4.3 or above.]) ]) ]) @@ -580,7 +580,7 @@ AC_DEFUN([LC_BIT_SPINLOCK_H], # # LC_POSIX_ACL_XATTR # -# If we have xattr_acl.h +# If we have xattr_acl.h # AC_DEFUN([LC_XATTR_ACL], [LB_CHECK_FILE([$LINUX/include/linux/xattr_acl.h],[ @@ -703,6 +703,18 @@ LB_LINUX_CONFIG_IM([CRYPTO_SHA1],[],[ ]) ]) +# +# LC_CONFIG_RMTCLIENT +# +dnl FIXME +dnl the AES symbol usually tied with arch, e.g. 
CRYPTO_AES_586 +dnl FIXME +AC_DEFUN([LC_CONFIG_RMTCLIENT], +[LB_LINUX_CONFIG_IM([CRYPTO_AES],[],[ + AC_MSG_ERROR([Lustre remote client require that CONFIG_CRYPTO_AES is enabled in your kernel.]) +]) +]) + AC_DEFUN([LC_SUNRPC_CACHE], [AC_MSG_CHECKING([if sunrpc struct cache_head uses kref]) LB_LINUX_TRY_COMPILE([ @@ -729,7 +741,7 @@ AC_DEFUN([LC_CONFIG_SUNRPC], # AC_DEFUN([LC_CONFIG_GSS_KEYRING], [AC_MSG_CHECKING([whether to enable gss keyring backend]) - AC_ARG_ENABLE([gss_keyring], + AC_ARG_ENABLE([gss_keyring], [AC_HELP_STRING([--disable-gss-keyring], [disable gss keyring backend])], [],[enable_gss_keyring='yes']) @@ -747,8 +759,6 @@ AC_DEFUN([LC_CONFIG_GSS_KEYRING], fi ]) -m4_pattern_allow(AC_KERBEROS_V5) - # # LC_CONFIG_GSS (default disabled) # @@ -757,7 +767,7 @@ m4_pattern_allow(AC_KERBEROS_V5) # AC_DEFUN([LC_CONFIG_GSS], [AC_MSG_CHECKING([whether to enable gss/krb5 support]) - AC_ARG_ENABLE([gss], + AC_ARG_ENABLE([gss], [AC_HELP_STRING([--enable-gss], [enable gss/krb5 support])], [],[enable_gss='no']) AC_MSG_RESULT([$enable_gss]) @@ -784,11 +794,6 @@ AC_DEFUN([LC_CONFIG_GSS], [AC_MSG_WARN([kernel TWOFISH support is recommended by using GSS.])]) LB_LINUX_CONFIG_IM([CRYPTO_CAST6],[], [AC_MSG_WARN([kernel CAST6 support is recommended by using GSS.])]) - dnl FIXME - dnl the AES symbol usually tied with arch, e.g. CRYPTO_AES_586 - dnl FIXME - LB_LINUX_CONFIG_IM([CRYPTO_AES],[], - [AC_MSG_WARN([kernel AES support is recommended by using GSS.])]) AC_CHECK_LIB([gssapi], [gss_init_sec_context], [GSSAPI_LIBS="$GSSAPI_LDFLAGS -lgssapi"], @@ -949,7 +954,7 @@ LB_LINUX_TRY_COMPILE([ AC_MSG_RESULT(no) ]) ]) - + # # LC_STATFS_DENTRY_PARAM # starting from 2.6.18 linux kernel uses dentry instead of @@ -990,7 +995,7 @@ LB_LINUX_TRY_COMPILE([ ]) ]) -# +# # LC_INVALIDATEPAGE_RETURN_INT # more 2.6 api changes. return type for the invalidatepage # address_space_operation is 'void' in new kernels but 'int' in old @@ -1048,7 +1053,7 @@ LB_LINUX_TRY_COMPILE([ #include ],[ struct inode i; - i.i_blksize = 0; + i.i_blksize = 0; ],[ AC_MSG_RESULT(yes) AC_DEFINE(HAVE_INODE_BLKSIZE, 1, @@ -1086,37 +1091,37 @@ LB_LINUX_TRY_COMPILE([ EXTRA_KCFLAGS="$tmp_flags" ]) -# LC_GENERIC_FILE_WRITE -# 2.6.19 introduce do_sync_write instead of -# generic_file_write -AC_DEFUN([LC_GENERIC_FILE_WRITE], -[AC_MSG_CHECKING([use generic_file_write]) +# LC_FILE_WRITEV +# 2.6.19 replaced writev with aio_write +AC_DEFUN([LC_FILE_WRITEV], +[AC_MSG_CHECKING([writev in fops]) LB_LINUX_TRY_COMPILE([ #include ],[ - int result = generic_file_read(NULL, NULL, 0, 0); + struct file_operations *fops; + fops->writev = NULL; ],[ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_FILE_WRITE, 1, - [use generic_file_write]) + AC_DEFINE(HAVE_FILE_WRITEV, 1, + [use fops->writev]) ],[ AC_MSG_RESULT(no) ]) ]) # LC_GENERIC_FILE_READ -# 2.6.19 need to use do_sync_read instead of -# generic_file_read -AC_DEFUN([LC_GENERIC_FILE_READ], -[AC_MSG_CHECKING([use generic_file_read]) +# 2.6.19 replaced readv with aio_read +AC_DEFUN([LC_FILE_READV], +[AC_MSG_CHECKING([readv in fops]) LB_LINUX_TRY_COMPILE([ #include ],[ - int result = generic_file_read(NULL, NULL, 0, 0); + struct file_operations *fops; + fops->readv = NULL; ],[ AC_MSG_RESULT(yes) - AC_DEFINE(HAVE_GENERIC_FILE_READ, 1, - [use generic_file_read]) + AC_DEFINE(HAVE_FILE_READV, 1, + [use fops->readv]) ],[ AC_MSG_RESULT(no) ]) @@ -1140,7 +1145,7 @@ LB_LINUX_TRY_COMPILE([ ]) # LC_CANCEL_DIRTY_PAGE -# 2.6.20 introduse cancel_dirty_page instead of +# 2.6.20 introduse cancel_dirty_page instead of # clear_page_dirty. 
AC_DEFUN([LC_CANCEL_DIRTY_PAGE], [AC_MSG_CHECKING([kernel has cancel_dirty_page]) @@ -1348,7 +1353,7 @@ LB_LINUX_TRY_COMPILE([ int i = unregister_blkdev(0,NULL); ],[ AC_MSG_RESULT([yes]) - AC_DEFINE(HAVE_UNREGISTER_BLKDEV_RETURN_INT, 1, + AC_DEFINE(HAVE_UNREGISTER_BLKDEV_RETURN_INT, 1, [unregister_blkdev return int]) ],[ AC_MSG_RESULT([no]) @@ -1467,7 +1472,7 @@ AC_TRY_RUN([ #include #include #undef __KERNEL__ -// block include +// block include #define __LINUX_POSIX_ACL_H # ifdef CONFIG_FS_POSIX_ACL @@ -1504,7 +1509,7 @@ CFLAGS="$tmp_flags" ]) # -# check for crypto API +# check for crypto API # AC_DEFUN([LC_ASYNC_BLOCK_CIPHER], [AC_MSG_CHECKING([if kernel has block cipher support]) @@ -1551,9 +1556,9 @@ AC_DEFUN([LC_PROG_LINUX], LC_CONFIG_PINGER LC_CONFIG_CHECKSUM LC_CONFIG_LIBLUSTRE_RECOVERY - LC_CONFIG_QUOTA LC_CONFIG_HEALTH_CHECK_WRITE LC_CONFIG_LRU_RESIZE + LC_QUOTA_MODULE LC_TASK_PPTR # RHEL4 patches @@ -1591,6 +1596,7 @@ AC_DEFUN([LC_PROG_LINUX], LC_FUNC_SET_FS_PWD LC_CAPA_CRYPTO + LC_CONFIG_RMTCLIENT LC_CONFIG_GSS LC_FUNC_MS_FLOCK_LOCK LC_FUNC_HAVE_CAN_SLEEP_ARG @@ -1599,6 +1605,7 @@ AC_DEFUN([LC_PROG_LINUX], LC_COOKIE_FOLLOW_LINK LC_FUNC_RCU LC_PERCPU_COUNTER + LC_QUOTA64 # does the kernel have VFS intent patches? LC_VFS_INTENT_PATCHES @@ -1637,15 +1644,15 @@ AC_DEFUN([LC_PROG_LINUX], # 2.6.19 LC_INODE_BLKSIZE LC_VFS_READDIR_U64_INO - LC_GENERIC_FILE_READ - LC_GENERIC_FILE_WRITE + LC_FILE_WRITEV + LC_FILE_READV # 2.6.20 LC_CANCEL_DIRTY_PAGE # raid5-zerocopy patch LC_PAGE_CONSTANT - + # 2.6.22 LC_INVALIDATE_BDEV_2ARG LC_ASYNC_BLOCK_CIPHER @@ -1765,7 +1772,7 @@ LC_CONFIG_LIBLUSTRE_RECOVERY AC_DEFUN([LC_CONFIG_LRU_RESIZE], [AC_MSG_CHECKING([whether to enable lru self-adjusting]) -AC_ARG_ENABLE([lru_resize], +AC_ARG_ENABLE([lru_resize], AC_HELP_STRING([--enable-lru-resize], [enable lru resize support]), [],[enable_lru_resize='yes']) @@ -1778,52 +1785,37 @@ fi # # LC_CONFIG_QUOTA # -# whether to enable quota support +# whether to enable quota support global control # AC_DEFUN([LC_CONFIG_QUOTA], -[AC_ARG_ENABLE([quota], +[AC_ARG_ENABLE([quota], AC_HELP_STRING([--enable-quota], [enable quota support]), - [],[enable_quota='default']) -if test x$linux25 != xyes; then - enable_quota='no' -fi -LB_LINUX_CONFIG([QUOTA],[ - if test x$enable_quota = xdefault; then - enable_quota='yes' - fi -],[ - if test x$enable_quota = xdefault; then - enable_quota='no' - AC_MSG_WARN([quota is not enabled because the kernel lacks quota support]) - else - if test x$enable_quota = xyes; then - AC_MSG_ERROR([cannot enable quota because the kernel lacks quota support]) - fi - fi + [],[enable_quota='yes']) ]) -if test x$enable_quota != xno; then + +# whether to enable quota support(kernel modules) +AC_DEFUN([LC_QUOTA_MODULE], +[if test x$enable_quota != xno; then + LB_LINUX_CONFIG([QUOTA],[ + enable_quota_module='yes' AC_DEFINE(HAVE_QUOTA_SUPPORT, 1, [Enable quota support]) + ],[ + enable_quota_module='no' + AC_MSG_WARN([quota is not enabled because the kernel - lacks quota support]) + ]) fi ]) -# -# LC_CONFIG_SPLIT -# -# whether to enable split support -# -AC_DEFUN([LC_CONFIG_SPLIT], -[AC_MSG_CHECKING([whether to enable split support]) -AC_ARG_ENABLE([split], - AC_HELP_STRING([--enable-split], - [enable split support]), - [],[enable_split='no']) -AC_MSG_RESULT([$enable_split]) -if test x$enable_split != xno; then - AC_DEFINE(HAVE_SPLIT_SUPPORT, 1, [enable split support]) -fi +AC_DEFUN([LC_QUOTA], +[#check global +LC_CONFIG_QUOTA +#check for utils +AC_CHECK_HEADER(sys/quota.h, + [AC_DEFINE(HAVE_SYS_QUOTA_H, 1, 
[Define to 1 if you have .])], + [AC_MSG_ERROR([don't find in your system])]) ]) - + AC_DEFUN([LC_QUOTA_READ], [AC_MSG_CHECKING([if kernel supports quota_read]) LB_LINUX_TRY_COMPILE([ @@ -1840,6 +1832,23 @@ LB_LINUX_TRY_COMPILE([ ]) # +# LC_CONFIG_SPLIT +# +# whether to enable split support +# +AC_DEFUN([LC_CONFIG_SPLIT], +[AC_MSG_CHECKING([whether to enable split support]) +AC_ARG_ENABLE([split], + AC_HELP_STRING([--enable-split], + [enable split support]), + [],[enable_split='no']) +AC_MSG_RESULT([$enable_split]) +if test x$enable_split != xno; then + AC_DEFINE(HAVE_SPLIT_SUPPORT, 1, [enable split support]) +fi +]) + +# # LC_COOKIE_FOLLOW_LINK # # kernel 2.6.13+ ->follow_link returns a cookie @@ -1866,7 +1875,7 @@ LB_LINUX_TRY_COMPILE([ # # LC_FUNC_RCU # -# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE), +# kernels prior than 2.6.0(?) have no RCU supported; in kernel 2.6.5(SUSE), # call_rcu takes three parameters. # AC_DEFUN([LC_FUNC_RCU], @@ -1887,7 +1896,7 @@ LB_LINUX_TRY_COMPILE([ AC_DEFINE(HAVE_CALL_RCU_PARAM, 1, [call_rcu takes three parameters]) AC_MSG_RESULT([yes]) ],[ - AC_MSG_RESULT([no]) + AC_MSG_RESULT([no]) ]) ],[ AC_MSG_RESULT([no]) @@ -1895,7 +1904,7 @@ LB_LINUX_TRY_COMPILE([ ]) # LC_SECURITY_PLUG # for SLES10 SP2 -# check security plug in sles10 sp2 kernel +# check security plug in sles10 sp2 kernel AC_DEFUN([LC_SECURITY_PLUG], [AC_MSG_CHECKING([If kernel has security plug support]) LB_LINUX_TRY_COMPILE([ @@ -1942,6 +1951,33 @@ LB_LINUX_TRY_COMPILE([ ]) # +# LC_QUOTA64 +# linux kernel have 64-bit limits support +# +AC_DEFUN([LC_QUOTA64], +[if test x$enable_quota_module = xyes; then + AC_MSG_CHECKING([if kernel has 64-bit quota limits support]) + LB_LINUX_TRY_COMPILE([ + #include + #include + #include + int versions[] = V2_INITQVERSIONS_R1; + struct v2_disk_dqblk_r1 dqblk_r1; + ],[],[ + AC_DEFINE(HAVE_QUOTA64, 1, [have quota64]) + AC_MSG_RESULT([yes]) + ],[ + LB_CHECK_FILE([$LINUX/include/linux/lustre_version.h],[ + if test x$enable_server = xyes ; then + AC_MSG_ERROR([You have got no 64-bit kernel quota support.]) + fi + ],[]) + AC_MSG_RESULT([no]) + ]) +fi +]) + +# # LC_CONFIGURE # # other configure checks @@ -2046,7 +2082,7 @@ AM_CONDITIONAL(LIBLUSTRE_TESTS, test x$enable_liblustre_tests = xyes) AM_CONDITIONAL(MPITESTS, test x$enable_mpitests = xyes, Build MPI Tests) AM_CONDITIONAL(CLIENT, test x$enable_client = xyes) AM_CONDITIONAL(SERVER, test x$enable_server = xyes) -AM_CONDITIONAL(QUOTA, test x$enable_quota = xyes) +AM_CONDITIONAL(QUOTA, test x$enable_quota_module = xyes) AM_CONDITIONAL(SPLIT, test x$enable_split = xyes) AM_CONDITIONAL(BLKID, test x$ac_cv_header_blkid_blkid_h = xyes) AM_CONDITIONAL(EXT2FS_DEVEL, test x$ac_cv_header_ext2fs_ext2fs_h = xyes) @@ -2085,6 +2121,7 @@ lustre/liblustre/Makefile lustre/liblustre/tests/Makefile lustre/llite/Makefile lustre/llite/autoMakefile +lustre/lclient/Makefile lustre/lov/Makefile lustre/lov/autoMakefile lustre/lvfs/Makefile diff --git a/lustre/autoconf/lustre-version.ac b/lustre/autoconf/lustre-version.ac index 679eaa0..4c2a067 100644 --- a/lustre/autoconf/lustre-version.ac +++ b/lustre/autoconf/lustre-version.ac @@ -1,6 +1,6 @@ m4_define([LUSTRE_MAJOR],[1]) m4_define([LUSTRE_MINOR],[9]) -m4_define([LUSTRE_PATCH],[90]) +m4_define([LUSTRE_PATCH],[110]) m4_define([LUSTRE_FIX],[0]) dnl # don't forget to update the service tags info diff --git a/lustre/cmm/cmm_device.c b/lustre/cmm/cmm_device.c index 4308533..01f319d 100644 --- a/lustre/cmm/cmm_device.c +++ b/lustre/cmm/cmm_device.c @@ -53,6 +53,9 @@ 
#include #include "cmm_internal.h" #include "mdc_internal.h" +#ifdef HAVE_QUOTA_SUPPORT +# include +#endif static struct obd_ops cmm_obd_device_ops = { .o_owner = THIS_MODULE @@ -127,12 +130,270 @@ static int cmm_update_capa_key(const struct lu_env *env, RETURN(rc); } +#ifdef HAVE_QUOTA_SUPPORT +static int cmm_quota_notify(const struct lu_env *env, struct md_device *m) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_notify(env, + cmm_dev->cmm_child); + RETURN(rc); +} + +static int cmm_quota_setup(const struct lu_env *env, struct md_device *m, + void *data) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_setup(env, + cmm_dev->cmm_child, + data); + RETURN(rc); +} + +static int cmm_quota_cleanup(const struct lu_env *env, struct md_device *m) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_cleanup(env, + cmm_dev->cmm_child); + RETURN(rc); +} + +static int cmm_quota_recovery(const struct lu_env *env, struct md_device *m) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_recovery(env, + cmm_dev->cmm_child); + RETURN(rc); +} + +static int cmm_quota_check(const struct lu_env *env, struct md_device *m, + struct obd_export *exp, __u32 type) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_check(env, + cmm_dev->cmm_child, + exp, type); + RETURN(rc); +} + +static int cmm_quota_on(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_on(env, + cmm_dev->cmm_child, + type); + RETURN(rc); +} + +static int cmm_quota_off(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_off(env, + cmm_dev->cmm_child, + type); + RETURN(rc); +} + +static int cmm_quota_setinfo(const struct lu_env *env, struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_setinfo(env, + cmm_dev->cmm_child, + type, id, dqinfo); + RETURN(rc); +} + +static int cmm_quota_getinfo(const struct lu_env *env, + const struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo) +{ + struct cmm_device *cmm_dev = md2cmm_dev((struct md_device *)m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_getinfo(env, + cmm_dev->cmm_child, + type, id, dqinfo); + RETURN(rc); +} + +static int cmm_quota_setquota(const struct lu_env *env, struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. 
*/ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_setquota(env, + cmm_dev->cmm_child, + type, id, dqblk); + RETURN(rc); +} + +static int cmm_quota_getquota(const struct lu_env *env, + const struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk) +{ + struct cmm_device *cmm_dev = md2cmm_dev((struct md_device *)m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_getquota(env, + cmm_dev->cmm_child, + type, id, dqblk); + RETURN(rc); +} + +static int cmm_quota_getoinfo(const struct lu_env *env, + const struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo) +{ + struct cmm_device *cmm_dev = md2cmm_dev((struct md_device *)m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_getoinfo(env, + cmm_dev->cmm_child, + type, id, dqinfo); + RETURN(rc); +} + +static int cmm_quota_getoquota(const struct lu_env *env, + const struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk) +{ + struct cmm_device *cmm_dev = md2cmm_dev((struct md_device *)m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_getoquota(env, + cmm_dev->cmm_child, + type, id, dqblk); + RETURN(rc); +} + +static int cmm_quota_invalidate(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. */ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_invalidate(env, + cmm_dev->cmm_child, + type); + RETURN(rc); +} + +static int cmm_quota_finvalidate(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct cmm_device *cmm_dev = md2cmm_dev(m); + int rc; + ENTRY; + + /* disable quota for CMD case temporary. 
*/ + if (cmm_dev->cmm_tgt_count) + RETURN(-EOPNOTSUPP); + + rc = cmm_child_ops(cmm_dev)->mdo_quota.mqo_finvalidate(env, + cmm_dev->cmm_child, + type); + RETURN(rc); +} +#endif + static const struct md_device_operations cmm_md_ops = { .mdo_statfs = cmm_statfs, .mdo_root_get = cmm_root_get, .mdo_maxsize_get = cmm_maxsize_get, .mdo_init_capa_ctxt = cmm_init_capa_ctxt, .mdo_update_capa_key = cmm_update_capa_key, +#ifdef HAVE_QUOTA_SUPPORT + .mdo_quota = { + .mqo_notify = cmm_quota_notify, + .mqo_setup = cmm_quota_setup, + .mqo_cleanup = cmm_quota_cleanup, + .mqo_recovery = cmm_quota_recovery, + .mqo_check = cmm_quota_check, + .mqo_on = cmm_quota_on, + .mqo_off = cmm_quota_off, + .mqo_setinfo = cmm_quota_setinfo, + .mqo_getinfo = cmm_quota_getinfo, + .mqo_setquota = cmm_quota_setquota, + .mqo_getquota = cmm_quota_getquota, + .mqo_getoinfo = cmm_quota_getoinfo, + .mqo_getoquota = cmm_quota_getoquota, + .mqo_invalidate = cmm_quota_invalidate, + .mqo_finvalidate = cmm_quota_finvalidate + } +#endif }; extern struct lu_device_type mdc_device_type; @@ -170,7 +431,11 @@ static int cmm_add_mdc(const struct lu_env *env, struct lu_device *ld; struct lu_device *cmm_lu = cmm2lu_dev(cm); mdsno_t mdc_num; + struct lu_site *site = cmm2lu_dev(cm)->ld_site; int rc; +#ifdef HAVE_QUOTA_SUPPORT + int first; +#endif ENTRY; /* find out that there is no such mdc */ @@ -194,7 +459,7 @@ static int cmm_add_mdc(const struct lu_env *env, if (IS_ERR(ld)) RETURN(PTR_ERR(ld)); - ld->ld_site = cmm2lu_dev(cm)->ld_site; + ld->ld_site = site; rc = ldt->ldt_ops->ldto_device_init(env, ld, NULL, NULL); if (rc) { @@ -222,6 +487,9 @@ static int cmm_add_mdc(const struct lu_env *env, mc = lu2mdc_dev(ld); list_add_tail(&mc->mc_linkage, &cm->cmm_targets); cm->cmm_tgt_count++; +#ifdef HAVE_QUOTA_SUPPORT + first = cm->cmm_tgt_count; +#endif spin_unlock(&cm->cmm_tgt_guard); lu_device_get(cmm_lu); @@ -232,6 +500,20 @@ static int cmm_add_mdc(const struct lu_env *env, target.ft_exp = mc->mc_desc.cl_exp; fld_client_add_target(cm->cmm_fld, &target); + if (mc->mc_num == 0) { + /* this is mdt0 -> mc export, fld lookup need this export + to forward fld lookup request. */ + LASSERT(!lu_site2md(site)->ms_server_fld->lsf_control_exp); + lu_site2md(site)->ms_server_fld->lsf_control_exp = + mc->mc_desc.cl_exp; + } +#ifdef HAVE_QUOTA_SUPPORT + /* XXX: Disable quota for CMD case temporary. */ + if (first == 1) { + CWARN("Disable quota for CMD case temporary!\n"); + cmm_child_ops(cm)->mdo_quota.mqo_off(env, cm->cmm_child, UGQUOTA); + } +#endif /* Set max md size for the mdc. 
*/ rc = cmm_post_init_mdc(env, cm); RETURN(rc); @@ -336,10 +618,24 @@ static int cmm_recovery_complete(const struct lu_env *env, RETURN(rc); } +static int cmm_prepare(const struct lu_env *env, + struct lu_device *pdev, + struct lu_device *dev) +{ + struct cmm_device *cmm = lu2cmm_dev(dev); + struct lu_device *next = md2lu_dev(cmm->cmm_child); + int rc; + + ENTRY; + rc = next->ld_ops->ldo_prepare(env, dev, next); + RETURN(rc); +} + static const struct lu_device_operations cmm_lu_ops = { .ldo_object_alloc = cmm_object_alloc, .ldo_process_config = cmm_process_config, - .ldo_recovery_complete = cmm_recovery_complete + .ldo_recovery_complete = cmm_recovery_complete, + .ldo_prepare = cmm_prepare, }; /* --- lu_device_type operations --- */ @@ -401,7 +697,7 @@ static struct lu_device *cmm_device_alloc(const struct lu_env *env, if (!m->cmm_fld) { cmm_device_free(env, l); l = ERR_PTR(-ENOMEM); - } + } } RETURN(l); } @@ -448,14 +744,14 @@ static int cmm_device_init(const struct lu_env *env, struct lu_device *d, ls = cmm2lu_dev(m)->ld_site; lu_site2md(ls)->ms_client_fld = m->cmm_fld; err = cmm_procfs_init(m, name); - + RETURN(err); } static struct lu_device *cmm_device_fini(const struct lu_env *env, struct lu_device *ld) { - struct cmm_device *cm = lu2cmm_dev(ld); + struct cmm_device *cm = lu2cmm_dev(ld); struct mdc_device *mc, *tmp; struct lu_site *ls; ENTRY; diff --git a/lustre/cmm/cmm_object.c b/lustre/cmm/cmm_object.c index 2289be3..7cbf87d 100644 --- a/lustre/cmm/cmm_object.c +++ b/lustre/cmm/cmm_object.c @@ -66,12 +66,12 @@ int cmm_fld_lookup(struct cmm_device *cm, const struct lu_fid *fid, } if (*mds > cm->cmm_tgt_count) { - CERROR("Got invalid mdsno: "LPU64" (max: %u)\n", + CERROR("Got invalid mdsno: %x (max: %x)\n", *mds, cm->cmm_tgt_count); rc = -EINVAL; } else { - CDEBUG(D_INFO, "CMM: got MDS "LPU64" for sequence: " - LPU64"\n", *mds, fid_seq(fid)); + CDEBUG(D_INFO, "CMM: got MDS %x for sequence: " + LPX64"\n", *mds, fid_seq(fid)); } RETURN (rc); @@ -116,8 +116,8 @@ struct lu_object *cmm_object_alloc(const struct lu_env *env, struct cml_object *clo; OBD_ALLOC_PTR(clo); - if (clo != NULL) { - lo = &clo->cmm_obj.cmo_obj.mo_lu; + if (clo != NULL) { + lo = &clo->cmm_obj.cmo_obj.mo_lu; lu_object_init(lo, NULL, ld); clo->cmm_obj.cmo_obj.mo_ops = &cml_mo_ops; clo->cmm_obj.cmo_obj.mo_dir_ops = &cml_dir_ops; @@ -127,8 +127,8 @@ struct lu_object *cmm_object_alloc(const struct lu_env *env, struct cmr_object *cro; OBD_ALLOC_PTR(cro); - if (cro != NULL) { - lo = &cro->cmm_obj.cmo_obj.mo_lu; + if (cro != NULL) { + lo = &cro->cmm_obj.cmo_obj.mo_lu; lu_object_init(lo, NULL, ld); cro->cmm_obj.cmo_obj.mo_ops = &cmr_mo_ops; cro->cmm_obj.cmo_obj.mo_dir_ops = &cmr_dir_ops; @@ -199,9 +199,9 @@ static int cml_object_print(const struct lu_env *env, void *cookie, } static const struct lu_object_operations cml_obj_ops = { - .loo_object_init = cml_object_init, - .loo_object_free = cml_object_free, - .loo_object_print = cml_object_print + .loo_object_init = cml_object_init, + .loo_object_free = cml_object_free, + .loo_object_print = cml_object_print }; /* CMM local md_object operations */ @@ -831,9 +831,9 @@ static int cmr_object_print(const struct lu_env *env, void *cookie, } static const struct lu_object_operations cmr_obj_ops = { - .loo_object_init = cmr_object_init, - .loo_object_free = cmr_object_free, - .loo_object_print = cmr_object_print + .loo_object_init = cmr_object_init, + .loo_object_free = cmr_object_free, + .loo_object_print = cmr_object_print }; /* CMM remote md_object operations. 
All are invalid */ @@ -1274,5 +1274,5 @@ static const struct md_dir_operations cmr_dir_ops = { .mdo_link = cmr_link, .mdo_unlink = cmr_unlink, .mdo_rename = cmr_rename, - .mdo_rename_tgt = cmr_rename_tgt, + .mdo_rename_tgt = cmr_rename_tgt }; diff --git a/lustre/cmm/cmm_split.c b/lustre/cmm/cmm_split.c index 361b38d..8cb4cd9 100644 --- a/lustre/cmm/cmm_split.c +++ b/lustre/cmm/cmm_split.c @@ -268,13 +268,8 @@ static int cmm_split_fid_alloc(const struct lu_env *env, /* Alloc new fid on @mc. */ rc = obd_fid_alloc(mc->mc_desc.cl_exp, fid, NULL); - if (rc > 0) { - /* Setup FLD for new sequenceif needed. */ - rc = fld_client_create(cmm->cmm_fld, fid_seq(fid), - mc->mc_num, env); - if (rc) - CERROR("Can't create fld entry, rc %d\n", rc); - } + if (rc > 0) + rc = 0; up(&mc->mc_fid_sem); RETURN(rc); diff --git a/lustre/cmm/mdc_device.c b/lustre/cmm/mdc_device.c index 8c75a6c..db2d0b1 100644 --- a/lustre/cmm/mdc_device.c +++ b/lustre/cmm/mdc_device.c @@ -89,7 +89,7 @@ static int mdc_obd_update(struct obd_device *host, CDEBUG(D_INFO, "Update connect_flags: "LPX64"\n", conn_data->ocd_connect_flags); } - + RETURN(rc); } /* MDC OBD is set up already and connected to the proper MDS @@ -146,9 +146,9 @@ static int mdc_obd_add(const struct lu_env *env, ocd->ocd_ibits_known = MDS_INODELOCK_UPDATE; ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_ACL | - OBD_CONNECT_LCL_CLIENT | + OBD_CONNECT_RMT_CLIENT | OBD_CONNECT_MDS_CAPA | - OBD_CONNECT_OSS_CAPA | + OBD_CONNECT_OSS_CAPA | OBD_CONNECT_IBITS | OBD_CONNECT_MDS_MDS | OBD_CONNECT_FID | @@ -173,7 +173,7 @@ static int mdc_obd_add(const struct lu_env *env, mdc->obd_upcall.onu_upcall = mdc_obd_update; } } - + if (rc) { obd_disconnect(desc->cl_exp); desc->cl_exp = NULL; @@ -205,7 +205,7 @@ static int mdc_obd_del(const struct lu_env *env, struct mdc_device *mc, mdc_obd->obd_force = mdt_obd->obd_force; mdc_obd->obd_fail = 0; } - + rc = obd_fid_fini(desc->cl_exp); if (rc) CERROR("Fid fini error %d\n", rc); @@ -246,7 +246,7 @@ static int mdc_process_config(const struct lu_env *env, } static const struct lu_device_operations mdc_lu_ops = { - .ldo_object_alloc = mdc_object_alloc, + .ldo_object_alloc = mdc_object_alloc, .ldo_process_config = mdc_process_config }; @@ -254,12 +254,12 @@ void cmm_mdc_init_ea_size(const struct lu_env *env, struct mdc_device *mc, int max_mdsize, int max_cookiesize) { struct obd_device *obd = class_exp2obd(mc->mc_desc.cl_exp); - + obd->u.cli.cl_max_mds_easize = max_mdsize; obd->u.cli.cl_max_mds_cookiesize = max_cookiesize; } -static int mdc_device_init(const struct lu_env *env, struct lu_device *ld, +static int mdc_device_init(const struct lu_env *env, struct lu_device *ld, const char *name, struct lu_device *next) { return 0; @@ -286,10 +286,9 @@ static struct lu_device *mdc_device_alloc(const struct lu_env *env, } else { md_device_init(&mc->mc_md_dev, ldt); mc->mc_md_dev.md_ops = &mdc_md_ops; - ld = mdc2lu_dev(mc); + ld = mdc2lu_dev(mc); ld->ld_ops = &mdc_lu_ops; sema_init(&mc->mc_fid_sem, 1); - } RETURN (ld); @@ -300,7 +299,7 @@ static struct lu_device *mdc_device_free(const struct lu_env *env, { struct mdc_device *mc = lu2mdc_dev(ld); - LASSERTF(atomic_read(&ld->ld_ref) == 0, + LASSERTF(atomic_read(&ld->ld_ref) == 0, "Refcount = %i\n", atomic_read(&ld->ld_ref)); LASSERT(list_empty(&mc->mc_linkage)); md_device_fini(&mc->mc_md_dev); diff --git a/lustre/cmm/mdc_object.c b/lustre/cmm/mdc_object.c index 36c7678..2e884ba 100644 --- a/lustre/cmm/mdc_object.c +++ b/lustre/cmm/mdc_object.c @@ -176,7 +176,7 @@ static int 
mdc_req2attr_update(const struct lu_env *env, LASSERT(ma->ma_capa != NULL); *ma->ma_capa = *capa; } - + if ((body->valid & OBD_MD_FLEASIZE) || (body->valid & OBD_MD_FLDIREA)) { if (body->eadatasize == 0) { CERROR("No size defined for easize field\n"); @@ -189,7 +189,7 @@ static int mdc_req2attr_update(const struct lu_env *env, RETURN(-EPROTO); LASSERT(ma->ma_lmm != NULL); - LASSERT(ma->ma_lmm_size >= body->eadatasize); + LASSERT(ma->ma_lmm_size >= body->eadatasize); ma->ma_lmm_size = body->eadatasize; memcpy(ma->ma_lmm, md, ma->ma_lmm_size); ma->ma_valid |= MA_LOV; @@ -207,7 +207,7 @@ static int mdc_req2attr_update(const struct lu_env *env, RETURN(-EPROTO); } - cookie = req_capsule_server_sized_get(&req->rq_pill, + cookie = req_capsule_server_sized_get(&req->rq_pill, &RMF_LOGCOOKIES, body->aclsize); if (cookie == NULL) @@ -226,7 +226,7 @@ static int mdc_req2attr_update(const struct lu_env *env, RETURN(-EPROTO); } - acl = req_capsule_server_sized_get(&req->rq_pill, + acl = req_capsule_server_sized_get(&req->rq_pill, &RMF_ACL, body->aclsize); if (acl == NULL) @@ -349,7 +349,7 @@ static int mdc_object_create(const struct lu_env *env, mci = mdc_info_init(env); mci->mci_opdata.op_bias = MDS_CROSS_REF; mci->mci_opdata.op_fid2 = *lu_object_fid(&mo->mo_lu); - + /* Parent fid is needed to create dotdot on the remote node. */ mci->mci_opdata.op_fid1 = *(spec->u.sp_pfid); mci->mci_opdata.op_mod_time = la->la_ctime; @@ -572,7 +572,7 @@ static int mdc_rename_tgt(const struct lu_env *env, struct md_object *mo_p, RETURN(rc); } -/* +/* * Return resulting fid in sfid * 0: fids are not relatives * fid: fid at which search stopped @@ -594,7 +594,7 @@ static int mdc_is_subdir(const struct lu_env *env, struct md_object *mo, body = req_capsule_server_get(&mci->mci_req->rq_pill, &RMF_MDT_BODY); LASSERT(body->valid & OBD_MD_FLID); - + CDEBUG(D_INFO, "Remote mdo_is_subdir(), new src "DFID"\n", PFID(&body->fid1)); *sfid = body->fid1; diff --git a/lustre/contrib/packet-lnet.c b/lustre/contrib/packet-lnet.c new file mode 100644 index 0000000..32feb95 --- /dev/null +++ b/lustre/contrib/packet-lnet.c @@ -0,0 +1,783 @@ +/* packet-lnet.c + * Lnet packet dissection + * Author: Laurent George + * based on packet-agentx.c and packet-afs.c + * 20080903 + * + * Wireshark - Network traffic analyzer + * By Gerald Combs + * Copyright 1999 Gerald Combs + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version 2 + * of the License, or (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with this program; if not, write to the Free Software + * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +/* how much data has at least to be available to be able to determine the + * length of the lnet message */ +#define LNET_HEADER_LEN 52 +#define LNET_NID_DEST_OFFSET 24 +#define LNET_NID_SRC_OFFSET 32 +#define LNET_MSG_TYPE_OFFSET 48 + +static guint global_lnet_tcp_port = 988; +static guint lnet_tcp_port = 988; + +void proto_reg_handoff_lnet(void); + +#define LNET_PTL_INDEX_OFFSET_PUT 88 + +/* Define the lnet proto */ +static int proto_lnet = -1; + +static int hf_lnet_src_nid = -1 ; +static int hf_lnet_src_nid_addr = -1 ; +static int hf_lnet_src_nid_lnet_type = -1 ; +static int hf_lnet_src_nid_interface = -1 ; + +static int hf_lnet_ksm_type = -1 ; +static int hf_lnet_ksm_csum= -1; +static int hf_lnet_ksm_zc_req_cookie=-1; +static int hf_lnet_ksm_zc_ack_cookie=-1; + +static int hf_lnet_dest_nid = -1 ; +static int hf_lnet_dest_nid_addr = -1 ; +static int hf_lnet_dest_nid_lnet_type = -1 ; +static int hf_lnet_dest_nid_interface = -1 ; + +static int hf_lnet_dest_pid = -1 ; +static int hf_lnet_src_pid = -1 ; + +static int hf_lnet_msg_type = -1 ; +static int hf_lnet_payload_length = -1; +static int hf_lnet_payload = -1 ; +static int hf_lnet_msg_header = -1 ; +static int hf_lnet_msg_filler = -1 ; + +static int hf_dst_wmd = -1 ; +static int hf_dst_wmd_interface = -1 ; +static int hf_dst_wmd_object = -1 ; + +static int hf_match_bits = -1 ; +static int hf_mlength = -1 ; + +static int hf_hdr_data = -1 ; +static int hf_ptl_index = -1 ; +static int hf_offset = -1 ; +static gint ett_lnet = -1; + +static int hf_src_offset = -1; +static int hf_sink_length = -1; + +static int hf_hello_incarnation = -1 ; +static int hf_hello_type = -1 ; + +static gint ett_lnet_dest_nid= -1; +static gint ett_lnet_src_nid= -1; + +tvbuff_t *next_tvb; + +/*static heur_dissector_list_t heur_subdissector_list; */ +static dissector_table_t subdissector_table; + +static const value_string lnetnames[] = { + { 1, "QSWLND "}, + { 2, "SOCKLND "}, + { 3, "GMLND "}, + { 4, "PTLLND "}, + { 5, "O2IBLND "}, + { 6, "CIBLND "}, + { 7, "OPENIBLND"}, + { 8, "IIBLND "}, + { 9, "LOLND "}, + { 10,"RALND "}, + { 11,"VIBLND "}, + { 12,"MXLND "} +}; + +enum MSG_type{ + LNET_MSG_ACK = 0, + LNET_MSG_PUT, + LNET_MSG_GET, + LNET_MSG_REPLY, + LNET_MSG_HELLO, +} ; + +static const value_string lnet_msg_type_t[] = { + { LNET_MSG_ACK , "ACK"}, + { LNET_MSG_PUT , "PUT"}, + { LNET_MSG_GET , "GET"}, + { LNET_MSG_REPLY, "REPLY"}, + { LNET_MSG_HELLO, "HELLO"} +}; + +/* defined in lustre/include/lustre/lustre_idl.h */ +static const value_string portal_indices[] = { + { 1 , "CONNMGR_REQUEST_PORTAL"}, + { 2 , "CONNMGR_REPLY_PORTAL"}, + { 3 , "OSC_REQUEST_PORTAL(obsolete)"}, + { 4 , "OSC_REPLY_PORTAL"}, + { 5 , "OSC_BULK_PORTAL(obsolete)"}, + { 6 , "OST_IO_PORTAL"}, + { 7 , "OST_CREATE_PORTAL"}, + { 8 , "OST_BULK_PORTAL"}, + { 9 , "MDC_REQUEST_PORTAL(obsolete)"}, + { 10 , "MDC_REPLY_PORTAL"}, + { 11 , "MDC_BULK_PORTAL(obsolete)"}, + { 12 , "MDS_REQUEST_PORTAL"}, + { 13 , "MDS_REPLY_PORTAL(obsolete)"}, + { 14 , "MDS_BULK_PORTAL"}, + { 15 , "LDLM_CB_REQUEST_PORTAL"}, + { 16 , "LDLM_CB_REPLY_PORTAL"}, + { 17 , "LDLM_CANCEL_REQUEST_PORTAL"}, + { 18 , "LDLM_CANCEL_REPLY_PORTAL"}, + { 19 , "PTLBD_REQUEST_PORTAL(obsolete)"}, + { 20 , "PTLBD_REPLY_PORTAL(obsolete)"}, + { 21 , "PTLBD_BULK_PORTAL(obsolete)"}, + { 22 , "MDS_SETATTR_PORTAL"}, + { 23 , "MDS_READPAGE_PORTAL"}, + { 25 , "MGC_REPLY_PORTAL"}, + 
{ 26 , "MGS_REQUEST_PORTAL"}, + { 27 , "MGS_REPLY_PORTAL"}, + { 28 , "OST_REQUEST_PORTAL"} +}; + +#define KSOCK_MSG_NOOP 0xc0 /* ksm_u empty */ +#define KSOCK_MSG_LNET 0xc1 /* lnet msg */ + +static const value_string ksm_type_t[] = { + {0xc0, "KSOCK_MSG_NOOP"},/* ksm_u empty */ + {0xc1, "KSOCK_MSG_LNET"} /* lnet msg */ +}; + + +static int dissect_csum(tvbuff_t * tvb, proto_tree *tree, int offset) +{ + guint32 csum; + csum = tvb_get_letohl(tvb, offset); + if (!csum) + proto_tree_add_text(tree, tvb, offset, 4, "checksum disabled"); + else + proto_tree_add_item(tree, hf_lnet_ksm_csum, tvb, offset, 4, TRUE); + + offset+=4; + return offset; +} + + +static int dissect_req_cookie(tvbuff_t * tvb, proto_tree *tree, int offset) +{ + guint32 req; + req= tvb_get_letoh64(tvb, offset); + if (!req) + proto_tree_add_text(tree, tvb, offset, 8, "ack not required"); + else + proto_tree_add_item(tree, hf_lnet_ksm_zc_req_cookie, tvb, offset, 8, TRUE); + offset+=8; + return offset; +} + +static int dissect_ack_cookie(tvbuff_t * tvb, proto_tree *tree, int offset) +{ + guint32 ack; + ack= tvb_get_letoh64(tvb, offset); + if (!ack) + proto_tree_add_text(tree, tvb, offset, 8, "not ack"); + else + proto_tree_add_item(tree, hf_lnet_ksm_zc_ack_cookie, tvb, offset, 8, TRUE); + offset+=8; + return offset; +} + +static void +dissect_ksock_msg_noop( tvbuff_t * tvb, packet_info *pinfo _U_ , proto_tree *tree) +{ + guint32 offset; + offset=0; + proto_tree_add_item(tree, hf_lnet_ksm_type, tvb, offset, 4, TRUE);offset+=4; + offset=dissect_csum(tvb,tree,offset); + offset=dissect_req_cookie(tvb, tree, offset); + offset=dissect_ack_cookie(tvb,tree,offset); +} + + +static int dissect_ksock_msg(tvbuff_t * tvb, proto_tree *tree, int offset) +{ + proto_tree_add_item(tree, hf_lnet_ksm_type, tvb, offset, 4, TRUE);offset+=4; + offset=dissect_csum(tvb,tree,offset); + offset=dissect_req_cookie(tvb, tree, offset); + offset=dissect_ack_cookie(tvb,tree,offset); + return offset; +} + +static int dissect_dest_nid(tvbuff_t * tvb, proto_tree *tree, int offset) +{ + proto_tree_add_item(tree, hf_lnet_dest_nid_addr, tvb, offset, 4, TRUE);offset+=4; + proto_tree_add_item(tree, hf_lnet_dest_nid_interface, tvb, offset, 2, TRUE);offset+=2; + proto_tree_add_item(tree, hf_lnet_dest_nid_lnet_type, tvb, offset, 2, TRUE);offset+=2; + return offset; +} + + +static int dissect_src_nid(tvbuff_t * tvb, proto_tree *tree, int offset) +{ + proto_tree_add_item(tree, hf_lnet_src_nid_addr, tvb, offset, 4, TRUE);offset+=4; + proto_tree_add_item(tree, hf_lnet_src_nid_interface, tvb, offset, 2, TRUE);offset+=2; + proto_tree_add_item(tree, hf_lnet_src_nid_lnet_type, tvb, offset, 2, TRUE);offset+=2; + return offset; +} + +static int dissect_lnet_put(tvbuff_t * tvb, proto_tree *tree, int offset, packet_info *pinfo _U_) +{ + /* typedef struct lnet_put { + lnet_handle_wire_t ack_wmd; + __u64 match_bits; + __u64 hdr_data; + __u32 ptl_index; + __u32 offset; + } WIRE_ATTR lnet_put_t; */ + + gboolean little_endian=TRUE ; + + proto_tree_add_item(tree,hf_dst_wmd_interface,tvb,offset,8,little_endian); offset+=8; + proto_tree_add_item(tree,hf_dst_wmd_object,tvb,offset,8,little_endian);offset+=8; + + proto_tree_add_item(tree,hf_match_bits,tvb,offset,8,little_endian);offset+=8; + proto_tree_add_item(tree,hf_hdr_data,tvb,offset,8,little_endian);offset+=8; + if (check_col(pinfo->cinfo, COL_INFO)) + col_append_sep_str(pinfo->cinfo, COL_INFO, ", ", val_to_str(tvb_get_letohl(tvb,offset), portal_indices, "Unknow")); /* add some nice value */ + proto_item_append_text(tree, ", %s" , 
+
+static int dissect_lnet_put(tvbuff_t *tvb, proto_tree *tree, int offset, packet_info *pinfo _U_)
+{
+        /* typedef struct lnet_put {
+                lnet_handle_wire_t  ack_wmd;
+                __u64               match_bits;
+                __u64               hdr_data;
+                __u32               ptl_index;
+                __u32               offset;
+        } WIRE_ATTR lnet_put_t; */
+
+        gboolean little_endian = TRUE;
+
+        proto_tree_add_item(tree, hf_dst_wmd_interface, tvb, offset, 8, little_endian); offset += 8;
+        proto_tree_add_item(tree, hf_dst_wmd_object, tvb, offset, 8, little_endian); offset += 8;
+
+        proto_tree_add_item(tree, hf_match_bits, tvb, offset, 8, little_endian); offset += 8;
+        proto_tree_add_item(tree, hf_hdr_data, tvb, offset, 8, little_endian); offset += 8;
+        if (check_col(pinfo->cinfo, COL_INFO))
+                col_append_sep_str(pinfo->cinfo, COL_INFO, ", ", val_to_str(tvb_get_letohl(tvb, offset), portal_indices, "Unknown")); /* show the portal index in the Info column */
+        proto_item_append_text(tree, ", %s",
+                        val_to_str(tvb_get_letohl(tvb, offset), portal_indices, "Unknown")); /* print ptl_index */
+        proto_tree_add_item(tree, hf_ptl_index, tvb, offset, 4, little_endian); offset += 4;
+        proto_tree_add_item(tree, hf_offset, tvb, offset, 4, little_endian); offset += 4;
+        return offset;
+}
+
+static int dissect_lnet_get(tvbuff_t *tvb, proto_tree *tree, int offset, packet_info *pinfo _U_)
+{
+        /* typedef struct lnet_get {
+                lnet_handle_wire_t  return_wmd;
+                __u64               match_bits;
+                __u32               ptl_index;
+                __u32               src_offset;
+                __u32               sink_length;
+        } WIRE_ATTR lnet_get_t; */
+
+        gboolean little_endian = TRUE;
+        proto_tree_add_item(tree, hf_dst_wmd_interface, tvb, offset, 8, little_endian); offset += 8;
+        proto_tree_add_item(tree, hf_dst_wmd_object, tvb, offset, 8, little_endian); offset += 8;
+        /*if (check_col(pinfo->cinfo, COL_INFO))*/
+        /*        col_prepend_fence_fstr(pinfo->cinfo, COL_INFO, " %" G_GINT64_MODIFIER "u ", tvb_get_letoh64(tvb,offset) );*/
+
+        proto_tree_add_item(tree, hf_match_bits, tvb, offset, 8, little_endian); offset += 8;
+        if (check_col(pinfo->cinfo, COL_INFO))
+                col_append_sep_str(pinfo->cinfo, COL_INFO, ", ", val_to_str(tvb_get_letohl(tvb, offset), portal_indices, "Unknown"));
+        proto_item_append_text(tree, ", %s", val_to_str(tvb_get_letohl(tvb, offset), portal_indices, "Unknown")); /* print ptl_index */
+        proto_tree_add_item(tree, hf_ptl_index, tvb, offset, 4, little_endian); offset += 4;
+        proto_tree_add_item(tree, hf_src_offset, tvb, offset, 4, little_endian); offset += 4;
+        proto_tree_add_item(tree, hf_sink_length, tvb, offset, 4, little_endian); offset += 4;
+        return offset;
+}
+
+static int dissect_lnet_reply(tvbuff_t *tvb, proto_tree *tree, int offset)
+{
+        /* typedef struct lnet_reply {
+                lnet_handle_wire_t  dst_wmd;
+        } WIRE_ATTR lnet_reply_t; */
+
+        gboolean little_endian = TRUE;
+        proto_tree_add_item(tree, hf_dst_wmd_interface, tvb, offset, 8, little_endian); offset += 8;
+        proto_tree_add_item(tree, hf_dst_wmd_object, tvb, offset, 8, little_endian); offset += 8;
+
+        return offset;
+}
+
+static int dissect_lnet_hello(tvbuff_t *tvb, proto_tree *tree, int offset)
+{
+        /* typedef struct lnet_hello {
+                __u64 incarnation;
+                __u32 type;
+        } WIRE_ATTR lnet_hello_t; */
+
+        gboolean little_endian = TRUE;
+        proto_tree_add_item(tree, hf_hello_incarnation, tvb, offset, 8, little_endian); offset += 8;
+        proto_tree_add_item(tree, hf_hello_type, tvb, offset, 4, little_endian); offset += 4;
+        return offset;
+}
+
+static int dissect_lnet_ack(tvbuff_t *tvb, proto_tree *tree, int offset, packet_info *pinfo _U_)
+{
+        /* typedef struct lnet_ack {
+                lnet_handle_wire_t  dst_wmd;
+                __u64               match_bits;
+                __u32               mlength;
+        } WIRE_ATTR lnet_ack_t; */
+
+        proto_tree_add_item(tree, hf_dst_wmd_interface, tvb, offset, 8, TRUE); offset += 8;
+        proto_tree_add_item(tree, hf_dst_wmd_object, tvb, offset, 8, TRUE); offset += 8;
+        proto_tree_add_item(tree, hf_match_bits, tvb, offset, 8, TRUE); offset += 8;
+        proto_tree_add_item(tree, hf_mlength, tvb, offset, 4, TRUE); offset += 4;
+        return offset;
+}
+
+static void dissect_lnet_message(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree);
+
+/* return the pdu length */
+static guint
+get_lnet_message_len(packet_info __attribute__((__unused__)) *pinfo, tvbuff_t *tvb, int offset)
+{
+        /*
+         * Get the payload length
+         */
+        guint32 plen;
+        plen = tvb_get_letohl(tvb, offset + 28 + 24); /* 24 = ksock_msg header, 28 = offset of payload_length within the lnet header */
+
+        /*
+         * That length doesn't include the headers; add them in:
+         * 72 = lnet_hdr_t, 24 = ksock_msg header.
+         */
+        return plen + 72 + 24;
+}
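+
+/* Worked example: for a PUT carrying 4096 bytes of payload,
+ * get_lnet_message_len() returns 4096 + 72 + 24 = 4192 bytes: 24 bytes of
+ * ksock_msg header, 72 bytes of lnet_hdr_t (two NIDs, two PIDs, type,
+ * payload_length, plus the 40-byte per-type union), then the payload. */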
+
+static guint
+get_noop_message_len(packet_info __attribute__((__unused__)) *pinfo, tvbuff_t *tvb _U_, int offset _U_)
+{
+        return 24;
+}
+
+static void
+dissect_lnet(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree)
+{
+        /* TODO: clean this up.  For now NOOP packets are handled separately
+         * from the others, because it is not clear how to use
+         * tcp_dissect_pdus() with a variable length <= LNET_HEADER_LEN. */
+        switch (tvb_get_letohl(tvb, 0)) {
+        case KSOCK_MSG_NOOP:
+                /*g_print("ksock noop %d \n", pinfo->fd->num);*/
+                tcp_dissect_pdus(tvb, pinfo, tree, TRUE, 0, get_noop_message_len, dissect_ksock_msg_noop);
+                break;
+        case KSOCK_MSG_LNET:
+                tcp_dissect_pdus(tvb, pinfo, tree, TRUE, LNET_HEADER_LEN, get_lnet_message_len, dissect_lnet_message);
+                break;
+        }
+}
+
+typedef struct t_nid {
+        guint32 addr;
+        guint16 interface;
+        guint16 proto;
+} t_nid;
+
+static t_nid get_nid(tvbuff_t *tvb, gint offset)
+{
+        t_nid nid;
+        nid.addr = g_htonl(tvb_get_ipv4(tvb, offset));
+        nid.interface = tvb_get_letohs(tvb, offset + 4);
+        nid.proto = tvb_get_letohs(tvb, offset + 6);
+        return nid;
+        /* example:
+         * get_nid(tvb, LNET_NID_DEST_OFFSET);
+         * get_nid(tvb, LNET_NID_SRC_OFFSET);
+         */
+}
+
+/*----------------------------------------------------------- */
+/* For the conversation */
+
+typedef struct {
+        guint64 match_bits;
+} my_entry_t;
+
+typedef struct lnet_request_key {
+        guint64 match_bits;
+        guint32 conversation;
+} lnet_request_key_t;
+
+typedef struct lnet_request_val {
+        guint64 match_bits;
+        guint32 packet_num_parent;
+} lnet_request_val_t;
+
+static GHashTable *lnet_request_hash = NULL;
+
+/*
+ * Hash Functions
+ */
+static gint
+lnet_equal(gconstpointer v, gconstpointer w)
+{
+        const struct lnet_request_key *v1 = (const struct lnet_request_key *)v;
+        const struct lnet_request_key *v2 = (const struct lnet_request_key *)w;
+
+        if (v1->conversation == v2->conversation &&
+            v1->match_bits == v2->match_bits)
+                return 1;
+
+        return 0;
+}
+
+static guint
+lnet_hash(gconstpointer v)
+{
+        const struct lnet_request_key *key = (const struct lnet_request_key *)v;
+        guint val;
+
+        val = key->conversation + key->match_bits;
+
+        return val;
+}
+
+static void
+lnet_init_protocol(void)
+{
+        if (lnet_request_hash)
+                g_hash_table_destroy(lnet_request_hash);
+
+        lnet_request_hash = g_hash_table_new(lnet_hash, lnet_equal);
+}
+
+static lnet_request_val_t *
+get_lnet_conv(packet_info *pinfo, GHashTable *lnet_hash_table, guint64 match_bits)
+{
+        conversation_t *conversation;
+        lnet_request_key_t request_key, *new_request_key;
+        lnet_request_val_t *request_val = NULL;
+
+        conversation = find_conversation(pinfo->fd->num, &pinfo->src, &pinfo->dst, pinfo->ptype, pinfo->srcport, pinfo->destport, 0);
+
+        if (NULL == conversation)
+                /* It's not part of any conversation - create a new one.
*/ + conversation = conversation_new(pinfo->fd->num, &pinfo->src, &pinfo->dst, proto_lnet, + pinfo->srcport, pinfo->destport, 0); + + request_key.conversation = conversation->index; + request_key.match_bits = match_bits; + + request_val = (struct lnet_request_val * ) g_hash_table_lookup(lnet_hash_table, &request_key); + if(!request_val){ + new_request_key = se_alloc(sizeof(struct lnet_request_key)); + *new_request_key = request_key; + request_val = se_alloc(sizeof(struct lnet_request_val)); + request_val -> match_bits = match_bits; + request_val -> packet_num_parent = pinfo->fd->num ; + /*request_val -> filename = "test" ; */ + g_hash_table_insert(lnet_hash_table, new_request_key, request_val); + + } + + return request_val ; + +} + + + +/*----------------------------------------------------------- */ +static void +dissect_lnet_message(tvbuff_t * tvb, packet_info *pinfo, proto_tree *tree) +{ + + guint64 match; + guint32 msg_type; + + lnet_request_val_t* conversation_val ; + + + if (check_col(pinfo->cinfo, COL_PROTOCOL)) { + col_set_str(pinfo->cinfo, COL_PROTOCOL, "Lnet"); + } + + if (check_col(pinfo->cinfo, COL_INFO)) { + /* t_nid dest_nid ; */ + /*t_nid src_nid ; */ + /*guint32 msg_type;*/ + /*[> col_clear(pinfo->cinfo, COL_INFO); <]*/ + /*dest_nid = get_nid(tvb, LNET_NID_DEST_OFFSET);*/ + /*src_nid = get_nid(tvb, LNET_NID_SRC_OFFSET);*/ + + /*[> col_add_fstr(pinfo->cinfo, COL_INFO, "%s@tcp%d > %s@tcp%d", + ip_to_str((guint8 *) &src_nid.addr), src_nid.interface, + ip_to_str((guint8 *) & dest_nid.addr), dest_nid.interface); */ + + msg_type = tvb_get_letohl(tvb, LNET_MSG_TYPE_OFFSET ); + /* We delete the entire line and add LNET + msg_type */ + col_add_fstr(pinfo->cinfo, COL_INFO, "LNET_%s", (msg_type < sizeof(lnet_msg_type_t)/sizeof(value_string)) ? 
lnet_msg_type_t[msg_type].strptr : "Unknown");
+        }
+
+        if (tree) {
+                t_nid dest_nid; /* nid value */
+                t_nid src_nid;
+
+                proto_item *ti = NULL; /* principal node */
+                proto_tree *lnet_tree = NULL; /* principal tree */
+                proto_tree *lnet_nid_src_tree = NULL; /* subtree for the nids */
+                proto_tree *lnet_nid_dest_tree = NULL;
+                proto_item *ti_src_nid; /* node for the nids */
+                proto_item *ti_dest_nid;
+
+                gint offset = 0;
+
+                guint32 msg_type;
+                guint32 payload_length;
+                guint32 msg_filler_length;
+
+                ti = proto_tree_add_item(tree, proto_lnet, tvb, 0, -1, FALSE); /* principal node */
+                /* ti=proto_tree_add_protocol_format(tree, proto_lnet, tvb, 0, -1, "Lnet"); */
+
+                lnet_tree = proto_item_add_subtree(ti, ett_lnet); /* add the subtree */
+
+                /* dissect the first 24 bytes (ksock_msg_t in lnet/socklnd.h) */
+                offset = dissect_ksock_msg(tvb, lnet_tree, offset);
+
+                /* dest nid */
+                dest_nid = get_nid(tvb, LNET_NID_DEST_OFFSET);
+                ti_dest_nid = proto_tree_add_text(lnet_tree, tvb, offset, 8, "dest_nid = %s@tcp%d", ip_to_str((guint8 *) &dest_nid.addr), dest_nid.interface);
+                lnet_nid_dest_tree = proto_item_add_subtree(ti_dest_nid, ett_lnet_dest_nid);
+                offset = dissect_dest_nid(tvb, lnet_nid_dest_tree, offset);
+
+                /* same for src_nid */
+                src_nid = get_nid(tvb, LNET_NID_SRC_OFFSET);
+                ti_src_nid = proto_tree_add_text(lnet_tree, tvb, offset, 8, "src_nid = %s@tcp%d", ip_to_str((guint8 *) &src_nid.addr), src_nid.interface);
+                lnet_nid_src_tree = proto_item_add_subtree(ti_src_nid, ett_lnet_src_nid);
+                offset = dissect_src_nid(tvb, lnet_nid_src_tree, offset);
+
+                /* pid */
+                proto_tree_add_item(lnet_tree, hf_lnet_src_pid, tvb, offset, 4, TRUE); offset += 4;
+                proto_tree_add_item(lnet_tree, hf_lnet_dest_pid, tvb, offset, 4, TRUE); offset += 4;
+
+                /* message_type (32 bits) */
+                msg_type = tvb_get_letohl(tvb, offset + 0);
+                /* put some nice info on the lnet line */
+                proto_item_append_text(ti, " %s", (msg_type < sizeof(lnet_msg_type_t)/sizeof(value_string)) ?
+                                lnet_msg_type_t[msg_type].strptr : "Unknown"); /* append the message type to the tree root */
+                proto_tree_add_item(lnet_tree, hf_lnet_msg_type, tvb, offset, 4, TRUE); offset += 4;
+
+                /* payload data (to follow) length: */
+                payload_length = tvb_get_letohl(tvb, offset + 0);
+                proto_tree_add_item(lnet_tree, hf_lnet_payload_length, tvb, offset, 4, TRUE); offset += 4;
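+
+                /* The 40 bytes at offset 56..95 are the per-message-type
+                 * union of lnet_hdr_t: PUT uses all 40 (16 ack_wmd +
+                 * 8 match_bits + 8 hdr_data + 4 ptl_index + 4 offset),
+                 * GET uses 36, ACK 28, REPLY 16 and HELLO 12; whatever the
+                 * switch below leaves unparsed is shown as msg filler.
+                 * This also explains LNET_PTL_INDEX_OFFSET_PUT = 88
+                 * (56 + 16 + 8 + 8). */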
+
+                /* here offset = 24+8+8+4+4+4+4 = 56 */
+                match = 0;
+                switch (msg_type) {
+                        case LNET_MSG_ACK:
+                                offset = dissect_lnet_ack(tvb, lnet_tree, offset, pinfo);
+                                match = tvb_get_letoh64(tvb, 72);
+                                break;
+                        case LNET_MSG_PUT:
+                                offset = dissect_lnet_put(tvb, lnet_tree, offset, pinfo);
+                                match = tvb_get_letoh64(tvb, 72);
+                                break;
+                        case LNET_MSG_GET:
+                                offset = dissect_lnet_get(tvb, lnet_tree, offset, pinfo);
+                                match = tvb_get_letoh64(tvb, 72);
+                                break;
+                        case LNET_MSG_REPLY:
+                                offset = dissect_lnet_reply(tvb, lnet_tree, offset);
+                                break;
+                        case LNET_MSG_HELLO:
+                                offset = dissect_lnet_hello(tvb, lnet_tree, offset);
+                                break;
+                        default:
+                                break;
+                }
+
+                conversation_val = get_lnet_conv(pinfo, lnet_request_hash, match);
+                /* proto_tree_add_text(tree, tvb, 0 , 0, "match = %" G_GINT64_MODIFIER "u parent = %d", conversation_val -> match_bits , conversation_val -> packet_num_parent); */
+
+                /* padding */
+                msg_filler_length = 72 - offset + 24;
+                if (msg_filler_length > 72)
+                        return;
+                /* +24: the ksock_msg header takes 24 bytes and is already
+                 * included in offset */
+
+                proto_tree_add_item(lnet_tree, hf_lnet_msg_filler, tvb, offset, msg_filler_length, TRUE);
+                offset += msg_filler_length;
+
+                if (payload_length > 0) {
+                        /* display of payload */
+                        proto_tree_add_item(lnet_tree, hf_lnet_payload, tvb, offset, payload_length, TRUE);
+
+                        next_tvb = tvb_new_subset(tvb, offset, payload_length, payload_length);
+                        if (msg_type == LNET_MSG_PUT)
+                                dissector_try_port(subdissector_table, tvb_get_letohl(tvb, LNET_PTL_INDEX_OFFSET_PUT), next_tvb, pinfo, tree);
+                }
+
+                offset += payload_length;
+        }
+}
+
+void
+proto_register_lnet(void)
+{
+        static hf_register_info hf[] = {
+                { &hf_lnet_ksm_type ,
+                        { "Type of socklnd message" , "lnet.ksm_type" , FT_UINT32 , BASE_HEX , VALS(ksm_type_t) , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_ksm_csum ,
+                        { "Checksum" , "lnet.ksm_csum" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_ksm_zc_req_cookie ,
+                        { "Ack required" , "lnet.ksm_zc_req_cookie" , FT_UINT64 , BASE_HEX , NULL , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_ksm_zc_ack_cookie ,
+                        { "Ack" , "lnet.ksm_zc_ack_cookie" , FT_UINT64 , BASE_HEX , NULL , 0x0 , "" , HFILL }} ,
+
+                { &hf_lnet_src_nid ,
+                        { "Src nid" , "lnet.src_nid" , FT_UINT64 , BASE_HEX , NULL , 0x0 , "src nid" , HFILL }} ,
+                { &hf_lnet_src_nid_addr ,
+                        { "Src nid" , "lnet.src_nid_addr" , FT_IPv4 , BASE_NONE , NULL , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_src_nid_lnet_type ,
+                        { "lnd network type" , "lnet.src_nid_type" , FT_UINT16 , BASE_DEC , VALS(lnetnames) , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_src_nid_interface ,
+                        { "lnd network interface" , "lnet.src_nid_net_interface" , FT_UINT16 , BASE_DEC , NULL , 0x0 , NULL , HFILL }} ,
+
+                { &hf_lnet_dest_nid ,
+                        { "Dest nid" , "lnet.dest_nid" , FT_UINT64 , BASE_HEX , NULL , 0x0 , "" , HFILL }} ,
+
+                { &hf_lnet_dest_nid_addr ,
+                        { "Destination nid" , "lnet.dest_nid_addr" , FT_IPv4 , BASE_NONE , NULL , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_dest_nid_lnet_type ,
+                        { "lnd network type" , "lnet.dest_nid_type" , FT_UINT16 , BASE_DEC , VALS(lnetnames) , 0x0 , "" , HFILL }} ,
+                { &hf_lnet_dest_nid_interface ,
+                        { "lnd network interface" , "lnet.dest_nid_net_interface" , FT_UINT16 , BASE_DEC , NULL , 0x0 , NULL , HFILL
}} , + + { &hf_lnet_dest_pid , + { "Dest pid" , "lnet.dest_pid" , FT_UINT32 , BASE_DEC_HEX , NULL , 0x0 , "dest pid" , HFILL }} , + { &hf_lnet_src_pid , + { "Src pid" , "lnet.src_pid" , FT_UINT32 , BASE_DEC_HEX , NULL , 0x0 , "src nid" , HFILL }} , + + { &hf_lnet_msg_type , + { "Message type" , "lnet.msg_type" , FT_UINT32 , BASE_DEC , VALS(lnet_msg_type_t) , 0x0 , "msg type" , HFILL }} , + { &hf_lnet_payload_length , + { "Payload length" , "lnet.payload_length" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL }} , + { &hf_lnet_payload , + { "Payload" , "lnet.payload" , FT_NONE , BASE_NONE , NULL , 0x0 , "" , HFILL }} , + + {&hf_dst_wmd , + { "DST MD index " , "lnet.msg_dst_cookie" , FT_BYTES , BASE_NONE , NULL , 0x0 , "" , HFILL }} , + { &hf_dst_wmd_interface , + { "DST MD index interface" , "lnet.msg_dst_inteface_cookie" , FT_UINT64 , BASE_HEX_DEC , NULL , 0x0 , "" , HFILL }} , + { &hf_dst_wmd_object , + { "DST MD index object" , "lnet.msg_dst_object_cookie" , FT_UINT64 , BASE_HEX_DEC , NULL , 0x0 , "" , HFILL }} , + { &hf_match_bits , + { "Match bits" , "lnet.msg_dst_match_bits" , FT_UINT64 , BASE_HEX_DEC , NULL , 0x0 , "" , HFILL}} , + { &hf_mlength , + { "Message length" , "lnet.msg_length" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL}} , + + + /* Put */ + { &hf_hdr_data , + { "hdr data" , "lnet.msg_hdr_data" , FT_UINT64 , BASE_HEX_DEC , NULL , 0x0 , "" , HFILL}} , + { &hf_ptl_index , + { "ptl index" , "lnet.ptl_index" , FT_UINT32 , BASE_DEC , VALS(portal_indices) , 0x0 , "" , HFILL}} , + { &hf_offset , + { "offset" , "lnet.offset" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL}} , + + /* Get*/ + { &hf_src_offset , + { "src offset" , "lnet.src_offset" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL}} , + { &hf_sink_length , + { "sink length" , "lnet.sink_length" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL}} , + + /* Hello*/ + { &hf_hello_incarnation , + { "hello incarnation " , "lnet.hello_incarnation" , FT_UINT64 , BASE_HEX_DEC , NULL , 0x0 , "" , HFILL}} , + { &hf_hello_type , + { "hello type" , "lnet.hello_type" , FT_UINT32 , BASE_DEC , NULL , 0x0 , "" , HFILL}} , + + { &hf_lnet_msg_header , + { "ptl header" , "lnet.ptl_header" , FT_NONE , BASE_NONE , NULL , 0x0 , "" , HFILL}} , + + { &hf_lnet_msg_filler , + { "msg filler (padding)" , "lnet.ptl_filler" , FT_NONE , BASE_NONE , NULL , 0x0 , "" , HFILL}} , + + /* Add more fields here */ + }; + + static gint *ett[] = { + &ett_lnet, + &ett_lnet_dest_nid, + &ett_lnet_src_nid + }; + + + module_t *lnet_module; + + proto_lnet = proto_register_protocol("Lnet", /*name*/ + "Lnet", /*short name*/ + "lnet"); /*abbrev*/ + + proto_register_field_array(proto_lnet, hf, array_length(hf)); + proto_register_subtree_array(ett, array_length(ett)); + + lnet_module = prefs_register_protocol(proto_lnet, proto_reg_handoff_lnet); + + prefs_register_uint_preference(lnet_module, "tcp.lnet_port", + "Lnet listener TCP Port", + "Set the TCP port for Lnet" + "(if other than the default of 988)", + 10, &global_lnet_tcp_port); + + subdissector_table = register_dissector_table("lnet.ptl_index", "lnet portal index", FT_UINT32 , BASE_DEC); + + register_init_routine(&lnet_init_protocol); + +} + + +/* The registration hand-off routine */ +void +proto_reg_handoff_lnet(void) +{ + static int lnet_prefs_initialized = FALSE; + static dissector_handle_t lnet_handle; + + if(!lnet_prefs_initialized) { + lnet_handle = create_dissector_handle(dissect_lnet, proto_lnet); + lnet_prefs_initialized = TRUE; + } + else { + dissector_delete("tcp.port",global_lnet_tcp_port, 
lnet_handle);
+	}
+
+	lnet_tcp_port = global_lnet_tcp_port;
+
+	dissector_add("tcp.port", lnet_tcp_port, lnet_handle);
+}
diff --git a/lustre/contrib/packet-lustre.c b/lustre/contrib/packet-lustre.c
new file mode 100644
index 0000000..3d34cb2
--- /dev/null
+++ b/lustre/contrib/packet-lustre.c
@@ -0,0 +1,10146 @@
+/* packet-lustre.c
+ * Lustre 1.6 dissection (http://www.lustre.org/)
+ * Author: Laurent George
+ * based on packet-agentx.c and packet-afs.c
+ * partially generated by Pidl
+ * 20080903
+ *
+ * Wireshark - Network traffic analyzer
+ * By Gerald Combs
+ * Copyright 1999 Gerald Combs
+ *
+ * This program is free software; you can redistribute it and/or
+ * modify it under the terms of the GNU General Public License
+ * as published by the Free Software Foundation; either version 2
+ * of the License, or (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful,
+ * but WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+ * GNU General Public License for more details.
+ *
+ * You should have received a copy of the GNU General Public License
+ * along with this program; if not, write to the Free Software
+ * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA  02111-1307, USA.
+ */
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+/* As in packet-lnet.c above, the original header names were stripped when
+ * this patch was extracted; these are the usual includes for a Wireshark
+ * dissector. */
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#include <glib.h>
+#include <epan/packet.h>
+
+/* --------------------------------------------------------------------------------------- */
+/* defines and macros for locating fields within the lustre payload */
+#define LUSTRE_MAGIC_OFFSET 8
+#define LUSTRE_MSG_MAGIC_V1 0x0BD00BD0
+#define LUSTRE_MSG_MAGIC_V2 0x0BD00BD3
+
+#define LUSTRE_BUFCOUNT_OFF ((tvb_get_letohl(tvb, LUSTRE_MAGIC_OFFSET)== LUSTRE_MSG_MAGIC_V2) ? 0 : 60)
+#define LUSTRE_BUFCOUNT ((tvb_get_letohl(tvb, LUSTRE_MAGIC_OFFSET)== LUSTRE_MSG_MAGIC_V2) \
+		? (tvb_get_letohl(tvb, LUSTRE_BUFCOUNT_OFF)) : ((tvb_get_letohl(tvb, LUSTRE_BUFCOUNT_OFF))) )
+/* note: LUSTRE_BUFLEN_OFF does not have the same meaning for v1 and v2:
+ * v1 : LUSTRE_BUFLEN_OFF = offset of buflen[0] - 4 bytes.
+ * v2 : LUSTRE_BUFLEN_OFF = offset of buflen[0]
+ */
+#define LUSTRE_BUFLEN_OFF ((tvb_get_letohl(tvb, LUSTRE_MAGIC_OFFSET)== LUSTRE_MSG_MAGIC_V2) ? 32 : 60)
+
+#define LUSTRE_REQ_REC_OFF 1 /* normal request record offset */
+#define LUSTRE_REPLY_REC_OFF 1 /* normal reply record offset */
+
+#define LUSTRE_DLM_INTENT_IT_OFF 2 /* intent lock it offset */
+#define LUSTRE_DLM_INTENT_REC_OFF 3 /* intent lock record offset */
+#define LUSTRE_DLM_LOCKREPLY_OFF 1 /* lockrep offset */
+#define LUSTRE_DLM_REPLY_REC_OFF 2 /* reply record offset */
+/* --------------------------------------------------------------------------------------- */
+
+#define LUSTRE_PTLRPC_MSG_VERSION 0x00000003
+#define LUSTRE_VERSION_MASK 0xffff0000
+#define LUSTRE_OBD_VERSION 0x00010000
+#define LUSTRE_MDS_VERSION 0x00020000
+#define LUSTRE_OST_VERSION 0x00030000
+#define LUSTRE_DLM_VERSION 0x00040000
+#define LUSTRE_LOG_VERSION 0x00050000
+#define LUSTRE_MGS_VERSION 0x00060000
+
+#define LOV_MAGIC_V1 0x0BD10BD0
+#define LOV_MAGIC LOV_MAGIC_V1
+#define LOV_MAGIC_JOIN 0x0BD20BD0
+
+typedef enum {
+	OST_REPLY = 0, /* reply ?
*/ + OST_GETATTR = 1, + OST_SETATTR = 2, + OST_READ = 3, + OST_WRITE = 4, + OST_CREATE = 5, + OST_DESTROY = 6, + OST_GET_INFO = 7, + OST_CONNECT = 8, + OST_DISCONNECT = 9, + OST_PUNCH = 10, + OST_OPEN = 11, + OST_CLOSE = 12, + OST_STATFS = 13, + /* OST_SAN_READ = 14, deprecated */ + /* OST_SAN_WRITE = 15, deprecated */ + OST_SYNC = 16, + OST_SET_INFO = 17, + OST_QUOTACHECK = 18, + OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, + OST_LAST_OPC +} ost_cmd_t ; + + +typedef enum { + OBD_PING = 400, + OBD_LOG_CANCEL, + OBD_QC_CALLBACK, + OBD_LAST_OPC +} obd_cmd_t; +#define OBD_FIRST_OPC OBD_PING + +typedef enum { + MDS_GETATTR = 33, + MDS_GETATTR_NAME = 34, + MDS_CLOSE = 35, + MDS_REINT = 36, + MDS_READPAGE = 37, + MDS_CONNECT = 38, + MDS_DISCONNECT = 39, + MDS_GETSTATUS = 40, + MDS_STATFS = 41, + MDS_PIN = 42, + MDS_UNPIN = 43, + MDS_SYNC = 44, + MDS_DONE_WRITING = 45, + MDS_SET_INFO = 46, + MDS_QUOTACHECK = 47, + MDS_QUOTACTL = 48, + MDS_GETXATTR = 49, + MDS_SETXATTR = 50, + MDS_LAST_OPC +} mds_cmd_t; + +#define IT_OPEN 0x0001 +#define IT_CREAT 0x0002 +#define IT_READDIR 0x0004 +#define IT_GETATTR 0x0008 +#define IT_LOOKUP 0x0010 +#define IT_UNLINK 0x0020 +#define IT_GETXATTR 0x0040 +#define IT_EXEC 0x0080 +#define IT_PIN 0x0100 + + + +#define MDS_FIRST_OPC MDS_GETATTR +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +typedef enum { + REINT_SETATTR = 1, + REINT_CREATE = 2, + REINT_LINK = 3, + REINT_UNLINK = 4, + REINT_RENAME = 5, + REINT_OPEN = 6, + // REINT_CLOSE = 7, + // REINT_WRITE = 8, + REINT_MAX +} mds_reint_t; + +typedef enum { + LDLM_ENQUEUE = 101, + LDLM_CONVERT = 102, + LDLM_CANCEL = 103, + LDLM_BL_CALLBACK = 104, + LDLM_CP_CALLBACK = 105, + LDLM_GL_CALLBACK = 106, + LDLM_LAST_OPC +} ldlm_cmd_t; +#define LDLM_FIRST_OPC LDLM_ENQUEUE + +#define LDLM_FL_LOCK_CHANGED 0x000001 +#define LDLM_FL_BLOCK_GRANTED 0x000002 +#define LDLM_FL_BLOCK_CONV 0x000004 +#define LDLM_FL_BLOCK_WAIT 0x000008 +#define LDLM_FL_CBPENDING 0x000010 +#define LDLM_FL_AST_SENT 0x000020 +#define LDLM_FL_WAIT_NOREPROC 0x000040 +#define LDLM_FL_CANCEL 0x000080 +#define LDLM_FL_REPLAY 0x000100 +#define LDLM_FL_INTENT_ONLY 0x000200 +#define LDLM_FL_LOCAL_ONLY 0x000400 +#define LDLM_FL_FAILED 0x000800 +#define LDLM_FL_HAS_INTENT 0x001000 +#define LDLM_FL_CANCELING 0x002000 +#define LDLM_FL_LOCAL 0x004000 +#define LDLM_FL_WARN 0x008000 +#define LDLM_FL_DISCARD_DATA 0x010000 +#define LDLM_FL_NO_TIMEOUT 0x020000 +#define LDLM_FL_BLOCK_NOWAIT 0x040000 +#define LDLM_FL_TEST_LOCK 0x080000 +#define LDLM_FL_LVB_READY 0x100000 +#define LDLM_FL_KMS_IGNORE 0x200000 +#define LDLM_FL_NO_LRU 0x400000 +#define LDLM_FL_CANCEL_ON_BLOCK 0x800000 +#define LDLM_FL_CP_REQD 0x1000000 +#define LDLM_FL_CLEANED 0x2000000 +#define LDLM_FL_ATOMIC_CB 0x4000000 +#define LDLM_FL_BL_AST 0x10000000 +#define LDLM_FL_BL_DONE 0x20000000 +#define LDLM_FL_DENY_ON_CONTENTION 0x40000000 +#define LDLM_AST_DISCARD_DATA 0x80000000 + + + +#define LDLM_ENQUEUE (101) +#define LDLM_CONVERT (102) +#define LDLM_CANCEL (103) +#define LDLM_BL_CALLBACK (104) +#define LDLM_CP_CALLBACK (105) +#define LDLM_GL_CALLBACK (106) +#define LCK_MINMODE (0) +#define LCK_EX (1) +#define LCK_PW (2) +#define LCK_PR (4) +#define LCK_CW (8) +#define LCK_CR (16) +#define LCK_NL (32) +#define LCK_GROUP (64) +#define LDLM_PLAIN (10) +#define LDLM_EXTENT (11) +#define LDLM_FLOCK (12) +#define LDLM_IBITS (13) +//#define MGS_CONNECT (250) +#define OBD_PING (400) + +/* + * Opcodes for mountconf (mgs and mgc) + */ +typedef enum { + MGS_CONNECT = 250, + MGS_DISCONNECT, + MGS_EXCEPTION, /* node died, 
etc. */ + MGS_TARGET_REG, /* whenever target starts up */ + MGS_TARGET_DEL, + MGS_SET_INFO, + MGS_LAST_OPC +} mgs_cmd_t; +#define MGS_FIRST_OPC MGS_CONNECT + +/* llog protocol */ +typedef enum { + LLOG_ORIGIN_HANDLE_CREATE = 501, + LLOG_ORIGIN_HANDLE_NEXT_BLOCK = 502, + LLOG_ORIGIN_HANDLE_READ_HEADER = 503, + LLOG_ORIGIN_HANDLE_WRITE_REC = 504, + LLOG_ORIGIN_HANDLE_CLOSE = 505, + LLOG_ORIGIN_CONNECT = 506, + LLOG_CATINFO = 507, /* for lfs catinfo */ + LLOG_ORIGIN_HANDLE_PREV_BLOCK = 508, + LLOG_ORIGIN_HANDLE_DESTROY = 509, /* for destroy llog object*/ + LLOG_LAST_OPC +} llog_cmd_t; + +#define LLOG_FIRST_OPC LLOG_ORIGIN_HANDLE_CREATE +/*flag for the LLOG*/ +#define LLOG_OP_MAGIC 0x10600000 +#define LLOG_OP_MASK 0xfff00000 + +typedef enum { + LLOG_PAD_MAGIC = LLOG_OP_MAGIC | 0x00000, + OST_SZ_REC = LLOG_OP_MAGIC | 0x00f00, + OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, + MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_UNLINK, + MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_SETATTR, + OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, + PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, /* obsolete */ + LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, + LLOG_JOIN_REC = LLOG_OP_MAGIC | 0x50000, + LLOG_HDR_MAGIC = LLOG_OP_MAGIC | 0x45539, + LLOG_LOGID_MAGIC = LLOG_OP_MAGIC | 0x4553b, +} llog_op_type; + +/* LLOG flag */ +/*defined in lustre/include/lustre/lustre_idl.h*/ +#define LLOG_F_ZAP_WHEN_EMPTY 0x1 +#define LLOG_F_IS_CAT 0x2 +#define LLOG_F_IS_PLAIN 0x4 + +#define PTL_RPC_MSG_REQUEST 4711 +#define PTL_RPC_MSG_ERR 4712 +#define PTL_RPC_MSG_REPLY 4713 + + +/* Ett declarations */ +static gint ett_lustre_llog_log_llh_flags = -1 ; +static gint ett_lustre = -1; +static gint ett_lustre_lustre_handle_cookie = -1; +static gint ett_lustre_lustre_msg_v1 = -1; +static gint ett_lustre_lustre_handle_v1 = -1; +static gint ett_lustre_lustre_msg_v2 = -1; +static gint ett_lustre_ptlrpc_body = -1; +static gint ett_lustre_lustre_handle_v2 = -1; +static gint ett_lustre_obd_connect_data = -1; +static gint ett_lustre_lov_mds_md_v1 = -1; +static gint ett_lustre_lov_ost_data_v1 = -1; +static gint ett_lustre_obd_statfs = -1; +static gint ett_lustre_obd_ioobj = -1; +static gint ett_lustre_niobuf_remote = -1; +static gint ett_lustre_ost_lvb = -1; +static gint ett_lustre_ll_fid = -1; +static gint ett_lustre_mds_status_req = -1; +static gint ett_lustre_mds_body = -1; +static gint ett_lustre_obd_quotactl = -1; +static gint ett_lustre_obd_dqinfo = -1; +static gint ett_lustre_obd_dqblk = -1; +static gint ett_lustre_quota_adjust_qunit = -1; +static gint ett_lustre_mds_rec_setattr = -1; +static gint ett_lustre_mds_rec_create = -1; +static gint ett_lustre_mds_rec_join = -1; +static gint ett_lustre_mds_rec_link = -1; +static gint ett_lustre_mds_rec_unlink = -1; +static gint ett_lustre_mds_rec_rename = -1; +static gint ett_lustre_lov_desc = -1; +static gint ett_lustre_obd_uuid = -1; +static gint ett_lustre_ldlm_res_id = -1; +static gint ett_lustre_ldlm_extent = -1; +static gint ett_lustre_ldlm_inodebits = -1; +static gint ett_lustre_ldlm_flock = -1; +static gint ett_lustre_ldlm_intent_opc = -1; +static gint ett_lustre_ldlm_resource_desc = -1; +static gint ett_lustre_ldlm_lock_desc = -1; +static gint ett_lustre_ldlm_request = -1; +static gint ett_lustre_lustre_handle = -1; +static gint ett_lustre_ldlm_reply = -1; +static gint ett_lustre_mgs_send_param = -1; +static gint ett_lustre_mgs_target_info = -1; +static gint ett_lustre_cfg_marker = -1; +static gint ett_lustre_llog_catid = -1; +static gint ett_lustre_lov_mds_md_join 
= -1; +static gint ett_lustre_llog_rec_hdr = -1; +static gint ett_lustre_llog_logid_rec = -1; +static gint ett_lustre_llog_logid = -1; +static gint ett_lustre_llog_rec_tail = -1; +static gint ett_lustre_lov_mds_md = -1; +static gint ett_lustre_llog_array_rec = -1; +static gint ett_lustre_mds_extent_desc = -1; +static gint ett_lustre_llog_create_rec = -1; +static gint ett_lustre_llog_orphan_rec = -1; +static gint ett_lustre_llog_unlink_rec = -1; +static gint ett_lustre_llog_setattr_rec = -1; +static gint ett_lustre_llog_size_change_rec = -1; +static gint ett_lustre_llog_gen_rec = -1; +static gint ett_lustre_llog_log_hdr = -1; +static gint ett_lustre_llog_cookie = -1; +static gint ett_lustre_llogd_body = -1; +static gint ett_lustre_llogd_conn_body = -1; +static gint ett_lustre_llog_gen = -1; +static gint ett_lustre_lov_user_md_join = -1; +static gint ett_lustre_lov_user_ost_data_join = -1; +static gint ett_lustre_obdo = -1; +static gint ett_lustre_ost_body = -1; +static gint ett_lustre_qunit_data = -1; +static gint ett_lustre_qunit_data_old2 = -1; +static gint ett_lustre_qunit_data_old = -1; +static gint ett_lustre_ldlm_lock_flags = -1 ; + +/* -----------------------------------------------*/ +/* Header field declarations */ +static int hf_lustre_ptlrpc_body_pb = -1 ; +static int hf_lustre_mds_body = -1 ; +static int hf_lustre_ost_body = -1 ; +static int hf_lustre_obd_statfs = -1 ; +static int hf_lustre_obd_quotactl = -1 ; +static int hf_lustre_quota_adjust_qunit = -1 ; +static int hf_lustre_extra_padding = -1 ; +static int hf_lustre_ldlm_reply = -1 ; +static int hf_lustre_ldlm_request = -1 ; +static int hf_lustre_mds_rec_create = -1 ; +static int hf_lustre_mds_rec_link = -1 ; +static int hf_lustre_mds_rec_unlink = -1 ; +static int hf_lustre_obd_uuid = -1 ; +static int hf_lustre_obd_connect_data = -1 ; +static int hf_lustre_ldlm_intent = -1; +static int hf_lustre_lov_user_md_join = -1 ; +static int hf_lustre_obd_ioobj = -1 ; +static int hf_lustre_niobuf_remote = -1 ; +static int hf_lustre_ost_key = -1 ; +static int hf_lustre_nio= -1 ; +static int hf_lustre_ost_val= -1; +static int hf_lustre_llogd_body = -1; +static int hf_lustre_llogd_log_hdr= -1; +static int hf_lustre_llog_logid_rec =-1 ; +static int hf_lustre_llogd_chunk= -1; +static int hf_lustre_llogd_keyword= -1; +static int hf_lustre_llogd_client= -1; +static int hf_lustre_llogd_name= -1; + +static int hf_lustre_mds_xattr_name = -1; +static int hf_lustre_lov_mds_md_v1= -1; +static int hf_lustre_llog_cookie= -1; +static int hf_lustre_mds_md_data= -1; +static int hf_lustre_mds_reint_opcode= -1; +static int hf_lustre_mds_xattr_eadata = -1; +static int hf_lustre_lov_mds_md_join = -1 ; + +static int hf_lustre_reint_name= -1; +static int hf_lustre_reint_old_name= -1; +static int hf_lustre_reint_new_name= -1; + +static int hf_lustre_mgs_target_info = -1 ; +static int hf_lustre_mgs_send_param = -1; + +static int hf_lustre_ost_lvb = -1 ; + +static int hf_lustre_ldlm_fl_lock_changed = -1; +static int hf_lustre_ldlm_fl_block_granted = -1; +static int hf_lustre_ldlm_fl_block_conv = -1; +static int hf_lustre_ldlm_fl_block_wait = -1; +static int hf_lustre_ldlm_fl_cbpending = -1; +static int hf_lustre_ldlm_fl_ast_sent = -1; +static int hf_lustre_ldlm_fl_wait_noreproc = -1; +static int hf_lustre_ldlm_fl_cancel = -1; +static int hf_lustre_ldlm_fl_replay = -1; +static int hf_lustre_ldlm_fl_intent_only = -1; +static int hf_lustre_ldlm_fl_local_only = -1; +static int hf_lustre_ldlm_fl_failed = -1; +static int hf_lustre_ldlm_fl_has_intent = -1; +static int 
hf_lustre_ldlm_fl_canceling = -1; +static int hf_lustre_ldlm_fl_local = -1; +static int hf_lustre_ldlm_fl_warn = -1; +static int hf_lustre_ldlm_fl_discard_data = -1; +static int hf_lustre_ldlm_fl_no_timeout = -1; +static int hf_lustre_ldlm_fl_block_nowait = -1; +static int hf_lustre_ldlm_fl_test_lock = -1; +static int hf_lustre_ldlm_fl_lvb_ready = -1; +static int hf_lustre_ldlm_fl_kms_ignore = -1; +static int hf_lustre_ldlm_fl_no_lru = -1; +static int hf_lustre_ldlm_fl_cancel_on_block = -1; +static int hf_lustre_ldlm_fl_cp_reqd = -1; +static int hf_lustre_ldlm_fl_cleaned = -1; +static int hf_lustre_ldlm_fl_atomic_cb = -1; +static int hf_lustre_ldlm_fl_bl_ast = -1; +static int hf_lustre_ldlm_fl_bl_done = -1; +static int hf_lustre_ldlm_fl_deny_on_contention = -1; +static int hf_lustre_ldlm_ast_discard_data = -1; + +static int hf_lustre_mds_body_ctime = -1; +static int hf_lustre_mds_body_fid1 = -1; +static int hf_lustre_mds_body_nlink = -1; +static int hf_lustre_mds_body_flags = -1; +static int hf_lustre_mds_body_fsgid = -1; +static int hf_lustre_mds_body_mtime = -1; +static int hf_lustre_mds_body_uid = -1; +static int hf_lustre_mds_body_mode = -1; +static int hf_lustre_mds_body_max_cookiesize = -1; +static int hf_lustre_mds_body_io_epoch = -1; +static int hf_lustre_mds_body_ino = -1; +static int hf_lustre_mds_body_fid2 = -1; +static int hf_lustre_mds_body_padding_4 = -1; +static int hf_lustre_mds_body_aclsize = -1; +static int hf_lustre_mds_body_valid = -1; +static int hf_lustre_mds_body_generation = -1; +static int hf_lustre_mds_body_atime = -1; +static int hf_lustre_mds_body_handle = -1; +static int hf_lustre_mds_body_max_mdsize = -1; +static int hf_lustre_mds_body_rdev = -1; +static int hf_lustre_mds_body_blocks = -1; +static int hf_lustre_mds_body_fsuid = -1; +static int hf_lustre_mds_body_gid = -1; +static int hf_lustre_lustre_handle_cookie = -1; +static int hf_lustre_mds_body_suppgid = -1; +static int hf_lustre_mds_body_size = -1; +static int hf_lustre_mds_body_eadatasize = -1; +static int hf_lustre_mds_body_capability = -1; + +static int hf_lustre_ptlrpc_body_pb_last_committed = -1; +static int hf_lustre_ptlrpc_body_pb_version = -1; +static int hf_lustre_lustre_msg_v1_lm_bufcount = -1; +static int hf_lustre_obd_ioobj_ioo_id = -1; +static int hf_lustre_ptlrpc_body_pb_slv = -1; +static int hf_lustre_lustre_msg_v1_lm_handle = -1; +static int hf_lustre_ost_lvb_lvb_atime = -1; +static int hf_lustre_ptlrpc_body_pb_timeout = -1; +static int hf_lustre_obd_statfs_os_bavail = -1; +static int hf_lustre_obd_statfs_os_bsize = -1; +static int hf_lustre_lustre_msg_v2_lm_repsize = -1; +static int hf_lustre_lov_mds_md_v1_lmm_stripe_size = -1; +static int hf_lustre_lustre_msg_v1_lm_last_xid = -1; +static int hf_lustre_ll_fid_f_type = -1; +static int hf_lustre_lustre_msg_v2_lm_cksum = -1; +static int hf_lustre_lustre_msg_v2_lm_buflens = -1; +static int hf_lustre_lustre_msg_v1_lm_status = -1; +static int hf_lustre_lustre_msg_v1_lm_type = -1; +static int hf_lustre_niobuf_remote_len = -1; +static int hf_lustre_lov_mds_md_v1_lmm_magic = -1; +static int hf_lustre_ptlrpc_body_pb_op_flags = -1; +static int hf_lustre_ost_lvb_lvb_ctime = -1; +static int hf_lustre_ptlrpc_body_pb_type = -1; +static int hf_lustre_obd_connect_data_ocd_nllg = -1; +static int hf_lustre_obd_connect_data_ocd_nllu = -1; +static int hf_lustre_ll_fid_generation = -1; +static int hf_lustre_ost_lvb_lvb_mtime = -1; +static int hf_lustre_obd_connect_data_ocd_ibits_known = -1; +static int hf_lustre_lustre_msg_v2_lm_padding_3 = -1; +static int 
hf_lustre_ptlrpc_body_pb_flags = -1; +static int hf_lustre_obd_statfs_os_spare4 = -1; +static int hf_lustre_obd_connect_data_ocd_group = -1; +static int hf_lustre_lov_ost_data_v1_l_object_gr = -1; +static int hf_lustre_lov_mds_md_v1_lmm_object_gr = -1; +static int hf_lustre_obd_connect_data_ocd_brw_size = -1; +static int hf_lustre_ptlrpc_body_pb_limit = -1; +static int hf_lustre_obd_statfs_os_maxbytes = -1; +static int hf_lustre_obd_statfs_os_spare5 = -1; +static int hf_lustre_lustre_msg_v2_lm_flags = -1; +static int hf_lustre_obd_statfs_os_ffree = -1; +static int hf_lustre_obd_statfs_os_files = -1; +static int hf_lustre_lov_mds_md_v1_lmm_stripe_count = -1; +static int hf_lustre_lustre_msg_v1_lm_flags = -1; +static int hf_lustre_lustre_msg_v1_lm_last_committed = -1; +static int hf_lustre_obd_statfs_os_spare9 = -1; +static int hf_lustre_obd_connect_data_ocd_index = -1; +static int hf_lustre_opnum = -1; +static int hf_lustre_lustre_msg_v1_lm_buflens = -1; +static int hf_lustre_obd_statfs_os_spare1 = -1; +static int hf_lustre_obd_statfs_os_spare8 = -1; +static int hf_lustre_lustre_msg_v1_lm_conn_cnt = -1; +static int hf_lustre_ptlrpc_body_pb_transno = -1; +static int hf_lustre_ptlrpc_body_pb_service_time = -1; +static int hf_lustre_ptlrpc_body_pb_conn_cnt = -1; +static int hf_lustre_ptlrpc_body_pb_opc = -1; +static int hf_lustre_obd_connect_data_ocd_connect_flags = -1; +static int hf_lustre_lov_ost_data_v1_l_object_id = -1; +static int hf_lustre_lov_ost_data_v1_l_ost_gen = -1; +static int hf_lustre_obd_statfs_os_bfree = -1; +static int hf_lustre_obd_connect_data_ocd_version = -1; +static int hf_lustre_lov_mds_md_v1_lmm_objects = -1; +static int hf_lustre_mds_status_req_flags = -1; +static int hf_lustre_obd_statfs_os_namelen = -1; +static int hf_lustre_obd_statfs_os_blocks = -1; +static int hf_lustre_lustre_msg_v2_lm_secflvr = -1; +static int hf_lustre_lustre_msg_v1_lm_transno = -1; +static int hf_lustre_lov_mds_md_v1_lmm_pattern = -1; +static int hf_lustre_lustre_msg_v1_lm_opc = -1; +static int hf_lustre_obd_connect_data_ocd_grant = -1; +static int hf_lustre_obd_ioobj_ioo_bufcnt = -1; +static int hf_lustre_lustre_msg_v1_lm_version = -1; +static int hf_lustre_obd_statfs_os_spare7 = -1; +static int hf_lustre_obd_statfs_os_fsid = -1; +static int hf_lustre_obd_connect_data_ocd_cksum_types = -1; +static int hf_lustre_ost_lvb_lvb_size = -1; +static int hf_lustre_obd_statfs_os_type = -1; +static int hf_lustre_obd_statfs_os_spare6 = -1; +static int hf_lustre_obd_statfs_os_state = -1; +static int hf_lustre_obd_statfs_os_spare3 = -1; +static int hf_lustre_lustre_msg_v2_lm_magic = -1; +static int hf_lustre_lov_mds_md_v1_lmm_object_id = -1; +static int hf_lustre_ptlrpc_body_pb_last_seen = -1; +static int hf_lustre_obd_ioobj_ioo_type = -1; +static int hf_lustre_ptlrpc_body_pb_last_xid = -1; +static int hf_lustre_ptlrpc_body_pb_status = -1; +static int hf_lustre_niobuf_remote_flags = -1; +static int hf_lustre_ll_fid_id = -1; +static int hf_lustre_ost_lvb_lvb_blocks = -1; +static int hf_lustre_lustre_msg_v2_lm_padding_2 = -1; +static int hf_lustre_obd_connect_data_padding1 = -1; +static int hf_lustre_lov_ost_data_v1_l_ost_idx = -1; +static int hf_lustre_obd_connect_data_padding2 = -1; +static int hf_lustre_obd_ioobj_ioo_gr = -1; +static int hf_lustre_niobuf_remote_offset=-1; +static int hf_lustre_mds_status_req_repbuf = -1; +static int hf_lustre_obd_statfs_os_spare2 = -1; +static int hf_lustre_lustre_msg_v2_lm_bufcount = -1; +static int hf_lustre_ptlrpc_body_pb_handle = -1; +static int 
hf_lustre_obd_connect_data_ocd_transno = -1; +static int hf_lustre_lustre_msg_v1_lm_magic = -1; +static int hf_lustre_llog_unlink_rec_lur_tail = -1; +static int hf_lustre_llog_size_change_rec_lsc_io_epoch = -1; +static int hf_lustre_mgs_target_info_mti_flags = -1; +static int hf_lustre_ldlm_reply_lock_policy_res1 = -1; +static int hf_lustre_mds_rec_link_lk_fsuid = -1; +static int hf_lustre_llogd_body_lgd_len = -1; +static int hf_lustre_qunit_data_old_qd_id = -1; +static int hf_lustre_lov_user_md_join_lmm_stripe_count = -1; +static int hf_lustre_llog_logid_rec_padding1 = -1; +static int hf_lustre_quota_adjust_qunit_padding1 = -1; +static int hf_lustre_llog_size_change_rec_lsc_fid = -1; +static int hf_lustre_llog_rec_hdr_padding = -1; +static int hf_lustre_mds_rec_create_cr_time = -1; +static int hf_lustre_mds_rec_create_cr_padding_1 = -1; +static int hf_lustre_obdo_o_nlink = -1; +static int hf_lustre_mds_rec_rename_rn_suppgid2 = -1; +static int hf_lustre_mds_rec_link_lk_padding_4 = -1; +static int hf_lustre_mds_rec_link_lk_cap = -1; +static int hf_lustre_ldlm_extent_gid = -1; +static int hf_lustre_obdo_o_uid = -1; +static int hf_lustre_mds_rec_create_cr_padding_5 = -1; +static int hf_lustre_obdo_o_valid = -1; +static int hf_lustre_ldlm_reply_lock_flags = -1; +static int hf_lustre_mds_rec_link_lk_padding_3 = -1; +static int hf_lustre_obdo_o_misc = -1; +static int hf_lustre_mds_rec_setattr_sa_suppgid = -1; +static int hf_lustre_mds_rec_setattr_sa_attr_flags = -1; +static int hf_lustre_mds_rec_rename_rn_padding_2 = -1; +static int hf_lustre_ldlm_request_lock_handle = -1; +static int hf_lustre_llog_logid_lgl_oid = -1; +static int hf_lustre_ldlm_inodebits_bits = -1; +static int hf_lustre_llog_log_hdr_llh_count = -1; +static int hf_lustre_mds_rec_unlink_ul_padding_4 = -1; +static int hf_lustre_lov_user_md_join_lmm_stripe_size = -1; +static int hf_lustre_llog_gen_rec_lgr_tail = -1; +static int hf_lustre_llog_catid_lci_padding3 = -1; +static int hf_lustre_qunit_data_qd_qunit = -1; +static int hf_lustre_mds_rec_unlink_ul_fid1 = -1; +static int hf_lustre_llog_setattr_rec_padding = -1; +static int hf_lustre_mds_rec_rename_rn_opcode = -1; +static int hf_lustre_mds_rec_create_cr_flags = -1; +static int hf_lustre_mds_rec_rename_rn_fid1 = -1; +static int hf_lustre_mds_extent_desc_med_start = -1; +static int hf_lustre_llog_cookie_lgc_lgl = -1; +static int hf_lustre_obd_quotactl_qc_dqinfo = -1; +static int hf_lustre_llog_log_hdr_llh_bitmap = -1; +static int hf_lustre_mds_rec_setattr_sa_size = -1; +static int hf_lustre_mds_rec_rename_rn_suppgid1 = -1; +static int hf_lustre_obd_quotactl_qc_stat = -1; +static int hf_lustre_qunit_data_old2_qd_id = -1; +static int hf_lustre_llog_logid_rec_padding2 = -1; +static int hf_lustre_mds_rec_unlink_ul_mode = -1; +static int hf_lustre_llog_orphan_rec_lor_tail = -1; +static int hf_lustre_llog_logid_rec_padding5 = -1; +static int hf_lustre_mds_rec_create_cr_fsgid = -1; +static int hf_lustre_mds_rec_join_jr_fid = -1; +static int hf_lustre_ldlm_intent_opc = -1; +static int hf_lustre_llog_rec_hdr_lrh_type = -1; +static int hf_lustre_mds_rec_link_lk_fsgid = -1; +static int hf_lustre_llog_rec_hdr_lrh_len = -1; +static int hf_lustre_llog_setattr_rec_lsr_uid = -1; +static int hf_lustre_lov_desc_ld_padding_1 = -1; +static int hf_lustre_obdo_o_padding_4 = -1; +static int hf_lustre_mgs_target_info_padding = -1; +static int hf_lustre_obd_quotactl_qc_dqblk = -1; +static int hf_lustre_llogd_conn_body_lgdc_gen = -1; +static int hf_lustre_mds_rec_create_cr_padding_2 = -1; +static int 
hf_lustre_mds_rec_setattr_sa_ctime = -1; +static int hf_lustre_llog_log_hdr_llh_tail = -1; +static int hf_lustre_obdo_o_size = -1; +static int hf_lustre_mds_rec_setattr_sa_cap = -1; +static int hf_lustre_ldlm_extent_start = -1; +static int hf_lustre_mds_rec_unlink_ul_opcode = -1; +static int hf_lustre_llog_size_change_rec_lsc_hdr = -1; +static int hf_lustre_mds_rec_unlink_ul_time = -1; +static int hf_lustre_lov_user_ost_data_join_l_extent_start = -1; +static int hf_lustre_lov_user_md_join_lmm_tree_id = -1; +static int hf_lustre_llog_create_rec_lcr_tail = -1; +static int hf_lustre_mds_rec_setattr_sa_mode = -1; +static int hf_lustre_llog_logid_lgl_ogr = -1; +static int hf_lustre_llog_create_rec_lcr_hdr = -1; +static int hf_lustre_llog_cookie_lgc_padding = -1; +static int hf_lustre_mds_rec_create_cr_cap = -1; +static int hf_lustre_qunit_data_old_qd_type = -1; +static int hf_lustre_ldlm_flock_blocking_export = -1; +static int hf_lustre_mds_rec_setattr_sa_gid = -1; +static int hf_lustre_lov_desc_ld_pattern = -1; +static int hf_lustre_qunit_data_qd_id = -1; +static int hf_lustre_mgs_target_info_mti_fsname = -1; +static int hf_lustre_lov_user_md_join_lmm_object_gr = -1; +static int hf_lustre_ldlm_request_lock_flags = -1; +static int hf_lustre_obdo_o_mode = -1; +static int hf_lustre_mgs_target_info_mti_svname = -1; +static int hf_lustre_llogd_body_lgd_logid = -1; +static int hf_lustre_mds_rec_create_cr_opcode = -1; +static int hf_lustre_llog_log_hdr_llh_size = -1; +static int hf_lustre_llog_create_rec_padding = -1; +static int hf_lustre_obdo_o_handle = -1; +static int hf_lustre_obdo_o_atime = -1; +static int hf_lustre_quota_adjust_qunit_qaq_id = -1; +static int hf_lustre_mds_rec_rename_rn_fid2 = -1; +static int hf_lustre_mds_rec_create_cr_replayfid = -1; +static int hf_lustre_ldlm_lock_desc_l_policy_data = -1; +static int hf_lustre_mds_rec_link_lk_suppgid1 = -1; +static int hf_lustre_obd_quotactl_qc_cmd = -1; +static int hf_lustre_lov_user_md_join_lmm_object_id = -1; +static int hf_lustre_mds_rec_rename_rn_padding_3 = -1; +static int hf_lustre_qunit_data_padding = -1; +static int hf_lustre_lov_user_md_join_lmm_objects = -1; +static int hf_lustre_quota_adjust_qunit_qaq_flags = -1; +static int hf_lustre_lov_user_ost_data_join_l_object_gr = -1; +static int hf_lustre_ldlm_lock_desc_l_granted_mode = -1; +static int hf_lustre_obdo_o_gr = -1; +static int hf_lustre_mds_rec_unlink_ul_padding_2 = -1; +static int hf_lustre_obdo_o_gid = -1; +static int hf_lustre_llog_catid_lci_logid = -1; +static int hf_lustre_llog_rec_tail_lrt_index = -1; +static int hf_lustre_obdo_o_mds = -1; +static int hf_lustre_mds_extent_desc_med_lmm = -1; +static int hf_lustre_lov_desc_ld_default_stripe_count = -1; +static int hf_lustre_ldlm_resource_desc_lr_padding = -1; +static int hf_lustre_cfg_marker_cm_vers = -1; +static int hf_lustre_mds_rec_create_cr_fid = -1; +static int hf_lustre_llog_unlink_rec_lur_hdr = -1; +static int hf_lustre_llogd_body_lgd_index = -1; +static int hf_lustre_cfg_marker_cm_tgtname = -1; +static int hf_lustre_mds_rec_unlink_ul_padding_1 = -1; +static int hf_lustre_mds_rec_unlink_ul_cap = -1; +static int hf_lustre_llog_array_rec_lmr_med = -1; +static int hf_lustre_llog_setattr_rec_lsr_ogen = -1; +static int hf_lustre_mds_rec_create_cr_padding_3 = -1; +static int hf_lustre_llog_logid_rec_lid_hdr = -1; +static int hf_lustre_lov_user_ost_data_join_l_ost_idx = -1; +static int hf_lustre_obdo_o_easize = -1; +static int hf_lustre_lov_user_md_join_lmm_array_id = -1; +static int hf_lustre_ost_body_oa = -1; +static int 
hf_lustre_llog_logid_rec_padding3 = -1; +static int hf_lustre_llog_log_hdr_llh_flags = -1; +static int hf_lustre_llog_setattr_rec_lsr_oid = -1; +static int hf_lustre_mds_rec_create_cr_mode = -1; +static int hf_lustre_llog_size_change_rec_padding = -1; +static int hf_lustre_mgs_target_info_mti_config_ver = -1; +static int hf_lustre_cfg_marker_cm_createtime = -1; +static int hf_lustre_qunit_data_old_qd_count = -1; +static int hf_lustre_lov_mds_md_join_lmmj_array_id = -1; +static int hf_lustre_mds_rec_setattr_sa_uid = -1; +static int hf_lustre_llog_catid_lci_padding1 = -1; +static int hf_lustre_mds_rec_setattr_sa_atime = -1; +static int hf_lustre_lov_desc_ld_active_tgt_count = -1; +static int hf_lustre_obdo_o_lcookie = -1; +static int hf_lustre_llog_gen_rec_lgr_gen = -1; +static int hf_lustre_lov_user_ost_data_join_l_object_id = -1; +static int hf_lustre_obdo_o_id = -1; +static int hf_lustre_mgs_target_info_mti_uuid = -1; +static int hf_lustre_mds_rec_link_lk_padding_1 = -1; +static int hf_lustre_llog_rec_hdr_lrh_index = -1; +static int hf_lustre_llog_setattr_rec_lsr_hdr = -1; +static int hf_lustre_mgs_target_info_mti_stripe_index = -1; +static int hf_lustre_llog_gen_conn_cnt = -1; +static int hf_lustre_obdo_o_padding_6 = -1; +static int hf_lustre_mds_rec_create_cr_suppgid = -1; +static int hf_lustre_llog_cookie_lgc_index = -1; +static int hf_lustre_lov_desc_ld_uuid = -1; +static int hf_lustre_llog_create_rec_lcr_oid = -1; +static int hf_lustre_ldlm_reply_lock_desc = -1; +static int hf_lustre_lov_desc_ld_padding_0 = -1; +static int hf_lustre_llog_unlink_rec_lur_ogen = -1; +static int hf_lustre_llog_orphan_rec_lor_hdr = -1; +static int hf_lustre_mds_rec_rename_rn_fsuid = -1; +static int hf_lustre_cfg_marker_cm_flags = -1; +static int hf_lustre_obdo_o_padding_3 = -1; +static int hf_lustre_lov_user_ost_data_join_l_ost_gen = -1; +static int hf_lustre_mds_rec_create_cr_fsuid = -1; +static int hf_lustre_mds_rec_unlink_ul_fsgid = -1; +static int hf_lustre_ldlm_request_lock_desc = -1; +static int hf_lustre_lov_user_md_join_lmm_pattern = -1; +static int hf_lustre_mds_rec_unlink_ul_fsuid = -1; +static int hf_lustre_mds_rec_link_lk_suppgid2 = -1; +static int hf_lustre_llog_orphan_rec_padding = -1; +static int hf_lustre_lov_user_md_join_lmm_tree_gen = -1; +static int hf_lustre_obdo_o_flags = -1; +static int hf_lustre_mgs_target_info_mti_params = -1; +static int hf_lustre_llog_logid_lgl_ogen = -1; +static int hf_lustre_mds_rec_setattr_sa_valid = -1; +static int hf_lustre_cfg_marker_cm_comment = -1; +static int hf_lustre_llog_unlink_rec_lur_oid = -1; +static int hf_lustre_qunit_data_qd_count = -1; +static int hf_lustre_mds_rec_rename_rn_padding_1 = -1; +static int hf_lustre_obdo_o_mtime = -1; +static int hf_lustre_lov_mds_md_join_lmmj_md = -1; +static int hf_lustre_mds_rec_rename_rn_fsgid = -1; +static int hf_lustre_mds_rec_rename_rn_cap = -1; +static int hf_lustre_obdo_o_blksize = -1; +static int hf_lustre_mds_rec_unlink_ul_suppgid = -1; +static int hf_lustre_ldlm_res_id_name = -1; +static int hf_lustre_mds_rec_link_lk_time = -1; +static int hf_lustre_ldlm_reply_lock_handle = -1; +static int hf_lustre_mds_rec_unlink_ul_padding_3 = -1; +static int hf_lustre_llogd_body_lgd_saved_index = -1; +static int hf_lustre_mds_rec_join_jr_headsize = -1; +static int hf_lustre_mds_rec_rename_rn_padding_4 = -1; +static int hf_lustre_qunit_data_old_qd_isblk = -1; +static int hf_lustre_obdo_o_blocks = -1; +static int hf_lustre_lov_desc_ld_padding_2 = -1; +static int hf_lustre_mds_rec_link_lk_fid2 = -1; +static int 
hf_lustre_llog_logid_rec_lid_tail = -1; +static int hf_lustre_obdo_o_grant = -1; +static int hf_lustre_obdo_o_padding_2 = -1; +static int hf_lustre_quota_adjust_qunit_qaq_iunit_sz = -1; +static int hf_lustre_llog_unlink_rec_padding = -1; +static int hf_lustre_ldlm_lock_desc_l_req_mode = -1; +static int hf_lustre_ldlm_extent_end = -1; +static int hf_lustre_llog_gen_rec_lgr_hdr = -1; +static int hf_lustre_llog_orphan_rec_lor_ogen = -1; +static int hf_lustre_lov_user_md_join_lmm_extent_count = -1; +static int hf_lustre_mds_extent_desc_med_len = -1; +static int hf_lustre_llogd_body_lgd_llh_flags = -1; +static int hf_lustre_llog_array_rec_lmr_hdr = -1; +static int hf_lustre_llog_log_hdr_llh_cat_idx = -1; +static int hf_lustre_llog_log_hdr_llh_bitmap_offset=-1; +static int hf_lustre_llog_orphan_rec_lor_oid = -1; +static int hf_lustre_ldlm_reply_lock_padding = -1; +static int hf_lustre_obd_quotactl_qc_id = -1; +static int hf_lustre_mds_rec_create_cr_padding_4 = -1; +static int hf_lustre_llog_logid_rec_padding4 = -1; +static int hf_lustre_mds_rec_link_lk_padding_2 = -1; +static int hf_lustre_llog_setattr_rec_lsr_gid = -1; +static int hf_lustre_lov_user_md_join_lmm_magic = -1; +static int hf_lustre_obd_quotactl_qc_type = -1; +static int hf_lustre_cfg_marker_padding = -1; +static int hf_lustre_mgs_target_info_mti_nids = -1; +static int hf_lustre_lov_user_ost_data_join_l_extent_end = -1; +static int hf_lustre_obdo_o_stripe_idx = -1; +static int hf_lustre_llogd_conn_body_lgdc_logid = -1; +static int hf_lustre_mds_rec_setattr_sa_fsuid = -1; +static int hf_lustre_ldlm_flock_blocking_pid = -1; +static int hf_lustre_lov_desc_ld_tgt_count = -1; +static int hf_lustre_llogd_body_lgd_cur_offset=-1; +static int hf_lustre_llog_create_rec_lcr_ogen = -1; +static int hf_lustre_qunit_data_old2_qd_count = -1; +static int hf_lustre_qunit_data_old2_qd_flags = -1; +static int hf_lustre_ldlm_flock_start = -1; +static int hf_lustre_quota_adjust_qunit_qaq_bunit_sz = -1; +static int hf_lustre_llog_array_rec_lmr_tail = -1; +static int hf_lustre_ldlm_flock_pid = -1; +static int hf_lustre_lov_desc_ld_default_stripe_size = -1; +static int hf_lustre_mds_rec_setattr_sa_opcode = -1; +static int hf_lustre_llog_log_hdr_llh_tgtuuid = -1; +static int hf_lustre_mds_rec_link_lk_fid1 = -1; +static int hf_lustre_cfg_marker_cm_step = -1; +static int hf_lustre_mgs_send_param_mgs_param = -1; +static int hf_lustre_llog_create_rec_lcr_fid = -1; +static int hf_lustre_lov_desc_ld_default_stripe_offset=-1; +static int hf_lustre_ldlm_resource_desc_lr_name = -1; +static int hf_lustre_llog_rec_tail_lrt_len = -1; +static int hf_lustre_mds_rec_setattr_sa_mtime = -1; +static int hf_lustre_llog_log_hdr_llh_timestamp = -1; +static int hf_lustre_llog_catid_lci_padding2 = -1; +static int hf_lustre_llogd_conn_body_lgdc_ctxt_idx = -1; +static int hf_lustre_cfg_marker_cm_canceltime = -1; +static int hf_lustre_mgs_target_info_mti_lustre_ver = -1; +static int hf_lustre_obdo_o_padding_1 = -1; +static int hf_lustre_qunit_data_qd_flags = -1; +static int hf_lustre_llog_logid_rec_lid_id = -1; +static int hf_lustre_obdo_o_generation = -1; +static int hf_lustre_llog_gen_mnt_cnt = -1; +static int hf_lustre_llog_size_change_rec_lsc_tail = -1; +static int hf_lustre_obdo_o_padding_5 = -1; +static int hf_lustre_ldlm_lock_desc_l_resource = -1; +static int hf_lustre_mds_rec_rename_rn_time = -1; +static int hf_lustre_mds_rec_create_cr_rdev = -1; +static int hf_lustre_obdo_o_fid = -1; +static int hf_lustre_mds_rec_setattr_sa_fid = -1; +static int 
hf_lustre_ldlm_request_lock_count = -1; +static int hf_lustre_ldlm_flock_end = -1; +static int hf_lustre_mds_rec_link_lk_opcode = -1; +static int hf_lustre_mgs_target_info_mti_nid_count = -1; +static int hf_lustre_obdo_o_ctime = -1; +static int hf_lustre_ldlm_reply_lock_policy_res2 = -1; +static int hf_lustre_llogd_body_lgd_ctxt_idx = -1; +static int hf_lustre_mds_rec_unlink_ul_fid2 = -1; +static int hf_lustre_lov_desc_ld_qos_maxage = -1; +static int hf_lustre_ldlm_resource_desc_lr_type = -1; +static int hf_lustre_llog_setattr_rec_lsr_tail = -1; +static int hf_lustre_llog_cookie_lgc_subsys = -1; +static int hf_lustre_llog_log_hdr_llh_hdr = -1; +static int hf_lustre_mds_rec_setattr_sa_fsgid = -1; +static int hf_lustre_mds_rec_setattr_sa_padding = -1; +static int hf_lustre_lov_mds_md_join_lmmj_extent_count = -1; +static int hf_lustre_llog_log_hdr_llh_reserved = -1; + +/* Header field declarations for field from lustre_user.h*/ +static int hf_lustre_obd_dqinfo_dqi_valid = -1; +static int hf_lustre_obd_dqblk_dqb_isoftlimit = -1; +static int hf_lustre_obd_dqblk_dqb_bhardlimit = -1; +static int hf_lustre_obd_dqblk_dqb_curspace = -1; +static int hf_lustre_obd_dqblk_dqb_itime = -1; +static int hf_lustre_obd_dqblk_dqb_valid = -1; +static int hf_lustre_obd_dqinfo_dqi_igrace = -1; +static int hf_lustre_obd_dqinfo_dqi_bgrace = -1; +static int hf_lustre_obd_dqblk_padding = -1; +static int hf_lustre_obd_dqblk_dqb_curinodes = -1; +static int hf_lustre_obd_dqblk_dqb_bsoftlimit = -1; +static int hf_lustre_obd_dqinfo_dqi_flags = -1; +static int hf_lustre_obd_dqblk_dqb_btime = -1; +static int hf_lustre_obd_dqblk_dqb_ihardlimit = -1; +static int hf_lustre_ldlm_intent_opc_open = -1 ; +static int hf_lustre_ldlm_intent_opc_creat = -1; +static int hf_lustre_ldlm_intent_opc_readdir = -1; +static int hf_lustre_ldlm_intent_opc_getattr = -1; +static int hf_lustre_ldlm_intent_opc_lookup = -1; +static int hf_lustre_ldlm_intent_opc_unlink = -1; +static int hf_lustre_ldlm_intent_opc_getxattr = -1; +static int hf_lustre_ldlm_intent_opc_exec = -1; +static int hf_lustre_ldlm_intent_opc_pin = -1; +static int hf_lustre_llog_hdr_llh_flag_zap_when_empty = -1; +static int hf_lustre_llog_hdr_llh_flag_is_cat = -1; +static int hf_lustre_llog_hdr_llh_flag_is_play = -1; +/* --------------------------------------------------------------------*/ + + +/* proto declaration */ +static gint proto_lustre = -1; + + + + +static int ldlm_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint64 intent_opc _U_) ; +static int lustre_dissect_element_ldlm_lock_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_); +static int add_extra_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_) ; + + +/* ======================================================================== */ +/*the value_string*/ +const value_string lustre_ldlm_opcode[] = { + { IT_OPEN , "IT_OPEN "}, + { IT_CREAT , "IT_CREAT "}, + { IT_READDIR , "IT_READDIR "}, + { IT_GETATTR , "IT_GETATTR "}, + { IT_LOOKUP , "IT_LOOKUP "}, + { IT_UNLINK , "IT_UNLINK "}, + { IT_GETXATTR, "IT_GETXATTR"}, + { IT_EXEC , "IT_EXEC "}, + { IT_PIN , "IT_PIN "}, + { 0, NULL } +}; + +const value_string lustre_lov_magic[] = { + { LOV_MAGIC_V1, "LOV_MAGIC_V1" }, + { LOV_MAGIC_JOIN, "LOV_MAGIC_JOIN" }, + {0, NULL} +}; + +const value_string lustre_ldlm_flags_vals[] = { + {0x000001 , "LDLM_FL_LOCK_CHANGED"}, + {0x000002 , "LDLM_FL_BLOCK_GRANTED"}, + {0x000004 , 
"LDLM_FL_BLOCK_CONV"}, + {0x000008 , "LDLM_FL_BLOCK_WAIT"}, + {0x000010 , "LDLM_FL_CBPENDING"}, + {0x000020 , "LDLM_FL_AST_SENT"}, + {0x000040 , "LDLM_FL_WAIT_NOREPROC"}, + {0x000080 , "LDLM_FL_CANCEL"}, + {0x000100 , "LDLM_FL_REPLAY"}, + {0x000200 , "LDLM_FL_INTENT_ONLY"}, + {0x000400 , "LDLM_FL_LOCAL_ONLY"}, + {0x000800 , "LDLM_FL_FAILED"}, + {0x001000 , "LDLM_FL_HAS_INTENT"}, + {0x002000 , "LDLM_FL_CANCELING"}, + {0x004000 , "LDLM_FL_LOCAL"}, + {0x008000 , "LDLM_FL_WARN"}, + {0x010000 , "LDLM_FL_DISCARD_DATA"}, + {0x020000 , "LDLM_FL_NO_TIMEOUT"}, + {0x040000 , "LDLM_FL_BLOCK_NOWAIT"}, + {0x080000 , "LDLM_FL_TEST_LOCK"}, + {0x100000 , "LDLM_FL_LVB_READY"}, + {0x200000 , "LDLM_FL_KMS_IGNORE"}, + {0x400000 , "LDLM_FL_NO_LRU"}, + {0x800000 , "LDLM_FL_CANCEL_ON_BLOCK"}, + {0x1000000 , "LDLM_FL_CP_REQD"}, + {0x2000000 , "LDLM_FL_CLEANED"}, + {0x4000000 , "LDLM_FL_ATOMIC_CB"}, + {0x10000000 , "LDLM_FL_BL_AST"}, + {0x20000000 , "LDLM_FL_BL_DONE"}, + {0x40000000 , "LDLM_FL_DENY_ON_CONTENTION"}, + {0x80000000 , "LDLM_AST_DISCARD_DATA"}, + { 0, NULL } +}; + +const value_string lustre_llog_op_type[] = { + {LLOG_PAD_MAGIC ,"LLOG_PAD_MAGIC "}, + {OST_SZ_REC ,"OST_SZ_REC "}, + {OST_RAID1_REC ,"OST_RAID1_REC "}, + {MDS_UNLINK_REC ,"MDS_UNLINK_REC "}, + {MDS_SETATTR_REC ,"MDS_SETATTR_REC "}, + {OBD_CFG_REC ,"OBD_CFG_REC "}, + {PTL_CFG_REC ,"PTL_CFG_REC "}, + {LLOG_GEN_REC ,"LLOG_GEN_REC "}, + {LLOG_JOIN_REC ,"LLOG_JOIN_REC "}, + {LLOG_HDR_MAGIC ,"LLOG_HDR_MAGIC "}, + {LLOG_LOGID_MAGIC ,"LLOG_LOGID_MAGIC"}, + { 0, NULL } +}; + +const value_string lustre_llog_hdr_llh_flags[]= { + {LLOG_F_ZAP_WHEN_EMPTY , "LLOhdr_llh_G_F_ZAP_WHEN_EMPTY"}, + {LLOG_F_IS_CAT , "LLOhdr_llh_G_F_IS_CAT"}, + {LLOG_F_IS_PLAIN , "LLOG_F_IS_PLAIN"}, + { 0, NULL } +}; + +const value_string lustre_mds_flags_vals[] = { + {0x1,"MDS_BFLAG_UNCOMMITTED_WRITES"}, + {0x80000000, "MDS_BFLAG_EXT_FLAGS"}, /* == EXT3_RESERVED_FL */ + {0x00000008, "MDS_SYNC_FL "}, /* Synchronous updates */ + {0x00000010, "MDS_IMMUTABLE_FL "}, /* Immutable file */ + {0x00000020, "MDS_APPEND_FL "}, /* writes to file may only append */ + {0x00000080, "MDS_NOATIME_FL "}, /* do not update atime */ + {0x00010000, "MDS_DIRSYNC_FL "}, /* dirsync behaviour (dir only) */ + { 0, NULL } +}; + +const value_string lustre_LMTypes[] = { + { PTL_RPC_MSG_REQUEST, "request"}, + { PTL_RPC_MSG_ERR , "error"}, + { PTL_RPC_MSG_REPLY , "reply"}, + { 0, NULL } +}; + +const value_string lustre_mds_reint_t_vals[] = { + { REINT_SETATTR, "REINT_SETATTR" }, + { REINT_CREATE, "REINT_CREATE" }, + { REINT_LINK, "REINT_LINK" }, + { REINT_UNLINK, "REINT_UNLINK" }, + { REINT_RENAME, "REINT_RENAME" }, + { REINT_OPEN, "REINT_OPEN" }, + { 0, NULL } +}; +const value_string lustre_op_codes[] = { + /*OST Opcodes*/ + {0 , "OST_REPLY"}, + {1 , "OST_GETATTR"}, + {2 , "OST_SETATTR"}, + {3 , "OST_READ"}, + {4 , "OST_WRITE"}, + {5 , "OST_CREATE"}, + {6 , "OST_DESTROY"}, + {7 , "OST_GET_INFO"}, + {8 , "OST_CONNECT"}, + {9 , "OST_DISCONNECT"}, + {10 , "OST_PUNCH"}, + {11 , "OST_OPEN"}, + {12 , "OST_CLOSE"}, + {13 , "OST_STATFS"}, + {14 , "OST_SAN_READ(deprecated)"}, + {15 , "OST_SAN_WRITE(deprecated)"}, + {16 , "OST_SYNC"}, + {17 , "OST_SET_INFO"}, + {18 , "OST_QUOTACHECK"}, + {19 , "OST_QUOTACTL"}, + {20 , "OST_LAST_OPC"}, + /*MDS Opcodes*/ + {33 , "MDS_GETATTR"}, + {34 , "MDS_GETATTR_NAME"}, + {35 , "MDS_CLOSE"}, + {36 , "MDS_REINT"}, + {37 , "MDS_READPAGE"}, + {38 , "MDS_CONNECT"}, + {39 , "MDS_DISCONNECT"}, + {40 , "MDS_GETSTATUS"}, + {41 , "MDS_STATFS"}, + {42 , "MDS_PIN"}, + {43 , "MDS_UNPIN"}, + {44 , 
"MDS_SYNC"}, + {45 , "MDS_DONE_WRITING"}, + {46 , "MDS_SET_INFO"}, + {47 , "MDS_QUOTACHECK"}, + {48 , "MDS_QUOTACTL"}, + {49 , "MDS_GETXATTR"}, + {50 , "MDS_SETXATTR"}, + {51 , "MDS_LAST_OPC"}, + /*LDLM Opcodes*/ + {101 , "LDLM_ENQUEUE"}, + {102 , "LDLM_CONVERT"}, + {103 , "LDLM_CANCEL"}, + {104 , "LDLM_BL_CALLBACK"}, + {105 , "LDLM_CP_CALLBACK"}, + {106 , "LDLM_GL_CALLBACK"}, + {107 , "LDLM_LAST_OPC"}, + /*MGS Opcodes*/ + {250 , "MGS_CONNECT"}, + {251 , "MGS_DISCONNECT"}, + {252 , "MGS_EXCEPTION"}, + {253 , "MGS_TARGET_REG"}, + {254 , "MGS_TARGET_DEL"}, + {255 , "MGS_SET_INFO"}, + {256 , "MGS_LAST_OPC"}, + /*OBD Opcodes*/ + {400 , "OBD_PING"}, + {401 , "OBD_LOG_CANCEL"}, + {402 , "OBD_QC_CALLBACK"}, + {403 , "OBD_LAST_OPC"}, + /* LLOG opcodes */ + { 501, "LLOG_ORIGIN_HANDLE_CREATE"}, + { 502, "LLOG_ORIGIN_HANDLE_NEXT_BLOCK"}, + { 503, "LLOG_ORIGIN_HANDLE_READ_HEADER"}, + { 504, "LLOG_ORIGIN_HANDLE_WRITE_REC"}, + { 505, "LLOG_ORIGIN_HANDLE_CLOSE"}, + { 506, "LLOG_ORIGIN_CONNECT"}, + { 507, "LLOG_CATINFO"}, + { 508, "LLOG_ORIGIN_HANDLE_PREV_BLOCK"}, + { 509, "LLOG_ORIGIN_HANDLE_DESTROY"}, + { 0, NULL } +}; +/*const value_string lustre_ldlm_mode_t_vals[] = {*/ +/* { LCK_MINMODE, "MINMODE" },*/ +/* { LCK_EX, "EX" },*/ +/* { LCK_PW, "PW" },*/ +/* { LCK_PR, "PR" },*/ +/* { LCK_CW, "CW" },*/ +/* { LCK_CR, "CR" },*/ +/* { LCK_NL, "NL" },*/ +/* { LCK_GROUP, "GROUP" },*/ +/* { 0, NULL }*/ +/*};*/ + +/* detailled version the information came from : http://wiki.lustre.org/images/e/e5/LustreInternals_Architecture.pdf */ +const value_string lustre_ldlm_mode_t_vals[] = { + { LCK_MINMODE, "MINMODE" }, + { LCK_EX, "Exclusive" }, + { LCK_PW, "Protected Write" }, + { LCK_PR, "Protected Read" }, + { LCK_CW, "Concurrent Write" }, + { LCK_CR, "Concurrent Read" }, + { LCK_NL, "Null" }, + { LCK_GROUP, "GROUP" }, + { 0, NULL } +}; + +const value_string lustre_ldlm_type_t_vals[] = { + { LDLM_PLAIN, "LDLM_PLAIN" }, + { LDLM_EXTENT,"LDLM_EXTENT" }, + { LDLM_FLOCK, "LDLM_FLOCK" }, + { LDLM_IBITS, "LDLM_IBITS" }, + { 0, NULL } +}; + + +const value_string lustre_llog_cmd_t_vals[] = { + { LLOG_ORIGIN_HANDLE_CREATE, "LLOG_ORIGIN_HANDLE_CREATE" }, + { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "LLOG_ORIGIN_HANDLE_NEXT_BLOCK" }, + { LLOG_ORIGIN_HANDLE_READ_HEADER, "LLOG_ORIGIN_HANDLE_READ_HEADER" }, + { LLOG_ORIGIN_HANDLE_WRITE_REC, "LLOG_ORIGIN_HANDLE_WRITE_REC" }, + { LLOG_ORIGIN_HANDLE_CLOSE, "LLOG_ORIGIN_HANDLE_CLOSE" }, + { LLOG_ORIGIN_CONNECT, "LLOG_ORIGIN_CONNECT" }, + { LLOG_CATINFO, "LLOG_CATINFO" }, + { LLOG_ORIGIN_HANDLE_PREV_BLOCK, "LLOG_ORIGIN_HANDLE_PREV_BLOCK" }, + { LLOG_ORIGIN_HANDLE_DESTROY, "LLOG_ORIGIN_HANDLE_DESTROY" }, + { 0, NULL } +}; + + +/* ------------------------------------------------------------------- */ +/* display functions */ +/* ------------------------------------------------------------------- */ +/* display str in left corner and in COL */ +/* TODO : do we need to use inline here ?*/ +/* @gint col : the col where we add the information */ +inline static void display_info_str(proto_item *pi, column_info *cinfo, gint col, const gchar* str) +{ + if (NULL !=pi) + proto_item_append_text(pi, str); + + if (NULL !=cinfo) + if (check_col(cinfo, col)) + col_append_str(cinfo, col, str); +} + +/* + * Need to be (re)written + */ +static void display_info_fstr(proto_item *pi, column_info *cinfo, gint col, const char* format, const gchar * str){ + + if (NULL !=pi){ + //va_start(ap, format); + proto_item_append_text(pi, format, str); + //va_end(ap); + } + + if (NULL !=cinfo){ + if (check_col(cinfo, col)){ + // 
va_list ap; + // va_start(ap, format); + col_append_fstr(cinfo, col, format, str); + // va_end(ap); + } + } +} + + +/* ------------------------------ basic dissect functions ------------------------ */ +static int +dissect_uint64 +(tvbuff_t *tvb, gint offset, packet_info *pinfo _U_, proto_tree *tree, int hfindex) +{ + proto_tree_add_item(tree, hfindex, tvb, offset, 8, TRUE); + return offset+8; +} + +static int +dissect_uint32 +(tvbuff_t *tvb, gint offset, packet_info *pinfo _U_, proto_tree *tree, int hfindex) +{ + proto_tree_add_item(tree, hfindex, tvb, offset, 4, TRUE); + return offset+4; +} + +static int +dissect_uint8 +(tvbuff_t *tvb, gint offset, packet_info *pinfo _U_, proto_tree *tree, int hfindex) +{ + proto_tree_add_item(tree, hfindex, tvb, offset, 1, TRUE); + return offset+1; +} +/* ------------------------------------------------------------------------- */ + + + + +/* IDL: struct lustre_handle { */ +/* IDL: uint64 cookie; */ +/* IDL: } */ + +static int +lustre_dissect_element_handle_cookie(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lustre_handle_cookie); + + return offset; +} + +int +lustre_dissect_struct_handle_cookie(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_lustre_handle_cookie); + } + + offset=lustre_dissect_element_handle_cookie(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct obd_connect_data { */ +/* IDL: uint64 ocd_connect_flags; */ +/* IDL: uint32 ocd_version; */ +/* IDL: uint32 ocd_grant; */ +/* IDL: uint32 ocd_index; */ +/* IDL: uint32 ocd_brw_size; */ +/* IDL: uint64 ocd_ibits_known; */ +/* IDL: uint32 ocd_nllu; */ +/* IDL: uint32 ocd_nllg; */ +/* IDL: uint64 ocd_transno; */ +/* IDL: uint32 ocd_group; */ +/* IDL: uint32 ocd_cksum_types; */ +/* IDL: uint64 padding1; */ +/* IDL: uint64 padding2; */ +/* IDL: } */ + +static int +lustre_dissect_element_obd_connect_data_ocd_connect_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_connect_flags); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_version(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_version); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_grant(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_grant); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_index); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_brw_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_brw_size); + + 
return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_ibits_known(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_ibits_known); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_nllu(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_nllu); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_nllg(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_nllg); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_transno(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_transno); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_group(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_group); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_ocd_cksum_types(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_ocd_cksum_types); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_padding1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_padding1); + + return offset; +} + +static int +lustre_dissect_element_obd_connect_data_padding2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_connect_data_padding2); + + return offset; +} + +static int +lustre_dissect_struct_obd_connect_data(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obd_connect_data); + } + + offset=lustre_dissect_element_obd_connect_data_ocd_connect_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_version(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_grant(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_index(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_brw_size(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_ibits_known(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_nllu(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_nllg(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_transno(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_group(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_connect_data_ocd_cksum_types(tvb, offset, pinfo, tree); + + 
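+  /* The two fields consumed below, padding1 and padding2, are reserved
+   * 64-bit slots, so obd_connect_data is a fixed 72 bytes on the wire.
+   * A caller could therefore bounds-check up front; an illustrative
+   * sketch only, not part of the generated dissector:
+   *
+   *   if (tvb_reported_length_remaining(tvb, old_offset) < 72)
+   *           return old_offset;    (buffer too short, bail out early)
+   */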
+  offset=lustre_dissect_element_obd_connect_data_padding1(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_obd_connect_data_padding2(tvb, offset, pinfo, tree);
+
+  proto_item_set_len(item, offset-old_offset);
+
+  return offset;
+}
+
+
+/* IDL: struct lov_ost_data_v1 { */
+/* IDL: uint64 l_object_id; */
+/* IDL: uint64 l_object_gr; */
+/* IDL: uint32 l_ost_gen; */
+/* IDL: uint32 l_ost_idx; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_lov_ost_data_v1_l_object_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_ost_data_v1_l_object_id);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_lov_ost_data_v1_l_object_gr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_ost_data_v1_l_object_gr);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_lov_ost_data_v1_l_ost_gen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_ost_data_v1_l_ost_gen);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_lov_ost_data_v1_l_ost_idx(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_ost_data_v1_l_ost_idx);
+
+  return offset;
+}
+
+int
+lustre_dissect_struct_lov_ost_data_v1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+  proto_item *item = NULL;
+  proto_tree *tree = NULL;
+  int old_offset;
+
+  old_offset=offset;
+
+  if (parent_tree) {
+    item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+    tree = proto_item_add_subtree(item, ett_lustre_lov_ost_data_v1);
+  }
+
+  offset=lustre_dissect_element_lov_ost_data_v1_l_object_id(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_lov_ost_data_v1_l_object_gr(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_lov_ost_data_v1_l_ost_gen(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_lov_ost_data_v1_l_ost_idx(tvb, offset, pinfo, tree);
+
+  proto_item_set_len(item, offset-old_offset);
+
+  return offset;
+}
+
+/* IDL: struct lov_mds_md_v1 { */
+/* IDL: uint32 lmm_magic; */
+/* IDL: uint32 lmm_pattern; */
+/* IDL: uint64 lmm_object_id; */
+/* IDL: uint64 lmm_object_gr; */
+/* IDL: uint32 lmm_stripe_size; */
+/* IDL: uint32 lmm_stripe_count; */
+/* IDL: struct lov_ost_data_v1 { */
+/* IDL: } lmm_objects[0]; <-- actually there are lmm_stripe_count of them */
+/* IDL: } */
+
+static int
+lustre_dissect_element_lov_mds_md_v1_lmm_magic(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_v1_lmm_magic);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_lov_mds_md_v1_lmm_pattern(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_v1_lmm_pattern);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_lov_mds_md_v1_lmm_object_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_v1_lmm_object_id);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_lov_mds_md_v1_lmm_object_gr(tvbuff_t *tvb _U_, int offset _U_, 
packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_v1_lmm_object_gr); + + return offset; +} + +static int +lustre_dissect_element_lov_mds_md_v1_lmm_stripe_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_v1_lmm_stripe_size); + + return offset; +} + +static int +lustre_dissect_element_lov_mds_md_v1_lmm_stripe_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_v1_lmm_stripe_count); + + return offset; +} + +static int +lustre_dissect_element_lov_mds_md_v1_lmm_objects_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_lov_ost_data_v1(tvb,offset,pinfo,tree,hf_lustre_lov_mds_md_v1_lmm_objects); + return offset; +} + +static int +lustre_dissect_element_lov_mds_md_v1_lmm_objects(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_, int num) +{ + int i; + /*g_print("num = %d", num);*/ + for (i = 0; i < num; i++){ + offset=lustre_dissect_element_lov_mds_md_v1_lmm_objects_(tvb, offset, pinfo, tree); + } + + return offset; +} + + + +int +lustre_dissect_struct_lov_mds_md_v1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + guint32 stripe_count ; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_lov_mds_md_v1); + } + + offset=lustre_dissect_element_lov_mds_md_v1_lmm_magic(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_v1_lmm_pattern(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_v1_lmm_object_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_v1_lmm_object_gr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_v1_lmm_stripe_size(tvb, offset, pinfo, tree); + + stripe_count = tvb_get_letohl(tvb,offset); + offset=lustre_dissect_element_lov_mds_md_v1_lmm_stripe_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_v1_lmm_objects(tvb, offset, pinfo, tree, stripe_count); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct obd_statfs { */ +/* IDL: uint64 os_type; */ +/* IDL: uint64 os_blocks; */ +/* IDL: uint64 os_bfree; */ +/* IDL: uint64 os_bavail; */ +/* IDL: uint64 os_files; */ +/* IDL: uint64 os_ffree; */ +/* IDL: uint8 os_fsid[40]; */ +/* IDL: uint32 os_bsize; */ +/* IDL: uint32 os_namelen; */ +/* IDL: uint64 os_maxbytes; */ +/* IDL: uint32 os_state; */ +/* IDL: uint32 os_spare1; */ +/* IDL: uint32 os_spare2; */ +/* IDL: uint32 os_spare3; */ +/* IDL: uint32 os_spare4; */ +/* IDL: uint32 os_spare5; */ +/* IDL: uint32 os_spare6; */ +/* IDL: uint32 os_spare7; */ +/* IDL: uint32 os_spare8; */ +/* IDL: uint32 os_spare9; */ +/* IDL: } */ + +static int +lustre_dissect_element_obd_statfs_os_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_type); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_blocks(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + 
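+  /* Each os_* helper in this struct is a thin wrapper around
+   * dissect_uint64()/dissect_uint32() above, which hand the work to
+   * proto_tree_add_item() with TRUE, i.e. little-endian: Lustre
+   * encodes its wire structs LE.  The call below is equivalent to
+   * this inline form (illustration only):
+   *
+   *   proto_tree_add_item(tree, hf_lustre_obd_statfs_os_blocks,
+   *                       tvb, offset, 8, TRUE);
+   *   offset += 8;
+   */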
offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_blocks); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_bfree(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_bfree); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_bavail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_bavail); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_files(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_files); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_ffree(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_ffree); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_fsid_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_fsid); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_fsid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 40; i++) + offset=lustre_dissect_element_obd_statfs_os_fsid_(tvb, offset, pinfo, tree); + + return offset; +} + + + +static int +lustre_dissect_element_obd_statfs_os_bsize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_bsize); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_namelen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_namelen); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_maxbytes(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_maxbytes); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_state(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_state); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare1); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare2); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare3); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, 
hf_lustre_obd_statfs_os_spare4); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare5(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare5); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare6(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare6); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare7(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare7); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare8(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare8); + + return offset; +} + +static int +lustre_dissect_element_obd_statfs_os_spare9(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_statfs_os_spare9); + + return offset; +} + +int +lustre_dissect_struct_obd_statfs(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obd_statfs); + } + + offset=lustre_dissect_element_obd_statfs_os_type(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_blocks(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_bfree(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_bavail(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_files(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_ffree(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_fsid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_bsize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_namelen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_maxbytes(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_state(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare3(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare4(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare5(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare6(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare7(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare8(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_statfs_os_spare9(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct obd_ioobj { */ +/* IDL: uint64 ioo_id; */ +/* IDL: uint64 ioo_gr; */ +/* IDL: uint32 ioo_type; */ +/* IDL: uint32 
ioo_bufcnt; */ +/* IDL: } */ + +static int +lustre_dissect_element_obd_ioobj_ioo_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_ioobj_ioo_id); + + return offset; +} + +static int +lustre_dissect_element_obd_ioobj_ioo_gr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_ioobj_ioo_gr); + + return offset; +} + +static int +lustre_dissect_element_obd_ioobj_ioo_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_ioobj_ioo_type); + + return offset; +} + +static int +lustre_dissect_element_obd_ioobj_ioo_bufcnt(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_ioobj_ioo_bufcnt); + + return offset; +} + +int +lustre_dissect_struct_obd_ioobj(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obd_ioobj); + } + + offset=lustre_dissect_element_obd_ioobj_ioo_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_ioobj_ioo_gr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_ioobj_ioo_type(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_ioobj_ioo_bufcnt(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct niobuf_remote { */ +/* IDL: uint64 offset; */ +/* IDL: uint32 len; */ +/* IDL: uint32 flags; */ +/* IDL: } */ + +static int +lustre_dissect_element_niobuf_remote_offset(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_niobuf_remote_offset); + + return offset; +} + +static int +lustre_dissect_element_niobuf_remote_len(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_niobuf_remote_len); + + return offset; +} + +static int +lustre_dissect_element_niobuf_remote_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_niobuf_remote_flags); + + return offset; +} + +int +lustre_dissect_struct_niobuf_remote(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_niobuf_remote); + } + + offset=lustre_dissect_element_niobuf_remote_offset(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_niobuf_remote_len(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_niobuf_remote_flags(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct ost_lvb { */ +/* IDL: uint64 lvb_size; */ +/* IDL: uint64 lvb_mtime; */ +/* IDL: uint64 lvb_atime; */ +/* IDL: uint64 lvb_ctime; */ 
+/* IDL: uint64 lvb_blocks; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ost_lvb_lvb_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ost_lvb_lvb_size);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ost_lvb_lvb_mtime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  nstime_t ns;
+  /* timestamp: seconds live in the low 32 bits of the 64-bit LE field */
+  ns.secs = tvb_get_letohl(tvb,offset);
+  ns.nsecs=0;
+  proto_tree_add_time(tree, hf_lustre_ost_lvb_lvb_mtime, tvb, offset, 8, &ns );
+  offset+=8;
+  return offset;
+}
+
+static int
+lustre_dissect_element_ost_lvb_lvb_atime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  nstime_t ns;
+  /* timestamp */
+  ns.secs = tvb_get_letohl(tvb,offset);
+  ns.nsecs=0;
+  proto_tree_add_time(tree, hf_lustre_ost_lvb_lvb_atime, tvb, offset, 8, &ns );
+  offset+=8;
+  return offset;
+}
+
+static int
+lustre_dissect_element_ost_lvb_lvb_ctime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  nstime_t ns;
+  /* timestamp */
+  ns.secs = tvb_get_letohl(tvb,offset);
+  ns.nsecs=0;
+  proto_tree_add_time(tree, hf_lustre_ost_lvb_lvb_ctime, tvb, offset, 8, &ns );
+  offset+=8;
+  return offset;
+}
+
+static int
+lustre_dissect_element_ost_lvb_lvb_blocks(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ost_lvb_lvb_blocks);
+
+  return offset;
+}
+
+int
+lustre_dissect_struct_ost_lvb(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+  proto_item *item = NULL;
+  proto_tree *tree = NULL;
+  int old_offset;
+
+  old_offset=offset;
+
+  if (parent_tree) {
+    item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+    tree = proto_item_add_subtree(item, ett_lustre_ost_lvb);
+  }
+
+  offset=lustre_dissect_element_ost_lvb_lvb_size(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ost_lvb_lvb_mtime(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ost_lvb_lvb_atime(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ost_lvb_lvb_ctime(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ost_lvb_lvb_blocks(tvb, offset, pinfo, tree);
+
+  proto_item_set_len(item, offset-old_offset);
+
+  return offset;
+}
+
+
+/* IDL: struct ll_fid { */
+/* IDL: uint64 id; */
+/* IDL: uint32 generation; */
+/* IDL: uint32 f_type; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ll_fid_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ll_fid_id);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ll_fid_generation(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ll_fid_generation);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ll_fid_f_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ll_fid_f_type);
+
+  return offset;
+}
+
+int
+lustre_dissect_struct_ll_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+  proto_item *item = NULL;
+  proto_tree *tree = NULL;
+  int old_offset;
+
+  old_offset=offset;
+
+  if 
(parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ll_fid); + } + + offset=lustre_dissect_element_ll_fid_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ll_fid_generation(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ll_fid_f_type(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct mds_status_req { */ +/* IDL: uint32 flags; */ +/* IDL: uint32 repbuf; */ +/* IDL: } */ + +static int +lustre_dissect_element_mds_status_req_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_status_req_flags); + + return offset; +} + +static int +lustre_dissect_element_mds_status_req_repbuf(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_status_req_repbuf); + + return offset; +} + +int +lustre_dissect_struct_mds_status_req(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_status_req); + } + + offset=lustre_dissect_element_mds_status_req_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_status_req_repbuf(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct mds_body { */ +/* IDL: struct ll_fid { */ +/* IDL: } fid1; */ +/* IDL: struct ll_fid { */ +/* IDL: } fid2; */ +/* IDL: struct lustre_handle { */ +/* IDL: } handle; */ +/* IDL: uint64 valid; */ +/* IDL: uint64 size; */ +/* IDL: uint64 mtime; */ +/* IDL: uint64 atime; */ +/* IDL: uint64 ctime; */ +/* IDL: uint64 blocks; */ +/* IDL: uint64 io_epoch; */ +/* IDL: uint64 ino; */ +/* IDL: uint32 fsuid; */ +/* IDL: uint32 fsgid; */ +/* IDL: uint32 capability; */ +/* IDL: uint32 mode; */ +/* IDL: uint32 uid; */ +/* IDL: uint32 gid; */ +/* IDL: uint32 flags; */ +/* IDL: uint32 rdev; */ +/* IDL: uint32 nlink; */ +/* IDL: uint32 generation; */ +/* IDL: uint32 suppgid; */ +/* IDL: uint32 eadatasize; */ +/* IDL: uint32 aclsize; */ +/* IDL: uint32 max_mdsize; */ +/* IDL: uint32 max_cookiesize; */ +/* IDL: uint32 padding_4; */ +/* IDL: } */ + +static int +lustre_dissect_element_mds_body_fid1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_body_fid1); + return offset; +} + +static int +lustre_dissect_element_mds_body_fid2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_body_fid2); + return offset; +} + +static int +lustre_dissect_element_mds_body_handle(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_mds_body_handle); + + return offset; +} + +static int +lustre_dissect_element_mds_body_valid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_body_valid); + + return offset; +} + +static 
int +lustre_dissect_element_mds_body_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_body_size); + + return offset; +} + +static int +lustre_dissect_element_mds_body_mtime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_body_mtime, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_body_atime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_body_atime, tvb, offset, 8, &ns ); + offset+=8; + + return offset; +} + +static int +lustre_dissect_element_mds_body_ctime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_body_ctime, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_body_blocks(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_body_blocks); + + return offset; +} + +static int +lustre_dissect_element_mds_body_io_epoch(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_body_io_epoch); + + return offset; +} + +static int +lustre_dissect_element_mds_body_ino(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_body_ino); + + return offset; +} + +static int +lustre_dissect_element_mds_body_fsuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_fsuid); + + return offset; +} + +static int +lustre_dissect_element_mds_body_fsgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_fsgid); + + return offset; +} + +static int +lustre_dissect_element_mds_body_capability(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_capability); + + return offset; +} + +static int +lustre_dissect_element_mds_body_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_mode); + + return offset; +} + +static int +lustre_dissect_element_mds_body_uid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_uid); + + return offset; +} + +static int +lustre_dissect_element_mds_body_gid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_gid); + + return offset; +} + +static int +lustre_dissect_element_mds_body_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, 
hf_lustre_mds_body_flags); + + return offset; +} + +static int +lustre_dissect_element_mds_body_rdev(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_rdev); + + return offset; +} + +static int +lustre_dissect_element_mds_body_nlink(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_nlink); + + return offset; +} + +static int +lustre_dissect_element_mds_body_generation(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_generation); + + return offset; +} + +static int +lustre_dissect_element_mds_body_suppgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_suppgid); + + return offset; +} + +static int +lustre_dissect_element_mds_body_eadatasize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_eadatasize); + + return offset; +} + +static int +lustre_dissect_element_mds_body_aclsize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_aclsize); + + return offset; +} + +static int +lustre_dissect_element_mds_body_max_mdsize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_max_mdsize); + + return offset; +} + +static int +lustre_dissect_element_mds_body_max_cookiesize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_max_cookiesize); + + return offset; +} + +static int +lustre_dissect_element_mds_body_padding_4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_body_padding_4); + + return offset; +} + +int +lustre_dissect_struct_mds_body(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_body); + } + + offset=lustre_dissect_element_mds_body_fid1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_fid2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_handle(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_valid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_size(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_mtime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_atime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_ctime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_blocks(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_io_epoch(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_ino(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_fsuid(tvb, offset, pinfo, tree); 
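+  /* The element calls in this dissector have to mirror the field order
+   * of the mds_body IDL above: every helper advances "offset" by its
+   * own fixed width, so a skipped or reordered call would shift all of
+   * the fields after it. */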
+ + offset=lustre_dissect_element_mds_body_fsgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_capability(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_mode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_uid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_gid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_rdev(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_nlink(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_generation(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_suppgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_eadatasize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_aclsize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_max_mdsize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_max_cookiesize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_body_padding_4(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct obd_dqinfo { */ +/* IDL: uint64 dqi_bgrace; */ +/* IDL: uint64 dqi_igrace; */ +/* IDL: uint32 dqi_flags; */ +/* IDL: uint32 dqi_valid; */ +/* IDL: } */ + +static int +lustre_dissect_element_obd_dqinfo_dqi_bgrace(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqinfo_dqi_bgrace); + + return offset; +} + +static int +lustre_dissect_element_obd_dqinfo_dqi_igrace(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqinfo_dqi_igrace); + + return offset; +} + +static int +lustre_dissect_element_obd_dqinfo_dqi_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_dqinfo_dqi_flags); + + return offset; +} + +static int +lustre_dissect_element_obd_dqinfo_dqi_valid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_dqinfo_dqi_valid); + + return offset; +} + +int +lustre_dissect_struct_obd_dqinfo(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset = offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obd_dqinfo); + } + + offset = lustre_dissect_element_obd_dqinfo_dqi_bgrace(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqinfo_dqi_igrace(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqinfo_dqi_flags(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqinfo_dqi_valid(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct obd_dqblk { */ +/* IDL: uint64 dqb_bhardlimit; */ +/* IDL: uint64 dqb_bsoftlimit; */ +/* IDL: uint64 dqb_curspace; */ +/* IDL: uint64 dqb_ihardlimit; */ +/* IDL: uint64 dqb_isoftlimit; */ +/* IDL: uint64 dqb_curinodes; */ +/* IDL: uint64 dqb_btime; */ +/* IDL: uint64 dqb_itime; */ +/* IDL: 
uint32 dqb_valid; */ +/* IDL: uint32 padding; */ +/* IDL: } */ + +static int +lustre_dissect_element_obd_dqblk_dqb_bhardlimit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_bhardlimit); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_bsoftlimit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_bsoftlimit); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_curspace(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_curspace); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_ihardlimit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_ihardlimit); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_isoftlimit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_isoftlimit); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_curinodes(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_curinodes); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_btime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_obd_dqblk_dqb_btime, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_itime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_obd_dqblk_dqb_itime, tvb, offset, 8, &ns ); + offset+=8; + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_dqb_valid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_dqb_valid); + + return offset; +} + +static int +lustre_dissect_element_obd_dqblk_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset = dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_dqblk_padding); + + return offset; +} + +int +lustre_dissect_struct_obd_dqblk(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset = offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obd_dqblk); + } + + offset = lustre_dissect_element_obd_dqblk_dqb_bhardlimit(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_bsoftlimit(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_curspace(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_ihardlimit(tvb, offset, pinfo, tree); + 
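+  /* Only dqb_btime and dqb_itime (consumed below) are rendered as
+   * timestamps; their helpers read the seconds from the low 32 bits
+   * of the 64-bit LE slot.  The limit and usage fields stay plain
+   * uint64 counters. */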
+ offset = lustre_dissect_element_obd_dqblk_dqb_isoftlimit(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_curinodes(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_btime(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_itime(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_dqb_valid(tvb, offset, pinfo, tree); + + offset = lustre_dissect_element_obd_dqblk_padding(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* IDL: struct obd_quotactl { */ +/* IDL: uint32 qc_cmd; */ +/* IDL: uint32 qc_type; */ +/* IDL: uint32 qc_id; */ +/* IDL: uint32 qc_stat; */ +/* IDL: struct obd_dqinfo { */ +/* IDL: } qc_dqinfo; */ +/* IDL: struct obd_dqblk { */ +/* IDL: } qc_dqblk; */ +/* IDL: } */ + +static int +lustre_dissect_element_obd_quotactl_qc_cmd(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl_qc_cmd); + + return offset; +} + +static int +lustre_dissect_element_obd_quotactl_qc_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl_qc_type); + + return offset; +} + +static int +lustre_dissect_element_obd_quotactl_qc_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl_qc_id); + + return offset; +} + +static int +lustre_dissect_element_obd_quotactl_qc_stat(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl_qc_stat); + + return offset; +} + + + +static int +lustre_dissect_element_obd_quotactl_qc_dqblk(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_obd_dqblk(tvb,offset,pinfo,tree,hf_lustre_obd_quotactl_qc_dqblk); + return offset; +} + +static int +lustre_dissect_element_obd_quotactl_qc_dqinfo(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_obd_dqinfo(tvb,offset,pinfo,tree,hf_lustre_obd_quotactl_qc_dqinfo); + return offset; +} + +int +lustre_dissect_struct_obd_quotactl(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obd_quotactl); + } + + offset=lustre_dissect_element_obd_quotactl_qc_cmd(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_quotactl_qc_type(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_quotactl_qc_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_quotactl_qc_stat(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_quotactl_qc_dqinfo(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obd_quotactl_qc_dqblk(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* IDL: struct quota_adjust_qunit { */ +/* IDL: uint32 qaq_flags; */ +/* IDL: uint32 qaq_id; */ +/* IDL: uint64 qaq_bunit_sz; */ +/* IDL: uint64 qaq_iunit_sz; */ +/* IDL: uint64 padding1; */ +/* IDL: } */ + 
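+/* Like every per-struct dissector in this file,
+ * lustre_dissect_struct_quota_adjust_qunit() below takes the current
+ * tvb offset and returns the offset just past the struct, so call
+ * sites can simply chain it.  Hypothetical call site, for
+ * illustration only (hf_quota_struct stands in for a header field
+ * registered elsewhere):
+ *
+ *   offset = lustre_dissect_struct_quota_adjust_qunit(tvb, offset,
+ *                                                     pinfo, tree,
+ *                                                     hf_quota_struct);
+ */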
+static int +lustre_dissect_element_quota_adjust_qunit_qaq_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_quota_adjust_qunit_qaq_flags); + + return offset; +} + +static int +lustre_dissect_element_quota_adjust_qunit_qaq_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_quota_adjust_qunit_qaq_id); + + return offset; +} + +static int +lustre_dissect_element_quota_adjust_qunit_qaq_bunit_sz(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_quota_adjust_qunit_qaq_bunit_sz); + + return offset; +} + +static int +lustre_dissect_element_quota_adjust_qunit_qaq_iunit_sz(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_quota_adjust_qunit_qaq_iunit_sz); + + return offset; +} + +static int +lustre_dissect_element_quota_adjust_qunit_padding1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_quota_adjust_qunit_padding1); + + return offset; +} + +int +lustre_dissect_struct_quota_adjust_qunit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_quota_adjust_qunit); + } + + offset=lustre_dissect_element_quota_adjust_qunit_qaq_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_quota_adjust_qunit_qaq_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_quota_adjust_qunit_qaq_bunit_sz(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_quota_adjust_qunit_qaq_iunit_sz(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_quota_adjust_qunit_padding1(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* I don't find where this struct appear on wire.. 
+/* IDL: struct mds_rec_setattr { */
+/* IDL: uint32 sa_opcode; */
+/* IDL: uint32 sa_fsuid; */
+/* IDL: uint32 sa_fsgid; */
+/* IDL: uint32 sa_cap; */
+/* IDL: uint32 sa_suppgid; */
+/* IDL: uint32 sa_mode; */
+/* IDL: struct ll_fid { */
+/* IDL: } sa_fid; */
+/* IDL: uint64 sa_valid; */
+/* IDL: uint64 sa_size; */
+/* IDL: uint64 sa_mtime; */
+/* IDL: uint64 sa_atime; */
+/* IDL: uint64 sa_ctime; */
+/* IDL: uint32 sa_uid; */
+/* IDL: uint32 sa_gid; */
+/* IDL: uint32 sa_attr_flags; */
+/* IDL: uint32 sa_padding; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_opcode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_opcode);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_fsuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_fsuid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_fsgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_fsgid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_cap(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_cap);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_suppgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_suppgid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_mode);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ /* sa_fid is a struct ll_fid (see the IDL above); the code generator had
+  * emitted a bogus lustre_dissect_struct_HASH(...) call here, which left
+  * the field undissected. */
+ offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_setattr_sa_fid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_valid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_valid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_size);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_mtime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ nstime_t ns;
+ /* timestamp: seconds are carried in the low 32 bits of a 64-bit field */
+ ns.secs = tvb_get_letohl(tvb,offset);
+ ns.nsecs=0;
+ proto_tree_add_time(tree, hf_lustre_mds_rec_setattr_sa_mtime, tvb, offset, 8, &ns );
+ offset+=8;
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_setattr_sa_atime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ nstime_t ns;
+ /* timestamp */
+ ns.secs = tvb_get_letohl(tvb,offset);
+ ns.nsecs=0;
+ proto_tree_add_time(tree, hf_lustre_mds_rec_setattr_sa_atime, tvb, 
offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_rec_setattr_sa_ctime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_rec_setattr_sa_ctime, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_rec_setattr_sa_uid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_uid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_setattr_sa_gid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_gid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_setattr_sa_attr_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_attr_flags); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_setattr_sa_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_setattr_sa_padding); + + return offset; +} + +int +lustre_dissect_struct_mds_rec_setattr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + guint32 sa_opcode ; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_rec_setattr); + } + sa_opcode=tvb_get_letohl(tvb,offset); + display_info_fstr(parent_tree->parent, pinfo->cinfo, COL_INFO, "[%s]", val_to_str(sa_opcode, lustre_mds_reint_t_vals, "Unknown sa_opc")); + + offset=lustre_dissect_element_mds_rec_setattr_sa_opcode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_fsuid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_fsgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_cap(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_suppgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_mode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_fid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_valid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_size(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_mtime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_atime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_ctime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_uid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_gid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_attr_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_setattr_sa_padding(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* IDL: struct mds_rec_create { */ +/* 
IDL: uint32 cr_opcode; */ +/* IDL: uint32 cr_fsuid; */ +/* IDL: uint32 cr_fsgid; */ +/* IDL: uint32 cr_cap; */ +/* IDL: uint32 cr_flags; */ +/* IDL: uint32 cr_mode; */ +/* IDL: struct ll_fid { */ +/* IDL: } cr_fid; */ +/* IDL: struct ll_fid { */ +/* IDL: } cr_replayfid; */ +/* IDL: uint64 cr_time; */ +/* IDL: uint64 cr_rdev; */ +/* IDL: uint32 cr_suppgid; */ +/* IDL: uint32 cr_padding_1; */ +/* IDL: uint32 cr_padding_2; */ +/* IDL: uint32 cr_padding_3; */ +/* IDL: uint32 cr_padding_4; */ +/* IDL: uint32 cr_padding_5; */ +/* IDL: } */ + +static int +lustre_dissect_element_mds_rec_create_cr_opcode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_opcode); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_fsuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_fsuid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_fsgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_fsgid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_cap(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_cap); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_flags); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_mode); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_create_cr_fid); + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_replayfid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_create_cr_replayfid); + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_time(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_rec_create_cr_time, tvb, offset, 8, &ns ); + offset+=8; + return offset; + +} + +static int +lustre_dissect_element_mds_rec_create_cr_rdev(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_rdev); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_suppgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_suppgid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_create_cr_padding_1(tvbuff_t *tvb _U_, int offset _U_, 
packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_padding_1);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_create_cr_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_padding_2);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_create_cr_padding_3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_padding_3);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_create_cr_padding_4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_padding_4);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_create_cr_padding_5(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create_cr_padding_5);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_mds_rec_create(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+ guint32 cr_opcode;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_mds_rec_create);
+ }
+ cr_opcode=tvb_get_letohl(tvb,offset);
+ display_info_fstr(parent_tree->parent, pinfo->cinfo, COL_INFO, "[%s]", val_to_str(cr_opcode, lustre_mds_reint_t_vals, "Unknown cr_opc"));
+
+ offset=lustre_dissect_element_mds_rec_create_cr_opcode(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_fsuid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_fsgid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_cap(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_flags(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_mode(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_fid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_replayfid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_time(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_rdev(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_suppgid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_padding_1(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_padding_2(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_padding_3(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_padding_4(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_create_cr_padding_5(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* TODO: find out where this structure appears on the wire */
+/* IDL: struct mds_rec_join { */
+/* IDL: struct ll_fid { */
+/* IDL: } jr_fid; */
+/* IDL: uint64 jr_headsize; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_mds_rec_join_jr_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ /* jr_fid is a struct ll_fid (see the IDL above); the code generator had
+  * emitted a bogus lustre_dissect_struct_HASH(...) call here, which left
+  * the field undissected. */
+ offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_join_jr_fid);
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_join_jr_headsize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_rec_join_jr_headsize);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_mds_rec_join(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_mds_rec_join);
+ }
+
+ offset=lustre_dissect_element_mds_rec_join_jr_fid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_join_jr_headsize(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* IDL: struct mds_rec_link { */
+/* IDL: uint32 lk_opcode; */
+/* IDL: uint32 lk_fsuid; */
+/* IDL: uint32 lk_fsgid; */
+/* IDL: uint32 lk_cap; */
+/* IDL: uint32 lk_suppgid1; */
+/* IDL: uint32 lk_suppgid2; */
+/* IDL: struct ll_fid { */
+/* IDL: } lk_fid1; */
+/* IDL: struct ll_fid { */
+/* IDL: } lk_fid2; */
+/* IDL: uint64 lk_time; */
+/* IDL: uint32 lk_padding_1; */
+/* IDL: uint32 lk_padding_2; */
+/* IDL: uint32 lk_padding_3; */
+/* IDL: uint32 lk_padding_4; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_mds_rec_link_lk_opcode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_opcode);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_link_lk_fsuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_fsuid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_link_lk_fsgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_fsgid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_link_lk_cap(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_cap);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_link_lk_suppgid1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_suppgid1);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_link_lk_suppgid2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_suppgid2);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_mds_rec_link_lk_fid1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_link_lk_fid1);
+ 
return offset; +} + +static int +lustre_dissect_element_mds_rec_link_lk_fid2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_link_lk_fid2); + return offset; +} + +static int +lustre_dissect_element_mds_rec_link_lk_time(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_rec_link_lk_time, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_rec_link_lk_padding_1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_padding_1); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_link_lk_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_padding_2); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_link_lk_padding_3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_padding_3); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_link_lk_padding_4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link_lk_padding_4); + + return offset; +} + +int +lustre_dissect_struct_mds_rec_link(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + guint32 lk_opcode ; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_rec_link); + } + lk_opcode=tvb_get_letohl(tvb,offset); + display_info_fstr(parent_tree->parent, pinfo->cinfo, COL_INFO, "[%s]", val_to_str(lk_opcode, lustre_mds_reint_t_vals, "Unknown lk_opc")); + + offset=lustre_dissect_element_mds_rec_link_lk_opcode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_fsuid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_fsgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_cap(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_suppgid1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_suppgid2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_fid1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_fid2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_time(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_padding_1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_padding_2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_padding_3(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_link_lk_padding_4(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct mds_rec_unlink { */ +/* IDL: uint32 ul_opcode; */ +/* IDL: uint32 ul_fsuid; */ +/* IDL: 
uint32 ul_fsgid; */ +/* IDL: uint32 ul_cap; */ +/* IDL: uint32 ul_suppgid; */ +/* IDL: uint32 ul_mode; */ +/* IDL: struct ll_fid { */ +/* IDL: } ul_fid1; */ +/* IDL: struct ll_fid { */ +/* IDL: } ul_fid2; */ +/* IDL: uint64 ul_time; */ +/* IDL: uint32 ul_padding_1; */ +/* IDL: uint32 ul_padding_2; */ +/* IDL: uint32 ul_padding_3; */ +/* IDL: uint32 ul_padding_4; */ +/* IDL: } */ + +static int +lustre_dissect_element_mds_rec_unlink_ul_opcode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_opcode); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_fsuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_fsuid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_fsgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_fsgid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_cap(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_cap); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_suppgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_suppgid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_mode); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_fid1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_unlink_ul_fid1); + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_fid2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_unlink_ul_fid2); + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_time(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree,hf_lustre_mds_rec_unlink_ul_time , tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_padding_1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_padding_1); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_padding_2); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_padding_3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_padding_3); + + return 
offset; +} + +static int +lustre_dissect_element_mds_rec_unlink_ul_padding_4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink_ul_padding_4); + + return offset; +} + +int +lustre_dissect_struct_mds_rec_unlink(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + guint32 ul_opcode ; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_rec_unlink); + } + + ul_opcode=tvb_get_letohl(tvb,offset); + display_info_fstr(parent_tree->parent, pinfo->cinfo, COL_INFO, "[%s]", val_to_str(ul_opcode, lustre_mds_reint_t_vals, "Unknown ul_opc")); + + offset=lustre_dissect_element_mds_rec_unlink_ul_opcode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_fsuid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_fsgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_cap(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_suppgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_mode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_fid1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_fid2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_time(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_padding_1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_padding_2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_padding_3(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_unlink_ul_padding_4(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct mds_rec_rename { */ +/* IDL: uint32 rn_opcode; */ +/* IDL: uint32 rn_fsuid; */ +/* IDL: uint32 rn_fsgid; */ +/* IDL: uint32 rn_cap; */ +/* IDL: uint32 rn_suppgid1; */ +/* IDL: uint32 rn_suppgid2; */ +/* IDL: struct ll_fid { */ +/* IDL: } rn_fid1; */ +/* IDL: struct ll_fid { */ +/* IDL: } rn_fid2; */ +/* IDL: uint64 rn_time; */ +/* IDL: uint32 rn_padding_1; */ +/* IDL: uint32 rn_padding_2; */ +/* IDL: uint32 rn_padding_3; */ +/* IDL: uint32 rn_padding_4; */ +/* IDL: } */ + +static int +lustre_dissect_element_mds_rec_rename_rn_opcode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_opcode); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_fsuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_fsuid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_fsgid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_fsgid); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_cap(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, 
hf_lustre_mds_rec_rename_rn_cap); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_suppgid1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_suppgid1); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_suppgid2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_suppgid2); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_fid1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_rename_rn_fid1); + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_fid2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ll_fid(tvb,offset,pinfo,tree,hf_lustre_mds_rec_rename_rn_fid2); + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_time(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + /* timestamp */ + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_mds_rec_rename_rn_time, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_padding_1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_padding_1); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_padding_2); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_padding_3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_padding_3); + + return offset; +} + +static int +lustre_dissect_element_mds_rec_rename_rn_padding_4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mds_rec_rename_rn_padding_4); + + return offset; +} + +int +lustre_dissect_struct_mds_rec_rename(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + guint32 rn_opcode ; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_rec_rename); + } + + rn_opcode=tvb_get_letohl(tvb,offset); + display_info_fstr(parent_tree->parent, pinfo->cinfo, COL_INFO, "[%s]", val_to_str(rn_opcode, lustre_mds_reint_t_vals, "Unknown rn_opc")); + + offset=lustre_dissect_element_mds_rec_rename_rn_opcode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_rename_rn_fsuid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_rename_rn_fsgid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_rename_rn_cap(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_rec_rename_rn_suppgid1(tvb, offset, pinfo, tree); + + 
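+ /* The remaining fields are dissected strictly in IDL declaration order;
+  * every element helper advances the same running offset, so skipping or
+  * reordering a call here would desynchronize all following fields. */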
+ offset=lustre_dissect_element_mds_rec_rename_rn_suppgid2(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_fid1(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_fid2(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_time(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_padding_1(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_padding_2(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_padding_3(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_mds_rec_rename_rn_padding_4(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* TODO: find out where this structure appears on the wire */
+
+/* IDL: struct lov_desc { */
+/* IDL: uint32 ld_tgt_count; */
+/* IDL: uint32 ld_active_tgt_count; */
+/* IDL: uint32 ld_default_stripe_count; */
+/* IDL: uint32 ld_pattern; */
+/* IDL: uint64 ld_default_stripe_size; */
+/* IDL: uint64 ld_default_stripe_offset; */
+/* IDL: uint32 ld_padding_0; */
+/* IDL: uint32 ld_qos_maxage; */
+/* IDL: uint32 ld_padding_1; */
+/* IDL: uint32 ld_padding_2; */
+/* IDL: struct obd_uuid { */
+/* IDL: } ld_uuid; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_lov_desc_ld_tgt_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_tgt_count);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_active_tgt_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_active_tgt_count);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_default_stripe_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_default_stripe_count);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_pattern(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_pattern);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_default_stripe_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_default_stripe_size);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_default_stripe_offset(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_default_stripe_offset);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_padding_0(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_padding_0);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_qos_maxage(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_qos_maxage);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_padding_1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, 
tree, hf_lustre_lov_desc_ld_padding_1);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_desc_ld_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_desc_ld_padding_2);
+
+ return offset;
+}
+
+/* forward declaration; the struct dissector is defined just below */
+static int lustre_dissect_struct_obd_uuid(tvbuff_t *tvb, int offset, packet_info *pinfo, proto_tree *parent_tree, int hf_index);
+
+static int
+lustre_dissect_element_lov_desc_ld_uuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ /* ld_uuid is a struct obd_uuid (see the IDL above); the code generator
+  * had emitted a bogus lustre_dissect_struct_HASH(...) call here, which
+  * left the field undissected. */
+ offset=lustre_dissect_struct_obd_uuid(tvb,offset,pinfo,tree,hf_lustre_lov_desc_ld_uuid);
+ return offset;
+}
+
+int
+lustre_dissect_struct_lov_desc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_lov_desc);
+ }
+
+ offset=lustre_dissect_element_lov_desc_ld_tgt_count(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_active_tgt_count(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_default_stripe_count(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_pattern(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_default_stripe_size(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_default_stripe_offset(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_padding_0(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_qos_maxage(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_padding_1(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_padding_2(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_lov_desc_ld_uuid(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* IDL: struct obd_uuid { */
+/* IDL: char uuid[40]; */
+/* IDL: } */
+
+static int
+lustre_dissect_struct_obd_uuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ /* the string is at most 40 bytes long, NUL-terminated within the buffer */
+ proto_tree_add_item(parent_tree, hf_index, tvb, offset, 40, TRUE);
+
+ offset+=40;
+ return offset;
+}
+
+/* IDL: struct ldlm_res_id { */
+/* IDL: uint64 name[4]; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ldlm_res_id_name_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_res_id_name);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_res_id_name(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ int i;
+ for (i = 0; i < 4; i++)
+   offset=lustre_dissect_element_ldlm_res_id_name_(tvb, offset, pinfo, tree);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_ldlm_res_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_ldlm_res_id);
+ }
+
+ offset=lustre_dissect_element_ldlm_res_id_name(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return 
offset; +} + + +/* IDL: enum { */ +/* IDL: LCK_MINMODE=0, */ +/* IDL: LCK_EX=1, */ +/* IDL: LCK_PW=2, */ +/* IDL: LCK_PR=4, */ +/* IDL: LCK_CW=8, */ +/* IDL: LCK_CR=16, */ +/* IDL: LCK_NL=32, */ +/* IDL: LCK_GROUP=64, */ +/* IDL: LCK_MAXMODE, */ +/* IDL: } */ + +int +lustre_dissect_enum_ldlm_mode_t(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_, int hf_index _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_index); + return offset; +} + + +/* IDL: enum { */ +/* IDL: LDLM_PLAIN=10, */ +/* IDL: LDLM_EXTENT=11, */ +/* IDL: LDLM_FLOCK=12, */ +/* IDL: LDLM_IBITS=13, */ +/* IDL: LDLM_MAX_TYPE, */ +/* IDL: } */ + +int +lustre_dissect_enum_ldlm_type_t(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_, int hf_index _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_resource_desc_lr_type ); + return offset; +} + +/* IDL: struct ldlm_extent { */ +/* IDL: uint64 start; */ +/* IDL: uint64 end; */ +/* IDL: uint64 gid; */ +/* IDL: } */ + +static int +lustre_dissect_element_ldlm_extent_start(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_extent_start); + + return offset; +} + +static int +lustre_dissect_element_ldlm_extent_end(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_extent_end); + + return offset; +} + +static int +lustre_dissect_element_ldlm_extent_gid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_extent_gid); + + return offset; +} + +int +lustre_dissect_struct_ldlm_extent(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ldlm_extent); + } + + offset=lustre_dissect_element_ldlm_extent_start(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_extent_end(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_extent_gid(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct ldlm_inodebits { */ +/* IDL: uint64 bits; */ +/* IDL: } */ + +static int +lustre_dissect_element_ldlm_inodebits_bits(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_inodebits_bits); + + return offset; +} + +int +lustre_dissect_struct_ldlm_inodebits(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ldlm_inodebits); + } + + offset=lustre_dissect_element_ldlm_inodebits_bits(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct ldlm_flock { */ +/* IDL: uint64 start; */ +/* IDL: uint64 end; */ +/* IDL: uint64 blocking_export; */ +/* IDL: uint32 blocking_pid; */ +/* IDL: uint32 pid; */ +/* 
IDL: } */ + +static int +lustre_dissect_element_ldlm_flock_start(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_flock_start); + + return offset; +} + +static int +lustre_dissect_element_ldlm_flock_end(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_flock_end); + + return offset; +} + +static int +lustre_dissect_element_ldlm_flock_blocking_export(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_flock_blocking_export); + + return offset; +} + +static int +lustre_dissect_element_ldlm_flock_blocking_pid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_flock_blocking_pid); + + return offset; +} + +static int +lustre_dissect_element_ldlm_flock_pid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_flock_pid); + + return offset; +} + +int +lustre_dissect_struct_ldlm_flock(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ldlm_flock); + } + + offset=lustre_dissect_element_ldlm_flock_start(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_flock_end(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_flock_blocking_export(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_flock_blocking_pid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_flock_pid(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct ldlm_intent { */ +/* IDL: uint64 opc; */ +/* IDL: } */ + +static int +lustre_dissect_element_ldlm_intent_opc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + guint32 opcode; + /* this opcode is like a flag*/ + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_open ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_creat ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_readdir ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_getattr ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_lookup ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_unlink ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_getxattr); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_exec ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent_opc_pin ); + opcode = tvb_get_letohl(tvb,offset); + + display_info_str(tree->parent, pinfo->cinfo, COL_INFO, "[ intent :"); + if(opcode & IT_OPEN ){ + display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " open"); + proto_item_append_text(tree, " open" ); + } + if(opcode & IT_CREAT ){ + display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " create"); + proto_item_append_text(tree, " create" ); + } + if(opcode & IT_READDIR ){ + display_info_str(tree->parent, 
pinfo->cinfo, COL_INFO, " readdir");
+   proto_item_append_text(tree, " readdir" );
+ }
+ if(opcode & IT_GETATTR ){
+   display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " getattr");
+   proto_item_append_text(tree, " getattr" );
+ }
+ if(opcode & IT_LOOKUP ){
+   display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " lookup");
+   proto_item_append_text(tree, " lookup" );
+ }
+ if(opcode & IT_UNLINK ){
+   display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " unlink");
+   proto_item_append_text(tree, " unlink" );
+ }
+ if(opcode & IT_GETXATTR){
+   display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " getxattr");
+   proto_item_append_text(tree, " getxattr" );
+ }
+ if(opcode & IT_EXEC ){
+   display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " exec");
+   proto_item_append_text(tree, " exec" );
+ }
+ if(opcode & IT_PIN ){
+   display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " pin");
+   proto_item_append_text(tree, " pin" );
+ }
+
+ display_info_str(tree->parent, pinfo->cinfo, COL_INFO, " ]");
+ offset+=8;
+ return offset;
+}
+
+int
+lustre_dissect_struct_ldlm_intent(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ // proto_item *item = NULL;
+ // proto_tree *tree = NULL;
+ int old_offset;
+ guint64 intent_opc;
+ proto_item * opcode_item = NULL;
+ proto_tree * opcode_tree = NULL;
+
+ old_offset=offset;
+ // if (parent_tree) {
+ //   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+ //   tree = proto_item_add_subtree(item, ett_lustre_ldlm_intent);
+ //   //tree = parent_tree; /* TODO: needs review; done hastily to avoid
+ //   //creating the subtree */
+ // }
+
+ intent_opc = tvb_get_letoh64(tvb,offset);
+ if (parent_tree){
+   opcode_item = proto_tree_add_item(parent_tree, hf_lustre_ldlm_intent_opc, tvb, offset, 8, TRUE);
+   opcode_tree = proto_item_add_subtree(opcode_item, ett_lustre_ldlm_intent_opc);
+ }
+
+ offset=lustre_dissect_element_ldlm_intent_opc(tvb, offset, pinfo, opcode_tree);
+
+ offset=ldlm_opcode_process(tvb, offset, pinfo, parent_tree, intent_opc);
+
+ return offset;
+}
+
+/* IDL: struct ldlm_resource_desc { */
+/* IDL: ldlm_type_t lr_type; */
+/* IDL: uint32 lr_padding; */
+/* IDL: struct ldlm_res_id { */
+/* IDL: } lr_name; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ldlm_resource_desc_lr_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ /* offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_resource_desc_lr_type); */
+ offset=lustre_dissect_enum_ldlm_type_t(tvb, offset, pinfo, tree, hf_lustre_ldlm_resource_desc_lr_type);
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_resource_desc_lr_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_resource_desc_lr_padding);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_resource_desc_lr_name(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_struct_ldlm_res_id(tvb,offset,pinfo,tree,hf_lustre_ldlm_resource_desc_lr_name);
+ return offset;
+}
+
+int
+lustre_dissect_struct_ldlm_resource_desc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, 
offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_ldlm_resource_desc);
+ }
+
+ offset=lustre_dissect_element_ldlm_resource_desc_lr_type(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_ldlm_resource_desc_lr_padding(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_ldlm_resource_desc_lr_name(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* IDL: struct ldlm_lock_desc { */
+/* IDL: struct ldlm_resource_desc { */
+/* IDL: } l_resource; */
+/* IDL: ldlm_mode_t l_req_mode; */
+/* IDL: ldlm_mode_t l_granted_mode; */
+/* IDL: ldlm_policy_data_t l_policy_data; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ldlm_lock_desc_l_resource(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_struct_ldlm_resource_desc(tvb,offset,pinfo,tree,hf_lustre_ldlm_lock_desc_l_resource);
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_lock_desc_l_req_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_enum_ldlm_mode_t(tvb, offset, pinfo, tree, hf_lustre_ldlm_lock_desc_l_req_mode);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_lock_desc_l_granted_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_enum_ldlm_mode_t(tvb, offset, pinfo, tree, hf_lustre_ldlm_lock_desc_l_granted_mode);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_lock_desc_l_policy_data(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ guint32 lr_type; /* type of the lock */
+
+ lr_type = tvb_get_letohl(tvb,offset - 48 ); /* TODO: find a more robust way than peeking 48 bytes back */
+
+ /* this section must be checked against the Lustre code; possible mistakes here */
+ switch(lr_type){
+   case LDLM_PLAIN:
+   case LDLM_FLOCK:
+     offset=lustre_dissect_struct_ldlm_flock(tvb,offset,pinfo,tree,hf_lustre_ldlm_lock_desc_l_policy_data);
+     break;
+   case LDLM_EXTENT :
+     offset= lustre_dissect_struct_ldlm_extent(tvb,offset,pinfo,tree,hf_lustre_ldlm_lock_desc_l_policy_data);
+     /* add 8 bytes of extra padding */
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     break;
+   case LDLM_IBITS:
+     offset=lustre_dissect_struct_ldlm_inodebits(tvb,offset,pinfo,tree,hf_lustre_ldlm_lock_desc_l_policy_data);
+     /* add 24 bytes of extra padding */
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     break;
+   default:
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_extra_padding);
+     break;
+ }
+ return offset;
+}
+
+int
+lustre_dissect_struct_ldlm_lock_desc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+ guint32 lock_req_mode;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_ldlm_lock_desc);
+ }
+
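+ /* Wire-layout note, derived from the IDL comments above: l_resource is
+  * 40 bytes (lr_type 4 + lr_padding 4 + ldlm_res_id name[4] = 32), and
+  * l_req_mode plus l_granted_mode add 8 more.  Those 48 bytes are exactly
+  * what lustre_dissect_element_ldlm_lock_desc_l_policy_data() peeks back
+  * over (offset - 48) to recover lr_type. */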
+ offset=lustre_dissect_element_ldlm_lock_desc_l_resource(tvb, offset, pinfo, tree);
+
+ lock_req_mode = tvb_get_letohl(tvb,offset);
+
+ display_info_fstr(parent_tree->parent, pinfo->cinfo, COL_INFO, "[%s]", val_to_str(lock_req_mode, lustre_ldlm_mode_t_vals, "Unknown lock"));
+
+ offset=lustre_dissect_element_ldlm_lock_desc_l_req_mode(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_ldlm_lock_desc_l_granted_mode(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_ldlm_lock_desc_l_policy_data(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* IDL: struct ldlm_request { */
+/* IDL: uint32 lock_flags; */
+/* IDL: uint32 lock_count; */
+/* IDL: struct ldlm_lock_desc { */
+/* IDL: } lock_desc; */
+/* IDL: struct lustre_handle { */
+/* IDL: } lock_handle[2]; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ldlm_request_lock_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_request_lock_count);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_request_lock_desc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_struct_ldlm_lock_desc(tvb,offset,pinfo,tree,hf_lustre_ldlm_request_lock_desc);
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_request_lock_handle_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ /*if (check_col(pinfo->cinfo, COL_INFO)) */
+ /*  col_append_fstr(pinfo->cinfo, COL_INFO, " ldlm cookie : %" G_GINT64_MODIFIER "u", tvb_get_letoh64(tvb,offset) );*/
+
+ offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_ldlm_request_lock_handle);
+ return offset;
+}
+
+static int
+lustre_dissect_element_ldlm_request_lock_handle(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ int i;
+ for (i = 0; i < 2; i++)
+   offset=lustre_dissect_element_ldlm_request_lock_handle_(tvb, offset, pinfo, tree);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_ldlm_request(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+ guint32 lock_flag;
+ guint32 ldlm_type;
+
+ old_offset=offset;
+
+ if (parent_tree) {
+   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+   tree = proto_item_add_subtree(item, ett_lustre_ldlm_request);
+ }
+
+ lock_flag = tvb_get_letohl(tvb,offset);
+ offset=lustre_dissect_element_ldlm_lock_flags(tvb, offset, pinfo, tree, hf_lustre_ldlm_request_lock_flags);
+
+ offset=lustre_dissect_element_ldlm_request_lock_count(tvb, offset, pinfo, tree);
+
+ /* ldlm_type = (EXTENT, PLAIN, or IBITS) */
+ ldlm_type = tvb_get_letohl(tvb,offset);
+ offset=lustre_dissect_element_ldlm_request_lock_desc(tvb, offset, pinfo, tree);
+ offset=lustre_dissect_element_ldlm_request_lock_handle(tvb, offset, pinfo, tree);
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* IDL: struct ldlm_reply { */
+/* IDL: uint32 lock_flags; */
+/* IDL: uint32 lock_padding; */
+/* IDL: struct ldlm_lock_desc { */
+/* IDL: } lock_desc; */
+/* IDL: struct lustre_handle { */
+/* IDL: } lock_handle; */
+/* IDL: uint64 lock_policy_res1; */
+/* IDL: uint64 lock_policy_res2; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_ldlm_lock_flags(tvbuff_t *tvb _U_, int offset _U_, 
packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree,hf_index, tvb, offset, 4, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ldlm_lock_flags); + } + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_ast_discard_data); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_deny_on_contention); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_bl_done ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_bl_ast ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_atomic_cb ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_cleaned ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_cp_reqd ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_cancel_on_block ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_no_lru ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_kms_ignore ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_lvb_ready ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_test_lock ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_block_nowait ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_no_timeout ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_discard_data ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_warn ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_local ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_canceling ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_has_intent ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_failed ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_local_only ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_intent_only ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_replay ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_cancel ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_wait_noreproc ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_ast_sent ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_cbpending ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_block_wait ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_block_conv ); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_block_granted ); + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_fl_lock_changed ); + return offset; +} + +static int +lustre_dissect_element_ldlm_reply_lock_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ldlm_reply_lock_padding); + + return offset; +} + +static int +lustre_dissect_element_ldlm_reply_lock_desc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_ldlm_lock_desc(tvb,offset,pinfo,tree,hf_lustre_ldlm_reply_lock_desc); + return offset; +} + +static int +lustre_dissect_element_ldlm_reply_lock_handle(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + /*if (check_col(pinfo->cinfo, COL_INFO)) */ + /* col_append_fstr(pinfo->cinfo, COL_INFO, " ldlm cookie : %" G_GINT64_MODIFIER "u", tvb_get_letoh64(tvb,offset) );*/ + + 
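+ /* A lustre_handle carries a single opaque 64-bit cookie (note the
+  * tvb_get_letoh64() in the commented-out trace above); the helper below
+  * renders it and advances the offset accordingly. */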
offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_ldlm_reply_lock_handle); + return offset; +} + +static int +lustre_dissect_element_ldlm_reply_lock_policy_res1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_reply_lock_policy_res1); + + return offset; +} + +static int +lustre_dissect_element_ldlm_reply_lock_policy_res2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ldlm_reply_lock_policy_res2); + + return offset; +} + +int +lustre_dissect_struct_ldlm_reply(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_, guint32 * ldlm_type _U_ ) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + guint32 lock_flag ; + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ldlm_reply); + } + + lock_flag = tvb_get_letohl(tvb,offset); + offset=lustre_dissect_element_ldlm_lock_flags(tvb, offset, pinfo, tree, hf_lustre_ldlm_reply_lock_flags); + + offset=lustre_dissect_element_ldlm_reply_lock_padding(tvb, offset, pinfo, tree); + + if (ldlm_type != NULL) + *ldlm_type = tvb_get_letohl(tvb,offset); + offset=lustre_dissect_element_ldlm_reply_lock_desc(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_reply_lock_handle(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_reply_lock_policy_res1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_ldlm_reply_lock_policy_res2(tvb, offset, pinfo, tree); + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct mgs_send_param { */ +/* IDL: uint8 mgs_param[1024]; */ +/* IDL: } */ + + + +static int +lustre_dissect_element_mgs_send_param_mgs_param_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_mgs_send_param_mgs_param); + + return offset; +} + +static int +lustre_dissect_element_mgs_send_param_mgs_param(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 1024; i++) + offset=lustre_dissect_element_mgs_send_param_mgs_param_(tvb, offset, pinfo, tree); + + return offset; +} + +int +lustre_dissect_struct_mgs_send_param(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mgs_send_param); + } + + offset=lustre_dissect_element_mgs_send_param_mgs_param(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct mgs_target_info { */ +/* IDL: uint32 mti_lustre_ver; */ +/* IDL: uint32 mti_stripe_index; */ +/* IDL: uint32 mti_config_ver; */ +/* IDL: uint32 mti_flags; */ +/* IDL: uint32 mti_nid_count; */ +/* IDL: uint32 padding; */ +/* IDL: uint8 mti_fsname[64]; */ +/* IDL: uint8 mti_svname[64]; */ +/* IDL: uint8 mti_uuid[40]; */ +/* IDL: uint64 mti_nids[32]; */ +/* IDL: uint8 mti_params[4096]; */ +/* IDL: } */ + +static int +lustre_dissect_element_mgs_target_info_mti_lustre_ver(tvbuff_t *tvb 
_U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_lustre_ver); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_stripe_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_stripe_index); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_config_ver(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_config_ver); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_flags); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_nid_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_nid_count); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_padding); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_fsname_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_fsname); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_fsname(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 64; i++) + offset=lustre_dissect_element_mgs_target_info_mti_fsname_(tvb, offset, pinfo, tree); + + return offset; +} + + + + + +static int +lustre_dissect_element_mgs_target_info_mti_svname_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_svname); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_svname(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 64; i++) + offset=lustre_dissect_element_mgs_target_info_mti_svname_(tvb, offset, pinfo, tree); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_uuid_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_uuid); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_uuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 40; i++) + offset=lustre_dissect_element_mgs_target_info_mti_uuid_(tvb, offset, pinfo, tree); + + return offset; +} + + + + + +static int +lustre_dissect_element_mgs_target_info_mti_nids_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_nids); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_nids(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, 
proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 32; i++) + offset=lustre_dissect_element_mgs_target_info_mti_nids_(tvb, offset, pinfo, tree); + + return offset; +} + + + +static int +lustre_dissect_element_mgs_target_info_mti_params_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info_mti_params); + + return offset; +} + +static int +lustre_dissect_element_mgs_target_info_mti_params(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 4096; i++) + offset=lustre_dissect_element_mgs_target_info_mti_params_(tvb, offset, pinfo, tree); + + return offset; +} + +int +lustre_dissect_struct_mgs_target_info(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mgs_target_info); + } + + offset=lustre_dissect_element_mgs_target_info_mti_lustre_ver(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_stripe_index(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_config_ver(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_nid_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_padding(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_fsname(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_svname(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_uuid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_nids(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mgs_target_info_mti_params(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct cfg_marker { */ +/* IDL: uint32 cm_step; */ +/* IDL: uint32 cm_flags; */ +/* IDL: uint32 cm_vers; */ +/* IDL: uint32 padding; */ +/* IDL: time_t cm_createtime; */ +/* IDL: time_t cm_canceltime; */ +/* IDL: uint8 cm_tgtname[64]; */ +/* IDL: uint8 cm_comment[64]; */ +/* IDL: } */ + +static int +lustre_dissect_element_cfg_marker_cm_step(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_cfg_marker_cm_step); + + return offset; +} + +static int +lustre_dissect_element_cfg_marker_cm_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_cfg_marker_cm_flags); + + return offset; +} + +static int +lustre_dissect_element_cfg_marker_cm_vers(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_cfg_marker_cm_vers); + + return offset; +} + +static int +lustre_dissect_element_cfg_marker_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_cfg_marker_padding); + + return offset; +} + +static int +lustre_dissect_element_cfg_marker_cm_createtime(tvbuff_t *tvb 
_U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ nstime_t ns;
+ //offset=dissect_ndr_time_t(tvb, offset, pinfo,tree, hf_lustre_cfg_marker_cm_createtime );
+ /* dissect_ndr_time_t() above did not work; assume the on-wire time_t
+  * is a 64-bit little-endian value, as done for llh_timestamp below,
+  * so that the following cfg_marker fields stay aligned */
+ ns.secs = tvb_get_letohl(tvb,offset);
+ ns.nsecs = 0;
+ proto_tree_add_time(tree, hf_lustre_cfg_marker_cm_createtime, tvb, offset, 8, &ns);
+ offset+=8;
+ return offset;
+}
+
+static int
+lustre_dissect_element_cfg_marker_cm_canceltime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ nstime_t ns;
+ //offset=dissect_ndr_time_t(tvb, offset, pinfo,tree, hf_lustre_cfg_marker_cm_canceltime);
+ /* same 64-bit on-wire time assumption as for cm_createtime above */
+ ns.secs = tvb_get_letohl(tvb,offset);
+ ns.nsecs = 0;
+ proto_tree_add_time(tree, hf_lustre_cfg_marker_cm_canceltime, tvb, offset, 8, &ns);
+ offset+=8;
+ return offset;
+}
+
+
+
+static int
+lustre_dissect_element_cfg_marker_cm_tgtname_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_cfg_marker_cm_tgtname);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_cfg_marker_cm_tgtname(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ int i;
+ for (i = 0; i < 64; i++)
+ offset=lustre_dissect_element_cfg_marker_cm_tgtname_(tvb, offset, pinfo, tree);
+
+ return offset;
+}
+
+
+
+static int
+lustre_dissect_element_cfg_marker_cm_comment_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint8(tvb, offset, pinfo, tree, hf_lustre_cfg_marker_cm_comment);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_cfg_marker_cm_comment(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ int i;
+ for (i = 0; i < 64; i++)
+ offset=lustre_dissect_element_cfg_marker_cm_comment_(tvb, offset, pinfo, tree);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_cfg_marker(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+
+
+ old_offset=offset;
+
+ if (parent_tree) {
+ item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+ tree = proto_item_add_subtree(item, ett_lustre_cfg_marker);
+ }
+
+ offset=lustre_dissect_element_cfg_marker_cm_step(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_cm_flags(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_cm_vers(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_padding(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_cm_createtime(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_cm_canceltime(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_cm_tgtname(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_cfg_marker_cm_comment(tvb, offset, pinfo, tree);
+
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+
+
+/* IDL: struct llog_logid { */
+/* IDL: uint64 lgl_oid; */
+/* IDL: uint64 lgl_ogr; */
+/* IDL: uint32 lgl_ogen; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_llog_logid_lgl_oid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_logid_lgl_oid);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_llog_logid_lgl_ogr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_logid_lgl_ogr);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_llog_logid_lgl_ogen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_logid_lgl_ogen);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_llog_logid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+
+
+ old_offset=offset;
+
+ if (parent_tree) {
+ item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+ tree = proto_item_add_subtree(item, ett_lustre_llog_logid);
+ }
+
+ offset=lustre_dissect_element_llog_logid_lgl_oid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llog_logid_lgl_ogr(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llog_logid_lgl_ogen(tvb, offset, pinfo, tree);
+
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+/* It is not clear when this structure appears on the wire. */
+/* IDL: struct llog_catid { */
+/* IDL: struct llog_logid { */
+/* IDL: } lci_logid; */
+/* IDL: uint32 lci_padding1; */
+/* IDL: uint32 lci_padding2; */
+/* IDL: uint32 lci_padding3; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_llog_catid_lci_logid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ //offset=lustre_dissect_struct_HASH(0x85cc76c)(tvb,offset,pinfo,tree,hf_lustre_llog_catid_lci_logid);
+ /* the generator could not resolve the struct above; per the IDL,
+  * lci_logid is a struct llog_logid, so reuse that helper */
+ offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_llog_catid_lci_logid);
+ return offset;
+}
+
+static int
+lustre_dissect_element_llog_catid_lci_padding1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_catid_lci_padding1);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_llog_catid_lci_padding2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_catid_lci_padding2);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_llog_catid_lci_padding3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_catid_lci_padding3);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_llog_catid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+
+
+ old_offset=offset;
+
+ if (parent_tree) {
+ item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+ tree = proto_item_add_subtree(item, ett_lustre_llog_catid);
+ }
+
+ offset=lustre_dissect_element_llog_catid_lci_logid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llog_catid_lci_padding1(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llog_catid_lci_padding2(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llog_catid_lci_padding3(tvb, offset, pinfo, tree);
+
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+
+/* IDL: struct lov_mds_md_join { */
+/* IDL: struct lov_mds_md { */
+/* IDL: } lmmj_md; */
+/* IDL: struct llog_logid { */
+/* IDL: } lmmj_array_id; */
+/* IDL: uint32 lmmj_extent_count; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_lov_mds_md_join_lmmj_md(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=lustre_dissect_struct_lov_mds_md_v1(tvb,offset,pinfo,tree,hf_lustre_lov_mds_md_join_lmmj_md);
+ return offset;
+}
+
+static int
+lustre_dissect_element_lov_mds_md_join_lmmj_array_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_lov_mds_md_join_lmmj_array_id); + return offset; +} + +static int +lustre_dissect_element_lov_mds_md_join_lmmj_extent_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_mds_md_join_lmmj_extent_count); + + return offset; +} + +int +lustre_dissect_struct_lov_mds_md_join(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_lov_mds_md_join); + } + + offset=lustre_dissect_element_lov_mds_md_join_lmmj_md(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_join_lmmj_array_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_mds_md_join_lmmj_extent_count(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct llog_rec_hdr { */ +/* IDL: uint32 lrh_len; */ +/* IDL: uint32 lrh_index; */ +/* IDL: uint32 lrh_type; */ +/* IDL: uint32 padding; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_rec_hdr_lrh_len(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_rec_hdr_lrh_len); + + return offset; +} + +static int +lustre_dissect_element_llog_rec_hdr_lrh_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_rec_hdr_lrh_index); + + return offset; +} + +static int +lustre_dissect_element_llog_rec_hdr_lrh_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_rec_hdr_lrh_type); + + return offset; +} + +static int +lustre_dissect_element_llog_rec_hdr_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_rec_hdr_padding); + + return offset; +} + +int +lustre_dissect_struct_llog_rec_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_rec_hdr); + } + + offset=lustre_dissect_element_llog_rec_hdr_lrh_len(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_rec_hdr_lrh_index(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_rec_hdr_lrh_type(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_rec_hdr_padding(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct llog_rec_tail { */ +/* IDL: uint32 lrt_len; */ +/* IDL: uint32 lrt_index; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_rec_tail_lrt_len(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + 
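+ /* note: lrt_len duplicates lrh_len from the record header so that
+  * llog records can also be walked backwards */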
offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_rec_tail_lrt_len); + + return offset; +} + +static int +lustre_dissect_element_llog_rec_tail_lrt_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_rec_tail_lrt_index); + + return offset; +} + +int +lustre_dissect_struct_llog_rec_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_rec_tail); + } + + offset=lustre_dissect_element_llog_rec_tail_lrt_len(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_rec_tail_lrt_index(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct llog_logid_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lid_hdr; */ +/* IDL: struct llog_logid { */ +/* IDL: } lid_id; */ +/* IDL: uint32 padding1; */ +/* IDL: uint32 padding2; */ +/* IDL: uint32 padding3; */ +/* IDL: uint32 padding4; */ +/* IDL: uint32 padding5; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lid_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_logid_rec_lid_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_rec_hdr(tvb,offset,pinfo,tree,hf_lustre_llog_logid_rec_lid_hdr); + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_lid_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_llog_logid_rec_lid_id); + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_padding1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_logid_rec_padding1); + + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_padding2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_logid_rec_padding2); + + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_padding3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_logid_rec_padding3); + + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_padding4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_logid_rec_padding4); + + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_padding5(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_logid_rec_padding5); + + return offset; +} + +static int +lustre_dissect_element_llog_logid_rec_lid_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_rec_tail(tvb,offset,pinfo,tree,hf_lustre_llog_logid_rec_lid_tail); + return offset; +} + +int +lustre_dissect_struct_llog_logid_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo 
_U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_logid_rec); + } + + offset=lustre_dissect_element_llog_logid_rec_lid_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_lid_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_padding1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_padding2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_padding3(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_padding4(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_padding5(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_logid_rec_lid_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct mds_extent_desc { */ +/* IDL: uint64 med_start; */ +/* IDL: uint64 med_len; */ +/* IDL: struct lov_mds_md { */ +/* IDL: } med_lmm; */ +/* IDL: } */ + +static int +lustre_dissect_element_mds_extent_desc_med_start(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_extent_desc_med_start); + + return offset; +} + +static int +lustre_dissect_element_mds_extent_desc_med_len(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_mds_extent_desc_med_len); + + return offset; +} + +static int +lustre_dissect_element_mds_extent_desc_med_lmm(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d3578)(tvb,offset,pinfo,tree,hf_lustre_mds_extent_desc_med_lmm); + /*g_print("bug\n");*/ + return offset; +} + +int +lustre_dissect_struct_mds_extent_desc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_mds_extent_desc); + } + + offset=lustre_dissect_element_mds_extent_desc_med_start(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_extent_desc_med_len(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_mds_extent_desc_med_lmm(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* IDL: struct llog_array_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lmr_hdr; */ +/* IDL: struct mds_extent_desc { */ +/* IDL: } lmr_med; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lmr_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_array_rec_lmr_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d374c)(tvb,offset,pinfo,tree,hf_lustre_llog_array_rec_lmr_hdr); + + return offset; +} + +static int +lustre_dissect_element_llog_array_rec_lmr_med(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + 
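+ /* the generator left this struct reference unresolved (see the
+  * placeholder below); per the IDL above, lmr_med is a struct
+  * mds_extent_desc, so it can presumably be dissected with the
+  * helper defined earlier in this file */
+ offset=lustre_dissect_struct_mds_extent_desc(tvb,offset,pinfo,tree,hf_lustre_llog_array_rec_lmr_med);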
//offset=lustre_dissect_struct_HASH(0x85d4328)(tvb,offset,pinfo,tree,hf_lustre_llog_array_rec_lmr_med); + + return offset; +} + +static int +lustre_dissect_element_llog_array_rec_lmr_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d4478)(tvb,offset,pinfo,tree,hf_lustre_llog_array_rec_lmr_tail); + + return offset; +} + +int +lustre_dissect_struct_llog_array_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_array_rec); + } + + offset=lustre_dissect_element_llog_array_rec_lmr_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_array_rec_lmr_med(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_array_rec_lmr_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct llog_create_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lcr_hdr; */ +/* IDL: struct ll_fid { */ +/* IDL: } lcr_fid; */ +/* IDL: uint64 lcr_oid; */ +/* IDL: uint32 lcr_ogen; */ +/* IDL: uint32 padding; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lcr_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_create_rec_lcr_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d56e0)(tvb,offset,pinfo,tree,hf_lustre_llog_create_rec_lcr_hdr); + + return offset; +} + +static int +lustre_dissect_element_llog_create_rec_lcr_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d5830)(tvb,offset,pinfo,tree,hf_lustre_llog_create_rec_lcr_fid); + + return offset; +} + +static int +lustre_dissect_element_llog_create_rec_lcr_oid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_create_rec_lcr_oid); + + return offset; +} + +static int +lustre_dissect_element_llog_create_rec_lcr_ogen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_create_rec_lcr_ogen); + + return offset; +} + +static int +lustre_dissect_element_llog_create_rec_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_create_rec_padding); + + return offset; +} + +static int +lustre_dissect_element_llog_create_rec_lcr_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d69dc)(tvb,offset,pinfo,tree,hf_lustre_llog_create_rec_lcr_tail); + + return offset; +} + +int +lustre_dissect_struct_llog_create_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_create_rec); + } + + offset=lustre_dissect_element_llog_create_rec_lcr_hdr(tvb, offset, pinfo, 
tree); + + offset=lustre_dissect_element_llog_create_rec_lcr_fid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_create_rec_lcr_oid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_create_rec_lcr_ogen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_create_rec_padding(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_create_rec_lcr_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct llog_orphan_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lor_hdr; */ +/* IDL: uint64 lor_oid; */ +/* IDL: uint32 lor_ogen; */ +/* IDL: uint32 padding; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lor_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_orphan_rec_lor_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d767c)(tvb,offset,pinfo,tree,hf_lustre_llog_orphan_rec_lor_hdr); + + return offset; +} + +static int +lustre_dissect_element_llog_orphan_rec_lor_oid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_orphan_rec_lor_oid); + + return offset; +} + +static int +lustre_dissect_element_llog_orphan_rec_lor_ogen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_orphan_rec_lor_ogen); + + return offset; +} + +static int +lustre_dissect_element_llog_orphan_rec_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_orphan_rec_padding); + + return offset; +} + +static int +lustre_dissect_element_llog_orphan_rec_lor_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d8550)(tvb,offset,pinfo,tree,hf_lustre_llog_orphan_rec_lor_tail); + return offset; +} + +int +lustre_dissect_struct_llog_orphan_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_orphan_rec); + } + + offset=lustre_dissect_element_llog_orphan_rec_lor_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_orphan_rec_lor_oid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_orphan_rec_lor_ogen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_orphan_rec_padding(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_orphan_rec_lor_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct llog_unlink_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lur_hdr; */ +/* IDL: uint64 lur_oid; */ +/* IDL: uint32 lur_ogen; */ +/* IDL: uint32 padding; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lur_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_unlink_rec_lur_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d8730)(tvb,offset,pinfo,tree,hf_lustre_llog_unlink_rec_lur_hdr); + return offset; +} + +static int 
+lustre_dissect_element_llog_unlink_rec_lur_oid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_unlink_rec_lur_oid); + + return offset; +} + +static int +lustre_dissect_element_llog_unlink_rec_lur_ogen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_unlink_rec_lur_ogen); + + return offset; +} + +static int +lustre_dissect_element_llog_unlink_rec_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_unlink_rec_padding); + + return offset; +} + +static int +lustre_dissect_element_llog_unlink_rec_lur_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85d9664)(tvb,offset,pinfo,tree,hf_lustre_llog_unlink_rec_lur_tail); + return offset; +} + +int +lustre_dissect_struct_llog_unlink_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_unlink_rec); + } + + offset=lustre_dissect_element_llog_unlink_rec_lur_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_unlink_rec_lur_oid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_unlink_rec_lur_ogen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_unlink_rec_padding(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_unlink_rec_lur_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct llog_setattr_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lsr_hdr; */ +/* IDL: uint64 lsr_oid; */ +/* IDL: uint32 lsr_ogen; */ +/* IDL: uint32 lsr_uid; */ +/* IDL: uint32 lsr_gid; */ +/* IDL: uint32 padding; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lsr_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_setattr_rec_lsr_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85da2f0)(tvb,offset,pinfo,tree,hf_lustre_llog_setattr_rec_lsr_hdr); + return offset; +} + +static int +lustre_dissect_element_llog_setattr_rec_lsr_oid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_setattr_rec_lsr_oid); + + return offset; +} + +static int +lustre_dissect_element_llog_setattr_rec_lsr_ogen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_setattr_rec_lsr_ogen); + + return offset; +} + +static int +lustre_dissect_element_llog_setattr_rec_lsr_uid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_setattr_rec_lsr_uid); + + return offset; +} + +static int +lustre_dissect_element_llog_setattr_rec_lsr_gid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_setattr_rec_lsr_gid); + + return 
offset; +} + +static int +lustre_dissect_element_llog_setattr_rec_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_setattr_rec_padding); + + return offset; +} + +static int +lustre_dissect_element_llog_setattr_rec_lsr_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85db3d4)(tvb,offset,pinfo,tree,hf_lustre_llog_setattr_rec_lsr_tail); + return offset; +} + +int +lustre_dissect_struct_llog_setattr_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_setattr_rec); + } + + offset=lustre_dissect_element_llog_setattr_rec_lsr_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_setattr_rec_lsr_oid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_setattr_rec_lsr_ogen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_setattr_rec_lsr_uid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_setattr_rec_lsr_gid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_setattr_rec_padding(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_setattr_rec_lsr_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct llog_size_change_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lsc_hdr; */ +/* IDL: struct ll_fid { */ +/* IDL: } lsc_fid; */ +/* IDL: uint32 lsc_io_epoch; */ +/* IDL: uint32 padding; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lsc_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_size_change_rec_lsc_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85dc458)(tvb,offset,pinfo,tree,hf_lustre_llog_size_change_rec_lsc_hdr); + return offset; +} + +static int +lustre_dissect_element_llog_size_change_rec_lsc_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85dc5a8)(tvb,offset,pinfo,tree,hf_lustre_llog_size_change_rec_lsc_fid); + return offset; +} + +static int +lustre_dissect_element_llog_size_change_rec_lsc_io_epoch(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_size_change_rec_lsc_io_epoch); + + return offset; +} + +static int +lustre_dissect_element_llog_size_change_rec_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_size_change_rec_padding); + + return offset; +} + +static int +lustre_dissect_element_llog_size_change_rec_lsc_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85dd384)(tvb,offset,pinfo,tree,hf_lustre_llog_size_change_rec_lsc_tail); + // TODO: to be corrected ! 
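+ /* assuming lsc_tail is the struct llog_rec_tail named in the IDL
+  * above, the existing helper should be able to dissect it */
+ offset=lustre_dissect_struct_llog_rec_tail(tvb,offset,pinfo,tree,hf_lustre_llog_size_change_rec_lsc_tail);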
+ return offset; +} + +int +lustre_dissect_struct_llog_size_change_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_size_change_rec); + } + + offset=lustre_dissect_element_llog_size_change_rec_lsc_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_size_change_rec_lsc_fid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_size_change_rec_lsc_io_epoch(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_size_change_rec_padding(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_size_change_rec_lsc_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct llog_gen { */ +/* IDL: uint64 mnt_cnt; */ +/* IDL: uint64 conn_cnt; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_gen_mnt_cnt(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_gen_mnt_cnt); + + return offset; +} + +static int +lustre_dissect_element_llog_gen_conn_cnt(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llog_gen_conn_cnt); + + return offset; +} + +int +lustre_dissect_struct_llog_gen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_gen); + } + + offset=lustre_dissect_element_llog_gen_mnt_cnt(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_gen_conn_cnt(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* IDL: struct llog_gen_rec { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } lgr_hdr; */ +/* IDL: struct llog_gen { */ +/* IDL: } lgr_gen; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } lgr_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_gen_rec_lgr_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85de2e4)(tvb,offset,pinfo,tree,hf_lustre_llog_gen_rec_lgr_hdr); + // TODO: to be corrected ! + return offset; +} + +static int +lustre_dissect_element_llog_gen_rec_lgr_gen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85de434)(tvb,offset,pinfo,tree,hf_lustre_llog_gen_rec_lgr_gen); + // TODO: to be corrected ! + return offset; +} + +static int +lustre_dissect_element_llog_gen_rec_lgr_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=lustre_dissect_struct_HASH(0x85df3f8)(tvb,offset,pinfo,tree,hf_lustre_llog_gen_rec_lgr_tail); + // TODO: to be corrected ! 
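+ /* as above, the IDL declares lgr_tail as a struct llog_rec_tail,
+  * so the existing helper should apply here as well */
+ offset=lustre_dissect_struct_llog_rec_tail(tvb,offset,pinfo,tree,hf_lustre_llog_gen_rec_lgr_tail);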
+ return offset; +} + +int +lustre_dissect_struct_llog_gen_rec(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_gen_rec); + } + + offset=lustre_dissect_element_llog_gen_rec_lgr_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_gen_rec_lgr_gen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_gen_rec_lgr_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct llog_log_hdr { */ +/* IDL: struct llog_rec_hdr { */ +/* IDL: } llh_hdr; */ +/* IDL: uint64 llh_timestamp; */ +/* IDL: uint32 llh_count; */ +/* IDL: uint32 llh_bitmap_offset; */ +/* IDL: uint32 llh_size; */ +/* IDL: uint32 llh_flags; */ +/* IDL: uint32 llh_cat_idx; */ +/* IDL: struct obd_uuid { */ +/* IDL: } llh_tgtuuid; */ +/* IDL: uint32 llh_reserved[1]; */ +/* IDL: uint32 llh_bitmap[2024]; */ +/* IDL: struct llog_rec_tail { */ +/* IDL: } llh_tail; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_log_hdr_llh_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_rec_hdr(tvb,offset,pinfo,tree,hf_lustre_llog_log_hdr_llh_hdr); + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_timestamp(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + nstime_t ns; + ns.secs = tvb_get_letohl(tvb,offset); + ns.nsecs=0; + proto_tree_add_time(tree, hf_lustre_llog_log_hdr_llh_timestamp, tvb, offset, 8, &ns ); + offset+=8; + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_count); + + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_bitmap_offset(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_bitmap_offset); + + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_size); + + return offset; +} + + + +static int +lustre_dissect_element_llog_log_llh_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree,hf_index, tvb, offset, 4, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_log_llh_flags); + } + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_hdr_llh_flag_zap_when_empty); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_hdr_llh_flag_is_cat); + dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_hdr_llh_flag_is_play); + + offset+=4; + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + // offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_flags); + 
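+ /* llh_flags is decoded bit by bit into its own subtree by the
+  * helper above rather than shown as a plain uint32 */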
offset=lustre_dissect_element_llog_log_llh_flags(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_flags); + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_cat_idx(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_cat_idx); + + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_tgtuuid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_obd_uuid(tvb,offset,pinfo,tree,hf_lustre_llog_log_hdr_llh_tgtuuid); + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_reserved_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_reserved); + + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_reserved(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 1; i++) + offset=lustre_dissect_element_llog_log_hdr_llh_reserved_(tvb, offset, pinfo, tree); + + return offset; +} + +static int +lustre_dissect_element_llog_log_hdr_llh_bitmap_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_log_hdr_llh_bitmap); + + return offset; +} + + +static int +lustre_dissect_element_llog_log_hdr_llh_bitmap(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + int i; + for (i = 0; i < 2024; i++) + offset=lustre_dissect_element_llog_log_hdr_llh_bitmap_(tvb, offset, pinfo, tree); + + return offset; +} + + + +static int +lustre_dissect_element_llog_log_hdr_llh_tail(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_rec_tail(tvb,offset,pinfo,tree,hf_lustre_llog_log_hdr_llh_tail); + return offset; +} + +int +lustre_dissect_struct_llog_log_hdr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_log_hdr); + } + + offset=lustre_dissect_element_llog_log_hdr_llh_hdr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_timestamp(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_bitmap_offset(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_size(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_cat_idx(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_tgtuuid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_reserved(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_bitmap(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_log_hdr_llh_tail(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + + +/* IDL: struct llog_cookie { */ +/* IDL: struct llog_logid { */ +/* IDL: } lgc_lgl; */ +/* IDL: 
uint32 lgc_subsys; */ +/* IDL: uint32 lgc_index; */ +/* IDL: uint32 lgc_padding; */ +/* IDL: } */ + +static int +lustre_dissect_element_llog_cookie_lgc_lgl(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_llog_cookie_lgc_lgl); + return offset; +} + +static int +lustre_dissect_element_llog_cookie_lgc_subsys(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_cookie_lgc_subsys); + + return offset; +} + +static int +lustre_dissect_element_llog_cookie_lgc_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_cookie_lgc_index); + + return offset; +} + +static int +lustre_dissect_element_llog_cookie_lgc_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llog_cookie_lgc_padding); + + return offset; +} + +int +lustre_dissect_struct_llog_cookie(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_llog_cookie); + } + + offset=lustre_dissect_element_llog_cookie_lgc_lgl(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_cookie_lgc_subsys(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_cookie_lgc_index(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_llog_cookie_lgc_padding(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct llogd_body { */ +/* IDL: struct llog_logid { */ +/* IDL: } lgd_logid; */ +/* IDL: uint32 lgd_ctxt_idx; */ +/* IDL: uint32 lgd_llh_flags; */ +/* IDL: uint32 lgd_index; */ +/* IDL: uint32 lgd_saved_index; */ +/* IDL: uint32 lgd_len; */ +/* IDL: uint64 lgd_cur_offset; */ +/* IDL: } */ + +static int +lustre_dissect_element_llogd_body_lgd_logid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_llogd_body_lgd_logid); + return offset; +} + +static int +lustre_dissect_element_llogd_body_lgd_ctxt_idx(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llogd_body_lgd_ctxt_idx); + + return offset; +} + +static int +lustre_dissect_element_llogd_body_lgd_llh_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + //offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llogd_body_lgd_llh_flags); + + offset=lustre_dissect_element_llog_log_llh_flags(tvb, offset, pinfo, tree, hf_lustre_llogd_body_lgd_llh_flags); + + return offset; +} + +static int +lustre_dissect_element_llogd_body_lgd_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llogd_body_lgd_index); + + return offset; +} + +static int +lustre_dissect_element_llogd_body_lgd_saved_index(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, 
offset, pinfo, tree, hf_lustre_llogd_body_lgd_saved_index);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_llogd_body_lgd_len(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llogd_body_lgd_len);
+
+ return offset;
+}
+
+static int
+lustre_dissect_element_llogd_body_lgd_cur_offset(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_llogd_body_lgd_cur_offset);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_llogd_body(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+
+
+ old_offset=offset;
+
+ if (parent_tree) {
+ item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+ tree = proto_item_add_subtree(item, ett_lustre_llogd_body);
+ }
+
+ offset=lustre_dissect_element_llogd_body_lgd_logid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_body_lgd_ctxt_idx(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_body_lgd_llh_flags(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_body_lgd_index(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_body_lgd_saved_index(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_body_lgd_len(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_body_lgd_cur_offset(tvb, offset, pinfo, tree);
+
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+
+/* TODO: find out when this structure appears on the wire */
+/* IDL: struct llogd_conn_body { */
+/* IDL: struct llog_gen { */
+/* IDL: } lgdc_gen; */
+/* IDL: struct llog_logid { */
+/* IDL: } lgdc_logid; */
+/* IDL: uint32 lgdc_ctxt_idx; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_llogd_conn_body_lgdc_gen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ //offset=lustre_dissect_struct_HASH(0x85e6038)(tvb,offset,pinfo,tree,hf_lustre_llogd_conn_body_lgdc_gen);
+ /* unresolved by the generator; per the IDL, lgdc_gen is a struct
+  * llog_gen, so reuse that helper */
+ offset=lustre_dissect_struct_llog_gen(tvb,offset,pinfo,tree,hf_lustre_llogd_conn_body_lgdc_gen);
+ return offset;
+}
+
+static int
+lustre_dissect_element_llogd_conn_body_lgdc_logid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ //offset=lustre_dissect_struct_HASH(0x85e6188)(tvb,offset,pinfo,tree,hf_lustre_llogd_conn_body_lgdc_logid);
+ /* unresolved by the generator; per the IDL, lgdc_logid is a struct
+  * llog_logid, so reuse that helper */
+ offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_llogd_conn_body_lgdc_logid);
+ return offset;
+}
+
+static int
+lustre_dissect_element_llogd_conn_body_lgdc_ctxt_idx(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+ offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_llogd_conn_body_lgdc_ctxt_idx);
+
+ return offset;
+}
+
+int
+lustre_dissect_struct_llogd_conn_body(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+ proto_item *item = NULL;
+ proto_tree *tree = NULL;
+ int old_offset;
+
+
+
+ old_offset=offset;
+
+ if (parent_tree) {
+ item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+ tree = proto_item_add_subtree(item, ett_lustre_llogd_conn_body);
+ }
+
+ offset=lustre_dissect_element_llogd_conn_body_lgdc_gen(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_conn_body_lgdc_logid(tvb, offset, pinfo, tree);
+
+ offset=lustre_dissect_element_llogd_conn_body_lgdc_ctxt_idx(tvb, offset, pinfo, tree);
+
+
+ proto_item_set_len(item, offset-old_offset);
+
+ return offset;
+}
+
+
+
+/* TODO: find out when this structure appears on the wire */
the wire */ +/* IDL: struct lov_user_ost_data_join { */ +/* IDL: uint64 l_extent_start; */ +/* IDL: uint64 l_extent_end; */ +/* IDL: uint64 l_object_id; */ +/* IDL: uint64 l_object_gr; */ +/* IDL: uint32 l_ost_gen; */ +/* IDL: uint32 l_ost_idx; */ +/* IDL: } */ + +static int +lustre_dissect_element_lov_user_ost_data_join_l_extent_start(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_ost_data_join_l_extent_start); + + return offset; +} + +static int +lustre_dissect_element_lov_user_ost_data_join_l_extent_end(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_ost_data_join_l_extent_end); + + return offset; +} + +static int +lustre_dissect_element_lov_user_ost_data_join_l_object_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_ost_data_join_l_object_id); + + return offset; +} + +static int +lustre_dissect_element_lov_user_ost_data_join_l_object_gr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_ost_data_join_l_object_gr); + + return offset; +} + +static int +lustre_dissect_element_lov_user_ost_data_join_l_ost_gen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_ost_data_join_l_ost_gen); + + return offset; +} + +static int +lustre_dissect_element_lov_user_ost_data_join_l_ost_idx(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_ost_data_join_l_ost_idx); + + return offset; +} + +int +lustre_dissect_struct_lov_user_ost_data_join(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_lov_user_ost_data_join); + } + + offset=lustre_dissect_element_lov_user_ost_data_join_l_extent_start(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_ost_data_join_l_extent_end(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_ost_data_join_l_object_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_ost_data_join_l_object_gr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_ost_data_join_l_ost_gen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_ost_data_join_l_ost_idx(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* TODO : find when this Structure appear on the wire */ +/* IDL: struct lov_user_md_join { */ +/* IDL: uint32 lmm_magic; */ +/* IDL: uint32 lmm_pattern; */ +/* IDL: uint64 lmm_object_id; */ +/* IDL: uint64 lmm_object_gr; */ +/* IDL: uint32 lmm_stripe_size; */ +/* IDL: uint32 lmm_stripe_count; */ +/* IDL: uint32 lmm_extent_count; */ +/* IDL: uint64 lmm_tree_id; */ +/* IDL: uint64 lmm_tree_gen; */ +/* IDL: struct llog_logid { */ +/* IDL: } lmm_array_id; */ +/* IDL: struct lov_user_ost_data_join { */ +/* IDL: } lmm_objects[0]; */ +/* IDL: } 
*/ + +static int +lustre_dissect_element_lov_user_md_join_lmm_magic(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_magic); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_pattern(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_pattern); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_object_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_object_id); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_object_gr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_object_gr); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_stripe_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_stripe_size); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_stripe_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_stripe_count); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_extent_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_extent_count); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_tree_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_tree_id); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_tree_gen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_tree_gen); + + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_array_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_llog_logid(tvb,offset,pinfo,tree,hf_lustre_lov_user_md_join_lmm_array_id); + return offset; +} + +static int +lustre_dissect_element_lov_user_md_join_lmm_objects(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lov_user_md_join_lmm_objects); + //for (i = 0; i < 0; i++) + // offset=lustre_dissect_element_lov_user_md_join_lmm_objects_(tvb, offset, pinfo, tree); + + return offset; +} + +//static int +//lustre_dissect_element_lov_user_md_join_lmm_objects_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +//{ +// //offset=lustre_dissect_struct_HASH(0x85eb304)(tvb,offset,pinfo,tree,hf_lustre_lov_user_md_join_lmm_objects); +// return offset; +//} + +int +lustre_dissect_struct_lov_user_md_join(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = 
NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_lov_user_md_join); + } + + offset=lustre_dissect_element_lov_user_md_join_lmm_magic(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_pattern(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_object_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_object_gr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_stripe_size(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_stripe_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_extent_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_tree_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_tree_gen(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_array_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_lov_user_md_join_lmm_objects(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* IDL: struct obdo { */ +/* IDL: uint64 o_valid; */ +/* IDL: uint64 o_id; */ +/* IDL: uint64 o_gr; */ +/* IDL: uint64 o_fid; */ +/* IDL: uint64 o_size; */ +/* IDL: uint64 o_mtime; */ +/* IDL: uint64 o_atime; */ +/* IDL: uint64 o_ctime; */ +/* IDL: uint64 o_blocks; */ +/* IDL: uint64 o_grant; */ +/* IDL: uint32 o_blksize; */ +/* IDL: uint32 o_mode; */ +/* IDL: uint32 o_uid; */ +/* IDL: uint32 o_gid; */ +/* IDL: uint32 o_flags; */ +/* IDL: uint32 o_nlink; */ +/* IDL: uint32 o_generation; */ +/* IDL: uint32 o_misc; */ +/* IDL: uint32 o_easize; */ +/* IDL: uint32 o_mds; */ +/* IDL: uint32 o_stripe_idx; */ +/* IDL: uint32 o_padding_1; */ +/* IDL: struct lustre_handle { */ +/* IDL: } o_handle; */ +/* IDL: struct llog_cookie { */ +/* IDL: } o_lcookie; */ +/* IDL: uint64 o_padding_2; */ +/* IDL: uint64 o_padding_3; */ +/* IDL: uint64 o_padding_4; */ +/* IDL: uint64 o_padding_5; */ +/* IDL: uint64 o_padding_6; */ +/* IDL: } */ + +static int +lustre_dissect_element_obdo_o_valid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_valid); + + return offset; +} + +static int +lustre_dissect_element_obdo_o_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_id); + + return offset; +} + +static int +lustre_dissect_element_obdo_o_gr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_gr); + + return offset; +} + +static int +lustre_dissect_element_obdo_o_fid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_fid); + + return offset; +} + +static int +lustre_dissect_element_obdo_o_size(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_size); + + return offset; +} + +static int +lustre_dissect_element_obdo_o_mtime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + 
nstime_t ns;
+  /* timestamp */
+  ns.secs = tvb_get_letohl(tvb,offset);
+  ns.nsecs=0;
+  proto_tree_add_time(tree, hf_lustre_obdo_o_mtime, tvb, offset, 8, &ns );
+  offset+=8;
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_atime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  nstime_t ns;
+  /* timestamp */
+  ns.secs = tvb_get_letohl(tvb,offset);
+  ns.nsecs=0;
+  proto_tree_add_time(tree, hf_lustre_obdo_o_atime, tvb, offset, 8, &ns );
+  offset+=8;
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_ctime(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  nstime_t ns;
+  /* timestamp */
+  ns.secs = tvb_get_letohl(tvb,offset);
+  ns.nsecs=0;
+  proto_tree_add_time(tree, hf_lustre_obdo_o_ctime, tvb, offset, 8, &ns );
+  offset+=8;
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_blocks(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_blocks);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_grant(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_grant);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_blksize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_blksize);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_mode(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_mode);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_uid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_uid);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_gid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_gid);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_flags);
+  /* TODO: add this; write a generic function that handles all the flags of an array.
+   */
+  /* LOBDFlags = {*/
+  /*0x00000001 : "OBD_FL_INLINEDATA",*/
+  /*0x00000002 : "OBD_FL_OBDMDEXISTS",*/
+  /*0x00000004 : "OBD_FL_DELORPHAN",*/
+  /*0x00000008 : "OBD_FL_NORPC",*/
+  /*0x00000010 : "OBD_FL_IDONLY",*/
+  /*0x00000020 : "OBD_FL_RECREATE_OBJS",*/
+  /*0x00000040 : "OBD_FL_DEBUG_CHECK",*/
+  /*0x00000100 : "OBD_FL_NO_USRQUOTA",*/
+  /*0x00000200 : "OBD_FL_NO_GRPQUOTA",*/
+  /*0x00000400 : "OBD_FL_CREATE_CROW",*/
+  /*0x00000800 : "OBD_FL_TRUNCLOCK",*/
+  //}
+  return offset;
+}
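+
+/* A minimal sketch of the generic flag dissector that the TODO above asks for,
+ * shown only as an illustration: the flag_string table type and the
+ * dissect_flags32() helper are hypothetical and nothing in this dissector
+ * calls them yet.  The idea is to add the field once, then append the
+ * symbolic name of every set bit to the item's text. */
+typedef struct flag_string {
+  guint32 value;          /* a single flag bit, e.g. 0x00000001 */
+  const char *name;       /* its symbolic name, e.g. "OBD_FL_INLINEDATA" */
+} flag_string;
+
+static int
+dissect_flags32(tvbuff_t *tvb, int offset, proto_tree *tree, int hf_index,
+                const flag_string *table)
+{
+  guint32 flags = tvb_get_letohl(tvb, offset);
+  proto_item *item = proto_tree_add_item(tree, hf_index, tvb, offset, 4, TRUE);
+  const flag_string *f;
+
+  /* append the name of every bit that is set; table ends with a NULL name */
+  for (f = table; f->name != NULL; f++)
+    if (flags & f->value)
+      proto_item_append_text(item, " %s", f->name);
+
+  return offset + 4;
+}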
+
+static int
+lustre_dissect_element_obdo_o_nlink(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_nlink);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_generation(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_generation);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_misc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_misc);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_easize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_easize);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_mds(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_mds);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_stripe_idx(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_stripe_idx);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_padding_1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_obdo_o_padding_1);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_handle(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_obdo_o_handle);
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_lcookie(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=lustre_dissect_struct_llog_cookie(tvb,offset,pinfo,tree,hf_lustre_obdo_o_lcookie);
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_padding_2);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_padding_3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_padding_3);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_padding_4(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_padding_4);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_obdo_o_padding_5(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree,
hf_lustre_obdo_o_padding_5); + + return offset; +} + +static int +lustre_dissect_element_obdo_o_padding_6(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_obdo_o_padding_6); + + return offset; +} + +int +lustre_dissect_struct_obdo(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_obdo); + } + + offset=lustre_dissect_element_obdo_o_valid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_gr(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_fid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_size(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_mtime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_atime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_ctime(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_blocks(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_grant(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_blksize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_mode(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_uid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_gid(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_nlink(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_generation(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_misc(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_easize(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_mds(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_stripe_idx(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_padding_1(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_handle(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_lcookie(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_padding_2(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_padding_3(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_padding_4(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_padding_5(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_obdo_o_padding_6(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + + +/* IDL: struct ost_body { */ +/* IDL: struct obdo { */ +/* IDL: } oa; */ +/* IDL: } */ + +static int +lustre_dissect_element_ost_body_oa(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_obdo(tvb,offset,pinfo,tree,hf_lustre_ost_body_oa); + return offset; +} + +int +lustre_dissect_struct_ost_body(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = 
proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_ost_body); + } + + offset=lustre_dissect_element_ost_body_oa(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* TODO : find when this Structure appear on the wire */ +/* IDL: struct qunit_data { */ +/* IDL: uint32 qd_id; */ +/* IDL: uint32 qd_flags; */ +/* IDL: uint64 qd_count; */ +/* IDL: uint64 qd_qunit; */ +/* IDL: uint64 padding; */ +/* IDL: } */ + +static int +lustre_dissect_element_qunit_data_qd_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_qd_id); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_qd_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_qd_flags); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_qd_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_qunit_data_qd_count); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_qd_qunit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_qunit_data_qd_qunit); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_qunit_data_padding); + + return offset; +} + +int +lustre_dissect_struct_qunit_data(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_qunit_data); + } + + offset=lustre_dissect_element_qunit_data_qd_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_qd_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_qd_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_qd_qunit(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_padding(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* TODO : find when this Structure appear on the wire */ +/* IDL: struct qunit_data_old2 { */ +/* IDL: uint32 qd_id; */ +/* IDL: uint32 qd_flags; */ +/* IDL: uint64 qd_count; */ +/* IDL: } */ + +static int +lustre_dissect_element_qunit_data_old2_qd_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_old2_qd_id); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_old2_qd_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_old2_qd_flags); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_old2_qd_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, 
hf_lustre_qunit_data_old2_qd_count); + + return offset; +} + +int +lustre_dissect_struct_qunit_data_old2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_qunit_data_old2); + } + + offset=lustre_dissect_element_qunit_data_old2_qd_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_old2_qd_flags(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_old2_qd_count(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + +/* TODO : find when this Structure appear on the wire */ +/* IDL: struct qunit_data_old { */ +/* IDL: uint32 qd_id; */ +/* IDL: uint32 qd_type; */ +/* IDL: uint32 qd_count; */ +/* IDL: uint32 qd_isblk; */ +/* IDL: } */ + +static int +lustre_dissect_element_qunit_data_old_qd_id(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_old_qd_id); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_old_qd_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_old_qd_type); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_old_qd_count(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_old_qd_count); + + return offset; +} + +static int +lustre_dissect_element_qunit_data_old_qd_isblk(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_qunit_data_old_qd_isblk); + + return offset; +} + +int +lustre_dissect_struct_qunit_data_old(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_) +{ + proto_item *item = NULL; + proto_tree *tree = NULL; + int old_offset; + + + + old_offset=offset; + + if (parent_tree) { + item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE); + tree = proto_item_add_subtree(item, ett_lustre_qunit_data_old); + } + + offset=lustre_dissect_element_qunit_data_old_qd_id(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_old_qd_type(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_old_qd_count(tvb, offset, pinfo, tree); + + offset=lustre_dissect_element_qunit_data_old_qd_isblk(tvb, offset, pinfo, tree); + + + proto_item_set_len(item, offset-old_offset); + + return offset; +} + + +/* ------------------------------------------------------------------------- */ +/* dissect string . 
+ * the length of the string is the current buflen
+ * @tvb the packet buffer
+ * @offset the current offset
+ * @pinfo
+ * @parent_tree
+ * @hf_index : the corresponding header field
+ * @buf_num : the corresponding bufnumber of the string (we use it to
+ *            determine the string length), must respect : 0 <= buf_num < LUSTRE_BUFCOUNT
+ */
+static int
+lustre_dissect_element_string(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_, guint32 buf_num)
+{
+  proto_item *item = NULL;
+  int old_offset;
+  guint32 string_len;
+
+  old_offset=offset;
+
+  if(buf_num+1>LUSTRE_BUFCOUNT)
+    return offset;
+
+  string_len = tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF + 4 * buf_num) ; /* 4 because a buflen is a guint32 */
+  proto_tree_add_item(parent_tree, hf_index, tvb, offset, string_len, TRUE);
+  if(string_len>1)
+    display_info_fstr(parent_tree, pinfo->cinfo, COL_INFO, " filename : %s", tvb_get_string(tvb,offset,string_len) );
+
+  offset+=string_len;
+  proto_item_set_len(item, offset-old_offset);
+  offset=add_extra_padding(tvb,offset,pinfo,parent_tree); /* after a string we must be aligned to 8 bytes. */
+
+  return offset;
+}
+
+/* ------------------------------------------------------------------------- */
+/* dissect raw data */
+static int
+lustre_dissect_element_data(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_, guint32 buf_num)
+{
+  proto_item *item = NULL;
+
+  guint32 data_len ;
+  int old_offset;
+
+  old_offset=offset;
+  if(buf_num+1>LUSTRE_BUFCOUNT)
+    return offset;
+
+  data_len = tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF + 4 * buf_num) ; /* 4 because a buflen is a guint32 */
+  proto_tree_add_item(parent_tree, hf_index, tvb, offset, data_len, TRUE);
+
+  offset+=data_len;
+  proto_item_set_len(item, offset-old_offset);
+  offset=add_extra_padding(tvb,offset,pinfo,parent_tree); /* align on 8 bytes */
+
+  return offset;
+}
+/* ------------------------------------------------------------------------ */
+
+
+static int reint_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_)
+{
+  guint32 opcode ;
+
+  opcode = tvb_get_letohl(tvb,offset);
+
+  switch(opcode){
+    case REINT_SETATTR:
+      /* [eadata][cookie_data][ldlm_request] */
+      offset=lustre_dissect_struct_mds_rec_create(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create);
+      offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree,
+          hf_lustre_ldlm_request) ;
+      break;
+    case REINT_CREATE :
+      /* [rec_create][filename][tgt.. 0 for now][ldlm_request] */
+      offset=lustre_dissect_struct_mds_rec_create(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create);
+      offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_REQ_REC_OFF+1);
+      offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_REQ_REC_OFF+2); /* this string is always "\0" */
+      offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree,
+          hf_lustre_ldlm_request) ;
+      break;
+    case REINT_LINK :
+      /*[mds_rec_link][filename][ldlm_req] */
+      offset=lustre_dissect_struct_mds_rec_link(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link);
+      offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree,
+          hf_lustre_ldlm_request) ;
+      /* TODO : needs to be checked */
+      break;
+    case REINT_UNLINK : /* mds_unlink_unpack : [mds_rec_unlink][filename][ldlm_req] */
+      /* [mds_rec_unlink][filename][ldlm_req][..]*/
+      offset=lustre_dissect_struct_mds_rec_unlink(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink);
+      offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_REQ_REC_OFF+1);
+      if ( (tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF +2))) == 112) /* TODO : ugly, but for now we have to do this */
+        offset=lustre_dissect_struct_obd_quotactl(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl);
+      else
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+
+      break;
+    case REINT_RENAME : /* mds_rename_unpack : [mds_rec_rename][filename source][filename target_name][ldlm_request] */
+      offset=lustre_dissect_struct_mds_rec_rename(tvb, offset, pinfo, tree, hf_lustre_mds_rec_link);
+      offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_old_name, LUSTRE_REQ_REC_OFF+1);
+      offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_new_name, LUSTRE_REQ_REC_OFF+2);
+      offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+      break;
+    case REINT_OPEN : /* [rec_create][filename][eadata] */
+      offset=lustre_dissect_struct_mds_rec_create(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create);
+      offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_REQ_REC_OFF+1);
+      offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_mds_xattr_eadata, LUSTRE_REQ_REC_OFF+2); /* TODO : replace with hf_eadata */
+      break;
+    default:
+      break;
+
+  }
+
+  return offset ;
+}
+
+
+/* dissect a connect message */
+static int
+lustre_dissect_generic_connect(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  /* TODO : add a cuuid and target uid */
+  offset=lustre_dissect_struct_obd_uuid(tvb, offset, pinfo, tree, hf_lustre_obd_uuid);
+  offset=lustre_dissect_struct_obd_uuid(tvb, offset, pinfo, tree, hf_lustre_obd_uuid);
+  offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_mds_body_handle);
+  offset=lustre_dissect_struct_obd_connect_data(tvb,offset,pinfo,tree,hf_lustre_obd_connect_data);
+  return offset;
+}
+
+static int
+lustre_ost_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type)
+{
+  guint32 i ;
+
+  switch (opc){
+    case OST_REPLY: /* obsolete, so nothing */
+      break;
+    case OST_GETATTR:
+      offset=lustre_dissect_struct_ost_body(tvb, offset, pinfo, tree, hf_lustre_ost_body) ;
+      break;
+    case OST_SETATTR:
+      offset=lustre_dissect_struct_ost_body(tvb, offset, pinfo, tree, hf_lustre_ost_body) ;
+      break;
+    case OST_READ: /* [ost_body][obd_ioobj][niobuf_remote] for the request, [ost_body] for the reply */
+      offset=lustre_dissect_struct_ost_body(tvb, offset, pinfo, tree, hf_lustre_ost_body) ;
+      if(pb_type==PTL_RPC_MSG_REQUEST){
+        offset=lustre_dissect_struct_obd_ioobj(tvb, offset, pinfo, tree, hf_lustre_obd_ioobj);
+        offset=lustre_dissect_struct_niobuf_remote(tvb,offset,pinfo, tree, hf_lustre_niobuf_remote);
+      }
+      break;
+    case OST_WRITE:
+      offset=lustre_dissect_struct_ost_body(tvb, offset, pinfo,
+          tree, hf_lustre_ost_body) ; /* [ost_body] in both cases */
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+      {
+        for (i=0;i<tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+1))/24;i++) /* 24 == sizeof(obd_ioobj) */
+          offset=lustre_dissect_struct_obd_ioobj(tvb, offset, pinfo, tree, hf_lustre_obd_ioobj);
+        offset=lustre_dissect_struct_niobuf_remote(tvb,offset,pinfo, tree, hf_lustre_niobuf_remote);
+      }
+      break;
+    case OST_STATFS:
+      /* [obd_statfs] in the reply */
+      if(pb_type==PTL_RPC_MSG_REPLY && LUSTRE_BUFCOUNT>=2)
+        offset=lustre_dissect_struct_obd_statfs(tvb, offset, pinfo, tree, hf_lustre_obd_statfs) ;
+      break;
+    case OST_SYNC:
+      /*[ost_body] in both cases */
+      offset=lustre_dissect_struct_ost_body(tvb, offset, pinfo, tree, hf_lustre_ost_body) ;
+      break;
+    case OST_SET_INFO:
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+      {
+        offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_ost_key,LUSTRE_REQ_REC_OFF); /* key */
+        offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_ost_val, LUSTRE_REQ_REC_OFF+1); /* val */
+      }
+      /* if Key = "evict_by_nid" --> needs to be processed..
TODO */ + break; + case OST_QUOTACHECK: + if(pb_type==PTL_RPC_MSG_REQUEST) + offset=lustre_dissect_struct_obd_quotactl(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl) ; + /* nothing in reply */ + break; + case OST_QUOTACTL: + /*[obd_quotactl in both case]*/ + offset=lustre_dissect_struct_obd_quotactl(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl) ; + case OST_QUOTA_ADJUST_QUNIT: + /* [quota_adjust_qunit] in both case ? */ + offset=lustre_dissect_struct_quota_adjust_qunit(tvb, offset, pinfo, tree, hf_lustre_quota_adjust_qunit) ; + }; + + return offset; +} + +static int +lustre_mds_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type) +{ + switch (opc){ + case MDS_DISCONNECT: + /*[nothing]*/ + break; + case MDS_GETSTATUS: + /*[mds body]*/ + offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ; + break; + case MDS_SETXATTR: + if(pb_type==PTL_RPC_MSG_REQUEST) + /* [mds body] */ + offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ; + /*if(reply) : [nothing]*/ + break; + case MDS_GETXATTR: + offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ; + if(pb_type==PTL_RPC_MSG_REQUEST) + /*[string_xattr_name]*/ + offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_mds_xattr_name, LUSTRE_REQ_REC_OFF+1); + if(pb_type==PTL_RPC_MSG_REPLY) + /*[eada]*/ + offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_mds_xattr_eadata, LUSTRE_REQ_REC_OFF+1); + break; + case MDS_GETATTR: + offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ; + /* TODO [ something ??? ] */ + break; + case MDS_GETATTR_NAME: + if(pb_type==PTL_RPC_MSG_REQUEST) + /*[mds_body]*/ + offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ; + /*[nothing in reply]*/ + break; + case MDS_DONE_WRITING: + /*[mds_body]*/ + offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ; + break; + case MDS_PIN: /* repbody.. 
*/
+      /*[mds_body]*/
+      offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+      break;
+    case MDS_SYNC:
+      /*[mds_body]*/
+      offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+      break;
+
+    case MDS_CLOSE: /* TODO : check the corresponding structure in the Lustre code */
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+      { /* [mds_body][lov_mds_md][log_cookie] */
+        offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+        if(LUSTRE_BUFCOUNT>=3)
+          offset=lustre_dissect_struct_lov_mds_md_v1(tvb,offset,pinfo,tree,hf_lustre_lov_mds_md_v1);
+        if(LUSTRE_BUFCOUNT>=4)
+          if( tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+2)) > 0)
+            offset=lustre_dissect_struct_llog_cookie(tvb,offset,pinfo,tree,hf_lustre_llog_cookie);
+      }
+      if(pb_type==PTL_RPC_MSG_REPLY)
+      { /* [mds_body][md][cookie] TODO : check that (reread the code about shrink in Lustre) */
+        offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+        if(tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+1)) > 0)
+          offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_mds_md_data,LUSTRE_REQ_REC_OFF+1); /* key */
+        if(tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+2)) > 0)
+          offset=lustre_dissect_struct_llog_cookie(tvb,offset,pinfo,tree,hf_lustre_llog_cookie);
+      }
+      break;
+
+    case MDS_STATFS:
+      if(pb_type==PTL_RPC_MSG_REPLY)
+        /*[obd_statfs]*/
+        offset=lustre_dissect_struct_obd_statfs(tvb, offset, pinfo, tree, hf_lustre_obd_statfs) ;
+      /* in request : [nothing] */
+      break;
+
+    case MDS_READPAGE:
+      /* [mds_body], but with a somewhat different interpretation; the hf entries need modifying : TODO */
+      offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+      break;
+
+    case MDS_REINT:
+      /* the structure depends on the intent_opcode */
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        offset=reint_opcode_process(tvb, offset, pinfo, tree);
+      if(pb_type==PTL_RPC_MSG_REPLY)
+      {
+        /*[mds_body][??][llog_logid_rec] */
+        offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+        if(tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+1)) > 0){
+          /* OPEN, RENAME, and UNLINK */
+          if(tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+1)) > 0){ /* rec_unlink or rec_rename */
+            offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_mds_xattr_eadata,
+                LUSTRE_REQ_REC_OFF+1); /* TODO : replace by eadata */
+            if(tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*(LUSTRE_REQ_REC_OFF+2)) > 0) /* with unlink or rename we have 4 buffers, handler.c line 1691 */
+              offset=lustre_dissect_element_data(tvb, offset, pinfo, tree,
+                  hf_lustre_mds_xattr_eadata, LUSTRE_REQ_REC_OFF+2); /* TODO : replace with eadata */
+          }
+        }
+
+      }
+      break;
+    case MDS_SET_INFO:
+      if(pb_type==PTL_RPC_MSG_REQUEST){
+        /*[key][val]*/
+        offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_ost_key,LUSTRE_REQ_REC_OFF); /* key */
+        offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_ost_val, LUSTRE_REQ_REC_OFF+1); /* val */
+        //offset=add_extra_padding(tvb,offset,pinfo,tree);
+      }
+      /* nothing in reply */
+      break;
+    case MDS_QUOTACHECK:
+      /* [obd_quotactl] */
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        offset=lustre_dissect_struct_obd_quotactl(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl) ;
+      break;
+    case MDS_QUOTACTL:
+      /* [obd_quotactl] in both cases */
+      offset=lustre_dissect_struct_obd_quotactl(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl) ;
+      break;
+    case MDS_CONNECT:
+      if (pb_type==PTL_RPC_MSG_REQUEST) /* [targetuuid][clientuuid][lustre_handle][obd_connect_data] */
+        offset=lustre_dissect_generic_connect(tvb,offset,pinfo,tree);
+      if (pb_type==PTL_RPC_MSG_REPLY) /*[obd_connect_data]*/
+        offset=lustre_dissect_struct_obd_connect_data(tvb,offset,pinfo,tree,hf_lustre_obd_connect_data);
+      break;
+    default:
+      break;
+  };
+
+  return offset;
+
+}
+
+
+static int
+lustre_ldlm_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type)
+{
+
+  switch (opc)
+  {
+    case LDLM_ENQUEUE:
+      /* [ldlm_request]; if we have one more buffer it's [intent_opcode], and the opcode gives us the
+       * corresponding intent structure [intent] */
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+      {
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+        if (LUSTRE_BUFCOUNT>=3)
+          offset=lustre_dissect_struct_ldlm_intent(tvb, offset, pinfo, tree, hf_lustre_ldlm_intent);
+      }
+      if(pb_type==PTL_RPC_MSG_REPLY)
+      {
+        guint32 ldlm_type;
+        guint32 magic;
+        /*[ldlm_reply]*/
+        offset=lustre_dissect_struct_ldlm_reply(tvb, offset, pinfo, tree, hf_lustre_ldlm_reply,&ldlm_type) ;
+        if(LUSTRE_BUFCOUNT>2 && (tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*2) == 40))
+          /* TODO : this code needs to be checked and tested; the corresponding Lustre code isn't
+           * explicit, so we are not sure this buffer is present, but in our examples it works fine
+           */
+          offset=lustre_dissect_struct_ost_lvb(tvb, offset, pinfo, tree, hf_lustre_ost_lvb);
+        else
+          if(LUSTRE_BUFCOUNT>2)
+          {
+            offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+            /*g_print("buflen_off+4*3 = %d", tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*LUSTRE_DLM_INTENT_REC_OFF));*/
+            if(LUSTRE_BUFCOUNT>2 &&
+                (tvb_get_letohl(tvb,LUSTRE_BUFLEN_OFF+4*LUSTRE_DLM_INTENT_REC_OFF) > 0))
+            {
+              magic=tvb_get_letohl(tvb, offset); /* TODO : replace this with a macro */
+              switch(magic)
+              {
+                case LOV_MAGIC_V1:
+                  offset=lustre_dissect_struct_lov_mds_md_v1(tvb,offset,pinfo,tree,hf_lustre_lov_mds_md_v1);
+                  break;
+                case LOV_MAGIC_JOIN:
+                  offset=lustre_dissect_struct_lov_mds_md_join(tvb, offset, pinfo, tree,
+                      hf_lustre_lov_mds_md_join);
+                  break;
+                default:
+                  offset=lustre_dissect_element_data(tvb, offset, pinfo, tree,
+                      hf_lustre_extra_padding, LUSTRE_DLM_INTENT_REC_OFF);
+                  break;
+              };
+            }
+          }
+      }
+      break;
+
+    case LDLM_CONVERT:
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        /*[ldlm_request]*/
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+      if(pb_type==PTL_RPC_MSG_REPLY)
+        /*[ldlm_reply]*/
+        offset=lustre_dissect_struct_ldlm_reply(tvb, offset, pinfo, tree, hf_lustre_ldlm_reply, NULL) ;
+      break;
+
+    case LDLM_CANCEL:
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        /*[ldlm_request]*/
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+      /* [nothing in reply] */
+      break;
+
+    case LDLM_BL_CALLBACK: /* TODO : check the corresponding code in Lustre */
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        /*[ldlm_request]*/
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+      break;
+
+    case LDLM_CP_CALLBACK:
+      if(pb_type==PTL_RPC_MSG_REQUEST){
+        /* [ldlm_request]; if the third buffer exists we have [lvb data], so it's [ost_lvb] : TODO :
+         * check that */
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+        if(LUSTRE_BUFCOUNT>=3)
+          offset=lustre_dissect_struct_ost_lvb(tvb,offset,pinfo,tree,hf_lustre_ost_lvb);
+      }
+      /* reply : [nothing] */
+      break;
+
+    case LDLM_GL_CALLBACK:
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        /*[ldlm_request] */
+        offset=lustre_dissect_struct_ldlm_request(tvb, offset, pinfo, tree, hf_lustre_ldlm_request) ;
+      else
+        if(pb_type==PTL_RPC_MSG_REPLY)
+          /* reply : [ost_lvb] <-- needs to be checked */
+          offset=lustre_dissect_struct_ost_lvb(tvb, offset, pinfo, tree, hf_lustre_ost_lvb);
+      break;
+
+    default :
+      break;
+  }
+  return offset;
+}
+
+static int
+lustre_mgs_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type)
+{
+  switch (opc){
+    case MGS_CONNECT :
+      if (pb_type==PTL_RPC_MSG_REQUEST) /* [targetuuid][clientuuid][lustre_handle][obd_connect_data] */
+        offset=lustre_dissect_generic_connect(tvb,offset,pinfo,tree);
+      if (pb_type==PTL_RPC_MSG_REPLY) /*[obd_connect_data]*/
+        offset=lustre_dissect_struct_obd_connect_data(tvb,offset,pinfo,tree,hf_lustre_obd_connect_data);
+      break;
+    case MGS_DISCONNECT :
+      /*[nothing]*/
+      break;
+    case MGS_EXCEPTION : /* node died, etc. */
+      /*[nothing]*/
+      break;
+    case MGS_TARGET_REG:
+      /* [mgs_target_info]; mgs_handler.c mgs_handle_target_reg() is called whenever a target starts up */
+      offset=lustre_dissect_struct_mgs_target_info(tvb, offset, pinfo, tree, hf_lustre_mgs_target_info);
+      break;
+    case MGS_TARGET_DEL:
+      /*[nothing]*/
+      break;
+    case MGS_SET_INFO:
+      /* [mgs_send_param], mgs_set_info_rpc() */
+      offset=lustre_dissect_struct_mgs_send_param(tvb,offset,pinfo,tree,hf_lustre_mgs_send_param);
+      break;
+    default:
+      break;
+  };
+  return offset;
+}
+
+static int
+lustre_odb_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type)
+{
+  switch(opc){
+    case OBD_PING :
+      /*[nothing]*/
+      break;
+    case OBD_LOG_CANCEL:
+      /*[nothing]*/
+      break;
+    case OBD_QC_CALLBACK:
+      if(pb_type==PTL_RPC_MSG_REQUEST)
+        offset=lustre_dissect_struct_obd_quotactl(tvb, offset, pinfo, tree, hf_lustre_obd_quotactl);
+      /* in reply : [nothing] */
+      break;
+    default:
+      break;
+  };
+  return offset;
+}
+
+static int
+lustre_llog_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type)
+{
+  switch(opc){
+    case LLOG_ORIGIN_HANDLE_CREATE : /* in handler.c */
+      /*[llogd_body] (reply and request)*/
+      offset=lustre_dissect_struct_llogd_body(tvb, offset, pinfo, tree, hf_lustre_llogd_body);
+      if(pb_type==PTL_RPC_MSG_REQUEST) /*[filename] */
+        if (LUSTRE_BUFCOUNT>2)
+          offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_llogd_name,
+              LUSTRE_REQ_REC_OFF+1);
+      break;
+    case LLOG_ORIGIN_HANDLE_NEXT_BLOCK :/* in handler.c */
+      /* [llogd_body][???]
+       * the size of the second buf is LLOG_CHUNK_SIZE, so it's maybe only bulk data */
+      offset=lustre_dissect_struct_llogd_body(tvb, offset, pinfo, tree, hf_lustre_llogd_body);
+      offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_llogd_chunk,
+          LUSTRE_REQ_REC_OFF + 1 );
+      break;
+    case LLOG_ORIGIN_HANDLE_READ_HEADER:/* in handler.c */
+      if(pb_type==PTL_RPC_MSG_REQUEST){
+        /* [llogd_body][llog_log_hdr] */
+        offset=lustre_dissect_struct_llogd_body(tvb, offset, pinfo, tree, hf_lustre_llogd_body);
+        if(LUSTRE_BUFCOUNT>2)
+          offset=lustre_dissect_struct_llog_log_hdr(tvb, offset, pinfo, tree,
+              hf_lustre_llogd_log_hdr);
+      }
+      if(pb_type==PTL_RPC_MSG_REPLY) /* [llog_log_hdr] */
+        offset=lustre_dissect_struct_llog_log_hdr(tvb, offset, pinfo, tree,
+            hf_lustre_llogd_log_hdr);
+      break;
+    case LLOG_ORIGIN_HANDLE_WRITE_REC : /* probably obsolete */
+      /*[nothing]*/
+      break;
+    case LLOG_ORIGIN_HANDLE_CLOSE :/* handler.c */
+      /*[nothing]*/
+      break;
+    case LLOG_ORIGIN_CONNECT : /* ost_handler.c */
+      /*[nothing]*/
+      break;
+    case LLOG_CATINFO : /*in handler.c */
+      if(pb_type==PTL_RPC_MSG_REQUEST){
+        /* [keyword][if keyword=config [char*] else [nothing]] */
+        offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_llogd_keyword,
+            LUSTRE_REQ_REC_OFF);
+        if(strcmp(tvb_get_string(tvb, LUSTRE_REQ_REC_OFF, tvb_get_letohl(tvb,
+                    LUSTRE_BUFLEN_OFF+4*LUSTRE_REQ_REC_OFF)), "config")==0) /* if(keyword == "config") */
+          offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_llogd_client,
+              LUSTRE_REQ_REC_OFF+1);
+      }
+      if(pb_type==PTL_RPC_MSG_REPLY)
+        /*[buf] sizeof = llog_chunk_size */
+        offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_llogd_chunk,
+            LUSTRE_REQ_REC_OFF + 1 );
+      /* TODO TODO : check that it's not a catid */
+      break;
+    case LLOG_ORIGIN_HANDLE_PREV_BLOCK : /* in handler.c */
+      /* [llogd_body] in both cases */
+      offset=lustre_dissect_struct_llogd_body(tvb, offset, pinfo, tree, hf_lustre_llogd_body);
+      if(pb_type==PTL_RPC_MSG_REPLY)
+        /*[buf] size of llog_chunk_size */
+        offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_llogd_chunk,
+            LUSTRE_REQ_REC_OFF + 1 );
+      break;
+    case LLOG_ORIGIN_HANDLE_DESTROY : /* in handler.c */
+      /*[llogd_body] in both cases */
+      offset=lustre_dissect_struct_llogd_body(tvb, offset, pinfo, tree, hf_lustre_llogd_body);
+      break;
+    default:
+      break;
+  };
+
+  return offset;
+}
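+
+/* Every handler above fetches buffer lengths with the same
+ * tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF + 4 * n) expression; a tiny helper
+ * like the following would make those call sites read more clearly.  It is
+ * only a sketch: the name lustre_get_buflen() is hypothetical and nothing
+ * calls it yet. */
+static guint32
+lustre_get_buflen(tvbuff_t *tvb, guint32 buf_num)
+{
+  /* each buflen is a little-endian guint32 in the message's buflens array */
+  return tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF + 4 * buf_num);
+}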
+
+/* process a lustre opcode :
+   check which opcode range the opcode falls in, and call the corresponding opcode process function */
+static int
+lustre_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint32 opc _U_, guint32 pb_type)
+{
+  if (opc <= OST_LAST_OPC) /* OST opcodes */
+    offset=lustre_ost_opcode_process(tvb, offset, pinfo, tree, opc, pb_type) ;
+
+  if ( (opc >= MDS_FIRST_OPC) && (opc < MDS_LAST_OPC )) /* MDS opcodes */
+    offset=lustre_mds_opcode_process(tvb, offset, pinfo, tree, opc, pb_type) ;
+
+  if ( (opc >= LDLM_FIRST_OPC) && (opc < LDLM_LAST_OPC) ) /* LDLM opcodes */
+    offset=lustre_ldlm_opcode_process(tvb, offset, pinfo, tree, opc, pb_type) ;
+
+  if( (opc>= MGS_FIRST_OPC) && (opc <= MGS_LAST_OPC)) /* MGS opcodes */
+    offset=lustre_mgs_opcode_process(tvb, offset, pinfo, tree, opc, pb_type) ;
+
+  if( (opc>= OBD_FIRST_OPC) && (opc<=OBD_LAST_OPC)) /* OBD opcodes */
+    offset=lustre_odb_opcode_process(tvb, offset, pinfo, tree, opc, pb_type) ;
+
+  if( (opc>=LLOG_FIRST_OPC) && (opc<=LLOG_LAST_OPC)) /* LLOG opcodes */
+    offset=lustre_llog_opcode_process(tvb, offset, pinfo, tree, opc, pb_type) ;
+
+  return offset ;
+}
+
+/* ----------------------------------------------- */
+/* add extra padding so that we are aligned to 8 bytes */
+static int
+add_extra_padding(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_)
+{
+  guint32 padding_len;
+
+  padding_len = (8- offset%8)%8;
+  if(padding_len){
+    proto_tree_add_item(tree, hf_lustre_extra_padding , tvb, offset, padding_len, TRUE);
+    offset+=padding_len;
+  }
+  return offset;
+}
+/* ----------------------------------------------- */
+
+static int
+ldlm_opcode_process(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree * tree _U_, guint64 intent_opc _U_)
+{
+  /* all the corresponding code is in mdc_locks.c, in mdc_enqueue() */
+  /* if 0x0003 we have CREAT + OPEN */
+  if (intent_opc & IT_OPEN) {
+    /* mdc_intent_open_pack(), hence [opcode][mdc_rec_create][name][eada] */
+    offset=lustre_dissect_struct_mds_rec_create(tvb, offset, pinfo, tree, hf_lustre_mds_rec_create);
+    offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_DLM_INTENT_REC_OFF+1);
+    offset=lustre_dissect_element_data(tvb, offset, pinfo, tree, hf_lustre_mds_xattr_eadata,
+        LUSTRE_DLM_INTENT_REC_OFF+2); /* TODO : replace hf with eada hf */
+
+  }
+  if (intent_opc & IT_UNLINK){
+    /* mdc_intent_unlink_pack(), hence [opcode][mds_rec_unlink][name] */
+    offset=lustre_dissect_struct_mds_rec_unlink(tvb, offset, pinfo, tree, hf_lustre_mds_rec_unlink);
+    offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_DLM_INTENT_REC_OFF+1);
+  }
+  if (intent_opc & IT_GETATTR){
+    /* mdc_intent_lookup_pack(), hence [mds_body][name] */
+    offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+    offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_DLM_INTENT_REC_OFF+1);
+  }
+
+  if (intent_opc & IT_LOOKUP){
+    /* mdc_intent_lookup_pack(), hence [mds_body][name] */
+    offset=lustre_dissect_struct_mds_body(tvb, offset, pinfo, tree, hf_lustre_mds_body) ;
+    offset=lustre_dissect_element_string(tvb, offset, pinfo, tree, hf_lustre_reint_name, LUSTRE_DLM_INTENT_REC_OFF+1);
+  }
+  return offset;
+}
+
+
+
+/* ----------------------------------------------- */
+/* sanity check : test whether the packet is entirely dissected, and add BUG to the
+ * PROTOCOL column when it's not */
+static void
+sanity_check(tvbuff_t *tvb, packet_info *pinfo, guint32 val_offset _U_)
+{
+  guint32 magic_number ;
+  guint32 somme_buflen = 0 ;
+  guint32 i ;
+
+  magic_number = tvb_get_letohl(tvb, 8);
+
+  for (i=0;i<LUSTRE_BUFCOUNT;i++) {
+    somme_buflen += tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF + 4 * i); /* sum of the buffer lengths */
+    somme_buflen += (8 - somme_buflen%8)%8; /* each buffer is aligned on 8 bytes */
+  }
+
+  if(val_offset!=somme_buflen){
+    if (check_col(pinfo->cinfo, COL_INFO)) {
+      col_append_str(pinfo->cinfo, COL_PROTOCOL, "BUG");
+    }
+  }
+
+}
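+
+/* Illustration of the check done by sanity_check() above: the expected end
+ * offset of a message is the sum of its buffer lengths, each rounded up to
+ * the 8-byte alignment that add_extra_padding() enforces.  This helper is a
+ * sketch under that assumption; lustre_expected_length() is hypothetical and
+ * unused by the dissector. */
+static guint32
+lustre_expected_length(tvbuff_t *tvb)
+{
+  guint32 total = 0;
+  guint32 i;
+
+  for (i = 0; i < LUSTRE_BUFCOUNT; i++) {
+    total += tvb_get_letohl(tvb, LUSTRE_BUFLEN_OFF + 4 * i);
+    total += (8 - total % 8) % 8; /* align the next buffer on 8 bytes */
+  }
+  return total;
+}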
+
+/* IDL: struct lustre_msg_v1 { */
+/* IDL: struct lustre_handle { */
+/* IDL: } lm_handle; */
+/* IDL: uint32 lm_magic; */
+/* IDL: uint32 lm_type; */
+/* IDL: uint32 lm_version; */
+/* IDL: uint32 lm_opc; */
+/* IDL: uint64 lm_last_xid; */
+/* IDL: uint64 lm_last_committed; */
+/* IDL: uint64 lm_transno; */
+/* IDL: uint32 lm_status; */
+/* IDL: uint32 lm_flags; */
+/* IDL: uint32 lm_conn_cnt; */
+/* IDL: uint32 lm_bufcount; */
+/* IDL: uint32 lm_buflens[0]; */
+/* IDL: } */
+
+
+static int
+lustre_dissect_element_msg_v1_lm_handle(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  /* TODO : replace with a v1 handle */
+  offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_lustre_msg_v1_lm_handle);
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_magic(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_magic);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_type);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_version(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_version);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_opc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_opc);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_last_xid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_last_xid);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_last_committed(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_last_committed);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_transno(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_transno);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_status(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_status);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_flags);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_conn_cnt(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_conn_cnt);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_bufcount(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_bufcount);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_buflens_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v1_lm_buflens);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v1_lm_buflens(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  guint32 bufcount ;
+  gboolean extra_padding ;
+  guint i;
+
+  bufcount=tvb_get_letohl(tvb, offset-4); /* TODO : replace with a macro */
+
+  if (bufcount & 1) /* we add extra padding if bufcount is odd */
+    extra_padding = 1 ;
+  else
+    extra_padding = 0 ;
+
+  for (i=0;i<bufcount;i++)
+    offset=lustre_dissect_element_msg_v1_lm_buflens_(tvb, offset, pinfo, tree);
+
+  if (extra_padding)
+    offset=add_extra_padding(tvb, offset, pinfo, tree);
+
+  return offset;
+}
+
+static int
+lustre_dissect_struct_msg_v1(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+  proto_item *item = NULL;
+  proto_tree *tree = NULL;
+  int old_offset;
+
+  guint32 opc, pb_type;
+
+  old_offset=offset;
+
+  if (parent_tree) {
+    item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+    tree = proto_item_add_subtree(item, ett_lustre_lustre_msg_v1);
+  }
+
+  offset=lustre_dissect_element_msg_v1_lm_handle(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_magic(tvb, offset, pinfo, tree);
+
+  pb_type = tvb_get_letohl(tvb, offset);
+  offset=lustre_dissect_element_msg_v1_lm_type(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_version(tvb, offset, pinfo, tree);
+
+  opc = tvb_get_letohl(tvb, offset);
+  offset=lustre_dissect_element_msg_v1_lm_opc(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_last_xid(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_last_committed(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_transno(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_status(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_flags(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_conn_cnt(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_bufcount(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v1_lm_buflens(tvb, offset, pinfo, tree);
+
+  proto_item_set_len(item, offset-old_offset);
+
+  /* display some nice info */
+  display_info_str(parent_tree, pinfo->cinfo, COL_INFO, val_to_str(opc, lustre_op_codes, "Unknown"));
+  display_info_fstr(parent_tree, pinfo->cinfo,COL_INFO, " %s ", val_to_str(pb_type, lustre_LMTypes, "Unknown"));
+
+  offset=lustre_opcode_process(tvb, offset,
pinfo, tree, opc, pb_type); + + return offset; +} + +/* IDL: struct ptlrpc_body { */ +/* IDL: struct lustre_handle { */ +/* IDL: } pb_handle; */ +/* IDL: uint32 pb_type; */ +/* IDL: uint32 pb_version; */ +/* IDL: uint32 pb_opc; */ +/* IDL: uint32 pb_status; */ +/* IDL: uint64 pb_last_xid; */ +/* IDL: uint64 pb_last_seen; */ +/* IDL: uint64 pb_last_committed; */ +/* IDL: uint64 pb_transno; */ +/* IDL: uint32 pb_flags; */ +/* IDL: uint32 pb_op_flags; */ +/* IDL: uint32 pb_conn_cnt; */ +/* IDL: uint32 pb_timeout; */ +/* IDL: uint32 pb_service_time; */ +/* IDL: uint32 pb_limit; */ +/* IDL: uint64 pb_slv; */ +/* IDL: } */ + +static int +lustre_dissect_element_ptlrpc_body_pb_handle(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=lustre_dissect_struct_handle_cookie(tvb,offset,pinfo,tree,hf_lustre_ptlrpc_body_pb_handle); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_type(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_type); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_version(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_version); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_opc(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_opc); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_status(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_status); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_last_xid(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_last_xid); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_last_seen(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_last_seen); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_last_committed(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_last_committed); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_transno(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_transno); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_flags); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_op_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_) +{ + offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_op_flags); + + return offset; +} + +static int +lustre_dissect_element_ptlrpc_body_pb_conn_cnt(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree 
_U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_conn_cnt);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ptlrpc_body_pb_timeout(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_timeout);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ptlrpc_body_pb_service_time(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_service_time);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ptlrpc_body_pb_limit(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_limit);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_ptlrpc_body_pb_slv(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint64(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb_slv);
+
+  return offset;
+}
+
+static int
+lustre_dissect_struct_ptlrpc_body(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_, guint32 buf_len _U_)
+{
+  proto_item *item = NULL;
+  proto_tree *tree = NULL;
+  int old_offset;
+  guint32 opc, pb_type;
+
+  old_offset=offset;
+
+  if (parent_tree) {
+    item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+    tree = proto_item_add_subtree(item, ett_lustre_ptlrpc_body);
+  }
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_handle(tvb, offset, pinfo, tree);
+
+  pb_type = tvb_get_letohl(tvb, offset);
+  //g_print("msg_v2_lm_type_offset = %d \n" , offset) ;
+  offset=lustre_dissect_element_ptlrpc_body_pb_type(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_version(tvb, offset, pinfo, tree);
+
+  // g_print("msg_v2_opcode_offset %d \n", offset);
+  opc = tvb_get_letohl(tvb, offset);
+  offset=lustre_dissect_element_ptlrpc_body_pb_opc(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_status(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_last_xid(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_last_seen(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_last_committed(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_transno(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_flags(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_op_flags(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_conn_cnt(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_timeout(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_service_time(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_limit(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_ptlrpc_body_pb_slv(tvb, offset, pinfo, tree);
+
+  proto_item_set_len(item, offset-old_offset);
+
+  /* display some nice info */
+  display_info_str(parent_tree, pinfo->cinfo, COL_INFO, val_to_str(opc, lustre_op_codes, "Unknown"));
+  display_info_fstr(parent_tree, pinfo->cinfo, COL_INFO, " %s ", val_to_str(pb_type, lustre_LMTypes, "Unknown"));
+
+  /* dissect the remaining buffers into parent_tree so that the different
+   * buffers (described by lm_bufcount and lm_buflens) stay clearly
+   * separated; this is purely a presentation choice */
+  offset=lustre_opcode_process(tvb, offset, pinfo, parent_tree, opc, pb_type);
+
+  sanity_check(tvb, pinfo, offset-old_offset);
+  return offset;
+}
+
+
+
+/* IDL: struct lustre_msg_v2 { */
+/* IDL:   uint32 lm_bufcount; */
+/* IDL:   uint32 lm_secflvr; */
+/* IDL:   uint32 lm_magic; */
+/* IDL:   uint32 lm_repsize; */
+/* IDL:   uint32 lm_cksum; */
+/* IDL:   uint32 lm_flags; */
+/* IDL:   uint32 lm_padding_2; */
+/* IDL:   uint32 lm_padding_3; */
+/* IDL:   uint32 lm_buflens[0]; */
+/* IDL: } */
+
+static int
+lustre_dissect_element_msg_v2_lm_bufcount(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  proto_tree_add_item(tree, hf_lustre_lustre_msg_v2_lm_bufcount, tvb, offset, 4, TRUE);
+  offset += 4 ;
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_secflvr(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_secflvr);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_magic(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_magic);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_repsize(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_repsize);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_cksum(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_cksum);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_flags(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_flags);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_padding_2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_padding_2);
+
+  return offset;
+}
+
+static int
+lustre_dissect_element_msg_v2_lm_padding_3(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_padding_3);
+
+  return offset;
+}
+
+
+
+static int
+lustre_dissect_element_msg_v2_lm_buflens_(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *tree _U_)
+{
+  offset=dissect_uint32(tvb, offset, pinfo, tree, hf_lustre_lustre_msg_v2_lm_buflens);
+
+  return offset;
+}
+
+int
+lustre_dissect_struct_msg_v2(tvbuff_t *tvb _U_, int offset _U_, packet_info *pinfo _U_, proto_tree *parent_tree _U_, int hf_index _U_)
+{
+  proto_item *item = NULL;
+  proto_tree *tree = NULL;
+  guint32 bufcount ;
+  int old_offset;
+  guint32 i ;
+  guint32 buf_len_offset;
+  guint32 current_buf_len ;
+  gboolean extra_padding ;
+
+  old_offset=offset;
+  /* to get a light display */
+  tree=parent_tree;
+  // if (parent_tree) {
+  //   item = proto_tree_add_item(parent_tree, hf_index, tvb, offset, -1, TRUE);
+  //   tree = proto_item_add_subtree(item, ett_lustre_lustre_msg_v2);
+  // }
+
+  bufcount = tvb_get_letohl(tvb,offset);
+  offset=lustre_dissect_element_msg_v2_lm_bufcount(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v2_lm_secflvr(tvb, offset, pinfo, tree);
+
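+  /* Layout note (an editorial sketch of the arithmetic the walk below
+   * relies on, derived from the padding logic further down, not generated
+   * code): a lustre_msg_v2 header is eight 32-bit words followed by
+   * lm_buflens[lm_bufcount], and one extra pad word is consumed when
+   * lm_bufcount is odd so that the buffers which follow stay 8-byte
+   * aligned.  For example, lm_bufcount == 3 gives 32 + 3*4 + 4 = 48
+   * header bytes before the first buffer (the ptlrpc_body). */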
+  offset=lustre_dissect_element_msg_v2_lm_magic(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v2_lm_repsize(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v2_lm_cksum(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v2_lm_flags(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v2_lm_padding_2(tvb, offset, pinfo, tree);
+
+  offset=lustre_dissect_element_msg_v2_lm_padding_3(tvb, offset, pinfo, tree);
+
+
+
+  if (bufcount & 1) /* we add an extra padding if bufcount is odd */
+    extra_padding = 1 ;
+  else
+    extra_padding = 0 ;
+
+  buf_len_offset=offset ;
+  for (i=0;i<bufcount;i++)
+    offset=lustre_dissect_element_msg_v2_lm_buflens_(tvb, offset, pinfo, tree);
+
+  if (extra_padding)
+    offset=lustre_dissect_element_msg_v2_lm_buflens_(tvb, offset, pinfo, tree);
+
+  current_buf_len = tvb_get_letohl(tvb, buf_len_offset);
+  offset=lustre_dissect_struct_ptlrpc_body(tvb, offset, pinfo, tree, hf_lustre_ptlrpc_body_pb, current_buf_len);
+
+  return offset;
+}
+
+static void
+dissect_lustre(tvbuff_t *tvb, packet_info *pinfo, proto_tree *tree)
+{
+  if (check_col(pinfo->cinfo, COL_PROTOCOL))
+    col_set_str(pinfo->cinfo, COL_PROTOCOL, "Lustre");
+
+  /* light display */
+  if (check_col(pinfo->cinfo, COL_INFO))
+    col_set_str(pinfo->cinfo, COL_INFO, "");
+
+  /* guint32 magic_number ; */
+  /* magic_number = tvb_get_letohl(tvb, LUSTRE_MAGIC_OFFSET); */
+  /* switch (magic_number)*/
+  /* {*/
+  /*   case LUSTRE_MSG_MAGIC_V1:*/
+  /*     col_append_fstr(pinfo->cinfo, COL_INFO, " V1 ");*/
+  /*     break;*/
+  /*   case LUSTRE_MSG_MAGIC_V2:*/
+  /*     col_append_fstr(pinfo->cinfo, COL_INFO, " V2 ");*/
+  /*     break;*/
+  /*   default:*/
+  /*     break;*/
+  /* }*/
+
+  if (tree) {
+
+    guint32 magic_number ;
+    guint32 offset;
+    proto_item *ti = NULL ;
+    proto_tree * lustre_tree = NULL ;
+
+
+    ti = proto_tree_add_item(tree,proto_lustre,tvb,0,-1,FALSE);
+    lustre_tree = proto_item_add_subtree(ti,ett_lustre);
+
+
+    magic_number = tvb_get_letohl(tvb, 8);
+
+
+    switch (magic_number){
+      case LUSTRE_MSG_MAGIC_V1:
+        /* put some nice info */
+        proto_item_append_text(lustre_tree, " V1 ");
+        offset=lustre_dissect_struct_msg_v1(tvb, 0, pinfo, lustre_tree, proto_lustre ) ;
+        break;
+      case LUSTRE_MSG_MAGIC_V2:
+        /* put some nice info */
+        proto_item_append_text(lustre_tree, " V2 ");
+        offset=lustre_dissect_struct_msg_v2(tvb, 0, pinfo, lustre_tree, proto_lustre ) ;
+        break;
+      default:
+        break;
+    }
+
+  }
+}
+
+void proto_register_dcerpc_lustre(void)
+{
+  static hf_register_info hf[] = {
+    { &hf_lustre_mds_body_ctime,
+      { "Ctime", "lustre.mds_body.ctime", FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_fid1,
+      { "Fid1", "lustre.mds_body.fid1", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_nlink,
+      { "Nlink", "lustre.mds_body.nlink", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_flags,
+      { "Flags", "lustre.mds_body.flags", FT_UINT32, BASE_HEX, VALS(lustre_mds_flags_vals), 0, "", HFILL }},
+    { &hf_lustre_mds_body_fsgid,
+      { "Fsgid", "lustre.mds_body.fsgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_mtime,
+      { "Mtime", "lustre.mds_body.mtime", FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_uid,
+      { "Uid", "lustre.mds_body.uid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_mode,
+      { "Mode", "lustre.mds_body.mode", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_max_cookiesize,
+      { "Max Cookiesize", "lustre.mds_body.max_cookiesize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_io_epoch,
+      { "Io Epoch", "lustre.mds_body.io_epoch", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_ino,
+      { "Ino", "lustre.mds_body.ino", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_fid2,
+      { "Fid2", "lustre.mds_body.fid2", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }},
+    { &hf_lustre_mds_body_padding_4,
+      { "Padding 4", "lustre.mds_body.padding_4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }},
+    {
&hf_lustre_mds_body_aclsize, + { "Aclsize", "lustre.mds_body.aclsize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_valid, + { "Valid", "lustre.mds_body.valid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_generation, + { "Generation", "lustre.mds_body.generation", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_atime, + { "Atime", "lustre.mds_body.atime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_handle, + { "Handle", "lustre.mds_body.handle", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_max_mdsize, + { "Max Mdsize", "lustre.mds_body.max_mdsize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_rdev, + { "Rdev", "lustre.mds_body.rdev", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_blocks, + { "Blocks", "lustre.mds_body.blocks", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_fsuid, + { "Fsuid", "lustre.mds_body.fsuid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_gid, + { "Gid", "lustre.mds_body.gid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_handle_cookie, + { "Cookie", "lustre.lustre_handle.cookie", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_suppgid, + { "Suppgid", "lustre.mds_body.suppgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_size, + { "Size", "lustre.mds_body.size", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_eadatasize, + { "Eadatasize", "lustre.mds_body.eadatasize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_body_capability, + { "Capability", "lustre.mds_body.capability", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_last_committed, + { "Pb Last Committed", "lustre.ptlrpc_body.pb_last_committed", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_version, + { "Pb Version", "lustre.ptlrpc_body.pb_version", FT_UINT32, BASE_DEC, NULL, ~LUSTRE_VERSION_MASK, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_bufcount, + { "Lm Bufcount", "lustre.lustre_msg_v1.lm_bufcount", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_ioobj_ioo_id, + { "Ioo Id", "lustre.obd_ioobj.ioo_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_slv, + { "Pb Slv", "lustre.ptlrpc_body.pb_slv", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_handle, + { "Lm Handle", "lustre.lustre_msg_v1.lm_handle", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_ost_lvb_lvb_atime, + { "Lvb Atime", "lustre.ost_lvb.lvb_atime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_timeout, + { "Pb Timeout", "lustre.ptlrpc_body.pb_timeout", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_bavail, + { "Os Bavail", "lustre.obd_statfs.os_bavail", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_bsize, + { "Os Bsize", "lustre.obd_statfs.os_bsize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_repsize, + { "Lm Repsize", "lustre.lustre_msg_v2.lm_repsize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_stripe_size, + { "Lmm Stripe Size", "lustre.lov_mds_md_v1.lmm_stripe_size", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_last_xid, + { "Lm Last Xid", "lustre.lustre_msg_v1.lm_last_xid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { 
&hf_lustre_ll_fid_f_type, + { "F Type", "lustre.ll_fid.f_type", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_cksum, + { "Lm Cksum", "lustre.lustre_msg_v2.lm_cksum", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_buflens, + { "Lm Buflens", "lustre.lustre_msg_v2.lm_buflens", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_status, + { "Lm Status", "lustre.lustre_msg_v1.lm_status", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_type, + { "Lm Type", "lustre.lustre_msg_v1.lm_type", FT_UINT32, BASE_DEC, VALS(lustre_LMTypes), 0, "", HFILL }}, + { &hf_lustre_niobuf_remote_len, + { "Len", "lustre.niobuf_remote.len", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_magic, + { "Lmm Magic", "lustre.lov_mds_md_v1.lmm_magic", FT_UINT32, BASE_HEX, VALS(lustre_lov_magic) , 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_op_flags, + { "Pb Op Flags", "lustre.ptlrpc_body.pb_op_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ost_lvb_lvb_ctime, + { "Lvb Ctime", "lustre.ost_lvb.lvb_ctime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_type, + { "Pb Type", "lustre.ptlrpc_body.pb_type", FT_UINT32, BASE_DEC, VALS(lustre_LMTypes), 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_nllg, + { "Ocd Nllg", "lustre.obd_connect_data.ocd_nllg", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_nllu, + { "Ocd Nllu", "lustre.obd_connect_data.ocd_nllu", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ll_fid_generation, + { "Generation", "lustre.ll_fid.generation", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ost_lvb_lvb_mtime, + { "Lvb Mtime", "lustre.ost_lvb.lvb_mtime",FT_ABSOLUTE_TIME, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_ibits_known, + { "Ocd Ibits Known", "lustre.obd_connect_data.ocd_ibits_known", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_padding_3, + { "Lm Padding 3", "lustre.lustre_msg_v2.lm_padding_3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_flags, + { "Pb Flags", "lustre.ptlrpc_body.pb_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare4, + { "Os Spare4", "lustre.obd_statfs.os_spare4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_group, + { "Ocd Group", "lustre.obd_connect_data.ocd_group", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_ost_data_v1_l_object_gr, + { "L Object Gr", "lustre.lov_ost_data_v1.l_object_gr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_object_gr, + { "Lmm Object Gr", "lustre.lov_mds_md_v1.lmm_object_gr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_brw_size, + { "Ocd Brw Size", "lustre.obd_connect_data.ocd_brw_size", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_limit, + { "Pb Limit", "lustre.ptlrpc_body.pb_limit", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_maxbytes, + { "Os Maxbytes", "lustre.obd_statfs.os_maxbytes", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare5, + { "Os Spare5", "lustre.obd_statfs.os_spare5", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_flags, + { "Lm Flags", "lustre.lustre_msg_v2.lm_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { 
&hf_lustre_obd_statfs_os_ffree, + { "Os Ffree", "lustre.obd_statfs.os_ffree", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_files, + { "Os Files", "lustre.obd_statfs.os_files", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_stripe_count, + { "Lmm Stripe Count", "lustre.lov_mds_md_v1.lmm_stripe_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_flags, + { "Lm Flags", "lustre.lustre_msg_v1.lm_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_last_committed, + { "Lm Last Committed", "lustre.lustre_msg_v1.lm_last_committed", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare9, + { "Os Spare9", "lustre.obd_statfs.os_spare9", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_index, + { "Ocd Index", "lustre.obd_connect_data.ocd_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_buflens, + { "Lm Buflens", "lustre.lustre_msg_v1.lm_buflens", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare1, + { "Os Spare1", "lustre.obd_statfs.os_spare1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare8, + { "Os Spare8", "lustre.obd_statfs.os_spare8", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_conn_cnt, + { "Lm Conn Cnt", "lustre.lustre_msg_v1.lm_conn_cnt", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_transno, + { "Pb Transno", "lustre.ptlrpc_body.pb_transno", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_service_time, + { "Pb Service Time", "lustre.ptlrpc_body.pb_service_time",FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_conn_cnt, + { "Pb Conn Cnt", "lustre.ptlrpc_body.pb_conn_cnt", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_opc, + { "Pb Opc", "lustre.ptlrpc_body.pb_opc", FT_UINT32, BASE_DEC, VALS(lustre_op_codes), 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_connect_flags, + { "Ocd Connect Flags", "lustre.obd_connect_data.ocd_connect_flags", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_ost_data_v1_l_object_id, + { "L Object Id", "lustre.lov_ost_data_v1.l_object_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_ost_data_v1_l_ost_gen, + { "L Ost Gen", "lustre.lov_ost_data_v1.l_ost_gen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_bfree, + { "Os Bfree", "lustre.obd_statfs.os_bfree", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_version, + { "Ocd Version", "lustre.obd_connect_data.ocd_version", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_objects, + { "Lmm Objects", "lustre.lov_mds_md_v1.lmm_objects", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_status_req_flags, + { "Flags", "lustre.mds_status_req.flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_namelen, + { "Os Namelen", "lustre.obd_statfs.os_namelen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_blocks, + { "Os Blocks", "lustre.obd_statfs.os_blocks", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_secflvr, + { "Lm Secflvr", "lustre.lustre_msg_v2.lm_secflvr", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_transno, + { "Lm Transno", "lustre.lustre_msg_v1.lm_transno", FT_UINT64, 
BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_pattern, + { "Lmm Pattern", "lustre.lov_mds_md_v1.lmm_pattern", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_opc, + { "Lm Opc", "lustre.lustre_msg_v1.lm_opc", FT_UINT32, BASE_DEC, VALS(lustre_op_codes), 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_grant, + { "Ocd Grant", "lustre.obd_connect_data.ocd_grant", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_ioobj_ioo_bufcnt, + { "Ioo Bufcnt", "lustre.obd_ioobj.ioo_bufcnt", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_version, + { "Lm Version", "lustre.lustre_msg_v1.lm_version", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare7, + { "Os Spare7", "lustre.obd_statfs.os_spare7", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_fsid, + { "Os Fsid", "lustre.obd_statfs.os_fsid", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_cksum_types, + { "Ocd Cksum Types", "lustre.obd_connect_data.ocd_cksum_types", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ost_lvb_lvb_size, + { "Lvb Size", "lustre.ost_lvb.lvb_size", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_type, + { "Os Type", "lustre.obd_statfs.os_type", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare6, + { "Os Spare6", "lustre.obd_statfs.os_spare6", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_state, + { "Os State", "lustre.obd_statfs.os_state", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare3, + { "Os Spare3", "lustre.obd_statfs.os_spare3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_magic, + { "Lm Magic", "lustre.lustre_msg_v2.lm_magic", FT_UINT32, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1_lmm_object_id, + { "Lmm Object Id", "lustre.lov_mds_md_v1.lmm_object_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_last_seen, + { "Pb Last Seen", "lustre.ptlrpc_body.pb_last_seen", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_ioobj_ioo_type, /* TODO : create the corresponding value_string */ + { "Ioo Type", "lustre.obd_ioobj.ioo_type", FT_UINT32, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_last_xid, + { "Pb Last Xid", "lustre.ptlrpc_body.pb_last_xid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_status, + { "Pb Status", "lustre.ptlrpc_body.pb_status", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_niobuf_remote_flags, + { "Flags", "lustre.niobuf_remote.flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ll_fid_id, + { "Id", "lustre.ll_fid.id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ost_lvb_lvb_blocks, + { "Lvb Blocks", "lustre.ost_lvb.lvb_blocks", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_padding_2, + { "Lm Padding 2", "lustre.lustre_msg_v2.lm_padding_2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_padding1, + { "Padding1", "lustre.obd_connect_data.padding1", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_ost_data_v1_l_ost_idx, + { "L Ost Idx", "lustre.lov_ost_data_v1.l_ost_idx", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_padding2, + { "Padding2", "lustre.obd_connect_data.padding2", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { 
&hf_lustre_obd_ioobj_ioo_gr, + { "Ioo Gr", "lustre.obd_ioobj.ioo_gr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_niobuf_remote_offset, + { "Offset", "lustre.niobuf_remote.offset", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_status_req_repbuf, + { "Repbuf", "lustre.mds_status_req.repbuf", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_statfs_os_spare2, + { "Os Spare2", "lustre.obd_statfs.os_spare2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v2_lm_bufcount, + { "Lm Bufcount", "lustre.lustre_msg_v2.lm_bufcount", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb_handle, + { "Pb Handle", "lustre.ptlrpc_body.pb_handle", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_connect_data_ocd_transno, + { "Ocd Transno", "lustre.obd_connect_data.ocd_transno", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lustre_msg_v1_lm_magic, + { "Lm Magic", "lustre.lustre_msg_v1.lm_magic", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ptlrpc_body_pb, + { "ptl rpc", "lustre.ptlrpc_body", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL }}, + { &hf_lustre_mds_body, + { "mds body", "lustre.mds_body", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL }}, + { &hf_lustre_mds_rec_create, + { "mds rec create", "lustre.mds_rec_create", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_mds_rec_link, + { "mds rec link", "lustre.mds_rec_link", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_mds_rec_unlink, + { "mds rec unlink", "lustre.mds_rec_unlink", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + + { &hf_lustre_obd_uuid, + { "obd uid name", "lustre.obd_uid", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_obd_connect_data , + { "obd connect data", "lustre.obd_connect_data", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + + { &hf_lustre_ldlm_intent, + { "ldlm intent", "lustre.ldlm_intent", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + + { &hf_lustre_lov_user_md_join, + { "lov user md join", "lustre.lov_user_md_join", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_obd_ioobj, + { "lustre obd ioobj", "lustre.obd_ioobj", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_niobuf_remote, + { "lustre niobuf remote", "lustre.niobuf_remote", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_ost_key, + { "lustre ost key", "lustre.ost_key", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_ost_val, + { "lustre ost val", "lustre.ost_val", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_llogd_chunk, + { "lustre llogd chunk", "lustre.llogd_chunk", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_llogd_keyword, + { "lustre llogd keyword", "lustre.llogd_keyword", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_llogd_client, + { "lustre llogd client", "lustre.llogd_client", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_llogd_name, + { "lustre llogd name", "lustre.llogd_name", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_llogd_log_hdr, + { "lustre llogd log hdr", "lustre.llogd_log_hdr", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_llog_logid_rec, + { "lustre llog logid rec", "lustre.llog_logid_rec", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + + { &hf_lustre_llogd_body, + { "lustre llogd body", "lustre.llogd_body", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL}}, + { &hf_lustre_nio, + { "lustre nio", "lustre.nio", FT_STRING, BASE_NONE, NULL , 0 , "", HFILL}}, + { 
&hf_lustre_ost_body, + { "ost body", "lustre.ost_body", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL }}, + { &hf_lustre_obd_statfs, + { "obd statfs", "lustre.obd_statfs", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL }}, + { &hf_lustre_obd_quotactl, + { "obd quotactl", "lustre.obd_quotacl", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL }}, + { &hf_lustre_quota_adjust_qunit, + { "obd quota adjust qunit", "lustre.quota_adjust_qunit", FT_NONE, BASE_NONE, NULL , 0 , "", HFILL }}, + { &hf_lustre_llog_unlink_rec_lur_tail, + { "Lur Tail", "lustre.llog_unlink_rec.lur_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_size_change_rec_lsc_io_epoch, + { "Lsc Io Epoch", "lustre.llog_size_change_rec.lsc_io_epoch", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_flags, + { "Mti Flags", "lustre.mgs_target_info.mti_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_reply_lock_policy_res1, + { "Lock Policy Res1", "lustre.ldlm_reply.lock_policy_res1", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_fsuid, + { "Lk Fsuid", "lustre.mds_rec_link.lk_fsuid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_body_lgd_len, + { "Lgd Len", "lustre.llogd_body.lgd_len", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old_qd_id, + { "Qd Id", "lustre.qunit_data_old.qd_id", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_stripe_count, + { "Lmm Stripe Count", "lustre.lov_user_md_join.lmm_stripe_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_padding1, + { "Padding1", "lustre.llog_logid_rec.padding1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_quota_adjust_qunit_padding1, + { "Padding1", "lustre.quota_adjust_qunit.padding1", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_size_change_rec_lsc_fid, + { "Lsc Fid", "lustre.llog_size_change_rec.lsc_fid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_rec_hdr_padding, + { "Padding", "lustre.llog_rec_hdr.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_time, + { "Cr Time", "lustre.mds_rec_create.cr_time",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_padding_1, + { "Cr Padding 1", "lustre.mds_rec_create.cr_padding_1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_nlink, + { "O Nlink", "lustre.obdo.o_nlink", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_suppgid2, + { "Rn Suppgid2", "lustre.mds_rec_rename.rn_suppgid2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_padding_4, + { "Lk Padding 4", "lustre.mds_rec_link.lk_padding_4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_cap, + { "Lk Cap", "lustre.mds_rec_link.lk_cap", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_extent_gid, + { "Gid", "lustre.ldlm_extent.gid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_uid, + { "O Uid", "lustre.obdo.o_uid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_padding_5, + { "Cr Padding 5", "lustre.mds_rec_create.cr_padding_5", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_xattr_name, + { "mds xattr name", "lustre.mds_xattr_name", FT_STRING, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_v1, + { "lov mds md v1", "lustre.lov_mds_md_v1", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + { 
&hf_lustre_llog_cookie, + { "llog cookie", "lustre.llog_cookie", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_md_data, + { "mds md data", "lustre.mds_md_data", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_reint_opcode, + { "mds reint opcode", "lustre.mds_reint_opcode", FT_STRING, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_xattr_eadata, + { "mds xattr eadata", "lustre.mds_xattr_eadata", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_join, + { "lov mds md join", "lustre.lov_mds_md_join", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_reint_name, + { "mds reint name", "lustre.mds_reint_name", FT_STRING, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_reint_old_name, + { "mds reint old name", "lustre.mds_reint_old_name", FT_STRING, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_reint_new_name, + { "mds reint new name", "lustre.mds_reint_new_name", FT_STRING, BASE_NONE, NULL, 0, "", HFILL }}, + + + + { &hf_lustre_obdo_o_valid, + { "O Valid", "lustre.obdo.o_valid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_reply_lock_flags, + { "Lock Flags", "lustre.ldlm_reply.lock_flags", FT_UINT32,BASE_HEX, NULL, 0, "", HFILL }}, + + {&hf_lustre_ldlm_fl_lock_changed, {"LDLM_FL_LOCK_CHANGED", "lustre.ldlm_fl_lock_changed", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_LOCK_CHANGED, "", HFILL}}, + {&hf_lustre_ldlm_fl_block_granted, {"LDLM_FL_BLOCK_GRANTED", "lustre.ldlm_fl_block_granted", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_BLOCK_GRANTED, "", HFILL}}, + {&hf_lustre_ldlm_fl_block_conv, {"LDLM_FL_BLOCK_CONV", "lustre.ldlm_fl_block_conv", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_BLOCK_CONV, "", HFILL}}, + {&hf_lustre_ldlm_fl_block_wait, {"LDLM_FL_BLOCK_WAIT", "lustre.ldlm_fl_block_wait", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_BLOCK_WAIT, "", HFILL}}, + {&hf_lustre_ldlm_fl_cbpending, {"LDLM_FL_CBPENDING", "lustre.ldlm_fl_cbpending", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_CBPENDING, "", HFILL}}, + {&hf_lustre_ldlm_fl_ast_sent, {"LDLM_FL_AST_SENT", "lustre.ldlm_fl_ast_sent", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_AST_SENT, "", HFILL}}, + {&hf_lustre_ldlm_fl_wait_noreproc, {"LDLM_FL_WAIT_NOREPROC", "lustre.ldlm_fl_wait_noreproc", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_WAIT_NOREPROC, "", HFILL}}, + {&hf_lustre_ldlm_fl_cancel, {"LDLM_FL_CANCEL", "lustre.ldlm_fl_cancel", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_CANCEL, "", HFILL}}, + {&hf_lustre_ldlm_fl_replay, {"LDLM_FL_REPLAY", "lustre.ldlm_fl_replay", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_REPLAY, "", HFILL}}, + {&hf_lustre_ldlm_fl_intent_only, {"LDLM_FL_INTENT_ONLY", "lustre.ldlm_fl_intent_only", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_INTENT_ONLY, "", HFILL}}, + {&hf_lustre_ldlm_fl_local_only, {"LDLM_FL_LOCAL_ONLY", "lustre.ldlm_fl_local_only", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_LOCAL_ONLY, "", HFILL}}, + {&hf_lustre_ldlm_fl_failed, {"LDLM_FL_FAILED", "lustre.ldlm_fl_failed", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_FAILED, "", HFILL}}, + {&hf_lustre_ldlm_fl_has_intent, {"LDLM_FL_HAS_INTENT", "lustre.ldlm_fl_has_intent", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_HAS_INTENT, "", HFILL}}, + {&hf_lustre_ldlm_fl_canceling, {"LDLM_FL_CANCELING", "lustre.ldlm_fl_canceling", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_CANCELING, "", HFILL}}, + {&hf_lustre_ldlm_fl_local, {"LDLM_FL_LOCAL", "lustre.ldlm_fl_local", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_LOCAL, "", HFILL}}, + 
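+    /* Filtering note: each LDLM_FL_* entry above and below registers an
+     * FT_BOOLEAN over a single bit of the 32-bit lock-flags word, so every
+     * flag can be used directly as a display filter; an illustrative
+     * expression such as "lustre.ldlm_fl_cancel == 1" (standard Wireshark
+     * filter syntax) would match packets with LDLM_FL_CANCEL set. */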
{&hf_lustre_ldlm_fl_warn, {"LDLM_FL_WARN", "lustre.ldlm_fl_warn", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_WARN, "", HFILL}}, + {&hf_lustre_ldlm_fl_discard_data, {"LDLM_FL_DISCARD_DATA", "lustre.ldlm_fl_discard_data", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_DISCARD_DATA, "", HFILL}}, + {&hf_lustre_ldlm_fl_no_timeout, {"LDLM_FL_NO_TIMEOUT", "lustre.ldlm_fl_no_timeout", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_NO_TIMEOUT, "", HFILL}}, + {&hf_lustre_ldlm_fl_block_nowait, {"LDLM_FL_BLOCK_NOWAIT", "lustre.ldlm_fl_block_nowait", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_BLOCK_NOWAIT, "", HFILL}}, + {&hf_lustre_ldlm_fl_test_lock, {"LDLM_FL_TEST_LOCK", "lustre.ldlm_fl_test_lock", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_TEST_LOCK, "", HFILL}}, + {&hf_lustre_ldlm_fl_lvb_ready, {"LDLM_FL_LVB_READY", "lustre.ldlm_fl_lvb_ready", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_LVB_READY, "", HFILL}}, + {&hf_lustre_ldlm_fl_kms_ignore, {"LDLM_FL_KMS_IGNORE", "lustre.ldlm_fl_kms_ignore", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_KMS_IGNORE, "", HFILL}}, + {&hf_lustre_ldlm_fl_no_lru, {"LDLM_FL_NO_LRU", "lustre.ldlm_fl_no_lru", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_NO_LRU, "", HFILL}}, + {&hf_lustre_ldlm_fl_cancel_on_block, {"LDLM_FL_CANCEL_ON_BLOCK", "lustre.ldlm_fl_cancel_on_block", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_CANCEL_ON_BLOCK, "", HFILL}}, + {&hf_lustre_ldlm_fl_cp_reqd, {"LDLM_FL_CP_REQD", "lustre.ldlm_fl_cp_reqd", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_CP_REQD, "", HFILL}}, + {&hf_lustre_ldlm_fl_cleaned, {"LDLM_FL_CLEANED", "lustre.ldlm_fl_cleaned", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_CLEANED, "", HFILL}}, + {&hf_lustre_ldlm_fl_atomic_cb, {"LDLM_FL_ATOMIC_CB", "lustre.ldlm_fl_atomic_cb", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_ATOMIC_CB, "", HFILL}}, + {&hf_lustre_ldlm_fl_bl_ast, {"LDLM_FL_BL_AST", "lustre.ldlm_fl_bl_ast", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_BL_AST, "", HFILL}}, + {&hf_lustre_ldlm_fl_bl_done, {"LDLM_FL_BL_DONE", "lustre.ldlm_fl_bl_done", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_BL_DONE, "", HFILL}}, + {&hf_lustre_ldlm_fl_deny_on_contention, {"LDLM_FL_DENY_ON_CONTENTION", "lustre.ldlm_fl_deny_on_contention", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_FL_DENY_ON_CONTENTION, "", HFILL}}, + {&hf_lustre_ldlm_ast_discard_data, {"LDLM_AST_DISCARD_DATA", "lustre.ldlm_ast_discard_data", FT_BOOLEAN, 32, TFS(&flags_set_truth), LDLM_AST_DISCARD_DATA, "", HFILL}}, + + + { &hf_lustre_mds_rec_link_lk_padding_3, + { "Lk Padding 3", "lustre.mds_rec_link.lk_padding_3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_misc, + { "O Misc", "lustre.obdo.o_misc", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_suppgid, + { "Sa Suppgid", "lustre.mds_rec_setattr.sa_suppgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_attr_flags, + { "Sa Attr Flags", "lustre.mds_rec_setattr.sa_attr_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_padding_2, + { "Rn Padding 2", "lustre.mds_rec_rename.rn_padding_2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_request_lock_handle, + { "Lock Handle", "lustre.ldlm_request.lock_handle", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_lgl_oid, + { "Lgl Oid", "lustre.llog_logid.lgl_oid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_inodebits_bits, + { "Bits", "lustre.ldlm_inodebits.bits", FT_UINT64, BASE_DEC, 
NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_count, + { "Llh Count", "lustre.llog_log_hdr.llh_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_padding_4, + { "Ul Padding 4", "lustre.mds_rec_unlink.ul_padding_4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_stripe_size, + { "Lmm Stripe Size", "lustre.lov_user_md_join.lmm_stripe_size", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_gen_rec_lgr_tail, + { "Lgr Tail", "lustre.llog_gen_rec.lgr_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_catid_lci_padding3, + { "Lci Padding3", "lustre.llog_catid.lci_padding3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_qd_qunit, + { "Qd Qunit", "lustre.qunit_data.qd_qunit", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_fid1, + { "Ul Fid1", "lustre.mds_rec_unlink.ul_fid1", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_setattr_rec_padding, + { "Padding", "lustre.llog_setattr_rec.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_opcode, + { "Rn Opcode", "lustre.mds_rec_rename.rn_opcode", FT_UINT32, BASE_DEC, VALS(lustre_mds_reint_t_vals) , 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_flags, + { "Cr Flags", "lustre.mds_rec_create.cr_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_fid1, + { "Rn Fid1", "lustre.mds_rec_rename.rn_fid1", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_extent_desc_med_start, + { "Med Start", "lustre.mds_extent_desc.med_start", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_cookie_lgc_lgl, + { "Lgc Lgl", "lustre.llog_cookie.lgc_lgl", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_quotactl_qc_dqinfo, + { "Qc Dqinfo", "lustre.obd_quotactl.qc_dqinfo", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_bitmap, + { "Llh Bitmap", "lustre.llog_log_hdr.llh_bitmap", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_size, + { "Sa Size", "lustre.mds_rec_setattr.sa_size", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_suppgid1, + { "Rn Suppgid1", "lustre.mds_rec_rename.rn_suppgid1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_quotactl_qc_stat, + { "Qc Stat", "lustre.obd_quotactl.qc_stat", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old2_qd_id, + { "Qd Id", "lustre.qunit_data_old2.qd_id", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_padding2, + { "Padding2", "lustre.llog_logid_rec.padding2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_mode, + { "Ul Mode", "lustre.mds_rec_unlink.ul_mode", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_orphan_rec_lor_tail, + { "Lor Tail", "lustre.llog_orphan_rec.lor_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_padding5, + { "Padding5", "lustre.llog_logid_rec.padding5", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_fsgid, + { "Cr Fsgid", "lustre.mds_rec_create.cr_fsgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_join_jr_fid, + { "Jr Fid", "lustre.mds_rec_join.jr_fid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + /*-------------------------------------------------------------------------------------------------------------*/ + /*all this flags are uint64, but I don't find 
the way to use something like TFS() with a Uint64*/ + { &hf_lustre_ldlm_intent_opc_open, + { "open", "lustre.ldlm_intent.opc_open", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_OPEN, "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_creat, + { "creat", "lustre.ldlm_intent.opc_creat", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_CREAT , "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_readdir, + { "readdir", "lustre.ldlm_intent.opc_readdir", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_READDIR , "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_getattr, + { "getattr", "lustre.ldlm_intent.opc_getattr", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_GETATTR, "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_lookup, + { "lookup", "lustre.ldlm_intent.opc_lookup", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_LOOKUP , "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_unlink, + { "unlink", "lustre.ldlm_intent.opc_unlink", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_UNLINK , "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_getxattr, + { "getxattr", "lustre.ldlm_intent.opc_getxattr", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_GETXATTR , "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_exec, + { "exec", "lustre.ldlm_intent.opc_exec", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_EXEC , "", HFILL}}, + { &hf_lustre_ldlm_intent_opc_pin, + { "pin", "lustre.ldlm_intent.opc_pin", FT_BOOLEAN, 32, TFS(&flags_set_truth), IT_PIN , "", HFILL}}, + /*-------------------------------------------------------------------------------------------------------------*/ + { &hf_lustre_ldlm_intent_opc, + { "intent opcode", "lustre.ldlm_intent.opc", FT_NONE, BASE_NONE, NULL, 0, "", HFILL}}, + { &hf_lustre_llog_rec_hdr_lrh_type, + { "Lrh Type", "lustre.llog_rec_hdr.lrh_type", FT_UINT32, BASE_HEX, VALS(lustre_llog_op_type), 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_fsgid, + { "Lk Fsgid", "lustre.mds_rec_link.lk_fsgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_rec_hdr_lrh_len, + { "Lrh Len", "lustre.llog_rec_hdr.lrh_len", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_setattr_rec_lsr_uid, + { "Lsr Uid", "lustre.llog_setattr_rec.lsr_uid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_padding_1, + { "Ld Padding 1", "lustre.lov_desc.ld_padding_1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_padding_4, + { "O Padding 4", "lustre.obdo.o_padding_4", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_padding, + { "Padding", "lustre.mgs_target_info.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_quotactl_qc_dqblk, + { "Qc Dqblk", "lustre.obd_quotactl.qc_dqblk", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_conn_body_lgdc_gen, + { "Lgdc Gen", "lustre.llogd_conn_body.lgdc_gen", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_padding_2, + { "Cr Padding 2", "lustre.mds_rec_create.cr_padding_2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_ctime, + { "Sa Ctime", "lustre.mds_rec_setattr.sa_ctime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_tail, + { "Llh Tail", "lustre.llog_log_hdr.llh_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_size, + { "O Size", "lustre.obdo.o_size", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_cap, + { "Sa Cap", "lustre.mds_rec_setattr.sa_cap", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_extent_start, + { "Start", "lustre.ldlm_extent.start", 
FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_opcode, + { "Ul Opcode", "lustre.mds_rec_unlink.ul_opcode", FT_UINT32, BASE_DEC, VALS(lustre_mds_reint_t_vals) , 0, "", HFILL }}, + { &hf_lustre_llog_size_change_rec_lsc_hdr, + { "Lsc Hdr", "lustre.llog_size_change_rec.lsc_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_time, + { "Ul Time", "lustre.mds_rec_unlink.ul_time",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_ost_data_join_l_extent_start, + { "L Extent Start", "lustre.lov_user_ost_data_join.l_extent_start", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_tree_id, + { "Lmm Tree Id", "lustre.lov_user_md_join.lmm_tree_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_create_rec_lcr_tail, + { "Lcr Tail", "lustre.llog_create_rec.lcr_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_mode, + { "Sa Mode", "lustre.mds_rec_setattr.sa_mode", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_lgl_ogr, + { "Lgl Ogr", "lustre.llog_logid.lgl_ogr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_create_rec_lcr_hdr, + { "Lcr Hdr", "lustre.llog_create_rec.lcr_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_cookie_lgc_padding, + { "Lgc Padding", "lustre.llog_cookie.lgc_padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_cap, + { "Cr Cap", "lustre.mds_rec_create.cr_cap", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old_qd_type, + { "Qd Type", "lustre.qunit_data_old.qd_type", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_flock_blocking_export, + { "Blocking Export", "lustre.ldlm_flock.blocking_export", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_gid, + { "Sa Gid", "lustre.mds_rec_setattr.sa_gid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_pattern, + { "Ld Pattern", "lustre.lov_desc.ld_pattern", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_qd_id, + { "Qd Id", "lustre.qunit_data.qd_id", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_fsname, + { "Mti Fsname", "lustre.mgs_target_info.mti_fsname", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_object_gr, + { "Lmm Object Gr", "lustre.lov_user_md_join.lmm_object_gr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_request_lock_flags, + { "Lock Flags", "lustre.ldlm_request.lock_flags", FT_UINT32, BASE_HEX, NULL, 0 , "", HFILL }}, + { &hf_lustre_obdo_o_mode, + { "O Mode", "lustre.obdo.o_mode", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_svname, + { "Mti Svname", "lustre.mgs_target_info.mti_svname", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_body_lgd_logid, + { "Lgd Logid", "lustre.llogd_body.lgd_logid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_opcode, + { "Cr Opcode", "lustre.mds_rec_create.cr_opcode", FT_UINT32, BASE_DEC, VALS(lustre_mds_reint_t_vals), 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_size, + { "Llh Size", "lustre.llog_log_hdr.llh_size", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_create_rec_padding, + { "Padding", "lustre.llog_create_rec.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_handle, + { "O Handle", "lustre.obdo.o_handle", FT_NONE, 
BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_atime, + { "O Atime", "lustre.obdo.o_atime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_quota_adjust_qunit_qaq_id, + { "Qaq Id", "lustre.quota_adjust_qunit.qaq_id", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_fid2, + { "Rn Fid2", "lustre.mds_rec_rename.rn_fid2", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_replayfid, + { "Cr Replayfid", "lustre.mds_rec_create.cr_replayfid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_lock_desc_l_policy_data, + { "L Policy Data", "lustre.ldlm_lock_desc.l_policy_data", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_suppgid1, + { "Lk Suppgid1", "lustre.mds_rec_link.lk_suppgid1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_quotactl_qc_cmd, + { "Qc Cmd", "lustre.obd_quotactl.qc_cmd", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_object_id, + { "Lmm Object Id", "lustre.lov_user_md_join.lmm_object_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_padding_3, + { "Rn Padding 3", "lustre.mds_rec_rename.rn_padding_3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_padding, + { "Padding", "lustre.qunit_data.padding", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_objects, + { "Lmm Objects", "lustre.lov_user_md_join.lmm_objects", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_quota_adjust_qunit_qaq_flags, + { "Qaq Flags", "lustre.quota_adjust_qunit.qaq_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_ost_data_join_l_object_gr, + { "L Object Gr", "lustre.lov_user_ost_data_join.l_object_gr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_lock_desc_l_granted_mode, + { "L Granted Mode", "lustre.ldlm_lock_desc.l_granted_mode", FT_UINT16, BASE_DEC, VALS(lustre_ldlm_mode_t_vals), 0, "", HFILL }}, + { &hf_lustre_obdo_o_gr, + { "O Gr", "lustre.obdo.o_gr", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_padding_2, + { "Ul Padding 2", "lustre.mds_rec_unlink.ul_padding_2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_gid, + { "O Gid", "lustre.obdo.o_gid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_catid_lci_logid, + { "Lci Logid", "lustre.llog_catid.lci_logid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_rec_tail_lrt_index, + { "Lrt Index", "lustre.llog_rec_tail.lrt_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_mds, + { "O Mds", "lustre.obdo.o_mds", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_extent_desc_med_lmm, + { "Med Lmm", "lustre.mds_extent_desc.med_lmm", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_default_stripe_count, + { "Ld Default Stripe Count", "lustre.lov_desc.ld_default_stripe_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_resource_desc_lr_padding, + { "Lr Padding", "lustre.ldlm_resource_desc.lr_padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_vers, + { "Cm Vers", "lustre.cfg_marker.cm_vers", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_fid, + { "Cr Fid", "lustre.mds_rec_create.cr_fid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_unlink_rec_lur_hdr, + { "Lur Hdr", "lustre.llog_unlink_rec.lur_hdr", FT_NONE, BASE_HEX, NULL, 0, "", 
HFILL }}, + { &hf_lustre_llogd_body_lgd_index, + { "Lgd Index", "lustre.llogd_body.lgd_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_tgtname, + { "Cm Tgtname", "lustre.cfg_marker.cm_tgtname", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_padding_1, + { "Ul Padding 1", "lustre.mds_rec_unlink.ul_padding_1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_cap, + { "Ul Cap", "lustre.mds_rec_unlink.ul_cap", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_array_rec_lmr_med, + { "Lmr Med", "lustre.llog_array_rec.lmr_med", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_setattr_rec_lsr_ogen, + { "Lsr Ogen", "lustre.llog_setattr_rec.lsr_ogen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_padding_3, + { "Cr Padding 3", "lustre.mds_rec_create.cr_padding_3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_lid_hdr, + { "Lid Hdr", "lustre.llog_logid_rec.lid_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_ost_data_join_l_ost_idx, + { "L Ost Idx", "lustre.lov_user_ost_data_join.l_ost_idx", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_easize, + { "O Easize", "lustre.obdo.o_easize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_array_id, + { "Lmm Array Id", "lustre.lov_user_md_join.lmm_array_id", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_ost_body_oa, + { "Oa", "lustre.ost_body.oa", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_padding3, + { "Padding3", "lustre.llog_logid_rec.padding3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_flags, + { "Llh Flags", "lustre.llog_log_hdr.llh_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + {&hf_lustre_llog_hdr_llh_flag_zap_when_empty, + {"LLOG_F_ZAP_WHEN_EMPTY","lustre.lustre.llog_log_hdr.llh_flag_zap",FT_BOOLEAN,32,TFS(&flags_set_truth),LLOG_F_ZAP_WHEN_EMPTY,"",HFILL}}, + {&hf_lustre_llog_hdr_llh_flag_is_cat, + {"LLOG_F_IS_CAT","lustre.lustre.llog_log_hdr.llh_flag_cat",FT_BOOLEAN,32,TFS(&flags_set_truth),LLOG_F_IS_CAT,"",HFILL}}, + {&hf_lustre_llog_hdr_llh_flag_is_play, + {"LOG_F_IS_PLAIN","lustre.lustre.llog_log_hdr.llh_flag_play",FT_BOOLEAN,32,TFS(&flags_set_truth),LLOG_F_IS_PLAIN,"",HFILL}}, + + { &hf_lustre_llog_setattr_rec_lsr_oid, + { "Lsr Oid", "lustre.llog_setattr_rec.lsr_oid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_mode, + { "Cr Mode", "lustre.mds_rec_create.cr_mode", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_size_change_rec_padding, + { "Padding", "lustre.llog_size_change_rec.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_config_ver, + { "Mti Config Ver", "lustre.mgs_target_info.mti_config_ver", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_createtime, + { "Cm Createtime", "lustre.cfg_marker.cm_createtime",FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old_qd_count, + { "Qd Count", "lustre.qunit_data_old.qd_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_join_lmmj_array_id, + { "Lmmj Array Id", "lustre.lov_mds_md_join.lmmj_array_id", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_uid, + { "Sa Uid", "lustre.mds_rec_setattr.sa_uid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_catid_lci_padding1, + { "Lci 
Padding1", "lustre.llog_catid.lci_padding1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_atime, + { "Sa Atime", "lustre.mds_rec_setattr.sa_atime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_active_tgt_count, + { "Ld Active Tgt Count", "lustre.lov_desc.ld_active_tgt_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_lcookie, + { "O Lcookie", "lustre.obdo.o_lcookie", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_gen_rec_lgr_gen, + { "Lgr Gen", "lustre.llog_gen_rec.lgr_gen", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_ost_data_join_l_object_id, + { "L Object Id", "lustre.lov_user_ost_data_join.l_object_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_id, + { "O Id", "lustre.obdo.o_id", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_uuid, + { "Mti Uuid", "lustre.mgs_target_info.mti_uuid", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_padding_1, + { "Lk Padding 1", "lustre.mds_rec_link.lk_padding_1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_rec_hdr_lrh_index, + { "Lrh Index", "lustre.llog_rec_hdr.lrh_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_setattr_rec_lsr_hdr, + { "Lsr Hdr", "lustre.llog_setattr_rec.lsr_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_stripe_index, + { "Mti Stripe Index", "lustre.mgs_target_info.mti_stripe_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_gen_conn_cnt, + { "Conn Cnt", "lustre.llog_gen.conn_cnt", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_padding_6, + { "O Padding 6", "lustre.obdo.o_padding_6", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_suppgid, + { "Cr Suppgid", "lustre.mds_rec_create.cr_suppgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_cookie_lgc_index, + { "Lgc Index", "lustre.llog_cookie.lgc_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_uuid, + { "Ld Uuid", "lustre.lov_desc.ld_uuid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_create_rec_lcr_oid, + { "Lcr Oid", "lustre.llog_create_rec.lcr_oid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_reply_lock_desc, + { "Lock Desc", "lustre.ldlm_reply.lock_desc", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_padding_0, + { "Ld Padding 0", "lustre.lov_desc.ld_padding_0", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_unlink_rec_lur_ogen, + { "Lur Ogen", "lustre.llog_unlink_rec.lur_ogen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_orphan_rec_lor_hdr, + { "Lor Hdr", "lustre.llog_orphan_rec.lor_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_fsuid, + { "Rn Fsuid", "lustre.mds_rec_rename.rn_fsuid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_flags, + { "Cm Flags", "lustre.cfg_marker.cm_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_padding_3, + { "O Padding 3", "lustre.obdo.o_padding_3", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_ost_data_join_l_ost_gen, + { "L Ost Gen", "lustre.lov_user_ost_data_join.l_ost_gen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_fsuid, + { "Cr Fsuid", "lustre.mds_rec_create.cr_fsuid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, 
+ { &hf_lustre_mds_rec_unlink_ul_fsgid, + { "Ul Fsgid", "lustre.mds_rec_unlink.ul_fsgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_request_lock_desc, + { "Lock Desc", "lustre.ldlm_request.lock_desc", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_pattern, + { "Lmm Pattern", "lustre.lov_user_md_join.lmm_pattern", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_fsuid, + { "Ul Fsuid", "lustre.mds_rec_unlink.ul_fsuid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_suppgid2, + { "Lk Suppgid2", "lustre.mds_rec_link.lk_suppgid2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_orphan_rec_padding, + { "Padding", "lustre.llog_orphan_rec.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_tree_gen, + { "Lmm Tree Gen", "lustre.lov_user_md_join.lmm_tree_gen", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_flags, + { "O Flags", "lustre.obdo.o_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_params, + { "Mti Params", "lustre.mgs_target_info.mti_params", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_lgl_ogen, + { "Lgl Ogen", "lustre.llog_logid.lgl_ogen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_valid, + { "Sa Valid", "lustre.mds_rec_setattr.sa_valid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_comment, + { "Cm Comment", "lustre.cfg_marker.cm_comment", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_unlink_rec_lur_oid, + { "Lur Oid", "lustre.llog_unlink_rec.lur_oid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_qd_count, + { "Qd Count", "lustre.qunit_data.qd_count", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_padding_1, + { "Rn Padding 1", "lustre.mds_rec_rename.rn_padding_1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_mtime, + { "O Mtime", "lustre.obdo.o_mtime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_join_lmmj_md, + { "Lmmj Md", "lustre.lov_mds_md_join.lmmj_md", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_fsgid, + { "Rn Fsgid", "lustre.mds_rec_rename.rn_fsgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_cap, + { "Rn Cap", "lustre.mds_rec_rename.rn_cap", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_blksize, + { "O Blksize", "lustre.obdo.o_blksize", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_suppgid, + { "Ul Suppgid", "lustre.mds_rec_unlink.ul_suppgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_res_id_name, + { "Name", "lustre.ldlm_res_id.name", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_time, + { "Lk Time", "lustre.mds_rec_link.lk_time",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_reply_lock_handle, + { "Lock Handle", "lustre.ldlm_reply.lock_handle", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_padding_3, + { "Ul Padding 3", "lustre.mds_rec_unlink.ul_padding_3", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_body_lgd_saved_index, + { "Lgd Saved Index", "lustre.llogd_body.lgd_saved_index", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_join_jr_headsize, + { "Jr Headsize", "lustre.mds_rec_join.jr_headsize", 
FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_padding_4, + { "Rn Padding 4", "lustre.mds_rec_rename.rn_padding_4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old_qd_isblk, + { "Qd Isblk", "lustre.qunit_data_old.qd_isblk", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_blocks, + { "O Blocks", "lustre.obdo.o_blocks", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_padding_2, + { "Ld Padding 2", "lustre.lov_desc.ld_padding_2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_fid2, + { "Lk Fid2", "lustre.mds_rec_link.lk_fid2", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_lid_tail, + { "Lid Tail", "lustre.llog_logid_rec.lid_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_grant, + { "O Grant", "lustre.obdo.o_grant", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_padding_2, + { "O Padding 2", "lustre.obdo.o_padding_2", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_quota_adjust_qunit_qaq_iunit_sz, + { "Qaq Iunit Sz", "lustre.quota_adjust_qunit.qaq_iunit_sz", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_unlink_rec_padding, + { "Padding", "lustre.llog_unlink_rec.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_lock_desc_l_req_mode, + { "L Req Mode", "lustre.ldlm_lock_desc.l_req_mode", FT_UINT16, BASE_DEC, VALS(lustre_ldlm_mode_t_vals), 0, "", HFILL }}, + { &hf_lustre_ldlm_extent_end, + { "End", "lustre.ldlm_extent.end", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_gen_rec_lgr_hdr, + { "Lgr Hdr", "lustre.llog_gen_rec.lgr_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_orphan_rec_lor_ogen, + { "Lor Ogen", "lustre.llog_orphan_rec.lor_ogen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_extent_count, + { "Lmm Extent Count", "lustre.lov_user_md_join.lmm_extent_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_extent_desc_med_len, + { "Med Len", "lustre.mds_extent_desc.med_len", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_body_lgd_llh_flags, + { "Lgd Llh Flags", "lustre.llogd_body.lgd_llh_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_array_rec_lmr_hdr, + { "Lmr Hdr", "lustre.llog_array_rec.lmr_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_cat_idx, + { "Llh Cat Idx", "lustre.llog_log_hdr.llh_cat_idx", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_bitmap_offset, + { "Llh Bitmap Offset", "lustre.llog_log_hdr.llh_bitmap_offset", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_orphan_rec_lor_oid, + { "Lor Oid", "lustre.llog_orphan_rec.lor_oid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_reply_lock_padding, + { "Lock Padding", "lustre.ldlm_reply.lock_padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_quotactl_qc_id, + { "Qc Id", "lustre.obd_quotactl.qc_id", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_padding_4, + { "Cr Padding 4", "lustre.mds_rec_create.cr_padding_4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_padding4, + { "Padding4", "lustre.llog_logid_rec.padding4", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_padding_2, + { "Lk Padding 2", "lustre.mds_rec_link.lk_padding_2", FT_UINT32, BASE_DEC, NULL, 0, "", 
HFILL }}, + { &hf_lustre_llog_setattr_rec_lsr_gid, + { "Lsr Gid", "lustre.llog_setattr_rec.lsr_gid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_md_join_lmm_magic, + { "Lmm Magic", "lustre.lov_user_md_join.lmm_magic", FT_UINT32, BASE_HEX, VALS(lustre_lov_magic), 0, "", HFILL }}, + { &hf_lustre_obd_quotactl_qc_type, + { "Qc Type", "lustre.obd_quotactl.qc_type", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_padding, + { "Padding", "lustre.cfg_marker.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_nids, + { "Mti Nids", "lustre.mgs_target_info.mti_nids", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_user_ost_data_join_l_extent_end, + { "L Extent End", "lustre.lov_user_ost_data_join.l_extent_end", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_stripe_idx, + { "O Stripe Idx", "lustre.obdo.o_stripe_idx", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_conn_body_lgdc_logid, + { "Lgdc Logid", "lustre.llogd_conn_body.lgdc_logid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_fsuid, + { "Sa Fsuid", "lustre.mds_rec_setattr.sa_fsuid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_flock_blocking_pid, + { "Blocking Pid", "lustre.ldlm_flock.blocking_pid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_tgt_count, + { "Ld Tgt Count", "lustre.lov_desc.ld_tgt_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_body_lgd_cur_offset, + { "Lgd Cur Offset", "lustre.llogd_body.lgd_cur_offset", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_create_rec_lcr_ogen, + { "Lcr Ogen", "lustre.llog_create_rec.lcr_ogen", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old2_qd_count, + { "Qd Count", "lustre.qunit_data_old2.qd_count", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_opnum, + { "Operation", "lustre.opnum", FT_UINT16, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_old2_qd_flags, + { "Qd Flags", "lustre.qunit_data_old2.qd_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_flock_start, + { "Start", "lustre.ldlm_flock.start", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_quota_adjust_qunit_qaq_bunit_sz, + { "Qaq Bunit Sz", "lustre.quota_adjust_qunit.qaq_bunit_sz", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_array_rec_lmr_tail, + { "Lmr Tail", "lustre.llog_array_rec.lmr_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_flock_pid, + { "Pid", "lustre.ldlm_flock.pid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_default_stripe_size, + { "Ld Default Stripe Size", "lustre.lov_desc.ld_default_stripe_size", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_opcode, + { "Sa Opcode", "lustre.mds_rec_setattr.sa_opcode", FT_UINT32, BASE_DEC, VALS(lustre_mds_reint_t_vals), 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_tgtuuid, + { "Llh Tgtuuid", "lustre.llog_log_hdr.llh_tgtuuid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_fid1, + { "Lk Fid1", "lustre.mds_rec_link.lk_fid1", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_step, + { "Cm Step", "lustre.cfg_marker.cm_step", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_send_param_mgs_param, + { "Mgs Param", "lustre.mgs_send_param.mgs_param", FT_UINT8, BASE_DEC, NULL, 0, "", HFILL }}, + { 
&hf_lustre_llog_create_rec_lcr_fid, + { "Lcr Fid", "lustre.llog_create_rec.lcr_fid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_default_stripe_offset, + { "Ld Default Stripe Offset", "lustre.lov_desc.ld_default_stripe_offset", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_resource_desc_lr_name, + { "Lr Name", "lustre.ldlm_resource_desc.lr_name", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_rec_tail_lrt_len, + { "Lrt Len", "lustre.llog_rec_tail.lrt_len", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_mtime, + { "Sa Mtime", "lustre.mds_rec_setattr.sa_mtime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_timestamp, + { "Llh Timestamp", "lustre.llog_log_hdr.llh_timestamp", FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_catid_lci_padding2, + { "Lci Padding2", "lustre.llog_catid.lci_padding2", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_conn_body_lgdc_ctxt_idx, + { "Lgdc Ctxt Idx", "lustre.llogd_conn_body.lgdc_ctxt_idx", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_cfg_marker_cm_canceltime, + { "Cm Canceltime", "lustre.cfg_marker.cm_canceltime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_lustre_ver, + { "Mti Lustre Ver", "lustre.mgs_target_info.mti_lustre_ver", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_padding_1, + { "O Padding 1", "lustre.obdo.o_padding_1", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_qunit_data_qd_flags, + { "Qd Flags", "lustre.qunit_data.qd_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_logid_rec_lid_id, + { "Lid Id", "lustre.llog_logid_rec.lid_id", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_generation, + { "O Generation", "lustre.obdo.o_generation", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_gen_mnt_cnt, + { "Mnt Cnt", "lustre.llog_gen.mnt_cnt", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_size_change_rec_lsc_tail, + { "Lsc Tail", "lustre.llog_size_change_rec.lsc_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_padding_5, + { "O Padding 5", "lustre.obdo.o_padding_5", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_lock_desc_l_resource, + { "L Resource", "lustre.ldlm_lock_desc.l_resource", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_rename_rn_time, + { "Rn Time", "lustre.mds_rec_rename.rn_time",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_create_cr_rdev, + { "Cr Rdev", "lustre.mds_rec_create.cr_rdev", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obdo_o_fid, + { "O Fid", "lustre.obdo.o_fid", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_fid, + { "Sa Fid", "lustre.mds_rec_setattr.sa_fid", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_request_lock_count, + { "Lock Count", "lustre.ldlm_request.lock_count", FT_UINT32, BASE_HEX_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_flock_end, + { "End", "lustre.ldlm_flock.end", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_link_lk_opcode, + { "Lk Opcode", "lustre.mds_rec_link.lk_opcode", FT_UINT32, BASE_DEC,VALS(lustre_mds_reint_t_vals) , 0, "", HFILL }}, + { &hf_lustre_mgs_target_info_mti_nid_count, + { "Mti Nid Count", "lustre.mgs_target_info.mti_nid_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + /* 
----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ + { &hf_lustre_mgs_target_info, + { "mgs target info", "lustre.mgs_target_info", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + { &hf_lustre_mgs_send_param, + { "mgs send param", "lustre.mgs_send_param", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + /* ----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- */ + { &hf_lustre_ost_lvb, + { "ost lvb data", "lustre.ost_lvb", FT_NONE, BASE_NONE, NULL, 0, "", HFILL }}, + + { &hf_lustre_obdo_o_ctime, + { "O Ctime", "lustre.obdo.o_ctime",FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_reply_lock_policy_res2, + { "Lock Policy Res2", "lustre.ldlm_reply.lock_policy_res2", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llogd_body_lgd_ctxt_idx, + { "Lgd Ctxt Idx", "lustre.llogd_body.lgd_ctxt_idx", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_unlink_ul_fid2, + { "Ul Fid2", "lustre.mds_rec_unlink.ul_fid2", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_desc_ld_qos_maxage, + { "Ld Qos Maxage", "lustre.lov_desc.ld_qos_maxage", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_ldlm_resource_desc_lr_type, + { "Lr Type", "lustre.ldlm_resource_desc.lr_type", FT_UINT16, BASE_DEC, VALS(lustre_ldlm_type_t_vals), 0, "", HFILL }}, + { &hf_lustre_llog_setattr_rec_lsr_tail, + { "Lsr Tail", "lustre.llog_setattr_rec.lsr_tail", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_cookie_lgc_subsys, + { "Lgc Subsys", "lustre.llog_cookie.lgc_subsys", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_hdr, + { "Llh Hdr", "lustre.llog_log_hdr.llh_hdr", FT_NONE, BASE_HEX, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_fsgid, + { "Sa Fsgid", "lustre.mds_rec_setattr.sa_fsgid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_mds_rec_setattr_sa_padding, + { "Sa Padding", "lustre.mds_rec_setattr.sa_padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_lov_mds_md_join_lmmj_extent_count, + { "Lmmj Extent Count", "lustre.lov_mds_md_join.lmmj_extent_count", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_llog_log_hdr_llh_reserved, + { "Llh Reserved", "lustre.llog_log_hdr.llh_reserved", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + + { &hf_lustre_extra_padding, + { "extra padding", "lustre.msg_v2_extra_padding", FT_NONE, BASE_NONE, NULL, 0, "", HFILL}}, + { &hf_lustre_ldlm_reply, + { "ldlm reply", "lustre.ldlm_reply", FT_NONE, BASE_NONE, NULL, 0, "", HFILL}}, + { &hf_lustre_ldlm_request, + { "ldlm request", "lustre.ldlm_request", FT_NONE, BASE_NONE, NULL, 0, "", HFILL}}, + + /* add for lustre_user.h */ + { &hf_lustre_obd_dqinfo_dqi_valid, + { "Dqi Valid", "lustre.obd_dqinfo.dqi_valid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_isoftlimit, + { "Dqb Isoftlimit", "lustre.obd_dqblk.dqb_isoftlimit", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_bhardlimit, + { "Dqb Bhardlimit", "lustre.obd_dqblk.dqb_bhardlimit", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_curspace, + { "Dqb Curspace", "lustre.obd_dqblk.dqb_curspace", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_itime, + { "Dqb Itime", "lustre.obd_dqblk.dqb_itime", 
FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_valid, + { "Dqb Valid", "lustre.obd_dqblk.dqb_valid", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqinfo_dqi_igrace, + { "Dqi Igrace", "lustre.obd_dqinfo.dqi_igrace", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqinfo_dqi_bgrace, + { "Dqi Bgrace", "lustre.obd_dqinfo.dqi_bgrace", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_padding, + { "Padding", "lustre.obd_dqblk.padding", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_curinodes, + { "Dqb Curinodes", "lustre.obd_dqblk.dqb_curinodes", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_bsoftlimit, + { "Dqb Bsoftlimit", "lustre.obd_dqblk.dqb_bsoftlimit", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqinfo_dqi_flags, + { "Dqi Flags", "lustre.obd_dqinfo.dqi_flags", FT_UINT32, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_btime, + { "Dqb Btime", "lustre.obd_dqblk.dqb_btime", FT_ABSOLUTE_TIME, BASE_DEC, NULL, 0, "", HFILL }}, + { &hf_lustre_obd_dqblk_dqb_ihardlimit, + { "Dqb Ihardlimit", "lustre.obd_dqblk.dqb_ihardlimit", FT_UINT64, BASE_DEC, NULL, 0, "", HFILL }} + + }; + + + static gint *ett[] = { + &ett_lustre, + &ett_lustre_lustre_handle_cookie, + &ett_lustre_lustre_msg_v1, + &ett_lustre_lustre_handle_v1, + &ett_lustre_lustre_msg_v2, + &ett_lustre_ptlrpc_body, + &ett_lustre_lustre_handle_v2, + &ett_lustre_obd_connect_data, + &ett_lustre_lov_mds_md_v1, + &ett_lustre_lov_ost_data_v1, + &ett_lustre_obd_statfs, + &ett_lustre_obd_ioobj, + &ett_lustre_niobuf_remote, + &ett_lustre_ost_lvb, + &ett_lustre_ll_fid, + &ett_lustre_mds_status_req, + &ett_lustre_mds_body, + &ett_lustre_obd_quotactl, + &ett_lustre_obd_dqinfo, + &ett_lustre_obd_dqblk, + &ett_lustre_quota_adjust_qunit, + &ett_lustre_mds_rec_setattr, + &ett_lustre_mds_rec_create, + &ett_lustre_mds_rec_join, + &ett_lustre_mds_rec_link, + &ett_lustre_mds_rec_unlink, + &ett_lustre_mds_rec_rename, + &ett_lustre_lov_desc, + &ett_lustre_obd_uuid, + &ett_lustre_ldlm_res_id, + &ett_lustre_ldlm_extent, + &ett_lustre_ldlm_inodebits, + &ett_lustre_ldlm_flock, + &ett_lustre_ldlm_intent_opc, + &ett_lustre_ldlm_resource_desc, + &ett_lustre_ldlm_res_id, + &ett_lustre_ldlm_lock_desc, + &ett_lustre_ldlm_resource_desc, + &ett_lustre_ldlm_request, + &ett_lustre_ldlm_lock_desc, + &ett_lustre_lustre_handle, + &ett_lustre_ldlm_reply, + &ett_lustre_ldlm_lock_desc, + &ett_lustre_lustre_handle, + &ett_lustre_mgs_send_param, + &ett_lustre_mgs_target_info, + &ett_lustre_cfg_marker, + &ett_lustre_llog_logid, + &ett_lustre_llog_catid, + &ett_lustre_llog_logid, + &ett_lustre_lov_mds_md_join, + &ett_lustre_lov_mds_md, + &ett_lustre_llog_logid, + &ett_lustre_llog_rec_hdr, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_logid_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_llog_logid, + &ett_lustre_llog_rec_tail, + &ett_lustre_mds_extent_desc, + &ett_lustre_lov_mds_md, + &ett_lustre_llog_array_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_mds_extent_desc, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_create_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_ll_fid, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_orphan_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_unlink_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_setattr_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_size_change_rec, + 
&ett_lustre_llog_rec_hdr, + &ett_lustre_ll_fid, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_gen, + &ett_lustre_llog_gen_rec, + &ett_lustre_llog_rec_hdr, + &ett_lustre_llog_gen, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_log_hdr, + &ett_lustre_llog_rec_hdr, + &ett_lustre_obd_uuid, + &ett_lustre_llog_rec_tail, + &ett_lustre_llog_cookie, + &ett_lustre_llog_logid, + &ett_lustre_llogd_body, + &ett_lustre_llog_logid, + &ett_lustre_llogd_conn_body, + &ett_lustre_llog_gen, + &ett_lustre_llog_logid, + &ett_lustre_lov_user_ost_data_join, + &ett_lustre_lov_user_md_join, + &ett_lustre_llog_logid, + &ett_lustre_lov_user_ost_data_join, + &ett_lustre_obdo, + &ett_lustre_lustre_handle, + &ett_lustre_llog_cookie, + &ett_lustre_ost_body, + &ett_lustre_obdo, + &ett_lustre_qunit_data, + &ett_lustre_qunit_data_old2, + &ett_lustre_qunit_data_old, + &ett_lustre_ldlm_lock_flags, + &ett_lustre_llog_log_llh_flags, + + }; + + proto_lustre = proto_register_protocol("Lustre", "lustre", "lustre"); + proto_register_field_array(proto_lustre, hf, array_length (hf)); + proto_register_subtree_array(ett, array_length(ett)); +} + + +void proto_reg_handoff_lustre(void) +{ + dissector_handle_t lustre_handle; + lustre_handle=create_dissector_handle(dissect_lustre, proto_lustre); + /* we use Lustre only if ptl_index is one of these codes (the bulk code has been removed) */ + /* in LNET we test whether the message is a put before adding an lnet.ptl_index value */ + dissector_add("lnet.ptl_index", MDC_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", CONNMGR_REQUEST_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", CONNMGR_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", OSC_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", OST_IO_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", OST_CREATE_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", MDS_REQUEST_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", LDLM_CB_REQUEST_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", LDLM_CB_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", LDLM_CANCEL_REQUEST_PORTAL, lustre_handle); + dissector_add("lnet.ptl_index", LDLM_CANCEL_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", MDS_SETATTR_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", MDS_READPAGE_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", MGC_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", MGS_REQUEST_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", MGS_REPLY_PORTAL , lustre_handle); + dissector_add("lnet.ptl_index", OST_REQUEST_PORTAL , lustre_handle); +} diff --git a/lustre/doc/lfs.1 b/lustre/doc/lfs.1 index 3b3b7fb..532a60b 100644 --- a/lustre/doc/lfs.1 +++ b/lustre/doc/lfs.1 @@ -44,13 +44,33 @@ lfs \- Lustre utility to create a file with specific striping pattern, find the .br .B lfs quotaoff [-ug] .br -.B lfs setquota [-u|-g] - \fB +.B lfs quotainv [-ug] [-f] +.br +.B lfs setquota [-u|--user|-g|--group] <name> + \fB[--block-softlimit <block-softlimit>] + \fB[--block-hardlimit <block-hardlimit>] + \fB[--inode-softlimit <inode-softlimit>] + \fB[--inode-hardlimit <inode-hardlimit>] \fB<filesystem>\fR .br -.B lfs setquota -t [-u|-g] +.B lfs setquota [-u|--user|-g|--group] <name> + \fB[-b <block-softlimit>] [-B <block-hardlimit>] + \fB[-i <inode-softlimit>] [-I <inode-hardlimit>] + \fB<filesystem>\fR .br -.B lfs quota [-o obd_uuid] [-u|-g] +.B lfs setquota -t [-u|-g] + \fB[--block-grace <block-grace>] + \fB[--inode-grace <inode-grace>] + \fB<filesystem>\fR +.br +.B lfs setquota -t [-u|-g] + \fB[-b <block-grace>] [-i <inode-grace>] + \fB<filesystem>\fR +.br + +.B lfs quota [-v] [-o
obd_uuid|-i mdt_idx|-I ost_idx] [-u|-g] <name> <filesystem> +.br +.B lfs quota <filesystem> .br .B lfs quota -t [-u|-g] <filesystem> .br @@ -121,14 +141,17 @@ To turn filesystem quotas on. Options specify quota for users (-u) groups (-g) a .B quotaoff [-ugf] To turn filesystem quotas off. Options specify quota for users (-u) groups (-g) and force (-f) .TP -.B setquota [-u|-g] -To set filesystem quotas for users or groups. Limits are specific as blocks and inodes, see EXAMPLES +.B quotainv [-ug] [-f] +Clear quota files (administrative quota files if used without -f, operational quota files otherwise), all of their quota entries, for (-u) users or (-g) groups; after quotainv one must use quotacheck before using quotas. DO NOT USE THIS COMMAND UNLESS YOU REALLY KNOW WHAT IT DOES. IT IS MAINLY FOR INTERNAL PURPOSES. +.TP +.B setquota [-u|-g] <name> [--block-softlimit <block-softlimit>] [--block-hardlimit <block-hardlimit>] [--inode-softlimit <inode-softlimit>] [--inode-hardlimit <inode-hardlimit>] +To set filesystem quotas for users or groups. Limits can be specified with -b, -k, -m, -g, -t, -p suffixes which specify units of 1, 2^10, 2^20, 2^30, 2^40 and 2^50 respectively. The block limit unit is the kilobyte (1024 bytes) by default, and block limits are always kilobyte-grained (even if specified in bytes), see EXAMPLES .TP -.B setquota -t [-u|-g] +.B setquota -t [-u|-g] [--block-grace <block-grace>] [--inode-grace <inode-grace>] To set filesystem quota grace times for users or groups. Grace time is specified in "XXwXXdXXhXXmXXs" format or as an integer seconds value, see EXAMPLES .TP -.B quota [-o obd_uuid] [-u|-g] -To display disk usage and limits, either for the full filesystem, or for objects on a specific obd. A user or group name must be specified. +.B quota [-v] [-o obd_uuid|-i mdt_idx|-I ost_idx] [-u|-g] +To display disk usage and limits, either for the full filesystem, or for objects on a specific obd. A user or group name can be specified. If both user and group are omitted, quotas for the current uid/gid are shown. -v provides more verbose output (with per-obd statistics). .TP .B quota -t [-u|-g] To display block and inode grace times for user (-u) or group (-g) quotas @@ -141,7 +164,7 @@ Quit the interactive lfs session .SH EXAMPLES .TP .B $ lfs setstripe -s 128k -c 2 /mnt/lustre/file1 -This creats a file striped on two OSTs with 128kB on each stripe. +This creates a file striped on two OSTs with 128kB on each stripe. .TP .B $ lfs setstripe -d /mnt/lustre/dir This deletes a default stripe pattern on dir. New files will use the default striping pattern created therein.
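.TP .B $ lfs setquota -u bob --block-softlimit 2m --block-hardlimit 1g /mnt/lustre An illustrative sketch of the suffix syntax described above (the user name, values and mount point are assumed for the example, not taken from a real setup): this would request a 2 MB block soft limit and a 1 GB block hard limit for user `bob'. Since block limits are kilobyte-grained, the 2m soft limit would be stored as 2048 kB.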
@@ -182,10 +205,10 @@ Turn quotas of user and group on .B $ lfs quotaoff -ug /mnt/lustre Turn quotas of user and group off .TP -.B $ lfs setquota -u bob 0 1000000 0 10000 /mnt/lustre -Set quotas of user `bob': 1GB block quota and 10,000 file quota +.B $ lfs setquota -u bob --block-softlimit 2000000 --block-hardlimit 1000000 /mnt/lustre +Set quotas of user `bob': 1GB block quota hardlimit and 2 GB block quota softlimit .TP -.B $ lfs setquota -t -u 1000 1w4d /mnt/lustre +.B $ lfs setquota -t -u --block-grace 1000 --inode-grace 1w4d /mnt/lustre Set grace times for user quotas: 1000 seconds for block quotas, 1 week and 4 days for inode quotas .TP .B $ lfs quota -u bob /mnt/lustre diff --git a/lustre/doc/lock-ordering b/lustre/doc/lock-ordering new file mode 100644 index 0000000..3bea748 --- /dev/null +++ b/lustre/doc/lock-ordering @@ -0,0 +1,309 @@ +/* this is dot(1) input file for lock-ordering diagram */ +/* it should be passed through C preprocessor first */ +/* cpp -P -DFITPAGE lock-ordering | tred | dot -Tps | gv -media a4 - */ + +/* +sb->s_umount + libcfs_nidstring_lock + obd_dev_lock + g_uuid_lock + obd_types_lock + type->obd_type_lock + obd->obd_dev_lock + handle_base_lock + bucket->lock + _lprocfs_lock + the_lnet.ln_lock + request->rq_lock + ptlrpc_all_services_lock + service->srv_lock + shrinker_rwsem + conn_lock + hash_body->lchb_hash_tables[i].lhb_lock + hash_body->lchb_hash_tables[i].lhb_lock + imp->imp_lock + olg->olg_lock + cli->cl_sem + handle_base_lock + bucket->lock + obd->obd_dev_lock + ref->lf_guard + hash_body->lchb_hash_tables[i].lhb_lock + h->h_lock + _lprocfs_lock + imp->imp_lock + h->h_lock + policy_lock + null_sec.ps_lock + ptlrpc_last_xid_lock + set->set_new_req_lock + h->h_lock + ns->ns_hash_lock + ns->ns_unused_lock + lock->l_lock + null_sec.ps_lock + ptlrpc_last_xid_lock + request->rq_lock + ksocknal_data.ksnd_global_lock + at->at_lock + fld->lcf_lock + obd->obd_pool_lock + obd->obd_osfs_lock + lov->lov_qos.lq_rw_sem + sbi->ll_lco.lco_lock + cache->fci_lock + inode_lock + dcache_lock + dentry->d_lock +slock-AF_INET/1 + ksocknal_data.ksnd_global_lock + ksocknal_data.ksnd_connd_lock + kss->kss_lock +pl->pl_lock + obd->obd_pool_lock +inode->i_mutex + ns->ns_unused_lock + ns->ns_hash_lock + imp->imp_lock + null_sec.ps_lock + ptlrpc_last_xid_lock + bucket->lock + lock->l_lock + res->lr_lock + ns->ns_unused_lock + bucket->lock + h->h_lock + res->lr_lock/1 + inode_lock + osc_ast_guard_class + ref->lf_guard + ksocknal_data.ksnd_global_lock + at->at_lock + h->h_lock + blp->blp_lock + cache->fci_lock + obd->obd_pool_lock + fld->lcf_lock + pl->pl_lock + lu_site_guard_class + files_lock +lov->lo_type_guard + h->coh_lock_guard + ref->lf_guard + cl_lock_guard_class + ref->lf_guard + cl_lock_guard_class#2 + cl_lock_guard_class#2 + ref->lf_guard + ns->ns_hash_lock + ns->ns_unused_lock + imp->imp_lock + null_sec.ps_lock + ptlrpc_last_xid_lock + handle_base_lock + bucket->lock + lock->l_lock + set->set_new_req_lock + h->h_lock + h->coh_lock_guard + h->coh_page_guard + +*/ +#define CATTR fontsize=8 /*, fontname=Helvetica */ +#define NATTR CATTR +#define EATTR CATTR + +#define SYSATTR color=yellow, style=filled +#define PSEUDOATTR color=pink, style=filled, peripheries=2 + +#define BLOCKATTR shape=ellipse +#define SPINATTR shape=box + +#define CONDATTR color=blue, peripheries=2, BLOCKATTR + +#define MARKBLOCK(name) /* name -> schedulable [style=invis, weight=0] */ + +#define SBLOCK(name, l) name [label=l, NATTR, BLOCKATTR, SYSATTR]; MARKBLOCK(name) + +#define SPSEUDO(name) name 
[NATTR, BLOCKATTR, PSEUDOATTR]; MARKBLOCK(name) + +#define LBLOCK(name, l) name [label=l, NATTR, BLOCKATTR]; MARKBLOCK(name) + +#define RCOND(name, l) name [label=l, NATTR, CONDATTR]; MARKBLOCK(name) + +#define MARKSPIN(name) /* schedulable -> name [style=invis, weight=0] */ + +#define SSPIN(name, l) name [label=l, NATTR, SYSATTR, SPINATTR]; MARKSPIN(name) +#define LSPIN(name, l) name [label=l, NATTR, SPINATTR]; MARKSPIN(name) + +#define ARC(from, to, func, ...) from -> to [EATTR, label=func, ## __VA_ARGS__] +/* e.g. ARC(x, y, "") expands (after cpp) to: x -> y [fontsize=8, label=""] */ + +digraph locks { + + subgraph blocking { + SBLOCK(sb__s_umount, "sb->s_umount") + LBLOCK(_lprocfs_lock, "_lprocfs_lock") + LBLOCK(cli__cl_sem, "cli->cl_sem") + SBLOCK(shrinker_rwsem, "shrinker_rwsem") + LBLOCK(lov__lov_qos_lq_rw_sem, "lov->lov_qos.lq_rw_sem") + SBLOCK(inode__i_mutex, "inode->i_mutex") + LBLOCK(lov__lo_type_guard, "lov->lo_type_guard") + LBLOCK(cl_lock_guard_class, "cl_lock_guard_class") + } + + subgraph spin { + LSPIN(h__coh_lock_guard, "h->coh_lock_guard") + LSPIN(h__coh_page_guard, "h->coh_page_guard") + LSPIN(libcfs_nidstring_lock, "libcfs_nidstring_lock") + LSPIN(obd_dev_lock, "obd_dev_lock") + LSPIN(g_uuid_lock, "g_uuid_lock") + LSPIN(obd_types_lock, "obd_types_lock") + LSPIN(obd_type__obd_type_lock, "obd_type->obd_type_lock") + LSPIN(obd__obd_dev_lock, "obd->obd_dev_lock") + LSPIN(handle_base_lock, "handle_base_lock") + LSPIN(bucket__lock, "bucket->lock") + LSPIN(the_lnet_ln_lock, "the_lnet.ln_lock") + LSPIN(request__rq_lock, "request->rq_lock") + LSPIN(hash_body__lchb_hash_tables_i__lhb_lock, "hash_body->lchb_hash_tables[i].lhb_lock") + LSPIN(imp__imp_lock, "imp->imp_lock") + LSPIN(ref__lf_guard, "ref->lf_guard") + LSPIN(h__h_lock, "h->h_lock") + LSPIN(null_sec_ps_lock, "null_sec.ps_lock") + LSPIN(set__set_new_req_lock, "set->set_new_req_lock") + LSPIN(ns__ns_hash_lock, "ns->ns_hash_lock") + LSPIN(ns__ns_unused_lock, "ns->ns_unused_lock") + LSPIN(lock__l_lock, "lock->l_lock") + LSPIN(res__lr_lock, "res->lr_lock") + LSPIN(ksocknal_data_ksnd_global_lock, "ksocknal_data.ksnd_global_lock") + LSPIN(at__at_lock, "at->at_lock") + LSPIN(fld__lcf_lock, "fld->lcf_lock") + LSPIN(obd__obd_pool_lock, "obd->obd_pool_lock") + LSPIN(service__srv_lock, "service->srv_lock") + LSPIN(obd__obd_osfs_lock, "obd->obd_osfs_lock") + LSPIN(sbi__ll_lco_lco_lock, "sbi->ll_lco.lco_lock") + LSPIN(cache__fci_lock, "cache->fci_lock") + SSPIN(inode_lock, "inode_lock") + SSPIN(dcache_lock, "dcache_lock") + SSPIN(dentry__d_lock, "dentry->d_lock") + LSPIN(ksocknal_data_ksnd_connd_lock, "ksocknal_data.ksnd_connd_lock") + LSPIN(kss__kss_lock, "kss->kss_lock") + LSPIN(pl__pl_lock, "pl->pl_lock") + LSPIN(osc_ast_guard_class, "osc_ast_guard_class") + LSPIN(blp__blp_lock, "blp->blp_lock") + LSPIN(lu_site_guard_class, "lu_site_guard_class") + SSPIN(files_lock, "files_lock") + LSPIN(ptlrpc_all_services_lock, "ptlrpc_all_services_lock") + LSPIN(conn_lock, "conn_lock") + LSPIN(olg__olg_lock, "olg->olg_lock") + LSPIN(policy_lock, "policy_lock") + LSPIN(ptlrpc_last_xid_lock, "ptlrpc_last_xid_lock") + } + +ARC(sb__s_umount, libcfs_nidstring_lock, "") +ARC(sb__s_umount, obd_dev_lock, "") +ARC(sb__s_umount, g_uuid_lock, "") +ARC(sb__s_umount, obd_types_lock, "") +ARC(sb__s_umount, obd_type__obd_type_lock, "") +ARC(sb__s_umount, obd__obd_dev_lock, "") +ARC(sb__s_umount, handle_base_lock, "") +ARC(sb__s_umount, bucket__lock, "") +ARC(sb__s_umount, _lprocfs_lock, "") +ARC(sb__s_umount, the_lnet_ln_lock, "") +ARC(sb__s_umount, ptlrpc_all_services_lock, "") +ARC(sb__s_umount, service__srv_lock, "") +ARC(sb__s_umount, shrinker_rwsem, "") +ARC(sb__s_umount,
conn_lock, "") +ARC(sb__s_umount, hash_body__lchb_hash_tables_i__lhb_lock, "") +ARC(sb__s_umount, imp__imp_lock, "") +ARC(sb__s_umount, olg__olg_lock, "") +ARC(sb__s_umount, cli__cl_sem, "") +ARC(sb__s_umount, h__h_lock, "") +ARC(sb__s_umount, ns__ns_hash_lock, "") +ARC(sb__s_umount, ns__ns_unused_lock, "") +ARC(sb__s_umount, lock__l_lock, "") +ARC(sb__s_umount, null_sec_ps_lock, "") +ARC(sb__s_umount, ptlrpc_last_xid_lock, "") +ARC(sb__s_umount, request__rq_lock, "") +ARC(sb__s_umount, ksocknal_data_ksnd_global_lock, "") +ARC(sb__s_umount, at__at_lock, "") +ARC(sb__s_umount, fld__lcf_lock, "") +ARC(sb__s_umount, obd__obd_pool_lock, "") +ARC(sb__s_umount, obd__obd_osfs_lock, "") +ARC(sb__s_umount, lov__lov_qos_lq_rw_sem, "") +ARC(sb__s_umount, sbi__ll_lco_lco_lock, "") +ARC(sb__s_umount, cache__fci_lock, "") +ARC(sb__s_umount, inode_lock, "") +ARC(sb__s_umount, dcache_lock, "") + +ARC(the_lnet_ln_lock, request__rq_lock, "") +ARC(conn_lock, hash_body__lchb_hash_tables_i__lhb_lock, "") +ARC(cli__cl_sem, handle_base_lock, "") +ARC(cli__cl_sem, bucket__lock, "") +ARC(cli__cl_sem, obd__obd_dev_lock, "") +ARC(cli__cl_sem, h__h_lock, "") +ARC(cli__cl_sem, _lprocfs_lock, "") +ARC(cli__cl_sem, imp__imp_lock, "") +ARC(cli__cl_sem, policy_lock, "") +ARC(cli__cl_sem, null_sec_ps_lock, "") +ARC(cli__cl_sem, ptlrpc_last_xid_lock, "") +ARC(cli__cl_sem, set__set_new_req_lock, "") + +ARC(obd__obd_dev_lock, ref__lf_guard, "") +ARC(obd__obd_dev_lock, hash_body__lchb_hash_tables_i__lhb_lock, "") +ARC(imp__imp_lock, h__h_lock, "") + +ARC(dcache_lock, dentry__d_lock, "") + +ARC(ksocknal_data_ksnd_global_lock, ksocknal_data_ksnd_connd_lock, "") +ARC(ksocknal_data_ksnd_global_lock, kss__kss_lock, "") +ARC(pl__pl_lock, obd__obd_pool_lock, "") + +ARC(inode__i_mutex, ns__ns_unused_lock, "") +ARC(inode__i_mutex, ns__ns_hash_lock, "") +ARC(inode__i_mutex, imp__imp_lock, "") +ARC(inode__i_mutex, null_sec_ps_lock, "") +ARC(inode__i_mutex, ptlrpc_last_xid_lock, "") +ARC(inode__i_mutex, bucket__lock, "") +ARC(inode__i_mutex, lock__l_lock, "") +ARC(inode__i_mutex, ksocknal_data_ksnd_global_lock, "") +ARC(inode__i_mutex, at__at_lock, "") +ARC(inode__i_mutex, h__h_lock, "") +ARC(inode__i_mutex, blp__blp_lock, "") +ARC(inode__i_mutex, cache__fci_lock, "") +ARC(inode__i_mutex, obd__obd_pool_lock, "") +ARC(inode__i_mutex, fld__lcf_lock, "") +ARC(inode__i_mutex, pl__pl_lock, "") +ARC(inode__i_mutex, lu_site_guard_class, "") +ARC(inode__i_mutex, files_lock, "") + +ARC(lock__l_lock, res__lr_lock, "") +ARC(res__lr_lock, ns__ns_unused_lock, "") +ARC(res__lr_lock, bucket__lock, "") +ARC(res__lr_lock, res__lr_lock, "") +ARC(res__lr_lock, inode_lock, "") +ARC(res__lr_lock, osc_ast_guard_class, "") + +ARC(osc_ast_guard_class, ref__lf_guard, "") +ARC(bucket__lock, h__h_lock, "") + +ARC(cl_lock_guard_class, cl_lock_guard_class, "") +ARC(cl_lock_guard_class, ref__lf_guard, "") +ARC(cl_lock_guard_class, ns__ns_hash_lock, "") +ARC(cl_lock_guard_class, ns__ns_unused_lock, "") +ARC(cl_lock_guard_class, imp__imp_lock, "") +ARC(cl_lock_guard_class, null_sec_ps_lock, "") +ARC(cl_lock_guard_class, ptlrpc_last_xid_lock, "") +ARC(cl_lock_guard_class, handle_base_lock, "") +ARC(cl_lock_guard_class, bucket__lock, "") +ARC(cl_lock_guard_class, lock__l_lock, "") +ARC(cl_lock_guard_class, set__set_new_req_lock, "") +ARC(cl_lock_guard_class, h__h_lock, "") +ARC(cl_lock_guard_class, ref__lf_guard, "") +ARC(cl_lock_guard_class, cl_lock_guard_class, "") +ARC(cl_lock_guard_class, h__coh_lock_guard, "") +ARC(cl_lock_guard_class, h__coh_page_guard, "") + 
+ARC(lov__lo_type_guard, h__coh_lock_guard, "") +ARC(lov__lo_type_guard, ref__lf_guard, "") +ARC(lov__lo_type_guard, cl_lock_guard_class, "") + +} diff --git a/lustre/fid/fid_handler.c b/lustre/fid/fid_handler.c index d52b770..5af9343 100644 --- a/lustre/fid/fid_handler.c +++ b/lustre/fid/fid_handler.c @@ -93,6 +93,7 @@ int seq_server_set_cli(struct lu_server_seq *seq, seq->lss_name, cli->lcs_name); seq->lss_cli = cli; + cli->lcs_space.lsr_mdt = seq->lss_site->ms_node_id; EXIT; out_up: up(&seq->lss_sem); @@ -100,16 +101,22 @@ out_up: } EXPORT_SYMBOL(seq_server_set_cli); -/* +/** * On controller node, allocate new super sequence for regular sequence server. + * As the super sequence controller, this node is supposed to maintain the fld + * and update the index. + * \a out range always has the correct mds node number of the requester. */ + static int __seq_server_alloc_super(struct lu_server_seq *seq, - struct lu_range *in, - struct lu_range *out, + struct lu_seq_range *in, + struct lu_seq_range *out, const struct lu_env *env) { - struct lu_range *space = &seq->lss_space; - int rc; + struct lu_seq_range *space = &seq->lss_space; + struct thandle *th; + __u64 mdt = out->lsr_mdt; + int rc, credit; ENTRY; LASSERT(range_is_sane(space)); @@ -118,8 +125,8 @@ static int __seq_server_alloc_super(struct lu_server_seq *seq, CDEBUG(D_INFO, "%s: Input seq range: " DRANGE"\n", seq->lss_name, PRANGE(in)); - if (in->lr_end > space->lr_start) - space->lr_start = in->lr_end; + if (in->lsr_end > space->lsr_start) + space->lsr_start = in->lsr_end; *out = *in; CDEBUG(D_INFO, "%s: Recovered space: "DRANGE"\n", @@ -130,7 +137,7 @@ "Only "LPU64" sequences left\n", seq->lss_name, range_space(space)); *out = *space; - space->lr_start = space->lr_end; + space->lsr_start = space->lsr_end; } else if (range_is_exhausted(space)) { CERROR("%s: Sequences space is exhausted\n", seq->lss_name); @@ -139,23 +146,40 @@ range_alloc(out, space, seq->lss_width); } } + out->lsr_mdt = mdt; - rc = seq_store_write(seq, env); + credit = SEQ_TXN_STORE_CREDITS + FLD_TXN_INDEX_INSERT_CREDITS; + + th = seq_store_trans_start(seq, env, credit); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = seq_store_write(seq, env, th); if (rc) { CERROR("%s: Can't write space data, rc %d\n", seq->lss_name, rc); - RETURN(rc); + goto out; } - CDEBUG(D_INFO, "%s: Allocated super-sequence " - DRANGE"\n", seq->lss_name, PRANGE(out)); + rc = fld_server_create(seq->lss_site->ms_server_fld, + env, out, th); + if (rc) { + CERROR("%s: Can't update fld database, rc %d\n", + seq->lss_name, rc); + } + +out: + seq_store_trans_stop(seq, env, th); + + CDEBUG(D_INFO, "%s: super-sequence allocation rc = %d " + DRANGE"\n", seq->lss_name, rc, PRANGE(out)); RETURN(rc); } int seq_server_alloc_super(struct lu_server_seq *seq, - struct lu_range *in, - struct lu_range *out, + struct lu_seq_range *in, + struct lu_seq_range *out, const struct lu_env *env) { int rc; @@ -169,12 +193,14 @@ int seq_server_alloc_super(struct lu_server_seq *seq, } static int __seq_server_alloc_meta(struct lu_server_seq *seq, - struct lu_range *in, - struct lu_range *out, + struct lu_seq_range *in, + struct lu_seq_range *out, const struct lu_env *env) { - struct lu_range *space = &seq->lss_space; + struct lu_seq_range *space = &seq->lss_space; + struct thandle *th; int rc = 0; + ENTRY; LASSERT(range_is_sane(space)); @@ -193,22 +219,22 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq,
* we check here that range from client is "newer" than * exhausted super. */ - LASSERT(in->lr_end > space->lr_start); + LASSERT(in->lsr_end > space->lsr_start); /* * Start is set to end of last allocated, because it * *is* already allocated so we take that into account * and do not use for other allocations. */ - space->lr_start = in->lr_end; + space->lsr_start = in->lsr_end; /* - * End is set to in->lr_start + super sequence - * allocation unit. That is because in->lr_start is + * End is set to in->lsr_start + super sequence + * allocation unit. That is because in->lsr_start is * first seq in new allocated range from controller * before failure. */ - space->lr_end = in->lr_start + LUSTRE_SEQ_SUPER_WIDTH; + space->lsr_end = in->lsr_start + LUSTRE_SEQ_SUPER_WIDTH; if (!seq->lss_cli) { CERROR("%s: No sequence controller " @@ -221,6 +247,7 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq, * obtained range from it was @space. */ rc = seq_client_replay_super(seq->lss_cli, space, env); + if (rc) { CERROR("%s: Can't replay super-sequence, " "rc %d\n", seq->lss_name, rc); @@ -231,8 +258,8 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq, * Update super start by end from client's range. Super * end should not be changed if range was not exhausted. */ - if (in->lr_end > space->lr_start) - space->lr_start = in->lr_end; + if (in->lsr_end > space->lsr_start) + space->lsr_start = in->lsr_end; } *out = *in; @@ -266,7 +293,11 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq, range_alloc(out, space, seq->lss_width); } - rc = seq_store_write(seq, env); + th = seq_store_trans_start(seq, env, SEQ_TXN_STORE_CREDITS); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = seq_store_write(seq, env, th); if (rc) { CERROR("%s: Can't write space data, rc %d\n", seq->lss_name, rc); @@ -277,12 +308,13 @@ static int __seq_server_alloc_meta(struct lu_server_seq *seq, DRANGE"\n", seq->lss_name, PRANGE(out)); } + seq_store_trans_stop(seq, env, th); RETURN(rc); } int seq_server_alloc_meta(struct lu_server_seq *seq, - struct lu_range *in, - struct lu_range *out, + struct lu_seq_range *in, + struct lu_seq_range *out, const struct lu_env *env) { int rc; @@ -298,8 +330,8 @@ EXPORT_SYMBOL(seq_server_alloc_meta); static int seq_server_handle(struct lu_site *site, const struct lu_env *env, - __u32 opc, struct lu_range *in, - struct lu_range *out) + __u32 opc, struct lu_seq_range *in, + struct lu_seq_range *out) { int rc; struct md_site *mite; @@ -337,7 +369,7 @@ static int seq_req_handle(struct ptlrpc_request *req, const struct lu_env *env, struct seq_thread_info *info) { - struct lu_range *out, *in = NULL; + struct lu_seq_range *out, *in = NULL, *tmp; struct lu_site *site; int rc = -EPROTO; __u32 *opc; @@ -356,13 +388,16 @@ static int seq_req_handle(struct ptlrpc_request *req, if (out == NULL) RETURN(err_serious(-EPROTO)); - if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { - in = req_capsule_client_get(info->sti_pill, - &RMF_SEQ_RANGE); + tmp = req_capsule_client_get(info->sti_pill, &RMF_SEQ_RANGE); + if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + in = tmp; LASSERT(!range_is_zero(in) && range_is_sane(in)); } + /* seq client passed mdt id, we need to pass that using out + * range parameter */ + out->lsr_mdt = tmp->lsr_mdt; rc = seq_server_handle(site, env, *opc, in, out); } else rc = err_serious(-EPROTO); @@ -475,8 +510,10 @@ int seq_server_init(struct lu_server_seq *seq, struct dt_device *dev, const char *prefix, enum lu_mgr_type type, + struct md_site *ms, const 
struct lu_env *env) { + struct thandle *th; int rc, is_srv = (type == LUSTRE_SEQ_SERVER); ENTRY; @@ -485,7 +522,8 @@ int seq_server_init(struct lu_server_seq *seq, seq->lss_cli = NULL; seq->lss_type = type; - range_zero(&seq->lss_space); + seq->lss_site = ms; + range_init(&seq->lss_space); sema_init(&seq->lss_sem, 1); seq->lss_width = is_srv ? @@ -497,7 +535,6 @@ int seq_server_init(struct lu_server_seq *seq, rc = seq_store_init(seq, env, dev); if (rc) GOTO(out, rc); - /* Request backing store for saved sequence info. */ rc = seq_store_read(seq, env); if (rc == -ENODATA) { @@ -507,16 +544,22 @@ int seq_server_init(struct lu_server_seq *seq, LUSTRE_SEQ_ZERO_RANGE: LUSTRE_SEQ_SPACE_RANGE; + seq->lss_space.lsr_mdt = ms->ms_node_id; CDEBUG(D_INFO, "%s: No data found " "on store. Initialize space\n", seq->lss_name); + th = seq_store_trans_start(seq, env, SEQ_TXN_STORE_CREDITS); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + /* Save default controller value to store. */ - rc = seq_store_write(seq, env); + rc = seq_store_write(seq, env, th); if (rc) { CERROR("%s: Can't write space data, " "rc %d\n", seq->lss_name, rc); } + seq_store_trans_stop(seq, env, th); } else if (rc) { CERROR("%s: Can't read space data, rc %d\n", seq->lss_name, rc); @@ -556,6 +599,18 @@ EXPORT_SYMBOL(seq_server_fini); cfs_proc_dir_entry_t *seq_type_proc_dir = NULL; +static struct lu_local_obj_desc llod_seq_srv = { + .llod_name = LUSTRE_SEQ_SRV_NAME, + .llod_oid = FID_SEQ_SRV_OID, + .llod_is_index = 0, +}; + +static struct lu_local_obj_desc llod_seq_ctl = { + .llod_name = LUSTRE_SEQ_CTL_NAME, + .llod_oid = FID_SEQ_CTL_OID, + .llod_is_index = 0, +}; + static int __init fid_mod_init(void) { seq_type_proc_dir = lprocfs_register(LUSTRE_SEQ_NAME, @@ -564,6 +619,9 @@ static int __init fid_mod_init(void) if (IS_ERR(seq_type_proc_dir)) return PTR_ERR(seq_type_proc_dir); + llo_local_obj_register(&llod_seq_srv); + llo_local_obj_register(&llod_seq_ctl); + LU_CONTEXT_KEY_INIT(&seq_thread_key); lu_context_key_register(&seq_thread_key); return 0; @@ -571,6 +629,9 @@ static int __init fid_mod_init(void) static void __exit fid_mod_exit(void) { + llo_local_obj_unregister(&llod_seq_srv); + llo_local_obj_unregister(&llod_seq_ctl); + lu_context_key_degister(&seq_thread_key); if (seq_type_proc_dir != NULL && !IS_ERR(seq_type_proc_dir)) { lprocfs_remove(&seq_type_proc_dir); diff --git a/lustre/fid/fid_internal.h b/lustre/fid/fid_internal.h index 9137656..03c5227 100644 --- a/lustre/fid/fid_internal.h +++ b/lustre/fid/fid_internal.h @@ -49,10 +49,14 @@ struct seq_thread_info { struct req_capsule *sti_pill; struct txn_param sti_txn; - struct lu_range sti_space; + struct lu_seq_range sti_space; struct lu_buf sti_buf; }; +enum { + SEQ_TXN_STORE_CREDITS = 20 +}; + extern struct lu_context_key seq_thread_key; /* Functions used internally in module. */ @@ -60,7 +64,7 @@ int seq_client_alloc_super(struct lu_client_seq *seq, const struct lu_env *env); int seq_client_replay_super(struct lu_client_seq *seq, - struct lu_range *range, + struct lu_seq_range *range, const struct lu_env *env); /* Store API functions. 
*/ @@ -72,11 +76,19 @@ void seq_store_fini(struct lu_server_seq *seq, const struct lu_env *env); int seq_store_write(struct lu_server_seq *seq, - const struct lu_env *env); + const struct lu_env *env, + struct thandle *th); int seq_store_read(struct lu_server_seq *seq, const struct lu_env *env); +struct thandle * seq_store_trans_start(struct lu_server_seq *seq, + const struct lu_env *env, + int credits); +void seq_store_trans_stop(struct lu_server_seq *seq, + const struct lu_env *env, + struct thandle *th); + #ifdef LPROCFS extern struct lprocfs_vars seq_server_proc_list[]; extern struct lprocfs_vars seq_client_proc_list[]; diff --git a/lustre/fid/fid_lib.c b/lustre/fid/fid_lib.c index f472011..76e779a 100644 --- a/lustre/fid/fid_lib.c +++ b/lustre/fid/fid_lib.c @@ -69,15 +69,16 @@ * * * The first 0x400 sequences of normal FID are reserved for special purpose. + * FID_SEQ_START + 1 is for local file id generation. */ -const struct lu_range LUSTRE_SEQ_SPACE_RANGE = { +const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE = { FID_SEQ_START + 0x400ULL, (__u64)~0ULL }; EXPORT_SYMBOL(LUSTRE_SEQ_SPACE_RANGE); /* Zero range, used for init and other purposes. */ -const struct lu_range LUSTRE_SEQ_ZERO_RANGE = { +const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE = { 0, 0 }; @@ -88,50 +89,3 @@ const struct lu_fid LUSTRE_BFL_FID = { .f_seq = 0x0000000000000003, .f_oid = 0x0000000000000001, .f_ver = 0x0000000000000000 }; EXPORT_SYMBOL(LUSTRE_BFL_FID); - -void range_cpu_to_le(struct lu_range *dst, const struct lu_range *src) -{ - /* check that all fields are converted */ - CLASSERT(sizeof *src == - sizeof src->lr_start + - sizeof src->lr_end); - dst->lr_start = cpu_to_le64(src->lr_start); - dst->lr_end = cpu_to_le64(src->lr_end); -} -EXPORT_SYMBOL(range_cpu_to_le); - -void range_le_to_cpu(struct lu_range *dst, const struct lu_range *src) -{ - /* check that all fields are converted */ - CLASSERT(sizeof *src == - sizeof src->lr_start + - sizeof src->lr_end); - dst->lr_start = le64_to_cpu(src->lr_start); - dst->lr_end = le64_to_cpu(src->lr_end); -} -EXPORT_SYMBOL(range_le_to_cpu); - -#ifdef __KERNEL__ -void range_cpu_to_be(struct lu_range *dst, const struct lu_range *src) -{ - /* check that all fields are converted */ - CLASSERT(sizeof *src == - sizeof src->lr_start + - sizeof src->lr_end); - dst->lr_start = cpu_to_be64(src->lr_start); - dst->lr_end = cpu_to_be64(src->lr_end); -} -EXPORT_SYMBOL(range_cpu_to_be); - -void range_be_to_cpu(struct lu_range *dst, const struct lu_range *src) -{ - /* check that all fields are converted */ - CLASSERT(sizeof *src == - sizeof src->lr_start + - sizeof src->lr_end); - dst->lr_start = be64_to_cpu(src->lr_start); - dst->lr_end = be64_to_cpu(src->lr_end); -} -EXPORT_SYMBOL(range_be_to_cpu); - -#endif diff --git a/lustre/fid/fid_request.c b/lustre/fid/fid_request.c index 4266358..9939c82 100644 --- a/lustre/fid/fid_request.c +++ b/lustre/fid/fid_request.c @@ -63,13 +63,13 @@ #include #include "fid_internal.h" -static int seq_client_rpc(struct lu_client_seq *seq, struct lu_range *input, - struct lu_range *output, __u32 opc, +static int seq_client_rpc(struct lu_client_seq *seq, struct lu_seq_range *input, + struct lu_seq_range *output, __u32 opc, const char *opcname) { struct obd_export *exp = seq->lcs_exp; struct ptlrpc_request *req; - struct lu_range *out, *in; + struct lu_seq_range *out, *in; __u32 *op; int rc; ENTRY; @@ -88,16 +88,20 @@ static int seq_client_rpc(struct lu_client_seq *seq, struct lu_range *input, if (input != NULL) *in = *input; else - range_zero(in); + 
range_init(in); ptlrpc_request_set_replen(req); if (seq->lcs_type == LUSTRE_SEQ_METADATA) { req->rq_request_portal = (opc == SEQ_ALLOC_SUPER) ? SEQ_CONTROLLER_PORTAL : SEQ_METADATA_PORTAL; + /* update the mdt field of *in; it is required for the fld update + * on the super sequence allocator node. */ + if (opc == SEQ_ALLOC_SUPER) + in->lsr_mdt = seq->lcs_space.lsr_mdt; } else { - req->rq_request_portal = (opc == SEQ_ALLOC_SUPER) ? - SEQ_CONTROLLER_PORTAL : SEQ_DATA_PORTAL; + LASSERT(opc == SEQ_ALLOC_META); + req->rq_request_portal = SEQ_DATA_PORTAL; } ptlrpc_at_set_req_timeout(req); @@ -135,7 +139,7 @@ out_req: /* Request sequence-controller node to allocate new super-sequence. */ int seq_client_replay_super(struct lu_client_seq *seq, - struct lu_range *range, + struct lu_seq_range *range, const struct lu_env *env) { int rc; @@ -212,8 +216,8 @@ static int seq_client_alloc_seq(struct lu_client_seq *seq, seqno_t *seqnr) } LASSERT(!range_is_exhausted(&seq->lcs_space)); - *seqnr = seq->lcs_space.lr_start; - seq->lcs_space.lr_start += 1; + *seqnr = seq->lcs_space.lsr_start; + seq->lcs_space.lsr_start += 1; CDEBUG(D_INFO, "%s: Allocated sequence ["LPX64"]\n", seq->lcs_name, *seqnr); @@ -280,7 +284,14 @@ void seq_client_flush(struct lu_client_seq *seq) LASSERT(seq != NULL); down(&seq->lcs_sem); fid_zero(&seq->lcs_fid); - range_zero(&seq->lcs_space); + /** + * This id should not be used for seq range allocation. + * Set it to -1 for the debug check. + */ + + seq->lcs_space.lsr_mdt = -1; + + range_init(&seq->lcs_space); up(&seq->lcs_sem); } EXPORT_SYMBOL(seq_client_flush); diff --git a/lustre/fid/fid_store.c b/lustre/fid/fid_store.c index 42fda49..56b950c 100644 --- a/lustre/fid/fid_store.c +++ b/lustre/fid/fid_store.c @@ -62,9 +62,6 @@ #include "fid_internal.h" #ifdef __KERNEL__ -enum { - SEQ_TXN_STORE_CREDITS = 20 -}; static struct lu_buf *seq_store_buf(struct seq_thread_info *info) { @@ -76,47 +73,68 @@ static struct lu_buf *seq_store_buf(struct seq_thread_info *info) return buf; } +struct thandle * seq_store_trans_start(struct lu_server_seq *seq, + const struct lu_env *env, int credit) +{ + struct seq_thread_info *info; + struct dt_device *dt_dev; + struct thandle *th; + ENTRY; + + dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + info = lu_context_key_get(&env->le_ctx, &seq_thread_key); + LASSERT(info != NULL); + + txn_param_init(&info->sti_txn, credit); + + th = dt_dev->dd_ops->dt_trans_start(env, dt_dev, &info->sti_txn); + return th; +} + +void seq_store_trans_stop(struct lu_server_seq *seq, + const struct lu_env *env, + struct thandle *th) +{ + struct dt_device *dt_dev; + ENTRY; + + dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); + + dt_dev->dd_ops->dt_trans_stop(env, th); +} + /* This function implies that caller takes care about locking. */ int seq_store_write(struct lu_server_seq *seq, - const struct lu_env *env) + const struct lu_env *env, + struct thandle *th) { struct dt_object *dt_obj = seq->lss_obj; struct seq_thread_info *info; struct dt_device *dt_dev; - struct thandle *th; loff_t pos = 0; - int rc; - ENTRY; + int rc; + ENTRY; dt_dev = lu2dt_dev(seq->lss_obj->do_lu.lo_dev); info = lu_context_key_get(&env->le_ctx, &seq_thread_key); LASSERT(info != NULL); - /* Stub here, will fix it later. */ - txn_param_init(&info->sti_txn, SEQ_TXN_STORE_CREDITS); + /* Store ranges in le format.
*/ - range_cpu_to_le(&info->sti_space, &seq->lss_space); - - rc = dt_obj->do_body_ops->dbo_write(env, dt_obj, - seq_store_buf(info), - &pos, th, BYPASS_CAPA); - if (rc == sizeof(info->sti_space)) { - CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", - seq->lss_name, PRANGE(&seq->lss_space)); - rc = 0; - } else if (rc >= 0) { - rc = -EIO; - } - - dt_dev->dd_ops->dt_trans_stop(env, th); - } else { - rc = PTR_ERR(th); + rc = dt_obj->do_body_ops->dbo_write(env, dt_obj, + seq_store_buf(info), + &pos, th, BYPASS_CAPA, 1); + if (rc == sizeof(info->sti_space)) { + CDEBUG(D_INFO, "%s: Space - "DRANGE"\n", + seq->lss_name, PRANGE(&seq->lss_space)); + rc = 0; + } else if (rc >= 0) { + rc = -EIO; } - - RETURN(rc); + + + RETURN(rc); } /* @@ -167,7 +185,7 @@ int seq_store_init(struct lu_server_seq *seq, name = seq->lss_type == LUSTRE_SEQ_SERVER ? LUSTRE_SEQ_SRV_NAME : LUSTRE_SEQ_CTL_NAME; - dt_obj = dt_store_open(env, dt, name, &fid); + dt_obj = dt_store_open(env, dt, "", name, &fid); if (!IS_ERR(dt_obj)) { seq->lss_obj = dt_obj; rc = 0; diff --git a/lustre/fid/lproc_fid.c b/lustre/fid/lproc_fid.c index 63fbacd..e9976f9 100644 --- a/lustre/fid/lproc_fid.c +++ b/lustre/fid/lproc_fid.c @@ -69,16 +69,16 @@ static int seq_proc_write_common(struct file *file, const char *buffer, unsigned long count, void *data, - struct lu_range *range) + struct lu_seq_range *range) { - struct lu_range tmp; + struct lu_seq_range tmp; int rc; ENTRY; LASSERT(range != NULL); - rc = sscanf(buffer, "[%Lx - %Lx]\n",(long long unsigned *)&tmp.lr_start, - (long long unsigned *)&tmp.lr_end); + rc = sscanf(buffer, "[%Lx - %Lx]\n",(long long unsigned *)&tmp.lsr_start, + (long long unsigned *)&tmp.lsr_end); if (rc != 2 || !range_is_sane(&tmp) || range_is_zero(&tmp)) RETURN(-EINVAL); *range = tmp; @@ -88,13 +88,13 @@ seq_proc_write_common(struct file *file, const char *buffer, static int seq_proc_read_common(char *page, char **start, off_t off, int count, int *eof, void *data, - struct lu_range *range) + struct lu_seq_range *range) { int rc; ENTRY; *eof = 1; - rc = snprintf(page, count, "["LPX64" - "LPX64"]\n", + rc = snprintf(page, count, "["LPX64" - "LPX64"]:%x\n", PRANGE(range)); RETURN(rc); } diff --git a/lustre/fld/fld_cache.c b/lustre/fld/fld_cache.c index 9ec1f1a..695fc21 100644 --- a/lustre/fld/fld_cache.c +++ b/lustre/fld/fld_cache.c @@ -37,6 +37,7 @@ * * FLD (Fids Location Database) * + * Author: Pravin Shelar * Author: Yury Umanets */ @@ -67,74 +68,35 @@ #include #include "fld_internal.h" -#ifdef __KERNEL__ -static inline __u32 fld_cache_hash(seqno_t seq) -{ - return (__u32)seq; -} - -void fld_cache_flush(struct fld_cache *cache) -{ - struct fld_cache_entry *flde; - struct hlist_head *bucket; - struct hlist_node *scan; - struct hlist_node *next; - int i; - ENTRY; - - /* Free all cache entries. */ - spin_lock(&cache->fci_lock); - for (i = 0; i < cache->fci_hash_size; i++) { - bucket = cache->fci_hash_table + i; - hlist_for_each_entry_safe(flde, scan, next, bucket, fce_list) { - hlist_del_init(&flde->fce_list); - list_del_init(&flde->fce_lru); - cache->fci_cache_count--; - OBD_FREE_PTR(flde); - } - } - spin_unlock(&cache->fci_lock); - EXIT; -} - -struct fld_cache *fld_cache_init(const char *name, int hash_size, +/** + * create fld cache. 
+ */ +struct fld_cache *fld_cache_init(const char *name, int cache_size, int cache_threshold) { - struct fld_cache *cache; - int i; + struct fld_cache *cache; ENTRY; LASSERT(name != NULL); - LASSERT(IS_PO2(hash_size)); LASSERT(cache_threshold < cache_size); OBD_ALLOC_PTR(cache); if (cache == NULL) RETURN(ERR_PTR(-ENOMEM)); - INIT_LIST_HEAD(&cache->fci_lru); + CFS_INIT_LIST_HEAD(&cache->fci_entries_head); + CFS_INIT_LIST_HEAD(&cache->fci_lru); - cache->fci_cache_count = 0; + cache->fci_cache_count = 0; spin_lock_init(&cache->fci_lock); strncpy(cache->fci_name, name, sizeof(cache->fci_name)); - cache->fci_hash_size = hash_size; - cache->fci_cache_size = cache_size; + cache->fci_cache_size = cache_size; cache->fci_threshold = cache_threshold; /* Init fld cache info. */ - cache->fci_hash_mask = hash_size - 1; - OBD_ALLOC(cache->fci_hash_table, - hash_size * sizeof(*cache->fci_hash_table)); - if (cache->fci_hash_table == NULL) { - OBD_FREE_PTR(cache); - RETURN(ERR_PTR(-ENOMEM)); - } - - for (i = 0; i < hash_size; i++) - INIT_HLIST_HEAD(&cache->fci_hash_table[i]); memset(&cache->fci_stat, 0, sizeof(cache->fci_stat)); CDEBUG(D_INFO, "%s: FLD cache - Size: %d, Threshold: %d\n", @@ -142,8 +104,10 @@ struct fld_cache *fld_cache_init(const char *name, int hash_size, RETURN(cache); } -EXPORT_SYMBOL(fld_cache_init); +/** + * destroy fld cache. + */ void fld_cache_fini(struct fld_cache *cache) { __u64 pct; @@ -162,28 +126,109 @@ void fld_cache_fini(struct fld_cache *cache) CDEBUG(D_INFO, "FLD cache statistics (%s):\n", cache->fci_name); CDEBUG(D_INFO, " Total reqs: "LPU64"\n", cache->fci_stat.fst_count); CDEBUG(D_INFO, " Cache reqs: "LPU64"\n", cache->fci_stat.fst_cache); - CDEBUG(D_INFO, " Saved RPCs: "LPU64"\n", cache->fci_stat.fst_inflight); CDEBUG(D_INFO, " Cache hits: "LPU64"%%\n", pct); - OBD_FREE(cache->fci_hash_table, cache->fci_hash_size * - sizeof(*cache->fci_hash_table)); - OBD_FREE_PTR(cache); - + OBD_FREE_PTR(cache); + + EXIT; +} + +static inline void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node); + +/** + * fix list by checking new entry with NEXT entry in order. + */ +static void fld_fix_new_list(struct fld_cache *cache) +{ + struct fld_cache_entry *f_curr; + struct fld_cache_entry *f_next; + struct lu_seq_range *c_range; + struct lu_seq_range *n_range; + struct list_head *head = &cache->fci_entries_head; + ENTRY; + +restart_fixup: + + list_for_each_entry_safe(f_curr, f_next, head, fce_list) { + c_range = &f_curr->fce_range; + n_range = &f_next->fce_range; + + LASSERT(range_is_sane(c_range)); + if (&f_next->fce_list == head) + break; + + LASSERT(c_range->lsr_start <= n_range->lsr_start); + + /* check merge possibility with next range */ + if (c_range->lsr_end == n_range->lsr_start) { + if (c_range->lsr_mdt != n_range->lsr_mdt) + continue; + n_range->lsr_start = c_range->lsr_start; + fld_cache_entry_delete(cache, f_curr); + continue; + } + + /* check if current range overlaps with next range. */ + if (n_range->lsr_start < c_range->lsr_end) { + + if (c_range->lsr_mdt == n_range->lsr_mdt) { + n_range->lsr_start = c_range->lsr_start; + n_range->lsr_end = max(c_range->lsr_end, + n_range->lsr_end); + + fld_cache_entry_delete(cache, f_curr); + } else { + if (n_range->lsr_end <= c_range->lsr_end) { + *n_range = *c_range; + fld_cache_entry_delete(cache, f_curr); + } else + n_range->lsr_start = c_range->lsr_end; + } + + /* we could have overlap over next + * range too. better restart. 
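+                         * (Merging f_curr into f_next may create a fresh
+                         * overlap with the entry that follows f_next, and
+                         * the iteration cursor is unreliable once entries
+                         * have been deleted, so rescanning from the head
+                         * of the list is the simple and safe option.)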
*/ + goto restart_fixup; + } + + /* kill duplicates */ + if (c_range->lsr_start == n_range->lsr_start && + c_range->lsr_end == n_range->lsr_end) + fld_cache_entry_delete(cache, f_curr); + } + EXIT; } -EXPORT_SYMBOL(fld_cache_fini); -static inline struct hlist_head * -fld_cache_bucket(struct fld_cache *cache, seqno_t seq) +/** + * add node to fld cache + */ +static inline void fld_cache_entry_add(struct fld_cache *cache, + struct fld_cache_entry *f_new, + struct list_head *pos) { - return cache->fci_hash_table + (fld_cache_hash(seq) & - cache->fci_hash_mask); + list_add(&f_new->fce_list, pos); + list_add(&f_new->fce_lru, &cache->fci_lru); + + cache->fci_cache_count++; + fld_fix_new_list(cache); } -/* - * Check if cache needs to be shrinked. If so - do it. Tries to keep all - * collision lists well balanced. That is, check all of them and remove one - * entry in list and so on until cache is shrinked enough. +/** + * delete given node from list. + */ +static inline void fld_cache_entry_delete(struct fld_cache *cache, + struct fld_cache_entry *node) +{ + list_del(&node->fce_list); + list_del(&node->fce_lru); + cache->fci_cache_count--; + OBD_FREE_PTR(node); +} + +/** + * Check if cache needs to be shrunk. If so - do it. + * Remove one entry in list and so on until cache is shrunk enough. */ static int fld_cache_shrink(struct fld_cache *cache) { @@ -200,257 +245,234 @@ static int fld_cache_shrink(struct fld_cache *cache) curr = cache->fci_lru.prev; while (cache->fci_cache_count + cache->fci_threshold > - cache->fci_cache_size && curr != &cache->fci_lru) - { + cache->fci_cache_size && curr != &cache->fci_lru) { + flde = list_entry(curr, struct fld_cache_entry, fce_lru); curr = curr->prev; - - /* keep inflights */ - if (flde->fce_inflight) - continue; - - hlist_del_init(&flde->fce_list); - list_del_init(&flde->fce_lru); - cache->fci_cache_count--; - OBD_FREE_PTR(flde); + fld_cache_entry_delete(cache, flde); num++; } - CDEBUG(D_INFO, "%s: FLD cache - Shrinked by " + CDEBUG(D_INFO, "%s: FLD cache - Shrunk by " "%d entries\n", cache->fci_name, num); RETURN(0); } -int fld_cache_insert_inflight(struct fld_cache *cache, seqno_t seq) +/** + * kill all fld cache entries. + */ +void fld_cache_flush(struct fld_cache *cache) { - struct fld_cache_entry *flde, *fldt; - struct hlist_head *bucket; - struct hlist_node *scan; ENTRY; spin_lock(&cache->fci_lock); - - /* Check if cache already has the entry with such a seq. */ - bucket = fld_cache_bucket(cache, seq); - hlist_for_each_entry(fldt, scan, bucket, fce_list) { - if (fldt->fce_seq == seq) { - spin_unlock(&cache->fci_lock); - RETURN(-EEXIST); - } - } + cache->fci_cache_size = 0; + fld_cache_shrink(cache); spin_unlock(&cache->fci_lock); - /* Allocate new entry. */ - OBD_ALLOC_PTR(flde); - if (!flde) - RETURN(-ENOMEM); + EXIT; +} - /* - * Check if cache has the entry with such a seq again. It could be added - * while we were allocating new entry. - */ - spin_lock(&cache->fci_lock); - hlist_for_each_entry(fldt, scan, bucket, fce_list) { - if (fldt->fce_seq == seq) { - spin_unlock(&cache->fci_lock); - OBD_FREE_PTR(flde); - RETURN(0); - } +/** + * punch hole in existing range. divide this range and add new + * entry accordingly. 
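+ *
+ * For example (hypothetical values), punching [0x200, 0x300):mdt3 into an
+ * existing entry [0x100, 0x500):mdt0 leaves three entries behind:
+ *
+ * \code
+ * [0x100, 0x200):mdt0    f_curr, trimmed on the right
+ * [0x200, 0x300):mdt3    f_new, inserted unchanged
+ * [0x300, 0x500):mdt0    fldt, the newly allocated tail
+ * \endcode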
+ */
+
+void fld_cache_punch_hole(struct fld_cache *cache,
+                          struct fld_cache_entry *f_curr,
+                          struct fld_cache_entry *f_new)
+{
+        const struct lu_seq_range *range = &f_new->fce_range;
+        const seqno_t new_start = range->lsr_start;
+        const seqno_t new_end = range->lsr_end;
+        struct fld_cache_entry *fldt;
+
+        ENTRY;
+        OBD_ALLOC_GFP(fldt, sizeof *fldt, CFS_ALLOC_ATOMIC);
+        if (!fldt) {
+                OBD_FREE_PTR(f_new);
+                EXIT;
+                /* overlap is not allowed, so don't mess up the list. */
+                return;
+        }
+        /* break the f_curr range up into three ranges:
+         * f_curr, f_new, fldt
+         */
-        /* Add new entry to cache and lru list. */
-        INIT_HLIST_NODE(&flde->fce_list);
-        flde->fce_inflight = 1;
-        flde->fce_invalid = 1;
-        cfs_waitq_init(&flde->fce_waitq);
-        flde->fce_seq = seq;
-
-        hlist_add_head(&flde->fce_list, bucket);
-        list_add(&flde->fce_lru, &cache->fci_lru);
-        cache->fci_cache_count++;
+        /* f_new = *range */

-        spin_unlock(&cache->fci_lock);
+        /* fldt */
+        fldt->fce_range.lsr_start = new_end;
+        fldt->fce_range.lsr_end = f_curr->fce_range.lsr_end;
+        fldt->fce_range.lsr_mdt = f_curr->fce_range.lsr_mdt;

-        RETURN(0);
+        /* f_curr */
+        f_curr->fce_range.lsr_end = new_start;
+
+        /* add these two entries to the list */
+        fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+        fld_cache_entry_add(cache, fldt, &f_new->fce_list);
+
+        /* no need to fixup */
+        EXIT;
 }
-EXPORT_SYMBOL(fld_cache_insert_inflight);

-int fld_cache_insert(struct fld_cache *cache,
-                     seqno_t seq, mdsno_t mds)
+/**
+ * handle range overlap in the fld cache.
+ */
+void fld_cache_overlap_handle(struct fld_cache *cache,
+                              struct fld_cache_entry *f_curr,
+                              struct fld_cache_entry *f_new)
 {
-        struct fld_cache_entry *flde, *fldt;
-        struct hlist_head *bucket;
-        struct hlist_node *scan;
-        int rc;
-        ENTRY;
+        const struct lu_seq_range *range = &f_new->fce_range;
+        const seqno_t new_start = range->lsr_start;
+        const seqno_t new_end = range->lsr_end;
+        const mdsno_t mdt = range->lsr_mdt;

-        spin_lock(&cache->fci_lock);
+        /* This is the overlap case; only overlap with the previous range
+         * is checked here. The fixup pass handles overlap with the next
+         * range. */

-        /* Check if need to shrink cache. */
-        rc = fld_cache_shrink(cache);
-        if (rc) {
-                spin_unlock(&cache->fci_lock);
-                RETURN(rc);
-        }
+        if (f_curr->fce_range.lsr_mdt == mdt) {
+                f_curr->fce_range.lsr_start = min(f_curr->fce_range.lsr_start,
+                                                  new_start);

-        /* Check if cache already has the entry with such a seq. */
-        bucket = fld_cache_bucket(cache, seq);
-        hlist_for_each_entry(fldt, scan, bucket, fce_list) {
-                if (fldt->fce_seq == seq) {
-                        if (fldt->fce_inflight) {
-                                /* set mds for inflight entry */
-                                fldt->fce_mds = mds;
-                                fldt->fce_inflight = 0;
-                                fldt->fce_invalid = 0;
-                                cfs_waitq_signal(&fldt->fce_waitq);
-                                rc = 0;
-                        } else
-                                rc = -EEXIST;
-                        spin_unlock(&cache->fci_lock);
-                        RETURN(rc);
-                }
-        }
-        spin_unlock(&cache->fci_lock);
+                f_curr->fce_range.lsr_end = max(f_curr->fce_range.lsr_end,
+                                                new_end);

-        /* Allocate new entry. */
-        OBD_ALLOC_PTR(flde);
-        if (!flde)
-                RETURN(-ENOMEM);
+                OBD_FREE_PTR(f_new);
+                fld_fix_new_list(cache);

-        /*
-         * Check if cache has the entry with such a seq again. It could be added
-         * while we were allocating new entry.
-         */
-        spin_lock(&cache->fci_lock);
-        hlist_for_each_entry(fldt, scan, bucket, fce_list) {
-                if (fldt->fce_seq == seq) {
-                        spin_unlock(&cache->fci_lock);
-                        OBD_FREE_PTR(flde);
-                        RETURN(0);
-                }
-        }
+        } else if (new_start <= f_curr->fce_range.lsr_start &&
+                   f_curr->fce_range.lsr_end <= new_end) {
+                /* case 1: the new range completely overshadows the
+                 * existing range, e.g. the whole range was migrated
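+                 * (a hypothetical instance: an existing entry
+                 * [0x200, 0x400):mdt0 replaced by the new range
+                 * [0x100, 0x500):mdt2);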
+                 * update the cached entry in place. */

-        /* Add new entry to cache and lru list. */
-        INIT_HLIST_NODE(&flde->fce_list);
-        flde->fce_mds = mds;
-        flde->fce_seq = seq;
-        flde->fce_inflight = 0;
-        flde->fce_invalid = 0;
-
-        hlist_add_head(&flde->fce_list, bucket);
-        list_add(&flde->fce_lru, &cache->fci_lru);
-        cache->fci_cache_count++;
+                f_curr->fce_range = *range;
+                OBD_FREE_PTR(f_new);
+                fld_fix_new_list(cache);

-        spin_unlock(&cache->fci_lock);
+        } else if (f_curr->fce_range.lsr_start < new_start &&
+                   new_end < f_curr->fce_range.lsr_end) {
+                /* case 2: the new range fits within the existing range. */
-        RETURN(0);
+                fld_cache_punch_hole(cache, f_curr, f_new);
+
+        } else if (new_end <= f_curr->fce_range.lsr_end) {
+                /* case 3: overlap:
+                 *         [new_start [c_start  new_end)  c_end)
+                 */
+
+                LASSERT(new_start <= f_curr->fce_range.lsr_start);
+
+                f_curr->fce_range.lsr_start = new_end;
+                fld_cache_entry_add(cache, f_new, f_curr->fce_list.prev);
+
+        } else if (f_curr->fce_range.lsr_start <= new_start) {
+                /* case 4: overlap:
+                 *         [c_start [new_start  c_end)  new_end)
+                 */
+
+                LASSERT(f_curr->fce_range.lsr_end <= new_end);
+
+                f_curr->fce_range.lsr_end = new_start;
+                fld_cache_entry_add(cache, f_new, &f_curr->fce_list);
+        } else
+                CERROR("new range = "DRANGE", curr = "DRANGE"\n",
+                       PRANGE(range), PRANGE(&f_curr->fce_range));
 }
-EXPORT_SYMBOL(fld_cache_insert);

-void fld_cache_delete(struct fld_cache *cache, seqno_t seq)
+/**
+ * Insert an FLD entry into the FLD cache.
+ *
+ * This function handles all cases of merging and breaking up of
+ * ranges.
+ */
+void fld_cache_insert(struct fld_cache *cache,
+                      const struct lu_seq_range *range)
 {
-        struct fld_cache_entry *flde;
-        struct hlist_node *scan, *n;
-        struct hlist_head *bucket;
+        struct fld_cache_entry *f_new;
+        struct fld_cache_entry *f_curr;
+        struct fld_cache_entry *n;
+        struct list_head *head;
+        struct list_head *prev = NULL;
+        const seqno_t new_start = range->lsr_start;
+        const seqno_t new_end = range->lsr_end;
         ENTRY;

-        bucket = fld_cache_bucket(cache, seq);
-
+        LASSERT(range_is_sane(range));
+
+        /* Allocate new entry. */
+        OBD_ALLOC_PTR(f_new);
+        if (!f_new) {
+                EXIT;
+                return;
+        }
+
+        f_new->fce_range = *range;
+
+        /*
+         * Duplicate entries are eliminated in the insert op, so there is
+         * no need to search for an existing entry before the insertion
+         * loop.
+         */
+
         spin_lock(&cache->fci_lock);
-        hlist_for_each_entry_safe(flde, scan, n, bucket, fce_list) {
-                if (flde->fce_seq == seq) {
-                        hlist_del_init(&flde->fce_list);
-                        list_del_init(&flde->fce_lru);
-                        if (flde->fce_inflight) {
-                                flde->fce_inflight = 0;
-                                flde->fce_invalid = 1;
-                                cfs_waitq_signal(&flde->fce_waitq);
-                        }
-                        cache->fci_cache_count--;
-                        OBD_FREE_PTR(flde);
-                        GOTO(out_unlock, 0);
+        fld_cache_shrink(cache);
+
+        head = &cache->fci_entries_head;
+
+        list_for_each_entry_safe(f_curr, n, head, fce_list) {
+                /* the new range ends before this entry begins:
+                 * insert the new entry in front of it */
+                if (new_end < f_curr->fce_range.lsr_start)
+                        break;
+
+                prev = &f_curr->fce_list;
+                /* the new range overlaps this entry */
+                if (new_start < f_curr->fce_range.lsr_end) {
+                        fld_cache_overlap_handle(cache, f_curr, f_new);
+                        goto out;
                 }
         }
-        EXIT;
-out_unlock:
-        spin_unlock(&cache->fci_lock);
-}
-EXPORT_SYMBOL(fld_cache_delete);

+        if (prev == NULL)
+                prev = head;

-static int fld_check_inflight(struct fld_cache_entry *flde)
-{
-        return (flde->fce_inflight);
+        /* Add the new entry to the cache and lru list. */
+        fld_cache_entry_add(cache, f_new, prev);
+out:
+        spin_unlock(&cache->fci_lock);
+        EXIT;
 }

+/**
+ * Look up the range containing sequence \a seq in the fld cache.
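+ *
+ * On success the returned range identifies the owning server; a hedged
+ * caller sketch (mdt_index is a hypothetical local variable):
+ *
+ * \code
+ * struct lu_seq_range range;
+ *
+ * if (fld_cache_lookup(cache, fid_seq(fid), &range) == 0)
+ *         mdt_index = range.lsr_mdt;
+ * \endcode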
+ */ int fld_cache_lookup(struct fld_cache *cache, - seqno_t seq, mdsno_t *mds) + const seqno_t seq, struct lu_seq_range *range) { struct fld_cache_entry *flde; - struct hlist_node *scan, *n; - struct hlist_head *bucket; + struct list_head *head; ENTRY; - bucket = fld_cache_bucket(cache, seq); spin_lock(&cache->fci_lock); + head = &cache->fci_entries_head; + cache->fci_stat.fst_count++; - hlist_for_each_entry_safe(flde, scan, n, bucket, fce_list) { - if (flde->fce_seq == seq) { - if (flde->fce_inflight) { - /* lookup RPC is inflight need to wait */ - struct l_wait_info lwi; - spin_unlock(&cache->fci_lock); - lwi = LWI_TIMEOUT(0, NULL, NULL); - l_wait_event(flde->fce_waitq, - !fld_check_inflight(flde), &lwi); - LASSERT(!flde->fce_inflight); - if (flde->fce_invalid) - RETURN(-ENOENT); - - *mds = flde->fce_mds; - cache->fci_stat.fst_inflight++; - } else { - LASSERT(!flde->fce_invalid); - *mds = flde->fce_mds; - list_del(&flde->fce_lru); - list_add(&flde->fce_lru, &cache->fci_lru); - cache->fci_stat.fst_cache++; - spin_unlock(&cache->fci_lock); - } + list_for_each_entry(flde, head, fce_list) { + if (flde->fce_range.lsr_start > seq) + break; + + if (range_within(&flde->fce_range, seq)) { + *range = flde->fce_range; + + /* update position of this entry in lru list. */ + list_move(&flde->fce_lru, &cache->fci_lru); + cache->fci_stat.fst_cache++; + spin_unlock(&cache->fci_lock); RETURN(0); } } spin_unlock(&cache->fci_lock); RETURN(-ENOENT); } -EXPORT_SYMBOL(fld_cache_lookup); -#else -int fld_cache_insert_inflight(struct fld_cache *cache, seqno_t seq) -{ - return -ENOTSUPP; -} -EXPORT_SYMBOL(fld_cache_insert_inflight); - -int fld_cache_insert(struct fld_cache *cache, - seqno_t seq, mdsno_t mds) -{ - return -ENOTSUPP; -} -EXPORT_SYMBOL(fld_cache_insert); - -void fld_cache_delete(struct fld_cache *cache, - seqno_t seq) -{ - return; -} -EXPORT_SYMBOL(fld_cache_delete); - -int fld_cache_lookup(struct fld_cache *cache, - seqno_t seq, mdsno_t *mds) -{ - return -ENOTSUPP; -} -EXPORT_SYMBOL(fld_cache_lookup); -#endif diff --git a/lustre/fld/fld_handler.c b/lustre/fld/fld_handler.c index 3138a54..2b6ab12 100644 --- a/lustre/fld/fld_handler.c +++ b/lustre/fld/fld_handler.c @@ -39,6 +39,7 @@ * * Author: Yury Umanets * Author: WangDi + * Author: Pravin Shelar */ #ifndef EXPORT_SYMTAB @@ -63,6 +64,7 @@ #include #include +#include #include #include "fld_internal.h" @@ -76,6 +78,13 @@ LU_CONTEXT_KEY_DEFINE(fld, LCT_MD_THREAD|LCT_DT_THREAD); cfs_proc_dir_entry_t *fld_type_proc_dir = NULL; +static struct lu_local_obj_desc llod_fld_index = { + .llod_name = fld_index_name, + .llod_oid = FLD_INDEX_OID, + .llod_is_index = 1, + .llod_feat = &fld_index_features, +}; + static int __init fld_mod_init(void) { fld_type_proc_dir = lprocfs_register(LUSTRE_FLD_NAME, @@ -84,6 +93,8 @@ static int __init fld_mod_init(void) if (IS_ERR(fld_type_proc_dir)) return PTR_ERR(fld_type_proc_dir); + llo_local_obj_register(&llod_fld_index); + LU_CONTEXT_KEY_INIT(&fld_thread_key); lu_context_key_register(&fld_thread_key); return 0; @@ -91,6 +102,7 @@ static int __init fld_mod_init(void) static void __exit fld_mod_exit(void) { + llo_local_obj_unregister(&llod_fld_index); lu_context_key_degister(&fld_thread_key); if (fld_type_proc_dir != NULL && !IS_ERR(fld_type_proc_dir)) { lprocfs_remove(&fld_type_proc_dir); @@ -98,106 +110,200 @@ static void __exit fld_mod_exit(void) } } -/* Insert index entry and update cache. */ +/** + * Insert FLD index entry and update FLD cache. 
+ *
+ * First it tries to merge the given range with an existing range, then
+ * updates the FLD index and FLD cache accordingly; FLD index consistency
+ * is maintained by this function.
+ * It is called from the sequence allocator when a super-sequence is
+ * granted to a server.
+ */
+
 int fld_server_create(struct lu_server_fld *fld,
                       const struct lu_env *env,
-                      seqno_t seq, mdsno_t mds)
+                      struct lu_seq_range *add_range,
+                      struct thandle *th)
 {
-        int rc;
+        struct lu_seq_range *erange;
+        struct lu_seq_range *new;
+        struct fld_thread_info *info;
+        int rc = 0;
+        int do_merge = 0;
+
         ENTRY;
-
-        rc = fld_index_create(fld, env, seq, mds);
-
-        if (rc == 0) {
-                /*
-                 * Do not return result of calling fld_cache_insert()
-                 * here. First of all because it may return -EEXISTS. Another
-                 * reason is that, we do not want to stop proceeding even after
-                 * cache errors.
+
+        info = lu_context_key_get(&env->le_ctx, &fld_thread_key);
+        mutex_lock(&fld->lsf_lock);
+
+        erange = &info->fti_lrange;
+        new = &info->fti_irange;
+        *new = *add_range;
+
+        /* STEP 1: try to merge with the previous range */
+        rc = fld_index_lookup(fld, env, new->lsr_start, erange);
+        if (!rc) {
+                /* in case of range overlap, the mdt ID must be the same
+                 * for both ranges */
+                if (new->lsr_mdt != erange->lsr_mdt) {
+                        CERROR("mdt[%x] for given range is different from "
+                               "existing overlapping range mdt[%x]\n",
+                               new->lsr_mdt, erange->lsr_mdt);
+                        rc = -EIO;
+                        GOTO(out, rc);
+                }
+
+                if (new->lsr_end < erange->lsr_end)
+                        GOTO(out, rc);
+                do_merge = 1;
+
+        } else if (rc == -ENOENT) {
+                /* Check for the merge case: this optimizes for single-MDS
+                 * Lustre. Since the entry does not exist, the returned
+                 * entry must be the left-side neighbour of the new range's
+                 * start (see dio_lookup()), so try to merge from the left.
                  */
-                fld_cache_insert(fld->lsf_cache, seq, mds);
+                if (new->lsr_start == erange->lsr_end &&
+                    new->lsr_mdt == erange->lsr_mdt)
+                        do_merge = 1;
+        } else {
+                /* no overlap is allowed in the fld, so any other lookup
+                 * failure is an error */
+                GOTO(out, rc);
         }
-        RETURN(rc);
-}
-EXPORT_SYMBOL(fld_server_create);

+        if (do_merge) {
+                /* the new range can be combined with the existing one,
+                 * so delete the existing range.
+                 */
-/* Delete index entry. */
-int fld_server_delete(struct lu_server_fld *fld,
-                      const struct lu_env *env,
-                      seqno_t seq)
-{
-        int rc;
-        ENTRY;
+                rc = fld_index_delete(fld, env, erange, th);
+                if (rc == 0) {
+                        new->lsr_start = min(erange->lsr_start,
+                                             new->lsr_start);
+                        new->lsr_end = max(erange->lsr_end, new->lsr_end);
+                } else
+                        GOTO(out, rc);

-        fld_cache_delete(fld->lsf_cache, seq);
-        rc = fld_index_delete(fld, env, seq);
+                do_merge = 0;
+        }
+
+        /* STEP 2: try to merge with the next range */
+        rc = fld_index_lookup(fld, env, new->lsr_end, erange);
+        if (!rc) {
+                /* The found range overlaps the new range on its right side
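+                 * (hypothetical example: new = [0x100, 0x300):mdt0 while
+                 * the index already holds erange = [0x300, 0x500):mdt0;
+                 * deleting erange and widening new yields
+                 * [0x100, 0x500):mdt0);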
+                 * merge the two when the mdt matches. */
+                if (new->lsr_mdt == erange->lsr_mdt)
+                        do_merge = 1;
+        } else if (rc == -ENOENT) {
+                /* this range is to the left of the new range's end point */
+                LASSERT(erange->lsr_end <= new->lsr_end);
+
+                if (new->lsr_end == erange->lsr_end)
+                        do_merge = 1;
+                if (new->lsr_start <= erange->lsr_start)
+                        do_merge = 1;
+        } else
+                GOTO(out, rc);
+
+        if (do_merge) {
+                if (new->lsr_mdt != erange->lsr_mdt) {
+                        CERROR("mdt[%x] for given range is different from "
+                               "existing overlapping range mdt[%x]\n",
+                               new->lsr_mdt, erange->lsr_mdt);
+                        rc = -EIO;
+                        GOTO(out, rc);
+                }
+                /* merge with the next range */
+                rc = fld_index_delete(fld, env, erange, th);
+                if (rc == 0) {
+                        new->lsr_start = min(erange->lsr_start,
+                                             new->lsr_start);
+                        new->lsr_end = max(erange->lsr_end, new->lsr_end);
+                } else
+                        GOTO(out, rc);
+        }
+
+        /* now update the fld entry. */
+        rc = fld_index_create(fld, env, new, th);
+
+        LASSERT(rc != -EEXIST);
+out:
+        if (rc == 0)
+                fld_cache_insert(fld->lsf_cache, new);
+
+        mutex_unlock(&fld->lsf_lock);
+
+        CDEBUG((rc != 0 ? D_ERROR : D_INFO),
+               "%s: FLD create: given range: "DRANGE", "
+               "after merge: "DRANGE", rc = %d\n", fld->lsf_name,
+               PRANGE(add_range), PRANGE(new), rc);
+
+        RETURN(rc);
 }
-EXPORT_SYMBOL(fld_server_delete);

-/* Lookup mds by seq. */
+EXPORT_SYMBOL(fld_server_create);
+
+/**
+ * Look up the MDT by \a seq; returns the range containing \a seq.
+ *
+ * If the entry is not cached in the fld cache, a request is sent to the
+ * super sequence controller node (MDT0). All other MDT[1...N] and clients
+ * cache fld entries, but this cache is not persistent.
+ */
+
 int fld_server_lookup(struct lu_server_fld *fld,
                       const struct lu_env *env,
-                      seqno_t seq, mdsno_t *mds)
+                      seqno_t seq, struct lu_seq_range *range)
 {
         int rc;
         ENTRY;
-
+
         /* Lookup it in the cache. */
-        rc = fld_cache_lookup(fld->lsf_cache, seq, mds);
+        rc = fld_cache_lookup(fld->lsf_cache, seq, range);
         if (rc == 0)
                 RETURN(0);

-        rc = fld_index_lookup(fld, env, seq, mds);
-        if (rc == 0) {
-                /*
-                 * Do not return error here as well. See previous comment in
-                 * same situation in function fld_server_create().
+        if (fld->lsf_obj)
+                rc = fld_index_lookup(fld, env, seq, range);
+        else {
+                LASSERT(fld->lsf_control_exp);
+                /* Send the request to MDT0, i.e. the super sequence
+                 * controller. This is a temporary solution; the long term
+                 * solution is fld replication on all MDT servers.
                  */
-                fld_cache_insert(fld->lsf_cache, seq, *mds);
+                rc = fld_client_rpc(fld->lsf_control_exp,
+                                    range, FLD_LOOKUP);
         }
+
+        if (rc == 0)
+                fld_cache_insert(fld->lsf_cache, range);
+
         RETURN(rc);
 }
 EXPORT_SYMBOL(fld_server_lookup);

+/**
+ * All MDT servers handle the fld lookup operation, but only MDT0 has the
+ * fld index;
+ * if entry is not found in cache we need to forward lookup request to MDT0 + */ + static int fld_server_handle(struct lu_server_fld *fld, const struct lu_env *env, - __u32 opc, struct md_fld *mf, + __u32 opc, struct lu_seq_range *range, struct fld_thread_info *info) { int rc; ENTRY; switch (opc) { - case FLD_CREATE: - rc = fld_server_create(fld, env, - mf->mf_seq, mf->mf_mds); - - /* Do not return -EEXIST error for resent case */ - if ((info->fti_flags & MSG_RESENT) && rc == -EEXIST) - rc = 0; - break; - case FLD_DELETE: - rc = fld_server_delete(fld, env, mf->mf_seq); - - /* Do not return -ENOENT error for resent case */ - if ((info->fti_flags & MSG_RESENT) && rc == -ENOENT) - rc = 0; - break; case FLD_LOOKUP: rc = fld_server_lookup(fld, env, - mf->mf_seq, &mf->mf_mds); + range->lsr_start, range); break; default: rc = -EINVAL; break; } - CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %d, seq: " - LPX64", mds: "LPU64")\n", fld->lsf_name, rc, opc, - mf->mf_seq, mf->mf_mds); + CDEBUG(D_INFO, "%s: FLD req handle: error %d (opc: %d, range: " + DRANGE"\n", fld->lsf_name, rc, opc, PRANGE(range)); RETURN(rc); @@ -207,8 +313,8 @@ static int fld_req_handle(struct ptlrpc_request *req, struct fld_thread_info *info) { struct lu_site *site; - struct md_fld *in; - struct md_fld *out; + struct lu_seq_range *in; + struct lu_seq_range *out; int rc; __u32 *opc; ENTRY; @@ -241,8 +347,6 @@ static int fld_req_handle(struct ptlrpc_request *req, static void fld_thread_info_init(struct ptlrpc_request *req, struct fld_thread_info *info) { - info->fti_flags = lustre_msg_get_flags(req->rq_reqmsg); - info->fti_pill = &req->rq_pill; /* Init request capsule. */ req_capsule_init(info->fti_pill, req, RCL_SERVER); @@ -290,21 +394,27 @@ EXPORT_SYMBOL(fld_query); * * fid_is_local() is supposed to be used in assertion checks only. 
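 *
 * A typical (hypothetical) use is guarding server-only code paths with the
 * new signature introduced below:
 *
 * \code
 * LASSERT(fid_is_local(env, site, fid));
 * \endcode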
*/ -int fid_is_local(struct lu_site *site, const struct lu_fid *fid) +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid) { int result; struct md_site *msite; + struct lu_seq_range *range; + struct fld_thread_info *info; + ENTRY; + + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + range = &info->fti_lrange; result = 1; /* conservatively assume fid is local */ msite = lu_site2md(site); if (msite->ms_client_fld != NULL) { - mdsno_t mds; int rc; rc = fld_cache_lookup(msite->ms_client_fld->lcf_cache, - fid_seq(fid), &mds); + fid_seq(fid), range); if (rc == 0) - result = (mds == msite->ms_node_id); + result = (range->lsr_mdt == msite->ms_node_id); } return result; } @@ -352,7 +462,8 @@ static void fld_server_proc_fini(struct lu_server_fld *fld) #endif int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt, - const char *prefix, const struct lu_env *env) + const char *prefix, const struct lu_env *env, + int mds_node_id) { int cache_size, cache_threshold; int rc; @@ -367,8 +478,8 @@ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt, cache_threshold = cache_size * FLD_SERVER_CACHE_THRESHOLD / 100; + mutex_init(&fld->lsf_lock); fld->lsf_cache = fld_cache_init(fld->lsf_name, - FLD_SERVER_HTABLE_SIZE, cache_size, cache_threshold); if (IS_ERR(fld->lsf_cache)) { rc = PTR_ERR(fld->lsf_cache); @@ -376,14 +487,18 @@ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt, GOTO(out, rc); } - rc = fld_index_init(fld, env, dt); - if (rc) - GOTO(out, rc); + if (!mds_node_id) { + rc = fld_index_init(fld, env, dt); + if (rc) + GOTO(out, rc); + } else + fld->lsf_obj = NULL; rc = fld_server_proc_init(fld); if (rc) GOTO(out, rc); + fld->lsf_control_exp = NULL; EXIT; out: if (rc) diff --git a/lustre/fld/fld_index.c b/lustre/fld/fld_index.c index a1e88d4..03da47e 100644 --- a/lustre/fld/fld_index.c +++ b/lustre/fld/fld_index.c @@ -60,26 +60,25 @@ #include #include #include +#include #include #include "fld_internal.h" const char fld_index_name[] = "fld"; -static const struct dt_index_features fld_index_features = { +static const struct lu_seq_range IGIF_FLD_RANGE = { + .lsr_start = 1, + .lsr_end = IDIF_SEQ_START, + .lsr_mdt = 0 +}; + +const struct dt_index_features fld_index_features = { .dif_flags = DT_IND_UPDATE, .dif_keysize_min = sizeof(seqno_t), .dif_keysize_max = sizeof(seqno_t), - .dif_recsize_min = sizeof(mdsno_t), - .dif_recsize_max = sizeof(mdsno_t) -}; - -/* - * number of blocks to reserve for particular operations. Should be function of - * ... something. Stub for now. 
- */ -enum { - FLD_TXN_INDEX_INSERT_CREDITS = 20, - FLD_TXN_INDEX_DELETE_CREDITS = 20, + .dif_recsize_min = sizeof(struct lu_seq_range), + .dif_recsize_max = sizeof(struct lu_seq_range), + .dif_ptrsize = 4 }; extern struct lu_context_key fld_thread_key; @@ -98,83 +97,174 @@ static struct dt_key *fld_key(const struct lu_env *env, } static struct dt_rec *fld_rec(const struct lu_env *env, - const mdsno_t mds) + const struct lu_seq_range *range) { struct fld_thread_info *info; + struct lu_seq_range *rec; ENTRY; info = lu_context_key_get(&env->le_ctx, &fld_thread_key); LASSERT(info != NULL); + rec = &info->fti_rec; + + range_cpu_to_be(rec, range); + RETURN((void *)rec); +} - info->fti_rec = cpu_to_be64(mds); - RETURN((void *)&info->fti_rec); +struct thandle* fld_trans_start(struct lu_server_fld *fld, + const struct lu_env *env, int credit) +{ + struct fld_thread_info *info; + struct dt_device *dt_dev; + struct txn_param *p; + + dt_dev = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + p = &info->fti_txn_param; + txn_param_init(p, credit); + + return dt_dev->dd_ops->dt_trans_start(env, dt_dev, p); } +void fld_trans_stop(struct lu_server_fld *fld, + const struct lu_env *env, struct thandle* th) +{ + struct dt_device *dt_dev; + + dt_dev = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); + dt_dev->dd_ops->dt_trans_stop(env, th); +} + +/** + * insert range in fld store. + * + * \param range range to be inserted + * \param th transaction for this operation as it could compound + * transaction. + * + * \retval 0 success + * \retval -ve error + */ + int fld_index_create(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq, mdsno_t mds) + const struct lu_seq_range *range, + struct thandle *th) { struct dt_object *dt_obj = fld->lsf_obj; struct dt_device *dt_dev; - struct txn_param txn; - struct thandle *th; + seqno_t start; int rc; + ENTRY; + start = range->lsr_start; + LASSERT(range_is_sane(range)); dt_dev = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); - /* stub here, will fix it later */ - txn_param_init(&txn, FLD_TXN_INDEX_INSERT_CREDITS); - - th = dt_dev->dd_ops->dt_trans_start(env, dt_dev, &txn); - if (!IS_ERR(th)) { - rc = dt_obj->do_index_ops->dio_insert(env, dt_obj, - fld_rec(env, mds), - fld_key(env, seq), - th, BYPASS_CAPA); - dt_dev->dd_ops->dt_trans_stop(env, th); - } else - rc = PTR_ERR(th); + rc = dt_obj->do_index_ops->dio_insert(env, dt_obj, + fld_rec(env, range), + fld_key(env, start), + th, BYPASS_CAPA, 1); + + CDEBUG(D_INFO, "%s: insert given range : "DRANGE" rc = %d\n", + fld->lsf_name, PRANGE(range), rc); RETURN(rc); } +/** + * delete range in fld store. 
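+ *
+ * A hedged sketch of the expected calling pattern, mirroring
+ * fld_insert_igif_fld() below (the credits value is hypothetical here):
+ *
+ * \code
+ * struct thandle *th;
+ *
+ * th = fld_trans_start(fld, env, credits);
+ * if (IS_ERR(th))
+ *         RETURN(PTR_ERR(th));
+ * rc = fld_index_delete(fld, env, range, th);
+ * fld_trans_stop(fld, env, th);
+ * \endcode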
+ * + * \param range range to be deleted + * \param th transaction + * + * \retval 0 success + * \retval -ve error + */ + int fld_index_delete(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq) + struct lu_seq_range *range, + struct thandle *th) { struct dt_object *dt_obj = fld->lsf_obj; struct dt_device *dt_dev; - struct txn_param txn; - struct thandle *th; + seqno_t seq = range->lsr_start; int rc; + ENTRY; dt_dev = lu2dt_dev(fld->lsf_obj->do_lu.lo_dev); - txn_param_init(&txn, FLD_TXN_INDEX_DELETE_CREDITS); - th = dt_dev->dd_ops->dt_trans_start(env, dt_dev, &txn); - if (!IS_ERR(th)) { - rc = dt_obj->do_index_ops->dio_delete(env, dt_obj, - fld_key(env, seq), th, - BYPASS_CAPA); - dt_dev->dd_ops->dt_trans_stop(env, th); - } else - rc = PTR_ERR(th); + rc = dt_obj->do_index_ops->dio_delete(env, dt_obj, + fld_key(env, seq), th, + BYPASS_CAPA); + + CDEBUG(D_INFO, "%s: delete given range : "DRANGE" rc = %d\n", + fld->lsf_name, PRANGE(range), rc); + RETURN(rc); } +/** + * lookup range for a seq passed + * + * \param seq seq for lookup. + * \param range result of lookup. + * + * \retval 0 success + * \retval -ve error + */ + int fld_index_lookup(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq, mdsno_t *mds) + seqno_t seq, + struct lu_seq_range *range) { - struct dt_object *dt_obj = fld->lsf_obj; - struct dt_rec *rec = fld_rec(env, 0); + struct dt_object *dt_obj = fld->lsf_obj; + struct lu_seq_range *fld_rec; + struct dt_key *key = fld_key(env, seq); + struct fld_thread_info *info; int rc; + ENTRY; - rc = dt_obj->do_index_ops->dio_lookup(env, dt_obj, rec, - fld_key(env, seq), BYPASS_CAPA); - if (rc == 0) - *mds = be64_to_cpu(*(__u64 *)rec); + info = lu_context_key_get(&env->le_ctx, &fld_thread_key); + fld_rec = &info->fti_rec; + + rc = dt_obj->do_index_ops->dio_lookup(env, dt_obj, + (struct dt_rec*) fld_rec, + key, BYPASS_CAPA); + + if (rc >= 0) { + range_be_to_cpu(fld_rec, fld_rec); + *range = *fld_rec; + if (range_within(range, seq)) + rc = 0; + else + rc = -ENOENT; + } + + CDEBUG(D_INFO, "%s: lookup seq = %llx range : "DRANGE" rc = %d\n", + fld->lsf_name, seq, PRANGE(range), rc); + + RETURN(rc); +} + +static int fld_insert_igif_fld(struct lu_server_fld *fld, + const struct lu_env *env) +{ + struct thandle *th; + int rc; + + ENTRY; + th = fld_trans_start(fld, env, FLD_TXN_INDEX_INSERT_CREDITS); + if (IS_ERR(th)) + RETURN(PTR_ERR(th)); + + rc = fld_index_create(fld, env, &IGIF_FLD_RANGE, th); + fld_trans_stop(fld, env, th); + if (rc == -EEXIST) + rc = 0; RETURN(rc); } @@ -187,16 +277,25 @@ int fld_index_init(struct lu_server_fld *fld, int rc; ENTRY; - dt_obj = dt_store_open(env, dt, fld_index_name, &fid); + dt_obj = dt_store_open(env, dt, "", fld_index_name, &fid); if (!IS_ERR(dt_obj)) { fld->lsf_obj = dt_obj; rc = dt_obj->do_ops->do_index_try(env, dt_obj, &fld_index_features); - if (rc == 0) + if (rc == 0) { LASSERT(dt_obj->do_index_ops != NULL); - else + rc = fld_insert_igif_fld(fld, env); + + if (rc != 0) { + CERROR("insert igif in fld! 
= %d\n", rc); + lu_object_put(env, &dt_obj->do_lu); + fld->lsf_obj = NULL; + } + } else CERROR("%s: File \"%s\" is not an index!\n", fld->lsf_name, fld_index_name); + + } else { CERROR("%s: Can't find \"%s\" obj %d\n", fld->lsf_name, fld_index_name, (int)PTR_ERR(dt_obj)); diff --git a/lustre/fld/fld_internal.h b/lustre/fld/fld_internal.h index 7a86e2e..6b50b16 100644 --- a/lustre/fld/fld_internal.h +++ b/lustre/fld/fld_internal.h @@ -45,10 +45,75 @@ #include #include - #include #include +enum { + LUSTRE_FLD_INIT = 1 << 0, + LUSTRE_FLD_RUN = 1 << 1 +}; + +struct fld_stats { + __u64 fst_count; + __u64 fst_cache; + __u64 fst_inflight; +}; + +typedef int (*fld_hash_func_t) (struct lu_client_fld *, __u64); + +typedef struct lu_fld_target * +(*fld_scan_func_t) (struct lu_client_fld *, __u64); + +struct lu_fld_hash { + const char *fh_name; + fld_hash_func_t fh_hash_func; + fld_scan_func_t fh_scan_func; +}; + +struct fld_cache_entry { + struct list_head fce_lru; + struct list_head fce_list; + /** + * fld cache entries are sorted on range->lsr_start field. */ + struct lu_seq_range fce_range; +}; + +struct fld_cache { + /** + * Cache guard, protects fci_hash mostly because others immutable after + * init is finished. + */ + spinlock_t fci_lock; + + /** + * Cache shrink threshold */ + int fci_threshold; + + /** + * Prefered number of cached entries */ + int fci_cache_size; + + /** + * Current number of cached entries. Protected by @fci_lock */ + int fci_cache_count; + + /** + * LRU list fld entries. */ + struct list_head fci_lru; + + /** + * sorted fld entries. */ + struct list_head fci_entries_head; + + /** + * Cache statistics. */ + struct fld_stats fci_stat; + + /** + * Cache name used for debug and messages. */ + char fci_name[80]; +}; + enum fld_op { FLD_CREATE = 0, FLD_DELETE = 1, @@ -71,30 +136,26 @@ enum { FLD_CLIENT_CACHE_THRESHOLD = 10 }; -enum { - /* - * One page is used for hashtable. That is sizeof(struct hlist_head) * - * 1024. - */ - FLD_CLIENT_HTABLE_SIZE = (1024 * 1), - - /* - * Here 4 pages are used for hashtable of server cache. This is is - * because cache it self is 4 times bugger. 
- */ - FLD_SERVER_HTABLE_SIZE = (1024 * 4) -}; - extern struct lu_fld_hash fld_hash[]; #ifdef __KERNEL__ + struct fld_thread_info { struct req_capsule *fti_pill; __u64 fti_key; - __u64 fti_rec; - __u32 fti_flags; + struct lu_seq_range fti_rec; + struct lu_seq_range fti_lrange; + struct lu_seq_range fti_irange; + struct txn_param fti_txn_param; }; + +struct thandle* fld_trans_start(struct lu_server_fld *fld, + const struct lu_env *env, int credit); + +void fld_trans_stop(struct lu_server_fld *fld, + const struct lu_env *env, struct thandle* th); + int fld_index_init(struct lu_server_fld *fld, const struct lu_env *env, struct dt_device *dt); @@ -104,15 +165,20 @@ void fld_index_fini(struct lu_server_fld *fld, int fld_index_create(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq, mdsno_t mds); + const struct lu_seq_range *range, + struct thandle *th); int fld_index_delete(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq); + struct lu_seq_range *range, + struct thandle *th); int fld_index_lookup(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq, mdsno_t *mds); + seqno_t seq, struct lu_seq_range *range); + +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op); #ifdef LPROCFS extern struct lprocfs_vars fld_server_proc_list[]; @@ -121,6 +187,22 @@ extern struct lprocfs_vars fld_client_proc_list[]; #endif +struct fld_cache *fld_cache_init(const char *name, + int cache_size, int cache_threshold); + +void fld_cache_fini(struct fld_cache *cache); + +void fld_cache_flush(struct fld_cache *cache); + +void fld_cache_insert(struct fld_cache *cache, + const struct lu_seq_range *range); + +void fld_cache_delete(struct fld_cache *cache, + const struct lu_seq_range *range); + +int fld_cache_lookup(struct fld_cache *cache, + const seqno_t seq, struct lu_seq_range *range); + static inline const char * fld_target_name(struct lu_fld_target *tar) { diff --git a/lustre/fld/fld_request.c b/lustre/fld/fld_request.c index cdb5110..dff5498 100644 --- a/lustre/fld/fld_request.c +++ b/lustre/fld/fld_request.c @@ -68,7 +68,7 @@ #include #include "fld_internal.h" -/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c +/* TODO: these 3 functions are copies of flow-control code from mdc_lib.c * It should be common thing. 
The same about mdc RPC lock */ static int fld_req_avail(struct client_obd *cli, struct mdc_cache_waiter *mcw) { @@ -105,7 +105,7 @@ static void fld_exit_request(struct client_obd *cli) spin_lock(&cli->cl_loi_list_lock); cli->cl_r_in_flight--; list_for_each_safe(l, tmp, &cli->cl_cache_waiters) { - + if (cli->cl_r_in_flight >= cli->cl_max_rpcs_in_flight) { /* No free request slots anymore */ break; @@ -164,26 +164,7 @@ fld_rrb_scan(struct lu_client_fld *fld, seqno_t seq) RETURN(NULL); } -static int fld_dht_hash(struct lu_client_fld *fld, - seqno_t seq) -{ - /* XXX: here should be DHT hash */ - return fld_rrb_hash(fld, seq); -} - -static struct lu_fld_target * -fld_dht_scan(struct lu_client_fld *fld, seqno_t seq) -{ - /* XXX: here should be DHT scan code */ - return fld_rrb_scan(fld, seq); -} - -struct lu_fld_hash fld_hash[3] = { - { - .fh_name = "DHT", - .fh_hash_func = fld_dht_hash, - .fh_scan_func = fld_dht_scan - }, +struct lu_fld_hash fld_hash[] = { { .fh_name = "RRB", .fh_hash_func = fld_rrb_hash, @@ -394,7 +375,6 @@ int fld_client_init(struct lu_client_fld *fld, FLD_CLIENT_CACHE_THRESHOLD / 100; fld->lcf_cache = fld_cache_init(fld->lcf_name, - FLD_CLIENT_HTABLE_SIZE, cache_size, cache_threshold); if (IS_ERR(fld->lcf_cache)) { rc = PTR_ERR(fld->lcf_cache); @@ -447,11 +427,11 @@ void fld_client_fini(struct lu_client_fld *fld) } EXPORT_SYMBOL(fld_client_fini); -static int fld_client_rpc(struct obd_export *exp, - struct md_fld *mf, __u32 fld_op) +int fld_client_rpc(struct obd_export *exp, + struct lu_seq_range *range, __u32 fld_op) { struct ptlrpc_request *req; - struct md_fld *pmf; + struct lu_seq_range *prange; __u32 *op; int rc; ENTRY; @@ -466,8 +446,8 @@ static int fld_client_rpc(struct obd_export *exp, op = req_capsule_client_get(&req->rq_pill, &RMF_FLD_OPC); *op = fld_op; - pmf = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); - *pmf = *mf; + prange = req_capsule_client_get(&req->rq_pill, &RMF_FLD_MDFLD); + *prange = *range; ptlrpc_request_set_replen(req); req->rq_request_portal = FLD_REQUEST_PORTAL; @@ -483,110 +463,32 @@ static int fld_client_rpc(struct obd_export *exp, if (rc) GOTO(out_req, rc); - pmf = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); - if (pmf == NULL) + prange = req_capsule_server_get(&req->rq_pill, &RMF_FLD_MDFLD); + if (prange == NULL) GOTO(out_req, rc = -EFAULT); - *mf = *pmf; + *range = *prange; EXIT; out_req: ptlrpc_req_finished(req); return rc; } -int fld_client_create(struct lu_client_fld *fld, - seqno_t seq, mdsno_t mds, - const struct lu_env *env) -{ - struct md_fld md_fld = { .mf_seq = seq, .mf_mds = mds }; - struct lu_fld_target *target; - int rc; - ENTRY; - - fld->lcf_flags |= LUSTRE_FLD_RUN; - target = fld_client_get_target(fld, seq); - LASSERT(target != NULL); - - CDEBUG(D_INFO, "%s: Create fld entry (seq: "LPX64"; mds: " - LPU64") on target %s (idx "LPU64")\n", fld->lcf_name, - seq, mds, fld_target_name(target), target->ft_idx); - -#ifdef __KERNEL__ - if (target->ft_srv != NULL) { - LASSERT(env != NULL); - rc = fld_server_create(target->ft_srv, env, seq, mds); - } else { -#endif - rc = fld_client_rpc(target->ft_exp, &md_fld, FLD_CREATE); -#ifdef __KERNEL__ - } -#endif - - if (rc == 0) { - /* - * Do not return result of calling fld_cache_insert() - * here. First of all because it may return -EEXIST. Another - * reason is that, we do not want to stop proceeding because of - * cache errors. 
- */ - fld_cache_insert(fld->lcf_cache, seq, mds); - } else { - CERROR("%s: Can't create FLD entry, rc %d\n", - fld->lcf_name, rc); - } - - RETURN(rc); -} -EXPORT_SYMBOL(fld_client_create); - -int fld_client_delete(struct lu_client_fld *fld, seqno_t seq, - const struct lu_env *env) -{ - struct md_fld md_fld = { .mf_seq = seq, .mf_mds = 0 }; - struct lu_fld_target *target; - int rc; - ENTRY; - - fld->lcf_flags |= LUSTRE_FLD_RUN; - fld_cache_delete(fld->lcf_cache, seq); - - target = fld_client_get_target(fld, seq); - LASSERT(target != NULL); - - CDEBUG(D_INFO, "%s: Delete fld entry (seq: "LPX64") on " - "target %s (idx "LPU64")\n", fld->lcf_name, seq, - fld_target_name(target), target->ft_idx); - -#ifdef __KERNEL__ - if (target->ft_srv != NULL) { - LASSERT(env != NULL); - rc = fld_server_delete(target->ft_srv, - env, seq); - } else { -#endif - rc = fld_client_rpc(target->ft_exp, - &md_fld, FLD_DELETE); -#ifdef __KERNEL__ - } -#endif - - RETURN(rc); -} -EXPORT_SYMBOL(fld_client_delete); - int fld_client_lookup(struct lu_client_fld *fld, seqno_t seq, mdsno_t *mds, const struct lu_env *env) { - struct md_fld md_fld = { .mf_seq = seq, .mf_mds = 0 }; + struct lu_seq_range res; struct lu_fld_target *target; int rc; ENTRY; fld->lcf_flags |= LUSTRE_FLD_RUN; - rc = fld_cache_lookup(fld->lcf_cache, seq, mds); - if (rc == 0) + rc = fld_cache_lookup(fld->lcf_cache, seq, &res); + if (rc == 0) { + *mds = res.lsr_mdt; RETURN(0); + } /* Can not find it in the cache */ target = fld_client_get_target(fld, seq); @@ -596,45 +498,24 @@ int fld_client_lookup(struct lu_client_fld *fld, "target %s (idx "LPU64")\n", fld->lcf_name, seq, fld_target_name(target), target->ft_idx); + res.lsr_start = seq; #ifdef __KERNEL__ if (target->ft_srv != NULL) { LASSERT(env != NULL); rc = fld_server_lookup(target->ft_srv, - env, seq, &md_fld.mf_mds); + env, seq, &res); } else { #endif - /* - * insert the 'inflight' sequence. No need to protect that, - * we are trying to reduce numbers of RPC but not restrict - * to them exactly one - */ - fld_cache_insert_inflight(fld->lcf_cache, seq); rc = fld_client_rpc(target->ft_exp, - &md_fld, FLD_LOOKUP); + &res, FLD_LOOKUP); #ifdef __KERNEL__ } #endif - if (seq < FID_SEQ_START) { - /* - * The current solution for IGIF is to bind it to mds0. - * In the future, this should be fixed once IGIF can be found - * in FLD. - */ - md_fld.mf_mds = 0; - rc = 0; - } if (rc == 0) { - *mds = md_fld.mf_mds; + *mds = res.lsr_mdt; - /* - * Do not return error here as well. See previous comment in - * same situation in function fld_client_create(). 
- */ - fld_cache_insert(fld->lcf_cache, seq, *mds); - } else { - /* remove 'inflight' seq if it exists */ - fld_cache_delete(fld->lcf_cache, seq); + fld_cache_insert(fld->lcf_cache, &res); } RETURN(rc); } diff --git a/lustre/include/Makefile.am b/lustre/include/Makefile.am index 9803237..d18e1a9 100644 --- a/lustre/include/Makefile.am +++ b/lustre/include/Makefile.am @@ -41,9 +41,9 @@ EXTRA_DIST = ioctl.h liblustre.h lprocfs_status.h lustre_cfg.h \ lustre_fsfilt.h lustre_ha.h lustre_handles.h lustre_import.h \ lustre_lib.h lustre_sec.h lustre_lite.h lustre_log.h lustre_mds.h \ lustre_mdc.h lustre_net.h lustre_quota.h lustre_ucache.h lvfs.h \ - class_hash.h obd_cache.h obd_class.h obd_echo.h obd.h obd_lov.h \ + class_hash.h obd_cache.h obd_class.h obd.h obd_lov.h \ obd_ost.h obd_support.h lustre_ver.h lu_object.h lu_time.h \ md_object.h dt_object.h lustre_param.h lustre_mdt.h \ lustre_fid.h lustre_fld.h lustre_req_layout.h lustre_capa.h \ lustre_idmap.h lustre_eacl.h interval_tree.h obd_cksum.h \ - lu_ref.h lustre_acl.h lustre_cache.h + lu_ref.h cl_object.h lustre_acl.h lclient.h diff --git a/lustre/include/cl_object.h b/lustre/include/cl_object.h new file mode 100644 index 0000000..3343139 --- /dev/null +++ b/lustre/include/cl_object.h @@ -0,0 +1,3033 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef _LUSTRE_CL_OBJECT_H +#define _LUSTRE_CL_OBJECT_H + +/** \defgroup clio clio + * + * Client objects implement io operations and cache pages. + * + * Examples: lov and osc are implementations of cl interface. + * + * Big Theory Statement. + * + * Layered objects. + * + * Client implementation is based on the following data-types: + * + * - cl_object + * + * - cl_page + * + * - cl_lock represents an extent lock on an object. + * + * - cl_io represents high-level i/o activity such as whole read/write + * system call, or write-out of pages from under the lock being + * canceled. cl_io has sub-ios that can be stopped and resumed + * independently, thus achieving high degree of transfer + * parallelism. Single cl_io can be advanced forward by + * the multiple threads (although in the most usual case of + * read/write system call it is associated with the single user + * thread, that issued the system call). 
+ * + * - cl_req represents a collection of pages for a transfer. cl_req is + * constructed by req-forming engine that tries to saturate + * transport with large and continuous transfers. + * + * Terminology + * + * - to avoid confusion high-level I/O operation like read or write system + * call is referred to as "an io", whereas low-level I/O operation, like + * RPC, is referred to as "a transfer" + * + * - "generic code" means generic (not file system specific) code in the + * hosting environment. "cl-code" means code (mostly in cl_*.c files) that + * is not layer specific. + * + * Locking. + * + * - i_mutex + * - PG_locked + * - cl_object_header::coh_page_guard + * - cl_object_header::coh_lock_guard + * - lu_site::ls_guard + * + * See the top comment in cl_object.c for the description of overall locking and + * reference-counting design. + * + * See comments below for the description of i/o, page, and dlm-locking + * design. + * + * @{ + */ + +/* + * super-class definitions. + */ +#include +#include +#ifdef __KERNEL__ +# include +# include +#endif + +struct inode; + +struct cl_device; +struct cl_device_operations; + +struct cl_object; +struct cl_object_page_operations; +struct cl_object_lock_operations; + +struct cl_page; +struct cl_page_slice; +struct cl_lock; +struct cl_lock_slice; + +struct cl_lock_operations; +struct cl_page_operations; + +struct cl_io; +struct cl_io_slice; + +struct cl_req; +struct cl_req_slice; + +/** + * Operations for each data device in the client stack. + * + * \see vvp_cl_ops, lov_cl_ops, lovsub_cl_ops, osc_cl_ops + */ +struct cl_device_operations { + /** + * Initialize cl_req. This method is called top-to-bottom on all + * devices in the stack to get them a chance to allocate layer-private + * data, and to attach them to the cl_req by calling + * cl_req_slice_add(). + * + * \see osc_req_init(), lov_req_init(), lovsub_req_init() + * \see ccc_req_init() + */ + int (*cdo_req_init)(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +}; + +/** + * Device in the client stack. + * + * \see ccc_device, lov_device, lovsub_device, osc_device + */ +struct cl_device { + /** Super-class. */ + struct lu_device cd_lu_dev; + /** Per-layer operation vector. */ + const struct cl_device_operations *cd_ops; +}; + +/** \addtogroup cl_object cl_object + * @{ */ +/** + * "Data attributes" of cl_object. Data attributes can be updated + * independently for a sub-object, and top-object's attributes are calculated + * from sub-objects' ones. + */ +struct cl_attr { + /** Object size, in bytes */ + loff_t cat_size; + /** + * Known minimal size, in bytes. + * + * This is only valid when at least one DLM lock is held. + */ + loff_t cat_kms; + /** Modification time. Measured in seconds since epoch. */ + time_t cat_mtime; + /** Access time. Measured in seconds since epoch. */ + time_t cat_atime; + /** Change time. Measured in seconds since epoch. */ + time_t cat_ctime; + /** + * Blocks allocated to this cl_object on the server file system. + * + * \todo XXX An interface for block size is needed. + */ + __u64 cat_blocks; + /** + * User identifier for quota purposes. + */ + uid_t cat_uid; + /** + * Group identifier for quota purposes. + */ + gid_t cat_gid; +}; + +/** + * Fields in cl_attr that are being set. 
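+ *
+ * For example, a hypothetical caller updating size and mtime together
+ * (assuming the cl_object_attr_set() wrapper declared later in this
+ * header):
+ *
+ * \code
+ * attr->cat_size = new_size;
+ * attr->cat_mtime = now;
+ * cl_object_attr_set(env, obj, attr, CAT_SIZE | CAT_MTIME);
+ * \endcode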
+ */ +enum cl_attr_valid { + CAT_SIZE = 1 << 0, + CAT_KMS = 1 << 1, + CAT_MTIME = 1 << 3, + CAT_ATIME = 1 << 4, + CAT_CTIME = 1 << 5, + CAT_BLOCKS = 1 << 6, + CAT_UID = 1 << 7, + CAT_GID = 1 << 8 +}; + +/** + * Sub-class of lu_object with methods common for objects on the client + * stacks. + * + * cl_object: represents a regular file system object, both a file and a + * stripe. cl_object is based on lu_object: it is identified by a fid, + * layered, cached, hashed, and lrued. Important distinction with the server + * side, where md_object and dt_object are used, is that cl_object "fans out" + * at the lov/sns level: depending on the file layout, single file is + * represented as a set of "sub-objects" (stripes). At the implementation + * level, struct lov_object contains an array of cl_objects. Each sub-object + * is a full-fledged cl_object, having its fid, living in the lru and hash + * table. + * + * This leads to the next important difference with the server side: on the + * client, it's quite usual to have objects with the different sequence of + * layers. For example, typical top-object is composed of the following + * layers: + * + * - vvp + * - lov + * + * whereas its sub-objects are composed of + * + * - lovsub + * - osc + * + * layers. Here "lovsub" is a mostly dummy layer, whose purpose is to keep + * track of the object-subobject relationship. + * + * Sub-objects are not cached independently: when top-object is about to + * be discarded from the memory, all its sub-objects are torn-down and + * destroyed too. + * + * \see ccc_object, lov_object, lovsub_object, osc_object + */ +struct cl_object { + /** super class */ + struct lu_object co_lu; + /** per-object-layer operations */ + const struct cl_object_operations *co_ops; +}; + +/** + * Description of the client object configuration. This is used for the + * creation of a new client object that is identified by a more state than + * fid. + */ +struct cl_object_conf { + /** Super-class. */ + struct lu_object_conf coc_lu; + union { + /** + * Object layout. This is consumed by lov. + */ + struct lustre_md *coc_md; + /** + * Description of particular stripe location in the + * cluster. This is consumed by osc. + */ + struct lov_oinfo *coc_oinfo; + } u; + /** + * VFS inode. This is consumed by vvp. + */ + struct inode *coc_inode; +}; + +/** + * Operations implemented for each cl object layer. + * + * \see vvp_ops, lov_ops, lovsub_ops, osc_ops + */ +struct cl_object_operations { + /** + * Initialize page slice for this layer. Called top-to-bottom through + * every object layer when a new cl_page is instantiated. Layer + * keeping private per-page data, or requiring its own page operations + * vector should allocate these data here, and attach then to the page + * by calling cl_page_slice_add(). \a vmpage is locked (in the VM + * sense). Optional. + * + * \retval NULL success. + * + * \retval ERR_PTR(errno) failure code. + * + * \retval valid-pointer pointer to already existing referenced page + * to be used instead of newly created. + */ + struct cl_page *(*coo_page_init)(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, + cfs_page_t *vmpage); + /** + * Initialize lock slice for this layer. Called top-to-bottom through + * every object layer when a new cl_lock is instantiated. Layer + * keeping private per-lock data, or requiring its own lock operations + * vector should allocate these data here, and attach then to the lock + * by calling cl_lock_slice_add(). Mandatory. 
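+ *
+ * A layer typically allocates its lock slice and registers it; a hedged
+ * sketch with hypothetical foo_* names:
+ *
+ * \code
+ * static int foo_lock_init(const struct lu_env *env,
+ *                          struct cl_object *obj, struct cl_lock *lock,
+ *                          const struct cl_io *io)
+ * {
+ *         struct foo_lock *fl;
+ *
+ *         OBD_ALLOC_PTR(fl);
+ *         if (fl == NULL)
+ *                 return -ENOMEM;
+ *         cl_lock_slice_add(lock, &fl->fl_cl, obj, &foo_lock_ops);
+ *         return 0;
+ * }
+ * \endcode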
+ */ + int (*coo_lock_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); + /** + * Initialize io state for a given layer. + * + * called top-to-bottom once per io existence to initialize io + * state. If layer wants to keep some state for this type of io, it + * has to embed struct cl_io_slice in lu_env::le_ses, and register + * slice with cl_io_slice_add(). It is guaranteed that all threads + * participating in this io share the same session. + */ + int (*coo_io_init)(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); + /** + * Fill portion of \a attr that this layer controls. This method is + * called top-to-bottom through all object layers. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return 0: to continue + * \return +ve: to stop iterating through layers (but 0 is returned + * from enclosing cl_object_attr_get()) + * \return -ve: to signal error + */ + int (*coo_attr_get)(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + /** + * Update attributes. + * + * \a valid is a bitmask composed from enum #cl_attr_valid, and + * indicating what attributes are to be set. + * + * \pre cl_object_header::coh_attr_guard of the top-object is locked. + * + * \return the same convention as for + * cl_object_operations::coo_attr_get() is used. + */ + int (*coo_attr_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); + /** + * Update object configuration. Called top-to-bottom to modify object + * configuration. + * + * XXX error conditions and handling. + */ + int (*coo_conf_set)(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); + /** + * Glimpse ast. Executed when glimpse ast arrives for a lock on this + * object. Layers are supposed to fill parts of \a lvb that will be + * shipped to the glimpse originator as a glimpse result. + * + * \see ccc_object_glimpse(), lovsub_object_glimpse(), + * \see osc_object_glimpse() + */ + int (*coo_glimpse)(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +}; + +/** + * Extended header for client object. + */ +struct cl_object_header { + /** Standard lu_object_header. cl_object::co_lu::lo_header points + * here. */ + struct lu_object_header coh_lu; + /** \name locks + * \todo XXX move locks below to the separate cache-lines, they are + * mostly useless otherwise. + */ + /** @{ */ + /** Lock protecting page tree. */ + spinlock_t coh_page_guard; + /** Lock protecting lock list. */ + spinlock_t coh_lock_guard; + /** @} locks */ + /** Radix tree of cl_page's, cached for this object. */ + struct radix_tree_root coh_tree; + /** # of pages in radix tree. */ + unsigned long coh_pages; + /** List of cl_lock's granted for this object. */ + struct list_head coh_locks; + + /** + * Parent object. It is assumed that an object has a well-defined + * parent, but not a well-defined child (there may be multiple + * sub-objects, for the same top-object). cl_object_header::coh_parent + * field allows certain code to be written generically, without + * limiting possible cl_object layouts unduly. + */ + struct cl_object_header *coh_parent; + /** + * Protects consistency between cl_attr of parent object and + * attributes of sub-objects, that the former is calculated ("merged") + * from. + * + * \todo XXX this can be read/write lock if needed. 
+/** @} cl_object */
+
+#ifndef pgoff_t
+#define pgoff_t unsigned long
+#endif
+
+#define CL_PAGE_EOF ((pgoff_t)~0ull)
+
+/** \addtogroup cl_page cl_page
+ * @{ */
+
+/** \struct cl_page
+ * Layered client page.
+ *
+ * cl_page: represents a portion of a file, cached in the memory. All pages
+ * of the given file are of the same size, and are kept in the radix tree
+ * hanging off the cl_object. cl_page doesn't fan out, but as sub-objects
+ * of the top-level file object are first class cl_objects, they have their
+ * own radix trees of pages and hence a page is implemented as a sequence of
+ * struct cl_page's, linked into a doubly-linked list through
+ * cl_page::cp_parent and cl_page::cp_child pointers, each residing in the
+ * corresponding radix tree at the corresponding logical offset.
+ *
+ * cl_page is associated with a VM page of the hosting environment (struct
+ * page in the Linux kernel, for example), cfs_page_t. It is assumed that this
+ * association is implemented by one of cl_page layers (top layer in the
+ * current design) that
+ *
+ *     - intercepts per-VM-page call-backs made by the environment (e.g.,
+ *       memory pressure),
+ *
+ *     - translates state (page flag bits) and locking between lustre and
+ *       the environment.
+ *
+ * The association between cl_page and cfs_page_t is immutable and
+ * established when cl_page is created.
+ *
+ * cl_page can be "owned" by a particular cl_io (see below), guaranteeing
+ * this io exclusive access to this page w.r.t. other io attempts and
+ * various events changing page state (such as transfer completion, or
+ * eviction of the page from memory). Note that in general cl_io
+ * cannot be identified with a particular thread, and page ownership is not
+ * exactly equal to the current thread holding a lock on the page. The layer
+ * implementing the association between cl_page and cfs_page_t has to
+ * implement ownership on top of available synchronization mechanisms.
+ *
+ * While the lustre client maintains the notion of page ownership by io,
+ * the hosting MM/VM usually has its own page concurrency control
+ * mechanisms. For example, in Linux, page access is synchronized by the
+ * per-page PG_locked bit-lock, and generic kernel code (generic_file_*())
+ * takes care to acquire and release such locks as necessary around the
+ * calls to the file system methods (->readpage(), ->prepare_write(),
+ * ->commit_write(), etc.). This leads to the situation when there are two
+ * different ways to own a page in the client:
+ *
+ *     - client code explicitly and voluntarily owns the page
+ *       (cl_page_own());
+ *
+ *     - the VM locks a page and then calls the client, which has to
+ *       "assume" the ownership from the VM (cl_page_assume()).
+ *
+ * Dual methods to release ownership are cl_page_disown() and
+ * cl_page_unassume().
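+ *
+ * For illustration, the two ownership paths look approximately as follows.
+ * This is a sketch with error handling elided; the declarations of
+ * cl_page_own() and friends appear elsewhere in this interface:
+ *
+ * \code
+ * if (cl_page_own(env, io, page) == 0) {
+ *         ... io has exclusive access to the page here ...
+ *         cl_page_disown(env, io, page);
+ * }
+ * \endcode
+ *
+ * whereas a call-back entered from the VM with the vmpage already locked
+ * assumes, and later unassumes, the ownership instead:
+ *
+ * \code
+ * cl_page_assume(env, io, page);
+ * ... the page is owned by io here ...
+ * cl_page_unassume(env, io, page);
+ * \endcode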
+ *
+ * cl_page is reference counted (cl_page::cp_ref). When the reference counter
+ * drops to 0, the page is returned to the cache, unless it is in
+ * cl_page_state::CPS_FREEING state, in which case it is immediately
+ * destroyed.
+ *
+ * The general logic guaranteeing the absence of "existential races" for
+ * pages is the following:
+ *
+ *     - there are fixed known ways for a thread to obtain a new reference
+ *       to a page:
+ *
+ *         - by doing a lookup in the cl_object radix tree, protected by the
+ *           spin-lock;
+ *
+ *         - by starting from VM-locked cfs_page_t and following some
+ *           hosting environment method (e.g., following the ->private
+ *           pointer in the case of the Linux kernel), see cl_vmpage_page();
+ *
+ *     - when the page enters cl_page_state::CPS_FREEING state, all these
+ *       ways are severed with the proper synchronization
+ *       (cl_page_delete());
+ *
+ *     - entry into cl_page_state::CPS_FREEING is serialized by the VM page
+ *       lock;
+ *
+ *     - no new references to the page in cl_page_state::CPS_FREEING state
+ *       are allowed (checked in cl_page_get()).
+ *
+ * Together this guarantees that when the last reference to a
+ * cl_page_state::CPS_FREEING page is released, it is safe to destroy the
+ * page, as no new references to it can be acquired at that point, and none
+ * exist.
+ *
+ * cl_page is a state machine. States are enumerated in enum
+ * cl_page_state. Possible state transitions are enumerated in
+ * cl_page_state_set(). The state transition process (i.e., actual changing
+ * of the cl_page::cp_state field) is protected by the lock on the underlying
+ * VM page.
+ *
+ * Linux Kernel implementation.
+ *
+ * Binding between cl_page and cfs_page_t (which is a typedef for
+ * struct page) is implemented in the vvp layer. cl_page is attached to the
+ * ->private pointer of the struct page, together with the setting of the
+ * PG_private bit in page->flags, and acquiring an additional reference on
+ * the struct page (much like struct buffer_head, or any similar file system
+ * private data structures).
+ *
+ * The PG_locked lock is used to implement both ownership and transfer
+ * synchronization, that is, the page is VM-locked in CPS_{OWNED,PAGE{IN,OUT}}
+ * states. No additional references are acquired for the duration of the
+ * transfer.
+ *
+ * \warning *THIS IS NOT* the behavior expected by the Linux kernel, where
+ * write-out is "protected" by the special PG_writeback bit.
+ */
+
+/**
+ * States of cl_page. cl_page.c assumes particular order here.
+ *
+ * The page state machine is rather crude, as it doesn't recognize finer page
+ * states like "dirty" or "up to date". This is because such states are not
+ * always well defined for the whole stack (see, for example, the
+ * implementation of the read-ahead, that hides page up-to-dateness to track
+ * cache hits accurately). Such sub-states are maintained by the layers that
+ * are interested in them.
+ */
+enum cl_page_state {
+        /**
+         * Page is in the cache, un-owned. Page leaves cached state in the
+         * following cases:
+         *
+         *     - [cl_page_state::CPS_OWNED] io comes across the page and
+         *       owns it;
+         *
+         *     - [cl_page_state::CPS_PAGEOUT] page is dirty, the
+         *       req-formation engine decides that it wants to include this
+         *       page into a cl_req being constructed, and yanks it from the
+         *       cache;
+         *
+         *     - [cl_page_state::CPS_FREEING] VM callback is executed to
+         *       evict the page from memory;
+         *
+         * \invariant cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+         */
+        CPS_CACHED,
+        /**
+         * Page is exclusively owned by some cl_io. A page may end up in this
+         * state as a result of
+         *
+         *     - io creating a new page and immediately owning it;
+         *
+         *     - [cl_page_state::CPS_CACHED] io finding an existing cached
+         *       page and owning it;
+         *
+         *     - [cl_page_state::CPS_OWNED] io finding an existing owned page
+         *       and waiting for the owner to release the page;
+         *
+         * Page leaves the owned state in the following cases:
+         *
+         *     - [cl_page_state::CPS_CACHED] io decides to leave the page in
+         *       the cache, doing nothing;
+         *
+         *     - [cl_page_state::CPS_PAGEIN] io starts read transfer for
+         *       this page;
+         *
+         *     - [cl_page_state::CPS_PAGEOUT] io starts immediate write
+         *       transfer for this page;
+         *
+         *     - [cl_page_state::CPS_FREEING] io decides to destroy this
+         *       page (e.g., as part of truncate or extent lock cancellation).
+         *
+         * \invariant cl_page::cp_owner != NULL && cl_page::cp_req == NULL
+         */
+        CPS_OWNED,
+        /**
+         * Page is being written out, as a part of a transfer. This state is
+         * entered when req-formation logic decided that it wants this page to
+         * be sent through the wire _now_. Specifically, it means that once
+         * this state is achieved, the transfer completion handler (with
+         * either success or failure indication) is guaranteed to be executed
+         * against this page independently of any locks and any scheduling
+         * decisions made by the hosting environment (that effectively means
+         * that the page is never put into cl_page_state::CPS_PAGEOUT state
+         * "in advance". This property is mentioned, because it is important
+         * when reasoning about possible dead-locks in the system). The page
+         * can enter this state as a result of
+         *
+         *     - [cl_page_state::CPS_OWNED] an io requesting an immediate
+         *       write-out of this page, or
+         *
+         *     - [cl_page_state::CPS_CACHED] the req-forming engine deciding
+         *       that it has enough dirty pages cached to issue a "good"
+         *       transfer.
+         *
+         * The page leaves cl_page_state::CPS_PAGEOUT state when the transfer
+         * is completed---it is moved into cl_page_state::CPS_CACHED state.
+         *
+         * Underlying VM page is locked for the duration of transfer.
+         *
+         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+         */
+        CPS_PAGEOUT,
+        /**
+         * Page is being read in, as a part of a transfer. This is quite
+         * similar to the cl_page_state::CPS_PAGEOUT state, except that
+         * read-in is always "immediate"---there is no such thing as a sudden
+         * construction of a read cl_req from cached, presumably not up to
+         * date, pages.
+         *
+         * Underlying VM page is locked for the duration of transfer.
+         *
+         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req != NULL
+         */
+        CPS_PAGEIN,
+        /**
+         * Page is being destroyed. This state is entered when the client
+         * decides that the page has to be deleted from its host object, as,
+         * e.g., a part of truncate.
+         *
+         * Once this state is reached, there is no way to escape it.
+         *
+         * \invariant: cl_page::cp_owner == NULL && cl_page::cp_req == NULL
+         */
+        CPS_FREEING,
+        CPS_NR
+};
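+
+/**
+ * Illustration only: the transitions spelled out in the comments above,
+ * encoded as a bitmask table. This is a sketch; the authoritative
+ * transition set is checked by cl_page_state_set() in cl_page.c and may be
+ * wider than what is listed here.
+ *
+ * \code
+ * static const int example_page_transitions[CPS_NR] = {
+ *         [CPS_CACHED]  = (1 << CPS_OWNED) |
+ *                         (1 << CPS_PAGEOUT) |
+ *                         (1 << CPS_FREEING),
+ *         [CPS_OWNED]   = (1 << CPS_CACHED) |
+ *                         (1 << CPS_PAGEIN) |
+ *                         (1 << CPS_PAGEOUT) |
+ *                         (1 << CPS_FREEING),
+ *         [CPS_PAGEOUT] = (1 << CPS_CACHED),
+ *         [CPS_PAGEIN]  = (1 << CPS_CACHED),
+ *         [CPS_FREEING] = 0
+ * };
+ * \endcode
+ */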
+
+enum cl_page_type {
+        /** Host page, the page is from the host inode which the cl_page
+         * belongs to. */
+        CPT_CACHEABLE = 1,
+
+        /** Transient page, a transient cl_page is used to bind a cl_page
+         * to a vmpage that does not belong to the same object as the
+         * cl_page. It is used in DirectIO, lockless IO and liblustre. */
+        CPT_TRANSIENT,
+};
+
+/**
+ * Flags maintained for every cl_page.
+ */
+enum cl_page_flags {
+        /**
+         * Set when pagein completes. Used for debugging (read completes at
+         * most once for a page).
+         */
+        CPF_READ_COMPLETED = 1 << 0
+};
+
+/**
+ * Fields are protected by the lock on cfs_page_t, except for atomics and
+ * immutables.
+ *
+ * \invariant Data type invariants are in cl_page_invariant(). Basically:
+ * cl_page::cp_parent and cl_page::cp_child are a well-formed doubly-linked
+ * list, consistent with the parent/child pointers in the cl_page::cp_obj and
+ * cl_page::cp_owner (when set).
+ */
+struct cl_page {
+        /** Reference counter. */
+        atomic_t                 cp_ref;
+        /** An object this page is a part of. Immutable after creation. */
+        struct cl_object        *cp_obj;
+        /** Logical page index within the object. Immutable after creation. */
+        pgoff_t                  cp_index;
+        /** List of slices. Immutable after creation. */
+        struct list_head         cp_layers;
+        /** Parent page, NULL for top-level page. Immutable after creation. */
+        struct cl_page          *cp_parent;
+        /** Lower-layer page. NULL for bottommost page. Immutable after
+         * creation. */
+        struct cl_page          *cp_child;
+        /**
+         * Page state. This field is const to avoid accidental update, it is
+         * modified only internally within cl_page.c. Protected by a VM lock.
+         */
+        const enum cl_page_state cp_state;
+        /**
+         * Linkage of pages within some group. Protected by
+         * cl_page::cp_mutex. */
+        struct list_head         cp_batch;
+        /** Mutex serializing membership of a page in a batch. */
+        struct mutex             cp_mutex;
+        /** Linkage of pages within cl_req. */
+        struct list_head         cp_flight;
+        /** Transfer error. */
+        int                      cp_error;
+
+        /**
+         * Page type. Only CPT_TRANSIENT is used so far. Immutable after
+         * creation.
+         */
+        enum cl_page_type        cp_type;
+
+        /**
+         * Owning IO in cl_page_state::CPS_OWNED state. Sub-page can be owned
+         * by a sub-io. Protected by a VM lock.
+         */
+        struct cl_io            *cp_owner;
+        /**
+         * Owning IO request in cl_page_state::CPS_PAGEOUT and
+         * cl_page_state::CPS_PAGEIN states. This field is maintained only in
+         * the top-level pages. Protected by a VM lock.
+         */
+        struct cl_req           *cp_req;
+        /** List of references to this page, for debugging. */
+        struct lu_ref            cp_reference;
+        /** Link to an object, for debugging. */
+        struct lu_ref_link      *cp_obj_ref;
+        /** Link to a queue, for debugging. */
+        struct lu_ref_link      *cp_queue_ref;
+        /** Per-page flags from enum cl_page_flags. Protected by a VM lock. */
+        unsigned                 cp_flags;
+};
+
+/**
+ * Per-layer part of cl_page.
+ *
+ * \see ccc_page, lov_page, osc_page
+ */
+struct cl_page_slice {
+        struct cl_page                  *cpl_page;
+        /**
+         * Object slice corresponding to this page slice. Immutable after
+         * creation.
+         */
+        struct cl_object                *cpl_obj;
+        const struct cl_page_operations *cpl_ops;
+        /** Linkage into cl_page::cp_layers. Immutable after creation. */
+        struct list_head                 cpl_linkage;
+};
+
+/**
+ * Lock mode. For the client extent locks.
+ *
+ * \warning: cl_lock_mode_match() assumes particular ordering here.
+ * \ingroup cl_lock
+ */
+enum cl_lock_mode {
+        /**
+         * Mode of a lock that protects no data, and exists only as a
+         * placeholder. This is used for `glimpse' requests. A phantom lock
+         * might get promoted to a real lock at some point.
+         */
+        CLM_PHANTOM,
+        CLM_READ,
+        CLM_WRITE
+};
+
+/**
+ * Requested transfer type.
+ * \ingroup cl_req
+ */
+enum cl_req_type {
+        CRT_READ,
+        CRT_WRITE,
+        CRT_NR
+};
+
+/**
+ * Per-layer page operations.
+ *
+ * Methods taking an \a io argument are for the activity happening in the
+ * context of given \a io. Page is assumed to be owned by that io, except for
+ * the obvious cases (like cl_page_operations::cpo_own()).
+ *
+ * \see vvp_page_ops, lov_page_ops, osc_page_ops
+ */
+struct cl_page_operations {
+        /**
+         * cl_page<->cfs_page_t methods. Only one layer in the stack has to
+         * implement these. Current code assumes that this functionality is
+         * provided by the topmost layer, see cl_page_disown0() as an example.
+         */
+
+        /**
+         * \return the underlying VM page. Optional.
+         */
+        cfs_page_t *(*cpo_vmpage)(const struct lu_env *env,
+                                  const struct cl_page_slice *slice);
+        /**
+         * Called when \a io acquires this page into the exclusive
+         * ownership. When this method returns, it is guaranteed that the page
+         * is not owned by other io, and no transfer is going on against
+         * it. Optional.
+         *
+         * \see cl_page_own()
+         * \see vvp_page_own(), lov_page_own()
+         */
+        void (*cpo_own)(const struct lu_env *env,
+                        const struct cl_page_slice *slice, struct cl_io *io);
+        /** Called when ownership is yielded. Optional.
+         *
+         * \see cl_page_disown()
+         * \see vvp_page_disown()
+         */
+        void (*cpo_disown)(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io);
+        /**
+         * Called for a page that is already "owned" by \a io from the VM
+         * point of view. Optional.
+         *
+         * \see cl_page_assume()
+         * \see vvp_page_assume(), lov_page_assume()
+         */
+        void (*cpo_assume)(const struct lu_env *env,
+                           const struct cl_page_slice *slice,
+                           struct cl_io *io);
+        /** Dual to cl_page_operations::cpo_assume(). Optional. Called
+         * bottom-to-top when IO releases a page without actually unlocking
+         * it.
+         *
+         * \see cl_page_unassume()
+         * \see vvp_page_unassume()
+         */
+        void (*cpo_unassume)(const struct lu_env *env,
+                             const struct cl_page_slice *slice,
+                             struct cl_io *io);
+        /**
+         * Announces that the page contains valid data and user space can
+         * look at them without the client's involvement from now on.
+         * Effectively marks the page up-to-date. Optional.
+         *
+         * \see cl_page_export()
+         * \see vvp_page_export()
+         */
+        void (*cpo_export)(const struct lu_env *env,
+                           const struct cl_page_slice *slice);
+        /**
+         * Unmaps page from the user space (if it is mapped).
+         *
+         * \see cl_page_unmap()
+         * \see vvp_page_unmap()
+         */
+        int (*cpo_unmap)(const struct lu_env *env,
+                         const struct cl_page_slice *slice, struct cl_io *io);
+        /**
+         * Checks whether underlying VM page is locked (in the suitable
+         * sense). Used for assertions.
+         *
+         * \retval -EBUSY: page is protected by a lock of a given mode;
+         * \retval -ENODATA: page is not protected by a lock;
+         * \retval 0: this layer cannot decide. (Should never happen.)
+         */
+        int (*cpo_is_vmlocked)(const struct lu_env *env,
+                               const struct cl_page_slice *slice);
+        /**
+         * Page destruction.
+         */
+
+        /**
+         * Called when page is truncated from the object. Optional.
+         *
+         * \see cl_page_discard()
+         * \see vvp_page_discard(), osc_page_discard()
+         */
+        void (*cpo_discard)(const struct lu_env *env,
+                            const struct cl_page_slice *slice,
+                            struct cl_io *io);
+        /**
+         * Called when page is removed from the cache, and is about to be
+         * destroyed. Optional.
+         *
+         * \see cl_page_delete()
+         * \see vvp_page_delete(), osc_page_delete()
+         */
+        void (*cpo_delete)(const struct lu_env *env,
+                           const struct cl_page_slice *slice);
+        /** Destructor. Frees resources and slice itself. */
+        void (*cpo_fini)(const struct lu_env *env,
+                         struct cl_page_slice *slice);
+
+        /**
+         * Checks whether the page is protected by a cl_lock.
+         * This is a per-layer method, because certain layers have ways to
+         * check for the lock much more efficiently than through the generic
+         * locks scan, or implement locking mechanisms separate from cl_lock,
+         * e.g., LL_FILE_GROUP_LOCKED in vvp. If \a pending is true, also
+         * check for locks being canceled, or scheduled for cancellation as
+         * soon as the last user goes away.
+         *
+         * \retval -EBUSY: page is protected by a lock of a given mode;
+         * \retval -ENODATA: page is not protected by a lock;
+         * \retval 0: this layer cannot decide.
+         *
+         * \see cl_page_is_under_lock()
+         */
+        int (*cpo_is_under_lock)(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *io);
+
+        /**
+         * Optional debugging helper. Prints given page slice.
+         *
+         * \see cl_page_print()
+         */
+        int (*cpo_print)(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         void *cookie, lu_printer_t p);
+        /**
+         * \name transfer
+         *
+         * Transfer methods. See comment on cl_req for a description of
+         * transfer formation and life-cycle.
+         *
+         * @{
+         */
+        /**
+         * Request type dependent vector of operations.
+         *
+         * Transfer operations depend on transfer mode (cl_req_type). To avoid
+         * passing transfer mode to each and every of these methods, and to
+         * avoid branching on request type inside of the methods, separate
+         * methods for cl_req_type::CRT_READ and cl_req_type::CRT_WRITE are
+         * provided. That is, method invocation usually looks like
+         *
+         *     slice->cpl_ops->io[req->crq_type].cpo_method(env, slice, ...);
+         */
+        struct {
+                /**
+                 * Called when a page is submitted for a transfer as a part of
+                 * cl_page_list.
+                 *
+                 * \return    0     : page is eligible for submission;
+                 * \return -EALREADY: skip this page;
+                 * \return -ve      : error.
+                 *
+                 * \see cl_page_prep()
+                 */
+                int  (*cpo_prep)(const struct lu_env *env,
+                                 const struct cl_page_slice *slice,
+                                 struct cl_io *io);
+                /**
+                 * Completion handler. This is guaranteed to be eventually
+                 * fired after cl_page_operations::cpo_prep() or
+                 * cl_page_operations::cpo_make_ready() call.
+                 *
+                 * This method can be called in a non-blocking context. It is
+                 * guaranteed however, that the page involved and its object
+                 * are pinned in memory (and, hence, calling cl_page_put() is
+                 * safe).
+                 *
+                 * \see cl_page_completion()
+                 */
+                void (*cpo_completion)(const struct lu_env *env,
+                                       const struct cl_page_slice *slice,
+                                       int ioret);
+                /**
+                 * Called when cached page is about to be added to the
+                 * cl_req as a part of req formation.
+                 *
+                 * \return    0   : proceed with this page;
+                 * \return -EAGAIN: skip this page;
+                 * \return -ve    : error.
+                 *
+                 * \see cl_page_make_ready()
+                 */
+                int  (*cpo_make_ready)(const struct lu_env *env,
+                                       const struct cl_page_slice *slice);
+                /**
+                 * Announce that this page is to be written out
+                 * opportunistically, that is, page is dirty, it is not
+                 * necessary to start write-out transfer right now, but
+                 * eventually page has to be written out.
+                 *
+                 * Main caller of this is the write path (see
+                 * vvp_io_commit_write()), using this method to build a
+                 * "transfer cache" from which large transfers are then
+                 * constructed by the req-formation engine.
+                 *
+                 * \todo XXX it would make sense to add page-age tracking
+                 * semantics here, and to oblige the req-formation engine to
+                 * send the page out before it grows too old.
+                 *
+                 * \see cl_page_cache_add()
+                 */
+                int  (*cpo_cache_add)(const struct lu_env *env,
+                                      const struct cl_page_slice *slice,
+                                      struct cl_io *io);
+        } io[CRT_NR];
+        /**
+         * Tell transfer engine that only [from, to] part of a page should be
+         * transmitted.
+         *
+         * This is used for immediate transfers.
+         *
+         * \todo XXX this is not very good interface. It would be much better
+         * if all transfer parameters were supplied as arguments to
+         * cl_io_operations::cio_submit() call, but it is not clear how to do
+         * this for page queues.
+         *
+         * \see cl_page_clip()
+         */
+        void (*cpo_clip)(const struct lu_env *env,
+                         const struct cl_page_slice *slice,
+                         int from, int to);
+        /**
+         * \pre  the page was queued for transferring.
+         * \post page is removed from client's pending list, or -EBUSY
+         *       is returned if it has already been in transferring.
+         *
+         * This is one of the few page operations that is:
+         * 0. called from the top level;
+         * 1. executed without the vmpage locked;
+         * 2. required to synchronize, at every layer, execution of its
+         *    ->cpo_cancel() with the completion handlers (osc uses the
+         *    client obd lock for this purpose). Since there are no
+         *    vvp_page_cancel() and lov_page_cancel(), cpo_cancel() is de
+         *    facto protected by the client lock.
+         *
+         * \see osc_page_cancel().
+         */
+        int (*cpo_cancel)(const struct lu_env *env,
+                          const struct cl_page_slice *slice);
+        /** @} transfer */
+};
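+
+/**
+ * Illustration only: a minimal, hypothetical per-layer operations vector,
+ * showing how the request-type-dependent io[] slots are typically filled
+ * with C99 designated initializers. None of the example_page_*() helpers
+ * exist; real instances are vvp_page_ops, lov_page_ops and osc_page_ops.
+ *
+ * \code
+ * static const struct cl_page_operations example_page_ops = {
+ *         .cpo_vmpage      = example_page_vmpage,
+ *         .cpo_is_vmlocked = example_page_is_vmlocked,
+ *         .cpo_fini        = example_page_fini,
+ *         .io = {
+ *                 [CRT_READ] = {
+ *                         .cpo_completion = example_page_read_completion
+ *                 },
+ *                 [CRT_WRITE] = {
+ *                         .cpo_prep       = example_page_write_prep,
+ *                         .cpo_completion = example_page_write_completion
+ *                 }
+ *         }
+ * };
+ * \endcode
+ */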
+
+/**
+ * Helper macro, dumping detailed information about \a page into a log.
+ */
+#define CL_PAGE_DEBUG(mask, env, page, format, ...)                     \
+do {                                                                    \
+        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+                                                                        \
+        if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
+                cl_page_print(env, &__info, lu_cdebug_printer, page);   \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
+        }                                                               \
+} while (0)
+
+/**
+ * Helper macro, dumping shorter information about \a page into a log.
+ */
+#define CL_PAGE_HEADER(mask, env, page, format, ...)                    \
+do {                                                                    \
+        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+                                                                        \
+        if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
+                cl_page_header_print(env, &__info, lu_cdebug_printer, page); \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
+        }                                                               \
+} while (0)
+
+/** @} cl_page */
+
+/** \addtogroup cl_lock cl_lock
+ * @{ */
+/** \struct cl_lock
+ *
+ * Extent locking on the client.
+ *
+ * LAYERING
+ *
+ * The locking model of the new client code is built around
+ *
+ *         struct cl_lock
+ *
+ * data-type representing an extent lock on a regular file. cl_lock is a
+ * layered object (much like cl_object and cl_page), it consists of a header
+ * (struct cl_lock) and a list of layers (struct cl_lock_slice), linked to
+ * cl_lock::cll_layers list through cl_lock_slice::cls_linkage.
+ *
+ * All locks for a given object are linked into cl_object_header::coh_locks
+ * list (protected by cl_object_header::coh_lock_guard spin-lock) through
+ * cl_lock::cll_linkage. Currently this list is not sorted in any way. We
+ * could sort it by starting lock offset, or use an altogether different data
+ * structure like a tree.
+ *
+ * Typical cl_lock consists of the two layers:
+ *
+ *     - vvp_lock (vvp specific data), and
+ *     - lov_lock (lov specific data).
+ *
+ * lov_lock contains an array of sub-locks. Each of these sub-locks is a
+ * normal cl_lock: it has a header (struct cl_lock) and a list of layers:
+ *
+ *     - lovsub_lock, and
+ *     - osc_lock
+ *
+ * Each sub-lock is associated with a cl_object (representing a stripe
+ * sub-object or the file to which the top-level cl_lock is attached), and is
+ * linked into that cl_object::coh_locks. In this respect cl_lock is similar
+ * to cl_object (which at the lov layer also fans out into multiple
+ * sub-objects), and is different from cl_page, which doesn't fan out (there
+ * is usually exactly one osc_page for every vvp_page). We shall call the
+ * vvp-lov portion of the lock a "top-lock" and its lovsub-osc portion a
+ * "sub-lock".
+ *
+ * LIFE CYCLE
+ *
+ * cl_lock is reference counted. When the reference counter drops to 0, the
+ * lock is placed in the cache, except when the lock is in CLS_FREEING state.
+ * A CLS_FREEING lock is destroyed when the last reference is released.
+ * Referencing between the top-lock and its sub-locks is described in the lov
+ * documentation module.
+ *
+ * STATE MACHINE
+ *
+ * Also, cl_lock is a state machine. This requires some clarification. One of
+ * the goals of the client IO re-write was to make the IO path non-blocking,
+ * or at least to make it easier to make it non-blocking in the future. Here
+ * `non-blocking' means that when a system call (read, write, truncate)
+ * reaches a situation where it has to wait for a communication with the
+ * server, it should --instead of waiting-- remember its current state and
+ * switch to some other work. E.g., instead of waiting for a lock enqueue,
+ * the client should proceed doing IO on the next stripe, etc. Obviously this
+ * is a rather radical redesign, and it is not planned to be fully
+ * implemented at this time; instead we are putting some infrastructure in
+ * place that would make it easier to do asynchronous non-blocking IO in the
+ * future. Specifically, where the old locking code goes to sleep (waiting
+ * for enqueue, for example), the new code returns
+ * cl_lock_transition::CLO_WAIT. When the enqueue reply comes, its completion
+ * handler signals that the lock state-machine is ready to transit to the
+ * next state. There is some generic code in cl_lock.c that sleeps, waiting
+ * for these signals. As a result, for users of this cl_lock.c code, it looks
+ * like locking is done in the normal blocking fashion, and at the same time
+ * it is possible to switch to the non-blocking locking (simply by returning
+ * cl_lock_transition::CLO_WAIT from cl_lock.c functions).
+ *
+ * For a description of state machine states and transitions see enum
+ * cl_lock_state.
+ *
+ * There are two ways to restrict a set of states which lock might move to:
+ *
+ *     - placing a "hold" on a lock guarantees that the lock will not be
+ *       moved into cl_lock_state::CLS_FREEING state until the hold is
+ *       released. A hold can only be acquired on a lock that is not in
+ *       cl_lock_state::CLS_FREEING. All holds on a lock are counted in
+ *       cl_lock::cll_holds. A hold protects the lock from cancellation and
+ *       destruction. Requests to cancel and destroy a lock on hold will be
+ *       recorded, but only honored when the last hold on the lock is
+ *       released;
+ *
+ *     - placing a "user" on a lock guarantees that the lock will not leave
+ *       the set of states cl_lock_state::CLS_NEW,
+ *       cl_lock_state::CLS_QUEUING, cl_lock_state::CLS_ENQUEUED and
+ *       cl_lock_state::CLS_HELD, once it enters this set. That is, if a
+ *       user is added onto a lock in a state not from this set, it doesn't
+ *       immediately force the lock to move to this set, but once the lock
+ *       enters this set it will remain there until all users are removed.
+ *       Lock users are counted in cl_lock::cll_users.
+ *
+ *       A user is used to assure that the lock is not canceled or destroyed
+ *       while it is being enqueued, or actively used by some IO.
+ *
+ * Currently, a user always comes with a hold (cl_lock_invariant() checks
+ * that the number of holds is not less than the number of users).
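+ *
+ * For illustration, a simplified fragment of what cl_lock_invariant()
+ * checks about these counters (a sketch only, to be read under
+ * cl_lock::cll_guard; the real invariant verifies considerably more):
+ *
+ * \code
+ * static int example_counters_sane(const struct cl_lock *lock)
+ * {
+ *         return lock->cll_holds >= lock->cll_users &&
+ *                lock->cll_holds >= 0 && lock->cll_users >= 0;
+ * }
+ * \endcode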
+ *
+ * CONCURRENCY
+ *
+ * This is how the lock state-machine operates. struct cl_lock contains a
+ * mutex cl_lock::cll_guard that protects struct fields.
+ *
+ *     - the mutex is taken, and cl_lock::cll_state is examined.
+ *
+ *     - for every state there are possible target states where the lock can
+ *       move into. They are tried in order. Attempts to move into the next
+ *       state are done by the _try() functions in
+ *       cl_lock.c:cl_{enqueue,unlock,wait}_try().
+ *
+ *     - if the transition can be performed immediately, the state is
+ *       changed, and the mutex is released.
+ *
+ *     - if the transition requires blocking, the _try() function returns
+ *       cl_lock_transition::CLO_WAIT. The caller unlocks the mutex and goes
+ *       to sleep, waiting for the possibility of a lock state change. It is
+ *       woken up when some event occurs that makes the lock state change
+ *       possible (e.g., the reception of the reply from the server), and
+ *       repeats the loop.
+ *
+ * Top-lock and sub-lock have separate mutexes and the latter has to be taken
+ * first to avoid dead-lock.
+ *
+ * To see an example of interaction of all these issues, take a look at the
+ * lov_cl.c:lov_lock_enqueue() function. It is called as a part of
+ * cl_enqueue_try(), and tries to advance top-lock to ENQUEUED state, by
+ * advancing state-machines of its sub-locks (lov_lock_enqueue_one()). Note
+ * also, that it uses trylock to grab sub-lock mutex to avoid dead-lock. It
+ * also has to handle CEF_ASYNC enqueue, when sub-locks enqueues have to be
+ * done in parallel, rather than one after another (this is used for glimpse
+ * locks, that cannot dead-lock).
+ *
+ * INTERFACE AND USAGE
+ *
+ * struct cl_lock_operations provide a number of call-backs that are invoked
+ * when events of interest occur. Layers can intercept and handle glimpse,
+ * blocking, cancel ASTs and a reception of the reply from the server.
+ *
+ * One important difference with the old client locking model is that the new
+ * client has a representation for the top-lock, whereas in the old code only
+ * sub-locks existed as real data structures and file-level locks were
+ * represented by "request sets" that were created and destroyed on each and
+ * every lock creation.
+ *
+ * Top-locks are cached, and can be found in the cache by the system calls.
+ * It is possible that a top-lock is in cache, but some of its sub-locks were
+ * canceled and destroyed. In that case the top-lock has to be enqueued again
+ * before it can be used.
+ *
+ * The overall process of locking during an IO operation is as follows:
+ *
+ *     - once parameters for IO are set up in cl_io,
+ *       cl_io_operations::cio_lock() is called on each layer. The
+ *       responsibility of this method is to add the locks needed by a given
+ *       layer into cl_io.ci_lockset.
+ *
+ *     - once locks for all layers are collected, they are sorted to avoid
+ *       dead-locks (cl_io_locks_sort()), and enqueued.
+ *
+ *     - when all locks are acquired, IO is performed;
+ *
+ *     - locks are released into cache.
+ *
+ * Striping introduces major additional complexity into locking. The
+ * fundamental problem is that it is generally unsafe to actively use (hold)
+ * two locks on the different OST servers at the same time, as this
+ * introduces inter-server dependency and can lead to cascading evictions.
+ *
+ * The basic solution is to sub-divide large read/write IOs into smaller
+ * pieces so that no multi-stripe locks are taken (note that this design
+ * abandons POSIX read/write semantics). Such pieces ideally can be executed
+ * concurrently. At the same time, certain types of IO cannot be
+ * sub-divided, without sacrificing correctness. This includes:
+ *
+ *     - O_APPEND write, where [0, EOF] lock has to be taken, to guarantee
+ *       atomicity;
+ *
+ *     - ftruncate(fd, offset), where [offset, EOF] lock has to be taken.
+ *
+ * Also, in the case of read(fd, buf, count) or write(fd, buf, count), where
+ * buf is a part of a memory mapped Lustre file, a lock or locks protecting
+ * buf has to be held together with the usual lock on
+ * [offset, offset + count].
+ *
+ * As multi-stripe locks have to be allowed, it makes sense to cache them, so
+ * that, for example, a sequence of O_APPEND writes can proceed quickly
+ * without going down to the individual stripes to do lock matching. On the
+ * other hand, multi-stripe locks shouldn't be used by normal read/write
+ * calls. To achieve this, every layer can implement the ->clo_fits_into()
+ * method, that is called by the lock matching code (cl_lock_lookup()), and
+ * that can be used to selectively disable matching of certain locks for
+ * certain IOs. For example, the lov layer implements lov_lock_fits_into()
+ * that allows multi-stripe locks to be matched only for truncates and
+ * O_APPEND writes.
+ *
+ * Interaction with DLM
+ *
+ * In the expected setup, cl_lock is ultimately backed up by a collection of
+ * DLM locks (struct ldlm_lock). Association between cl_lock and DLM lock is
+ * implemented in the osc layer, which also matches DLM events (ASTs,
+ * cancellation, etc.) into cl_lock_operation calls. See struct osc_lock for
+ * a more detailed description of the interaction with DLM.
+ */
+
+/**
+ * Lock description.
+ */
+struct cl_lock_descr {
+        /** Object this lock is granted for. */
+        struct cl_object *cld_obj;
+        /** Index of the first page protected by this lock. */
+        pgoff_t           cld_start;
+        /** Index of the last page (inclusive) protected by this lock. */
+        pgoff_t           cld_end;
+        /** Lock mode. */
+        enum cl_lock_mode cld_mode;
+};
+
+#define DDESCR "%s(%d):[%lu, %lu]"
+#define PDESCR(descr)                                                   \
+        cl_lock_mode_name((descr)->cld_mode), (descr)->cld_mode,        \
+        (descr)->cld_start, (descr)->cld_end
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode);
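+
+/**
+ * For illustration, a typical use of DDESCR/PDESCR in a debugging printf
+ * (a sketch; D_DLMTRACE is assumed to be the debug mask used for lock
+ * tracing):
+ *
+ * \code
+ * CDEBUG(D_DLMTRACE, "match: " DDESCR "\n", PDESCR(&lock->cll_descr));
+ * \endcode
+ */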
+
+/**
+ * Lock state-machine states.
+ *
+ * \htmlonly
+ * <pre>
+ *
+ * Possible state transitions:
+ *
+ *              +------------------>NEW
+ *              |                    |
+ *              |                    | cl_enqueue_try()
+ *              |                    |
+ *              |    cl_unuse_try()  V
+ *              |  +--------------QUEUING (*)
+ *              |  |                 |
+ *              |  |                 | cl_enqueue_try()
+ *              |  |                 |
+ *              |  | cl_unuse_try()  V
+ *    sub-lock  |  +-------------ENQUEUED (*)
+ *    canceled  |  |                 |
+ *              |  |                 | cl_wait_try()
+ *              |  |                 |
+ *              |  |                (R)
+ *              |  |                 |
+ *              |  |                 V
+ *              |  |                HELD<---------+
+ *              |  |                 |            |
+ *              |  |                 |            |
+ *              |  |  cl_unuse_try() |            |
+ *              |  |                 |            |
+ *              |  |                 V            | cached
+ *              |  +------------>UNLOCKING (*)    | lock found
+ *              |                    |            |
+ *              |     cl_unuse_try() |            |
+ *              |                    |            |
+ *              |                    |            | cl_use_try()
+ *              |                    V            |
+ *              +------------------CACHED---------+
+ *                                   |
+ *                                  (C)
+ *                                   |
+ *                                   V
+ *                                FREEING
+ *
+ * Legend:
+ *
+ *         In states marked with (*) transition to the same state (i.e., a loop
+ *         in the diagram) is possible.
+ *
+ *         (R) is the point where Receive call-back is invoked: it allows layers
+ *         to handle arrival of lock reply.
+ *
+ *         (C) is the point where Cancellation call-back is invoked.
+ *
+ *         Transition to FREEING state is possible from any other state in the
+ *         diagram in case of unrecoverable error.
+ * </pre>
+ * \endhtmlonly
+ *
+ * These states are for an individual cl_lock object. Top-lock and its
+ * sub-locks can be in different states. Another way to say this is that we
+ * have nested state-machines.
+ *
+ * Separate QUEUING and ENQUEUED states are needed to support non-blocking
+ * operation for locks with multiple sub-locks. Imagine a lock on a file F,
+ * that intersects 3 stripes S0, S1, and S2. To enqueue F the client has to
+ * send an enqueue to S0, wait for its completion, then send an enqueue for
+ * S1, wait for its completion, and at last enqueue a lock for S2, and wait
+ * for its completion. In that case, the top-lock is in QUEUING state while
+ * S0 and S1 are handled, and is in ENQUEUED state after the enqueue to S2
+ * has been sent (note that in this case, sub-locks move from state to
+ * state, and the top-lock remains in the same state).
+ *
+ * A separate UNLOCKING state is needed to maintain an invariant that in the
+ * HELD state the lock is immediately ready for use.
+ */
+enum cl_lock_state {
+        /**
+         * Lock that has not yet been enqueued.
+         */
+        CLS_NEW,
+        /**
+         * Enqueue is in progress, blocking for some intermediate interaction
+         * with the other side.
+         */
+        CLS_QUEUING,
+        /**
+         * Lock is fully enqueued, waiting for server to reply when it is
+         * granted.
+         */
+        CLS_ENQUEUED,
+        /**
+         * Lock granted, actively used by some IO.
+         */
+        CLS_HELD,
+        /**
+         * Lock is in the transition from CLS_HELD to CLS_CACHED. Lock is in
+         * this state only while cl_unuse() is executing against it.
+         */
+        CLS_UNLOCKING,
+        /**
+         * Lock granted, not used.
+         */
+        CLS_CACHED,
+        /**
+         * Lock is being destroyed.
+         */
+        CLS_FREEING,
+        CLS_NR
+};
+
+enum cl_lock_flags {
+        /**
+         * lock has been cancelled. This flag is never cleared once set (by
+         * cl_lock_cancel0()).
+         */
+        CLF_CANCELLED  = 1 << 0,
+        /** cancellation is pending for this lock. */
+        CLF_CANCELPEND = 1 << 1,
+        /** destruction is pending for this lock. */
+        CLF_DOOMED     = 1 << 2,
+        /** State update is pending. */
+        CLF_STATE      = 1 << 3
+};
+
+/**
+ * Lock closure.
+ *
+ * Lock closure is a collection of locks (both top-locks and sub-locks) that
+ * might be updated as a result of an operation on a certain lock (which lock
+ * this is a closure of).
+ *
+ * Closures are needed to guarantee dead-lock freedom in the presence of
+ *
+ *     - nested state-machines (top-lock state-machine composed of sub-lock
+ *       state-machines), and
+ *
+ *     - shared sub-locks.
+ *
+ * Specifically, many operations, such as lock enqueue, wait, unlock,
+ * etc., start from a top-lock, and then operate on the sub-locks of this
+ * top-lock, holding the top-lock mutex. When a sub-lock state changes as a
+ * result of such an operation, this change has to be propagated to all
+ * top-locks that share this sub-lock. Obviously, no natural lock ordering
+ * (e.g., top-to-bottom or bottom-to-top) captures this scenario, so
+ * try-locking has to be used. Lock closure systematizes this try-and-repeat
+ * logic.
+ */
+struct cl_lock_closure {
+        /**
+         * Lock that is mutexed when closure construction is started. When
+         * the closure is in `wait' mode (cl_lock_closure::clc_wait), the
+         * mutex on the origin is released before waiting.
+         */
+        struct cl_lock   *clc_origin;
+        /**
+         * List of enclosed locks, so far. Locks are linked here through
+         * cl_lock::cll_inclosure.
+         */
+        struct list_head  clc_list;
+        /**
+         * True iff closure is in a `wait' mode. This determines what
+         * cl_lock_enclosure() does when a lock L to be added to the closure
+         * is currently mutexed by some other thread.
+         *
+         * If cl_lock_closure::clc_wait is not set, then closure construction
+         * fails with CLO_REPEAT immediately.
+         *
+         * In wait mode, cl_lock_enclosure() waits until the next attempt to
+         * build a closure might succeed. To this end it releases the origin
+         * mutex (cl_lock_closure::clc_origin), which has to be the only lock
+         * mutex owned by the current thread, and then waits on the L mutex
+         * (by grabbing it and immediately releasing), before returning
+         * CLO_REPEAT to the caller.
+         */
+        int               clc_wait;
+        /** Number of locks in the closure. */
+        int               clc_nr;
+};
+
+/**
+ * Layered client lock.
+ */
+struct cl_lock {
+        /** Reference counter. */
+        atomic_t              cll_ref;
+        /** List of slices. Immutable after creation. */
+        struct list_head      cll_layers;
+        /**
+         * Linkage into cl_lock::cll_descr::cld_obj::coh_locks list. Protected
+         * by cl_lock::cll_descr::cld_obj::coh_lock_guard.
+         */
+        struct list_head      cll_linkage;
+        /**
+         * Parameters of this lock. Protected by
+         * cl_lock::cll_descr::cld_obj::coh_lock_guard nested within
+         * cl_lock::cll_guard. Modified only on lock creation and in
+         * cl_lock_modify().
+         */
+        struct cl_lock_descr  cll_descr;
+        /** Protected by cl_lock::cll_guard. */
+        enum cl_lock_state    cll_state;
+        /** signals state changes. */
+        cfs_waitq_t           cll_wq;
+        /**
+         * Recursive lock, most fields in cl_lock{} are protected by this.
+         *
+         * Locking rules: this mutex is never held across network
+         * communication, except when lock is being canceled.
+         *
+         * Lock ordering: a mutex of a sub-lock is taken first, then a mutex
+         * on a top-lock. Other direction is implemented through a
+         * try-lock-repeat loop. Mutexes of unrelated locks can be taken only
+         * by try-locking.
+         *
+         * \see osc_lock_enqueue_wait(), lov_lock_cancel(), lov_sublock_wait().
+         */
+        struct mutex          cll_guard;
+        cfs_task_t           *cll_guarder;
+        int                   cll_depth;
+
+        int                   cll_error;
+        /**
+         * Number of holds on a lock. A hold prevents a lock from being
+         * canceled and destroyed. Protected by cl_lock::cll_guard.
+         *
+         * \see cl_lock_hold(), cl_lock_unhold(), cl_lock_release()
+         */
+        int                   cll_holds;
+        /**
+         * Number of lock users. Valid in cl_lock_state::CLS_HELD state
+         * only. Lock user pins lock in CLS_HELD state. Protected by
+         * cl_lock::cll_guard.
+         *
+         * \see cl_wait(), cl_unuse().
+         */
+        int                   cll_users;
+        /**
+         * Flag bit-mask. Values from enum cl_lock_flags. Updates are
+         * protected by cl_lock::cll_guard.
+         */
+        unsigned long         cll_flags;
+        /**
+         * A linkage into a list of locks in a closure.
+         *
+         * \see cl_lock_closure
+         */
+        struct list_head      cll_inclosure;
+        /**
+         * A list of references to this lock, for debugging.
+         */
+        struct lu_ref         cll_reference;
+        /**
+         * A list of holds on this lock, for debugging.
+         */
+        struct lu_ref         cll_holders;
+        /**
+         * A reference for cl_lock::cll_descr::cld_obj. For debugging.
+         */
+        struct lu_ref_link   *cll_obj_ref;
+#ifdef CONFIG_LOCKDEP
+        /* "dep_map" name is assumed by lockdep.h macros. */
+        struct lockdep_map    dep_map;
+#endif
+};
+
+/**
+ * Per-layer part of cl_lock
+ *
+ * \see ccc_lock, lov_lock, lovsub_lock, osc_lock
+ */
+struct cl_lock_slice {
+        struct cl_lock                  *cls_lock;
+        /** Object slice corresponding to this lock slice. Immutable after
+         * creation. */
+        struct cl_object                *cls_obj;
+        const struct cl_lock_operations *cls_ops;
+        /** Linkage into cl_lock::cll_layers. Immutable after creation. */
+        struct list_head                 cls_linkage;
+};
+
+/**
+ * Possible (non-error) return values of ->clo_{enqueue,wait,unlock}().
+ *
+ * NOTE: lov_subresult() depends on ordering here.
+ */
+enum cl_lock_transition {
+        /** operation cannot be completed immediately. Wait for state change. */
+        CLO_WAIT   = 1,
+        /** operation had to release lock mutex, restart. */
+        CLO_REPEAT = 2
+};
+
+/**
+ *
+ * \see vvp_lock_ops, lov_lock_ops, lovsub_lock_ops, osc_lock_ops
+ */
+struct cl_lock_operations {
+        /**
+         * \name statemachine
+         *
+         * State machine transitions. These methods are called to transfer
+         * a lock from one state to another, as described in the commentary
+         * above enum #cl_lock_state.
+         *
+         * \retval 0          this layer has nothing more to do before the
+         *                    transition to the target state happens;
+         *
+         * \retval CLO_REPEAT method had to release and re-acquire cl_lock
+         *                    mutex, repeat invocation of transition method
+         *                    across all layers;
+         *
+         * \retval CLO_WAIT   this layer cannot move to the target state
+         *                    immediately, as it has to wait for certain event
+         *                    (e.g., the communication with the server). It
+         *                    is guaranteed, that when the state transfer
+         *                    becomes possible, cl_lock::cll_wq wait-queue
+         *                    is signaled. Caller can wait for this event by
+         *                    calling cl_lock_state_wait();
+         *
+         * \retval -ve        failure, abort state transition, move the lock
+         *                    into cl_lock_state::CLS_FREEING state, and set
+         *                    cl_lock::cll_error.
+         *
+         * Once all layers voted to agree to transition (by returning 0), lock
+         * is moved into corresponding target state. All state transition
+         * methods are optional.
+         */
+        /** @{ */
+        /**
+         * Attempts to enqueue the lock. Called top-to-bottom.
+         *
+         * \see ccc_lock_enqueue(), lov_lock_enqueue(), lovsub_lock_enqueue(),
+         * \see osc_lock_enqueue()
+         */
+        int  (*clo_enqueue)(const struct lu_env *env,
+                            const struct cl_lock_slice *slice,
+                            struct cl_io *io, __u32 enqflags);
+        /**
+         * Attempts to wait for enqueue result. Called top-to-bottom.
+         *
+         * \see ccc_lock_wait(), lov_lock_wait(), osc_lock_wait()
+         */
+        int  (*clo_wait)(const struct lu_env *env,
+                         const struct cl_lock_slice *slice);
+        /**
+         * Attempts to unlock the lock. Called bottom-to-top. In addition to
+         * the usual return values of lock state-machine methods, this can
+         * return -ESTALE to indicate that the lock cannot be returned to the
+         * cache, and has to be re-initialized.
+         *
+         * \see ccc_lock_unlock(), lov_lock_unlock(), osc_lock_unlock()
+         */
+        int  (*clo_unuse)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice);
+        /**
+         * Notifies layer that a cached lock is starting to be used.
+         *
+         * \pre lock->cll_state == CLS_CACHED
+         *
+         * \see lov_lock_use(), osc_lock_use()
+         */
+        int  (*clo_use)(const struct lu_env *env,
+                        const struct cl_lock_slice *slice);
+        /** @} statemachine */
+        /**
+         * A method invoked when lock state is changed (as a result of state
+         * transition). This is used, for example, to track when the state of
+         * a sub-lock changes, to propagate this change to the corresponding
+         * top-lock. Optional.
+         *
+         * \see lovsub_lock_state()
+         */
+        void (*clo_state)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          enum cl_lock_state st);
+        /**
+         * Returns true iff the given lock is suitable for the given io, the
+         * idea being that there are certain "unsafe" locks, e.g., ones
+         * acquired for O_APPEND writes, that we don't want to re-use for a
+         * normal write, to avoid the danger of cascading
+         * evictions. Optional. Runs under cl_object_header::coh_lock_guard.
+         *
+         * XXX this should take more information about the lock needed by
+         * io. Probably a lock description or something similar.
+         *
+         * \see lov_fits_into()
+         */
+        int (*clo_fits_into)(const struct lu_env *env,
+                             const struct cl_lock_slice *slice,
+                             const struct cl_lock_descr *need,
+                             const struct cl_io *io);
+        /**
+         * \name ast
+         * Asynchronous System Traps. All of them are optional, all are
+         * executed bottom-to-top.
+         */
+        /** @{ */
+
+        /**
+         * Cancellation callback. Cancel a lock voluntarily, or at the
+         * request of the server.
+         */
+        void (*clo_cancel)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice);
+        /**
+         * Lock weighting ast. Executed to estimate how precious this lock
+         * is. The sum of results across all layers is used to determine
+         * whether the lock is worth keeping in cache given present memory
+         * usage.
+         *
+         * \see osc_lock_weigh(), vvp_lock_weigh(), lovsub_lock_weigh().
+         */
+        unsigned long (*clo_weigh)(const struct lu_env *env,
+                                   const struct cl_lock_slice *slice);
+        /** @} ast */
+
+        /**
+         * \see lovsub_lock_closure()
+         */
+        int (*clo_closure)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice,
+                           struct cl_lock_closure *closure);
+        /**
+         * Executed top-to-bottom when lock description changes (e.g., as a
+         * result of server granting more generous lock than was requested).
+         *
+         * \see lovsub_lock_modify()
+         */
+        int (*clo_modify)(const struct lu_env *env,
+                          const struct cl_lock_slice *slice,
+                          const struct cl_lock_descr *updated);
+        /**
+         * Notifies layers (bottom-to-top) that lock is going to be
+         * destroyed. Responsibility of layers is to prevent new references on
+         * this lock from being acquired once this method returns.
+         *
+         * This can be called multiple times due to races.
+         *
+         * \see cl_lock_delete()
+         * \see osc_lock_delete(), lovsub_lock_delete()
+         */
+        void (*clo_delete)(const struct lu_env *env,
+                           const struct cl_lock_slice *slice);
+        /**
+         * Destructor. Frees resources and the slice.
+         *
+         * \see ccc_lock_fini(), lov_lock_fini(), lovsub_lock_fini(),
+         * \see osc_lock_fini()
+         */
+        void (*clo_fini)(const struct lu_env *env,
+                         struct cl_lock_slice *slice);
+        /**
+         * Optional debugging helper. Prints given lock slice.
+         */
+        int (*clo_print)(const struct lu_env *env,
+                         void *cookie, lu_printer_t p,
+                         const struct cl_lock_slice *slice);
+};
+
+#define CL_LOCK_DEBUG(mask, env, lock, format, ...)                     \
+do {                                                                    \
+        static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask);              \
+                                                                        \
+        if (cdebug_show(mask, DEBUG_SUBSYSTEM)) {                       \
+                cl_lock_print(env, &__info, lu_cdebug_printer, lock);   \
+                CDEBUG(mask, format , ## __VA_ARGS__);                  \
+        }                                                               \
+} while (0)
+
+/** @} cl_lock */
+
+/** \addtogroup cl_page_list cl_page_list
+ * Page list used to perform collective operations on a group of pages.
+ *
+ * Pages are added to the list one by one. cl_page_list acquires a reference
+ * for every page in it. Page list is used to perform collective operations
+ * on pages:
+ *
+ *     - submit pages for an immediate transfer,
+ *
+ *     - own pages on behalf of certain io (waiting for each page in turn),
+ *
+ *     - discard pages.
+ *
+ * When list is finalized, it releases references on all pages it still has.
+ *
+ * \todo XXX concurrency control.
+ *
+ * @{
+ */
+struct cl_page_list {
+        unsigned         pl_nr;
+        struct list_head pl_pages;
+        cfs_task_t      *pl_owner;
+};
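+
+/**
+ * Illustration only: the intended qin/qout protocol for an immediate
+ * transfer, as a rough sketch with error handling elided. cl_io_submit_rw()
+ * is referenced by the cl_req documentation below; cl_2queue_init() and
+ * cl_page_list_add() are assumed helper names. After submission, pages
+ * accepted by the req-formation engine have been moved to c2_qout, while
+ * pages still on c2_qin were not submitted.
+ *
+ * \code
+ * struct cl_2queue *queue = &io->ci_queue;
+ *
+ * cl_2queue_init(queue);
+ * cl_page_list_add(&queue->c2_qin, page);
+ * rc = cl_io_submit_rw(env, io, CRT_READ, queue);
+ * \endcode
+ */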
+
+/** \addtogroup cl_page_list cl_page_list
+ * A 2-queue of pages. A convenience data-type for common use case, 2-queue
+ * contains an incoming page list and an outgoing page list.
+ */
+struct cl_2queue {
+        struct cl_page_list c2_qin;
+        struct cl_page_list c2_qout;
+};
+
+/** @} cl_page_list */
+
+/** \addtogroup cl_io cl_io
+ * @{ */
+/** \struct cl_io
+ * I/O
+ *
+ * cl_io represents a high level I/O activity like
+ * read(2)/write(2)/truncate(2) system call, or cancellation of an extent
+ * lock.
+ *
+ * cl_io is a layered object, much like cl_{object,page,lock} but with one
+ * important distinction. We want to minimize the number of calls to the
+ * allocator in the fast path, e.g., in the case of read(2) when everything
+ * is cached: the client already owns the lock over the region being read,
+ * and data are cached due to read-ahead. To avoid allocation of cl_io layers
+ * in such situations, per-layer io state is stored in the session,
+ * associated with the io, see struct {vvp,lov,osc}_io for example. Session
+ * allocation is amortized by using free-lists, see cl_env_get().
+ *
+ * There is a small predefined number of possible io types, enumerated in
+ * enum cl_io_type.
+ *
+ * cl_io is a state machine, that can be advanced concurrently by multiple
+ * threads. It is up to these threads to control the concurrency and,
+ * specifically, to detect when io is done, and its state can be safely
+ * released.
+ *
+ * For read/write io the overall execution plan is as follows:
+ *
+ *     (0) initialize io state through all layers;
+ *
+ *     (1) loop: prepare chunk of work to do
+ *
+ *     (2) call all layers to collect locks they need to process current
+ *         chunk
+ *
+ *     (3) sort all locks to avoid dead-locks, and acquire them
+ *
+ *     (4) process the chunk: call per-page methods
+ *         (cl_io_operations::cio_read_page() for read,
+ *         cl_io_operations::cio_prepare_write(),
+ *         cl_io_operations::cio_commit_write() for write)
+ *
+ *     (5) release locks
+ *
+ *     (6) repeat loop.
+ *
+ * To implement the "parallel IO mode", the lov layer creates sub-io's
+ * (lazily to address the allocation efficiency issues mentioned above), and
+ * returns with a special error condition from the per-page method when the
+ * current sub-io has to block. This causes the io loop to be repeated, and
+ * lov switches to the next sub-io in its
+ * cl_io_operations::cio_iter_init() implementation.
+ */
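+
+/**
+ * Illustration only: the read/write execution plan above, expressed as the
+ * kind of loop that cl_io_loop() runs. The cl_io_iter_init(), cl_io_lock(),
+ * cl_io_start(), cl_io_end(), cl_io_unlock() and cl_io_iter_fini() wrapper
+ * names are assumptions (top-level helpers invoking the corresponding
+ * cl_io_operations methods on every layer), not quotations from cl_io.c.
+ *
+ * \code
+ * static int example_io_loop(const struct lu_env *env, struct cl_io *io)
+ * {
+ *         int result;
+ *
+ *         do {
+ *                 result = cl_io_iter_init(env, io);
+ *                 if (result == 0) {
+ *                         result = cl_io_lock(env, io);
+ *                         if (result == 0) {
+ *                                 result = cl_io_start(env, io);
+ *                                 cl_io_end(env, io);
+ *                                 cl_io_unlock(env, io);
+ *                         }
+ *                 }
+ *                 cl_io_iter_fini(env, io);
+ *         } while (result == 0 && io->ci_continue);
+ *         return result;
+ * }
+ * \endcode
+ */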
+
+/** IO types */
+enum cl_io_type {
+        /** read system call */
+        CIT_READ,
+        /** write system call */
+        CIT_WRITE,
+        /** truncate system call */
+        CIT_TRUNC,
+        /**
+         * page fault handling
+         */
+        CIT_FAULT,
+        /**
+         * Miscellaneous io. This is used for occasional io activity that
+         * doesn't fit into other types. Currently this is used for:
+         *
+         *     - cancellation of an extent lock. This io exists as a context
+         *       to write dirty pages from under the lock being canceled back
+         *       to the server;
+         *
+         *     - VM induced page write-out. An io context for writing a page
+         *       out for memory cleansing;
+         *
+         *     - glimpse. An io context to acquire glimpse lock.
+         *
+         * CIT_MISC io is used simply as a context in which locks and pages
+         * are manipulated. Such io has no internal "process", that is,
+         * cl_io_loop() is never called for it.
+         */
+        CIT_MISC,
+        CIT_OP_NR
+};
+
+/**
+ * States of cl_io state machine
+ */
+enum cl_io_state {
+        /** Not initialized. */
+        CIS_ZERO,
+        /** Initialized. */
+        CIS_INIT,
+        /** IO iteration started. */
+        CIS_IT_STARTED,
+        /** Locks taken. */
+        CIS_LOCKED,
+        /** Actual IO is in progress. */
+        CIS_IO_GOING,
+        /** IO for the current iteration finished. */
+        CIS_IO_FINISHED,
+        /** Locks released. */
+        CIS_UNLOCKED,
+        /** Iteration completed. */
+        CIS_IT_ENDED,
+        /** cl_io finalized. */
+        CIS_FINI
+};
+
+/**
+ * IO state private for a layer.
+ *
+ * This is usually embedded into layer session data, rather than allocated
+ * dynamically.
+ *
+ * \see vvp_io, lov_io, osc_io, ccc_io
+ */
+struct cl_io_slice {
+        struct cl_io                  *cis_io;
+        /** corresponding object slice. Immutable after creation. */
+        struct cl_object              *cis_obj;
+        /** io operations. Immutable after creation. */
+        const struct cl_io_operations *cis_iop;
+        /**
+         * linkage into a list of all slices for a given cl_io, hanging off
+         * cl_io::ci_layers. Immutable after creation.
+         */
+        struct list_head               cis_linkage;
+};
+
+
+/**
+ * Per-layer io operations.
+ * \see vvp_io_ops, lov_io_ops, lovsub_io_ops, osc_io_ops
+ */
+struct cl_io_operations {
+        /**
+         * Vector of io state transition methods for every io type.
+         *
+         * \see cl_page_operations::io
+         */
+        struct {
+                /**
+                 * Prepare io iteration at a given layer.
+                 *
+                 * Called top-to-bottom at the beginning of each iteration of
+                 * "io loop" (if it makes sense for this type of io). Here
+                 * layer selects what work it will do during this iteration.
+                 *
+                 * \see cl_io_operations::cio_iter_fini()
+                 */
+                int (*cio_iter_init) (const struct lu_env *env,
+                                      const struct cl_io_slice *slice);
+                /**
+                 * Finalize io iteration.
+                 *
+                 * Called bottom-to-top at the end of each iteration of "io
+                 * loop". Here layers can decide whether IO has to be
+                 * continued.
+                 *
+                 * \see cl_io_operations::cio_iter_init()
+                 */
+                void (*cio_iter_fini) (const struct lu_env *env,
+                                       const struct cl_io_slice *slice);
+                /**
+                 * Collect locks for the current iteration of io.
+                 *
+                 * Called top-to-bottom to collect all locks necessary for
+                 * this iteration. This method shouldn't actually enqueue
+                 * anything, instead it should post a lock through
+                 * cl_io_lock_add(). Once all locks are collected, they are
+                 * sorted and enqueued in the proper order.
+                 */
+                int  (*cio_lock) (const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+                /**
+                 * Finalize unlocking.
+                 *
+                 * Called bottom-to-top to finish layer specific unlocking
+                 * functionality, after generic code released all locks
+                 * acquired by cl_io_operations::cio_lock().
+                 */
+                void  (*cio_unlock)(const struct lu_env *env,
+                                    const struct cl_io_slice *slice);
+                /**
+                 * Start io iteration.
+                 *
+                 * Once all locks are acquired, called top-to-bottom to
+                 * commence actual IO. In the current implementation,
+                 * top-level vvp_io_{read,write}_start() does all the work
+                 * synchronously by calling generic_file_*(), so other layers
+                 * are called when everything is done.
+                 */
+                int  (*cio_start)(const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+                /**
+                 * Called top-to-bottom at the end of io loop. Here layer
+                 * might wait for an unfinished asynchronous io.
+                 */
+                void (*cio_end)  (const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+                /**
+                 * Called bottom-to-top to notify layers that read/write IO
+                 * iteration finished, with \a nob bytes transferred.
+                 */
+                void (*cio_advance)(const struct lu_env *env,
+                                    const struct cl_io_slice *slice,
+                                    size_t nob);
+                /**
+                 * Called once per io, bottom-to-top to release io resources.
+                 */
+                void (*cio_fini) (const struct lu_env *env,
+                                  const struct cl_io_slice *slice);
+        } op[CIT_OP_NR];
+        struct {
+                /**
+                 * Submit pages from \a queue->c2_qin for IO, and move
+                 * successfully submitted pages into \a queue->c2_qout.
+                 * Return non-zero if it failed to submit even a single page.
+                 * If submission failed after some pages were moved into \a
+                 * queue->c2_qout, the completion callback with a non-zero
+                 * ioret is executed on them.
+                 */
+                int  (*cio_submit)(const struct lu_env *env,
+                                   const struct cl_io_slice *slice,
+                                   enum cl_req_type crt,
+                                   struct cl_2queue *queue);
+        } req_op[CRT_NR];
+        /**
+         * Read missing page.
+         *
+         * Called by a top-level cl_io_operations::op[CIT_READ]::cio_start()
+         * method, when it hits a not-up-to-date page in the range. Optional.
+         *
+         * \pre io->ci_type == CIT_READ
+         */
+        int (*cio_read_page)(const struct lu_env *env,
+                             const struct cl_io_slice *slice,
+                             const struct cl_page_slice *page);
+        /**
+         * Prepare write of a \a page. Called bottom-to-top by a top-level
+         * cl_io_operations::op[CIT_WRITE]::cio_start() to prepare the page
+         * to receive data from the user-level buffer.
+         *
+         * \pre io->ci_type == CIT_WRITE
+         *
+         * \see vvp_io_prepare_write(), lov_io_prepare_write(),
+         * osc_io_prepare_write().
+         */
+        int (*cio_prepare_write)(const struct lu_env *env,
+                                 const struct cl_io_slice *slice,
+                                 const struct cl_page_slice *page,
+                                 unsigned from, unsigned to);
+        /**
+         *
+         * \pre io->ci_type == CIT_WRITE
+         *
+         * \see vvp_io_commit_write(), lov_io_commit_write(),
+         * osc_io_commit_write().
+         */
+        int (*cio_commit_write)(const struct lu_env *env,
+                                const struct cl_io_slice *slice,
+                                const struct cl_page_slice *page,
+                                unsigned from, unsigned to);
+        /**
+         * Optional debugging helper. Print given io slice.
+         */
+        int (*cio_print)(const struct lu_env *env, void *cookie,
+                         lu_printer_t p, const struct cl_io_slice *slice);
+};
+
+/**
+ * Flags to lock enqueue procedure.
+ * \ingroup cl_lock
+ */
+enum cl_enq_flags {
+        /**
+         * instruct server to not block, if conflicting lock is found.
+         * Instead -EWOULDBLOCK is returned immediately.
+         */
+        CEF_NONBLOCK     = 0x00000001,
+        /**
+         * take lock asynchronously (out of order), as it cannot
+         * deadlock. This is for LDLM_FL_HAS_INTENT locks used for glimpsing.
+         */
+        CEF_ASYNC        = 0x00000002,
+        /**
+         * tell the server to instruct (through a flag in the blocking ast)
+         * an owner of the conflicting lock, that it can drop dirty pages
+         * protected by this lock, without sending them to the server.
+         */
+        CEF_DISCARD_DATA = 0x00000004
+};
+
+/**
+ * Link between lock and io. Intermediate structure is needed, because the
+ * same lock can be part of multiple io's simultaneously.
+ */
+struct cl_io_lock_link {
+        /** linkage into one of cl_lockset lists. */
+        struct list_head     cill_linkage;
+        struct cl_lock_descr cill_descr;
+        struct cl_lock      *cill_lock;
+        /**
+         * flags to enqueue lock for this IO. A combination of bit-flags from
+         * enum cl_enq_flags.
+         */
+        __u32                cill_enq_flags;
+        /** optional destructor */
+        void               (*cill_fini)(const struct lu_env *env,
+                                        struct cl_io_lock_link *link);
+};
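+
+/**
+ * Illustration only: how a layer's cio_lock() method might post a
+ * whole-file read lock requirement through cl_io_lock_add(), as described
+ * in the cio_lock() documentation above. The function name and the exact
+ * prototype of cl_io_lock_add() are assumptions; the caller is presumed to
+ * provide storage for \a link.
+ *
+ * \code
+ * static int example_io_lock(const struct lu_env *env,
+ *                            const struct cl_io_slice *ios,
+ *                            struct cl_io_lock_link *link)
+ * {
+ *         link->cill_descr.cld_obj   = ios->cis_obj;
+ *         link->cill_descr.cld_mode  = CLM_READ;
+ *         link->cill_descr.cld_start = 0;
+ *         link->cill_descr.cld_end   = CL_PAGE_EOF;
+ *         link->cill_enq_flags       = 0;
+ *         link->cill_fini            = NULL;
+ *         return cl_io_lock_add(env, ios->cis_io, link);
+ * }
+ * \endcode
+ */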
Generally speaking, the client tries to avoid holding multiple locks when + * possible, because + * + * - holding extent locks over multiple ost's introduces the danger of + * "cascading timeouts"; + * + * - holding multiple locks over the same ost is still deadlock-prone, + * see comment in osc_lock_enqueue(), + * + * but there are certain situations where this is unavoidable: + * + * - O_APPEND writes have to take [0, EOF] lock for correctness; + * + * - truncate has to take [new-size, EOF] lock for correctness; + * + * - SNS has to take locks across full stripe for correctness; + * + * - when a user-level buffer, supplied to {read,write}(file0), is a part + * of a memory-mapped lustre file, the client has to take dlm locks on + * file0 and on all files that back up the buffer (or on the part of the + * buffer that is being processed in the current chunk); in any case, + * there are situations where at least 2 locks are necessary. + * + * In such cases we at least try to take locks in the same consistent + * order. To this end, all locks are first collected, then sorted, and then + * enqueued. + */ +struct cl_lockset { + /** locks to be acquired. */ + struct list_head cls_todo; + /** locks currently being processed. */ + struct list_head cls_curr; + /** locks acquired. */ + struct list_head cls_done; +}; + +/** + * Lock requirements (demand) for IO. It should be cl_io_lock_req, + * but 'req' is always to be thought of as 'request' :-) + */ +enum cl_io_lock_dmd { + /** Always lock data (e.g., O_APPEND). */ + CILR_MANDATORY = 0, + /** Layers are free to decide between local and global locking. */ + CILR_MAYBE, + /** Never lock: there is no cache (e.g., liblustre). */ + CILR_NEVER +}; + +struct cl_io_rw_common { + loff_t crw_pos; + size_t crw_count; + int crw_nonblock; +}; + +/** + * State for io. + * + * cl_io is shared by all threads participating in this IO (in the current + * implementation only one thread advances IO, but the parallel IO design and + * concurrent copy_*_user() require multiple threads acting on the same IO). + * It is up to these threads to serialize their activities, including updates + * to mutable cl_io fields. + */ +struct cl_io { + /** type of this IO. Immutable after creation. */ + enum cl_io_type ci_type; + /** current state of cl_io state machine. */ + enum cl_io_state ci_state; + /** main object this io is against. Immutable after creation. */ + struct cl_object *ci_obj; + /** + * Upper layer io, of which this io is a part. Immutable after + * creation. + */ + struct cl_io *ci_parent; + /** List of slices. Immutable after creation. */ + struct list_head ci_layers; + /** list of locks (to be) acquired by this io. */ + struct cl_lockset ci_lockset; + /** lock requirements; this is just a hint for sublayers. */ + enum cl_io_lock_dmd ci_lockreq; + union { + struct cl_rd_io { + struct cl_io_rw_common rd; + int rd_is_sendfile; + } ci_rd; + struct cl_wr_io { + struct cl_io_rw_common wr; + int wr_append; + } ci_wr; + struct cl_io_rw_common ci_rw; + struct cl_truncate_io { + /** new size to which file is truncated */ + size_t tr_size; + struct obd_capa *tr_capa; + } ci_truncate; + struct cl_fault_io { + /** page index within file. */ + pgoff_t ft_index; + /** number of valid bytes on a faulted page. */ + int ft_nob; + /** writable page? */ + int ft_writable; + /** page of an executable? */ + int ft_executable; + /** resulting page */ + struct cl_page *ft_page; + } ci_fault; + } u; + struct cl_2queue ci_queue; + size_t ci_nob; + int ci_result; + int ci_continue; + /** + * Number of pages owned by this IO. For invariant checking. + */ + unsigned ci_owned_nr; +}; + +/** @} cl_io */
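+
+/*
+ * A minimal sketch (kept as a comment, not part of the interface) of a
+ * layer's cl_io_operations::cio_lock() method: locks are only collected
+ * through cl_io_lock_add() (declared later in this header); generic code
+ * then sorts and enqueues them. See ccc_io_one_lock() in lclient.h for a
+ * real instance of this pattern. The layer "foo", its link allocation, and
+ * the whole-file extent are assumptions made for brevity:
+ *
+ *	static int foo_io_lock(const struct lu_env *env,
+ *			       const struct cl_io_slice *ios)
+ *	{
+ *		struct cl_io *io = ios->cis_io;
+ *		struct cl_io_lock_link *link = foo_lock_link_alloc();
+ *
+ *		if (link == NULL)
+ *			return -ENOMEM;
+ *		link->cill_descr.cld_obj   = io->ci_obj;
+ *		link->cill_descr.cld_mode  = CLM_WRITE;
+ *		link->cill_descr.cld_start = 0;
+ *		link->cill_descr.cld_end   = ~0;
+ *		link->cill_enq_flags       = 0;
+ *		return cl_io_lock_add(env, io, link);
+ *	}
+ */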
+ +/** \addtogroup cl_req cl_req + * @{ */ +/** \struct cl_req + * Transfer. + * + * There are two possible modes of transfer initiation on the client: + * + * - immediate transfer: this is started when a high level io wants a page + * or a collection of pages to be transferred right away. Examples: + * read-ahead, synchronous read in the case of non-page aligned write, + * page write-out as a part of extent lock cancellation, page write-out + * as a part of memory cleansing. Immediate transfer can be both + * cl_req_type::CRT_READ and cl_req_type::CRT_WRITE; + * + * - opportunistic transfer (cl_req_type::CRT_WRITE only), that happens + * when io wants to transfer a page to the server some time later, when + * it can be done efficiently. Example: pages dirtied by the write(2) + * path. + * + * In any case, transfer takes place in the form of a cl_req, which is a + * representation for a network RPC. + * + * Pages queued for an opportunistic transfer are cached until it is decided + * that an efficient RPC can be composed of them. This decision is made by "a + * req-formation engine", currently implemented as a part of the osc + * layer. Req-formation depends on many factors: the size of the resulting + * RPC, whether or not multi-object RPCs are supported by the server, + * max-rpc-in-flight limitations, size of the dirty cache, etc. + * + * For immediate transfer the io submits a cl_page_list, that the + * req-formation engine slices into cl_req's, possibly adding cached pages to + * some of the resulting req's. + * + * Whenever a page from cl_page_list is added to a newly constructed req, its + * cl_page_operations::cpo_prep() layer methods are called. At that moment, + * page state is atomically changed from cl_page_state::CPS_OWNED to + * cl_page_state::CPS_PAGEOUT or cl_page_state::CPS_PAGEIN, cl_page::cp_owner + * is zeroed, and cl_page::cp_req is set to the + * req. cl_page_operations::cpo_prep() method at the particular layer might + * return -EALREADY to indicate that it does not need to submit this page + * at all. This is possible, for example, if a page submitted for read + * became up-to-date in the meantime, or if a page submitted for write + * does not have the dirty bit set. \see cl_io_submit_rw() + * + * Whenever a cached page is added to a newly constructed req, its + * cl_page_operations::cpo_make_ready() layer methods are called. At that + * moment, page state is atomically changed from cl_page_state::CPS_CACHED to + * cl_page_state::CPS_PAGEOUT, and cl_page::cp_req is set to the + * req. cl_page_operations::cpo_make_ready() method at the particular layer + * might return -EAGAIN to indicate that this page is not eligible for the + * transfer right now. + * + * FUTURE + * + * The plan is to divide transfers into "priority bands" (indicated when + * submitting a cl_page_list, and when queuing a page for opportunistic + * transfer) and to allow gluing of cached pages to immediate transfers only + * within a single band. This would make high priority transfers (like lock + * cancellation or memory pressure induced write-out) really high priority. + * + */
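+
+/*
+ * A minimal sketch of an immediate synchronous transfer built from the
+ * interfaces declared later in this header (error handling and the
+ * layer-specific wiring of the per-page cl_sync_io pointer are omitted;
+ * "page" is assumed to be a single page already owned by "io"):
+ *
+ *	struct cl_2queue  *queue = &io->ci_queue;
+ *	struct cl_sync_io  anchor;
+ *
+ *	cl_2queue_init_page(queue, page);
+ *	cl_sync_io_init(&anchor, 1);
+ *	if (cl_io_submit_rw(env, io, CRT_READ, queue) == 0)
+ *		cl_sync_io_wait(env, io, &queue->c2_qout, &anchor);
+ *	cl_2queue_fini(env, queue);
+ */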
+ +/** + * Per-transfer attributes. + */ +struct cl_req_attr { + /** Generic attributes for the server consumption. */ + struct obdo *cra_oa; + /** Capability. */ + struct obd_capa *cra_capa; +}; + +/** + * Transfer request operations definable at every layer. + * + * Concurrency: transfer formation engine synchronizes calls to all transfer + * methods. + */ +struct cl_req_operations { + /** + * Invoked top-to-bottom by cl_req_prep() when transfer formation is + * complete (all pages are added). + * + * \see osc_req_prep() + */ + int (*cro_prep)(const struct lu_env *env, + const struct cl_req_slice *slice); + /** + * Called top-to-bottom to fill in \a oa fields. This is called twice + * with different flags, see bug 10150 and osc_build_req(). + * + * \param obj an object from cl_req whose attributes are to be set in + * \a oa. + * + * \param oa struct obdo where attributes are placed + * + * \param flags \a oa fields to be filled. + */ + void (*cro_attr_set)(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags); + /** + * Called top-to-bottom from cl_req_completion() to notify layers that + * transfer completed. Has to free all state allocated by + * cl_device_operations::cdo_req_init(). + */ + void (*cro_completion)(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +}; + +/** + * A per-object state that a (potentially multi-object) transfer request keeps. + */ +struct cl_req_obj { + /** object itself */ + struct cl_object *ro_obj; + /** reference to cl_req_obj::ro_obj. For debugging. */ + struct lu_ref_link *ro_obj_ref; + /* something else? Number of pages for a given object? */ +}; + +/** + * Transfer request. + * + * Transfer requests are not reference counted, because the IO sub-system owns + * them exclusively and knows when to free them. + * + * Life cycle. + * + * cl_req is created by cl_req_alloc() that calls + * cl_device_operations::cdo_req_init() device methods to allocate per-req + * state in every layer. + * + * Then pages are added (cl_req_page_add()); the req keeps track of all + * objects it contains pages for. + * + * Once all pages have been collected, the cl_page_operations::cpo_prep() + * method is called top-to-bottom. At that point layers can modify the req, + * let it pass, or deny it completely. This is to support things like SNS + * that have transfer ordering requirements invisible to the individual + * req-formation engine. + * + * On transfer completion (or transfer timeout, or failure to initiate the + * transfer of an allocated req), cl_req_operations::cro_completion() method + * is called, after execution of cl_page_operations::cpo_completion() of all + * req's pages. + */ +struct cl_req { + enum cl_req_type crq_type; + /** A list of pages being transferred */ + struct list_head crq_pages; + /** Number of pages in cl_req::crq_pages */ + unsigned crq_nrpages; + /** An array of objects whose pages are in ->crq_pages */ + struct cl_req_obj *crq_o; + /** Number of elements in cl_req::crq_o[] */ + unsigned crq_nrobjs; + struct list_head crq_layers; +}; + +/** + * Per-layer state for request. + */ +struct cl_req_slice { + struct cl_req *crs_req; + struct cl_device *crs_dev; + struct list_head crs_linkage; + const struct cl_req_operations *crs_ops; +}; + +/** @} cl_req */ + +/** + * Stats for a generic cache (similar to inode, lu_object, etc. caches).
+ */ +struct cache_stats { + const char *cs_name; + /** how many entities were created at all */ + atomic_t cs_created; + /** how many cache lookups were performed */ + atomic_t cs_lookup; + /** how many times cache lookup resulted in a hit */ + atomic_t cs_hit; + /** how many entities are in the cache right now */ + atomic_t cs_total; + /** how many entities in the cache are actively used (and cannot be + * evicted) right now */ + atomic_t cs_busy; +}; + +/** These are not exported so far */ +void cache_stats_init (struct cache_stats *cs, const char *name); +int cache_stats_print(const struct cache_stats *cs, + char *page, int count, int header); + +/** + * Client-side site. This represents particular client stack. "Global" + * variables should (directly or indirectly) be added here to allow multiple + * clients to co-exist in the single address space. + */ +struct cl_site { + struct lu_site cs_lu; + /** + * Statistical counters. Atomics do not scale, something better like + * per-cpu counters is needed. + * + * These are exported as /proc/fs/lustre/llite/.../site + * + * When interpreting keep in mind that both sub-locks (and sub-pages) + * and top-locks (and top-pages) are accounted here. + */ + struct cache_stats cs_pages; + struct cache_stats cs_locks; + atomic_t cs_pages_state[CPS_NR]; + atomic_t cs_locks_state[CLS_NR]; +}; + +int cl_site_init (struct cl_site *s, struct cl_device *top); +void cl_site_fini (struct cl_site *s); +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl); + +/** + * Output client site statistical counters into a buffer. Suitable for + * ll_rd_*()-style functions. + */ +int cl_site_stats_print(const struct cl_site *s, char *page, int count); + +/** + * \name helpers + * + * Type conversion and accessory functions. + */ +/** @{ */ + +static inline struct cl_site *lu2cl_site(const struct lu_site *site) +{ + return container_of(site, struct cl_site, cs_lu); +} + +static inline int lu_device_is_cl(const struct lu_device *d) +{ + return d->ld_type->ldt_tags & LU_DEVICE_CL; +} + +static inline struct cl_device *lu2cl_dev(const struct lu_device *d) +{ + LASSERT(d == NULL || IS_ERR(d) || lu_device_is_cl(d)); + return container_of0(d, struct cl_device, cd_lu_dev); +} + +static inline struct lu_device *cl2lu_dev(struct cl_device *d) +{ + return &d->cd_lu_dev; +} + +static inline struct cl_object *lu2cl(const struct lu_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->lo_dev)); + return container_of0(o, struct cl_object, co_lu); +} + +static inline const struct cl_object_conf * +lu2cl_conf(const struct lu_object_conf *conf) +{ + return container_of0(conf, struct cl_object_conf, coc_lu); +} + +static inline struct cl_object *cl_object_next(const struct cl_object *obj) +{ + return obj ? 
lu2cl(lu_object_next(&obj->co_lu)) : NULL; +} + +static inline struct cl_device *cl_object_device(const struct cl_object *o) +{ + LASSERT(o == NULL || IS_ERR(o) || lu_device_is_cl(o->co_lu.lo_dev)); + return container_of0(o->co_lu.lo_dev, struct cl_device, cd_lu_dev); +} + +static inline struct cl_object_header *luh2coh(const struct lu_object_header *h) +{ + return container_of0(h, struct cl_object_header, coh_lu); +} + +static inline struct cl_site *cl_object_site(const struct cl_object *obj) +{ + return lu2cl_site(obj->co_lu.lo_dev->ld_site); +} + +static inline +struct cl_object_header *cl_object_header(const struct cl_object *obj) +{ + return luh2coh(obj->co_lu.lo_header); +} + +static inline int cl_device_init(struct cl_device *d, struct lu_device_type *t) +{ + return lu_device_init(&d->cd_lu_dev, t); +} + +static inline void cl_device_fini(struct cl_device *d) +{ + lu_device_fini(&d->cd_lu_dev); +} + +void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice, + struct cl_object *obj, + const struct cl_page_operations *ops); +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops); +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, const struct cl_io_operations *ops); +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops); +/** @} helpers */ + +/** \defgroup cl_object cl_object + * @{ */ +struct cl_object *cl_object_top (struct cl_object *o); +struct cl_object *cl_object_find(const struct lu_env *env, struct cl_device *cd, + const struct lu_fid *fid, + const struct cl_object_conf *c); + +int cl_object_header_init(struct cl_object_header *h); +void cl_object_header_fini(struct cl_object_header *h); +void cl_object_put (const struct lu_env *env, struct cl_object *o); +void cl_object_get (struct cl_object *o); +void cl_object_attr_lock (struct cl_object *o); +void cl_object_attr_unlock(struct cl_object *o); +int cl_object_attr_get (const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); +int cl_object_attr_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int cl_object_glimpse (const struct lu_env *env, struct cl_object *obj, + struct ost_lvb *lvb); +int cl_conf_set (const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +void cl_object_prune (const struct lu_env *env, struct cl_object *obj); +void cl_object_kill (const struct lu_env *env, struct cl_object *obj); + +/** + * Returns true, iff \a o0 and \a o1 are slices of the same object. 
+ */ +static inline int cl_object_same(struct cl_object *o0, struct cl_object *o1) +{ + return cl_object_header(o0) == cl_object_header(o1); +} + +/** @} cl_object */ + +/** \defgroup cl_page cl_page + * @{ */ +struct cl_page *cl_page_lookup(struct cl_object_header *hdr, + pgoff_t index); +void cl_page_gang_lookup(const struct lu_env *env, + struct cl_object *obj, + struct cl_io *io, + pgoff_t start, pgoff_t end, + struct cl_page_list *plist); +struct cl_page *cl_page_find (const struct lu_env *env, + struct cl_object *obj, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type); +void cl_page_get (struct cl_page *page); +void cl_page_put (const struct lu_env *env, + struct cl_page *page); +void cl_page_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +void cl_page_header_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_page *pg); +cfs_page_t *cl_page_vmpage (const struct lu_env *env, + struct cl_page *page); +struct cl_page *cl_vmpage_page (cfs_page_t *vmpage, struct cl_object *obj); +struct cl_page *cl_page_top (struct cl_page *page); +int cl_is_page (const void *addr); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype); + +/** + * \name ownership + * + * Functions dealing with the ownership of page by io. + */ +/** @{ */ + +int cl_page_own (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +void cl_page_unassume (const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); +void cl_page_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page *page); +int cl_page_is_owned (const struct cl_page *pg, const struct cl_io *io); + +/** @} ownership */ + +/** + * \name transfer + * + * Functions dealing with the preparation of a page for a transfer, and + * tracking transfer state. + */ +/** @{ */ +int cl_page_prep (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_completion (const struct lu_env *env, + struct cl_page *pg, enum cl_req_type crt, int ioret); +int cl_page_make_ready (const struct lu_env *env, struct cl_page *pg, + enum cl_req_type crt); +int cl_page_cache_add (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg, enum cl_req_type crt); +void cl_page_clip (const struct lu_env *env, struct cl_page *pg, + int from, int to); +int cl_page_cancel (const struct lu_env *env, struct cl_page *page); + +/** @} transfer */ + + +/** + * \name helper routines + * Functions to discard, delete and export a cl_page. 
+ */ +/** @{ */ +void cl_page_discard (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +void cl_page_delete (const struct lu_env *env, struct cl_page *pg); +int cl_page_unmap (const struct lu_env *env, struct cl_io *io, + struct cl_page *pg); +int cl_page_is_vmlocked (const struct lu_env *env, + const struct cl_page *pg); +void cl_page_export (const struct lu_env *env, struct cl_page *pg); +int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +loff_t cl_offset (const struct cl_object *obj, pgoff_t idx); +pgoff_t cl_index (const struct cl_object *obj, loff_t offset); +int cl_page_size (const struct cl_object *obj); +int cl_pages_prune (const struct lu_env *env, struct cl_object *obj); + +void cl_lock_print (const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_lock *lock); +void cl_lock_descr_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, + const struct cl_lock_descr *descr); +/* @} helper */ + +/** @} cl_page */ + +/** \defgroup cl_lock cl_lock + * @{ */ + +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source); +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + __u32 enqflags, + const char *scope, const void *source); +struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, struct cl_lock *except, + int pending, int canceld); + +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype); + +void cl_lock_get (struct cl_lock *lock); +void cl_lock_get_trust (struct cl_lock *lock); +void cl_lock_put (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_hold_add (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_unhold (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_release (const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source); +void cl_lock_user_add (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_user_del (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_compatible(const struct cl_lock *lock1, + const struct cl_lock *lock2); + +/** \name statemachine statemachine + * Interface to lock state machine consists of 3 parts: + * + * - "try" functions that attempt to effect a state transition. If state + * transition is not possible right now (e.g., if it has to wait for some + * asynchronous event to occur), these functions return + * cl_lock_transition::CLO_WAIT. + * + * - "non-try" functions that implement synchronous blocking interface on + * top of non-blocking "try" functions. These functions repeatedly call + * corresponding "try" versions, and if state transition is not possible + * immediately, wait for lock state change. + * + * - methods from cl_lock_operations, called by "try" functions. Lock can + * be advanced to the target state only when all layers voted that they + * are ready for this transition. "Try" functions call methods under lock + * mutex. 
If a layer had to release a mutex, it re-acquires it and returns + * cl_lock_transition::CLO_REPEAT, causing "try" function to call all + * layers again. + * + * TRY NON-TRY METHOD FINAL STATE + * + * cl_enqueue_try() cl_enqueue() cl_lock_operations::clo_enqueue() CLS_ENQUEUED + * + * cl_wait_try() cl_wait() cl_lock_operations::clo_wait() CLS_HELD + * + * cl_unuse_try() cl_unuse() cl_lock_operations::clo_unuse() CLS_CACHED + * + * cl_use_try() NONE cl_lock_operations::clo_use() CLS_HELD + * + * @{ */ + +int cl_enqueue (const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_unuse (const struct lu_env *env, struct cl_lock *lock); +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags); +int cl_unuse_try (const struct lu_env *env, struct cl_lock *lock); +int cl_wait_try (const struct lu_env *env, struct cl_lock *lock); +int cl_use_try (const struct lu_env *env, struct cl_lock *lock); +/** @} statemachine */ + +void cl_lock_signal (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_state_wait (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_state_set (const struct lu_env *env, struct cl_lock *lock, + enum cl_lock_state state); +int cl_queue_match (const struct list_head *queue, + const struct cl_lock_descr *need); + +void cl_lock_mutex_get (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_mutex_try (const struct lu_env *env, struct cl_lock *lock); +void cl_lock_mutex_put (const struct lu_env *env, struct cl_lock *lock); +int cl_lock_is_mutexed (struct cl_lock *lock); +int cl_lock_nr_mutexed (const struct lu_env *env); +int cl_lock_page_out (const struct lu_env *env, struct cl_lock *lock, + int discard); +int cl_lock_ext_match (const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need); +int cl_lock_mode_match (enum cl_lock_mode has, enum cl_lock_mode need); +int cl_lock_modify (const struct lu_env *env, struct cl_lock *lock, + const struct cl_lock_descr *desc); + +void cl_lock_closure_init (const struct lu_env *env, + struct cl_lock_closure *closure, + struct cl_lock *origin, int wait); +void cl_lock_closure_fini (struct cl_lock_closure *closure); +int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); +void cl_lock_disclosure (const struct lu_env *env, + struct cl_lock_closure *closure); +int cl_lock_enclosure (const struct lu_env *env, struct cl_lock *lock, + struct cl_lock_closure *closure); + +void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock); +void cl_lock_error (const struct lu_env *env, struct cl_lock *lock, int error); +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int wait); +int cl_is_lock (const void *addr); + +unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock); + +/** @} cl_lock */ + +/** \defgroup cl_io cl_io + * @{ */ + +int cl_io_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_sub_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj); +int cl_io_rw_init (const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count); +int cl_io_loop (const struct lu_env *env, struct cl_io *io); + 
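+/*
+ * A minimal sketch of how the entry points above combine into a typical
+ * top-level read or write; cl_io_loop() internally drives the
+ * iter_init/lock/start/end/unlock/iter_fini cycle described in
+ * cl_io_operations. Environment acquisition and error handling are elided;
+ * taking "io" from ccc_env_info() and "clob" being the file's top
+ * cl_object are assumptions, not requirements:
+ *
+ *	struct cl_io *io = &ccc_env_info(env)->cti_io;
+ *	int result;
+ *
+ *	io->ci_obj = clob;
+ *	result = cl_io_rw_init(env, io, CIT_WRITE, pos, count);
+ *	if (result == 0)
+ *		result = cl_io_loop(env, io);
+ *	cl_io_fini(env, io);
+ */
+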
+void cl_io_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_iter_init (const struct lu_env *env, struct cl_io *io); +void cl_io_iter_fini (const struct lu_env *env, struct cl_io *io); +int cl_io_lock (const struct lu_env *env, struct cl_io *io); +void cl_io_unlock (const struct lu_env *env, struct cl_io *io); +int cl_io_start (const struct lu_env *env, struct cl_io *io); +void cl_io_end (const struct lu_env *env, struct cl_io *io); +int cl_io_lock_add (const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link); +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr); +int cl_io_read_page (const struct lu_env *env, struct cl_io *io, + struct cl_page *page); +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_commit_write (const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to); +int cl_io_submit_rw (const struct lu_env *env, struct cl_io *io, + enum cl_req_type iot, struct cl_2queue *queue); +void cl_io_rw_advance (const struct lu_env *env, struct cl_io *io, + size_t nob); +int cl_io_cancel (const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue); +int cl_io_is_going (const struct lu_env *env); + +/** + * True, iff \a io is an O_APPEND write(2). + */ +static inline int cl_io_is_append(const struct cl_io *io) +{ + return io->ci_type == CIT_WRITE && io->u.ci_wr.wr_append; +} + +int cl_io_is_sendfile(const struct cl_io *io); + +struct cl_io *cl_io_top(struct cl_io *io); + +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io); + +#define CL_IO_SLICE_CLEAN(foo_io, base) \ +do { \ + typeof(foo_io) __foo_io = (foo_io); \ + \ + CLASSERT(offsetof(typeof(*__foo_io), base) == 0); \ + memset(&__foo_io->base + 1, 0, \ + (sizeof *__foo_io) - sizeof __foo_io->base); \ +} while (0) + +/** @} cl_io */ + +/** \defgroup cl_page_list cl_page_list + * @{ */ + +/** + * Iterate over pages in a page list. + */ +#define cl_page_list_for_each(page, list) \ + list_for_each_entry((page), &(list)->pl_pages, cp_batch) + +/** + * Iterate over pages in a page list, taking possible removals into account. 
+ */ +#define cl_page_list_for_each_safe(page, temp, list) \ + list_for_each_entry_safe((page), (temp), &(list)->pl_pages, cp_batch) + +void cl_page_list_init (struct cl_page_list *plist); +void cl_page_list_add (struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_move (struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page); +void cl_page_list_splice (struct cl_page_list *list, + struct cl_page_list *head); +void cl_page_list_del (const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page); +void cl_page_list_disown (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_own (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_assume (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_discard(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +int cl_page_list_unmap (const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist); +void cl_page_list_fini (const struct lu_env *env, struct cl_page_list *plist); + +void cl_2queue_init (struct cl_2queue *queue); +void cl_2queue_add (struct cl_2queue *queue, struct cl_page *page); +void cl_2queue_disown (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_assume (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_discard (const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue); +void cl_2queue_fini (const struct lu_env *env, struct cl_2queue *queue); +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page); + +/** @} cl_page_list */ + +/** \defgroup cl_req cl_req + * @{ */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects); + +void cl_req_page_add (const struct lu_env *env, struct cl_req *req, + struct cl_page *page); +void cl_req_page_done (const struct lu_env *env, struct cl_page *page); +int cl_req_prep (const struct lu_env *env, struct cl_req *req); +void cl_req_attr_set (const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags); +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int ioret); + +/** \defgroup cl_sync_io cl_sync_io + * @{ */ + +/** + * Anchor for synchronous transfer. This is allocated on the stack by a thread + * doing a synchronous transfer, and a pointer to this structure is set up in + * every page submitted for the transfer. The transfer completion routine + * updates the anchor and wakes up the waiting thread when the transfer is + * complete. + */ +struct cl_sync_io { + /** number of pages yet to be transferred. */ + atomic_t csi_sync_nr; + /** completion to be signaled when transfer is complete. */ + struct completion csi_sync_completion; + /** error code. */ + int csi_sync_rc; +}; + +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages); +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor); +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret); + +/** @} cl_sync_io */ + +/** @} cl_req */ + +/** \defgroup cl_env cl_env + * + * lu_env handling for a client. + * + * lu_env is an environment within which lustre code executes. Its major part + * is lu_context---a fast memory allocation mechanism that is used to conserve + * precious kernel stack space. Originally lu_env was designed for a server, + * where + * + * - there is a (mostly) fixed number of threads, and + * + * - call chains have no non-lustre portions inserted between lustre code. + * + * On a client both of these assumptions fail, because every user thread can + * potentially execute lustre code as part of a system call, and lustre calls + * into the VFS or MM, which in turn call back into lustre. + * + * To deal with that, cl_env wrapper functions implement the following + * optimizations: + * + * - allocation and destruction of environments are amortized by caching + * no-longer-used environments instead of destroying them; + * + * - there is a notion of "current" environment, attached to the kernel + * data structure representing the current thread (current->journal_info in + * the Linux kernel). Top-level lustre code allocates an environment and + * makes it current, then calls into non-lustre code that in turn calls + * back into lustre. Low-level lustre code thus called can fetch the + * environment created by the top-level code and reuse it, avoiding an + * additional environment allocation. + * + * \see lu_env, lu_context, lu_context_key + * @{ */ + +struct cl_env_nest { + int cen_refcheck; + void *cen_cookie; +}; + +struct lu_env *cl_env_peek (int *refcheck); +struct lu_env *cl_env_get (int *refcheck); +struct lu_env *cl_env_alloc (int *refcheck, __u32 tags); +struct lu_env *cl_env_nested_get (struct cl_env_nest *nest); +void cl_env_put (struct lu_env *env, int *refcheck); +void cl_env_nested_put (struct cl_env_nest *nest, struct lu_env *env); +void *cl_env_reenter (void); +void cl_env_reexit (void *cookie); +void cl_env_implant (struct lu_env *env, int *refcheck); +void cl_env_unplant (struct lu_env *env, int *refcheck); +unsigned cl_env_cache_purge(unsigned nr);
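+
+/*
+ * The usage pattern implied by the declarations above (a sketch; error
+ * handling elided). cl_env_get() returns a cached or freshly allocated
+ * environment and records a verification value in refcheck, which must be
+ * passed back to cl_env_put():
+ *
+ *	int refcheck;
+ *	struct lu_env *env;
+ *
+ *	env = cl_env_get(&refcheck);
+ *	if (!IS_ERR(env)) {
+ *		... use env ...
+ *		cl_env_put(env, &refcheck);
+ *	}
+ */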
+ +/** @} cl_env */ + +/* + * Misc + */ +void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr); +void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb); + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next); +/** @} clio */ + +#endif /* _LINUX_CL_OBJECT_H */ diff --git a/lustre/include/class_hash.h b/lustre/include/class_hash.h index 70349cb..6210c7f 100644 --- a/lustre/include/class_hash.h +++ b/lustre/include/class_hash.h @@ -50,9 +50,10 @@ typedef struct lustre_hash_bucket { #define LUSTRE_MAX_HASH_NAME 16 typedef struct lustre_hash { - int lh_cur_size; /* current hash size */ - int lh_min_size; /* min hash size */ - int lh_max_size; /* max hash size */ + int lh_cur_bits; /* current hash bits */ + int lh_cur_mask; /* current hash mask */ + int lh_min_bits; /* min hash bits */ + int lh_max_bits; /* max hash bits */ int lh_min_theta; /* resize min threshold */ int lh_max_theta; /* resize max threshold */ int lh_flags; /* hash flags */ @@ -104,8 +105,7 @@ lh_key(lustre_hash_t *lh, struct hlist_node *hnode) return NULL; } -/** - * Returns 1 on a match, +/* Returns 1 on a match, * XXX: This would be better if it returned, -1, 0, or 1 for * <, =, > respectivly. It could then be used to implement * a LH_SORT feature flags which could keep each lustre hash @@ -164,20 +164,16 @@ lh_exit(lustre_hash_t *lh, struct hlist_node *hnode) return LHP(lh, exit)(hnode); } -/** - * Validate hnode references the correct key.
- */ +/* Validate hnode references the correct key */ static inline void __lustre_hash_key_validate(lustre_hash_t *lh, void *key, struct hlist_node *hnode) { if (unlikely(lh->lh_flags & LH_DEBUG)) - LASSERT(lh_compare(lh, key, hnode)); + LASSERT(lh_compare(lh, key, hnode) > 0); } -/** - * Validate hnode is in the correct bucket. - */ +/* Validate hnode is in the correct bucket */ static inline void __lustre_hash_bucket_validate(lustre_hash_t *lh, lustre_hash_bucket_t *lhb, struct hlist_node *hnode) @@ -185,7 +181,7 @@ __lustre_hash_bucket_validate(lustre_hash_t *lh, lustre_hash_bucket_t *lhb, unsigned i; if (unlikely(lh->lh_flags & LH_DEBUG)) { - i = lh_hash(lh, lh_key(lh, hnode), lh->lh_cur_size - 1); + i = lh_hash(lh, lh_key(lh, hnode), lh->lh_cur_mask); LASSERT(&lh->lh_buckets[i] == lhb); } } @@ -197,7 +193,7 @@ __lustre_hash_bucket_lookup(lustre_hash_t *lh, struct hlist_node *hnode; hlist_for_each(hnode, &lhb->lhb_head) - if (lh_compare(lh, key, hnode)) + if (lh_compare(lh, key, hnode) > 0) return hnode; return NULL; @@ -229,33 +225,25 @@ __lustre_hash_bucket_del(lustre_hash_t *lh, return lh_put(lh, hnode); } -/* - * Hash init/cleanup functions. - */ -lustre_hash_t *lustre_hash_init(char *name, unsigned int cur_size, - unsigned int max_size, +/* Hash init/cleanup functions */ +lustre_hash_t *lustre_hash_init(char *name, unsigned int cur_bits, + unsigned int max_bits, lustre_hash_ops_t *ops, int flags); void lustre_hash_exit(lustre_hash_t *lh); -/* - * Hash addition functions. - */ +/* Hash addition functions */ void lustre_hash_add(lustre_hash_t *lh, void *key, struct hlist_node *hnode); -int lustre_hash_add_unique(lustre_hash_t *lh, void *key, - struct hlist_node *hnode); +int lustre_hash_add_unique(lustre_hash_t *lh, void *key, + struct hlist_node *hnode); void *lustre_hash_findadd_unique(lustre_hash_t *lh, void *key, struct hlist_node *hnode); -/* - * Hash deletion functions. - */ +/* Hash deletion functions */ void *lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode); void *lustre_hash_del_key(lustre_hash_t *lh, void *key); -/* - * Hash lookup/for_each functions. - */ +/* Hash lookup/for_each functions */ void *lustre_hash_lookup(lustre_hash_t *lh, void *key); typedef void (*lh_for_each_cb)(void *obj, void *data); void lustre_hash_for_each(lustre_hash_t *lh, lh_for_each_cb, void *data); @@ -268,41 +256,48 @@ void lustre_hash_for_each_key(lustre_hash_t *lh, void *key, * Rehash - Theta is calculated to be the average chained * hash depth assuming a perfectly uniform hash funcion. 
*/ -int lustre_hash_rehash(lustre_hash_t *lh, int size); +int lustre_hash_rehash(lustre_hash_t *lh, int bits); void lustre_hash_rehash_key(lustre_hash_t *lh, void *old_key, void *new_key, struct hlist_node *hnode); -static inline int -__lustre_hash_theta(lustre_hash_t *lh) +#define LH_THETA_BITS 10 + +/* Return integer component of theta */ +static inline int __lustre_hash_theta_int(int theta) { - return ((atomic_read(&lh->lh_count) * 1000) / lh->lh_cur_size); + return (theta >> LH_THETA_BITS); } -static inline void -__lustre_hash_set_theta(lustre_hash_t *lh, int min, int max) +/* Return a fractional value between 0 and 999 */ +static inline int __lustre_hash_theta_frac(int theta) +{ + return ((theta * 1000) >> LH_THETA_BITS) - + (__lustre_hash_theta_int(theta) * 1000); +} + +static inline int __lustre_hash_theta(lustre_hash_t *lh) +{ + return (atomic_read(&lh->lh_count) << LH_THETA_BITS) >> lh->lh_cur_bits; +} + +static inline void __lustre_hash_set_theta(lustre_hash_t *lh, int min, int max) { LASSERT(min < max); lh->lh_min_theta = min; - lh->lh_min_theta = max; + lh->lh_max_theta = max; } -/* - * Generic debug formatting routines mainly for proc handler. - */ +/* Generic debug formatting routines mainly for proc handler */ int lustre_hash_debug_header(char *str, int size); int lustre_hash_debug_str(lustre_hash_t *lh, char *str, int size); -/* - * 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 - */ +/* 2^31 + 2^29 - 2^25 + 2^22 - 2^19 - 2^16 + 1 */ #define CFS_GOLDEN_RATIO_PRIME_32 0x9e370001UL -/* - * 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 - */ +/* 2^63 + 2^61 - 2^57 + 2^54 - 2^51 - 2^18 + 1 */ #define CFS_GOLDEN_RATIO_PRIME_64 0x9e37fffffffc0001ULL -/** +/* * Generic djb2 hash algorithm for character arrays. */ static inline unsigned @@ -318,7 +313,7 @@ lh_djb2_hash(void *key, size_t size, unsigned mask) return (hash & mask); } -/** +/* * Generic u32 hash algorithm. */ static inline unsigned @@ -327,7 +322,7 @@ lh_u32_hash(__u32 key, unsigned mask) return ((key * CFS_GOLDEN_RATIO_PRIME_32) & mask); } -/** +/* * Generic u64 hash algorithm. */ static inline unsigned @@ -338,7 +333,7 @@ lh_u64_hash(__u64 key, unsigned mask) #define lh_for_each_bucket(lh, lhb, pos) \ for (pos = 0; \ - pos < lh->lh_cur_size && \ + pos <= lh->lh_cur_mask && \ ({ lhb = &lh->lh_buckets[i]; 1; }); \ pos++) diff --git a/lustre/include/dt_object.h b/lustre/include/dt_object.h index 760cee5..536273d 100644 --- a/lustre/include/dt_object.h +++ b/lustre/include/dt_object.h @@ -66,6 +66,7 @@ struct txn_param; struct dt_device; struct dt_object; struct dt_index_features; +struct dt_quota_ctxt; struct dt_device_param { unsigned ddp_max_name_len; @@ -82,11 +83,12 @@ enum dt_txn_op { DTO_IDNEX_UPDATE, DTO_OBJECT_CREATE, DTO_OBJECT_DELETE, - DTO_ATTR_SET, + DTO_ATTR_SET_BASE, DTO_XATTR_SET, DTO_LOG_REC, /**< XXX temporary: dt layer knows nothing about llog. */ DTO_WRITE_BASE, DTO_WRITE_BLOCK, + DTO_ATTR_SET_CHOWN, DTO_NR }; @@ -128,12 +130,28 @@ struct dt_device_operations { int (*dt_sync)(const struct lu_env *env, struct dt_device *dev); void (*dt_ro)(const struct lu_env *env, struct dt_device *dev); /** + * Start a transaction commit asynchronously + * + * \param env environment + * \param dev dt_device to start commit on + * + * \return 0 success, negative value if error + */ + int (*dt_commit_async)(const struct lu_env *env, + struct dt_device *dev); + /** * Initialize capability context. 
*/ int (*dt_init_capa_ctxt)(const struct lu_env *env, struct dt_device *dev, int mode, unsigned long timeout, __u32 alg, struct lustre_capa_key *keys); + /** + * Initialize quota context. + */ + void (*dt_init_quota_ctxt)(const struct lu_env *env, + struct dt_device *dev, + struct dt_quota_ctxt *ctxt, void *data); /** * get transaction credits for given \a op. @@ -153,6 +171,8 @@ struct dt_index_features { size_t dif_recsize_min; /** maximal required record size, 0 if no limit */ size_t dif_recsize_max; + /** pointer size for record */ + size_t dif_ptrsize; }; enum dt_index_flags { @@ -174,14 +194,54 @@ extern const struct dt_index_features dt_directory_features; /** * This is a general purpose dt allocation hint. - * It now contains the parent object. + * It now contains the parent object. * It can contain any allocation hint in the future. */ struct dt_allocation_hint { - struct dt_object *dah_parent; - __u32 dah_mode; + struct dt_object *dah_parent; + __u32 dah_mode; +}; + +/** + * object type specifier. + */ + +enum dt_format_type { + DFT_REGULAR, + DFT_DIR, + /** for mknod */ + DFT_NODE, + /** for special index */ + DFT_INDEX, + /** for symbolic link */ + DFT_SYM, +}; + +/** + * object format specifier. + */ +struct dt_object_format { + /** type for dt object */ + enum dt_format_type dof_type; + union { + struct dof_regular { + } dof_reg; + struct dof_dir { + } dof_dir; + struct dof_node { + } dof_node; + /** + * special index need feature as parameter to create + * special idx + */ + struct dof_index { + const struct dt_index_features *di_feat; + } dof_idx; + } u; }; +enum dt_format_type dt_mode_to_dft(__u32 mode); + /** * Per-dt-object operations. */ @@ -277,8 +337,9 @@ struct dt_object_operations { * postcondition: ergo(result == 0, dt_object_exists(dt)); */ int (*do_create)(const struct lu_env *env, struct dt_object *dt, - struct lu_attr *attr, + struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *th); /** @@ -327,7 +388,8 @@ struct dt_body_operations { */ ssize_t (*dbo_write)(const struct lu_env *env, struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *handle, struct lustre_capa *capa); + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); }; /** @@ -360,7 +422,8 @@ struct dt_index_operations { */ int (*dio_insert)(const struct lu_env *env, struct dt_object *dt, const struct dt_rec *rec, const struct dt_key *key, - struct thandle *handle, struct lustre_capa *capa); + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota); /** * precondition: dt_object_exists(dt); */ @@ -377,7 +440,7 @@ struct dt_index_operations { * precondition: dt_object_exists(dt); */ struct dt_it *(*init)(const struct lu_env *env, - struct dt_object *dt, int writable, + struct dt_object *dt, struct lustre_capa *capa); void (*fini)(const struct lu_env *env, struct dt_it *di); @@ -386,8 +449,6 @@ struct dt_index_operations { const struct dt_key *key); void (*put)(const struct lu_env *env, struct dt_it *di); - int (*del)(const struct lu_env *env, - struct dt_it *di, struct thandle *th); int (*next)(const struct lu_env *env, struct dt_it *di); struct dt_key *(*key)(const struct lu_env *env, @@ -404,7 +465,7 @@ struct dt_index_operations { }; struct dt_device { - struct lu_device dd_lu_dev; + struct lu_device dd_lu_dev; const struct dt_device_operations *dd_ops; /** @@ -412,7 +473,7 @@ struct dt_device { * way, because callbacks are supposed to be added/deleted only during * single-threaded start-up 
shut-down procedures. */ - struct list_head dd_txn_callbacks; + struct list_head dd_txn_callbacks; }; int dt_device_init(struct dt_device *dev, struct lu_device_type *t); @@ -430,7 +491,7 @@ static inline struct dt_device * lu2dt_dev(struct lu_device *l) } struct dt_object { - struct lu_object do_lu; + struct lu_object do_lu; const struct dt_object_operations *do_ops; const struct dt_body_operations *do_body_ops; const struct dt_index_operations *do_index_ops; @@ -516,10 +577,30 @@ int dt_txn_hook_stop(const struct lu_env *env, struct thandle *txn); int dt_txn_hook_commit(const struct lu_env *env, struct thandle *txn); int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj); + +/** + * Callback function used for parsing path. + * \see llo_store_resolve + */ +typedef int (*dt_entry_func_t)(const struct lu_env *env, + const char *name, + void *pvt); + +#define DT_MAX_PATH 1024 + +int dt_path_parser(const struct lu_env *env, + char *local, dt_entry_func_t entry_func, + void *data); + struct dt_object *dt_store_open(const struct lu_env *env, - struct dt_device *dt, const char *name, + struct dt_device *dt, + const char *dirname, + const char *filename, struct lu_fid *fid); -/** @} dt */ +struct dt_object *dt_locate(const struct lu_env *env, + struct dt_device *dev, + const struct lu_fid *fid); +/** @} dt */ #endif /* __LUSTRE_DT_OBJECT_H */ diff --git a/lustre/include/interval_tree.h b/lustre/include/interval_tree.h index 117d5c0..b50278b 100644 --- a/lustre/include/interval_tree.h +++ b/lustre/include/interval_tree.h @@ -48,8 +48,10 @@ struct interval_node { struct interval_node *in_left; struct interval_node *in_right; struct interval_node *in_parent; - __u8 in_color; - __u8 res1[7]; /* tags, 8-bytes aligned */ + unsigned in_color:1, + in_intree:1, /** set if the node is in tree */ + in_res1:30; + __u8 in_res2[4]; /** tags, 8-bytes aligned */ __u64 in_max_high; struct interval_node_extent { __u64 start; @@ -62,6 +64,11 @@ enum interval_iter { INTERVAL_ITER_STOP = 2 }; +static inline int interval_is_intree(struct interval_node *node) +{ + return node->in_intree == 1; +} + static inline __u64 interval_low(struct interval_node *node) { return node->in_extent.start; diff --git a/lustre/include/lclient.h b/lustre/include/lclient.h new file mode 100644 index 0000000..dfd7f65 --- /dev/null +++ b/lustre/include/lclient.h @@ -0,0 +1,375 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Definitions shared between vvp and liblustre, and other clients in the + * future. + * + * Author: Oleg Drokin + * Author: Nikita Danilov + */ + +#ifndef LCLIENT_H +#define LCLIENT_H + +int cl_glimpse_size(struct inode *inode); +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob); + +/** + * Common IO arguments for various VFS I/O interfaces. + */ +struct ccc_io_args { + int cia_is_sendfile; +#ifndef HAVE_FILE_WRITEV + struct kiocb *cia_iocb; +#endif + struct iovec *cia_iov; + unsigned long cia_nrsegs; + read_actor_t cia_actor; + void *cia_target; +}; + +/** + * Locking policy for truncate. + */ +enum ccc_trunc_lock_type { + /** Locking is done by server */ + TRUNC_NOLOCK, + /** Extent lock is enqueued */ + TRUNC_EXTENT, + /** Existing local extent lock is used */ + TRUNC_MATCH +}; + +/** + * IO state private to vvp or slp layers. + */ +struct ccc_io { + /** super class */ + struct cl_io_slice cui_cl; + struct cl_io_lock_link cui_link; + /** + * I/O vector information to or from which read/write is going. + */ + struct iovec *cui_iov; + unsigned long cui_nrsegs; + /** + * Total iov count for the remaining IO. + */ + unsigned long cui_tot_nrsegs; + /** + * Original length of an iov that was partially truncated. + */ + size_t cui_iov_olen; + /** + * Total size of the remaining IO. + */ + size_t cui_tot_count; + + union { + struct { + int cui_locks_released; + enum ccc_trunc_lock_type cui_local_lock; + } trunc; + } u; + /** + * True iff the io is processing a glimpse right now. + */ + int cui_glimpse; + /** + * File descriptor against which IO is done. + */ + struct ll_file_data *cui_fd; +#ifndef HAVE_FILE_WRITEV + struct kiocb *cui_iocb; +#endif +}; + +extern struct lu_context_key ccc_key; +extern struct lu_context_key ccc_session_key; + +struct ccc_thread_info { + struct cl_lock_descr cti_descr; + struct cl_io cti_io; + struct cl_sync_io cti_sync_io; + struct cl_attr cti_attr; +}; + +static inline struct ccc_thread_info *ccc_env_info(const struct lu_env *env) +{ + struct ccc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &ccc_key); + LASSERT(info != NULL); + return info; +} + +struct ccc_session { + struct ccc_io cs_ios; +}; + +static inline struct ccc_session *ccc_env_session(const struct lu_env *env) +{ + struct ccc_session *ses; + + ses = lu_context_key_get(env->le_ses, &ccc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct ccc_io *ccc_env_io(const struct lu_env *env) +{ + return &ccc_env_session(env)->cs_ios; +} + +/** + * ccc-private object state. + */ +struct ccc_object { + struct cl_object_header cob_header; + struct cl_object cob_cl; + struct inode *cob_inode; + + /** + * A list of dirty pages pending IO in the cache. Used by + * SOM. Protected by ll_inode_info::lli_lock. + * + * \see ccc_page::cpg_pending_linkage + */ + struct list_head cob_pending_list; + + /** + * Access to this counter is protected by inode->i_sem. Since the + * lifetime of transient pages must be covered by the inode semaphore, + * we don't need to hold any additional lock. + */ + int cob_transient_pages; + /** + * Number of outstanding mmaps on this file. + * + * \see ll_vm_open(), ll_vm_close(). + */ + atomic_t cob_mmap_cnt; +}; + +/** + * ccc-private page state.
+ */ +struct ccc_page { + struct cl_page_slice cpg_cl; + int cpg_defer_uptodate; + int cpg_ra_used; + int cpg_write_queued; + /** + * Non-empty iff this page is already counted in + * ccc_object::cob_pending_list. Protected by + * ccc_object::cob_pending_guard. This list is only used as a flag, + * that is, never iterated through, only checked for list_empty(), but + * having a list is useful for debugging. + */ + struct list_head cpg_pending_linkage; + /** VM page */ + cfs_page_t *cpg_page; + struct cl_sync_io *cpg_sync_io; + /** + * checksum for paranoid I/O debugging enabled by + * ENABLE_LLITE_CHECKSUM configuration option. + * + * XXX This cannot be implemented reliably because checksum cannot be + * updated from ->set_page_dirty() that is called without page VM + * lock. + */ + __u32 cpg_checksum; +}; + +static inline struct ccc_page *cl2ccc_page(const struct cl_page_slice *slice) +{ + return container_of(slice, struct ccc_page, cpg_cl); +} + +struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage); + +struct ccc_device { + struct cl_device cdv_cl; + struct super_block *cdv_sb; + struct cl_device *cdv_next; +}; + +struct ccc_lock { + struct cl_lock_slice clk_cl; +}; + +struct ccc_req { + struct cl_req_slice crq_cl; +}; + +void *ccc_key_init (const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_key_fini (const struct lu_context *ctx, + struct lu_context_key *key, void *data); +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key); +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data); + +int ccc_device_init (const struct lu_env *env, + struct lu_device *d, + const char *name, struct lu_device *next); +struct lu_device *ccc_device_fini (const struct lu_env *env, + struct lu_device *d); +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops); +struct lu_device *ccc_device_free (const struct lu_env *env, + struct lu_device *d); +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops); + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req); +void ccc_umount(const struct lu_env *env, struct cl_device *dev); +int ccc_global_init(struct lu_device_type *device_type); +void ccc_global_fini(struct lu_device_type *device_type); +int ccc_object_init0(const struct lu_env *env,struct ccc_object *vob, + const struct cl_object_conf *conf); +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +void ccc_object_free(const struct lu_env *env, struct lu_object *obj); +int ccc_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io, + const struct cl_lock_operations *lkops); +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid); +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb); +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf); +cfs_page_t *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice); +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice 
*slice, struct cl_io *io); +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice); +void ccc_transient_page_verify(const struct cl_page *page); +void ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io); +void ccc_lock_fini(const struct lu_env *env,struct cl_lock_slice *slice); +int ccc_lock_enqueue(const struct lu_env *env,const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags); +int ccc_lock_unuse(const struct lu_env *env,const struct cl_lock_slice *slice); +int ccc_lock_wait(const struct lu_env *env,const struct cl_lock_slice *slice); +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io); +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state); + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios); +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end); +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end); +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios); +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t pos, int vfslock); +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret); +void ccc_req_attr_set(const struct lu_env *env,const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *oa, obd_valid flags); + +struct lu_device *ccc2lu_dev (struct ccc_device *vdv); +struct lu_object *ccc2lu (struct ccc_object *vob); +struct ccc_device *lu2ccc_dev (const struct lu_device *d); +struct ccc_device *cl2ccc_dev (const struct cl_device *d); +struct ccc_object *lu2ccc (const struct lu_object *obj); +struct ccc_object *cl2ccc (const struct cl_object *obj); +struct ccc_lock *cl2ccc_lock (const struct cl_lock_slice *slice); +struct ccc_io *cl2ccc_io (const struct lu_env *env, + const struct cl_io_slice *slice); +struct ccc_req *cl2ccc_req (const struct cl_req_slice *slice); +cfs_page_t *cl2vm_page (const struct cl_page_slice *slice); +struct inode *ccc_object_inode(const struct cl_object *obj); +struct ccc_object *cl_inode2ccc (struct inode *inode); + +int cl_setattr_do_truncate(struct inode *inode, loff_t size, + struct obd_capa *capa); +int cl_setattr_ost(struct inode *inode, struct obd_capa *capa); + +struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage); +int ccc_object_invariant(const struct cl_object *obj); +int cl_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); +int cl_local_size(struct inode *inode); + +#ifdef INVARIANT_CHECK +# define CLOBINVRNT(env, clob, expr) \ + do { \ + if (unlikely(!(expr))) { \ + 
LU_OBJECT_DEBUG(D_ERROR, (env), &(clob)->co_lu, #expr "\n"); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !INVARIANT_CHECK */ +# define CLOBINVRNT(env, clob, expr) \ + ((void)sizeof(env), (void)sizeof(clob), (void)sizeof !!(expr)) +#endif /* !INVARIANT_CHECK */ + + +#endif /* LCLIENT_H */ diff --git a/lustre/include/liblustre.h b/lustre/include/liblustre.h index 13b2e26..09bf725 100644 --- a/lustre/include/liblustre.h +++ b/lustre/include/liblustre.h @@ -71,7 +71,7 @@ typedef unsigned short umode_t; /* * The inter_module_get implementation is specific to liblustre, so this needs * to stay here for now. - */ + */ static inline void inter_module_put(void *a) { return; @@ -251,6 +251,7 @@ struct task_struct { int ngroups; gid_t *groups; __u32 cap_effective; + void *journal_info; }; @@ -378,7 +379,7 @@ void *liblustre_register_wait_callback(const char *name, void liblustre_deregister_wait_callback(void *notifier); int liblustre_wait_event(int timeout); -void *liblustre_register_idle_callback(const char *name, +void *liblustre_register_idle_callback(const char *name, int (*fn)(void *arg), void *arg); void liblustre_deregister_idle_callback(void *notifier); void liblustre_wait_idle(void); @@ -484,10 +485,10 @@ void posix_acl_release(struct posix_acl *acl) } #ifdef LIBLUSTRE_POSIX_ACL -# ifndef posix_acl_xattr_entry +# ifndef posix_acl_xattr_entry # define posix_acl_xattr_entry xattr_acl_entry # endif -# ifndef posix_acl_xattr_header +# ifndef posix_acl_xattr_header # define posix_acl_xattr_header xattr_acl_header # endif # ifndef posix_acl_xattr_size diff --git a/lustre/include/linux/lustre_acl.h b/lustre/include/linux/lustre_acl.h index f5d07a5..cfdc247 100644 --- a/lustre/include/linux/lustre_acl.h +++ b/lustre/include/linux/lustre_acl.h @@ -43,7 +43,7 @@ #define _LUSTRE_LINUX_ACL_H #ifndef _LUSTRE_ACL_H -#error Should not include directly. 
use #include instead #endif #ifdef __KERNEL__ @@ -76,7 +76,11 @@ # define LUSTRE_POSIX_ACL_MAX_ENTRIES (32) +#ifdef __KERNEL__ # define LUSTRE_POSIX_ACL_MAX_SIZE XATTR_ACL_SIZE +#else +# define LUSTRE_POSIX_ACL_MAX_SIZE 0 +#endif # else /* CONFIG_FS_POSIX_ACL */ # define LUSTRE_POSIX_ACL_MAX_SIZE 0 diff --git a/lustre/include/linux/lustre_compat25.h b/lustre/include/linux/lustre_compat25.h index 4bd7b0c..13c0385 100644 --- a/lustre/include/linux/lustre_compat25.h +++ b/lustre/include/linux/lustre_compat25.h @@ -339,7 +339,7 @@ int filemap_fdatawrite_range(struct address_space *mapping, #endif #ifdef HAVE_VFS_KERN_MOUNT -static inline +static inline struct vfsmount * ll_kern_mount(const char *fstype, int flags, const char *name, void *data) { @@ -355,45 +355,6 @@ ll_kern_mount(const char *fstype, int flags, const char *name, void *data) #define ll_kern_mount(fstype, flags, name, data) do_kern_mount((fstype), (flags), (name), (data)) #endif -#ifndef HAVE_GENERIC_FILE_READ -static inline -ssize_t -generic_file_read(struct file *filp, char __user *buf, size_t len, loff_t *ppos) -{ - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; - kiocb.ki_left = len; - - ret = generic_file_aio_read(&kiocb, &iov, 1, kiocb.ki_pos); - *ppos = kiocb.ki_pos; - return ret; -} -#endif - -#ifndef HAVE_GENERIC_FILE_WRITE -static inline -ssize_t -generic_file_write(struct file *filp, const char __user *buf, size_t len, loff_t *ppos) -{ - struct iovec iov = { .iov_base = (void __user *)buf, .iov_len = len }; - struct kiocb kiocb; - ssize_t ret; - - init_sync_kiocb(&kiocb, filp); - kiocb.ki_pos = *ppos; - kiocb.ki_left = len; - - ret = generic_file_aio_write(&kiocb, &iov, 1, kiocb.ki_pos); - *ppos = kiocb.ki_pos; - - return ret; -} -#endif - #ifdef HAVE_STATFS_DENTRY_PARAM #define ll_do_statfs(sb, sfs) (sb)->s_op->statfs((sb)->s_root, (sfs)) #else @@ -426,7 +387,7 @@ static inline u32 get_sb_time_gran(struct super_block *sb) #ifdef HAVE_UNREGISTER_BLKDEV_RETURN_INT #define ll_unregister_blkdev(a,b) unregister_blkdev((a),(b)) #else -static inline +static inline int ll_unregister_blkdev(unsigned int dev, const char *name) { unregister_blkdev(dev, name); @@ -540,8 +501,31 @@ struct blkcipher_desc { #define ll_crypto_blkcipher_encrypt_iv(desc, dst, src, bytes) \ crypto_cipher_encrypt_iv((desc)->tfm, dst, src, bytes, (desc)->info) -extern struct ll_crypto_cipher *ll_crypto_alloc_blkcipher( - const char * algname, u32 type, u32 mask); +static inline +struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char * algname, + u32 type, u32 mask) +{ + char buf[CRYPTO_MAX_ALG_NAME + 1]; + const char *pan = algname; + u32 flag = 0; + + if (strncmp("cbc(", algname, 4) == 0) + flag |= CRYPTO_TFM_MODE_CBC; + else if (strncmp("ecb(", algname, 4) == 0) + flag |= CRYPTO_TFM_MODE_ECB; + if (flag) { + char *vp = strnchr(algname, CRYPTO_MAX_ALG_NAME, ')'); + if (vp) { + memcpy(buf, algname + 4, vp - algname - 4); + buf[vp - algname - 4] = '\0'; + pan = buf; + } else { + flag = 0; + } + } + return crypto_alloc_tfm(pan, flag); +} + static inline struct ll_crypto_hash *ll_crypto_alloc_hash(const char *alg, u32 type, u32 mask) { @@ -568,10 +552,10 @@ static inline int ll_crypto_hash_update(struct hash_desc *desc, { struct scatterlist *sl = sg; unsigned int count; - /* + /* * This approach is fragile: the caller must ensure that * the sum of sg[0..i]->length is not greater than nbytes. 
- * In the upstream kernel the crypto_hash_update() also + * In the upstream kernel, crypto_hash_update() also * derives the number of sg[] entries from nbytes. * The old, safer style is gone. */ @@ -617,7 +601,7 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm, #define ll_vfs_mknod(dir,entry,mnt,mode,dev) \ vfs_mknod(dir,entry,mnt,mode,dev) #define ll_security_inode_unlink(dir,entry,mnt) \ - security_inode_unlink(dir,entry,mnt) + security_inode_unlink(dir,entry,mnt) #define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \ vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) #else @@ -627,7 +611,7 @@ static inline int ll_crypto_hmac(struct crypto_tfm *tfm, #define ll_vfs_link(old,mnt,dir,new,mnt1) vfs_link(old,dir,new) #define ll_vfs_unlink(inode,entry,mnt) vfs_unlink(inode,entry) #define ll_vfs_mknod(dir,entry,mnt,mode,dev) vfs_mknod(dir,entry,mode,dev) -#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry) +#define ll_security_inode_unlink(dir,entry,mnt) security_inode_unlink(dir,entry) #define ll_vfs_rename(old,old_dir,mnt,new,new_dir,mnt1) \ vfs_rename(old,old_dir,new,new_dir) #endif diff --git a/lustre/include/linux/lustre_fsfilt.h b/lustre/include/linux/lustre_fsfilt.h index 2996e36..b544341 100644 --- a/lustre/include/linux/lustre_fsfilt.h +++ b/lustre/include/linux/lustre_fsfilt.h @@ -113,12 +113,12 @@ struct fsfilt_operations { int (* fs_read_record)(struct file *, void *, int size, loff_t *); int (* fs_setup)(struct super_block *sb); int (* fs_get_op_len)(int, struct fsfilt_objinfo *, int); - int (* fs_quotactl)(struct super_block *sb, - struct obd_quotactl *oqctl); int (* fs_quotacheck)(struct super_block *sb, struct obd_quotactl *oqctl); __u64 (* fs_get_version) (struct inode *inode); __u64 (* fs_set_version) (struct inode *inode, __u64 new_version); + int (* fs_quotactl)(struct super_block *sb, + struct obd_quotactl *oqctl); int (* fs_quotainfo)(struct lustre_quota_info *lqi, int type, int cmd); int (* fs_qids)(struct file *file, struct inode *inode, int type, @@ -167,18 +167,21 @@ static inline lvfs_sbdev_type fsfilt_journal_sbdev(struct obd_device *obd, return (lvfs_sbdev_type)0; } -#define FSFILT_OP_UNLINK 1 -#define FSFILT_OP_RMDIR 2 -#define FSFILT_OP_RENAME 3 -#define FSFILT_OP_CREATE 4 -#define FSFILT_OP_MKDIR 5 -#define FSFILT_OP_SYMLINK 6 -#define FSFILT_OP_MKNOD 7 -#define FSFILT_OP_SETATTR 8 -#define FSFILT_OP_LINK 9 -#define FSFILT_OP_CANCEL_UNLINK 10 -#define FSFILT_OP_JOIN 11 -#define FSFILT_OP_NOOP 15 +#define FSFILT_OP_UNLINK 1 +#define FSFILT_OP_RMDIR 2 +#define FSFILT_OP_RENAME 3 +#define FSFILT_OP_CREATE 4 +#define FSFILT_OP_MKDIR 5 +#define FSFILT_OP_SYMLINK 6 +#define FSFILT_OP_MKNOD 7 +#define FSFILT_OP_SETATTR 8 +#define FSFILT_OP_LINK 9 +#define FSFILT_OP_CANCEL_UNLINK 10 +#define FSFILT_OP_JOIN 11 +#define FSFILT_OP_NOOP 15 +#define FSFILT_OP_UNLINK_PARTIAL_CHILD 21 +#define FSFILT_OP_UNLINK_PARTIAL_PARENT 22 +#define FSFILT_OP_CREATE_PARTIAL_CHILD 23 #define __fsfilt_check_slow(obd, start, msg) \ do { \ diff --git a/lustre/include/linux/lustre_user.h b/lustre/include/linux/lustre_user.h index b44679e..da302bc 100644 --- a/lustre/include/linux/lustre_user.h +++ b/lustre/include/linux/lustre_user.h @@ -48,6 +48,9 @@ # endif #else # include + # if LINUX_VERSION_CODE < KERNEL_VERSION(2,4,21) + # define NEED_QUOTA_DEFS + # endif # ifdef HAVE_QUOTA_SUPPORT # include # endif diff --git a/lustre/include/linux/lvfs.h b/lustre/include/linux/lvfs.h index 26959b5..17576c3 100644 --- a/lustre/include/linux/lvfs.h +++ 
b/lustre/include/linux/lvfs.h @@ -96,7 +96,7 @@ struct lvfs_run_ctxt { #ifdef __KERNEL__ struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, - char *name, int mode, int fix); + const char *name, int mode, int fix); struct dentry *simple_mknod(struct dentry *dir, char *name, int mode, int fix); int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname, char *newname); diff --git a/lustre/include/linux/obd_support.h b/lustre/include/linux/obd_support.h index ccb001d..2e21b08 100644 --- a/lustre/include/linux/obd_support.h +++ b/lustre/include/linux/obd_support.h @@ -45,9 +45,12 @@ #ifndef AUTOCONF_INCLUDED #include #endif +#include +#include #include #include #include +#include #endif #include #include diff --git a/lustre/include/lprocfs_status.h b/lustre/include/lprocfs_status.h index 8ef613e..7763498 100644 --- a/lustre/include/lprocfs_status.h +++ b/lustre/include/lprocfs_status.h @@ -222,7 +222,7 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); -} else if (opc < FLD_LAST_OPC) { + } else if (opc < FLD_LAST_OPC) { /* FLD opcode */ return (opc - FLD_FIRST_OPC + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + @@ -252,6 +252,18 @@ static inline int opcode_offset(__u32 opc) { (LDLM_LAST_OPC - LDLM_FIRST_OPC) + (MDS_LAST_OPC - MDS_FIRST_OPC) + (OST_LAST_OPC - OST_FIRST_OPC)); + } else if (opc < QUOTA_LAST_OPC) { + /* LQUOTA Opcode */ + return (opc - QUOTA_FIRST_OPC + + (SEC_LAST_OPC - SEC_FIRST_OPC) + + (SEQ_LAST_OPC - SEQ_FIRST_OPC) + + (FLD_LAST_OPC - FLD_FIRST_OPC) + + (LLOG_LAST_OPC - LLOG_FIRST_OPC) + + (OBD_LAST_OPC - OBD_FIRST_OPC) + + (MGS_LAST_OPC - MGS_FIRST_OPC) + + (LDLM_LAST_OPC - LDLM_FIRST_OPC) + + (MDS_LAST_OPC - MDS_FIRST_OPC) + + (OST_LAST_OPC - OST_FIRST_OPC)); } else { /* Unknown Opcode */ return -1; @@ -266,7 +278,8 @@ static inline int opcode_offset(__u32 opc) { (SEQ_LAST_OPC - SEQ_FIRST_OPC) + \ (MGS_LAST_OPC - MGS_FIRST_OPC) + \ (LLOG_LAST_OPC - LLOG_FIRST_OPC) + \ - (SEC_LAST_OPC - SEC_FIRST_OPC)) + (SEC_LAST_OPC - SEC_FIRST_OPC) + \ + (QUOTA_LAST_OPC - QUOTA_FIRST_OPC)) #define EXTRA_MAX_OPCODES ((PTLRPC_LAST_CNTR - PTLRPC_FIRST_CNTR) + \ (EXTRA_LAST_OPC - EXTRA_FIRST_OPC)) @@ -288,12 +301,13 @@ enum { LDLM_EXTENT_ENQUEUE, LDLM_FLOCK_ENQUEUE, LDLM_IBITS_ENQUEUE, + MDS_REINT_SETATTR, MDS_REINT_CREATE, MDS_REINT_LINK, - MDS_REINT_OPEN, - MDS_REINT_SETATTR, - MDS_REINT_RENAME, MDS_REINT_UNLINK, + MDS_REINT_RENAME, + MDS_REINT_OPEN, + MDS_REINT_SETXATTR, BRW_READ_BYTES, BRW_WRITE_BYTES, EXTRA_LAST_OPC @@ -617,6 +631,56 @@ int lprocfs_obd_rd_recovery_maxtime(char *page, char **start, off_t off, /* lprocfs_status.c: write recovery max time bz13079 */ int lprocfs_obd_wr_recovery_maxtime(struct file *file, const char *buffer, unsigned long count, void *data); + +/* all quota proc functions */ +extern int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_btune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int 
lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_itune(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count, + int *eof, void *data); +extern int lprocfs_quota_wr_type(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_switch_qs(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_boundary_factor(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_bunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_least_iunit(struct file *file, const char *buffer, + unsigned long count, void *data); +extern int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off, + int count, int *eof, void *data); +extern int lprocfs_quota_wr_qs_factor(struct file *file, const char *buffer, + unsigned long count, void *data); #else /* LPROCFS is not defined */ static inline void lprocfs_counter_add(struct lprocfs_stats *stats, @@ -651,7 +715,7 @@ static inline void lprocfs_init_ops_stats(int num_private_stats, static inline void lprocfs_init_ldlm_stats(struct lprocfs_stats *ldlm_stats) { return; } static inline int lprocfs_alloc_obd_stats(struct obd_device *obddev, - unsigned int num_private_stats) + unsigned int num_private_stats) { return 0; } static inline int lprocfs_alloc_md_stats(struct obd_device *obddev, unsigned int num_private_stats) @@ -663,7 +727,7 @@ struct obd_export; static inline int lprocfs_add_clear_entry(struct obd_export *exp) { return 0; } static inline int lprocfs_exp_setup(struct obd_export *exp, - lnet_nid_t *peer_nid, int *newnid) + lnet_nid_t *peer_nid, int *newnid) { return 0; } static inline int lprocfs_exp_cleanup(struct obd_export *exp) { return 0; } diff --git a/lustre/include/lu_object.h b/lustre/include/lu_object.h index 9dd557f..1b00b02 100644 --- a/lustre/include/lu_object.h +++ b/lustre/include/lu_object.h @@ -143,7 +143,7 @@ struct lu_device_operations { * repeatedly, until no new objects are created. * * \post ergo(!IS_ERR(result), result->lo_dev == d && - * result->lo_ops != NULL); + * result->lo_ops != NULL); */ struct lu_object *(*ldo_object_alloc)(const struct lu_env *env, const struct lu_object_header *h, @@ -156,6 +156,16 @@ struct lu_device_operations { int (*ldo_recovery_complete)(const struct lu_env *, struct lu_device *); + /** + * initialize local objects for device. 
This method is called after the layer has + * been initialized (after the LCFG_SETUP stage) and before it starts serving + * user requests. + */ + + int (*ldo_prepare)(const struct lu_env *, + struct lu_device *parent, + struct lu_device *dev); + }; /** @@ -177,7 +187,7 @@ typedef int (*lu_printer_t)(const struct lu_env *env, void *cookie, const char *format, ...) __attribute__ ((format (printf, 3, 4))); -/* +/** * Operations specific to a particular lu_object. */ struct lu_object_operations { @@ -247,7 +257,7 @@ struct lu_device { * * \todo XXX which means that atomic_t is probably too small. */ - atomic_t ld_ref; + atomic_t ld_ref; /** * Pointer to device type. Never modified once set. */ @@ -259,11 +269,11 @@ struct lu_device { /** * Stack this device belongs to. */ - struct lu_site *ld_site; - struct proc_dir_entry *ld_proc_entry; + struct lu_site *ld_site; + struct proc_dir_entry *ld_proc_entry; /** \todo XXX: temporary back pointer into obd. */ - struct obd_device *ld_obd; + struct obd_device *ld_obd; /** * A list of references to this object, for debugging. */ @@ -292,11 +302,11 @@ struct lu_device_type { /** * Tag bits. Taken from enum lu_device_tag. Never modified once set. */ - __u32 ldt_tags; + __u32 ldt_tags; /** * Name of this class. Unique system-wide. Never modified once set. */ - char *ldt_name; + char *ldt_name; /** * Operations for this type. */ @@ -304,11 +314,11 @@ struct lu_device_type { /** * \todo XXX: temporary pointer to associated obd_type. */ - struct obd_type *ldt_obd_type; + struct obd_type *ldt_obd_type; /** * \todo XXX: temporary: context tags used by obd_*() calls. */ - __u32 ldt_ctx_tags; + __u32 ldt_ctx_tags; /** * Number of existing device type instances. */ @@ -437,34 +447,34 @@ enum la_valid { LA_BLKSIZE = 1 << 12, }; -/* +/** * Layer in the layered object. */ struct lu_object { - /* + /** * Header for this object. */ - struct lu_object_header *lo_header; - /* + struct lu_object_header *lo_header; + /** * Device for this layer. */ - struct lu_device *lo_dev; - /* + struct lu_device *lo_dev; + /** * Operations for this object. */ const struct lu_object_operations *lo_ops; - /* + /** * Linkage into list of all layers. */ - struct list_head lo_linkage; - /* + struct list_head lo_linkage; + /** * Depth. Top level layer depth is 0. */ - int lo_depth; - /* + int lo_depth; + /** * Flags from enum lu_object_flags. */ - unsigned long lo_flags; + unsigned long lo_flags; /** * Link to the device, for debugging. */ @@ -472,7 +482,7 @@ struct lu_object { }; enum lu_object_header_flags { - /* + /** * Don't keep this object in cache. Object will be destroyed as soon * as last reference to it is released. This flag cannot be cleared * once set. @@ -483,14 +493,14 @@ enum lu_object_header_flags { enum lu_object_header_attr { LOHA_EXISTS = 1 << 0, LOHA_REMOTE = 1 << 1, - /* + /** * UNIX file type is stored in S_IFMT bits. */ - LOHA_FT_START = 1 << 12, /* S_IFIFO */ - LOHA_FT_END = 1 << 15, /* S_IFREG */ + LOHA_FT_START = 1 << 12, /**< S_IFIFO */ + LOHA_FT_END = 1 << 15, /**< S_IFREG */ }; -/* +/** * "Compound" object, consisting of multiple layers. * * Compound object with given fid is unique with given lu_site. @@ -506,33 +516,33 @@ struct lu_object_header { * Object flags from enum lu_object_header_flags. Set and checked * atomically. */ - unsigned long loh_flags; + unsigned long loh_flags; /** * Object reference count. Protected by lu_site::ls_guard. */ - atomic_t loh_ref; + atomic_t loh_ref; /** * Fid, uniquely identifying this object. 
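To make the new ldo_prepare() hook concrete: a caller walks the stack top-down after configuration, handing each layer its parent. The sketch below is illustrative only and is not part of the patch; next_layer() is an invented stand-in for however a given stack links its devices, and the usual ld_ops operations pointer on struct lu_device is assumed.

static int stack_prepare(const struct lu_env *env, struct lu_device *top)
{
        struct lu_device *parent = NULL;
        struct lu_device *scan;
        int rc = 0;

        /* visit layers from the top of the stack downwards */
        for (scan = top; scan != NULL && rc == 0;
             parent = scan, scan = next_layer(scan)) {
                if (scan->ld_ops->ldo_prepare != NULL)
                        rc = scan->ld_ops->ldo_prepare(env, parent, scan);
        }
        return rc;
}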
*/ - struct lu_fid loh_fid; + struct lu_fid loh_fid; /** * Common object attributes, cached for efficiency. From enum * lu_object_header_attr. */ - __u32 loh_attr; + __u32 loh_attr; /** * Linkage into per-site hash table. Protected by lu_site::ls_guard. */ - struct hlist_node loh_hash; + struct hlist_node loh_hash; /** * Linkage into per-site LRU list. Protected by lu_site::ls_guard. */ - struct list_head loh_lru; + struct list_head loh_lru; /** * Linkage into list of layers. Never modified once set (except lately * during object destruction). No locking is necessary. */ - struct list_head loh_layers; + struct list_head loh_layers; /** * A list of references to this object, for debugging. */ @@ -608,6 +618,7 @@ struct lu_site { * Top-level device for this stack. */ struct lu_device *ls_top_dev; + /** * Wait-queue signaled when an object in this site is ultimately * destroyed (lu_object_free()). It is used by lu_object_find() to @@ -666,10 +677,10 @@ void lu_device_get (struct lu_device *d); void lu_device_put (struct lu_device *d); int lu_device_init (struct lu_device *d, struct lu_device_type *t); void lu_device_fini (struct lu_device *d); -int lu_object_header_init(struct lu_object_header *h); +int lu_object_header_init(struct lu_object_header *h); void lu_object_header_fini(struct lu_object_header *h); int lu_object_init (struct lu_object *o, - struct lu_object_header *h, struct lu_device *d); + struct lu_object_header *h, struct lu_device *d); void lu_object_fini (struct lu_object *o); void lu_object_add_top (struct lu_object_header *h, struct lu_object *o); void lu_object_add (struct lu_object *before, struct lu_object *o); @@ -801,20 +812,20 @@ int lu_cdebug_printer(const struct lu_env *env, * Print object description followed by a user-supplied message. */ #define LU_OBJECT_DEBUG(mask, env, object, format, ...) \ -({ \ +do { \ static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask); \ \ if (cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ - lu_object_print(env, &__info, lu_cdebug_printer, object); \ - CDEBUG(mask, format , ## __VA_ARGS__); \ + lu_object_print(env, &__info, lu_cdebug_printer, object); \ + CDEBUG(mask, format , ## __VA_ARGS__); \ } \ -}) +} while (0) /** * Print short object description followed by a user-supplied message. */ #define LU_OBJECT_HEADER(mask, env, object, format, ...) \ -({ \ +do { \ static DECLARE_LU_CDEBUG_PRINT_INFO(__info, mask); \ \ if (cdebug_show(mask, DEBUG_SUBSYSTEM)) { \ @@ -823,10 +834,10 @@ int lu_cdebug_printer(const struct lu_env *env, lu_cdebug_printer(env, &__info, "\n"); \ CDEBUG(mask, format , ## __VA_ARGS__); \ } \ -}) +} while (0) void lu_object_print (const struct lu_env *env, void *cookie, - lu_printer_t printer, const struct lu_object *o); + lu_printer_t printer, const struct lu_object *o); void lu_object_header_print(const struct lu_env *env, void *cookie, lu_printer_t printer, const struct lu_object_header *hdr); @@ -975,6 +986,10 @@ struct lu_context { * keys were registered. */ unsigned lc_version; + /** + * Debugging cookie. + */ + unsigned lc_cookie; }; /** @@ -1167,50 +1182,52 @@ void lu_context_key_revive (struct lu_context_key *key); #define LU_KEY_INIT_GENERIC(mod) \ static void mod##_key_init_generic(struct lu_context_key *k, ...) 
\ - { \ + { \ struct lu_context_key *key = k; \ - va_list args; \ - \ - va_start(args, k); \ - do { \ - LU_CONTEXT_KEY_INIT(key); \ + va_list args; \ + \ + va_start(args, k); \ + do { \ + LU_CONTEXT_KEY_INIT(key); \ key = va_arg(args, struct lu_context_key *); \ - } while (key != NULL); \ - va_end(args); \ + } while (key != NULL); \ + va_end(args); \ } -#define LU_TYPE_INIT(mod, ...) \ +#define LU_TYPE_INIT(mod, ...) \ LU_KEY_INIT_GENERIC(mod) \ - static int mod##_type_init(struct lu_device_type *t) \ - { \ + static int mod##_type_init(struct lu_device_type *t) \ + { \ mod##_key_init_generic(__VA_ARGS__, NULL); \ return lu_context_key_register_many(__VA_ARGS__, NULL); \ - } \ + } \ struct __##mod##_dummy_type_init {;} -#define LU_TYPE_FINI(mod, ...) \ - static void mod##_type_fini(struct lu_device_type *t) \ - { \ +#define LU_TYPE_FINI(mod, ...) \ + static void mod##_type_fini(struct lu_device_type *t) \ + { \ lu_context_key_degister_many(__VA_ARGS__, NULL); \ - } \ + } \ struct __##mod##_dummy_type_fini {;} #define LU_TYPE_START(mod, ...) \ static void mod##_type_start(struct lu_device_type *t) \ { \ + lu_context_key_revive_many(__VA_ARGS__, NULL); \ } \ struct __##mod##_dummy_type_start {;} #define LU_TYPE_STOP(mod, ...) \ static void mod##_type_stop(struct lu_device_type *t) \ { \ + lu_context_key_quiesce_many(__VA_ARGS__, NULL); \ } \ struct __##mod##_dummy_type_stop {;} -#define LU_TYPE_INIT_FINI(mod, ...) \ - LU_TYPE_INIT(mod, __VA_ARGS__); \ +#define LU_TYPE_INIT_FINI(mod, ...) \ + LU_TYPE_INIT(mod, __VA_ARGS__); \ LU_TYPE_FINI(mod, __VA_ARGS__); \ LU_TYPE_START(mod, __VA_ARGS__); \ LU_TYPE_STOP(mod, __VA_ARGS__) @@ -1245,8 +1262,9 @@ struct lu_env { struct lu_context *le_ses; }; -int lu_env_init(struct lu_env *env, struct lu_context *ses, __u32 tags); -void lu_env_fini(struct lu_env *env); +int lu_env_init (struct lu_env *env, __u32 tags); +void lu_env_fini (struct lu_env *env); +int lu_env_refill(struct lu_env *env); /** @} lu_context */ @@ -1260,8 +1278,8 @@ int lu_site_stats_print(const struct lu_site *s, char *page, int count); * Common name structure to be passed around for various name related methods. 
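Note that lu_env_init() above no longer takes an explicit session context; callers pass only the context tags. A minimal usage sketch, not part of the patch (foo_with_env is an invented name; LCT_MD_THREAD is assumed to be one of the lu_context_tag values defined for this interface):

static int foo_with_env(void)
{
        struct lu_env env;
        int rc;

        rc = lu_env_init(&env, LCT_MD_THREAD); /* session handled internally */
        if (rc != 0)
                return rc;
        /* ... use env; call lu_env_refill(&env) if new keys were registered ... */
        lu_env_fini(&env);
        return 0;
}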
*/ struct lu_name { - char *ln_name; - int ln_namelen; + const char *ln_name; + int ln_namelen; }; /** @@ -1312,5 +1330,4 @@ int lu_kmem_init(struct lu_kmem_descr *caches); void lu_kmem_fini(struct lu_kmem_descr *caches); /** @} lu */ - #endif /* __LUSTRE_LU_OBJECT_H */ diff --git a/lustre/include/lustre/liblustreapi.h b/lustre/include/lustre/liblustreapi.h index 8717532..3f4fd1e 100644 --- a/lustre/include/lustre/liblustreapi.h +++ b/lustre/include/lustre/liblustreapi.h @@ -84,7 +84,7 @@ extern int llapi_poollist(char *name); extern int llapi_file_get_stripe(const char *path, struct lov_user_md *lum); #define HAVE_LLAPI_FILE_LOOKUP extern int llapi_file_lookup(int dirfd, const char *name); - + struct find_param { unsigned int maxdepth; time_t atime; @@ -151,8 +151,9 @@ extern int llapi_file_get_lov_uuid(const char *path, struct obd_uuid *lov_uuid); extern int llapi_file_fget_lov_uuid(int fd, struct obd_uuid *lov_uuid); extern int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count); extern int llapi_is_lustre_mnttype(const char *type); +extern int llapi_get_obd_count(char *mnt, int *count, int is_mdt); extern int parse_size(char *optarg, unsigned long long *size, - unsigned long long *size_units); + unsigned long long *size_units, int bytes_spec); extern int llapi_path2fid(const char *path, unsigned long long *seq, unsigned long *oid, unsigned long *ver); diff --git a/lustre/include/lustre/lustre_idl.h b/lustre/include/lustre/lustre_idl.h index 21bc557..48bbc4b 100644 --- a/lustre/include/lustre/lustre_idl.h +++ b/lustre/include/lustre/lustre_idl.h @@ -168,59 +168,85 @@ #define LUSTRE_LOG_VERSION 0x00050000 #define LUSTRE_MGS_VERSION 0x00060000 -typedef __u64 mdsno_t; +typedef __u32 mdsno_t; typedef __u64 seqno_t; -struct lu_range { - __u64 lr_start; - __u64 lr_end; +/** + * Describes a range of sequence numbers: lsr_start is included in the + * range but lsr_end is not. + * The same structure is used in the fld module, where the lsr_mdt field + * holds the mdt id of the home mdt. + */ + +struct lu_seq_range { + __u64 lsr_start; + __u64 lsr_end; + __u32 lsr_mdt; + __u32 lsr_padding; }; -static inline __u64 range_space(struct lu_range *r) +/** + * returns the width of the given range \a range + */ + +static inline __u64 range_space(const struct lu_seq_range *range) { - return r->lr_end - r->lr_start; + return range->lsr_end - range->lsr_start; } -static inline void range_zero(struct lu_range *r) +/** + * initialize the range to zero + */ + +static inline void range_init(struct lu_seq_range *range) { - r->lr_start = r->lr_end = 0; + range->lsr_start = range->lsr_end = range->lsr_mdt = 0; } -static inline int range_within(struct lu_range *r, +/** + * check if the given seq id \a s is within the given range \a range + */ + +static inline int range_within(const struct lu_seq_range *range, __u64 s) { - return s >= r->lr_start && s < r->lr_end; + return s >= range->lsr_start && s < range->lsr_end; } -static inline void range_alloc(struct lu_range *r, - struct lu_range *s, - __u64 w) +/** + * allocate \a width units of sequence from range \a from. 
+ */ +static inline void range_alloc(struct lu_seq_range *to, + struct lu_seq_range *from, + __u64 width) { - r->lr_start = s->lr_start; - r->lr_end = s->lr_start + w; - s->lr_start += w; + to->lsr_start = from->lsr_start; + to->lsr_end = from->lsr_start + width; + from->lsr_start += width; } -static inline int range_is_sane(struct lu_range *r) +static inline int range_is_sane(const struct lu_seq_range *range) { - return (r->lr_end >= r->lr_start); + return (range->lsr_end >= range->lsr_start); } -static inline int range_is_zero(struct lu_range *r) +static inline int range_is_zero(const struct lu_seq_range *range) { - return (r->lr_start == 0 && r->lr_end == 0); + return (range->lsr_start == 0 && range->lsr_end == 0); } -static inline int range_is_exhausted(struct lu_range *r) +static inline int range_is_exhausted(const struct lu_seq_range *range) + { - return range_space(r) == 0; + return range_space(range) == 0; } -#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x]" +#define DRANGE "[%#16.16"LPF64"x-%#16.16"LPF64"x):%x" #define PRANGE(range) \ - (range)->lr_start, \ - (range)->lr_end + (range)->lsr_start, \ + (range)->lsr_end, \ + (range)->lsr_mdt /** \defgroup lu_fid lu_fid * @{ */ @@ -254,10 +280,24 @@ struct lu_fid { }; /** + * The following struct holds MDT attributes that will be kept in the inode's EA. + * Introduced in the 2.0 release (please see b15993 for details). + */ +struct lustre_mdt_attrs { + /** FID of this inode */ + struct lu_fid lma_self_fid; + /** SOM state, mdt/ost type, others */ + __u64 lma_flags; + /** total sectors in objects */ + __u64 lma_som_sectors; +}; + + +/** * fid constants */ enum { - /* initial fid id value */ + /** initial fid id value */ LUSTRE_FID_INIT_OID = 1UL }; @@ -293,7 +333,7 @@ static inline void fid_zero(struct lu_fid *fid) /** * Check if a fid is igif or not. * \param fid the fid to be tested. - * \return true if the fid is a igif; otherwise false. + * \return true if the fid is an igif; otherwise false. */ static inline int fid_is_igif(const struct lu_fid *fid) { @@ -303,7 +343,7 @@ static inline int fid_is_igif(const struct lu_fid *fid) /** * Check if a fid is idif or not. * \param fid the fid to be tested. - * \return true if the fid is a idif; otherwise false. + * \return true if the fid is an idif; otherwise false. */ static inline int fid_is_idif(const struct lu_fid *fid) { @@ -324,7 +364,7 @@ static inline ino_t lu_igif_ino(const struct lu_fid *fid) * Get inode generation from an igif. * \param fid an igif to get inode generation from. * \return inode generation for the igif. - */ + */ static inline __u32 lu_igif_gen(const struct lu_fid *fid) { return fid_oid(fid); @@ -390,7 +430,6 @@ static inline void fid_be_to_cpu(struct lu_fid *dst, const struct lu_fid *src) * * Variable size, first byte contains the length of the whole record. 
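A small illustration of the half-open [lsr_start, lsr_end) convention implemented by the range helpers above; this sketch is not part of the patch and the constants are invented:

static void lu_seq_range_sketch(void)
{
        struct lu_seq_range parent = { .lsr_start = 0x400, .lsr_end = 0x10000 };
        struct lu_seq_range child;

        range_init(&child);
        range_alloc(&child, &parent, 0x400);
        /* child is [0x400, 0x800); parent has shrunk to [0x800, 0x10000) */
        LASSERT(range_is_sane(&child));
        LASSERT(range_within(&child, child.lsr_start));
        LASSERT(!range_is_exhausted(&parent));
}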
*/ - struct lu_fid_pack { char fp_len; char fp_area[sizeof(struct lu_fid)]; @@ -415,7 +454,7 @@ static inline int fid_is_zero(const struct lu_fid *fid) } extern void lustre_swab_lu_fid(struct lu_fid *fid); -extern void lustre_swab_lu_range(struct lu_range *range); +extern void lustre_swab_lu_seq_range(struct lu_seq_range *range); static inline int lu_fid_eq(const struct lu_fid *f0, const struct lu_fid *f1) @@ -639,8 +678,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); #define OBD_CONNECT_JOIN 0x00002000ULL /* files can be concatenated */ #define OBD_CONNECT_ATTRFID 0x00004000ULL /* Server supports GetAttr By Fid */ #define OBD_CONNECT_NODEVOH 0x00008000ULL /* No open handle for special nodes */ -#define OBD_CONNECT_LCL_CLIENT 0x00010000ULL /* local 1.8 client */ -#define OBD_CONNECT_RMT_CLIENT 0x00020000ULL /* Remote 1.8 client */ +#define OBD_CONNECT_RMT_CLIENT 0x00010000ULL /* Remote client */ +#define OBD_CONNECT_RMT_CLIENT_FORCE 0x00020000ULL /* Remote client by force */ #define OBD_CONNECT_BRW_SIZE 0x00040000ULL /* Max bytes per rpc */ #define OBD_CONNECT_QUOTA64 0x00080000ULL /* 64bit qunit_data.qd_count b=10707*/ #define OBD_CONNECT_MDS_CAPA 0x00100000ULL /* MDS capability */ @@ -669,8 +708,8 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_ACL | OBD_CONNECT_XATTR | \ OBD_CONNECT_IBITS | OBD_CONNECT_JOIN | \ OBD_CONNECT_NODEVOH |/* OBD_CONNECT_ATTRFID |*/\ - OBD_CONNECT_LCL_CLIENT | \ OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | \ OBD_CONNECT_MDS_MDS | OBD_CONNECT_CANCELSET | \ OBD_CONNECT_FID | \ @@ -682,7 +721,9 @@ extern void lustre_swab_ptlrpc_body(struct ptlrpc_body *pb); OBD_CONNECT_BRW_SIZE | OBD_CONNECT_QUOTA64 | \ OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET | \ OBD_CONNECT_CKSUM | LRU_RESIZE_CONNECT_FLAG | \ - OBD_CONNECT_AT) + OBD_CONNECT_AT | OBD_CONNECT_CHANGE_QS | \ + OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE) #define ECHO_CONNECT_SUPPORTED (0) #define MGS_CONNECT_SUPPORTED (OBD_CONNECT_VERSION | OBD_CONNECT_AT) @@ -752,6 +793,7 @@ typedef enum { OST_SET_INFO = 17, OST_QUOTACHECK = 18, OST_QUOTACTL = 19, + OST_QUOTA_ADJUST_QUNIT = 20, OST_LAST_OPC } ost_cmd_t; #define OST_FIRST_OPC OST_REPLY @@ -894,6 +936,8 @@ struct lov_mds_md_v3 { /* LOV EA mds/wire data (little-endian) */ #define OBD_MD_FLCKSPLIT (0x0000080000000000ULL) /* Check split on server */ #define OBD_MD_FLCROSSREF (0x0000100000000000ULL) /* Cross-ref case */ +#define OBD_FL_TRUNC (0x0000200000000000ULL) /* for filter_truncate */ + #define OBD_MD_FLRMTLSETFACL (0x0001000000000000ULL) /* lfs lsetfacl case */ #define OBD_MD_FLRMTLGETFACL (0x0002000000000000ULL) /* lfs lgetfacl case */ #define OBD_MD_FLRMTRSETFACL (0x0004000000000000ULL) /* lfs rsetfacl case */ @@ -947,7 +991,7 @@ extern void lustre_swab_obd_statfs (struct obd_statfs *os); #define OBD_BRW_CHECK 0x10 #define OBD_BRW_FROM_GRANT 0x20 /* the osc manages this under llite */ #define OBD_BRW_GRANTED 0x40 /* the ost manages this */ -#define OBD_BRW_DROP 0x80 /* drop the page after IO */ +#define OBD_BRW_NOCACHE 0x80 /* this page is a part of non-cached IO */ #define OBD_BRW_NOQUOTA 0x100 #define OBD_BRW_SRVLOCK 0x200 /* Client holds no lock over this page */ @@ -976,7 +1020,7 @@ extern void lustre_swab_niobuf_remote (struct niobuf_remote *nbr); /* lock value block communicated between the filter and llite */ -/* OST_LVB_ERR_INIT is needed because the return code in rc is +/* OST_LVB_ERR_INIT is needed because the return code in 
rc is * negative, i.e. because ((MASK + rc) & MASK) != MASK. */ #define OST_LVB_ERR_INIT 0xffbadbad80000000ULL #define OST_LVB_ERR_MASK 0xffbadbad00000000ULL @@ -1176,6 +1220,12 @@ struct mdt_body { __u32 max_mdsize; __u32 max_cookiesize; __u32 padding_4; /* also fix lustre_swab_mdt_body */ + __u64 padding_5; + __u64 padding_6; + __u64 padding_7; + __u64 padding_8; + __u64 padding_9; + __u64 padding_10; }; struct mds_body { @@ -1224,13 +1274,26 @@ extern void lustre_swab_mdt_epoch (struct mdt_epoch *b); #define Q_INITQUOTA 0x800101 /* init slave limits */ #define Q_GETOINFO 0x800102 /* get obd quota info */ #define Q_GETOQUOTA 0x800103 /* get obd quotas */ +#define Q_FINVALIDATE 0x800104 /* invalidate operational quotas */ -#define Q_TYPESET(oqc, type) \ - ((oqc)->qc_type == type || (oqc)->qc_type == UGQUOTA) +#define Q_TYPEMATCH(id, type) \ + ((id) == (type) || (id) == UGQUOTA) + +#define Q_TYPESET(oqc, type) Q_TYPEMATCH((oqc)->qc_type, type) #define Q_GETOCMD(oqc) \ ((oqc)->qc_cmd == Q_GETOINFO || (oqc)->qc_cmd == Q_GETOQUOTA) +#define QCTL_COPY(out, in) \ +do { \ + Q_COPY(out, in, qc_cmd); \ + Q_COPY(out, in, qc_type); \ + Q_COPY(out, in, qc_id); \ + Q_COPY(out, in, qc_stat); \ + Q_COPY(out, in, qc_dqinfo); \ + Q_COPY(out, in, qc_dqblk); \ +} while (0) + struct obd_quotactl { __u32 qc_cmd; __u32 qc_type; @@ -1242,6 +1305,34 @@ struct obd_quotactl { extern void lustre_swab_obd_quotactl(struct obd_quotactl *q); +struct quota_adjust_qunit { + __u32 qaq_flags; + __u32 qaq_id; + __u64 qaq_bunit_sz; + __u64 qaq_iunit_sz; + __u64 padding1; +}; +extern void lustre_swab_quota_adjust_qunit(struct quota_adjust_qunit *q); + +/* the flags in qunit_data and quota_adjust_qunit use the macros below */ +#define LQUOTA_FLAGS_GRP 1UL /* 0 is user, 1 is group */ +#define LQUOTA_FLAGS_BLK 2UL /* 0 is inode, 1 is block */ +#define LQUOTA_FLAGS_ADJBLK 4UL /* adjust the block qunit size */ +#define LQUOTA_FLAGS_ADJINO 8UL /* adjust the inode qunit size */ +#define LQUOTA_FLAGS_CHG_QS 16UL /* indicate whether it has the + * OBD_CONNECT_CHANGE_QS capability */ + +/* the status of lqsk_flags in struct lustre_qunit_size_key */ +#define LQUOTA_QUNIT_FLAGS (LQUOTA_FLAGS_GRP | LQUOTA_FLAGS_BLK) + +#define QAQ_IS_GRP(qaq) ((qaq)->qaq_flags & LQUOTA_FLAGS_GRP) +#define QAQ_IS_ADJBLK(qaq) ((qaq)->qaq_flags & LQUOTA_FLAGS_ADJBLK) +#define QAQ_IS_ADJINO(qaq) ((qaq)->qaq_flags & LQUOTA_FLAGS_ADJINO) + +#define QAQ_SET_GRP(qaq) ((qaq)->qaq_flags |= LQUOTA_FLAGS_GRP) +#define QAQ_SET_ADJBLK(qaq) ((qaq)->qaq_flags |= LQUOTA_FLAGS_ADJBLK) +#define QAQ_SET_ADJINO(qaq) ((qaq)->qaq_flags |= LQUOTA_FLAGS_ADJINO) + /* inode access permission for the remote user; the inode info is omitted * because the client already knows it. 
*/ struct mds_remote_perm { @@ -1257,7 +1348,8 @@ enum { CFS_SETUID_PERM = 0x01, CFS_SETGID_PERM = 0x02, CFS_SETGRP_PERM = 0x04, - CFS_RMTACL_PERM = 0x08 + CFS_RMTACL_PERM = 0x08, + CFS_RMTOWN_PERM = 0x10 }; extern void lustre_swab_mds_remote_perm(struct mds_remote_perm *p); @@ -1295,11 +1387,15 @@ extern void lustre_swab_mds_rec_setattr (struct mds_rec_setattr *sa); struct mdt_rec_setattr { __u32 sa_opcode; + __u32 sa_cap; __u32 sa_fsuid; + __u32 sa_fsuid_h; __u32 sa_fsgid; - __u32 sa_cap; + __u32 sa_fsgid_h; __u32 sa_suppgid; + __u32 sa_suppgid_h; __u32 sa_padding_1; + __u32 sa_padding_1_h; struct lu_fid sa_fid; __u64 sa_valid; __u32 sa_uid; @@ -1397,7 +1493,8 @@ enum { MDS_CROSS_REF = 1 << 1, MDS_VTX_BYPASS = 1 << 2, MDS_PERM_BYPASS = 1 << 3, - MDS_SOM = 1 << 4 + MDS_SOM = 1 << 4, + MDS_QUOTA_IGNORE = 1 << 5 }; struct mds_rec_join { @@ -1437,11 +1534,15 @@ extern void lustre_swab_mds_rec_create (struct mds_rec_create *cr); struct mdt_rec_create { __u32 cr_opcode; + __u32 cr_cap; __u32 cr_fsuid; + __u32 cr_fsuid_h; __u32 cr_fsgid; - __u32 cr_cap; + __u32 cr_fsgid_h; __u32 cr_suppgid1; + __u32 cr_suppgid1_h; __u32 cr_suppgid2; + __u32 cr_suppgid2_h; struct lu_fid cr_fid1; struct lu_fid cr_fid2; struct lustre_handle cr_old_handle; /* u64 handle in case of open replay */ @@ -1479,11 +1580,15 @@ extern void lustre_swab_mds_rec_link (struct mds_rec_link *lk); struct mdt_rec_link { __u32 lk_opcode; + __u32 lk_cap; __u32 lk_fsuid; + __u32 lk_fsuid_h; __u32 lk_fsgid; - __u32 lk_cap; + __u32 lk_fsgid_h; __u32 lk_suppgid1; + __u32 lk_suppgid1_h; __u32 lk_suppgid2; + __u32 lk_suppgid2_h; struct lu_fid lk_fid1; struct lu_fid lk_fid2; __u64 lk_time; @@ -1519,11 +1624,15 @@ extern void lustre_swab_mds_rec_unlink (struct mds_rec_unlink *ul); struct mdt_rec_unlink { __u32 ul_opcode; + __u32 ul_cap; __u32 ul_fsuid; + __u32 ul_fsuid_h; __u32 ul_fsgid; - __u32 ul_cap; + __u32 ul_fsgid_h; __u32 ul_suppgid1; + __u32 ul_suppgid1_h; __u32 ul_suppgid2; + __u32 ul_suppgid2_h; struct lu_fid ul_fid1; struct lu_fid ul_fid2; __u64 ul_time; @@ -1559,11 +1668,15 @@ extern void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn); struct mdt_rec_rename { __u32 rn_opcode; + __u32 rn_cap; __u32 rn_fsuid; + __u32 rn_fsuid_h; __u32 rn_fsgid; - __u32 rn_cap; + __u32 rn_fsgid_h; __u32 rn_suppgid1; + __u32 rn_suppgid1_h; __u32 rn_suppgid2; + __u32 rn_suppgid2_h; struct lu_fid rn_fid1; struct lu_fid rn_fid2; __u64 rn_time; @@ -1581,11 +1694,15 @@ struct mdt_rec_rename { struct mdt_rec_setxattr { __u32 sx_opcode; + __u32 sx_cap; __u32 sx_fsuid; + __u32 sx_fsuid_h; __u32 sx_fsgid; - __u32 sx_cap; + __u32 sx_fsgid_h; __u32 sx_suppgid1; + __u32 sx_suppgid1_h; __u32 sx_suppgid2; + __u32 sx_suppgid2_h; struct lu_fid sx_fid; __u64 sx_padding_1; /* These three members are lu_fid size */ __u32 sx_padding_2; @@ -1605,11 +1722,15 @@ struct mdt_rec_setxattr { struct mdt_rec_reint { __u32 rr_opcode; + __u32 rr_cap; __u32 rr_fsuid; + __u32 rr_fsuid_h; __u32 rr_fsgid; - __u32 rr_cap; + __u32 rr_fsgid_h; __u32 rr_suppgid1; + __u32 rr_suppgid1_h; __u32 rr_suppgid2; + __u32 rr_suppgid2_h; struct lu_fid rr_fid1; struct lu_fid rr_fid2; __u64 rr_mtime; @@ -1635,13 +1756,6 @@ struct lmv_desc { extern void lustre_swab_lmv_desc (struct lmv_desc *ld); -struct md_fld { - seqno_t mf_seq; - mdsno_t mf_mds; -}; - -extern void lustre_swab_md_fld (struct md_fld *mf); - enum fld_rpc_opc { FLD_QUERY = 600, FLD_LAST_OPC, @@ -1725,10 +1839,11 @@ typedef enum { LCK_CR = 16, LCK_NL = 32, LCK_GROUP = 64, + LCK_COS = 128, LCK_MAXMODE } ldlm_mode_t; -#define 
LCK_MODE_NUM 7 +#define LCK_MODE_NUM 8 typedef enum { LDLM_PLAIN = 10, @@ -1948,6 +2063,7 @@ typedef enum { OST_RAID1_REC = LLOG_OP_MAGIC | 0x01000, MDS_UNLINK_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_UNLINK, MDS_SETATTR_REC = LLOG_OP_MAGIC | 0x10000 | (MDS_REINT << 8) | REINT_SETATTR, + MDS_SETATTR64_REC= LLOG_OP_MAGIC | 0x90000 | (MDS_REINT << 8) | REINT_SETATTR, OBD_CFG_REC = LLOG_OP_MAGIC | 0x20000, PTL_CFG_REC = LLOG_OP_MAGIC | 0x30000, /* obsolete */ LLOG_GEN_REC = LLOG_OP_MAGIC | 0x40000, @@ -2044,6 +2160,18 @@ struct llog_setattr_rec { struct llog_rec_tail lsr_tail; } __attribute__((packed)); +struct llog_setattr64_rec { + struct llog_rec_hdr lsr_hdr; + obd_id lsr_oid; + obd_count lsr_ogen; + __u32 padding; + __u32 lsr_uid; + __u32 lsr_uid_h; + __u32 lsr_gid; + __u32 lsr_gid_h; + struct llog_rec_tail lsr_tail; +} __attribute__((packed)); + struct llog_size_change_rec { struct llog_rec_hdr lsc_hdr; struct ll_fid lsc_fid; @@ -2199,7 +2327,6 @@ struct obdo { extern void lustre_swab_obdo (struct obdo *o); /* request structure for OSTs */ - struct ost_body { struct obdo oa; }; @@ -2231,37 +2358,71 @@ extern void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct lustre_cfg; extern void lustre_swab_lustre_cfg(struct lustre_cfg *lcfg); -/* quota. fixed by tianzy for bug10707 */ -#define QUOTA_IS_GRP 0X1UL /* 0 is user, 1 is group. Used by qd_flags*/ -#define QUOTA_IS_BLOCK 0x2UL /* 0 is inode, 1 is block. Used by qd_flags*/ - +/* this will be used when OBD_CONNECT_CHANGE_QS is set */ struct qunit_data { - __u32 qd_id; /* ID appiles to (uid, gid) */ - __u32 qd_flags; /* Quota type (USRQUOTA, GRPQUOTA) occupy one bit; - * Block quota or file quota occupy one bit */ - __u64 qd_count; /* acquire/release count (bytes for block quota) */ + /** + * ID applies to (uid, gid) + */ + __u32 qd_id; + /** + * LQUOTA_FLAGS_* affect the corresponding bits + */ + __u32 qd_flags; + /** + * acquire/release count (bytes for block quota) + */ + __u64 qd_count; + /** + * when a master returns the reply to a slave, it will + * contain the current corresponding qunit size + */ + __u64 qd_qunit; + __u64 padding; }; -struct qunit_data_old { - __u32 qd_id; /* ID appiles to (uid, gid) */ - __u32 qd_type; /* Quota type (USRQUOTA, GRPQUOTA) */ - __u32 qd_count; /* acquire/release count (bytes for block quota) */ - __u32 qd_isblk; /* Block quota or file quota */ -}; +#define QDATA_IS_GRP(qdata) ((qdata)->qd_flags & LQUOTA_FLAGS_GRP) +#define QDATA_IS_BLK(qdata) ((qdata)->qd_flags & LQUOTA_FLAGS_BLK) +#define QDATA_IS_ADJBLK(qdata) ((qdata)->qd_flags & LQUOTA_FLAGS_ADJBLK) +#define QDATA_IS_ADJINO(qdata) ((qdata)->qd_flags & LQUOTA_FLAGS_ADJINO) +#define QDATA_IS_CHANGE_QS(qdata) ((qdata)->qd_flags & LQUOTA_FLAGS_CHG_QS) + +#define QDATA_SET_GRP(qdata) ((qdata)->qd_flags |= LQUOTA_FLAGS_GRP) +#define QDATA_SET_BLK(qdata) ((qdata)->qd_flags |= LQUOTA_FLAGS_BLK) +#define QDATA_SET_ADJBLK(qdata) ((qdata)->qd_flags |= LQUOTA_FLAGS_ADJBLK) +#define QDATA_SET_ADJINO(qdata) ((qdata)->qd_flags |= LQUOTA_FLAGS_ADJINO) +#define QDATA_SET_CHANGE_QS(qdata) ((qdata)->qd_flags |= LQUOTA_FLAGS_CHG_QS) + +#define QDATA_CLR_GRP(qdata) ((qdata)->qd_flags &= ~LQUOTA_FLAGS_GRP) +#define QDATA_CLR_CHANGE_QS(qdata) ((qdata)->qd_flags &= ~LQUOTA_FLAGS_CHG_QS) extern void lustre_swab_qdata(struct qunit_data *d); -extern void lustre_swab_qdata_old(struct qunit_data_old *d); -extern struct qunit_data *lustre_quota_old_to_new(struct qunit_data_old *d); -extern struct qunit_data_old *lustre_quota_new_to_old(struct qunit_data *d); 
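For reference, a short sketch of how the QDATA_* helpers above compose a block-quota acquisition for a group id; it is not part of the patch and the id and count values are invented:

static void qunit_data_sketch(void)
{
        struct qunit_data qdata = { 0 };

        qdata.qd_id = 500;           /* gid 500 (invented) */
        QDATA_SET_GRP(&qdata);       /* group rather than user quota */
        QDATA_SET_BLK(&qdata);       /* block rather than inode units */
        QDATA_SET_CHANGE_QS(&qdata); /* peer negotiated OBD_CONNECT_CHANGE_QS */
        qdata.qd_count = 4 << 20;    /* acquire 4MB worth of blocks */

        LASSERT(QDATA_IS_GRP(&qdata) && QDATA_IS_BLK(&qdata));
        LASSERT(!QDATA_IS_ADJBLK(&qdata) && !QDATA_IS_ADJINO(&qdata));
}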
+extern int quota_get_qdata(void *req, struct qunit_data *qdata, + int is_req, int is_exp); +extern int quota_copy_qdata(void *request, struct qunit_data *qdata, + int is_req, int is_exp); typedef enum { - QUOTA_DQACQ = 601, - QUOTA_DQREL = 602, + QUOTA_DQACQ = 901, + QUOTA_DQREL = 902, + QUOTA_LAST_OPC } quota_cmd_t; +#define QUOTA_FIRST_OPC QUOTA_DQACQ #define JOIN_FILE_ALIGN 4096 -/** security opcodes */ +#define QUOTA_REQUEST 1 +#define QUOTA_REPLY 0 +#define QUOTA_EXPORT 1 +#define QUOTA_IMPORT 0 + +/* quota check function */ +#define QUOTA_RET_OK 0 /**< return successfully */ +#define QUOTA_RET_NOQUOTA 1 /**< quota not supported */ +#define QUOTA_RET_NOLIMIT 2 /**< quota limit isn't set */ +#define QUOTA_RET_ACQUOTA 4 /**< need to acquire extra quota */ +#define QUOTA_RET_INC_PENDING 8 /**< pending value is increased */ + +/* security opcodes */ typedef enum { SEC_CTX_INIT = 801, SEC_CTX_INIT_CONT = 802, @@ -2279,15 +2440,15 @@ typedef enum { /* NB take care when changing the sequence of elements in this struct, * because the offset info is used in find_capa() */ struct lustre_capa { - struct lu_fid lc_fid; /* fid */ - __u64 lc_opc; /* operations allowed */ - __u32 lc_uid; /* uid, it is obsolete, but maybe used in - * future, reserve it for 64-bits aligned.*/ - __u32 lc_flags; /* HMAC algorithm & flags */ - __u32 lc_keyid; /* key used for the capability */ - __u32 lc_timeout; /* capa timeout value (sec) */ - __u64 lc_expiry; /* expiry time (sec) */ - __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /* HMAC */ + struct lu_fid lc_fid; /** fid */ + __u64 lc_opc; /** operations allowed */ + __u64 lc_uid; /** file owner */ + __u64 lc_gid; /** file group */ + __u32 lc_flags; /** HMAC algorithm & flags */ + __u32 lc_keyid; /** key# used for the capability */ + __u32 lc_timeout; /** capa timeout value (sec) */ + __u32 lc_expiry; /** expiry time (sec) */ + __u8 lc_hmac[CAPA_HMAC_MAX_LEN]; /** HMAC */ } __attribute__((packed)); extern void lustre_swab_lustre_capa(struct lustre_capa *c); @@ -2302,9 +2463,9 @@ enum { CAPA_OPC_OSS_WRITE = 1<<5, /**< write oss object data */ CAPA_OPC_OSS_READ = 1<<6, /**< read oss object data */ CAPA_OPC_OSS_TRUNC = 1<<7, /**< truncate oss object */ - CAPA_OPC_META_WRITE = 1<<8, /**< write object meta data */ - CAPA_OPC_META_READ = 1<<9, /**< read object meta data */ - + CAPA_OPC_OSS_DESTROY = 1<<8, /**< destroy oss object */ + CAPA_OPC_META_WRITE = 1<<9, /**< write object meta data */ + CAPA_OPC_META_READ = 1<<10, /**< read object meta data */ }; #define CAPA_OPC_OSS_RW (CAPA_OPC_OSS_READ | CAPA_OPC_OSS_WRITE) @@ -2312,7 +2473,8 @@ enum { (CAPA_OPC_BODY_WRITE | CAPA_OPC_BODY_READ | CAPA_OPC_INDEX_LOOKUP | \ CAPA_OPC_INDEX_INSERT | CAPA_OPC_INDEX_DELETE) #define CAPA_OPC_OSS_ONLY \ - (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC) + (CAPA_OPC_OSS_WRITE | CAPA_OPC_OSS_READ | CAPA_OPC_OSS_TRUNC | \ + CAPA_OPC_OSS_DESTROY) #define CAPA_OPC_MDS_DEFAULT ~CAPA_OPC_OSS_ONLY #define CAPA_OPC_OSS_DEFAULT ~(CAPA_OPC_MDS_ONLY | CAPA_OPC_OSS_ONLY) @@ -2349,11 +2511,6 @@ struct lustre_capa_key { extern void lustre_swab_lustre_capa_key(struct lustre_capa_key *k); -/* quota check function */ -#define QUOTA_RET_OK 0 /**< return successfully */ -#define QUOTA_RET_NOQUOTA 1 /**< not support quota */ -#define QUOTA_RET_NOLIMIT 2 /**< quota limit isn't set */ -#define QUOTA_RET_ACQUOTA 3 /**< need to acquire extra quota */ #endif /** @} lustreidl */ diff --git a/lustre/include/lustre/lustre_user.h b/lustre/include/lustre/lustre_user.h index bd76396..12a0f0e 100644 --- 
a/lustre/include/lustre/lustre_user.h +++ b/lustre/include/lustre/lustre_user.h @@ -98,6 +98,8 @@ struct obd_statfs; #define LL_IOC_FLUSHCTX _IOW ('f', 166, long) #define LL_IOC_RMTACL _IOW ('f', 167, long) +#define LL_IOC_GETOBDCOUNT _IOR ('f', 168, long) + #define LL_IOC_LLOOP_ATTACH _IOWR('f', 169, long) #define LL_IOC_LLOOP_DETACH _IOWR('f', 170, long) #define LL_IOC_LLOOP_INFO _IOWR('f', 171, long) @@ -228,17 +230,19 @@ static inline char *obd_uuid2str(struct obd_uuid *uuid) return (char *)(uuid->uuid); } -#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ -#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ -#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ -#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ -#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ -#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* these must be explicitly translated into linux Q_* in ll_dir_ioctl */ +#define LUSTRE_Q_QUOTAON 0x800002 /* turn quotas on */ +#define LUSTRE_Q_QUOTAOFF 0x800003 /* turn quotas off */ +#define LUSTRE_Q_GETINFO 0x800005 /* get information about quota files */ +#define LUSTRE_Q_SETINFO 0x800006 /* set information about quota files */ +#define LUSTRE_Q_GETQUOTA 0x800007 /* get user quota structure */ +#define LUSTRE_Q_SETQUOTA 0x800008 /* set user quota structure */ +/* lustre-specific control commands */ +#define LUSTRE_Q_INVALIDATE 0x80000b /* invalidate quota data */ +#define LUSTRE_Q_FINVALIDATE 0x80000c /* invalidate filter quota data */ #define UGQUOTA 2 /* set both USRQUOTA and GRPQUOTA */ -#define QFMT_LDISKFS 2 /* QFMT_VFS_V0(2), quota format for ldiskfs */ - struct if_quotacheck { char obd_type[16]; struct obd_uuid obd_uuid; @@ -306,6 +310,10 @@ enum { #endif /* !__KERNEL__ */ +typedef enum lustre_quota_version { + LUSTRE_QUOTA_V2 = 1 +} lustre_quota_version_t; + /* XXX: same as if_dqinfo struct in kernel */ struct obd_dqinfo { __u64 dqi_bgrace; @@ -328,11 +336,20 @@ struct obd_dqblk { __u32 padding; }; +enum { + QC_GENERAL = 0, + QC_MDTIDX = 1, + QC_OSTIDX = 2, + QC_UUID = 3 +}; + struct if_quotactl { __u32 qc_cmd; __u32 qc_type; __u32 qc_id; __u32 qc_stat; + __u32 qc_valid; + __u32 qc_idx; struct obd_dqinfo qc_dqinfo; struct obd_dqblk qc_dqblk; char obd_type[16]; diff --git a/lustre/include/lustre_cache.h b/lustre/include/lustre_cache.h deleted file mode 100644 index 5bff0a2..0000000 --- a/lustre/include/lustre_cache.h +++ /dev/null @@ -1,87 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). 
- * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - */ - -#ifndef LUSTRE_CACHE_H -#define LUSTRE_CACHE_H -#include -#include -#include - -struct lustre_cache; -struct osc_async_page; -struct page_removal_cb_element { - struct list_head prce_list; - obd_page_removal_cb_t prce_callback; - atomic_t prce_refcnt; -}; - -typedef int (*cache_iterate_extents_cb_t)(struct lustre_cache *, - struct lustre_handle *, - struct osc_async_page *, - void *); -typedef int (*cache_iterate_locks_cb_t)(struct lustre_cache *, - struct ldlm_res_id *, - struct lustre_handle *, void *); - -struct lustre_cache { - struct list_head lc_locks_list; - spinlock_t lc_locks_list_lock; - struct list_head lc_page_removal_callback_list; - rwlock_t lc_page_removal_cb_lock; /* iterate vs modify list */ - struct obd_device *lc_obd; - obd_pin_extent_cb lc_pin_extent_cb; -}; - -int cache_add_lock(struct lustre_cache *cache, struct lustre_handle *lockh); -int cache_add_extent(struct lustre_cache *cache, struct ldlm_res_id *res, - struct osc_async_page *extent, - struct lustre_handle *lockh); -void cache_remove_extent(struct lustre_cache *, struct osc_async_page *); -int cache_add_extent_removal_cb(struct lustre_cache *cache, - obd_page_removal_cb_t func_cb, - obd_pin_extent_cb pin_cb); -int cache_del_extent_removal_cb(struct lustre_cache *cache, - obd_page_removal_cb_t func_cb); -int cache_iterate_extents(struct lustre_cache *cache, struct lustre_handle *lockh, - cache_iterate_extents_cb_t cb_func, void *data); -int cache_remove_lock(struct lustre_cache *cache, struct lustre_handle *lockh); -int cache_iterate_locks(struct lustre_cache *cache, struct ldlm_res_id *res, - cache_iterate_locks_cb_t cb_fun, void *data); -struct lustre_cache *cache_create(struct obd_device *obd); -int cache_destroy(struct lustre_cache *cache); - - -#endif /* LUSTRE_CACHE_H */ diff --git a/lustre/include/lustre_capa.h b/lustre/include/lustre_capa.h index 1fb6a7d..7f65a44 100644 --- a/lustre/include/lustre_capa.h +++ b/lustre/include/lustre_capa.h @@ -95,29 +95,24 @@ enum { CAPA_SITE_MAX }; -static inline __u64 capa_opc(struct lustre_capa *capa) -{ - return capa->lc_opc; -} - -static inline __u32 capa_uid(struct lustre_capa *capa) +static inline struct lu_fid *capa_fid(struct lustre_capa *capa) { - return capa->lc_uid; + return &capa->lc_fid; } -static inline struct lu_fid *capa_fid(struct lustre_capa *capa) +static inline __u64 capa_opc(struct lustre_capa *capa) { - return &capa->lc_fid; + return capa->lc_opc; } -static inline __u32 capa_keyid(struct lustre_capa *capa) +static inline __u64 capa_uid(struct lustre_capa *capa) { - return capa->lc_keyid; + return capa->lc_uid; } -static inline __u64 capa_expiry(struct lustre_capa *capa) +static inline __u64 capa_gid(struct lustre_capa *capa) { - return capa->lc_expiry; + return capa->lc_gid; } static inline __u32 capa_flags(struct lustre_capa *capa) @@ -127,9 +122,12 @@ static inline __u32 capa_flags(struct lustre_capa *capa) static inline __u32 
capa_alg(struct lustre_capa *capa) { - __u32 alg = capa->lc_flags; + return (capa->lc_flags >> 24); +} - return alg >> 24; +static inline __u32 capa_keyid(struct lustre_capa *capa) +{ + return capa->lc_keyid; } static inline __u64 capa_key_mdsid(struct lustre_capa_key *key) @@ -142,12 +140,23 @@ static inline __u32 capa_key_keyid(struct lustre_capa_key *key) return key->lk_keyid; } +static inline __u32 capa_timeout(struct lustre_capa *capa) +{ + return capa->lc_timeout; +} + +static inline __u32 capa_expiry(struct lustre_capa *capa) +{ + return capa->lc_expiry; +} + #define DEBUG_CAPA(level, c, fmt, args...) \ do { \ -CDEBUG(level, fmt " capability@%p uid %u opc "LPX64" fid "DFID" keyid %u " \ - "expiry "LPU64" flags %u alg %d\n", \ - ##args, c, capa_uid(c), capa_opc(c), PFID(capa_fid(c)), capa_keyid(c), \ - capa_expiry(c), capa_flags(c), capa_alg(c)); \ +CDEBUG(level, fmt " capability@%p fid "DFID" opc "LPX64" uid "LPU64" gid " \ + LPU64" flags %u alg %d keyid %u timeout %u expiry %u\n", \ + ##args, c, PFID(capa_fid(c)), capa_opc(c), capa_uid(c), capa_gid(c), \ + capa_flags(c), capa_alg(c), capa_keyid(c), capa_timeout(c), \ + capa_expiry(c)); \ } while (0) #define DEBUG_CAPA_KEY(level, k, fmt, args...) \ @@ -172,38 +181,33 @@ struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, int alive); int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key); +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen); void capa_cpy(void *dst, struct obd_capa *ocapa); - -char *dump_capa_content(char *buf, char *key, int len); - static inline struct obd_capa *alloc_capa(int site) { #ifdef __KERNEL__ struct obd_capa *ocapa; + if (unlikely(site != CAPA_SITE_CLIENT && site != CAPA_SITE_SERVER)) + return ERR_PTR(-EINVAL); + OBD_SLAB_ALLOC(ocapa, capa_cachep, GFP_KERNEL, sizeof(*ocapa)); - if (ocapa) { - atomic_set(&ocapa->c_refc, 0); - spin_lock_init(&ocapa->c_lock); - CFS_INIT_LIST_HEAD(&ocapa->c_list); - ocapa->c_site = site; - } - return ocapa; -#else - return NULL; -#endif -} + if (unlikely(!ocapa)) + return ERR_PTR(-ENOMEM); + + CFS_INIT_LIST_HEAD(&ocapa->c_list); + atomic_set(&ocapa->c_refc, 1); + spin_lock_init(&ocapa->c_lock); + ocapa->c_site = site; + if (ocapa->c_site == CAPA_SITE_CLIENT) + CFS_INIT_LIST_HEAD(&ocapa->u.cli.lli_list); + else + CFS_INIT_HLIST_NODE(&ocapa->u.tgt.c_hash); -static inline void free_capa(struct obd_capa *ocapa) -{ -#ifdef __KERNEL__ - if (atomic_read(&ocapa->c_refc)) { - DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc %d for", - atomic_read(&ocapa->c_refc)); - LBUG(); - } - OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa)); + return ocapa; #else + return ERR_PTR(-EOPNOTSUPP); #endif } @@ -225,7 +229,19 @@ static inline void capa_put(struct obd_capa *ocapa) DEBUG_CAPA(D_ERROR, &ocapa->c_capa, "refc is 0 for"); LBUG(); } - atomic_dec(&ocapa->c_refc); + + if (atomic_dec_and_test(&ocapa->c_refc)) { + LASSERT(list_empty(&ocapa->c_list)); + if (ocapa->c_site == CAPA_SITE_CLIENT) { + LASSERT(list_empty(&ocapa->u.cli.lli_list)); + } else { + struct hlist_node *hnode; + + hnode = &ocapa->u.tgt.c_hash; + LASSERT(!hnode->next && !hnode->pprev); + } + OBD_SLAB_FREE(ocapa, capa_cachep, sizeof(*ocapa)); + } } static inline int open_flags_to_accmode(int flags) @@ -253,6 +269,11 @@ static inline void set_capa_expiry(struct obd_capa *ocapa) cfs_time_seconds(expiry)); } +static inline int capa_is_expired_sec(struct lustre_capa *capa) +{ + return (capa->lc_expiry - cfs_time_current_sec() <= 0); 
+} + static inline int capa_is_expired(struct obd_capa *ocapa) { return cfs_time_beforeq(ocapa->c_expiry, cfs_time_current()); @@ -284,5 +305,11 @@ struct filter_capa_key { struct lustre_capa_key k_key; }; +enum { + LC_ID_NONE = 0, + LC_ID_PLAIN = 1, + LC_ID_CONVERT = 2 +}; + #define BYPASS_CAPA (struct lustre_capa *)ERR_PTR(-ENOENT) #endif /* __LINUX_CAPA_H_ */ diff --git a/lustre/include/lustre_disk.h b/lustre/include/lustre_disk.h index 0e253d0..5049532 100644 --- a/lustre/include/lustre_disk.h +++ b/lustre/include/lustre_disk.h @@ -50,9 +50,10 @@ #define MDT_LOGS_DIR "LOGS" /* COMPAT_146 */ #define MOUNT_CONFIGS_DIR "CONFIGS" -/* Persistent mount data are stored on the disk in this file. */ -#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/mountdata" -#define LAST_RCVD "last_received" +#define CONFIGS_FILE "mountdata" +/** Persistent mount data are stored on the disk in this file. */ +#define MOUNT_DATA_FILE MOUNT_CONFIGS_DIR"/"CONFIGS_FILE +#define LAST_RCVD "last_rcvd" #define LOV_OBJID "lov_objid" #define HEALTH_CHECK "health_check" #define CAPA_KEYS "capa_keys" @@ -62,13 +63,23 @@ #define LDD_F_SV_TYPE_MDT 0x0001 #define LDD_F_SV_TYPE_OST 0x0002 #define LDD_F_SV_TYPE_MGS 0x0004 -#define LDD_F_NEED_INDEX 0x0010 /* need an index assignment */ -#define LDD_F_VIRGIN 0x0020 /* never registered */ -#define LDD_F_UPDATE 0x0040 /* update the config logs for this server*/ -#define LDD_F_REWRITE_LDD 0x0080 /* rewrite the LDD */ -#define LDD_F_WRITECONF 0x0100 /* regenerate all logs for this fs */ -#define LDD_F_UPGRADE14 0x0200 /* COMPAT_14 */ -#define LDD_F_PARAM 0x0400 /* process as lctl conf_param */ +#define LDD_F_SV_ALL 0x0008 +/** need an index assignment */ +#define LDD_F_NEED_INDEX 0x0010 +/** never registered */ +#define LDD_F_VIRGIN 0x0020 +/** update the config logs for this server*/ +#define LDD_F_UPDATE 0x0040 +/** rewrite the LDD */ +#define LDD_F_REWRITE_LDD 0x0080 +/** regenerate all logs for this fs */ +#define LDD_F_WRITECONF 0x0100 +/** COMPAT_14 */ +#define LDD_F_UPGRADE14 0x0200 +/** process as lctl conf_param */ +#define LDD_F_PARAM 0x0400 +/** backend fs make use of IAM directory format. */ +#define LDD_F_IAM_DIR 0x0800 enum ldd_mount_type { LDD_MT_EXT3 = 0, @@ -128,9 +139,10 @@ static inline int server_make_name(__u32 flags, __u16 index, char *fs, char *name) { if (flags & (LDD_F_SV_TYPE_MDT | LDD_F_SV_TYPE_OST)) { - sprintf(name, "%.8s-%s%04x", fs, - (flags & LDD_F_SV_TYPE_MDT) ? "MDT" : "OST", - index); + if (!(flags & LDD_F_SV_ALL)) + sprintf(name, "%.8s-%s%04x", fs, + (flags & LDD_F_SV_TYPE_MDT) ? 
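[The new LDD_F_SV_ALL bit suppresses per-target name generation, apparently acting as an "all servers" wildcard; for ordinary MDT/OST/MGS flags the output format is unchanged. A small usage sketch with illustrative values:

    char name[64];

    server_make_name(LDD_F_SV_TYPE_MDT, 3, "lustre", name);
    /* name == "lustre-MDT0003"; the fs label is truncated to 8 chars (%.8s) */

    server_make_name(LDD_F_SV_TYPE_MGS, 0, "lustre", name);
    /* name == "MGS"; the index is ignored for the MGS */
]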
"MDT" : "OST", + index); } else if (flags & LDD_F_SV_TYPE_MGS) { sprintf(name, "MGS"); } else { @@ -159,6 +171,7 @@ struct lustre_mount_data { int lmd_exclude_count; char *lmd_dev; /* device name */ char *lmd_profile; /* client only */ + char *lmd_mgssec; /* sptlrpc flavor to mgs */ char *lmd_opts; /* lustre mount options (as opposed to _device_ mount options) */ __u32 *lmd_exclude; /* array of OSTs to ignore */ @@ -196,17 +209,28 @@ struct lustre_mount_data { #define LR_MAX_CLIENTS (CFS_PAGE_SIZE * 8) #endif -/* COMPAT_146 */ -#define OBD_COMPAT_OST 0x00000002 /* this is an OST (temporary) */ -#define OBD_COMPAT_MDT 0x00000004 /* this is an MDT (temporary) */ -/* end COMPAT_146 */ - -#define OBD_ROCOMPAT_LOVOBJID 0x00000001 /* MDS handles LOV_OBJID file */ - -#define OBD_INCOMPAT_GROUPS 0x00000001 /* OST handles group subdirs */ -#define OBD_INCOMPAT_OST 0x00000002 /* this is an OST */ -#define OBD_INCOMPAT_MDT 0x00000004 /* this is an MDT */ -#define OBD_INCOMPAT_COMMON_LR 0x00000008 /* common last_rvcd format */ +/** COMPAT_146: this is an OST (temporary) */ +#define OBD_COMPAT_OST 0x00000002 +/** COMPAT_146: this is an MDT (temporary) */ +#define OBD_COMPAT_MDT 0x00000004 + +/** MDS handles LOV_OBJID file */ +#define OBD_ROCOMPAT_LOVOBJID 0x00000001 + +/** OST handles group subdirs */ +#define OBD_INCOMPAT_GROUPS 0x00000001 +/** this is an OST */ +#define OBD_INCOMPAT_OST 0x00000002 +/** this is an MDT */ +#define OBD_INCOMPAT_MDT 0x00000004 +/** common last_rvcd format */ +#define OBD_INCOMPAT_COMMON_LR 0x00000008 +/** FID is enabled */ +#define OBD_INCOMPAT_FID 0x00000010 +/** + * lustre disk using iam format to store directory entries + */ +#define OBD_INCOMPAT_IAM_DIR 0x00000020 /* Data stored per server at the head of the last_rcvd file. In le32 order. diff --git a/lustre/include/lustre_dlm.h b/lustre/include/lustre_dlm.h index 368ee21..3eac378 100644 --- a/lustre/include/lustre_dlm.h +++ b/lustre/include/lustre_dlm.h @@ -201,6 +201,7 @@ typedef enum { #define LCK_COMPAT_CR (LCK_COMPAT_CW | LCK_PR | LCK_PW) #define LCK_COMPAT_NL (LCK_COMPAT_CR | LCK_EX | LCK_GROUP) #define LCK_COMPAT_GROUP (LCK_GROUP | LCK_NL) +#define LCK_COMPAT_COS (LCK_COS) extern ldlm_mode_t lck_compat_array[]; @@ -260,13 +261,13 @@ struct ldlm_pool_ops { int (*po_setup)(struct ldlm_pool *pl, int limit); }; -/** - * One second for pools thread check interval. Each pool has own period. +/** + * One second for pools thread check interval. Each pool has own period. */ #define LDLM_POOLS_THREAD_PERIOD (1) -/** - * 5% margin for modest pools. See ldlm_pool.c for details. +/** + * 5% margin for modest pools. See ldlm_pool.c for details. */ #define LDLM_POOLS_MODEST_MARGIN (5) @@ -431,7 +432,7 @@ struct ldlm_namespace { unsigned int ns_max_unused; unsigned int ns_max_age; - + unsigned int ns_timeouts; /** * Seconds. */ @@ -545,7 +546,7 @@ struct ldlm_interval_tree { }; struct ldlm_lock { - /** + /** * Must be first in the structure. */ struct portals_handle l_handle; @@ -553,34 +554,34 @@ struct ldlm_lock { * Lock reference count. */ atomic_t l_refc; - /** + /** * Internal spinlock protects l_resource. we should hold this lock * first before grabbing res_lock. */ spinlock_t l_lock; - /** - * ldlm_lock_change_resource() can change this. + /** + * ldlm_lock_change_resource() can change this. */ struct ldlm_resource *l_resource; - /** + /** * Protected by ns_hash_lock. List item for client side lru list. */ struct list_head l_lru; - /** - * Protected by lr_lock, linkage to resource's lock queues. 
+ /** + * Protected by lr_lock, linkage to resource's lock queues. */ struct list_head l_res_link; - /** - * Tree node for ldlm_extent. + /** + * Tree node for ldlm_extent. */ struct ldlm_interval *l_tree_node; - /** + /** * Protected by per-bucket exp->exp_lock_hash locks. Per export hash * of locks. */ struct hlist_node l_exp_hash; - /** - * Protected by lr_lock. Requested mode. + /** + * Protected by lr_lock. Requested mode. */ ldlm_mode_t l_req_mode; /** @@ -632,27 +633,27 @@ struct ldlm_lock { */ __u8 l_destroyed; - /** + /** * If the lock is granted, a process sleeps on this waitq to learn when * it's no longer in use. If the lock is not granted, a process sleeps - * on this waitq to learn when it becomes granted. + * on this waitq to learn when it becomes granted. */ cfs_waitq_t l_waitq; struct timeval l_enqueued_time; /** - * Jiffies. Should be converted to time if needed. + * Jiffies. Should be converted to time if needed. */ cfs_time_t l_last_used; struct ldlm_extent l_req_extent; - /* - * Client-side-only members. + /* + * Client-side-only members. */ - - /** + + /** * Temporary storage for an LVB received during an enqueue operation. */ __u32 l_lvb_len; @@ -665,40 +666,43 @@ struct ldlm_lock { struct list_head l_cache_locks_list; - /* - * Server-side-only members. + /* + * Server-side-only members. */ - /** + /** connection cookie for the client originated the operation. */ + __u64 l_client_cookie; + + /** * Protected by elt_lock. Callbacks pending. */ struct list_head l_pending_chain; cfs_time_t l_callback_timeout; - /** - * Pid which created this lock. + /** + * Pid which created this lock. */ __u32 l_pid; - /** - * For ldlm_add_ast_work_item(). + /** + * For ldlm_add_ast_work_item(). */ struct list_head l_bl_ast; - /** - * For ldlm_add_ast_work_item(). + /** + * For ldlm_add_ast_work_item(). */ struct list_head l_cp_ast; - /** - * For ldlm_add_ast_work_item(). + /** + * For ldlm_add_ast_work_item(). */ struct list_head l_rk_ast; struct ldlm_lock *l_blocking_lock; int l_bl_ast_run; - /** - * Protected by lr_lock, linkages to "skip lists". + /** + * Protected by lr_lock, linkages to "skip lists". 
*/ struct list_head l_sl_mode; struct list_head l_sl_policy; @@ -867,7 +871,6 @@ void ldlm_lock2handle(const struct ldlm_lock *lock, struct lustre_handle *lockh); struct ldlm_lock *__ldlm_handle2lock(const struct lustre_handle *, int flags); void ldlm_cancel_callback(struct ldlm_lock *); -int ldlm_lock_set_data(struct lustre_handle *, void *data); int ldlm_lock_remove_from_lru(struct ldlm_lock *); static inline struct ldlm_lock *ldlm_handle2lock(const struct lustre_handle *h) @@ -955,14 +958,13 @@ int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_decref(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_decref_and_cancel(struct lustre_handle *lockh, __u32 mode); void ldlm_lock_allow_match(struct ldlm_lock *lock); -int ldlm_lock_fast_match(struct ldlm_lock *, int, obd_off, obd_off, void **); -void ldlm_lock_fast_release(void *, int); ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags, const struct ldlm_res_id *, ldlm_type_t type, ldlm_policy_data_t *, ldlm_mode_t mode, - struct lustre_handle *); + struct lustre_handle *, int unref); struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, __u32 *flags); +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode); void ldlm_lock_cancel(struct ldlm_lock *lock); void ldlm_cancel_locks_for_export(struct obd_export *export); void ldlm_reprocess_all(struct ldlm_resource *res); @@ -1011,7 +1013,7 @@ int ldlm_lock_change_resource(struct ldlm_namespace *, struct ldlm_lock *, const struct ldlm_res_id *); #define LDLM_RESOURCE_ADDREF(res) do { \ - lu_ref_add(&(res)->lr_reference, __FUNCTION__, cfs_current()); \ + lu_ref_add_atomic(&(res)->lr_reference, __FUNCTION__, cfs_current()); \ } while (0) #define LDLM_RESOURCE_DELREF(res) do { \ @@ -1027,6 +1029,7 @@ struct ldlm_callback_suite { /* ldlm_request.c */ int ldlm_expired_completion_wait(void *data); +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock); int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag); int ldlm_glimpse_ast(struct ldlm_lock *lock, void *reqp); @@ -1062,6 +1065,7 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, ldlm_completion_callback completion, ldlm_glimpse_callback glimpse, void *data, __u32 lvb_len, void *lvb_swabber, + const __u64 *client_cookie, struct lustre_handle *lockh); int ldlm_server_ast(struct lustre_handle *lockh, struct ldlm_lock_desc *new, void *data, __u32 data_len); diff --git a/lustre/include/lustre_export.h b/lustre/include/lustre_export.h index 4851fbc..94033ef 100644 --- a/lustre/include/lustre_export.h +++ b/lustre/include/lustre_export.h @@ -56,7 +56,6 @@ struct mdt_export_data { __u64 med_ibits_known; loff_t med_lr_off; int med_lr_idx; - unsigned int med_rmtclient:1; /* remote client? */ struct semaphore med_idmap_sem; struct lustre_idmap_table *med_idmap; }; @@ -149,6 +148,7 @@ struct obd_export { exp_flvr_changed:1, exp_flvr_adapt:1, exp_libclient:1; /* liblustre client? 
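[ldlm_lock_match() grows a trailing unref argument here. A hedged sketch of a caller adapting to the new signature, with ns, flags, res_id and policy assumed in scope; passing 0 is assumed to preserve the historical matching behaviour, a non-zero value letting the match return locks it would otherwise skip:

    struct lustre_handle lockh;
    ldlm_mode_t mode;

    mode = ldlm_lock_match(ns, flags, &res_id, LDLM_EXTENT,
                           &policy, LCK_PR | LCK_PW, &lockh, 0 /* unref */);
    if (mode != 0) {
            /* matched: the caller now holds a mode reference on the lock */
    }
]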
*/ + struct list_head exp_queued_rpc; /* RPC to be handled */ /* also protected by exp_lock */ enum lustre_sec_part exp_sp_peer; struct sptlrpc_flavor exp_flvr; /* current */ @@ -178,6 +178,20 @@ static inline int exp_connect_lru_resize(struct obd_export *exp) return !!(exp->exp_connect_flags & OBD_CONNECT_LRU_RESIZE); } +static inline int exp_connect_rmtclient(struct obd_export *exp) +{ + LASSERT(exp != NULL); + return !!(exp->exp_connect_flags & OBD_CONNECT_RMT_CLIENT); +} + +static inline int client_is_remote(struct obd_export *exp) +{ + struct obd_import *imp = class_exp2cliimp(exp); + + return !!(imp->imp_connect_data.ocd_connect_flags & + OBD_CONNECT_RMT_CLIENT); +} + static inline int imp_connect_lru_resize(struct obd_import *imp) { struct obd_connect_data *ocd; diff --git a/lustre/include/lustre_fid.h b/lustre/include/lustre_fid.h index 7133abd..7c8085f 100644 --- a/lustre/include/lustre_fid.h +++ b/lustre/include/lustre_fid.h @@ -54,8 +54,8 @@ struct lu_site; struct lu_context; /* Whole sequences space range and zero range definitions */ -extern const struct lu_range LUSTRE_SEQ_SPACE_RANGE; -extern const struct lu_range LUSTRE_SEQ_ZERO_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_SPACE_RANGE; +extern const struct lu_seq_range LUSTRE_SEQ_ZERO_RANGE; extern const struct lu_fid LUSTRE_BFL_FID; enum { @@ -63,7 +63,7 @@ enum { * This is how may FIDs may be allocated in one sequence. 16384 for * now. */ - LUSTRE_SEQ_MAX_WIDTH = 0x0000000000004000ULL, + LUSTRE_SEQ_MAX_WIDTH = 0x0000000000000400ULL, /* * How many sequences may be allocate for meta-sequence (this is 128 @@ -79,6 +79,38 @@ enum { LUSTRE_SEQ_SUPER_WIDTH = (LUSTRE_SEQ_META_WIDTH * LUSTRE_SEQ_META_WIDTH) }; +/** special fid seq: used for local object create. */ +#define FID_SEQ_LOCAL_FILE (FID_SEQ_START + 1) + +/** special OID for local objects */ +enum { + /** \see osd_oi_index_create */ + OSD_OI_FID_SMALL_OID = 1UL, + OSD_OI_FID_OTHER_OID = 2UL, + /** \see fld_mod_init */ + FLD_INDEX_OID = 3UL, + /** \see fid_mod_init */ + FID_SEQ_CTL_OID = 4UL, + FID_SEQ_SRV_OID = 5UL, + /** \see mdd_mod_init */ + MDD_ROOT_INDEX_OID = 6UL, + MDD_ORPHAN_OID = 7UL, + MDD_LOV_OBJ_OID = 8UL, + MDD_CAPA_KEYS_OID = 9UL, + MDD_OBJECTS_OID = 10UL, + /** \see mdt_mod_init */ + MDT_LAST_RECV_OID = 11UL, + /** \see osd_mod_init */ + OSD_REM_OBJ_DIR_OID = 12UL, +}; + +static inline void lu_local_obj_fid(struct lu_fid *fid, __u32 oid) +{ + fid->f_seq = FID_SEQ_LOCAL_FILE; + fid->f_oid = oid; + fid->f_ver = 0; +} + enum lu_mgr_type { LUSTRE_SEQ_SERVER, LUSTRE_SEQ_CONTROLLER @@ -102,7 +134,7 @@ struct lu_client_seq { * clients, this contains meta-sequence range. And for servers this * contains super-sequence range. */ - struct lu_range lcs_space; + struct lu_seq_range lcs_space; /* Seq related proc */ cfs_proc_dir_entry_t *lcs_proc_dir; @@ -132,7 +164,7 @@ struct lu_client_seq { /* server sequence manager interface */ struct lu_server_seq { /* Available sequences space */ - struct lu_range lss_space; + struct lu_seq_range lss_space; /* * Device for server side seq manager needs (saving sequences to backing @@ -166,6 +198,11 @@ struct lu_server_seq { * LUSTRE_SEQ_SUPER_WIDTH and LUSTRE_SEQ_META_WIDTH. */ __u64 lss_width; + + /** + * Pointer to site object, required to access site fld. 
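[The reserved-OID enum plus lu_local_obj_fid() give every well-known local object a fixed FID in the new FID_SEQ_LOCAL_FILE sequence. For example, building the FID of the FLD index object:

    struct lu_fid fid;

    lu_local_obj_fid(&fid, FLD_INDEX_OID);
    /* fid is now [ FID_SEQ_LOCAL_FILE : 0x3 : 0 ]; cf. fld_mod_init above */
]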
+ */ + struct md_site *lss_site; }; int seq_query(struct com_thread_info *info); @@ -175,19 +212,20 @@ int seq_server_init(struct lu_server_seq *seq, struct dt_device *dev, const char *prefix, enum lu_mgr_type type, + struct md_site *ls, const struct lu_env *env); void seq_server_fini(struct lu_server_seq *seq, const struct lu_env *env); int seq_server_alloc_super(struct lu_server_seq *seq, - struct lu_range *in, - struct lu_range *out, + struct lu_seq_range *in, + struct lu_seq_range *out, const struct lu_env *env); int seq_server_alloc_meta(struct lu_server_seq *seq, - struct lu_range *in, - struct lu_range *out, + struct lu_seq_range *in, + struct lu_seq_range *out, const struct lu_env *env); int seq_server_set_cli(struct lu_server_seq *seq, @@ -209,7 +247,8 @@ int seq_client_alloc_fid(struct lu_client_seq *seq, struct lu_fid *fid); /* Fids common stuff */ -int fid_is_local(struct lu_site *site, const struct lu_fid *fid); +int fid_is_local(const struct lu_env *env, + struct lu_site *site, const struct lu_fid *fid); /* fid locking */ @@ -268,9 +307,32 @@ static inline __u64 fid_flatten(const struct lu_fid *fid) #define LUSTRE_SEQ_CTL_NAME "seq_ctl" /* Range common stuff */ -void range_cpu_to_le(struct lu_range *dst, const struct lu_range *src); -void range_cpu_to_be(struct lu_range *dst, const struct lu_range *src); -void range_le_to_cpu(struct lu_range *dst, const struct lu_range *src); -void range_be_to_cpu(struct lu_range *dst, const struct lu_range *src); +static inline void range_cpu_to_le(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_le64(src->lsr_start); + dst->lsr_end = cpu_to_le64(src->lsr_end); + dst->lsr_mdt = cpu_to_le32(src->lsr_mdt); +} + +static inline void range_le_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = le64_to_cpu(src->lsr_start); + dst->lsr_end = le64_to_cpu(src->lsr_end); + dst->lsr_mdt = le32_to_cpu(src->lsr_mdt); +} + +static inline void range_cpu_to_be(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = cpu_to_be64(src->lsr_start); + dst->lsr_end = cpu_to_be64(src->lsr_end); + dst->lsr_mdt = cpu_to_be32(src->lsr_mdt); +} + +static inline void range_be_to_cpu(struct lu_seq_range *dst, const struct lu_seq_range *src) +{ + dst->lsr_start = be64_to_cpu(src->lsr_start); + dst->lsr_end = be64_to_cpu(src->lsr_end); + dst->lsr_mdt = be32_to_cpu(src->lsr_mdt); +} #endif /* __LINUX_FID_H */ diff --git a/lustre/include/lustre_fld.h b/lustre/include/lustre_fld.h index a65408f..ec65b99 100644 --- a/lustre/include/lustre_fld.h +++ b/lustre/include/lustre_fld.h @@ -45,12 +45,11 @@ struct lu_client_fld; struct lu_server_fld; +struct lu_fld_hash; +struct fld_cache; -struct fld_stats { - __u64 fst_count; - __u64 fst_cache; - __u64 fst_inflight; -}; +extern const struct dt_index_features fld_index_features; +extern const char fld_index_name[]; /* * FLD (Fid Location Database) interface. 
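[The lu_seq_range byte-swapping helpers become inline here, presumably to avoid cross-module calls on the FLD lookup path. A round-trip sketch for the on-disk (little-endian) form:

    struct lu_seq_range mem = { .lsr_start = 1, .lsr_end = 1024, .lsr_mdt = 0 };
    struct lu_seq_range disk;

    range_cpu_to_le(&disk, &mem);   /* serialize for the on-disk FLD index */
    range_le_to_cpu(&mem, &disk);   /* and convert back after a read */
]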
@@ -60,7 +59,6 @@ enum { LUSTRE_CLI_FLD_HASH_RRB }; -struct lu_server_fld; struct lu_fld_target { struct list_head ft_chain; @@ -69,134 +67,101 @@ struct lu_fld_target { __u64 ft_idx; }; -typedef int -(*fld_hash_func_t) (struct lu_client_fld *, __u64); - -typedef struct lu_fld_target * -(*fld_scan_func_t) (struct lu_client_fld *, __u64); - -struct lu_fld_hash { - const char *fh_name; - fld_hash_func_t fh_hash_func; - fld_scan_func_t fh_scan_func; -}; - -struct fld_cache_entry { - struct hlist_node fce_list; - struct list_head fce_lru; - mdsno_t fce_mds; - seqno_t fce_seq; - cfs_waitq_t fce_waitq; - __u32 fce_inflight:1, - fce_invalid:1; -}; - -struct fld_cache { - /* - * Cache guard, protects fci_hash mostly because others immutable after - * init is finished. - */ - spinlock_t fci_lock; - - /* Cache shrink threshold */ - int fci_threshold; - - /* Prefered number of cached entries */ - int fci_cache_size; - - /* Current number of cached entries. Protected by @fci_lock */ - int fci_cache_count; - - /* Hash table size (number of collision lists) */ - int fci_hash_size; - - /* Hash table mask */ - int fci_hash_mask; - - /* Hash table for all collision lists */ - struct hlist_head *fci_hash_table; - - /* Lru list */ - struct list_head fci_lru; - - /* Cache statistics. */ - struct fld_stats fci_stat; - - /* Cache name used for debug and messages. */ - char fci_name[80]; -}; - struct lu_server_fld { - /* Fld dir proc entry. */ + /** + * Fld dir proc entry. */ cfs_proc_dir_entry_t *lsf_proc_dir; - /* /fld file object device */ + /** + * /fld file object device */ struct dt_object *lsf_obj; - /* Client FLD cache. */ + /** + * super sequence controller export, needed to forward fld + * lookup request. */ + struct obd_export *lsf_control_exp; + + /** + * Client FLD cache. */ struct fld_cache *lsf_cache; - /* Protect index modifications */ - struct semaphore lsf_sem; + /** + * Protect index modifications */ + struct mutex lsf_lock; - /* Fld service name in form "fld-srv-lustre-MDTXXX" */ + /** + * Fld service name in form "fld-srv-lustre-MDTXXX" */ char lsf_name[80]; }; -enum { - LUSTRE_FLD_INIT = 1 << 0, - LUSTRE_FLD_RUN = 1 << 1 -}; - struct lu_client_fld { - /* Client side proc entry. */ + /** + * Client side proc entry. */ cfs_proc_dir_entry_t *lcf_proc_dir; - /* List of exports client FLD knows about. */ + /** + * List of exports client FLD knows about. */ struct list_head lcf_targets; - /* Current hash to be used to chose an export. */ + /** + * Current hash to be used to chose an export. */ struct lu_fld_hash *lcf_hash; - /* Exports count. */ + /** + * Exports count. */ int lcf_count; - /* Lock protecting exports list and fld_hash. */ + /** + * Lock protecting exports list and fld_hash. */ spinlock_t lcf_lock; - /* Client FLD cache. */ + /** + * Client FLD cache. */ struct fld_cache *lcf_cache; - /* Client fld proc entry name. */ + /** + * Client fld proc entry name. */ char lcf_name[80]; const struct lu_context *lcf_ctx; - + int lcf_flags; }; +/** + * number of blocks to reserve for particular operations. Should be function of + * ... something. Stub for now. 
+ */ +enum { + /* one insert operation can involve two delete and one insert */ + FLD_TXN_INDEX_INSERT_CREDITS = 60, + FLD_TXN_INDEX_DELETE_CREDITS = 20, +}; + int fld_query(struct com_thread_info *info); /* Server methods */ int fld_server_init(struct lu_server_fld *fld, struct dt_device *dt, const char *prefix, - const struct lu_env *env); + const struct lu_env *env, + int mds_node_id); void fld_server_fini(struct lu_server_fld *fld, const struct lu_env *env); int fld_server_create(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq, mdsno_t mds); + struct lu_seq_range *add_range, + struct thandle *th); int fld_server_delete(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq); + struct lu_seq_range *range); int fld_server_lookup(struct lu_server_fld *fld, const struct lu_env *env, - seqno_t seq, mdsno_t *mds); + seqno_t seq, struct lu_seq_range *range); /* Client methods */ int fld_client_init(struct lu_client_fld *fld, @@ -211,7 +176,7 @@ int fld_client_lookup(struct lu_client_fld *fld, const struct lu_env *env); int fld_client_create(struct lu_client_fld *fld, - seqno_t seq, mdsno_t mds, + struct lu_seq_range *range, const struct lu_env *env); int fld_client_delete(struct lu_client_fld *fld, @@ -224,27 +189,4 @@ int fld_client_add_target(struct lu_client_fld *fld, int fld_client_del_target(struct lu_client_fld *fld, __u64 idx); -/* Cache methods */ -struct fld_cache *fld_cache_init(const char *name, - int hash_size, - int cache_size, - int cache_threshold); - -void fld_cache_fini(struct fld_cache *cache); - -void fld_cache_flush(struct fld_cache *cache); - -int fld_cache_insert(struct fld_cache *cache, - seqno_t seq, mdsno_t mds); - -int fld_cache_insert_inflight(struct fld_cache *cache, - seqno_t seq); - -void fld_cache_delete(struct fld_cache *cache, - seqno_t seq); - -int -fld_cache_lookup(struct fld_cache *cache, - seqno_t seq, mdsno_t *mds); - #endif diff --git a/lustre/include/lustre_import.h b/lustre/include/lustre_import.h index 3fde7b2..4f6e83f 100644 --- a/lustre/include/lustre_import.h +++ b/lustre/include/lustre_import.h @@ -126,6 +126,7 @@ struct obd_import { cfs_waitq_t imp_recovery_waitq; atomic_t imp_inflight; + atomic_t imp_unregistering; atomic_t imp_replay_inflight; atomic_t imp_inval_count; enum lustre_imp_state imp_state; @@ -199,9 +200,9 @@ static inline unsigned int at_est2timeout(unsigned int val) static inline unsigned int at_timeout2est(unsigned int val) { - /* restore estimate value from timeout */ + /* restore estimate value from timeout: e=4/5(t-5) */ LASSERT(val); - return ((val - 1) / 5 * 4); + return (max((val << 2) / 5, 5U) - 4); } static inline void at_init(struct adaptive_timeout *at, int val, int flags) { diff --git a/lustre/include/lustre_lib.h b/lustre/include/lustre_lib.h index 83697fe..ba36693 100644 --- a/lustre/include/lustre_lib.h +++ b/lustre/include/lustre_lib.h @@ -80,13 +80,12 @@ int target_pack_pool_reply(struct ptlrpc_request *req); int target_handle_ping(struct ptlrpc_request *req); void target_committed_to_req(struct ptlrpc_request *req); -#ifdef HAVE_QUOTA_SUPPORT /* quotacheck callback, dqacq/dqrel callback handler */ int target_handle_qc_callback(struct ptlrpc_request *req); +#ifdef HAVE_QUOTA_SUPPORT int target_handle_dqacq_callback(struct ptlrpc_request *req); #else #define target_handle_dqacq_callback(req) ldlm_callback_reply(req, -ENOTSUPP) -#define target_handle_qc_callback(req) (0) #endif #define OBD_RECOVERY_MAX_TIME (obd_timeout * 18) /* b13079 */ diff --git a/lustre/include/lustre_lite.h 
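[The rewritten at_timeout2est() is the integer inverse of the comment e = 4/5*(t - 5): since 20 is divisible by 5, (4t)/5 - 4 == (4t - 20)/5 for all t, and the max(..., 5U) clamp only kicks in for tiny timeouts to keep the estimate >= 1. A standalone numeric check of that identity:

    #include <assert.h>

    int main(void)
    {
            unsigned int t = 30;                     /* timeout in seconds */
            unsigned int q = (t << 2) / 5;           /* (4t)/5 == 24 */
            unsigned int e = (q > 5U ? q : 5U) - 4;  /* max(q, 5U) - 4 */

            assert(e == 4 * (t - 5) / 5);            /* e == 20 for t == 30 */
            return 0;
    }
]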
b/lustre/include/lustre_lite.h index 439d0dd..34bb9f1 100644 --- a/lustre/include/lustre_lite.h +++ b/lustre/include/lustre_lite.h @@ -165,15 +165,15 @@ static inline int ll_ocd_update(struct obd_device *host, RETURN(result); } -/* +/* * Chain of hash overflow pages. - */ + */ struct ll_dir_chain { /* XXX something. Later */ }; - + static inline void ll_dir_chain_init(struct ll_dir_chain *chain) -{ +{ } static inline void ll_dir_chain_fini(struct ll_dir_chain *chain) diff --git a/lustre/include/lustre_log.h b/lustre/include/lustre_log.h index 3140d5b..402e33e 100644 --- a/lustre/include/lustre_log.h +++ b/lustre/include/lustre_log.h @@ -282,17 +282,17 @@ struct llog_commit_master { */ atomic_t lcm_count; /** - * Ptlrpc requests set. All cancel rpcs go via this request set. - */ - struct ptlrpc_request_set *lcm_set; - /** * Thread control structure. Used for control commit thread. */ struct ptlrpcd_ctl lcm_pc; /** - * Busy resources waitq + * Lock protecting list of llcds. + */ + spinlock_t lcm_lock; + /** + * Llcds in flight for debugging purposes. */ - cfs_waitq_t lcm_waitq; + struct list_head lcm_llcds; /** * Commit thread name buffer. Only used for thread start. */ @@ -317,6 +317,10 @@ struct llog_canceld_ctxt { */ int llcd_size; /** + * Link to lcm llcds list. + */ + struct list_head llcd_list; + /** * Current llcd size while gathering cookies. This should not be * more than ->llcd_size. Used for determining if we need to * send this llcd (if full) and allocate new one. This is also diff --git a/lustre/include/lustre_mds.h b/lustre/include/lustre_mds.h index 4b81027..fb63c75 100644 --- a/lustre/include/lustre_mds.h +++ b/lustre/include/lustre_mds.h @@ -65,8 +65,9 @@ struct mds_group_info { int group; }; -/* mds/mds_reint.c */ +/* mds/mds_lov.c */ int mds_lov_write_objids(struct obd_device *obd); +int mds_lov_prepare_objids(struct obd_device *obd, struct lov_mds_md *lmm); void mds_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm); diff --git a/lustre/include/lustre_net.h b/lustre/include/lustre_net.h index 7ff653c..8fa08d2 100644 --- a/lustre/include/lustre_net.h +++ b/lustre/include/lustre_net.h @@ -58,6 +58,8 @@ #include #include +#include + /* MD flags we _always_ use */ #define PTLRPC_MD_OPTIONS 0 @@ -167,7 +169,7 @@ #define MGS_MAXREPSIZE (9 * 1024) /* Absolute limits */ -#define OSS_THREADS_MIN 2 +#define OSS_THREADS_MIN 3 /* difficult replies, HPQ, others */ #define OSS_THREADS_MAX 512 #define OST_NBUFS (64 * num_online_cpus()) #define OST_BUFSIZE (8 * 1024) @@ -210,8 +212,8 @@ union ptlrpc_async_args { * big enough. For _tons_ of context, OBD_ALLOC a struct and store * a pointer to it here. The pointer_arg ensures this struct is at * least big enough for that. */ - void *pointer_arg[9]; - __u64 space[4]; + void *pointer_arg[11]; + __u64 space[6]; }; struct ptlrpc_request_set; @@ -260,6 +262,8 @@ struct ptlrpc_reply_state { #endif /* updates to following flag serialised by srv_request_lock */ unsigned long rs_difficult:1; /* ACK/commit stuff */ + unsigned long rs_no_ack:1; /* no ACK, even for + difficult requests */ unsigned long rs_scheduled:1; /* being handled? */ unsigned long rs_scheduled_ever:1;/* any schedule attempts? */ unsigned long rs_handled:1; /* been handled yet? 
*/ @@ -289,11 +293,13 @@ struct ptlrpc_reply_state { struct ptlrpc_thread; enum rq_phase { - RQ_PHASE_NEW = 0xebc0de00, - RQ_PHASE_RPC = 0xebc0de01, - RQ_PHASE_BULK = 0xebc0de02, - RQ_PHASE_INTERPRET = 0xebc0de03, - RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_NEW = 0xebc0de00, + RQ_PHASE_RPC = 0xebc0de01, + RQ_PHASE_BULK = 0xebc0de02, + RQ_PHASE_INTERPRET = 0xebc0de03, + RQ_PHASE_COMPLETE = 0xebc0de04, + RQ_PHASE_UNREGISTERING = 0xebc0de05, + RQ_PHASE_UNDEFINED = 0xebc0de06 }; /** Type of request interpreter call-back */ @@ -311,6 +317,20 @@ struct ptlrpc_request_pool { struct lu_context; struct lu_env; +struct ldlm_lock; + +struct ptlrpc_hpreq_ops { + /** + * Check if the lock handle of the given lock is the same as + * taken from the request. + */ + int (*hpreq_lock_match)(struct ptlrpc_request *, struct ldlm_lock *); + /** + * Check if the request is a high priority one. + */ + int (*hpreq_check)(struct ptlrpc_request *); +}; + /** * Represents remote procedure call. */ @@ -319,6 +339,8 @@ struct ptlrpc_request { struct list_head rq_list; struct list_head rq_timed_list; /* server-side early replies */ struct list_head rq_history_list; /* server-side history */ + struct list_head rq_exp_list; /* server-side per-export list */ + struct ptlrpc_hpreq_ops *rq_ops; /* server-side hp handlers */ __u64 rq_history_seq; /* history sequence # */ int rq_status; spinlock_t rq_lock; @@ -342,9 +364,11 @@ struct ptlrpc_request { rq_early:1, rq_must_unlink:1, /* server-side flags */ rq_packed_final:1, /* packed final reply */ - rq_sent_final:1; /* stop sending early replies */ + rq_sent_final:1, /* stop sending early replies */ + rq_hp:1; /* high priority RPC */ enum rq_phase rq_phase; /* one of RQ_PHASE_* */ + enum rq_phase rq_next_phase; /* one of RQ_PHASE_* to be used next */ atomic_t rq_refcount; /* client-side refcount for SENT race, server-side refcounf for multiple replies */ @@ -452,6 +476,8 @@ struct ptlrpc_request { volatile time_t rq_deadline; /* when request must finish. volatile so that servers' early reply updates to the deadline aren't kept in per-cpu cache */ + time_t rq_reply_deadline; /* when req reply unlink must finish. */ + time_t rq_bulk_deadline; /* when req bulk unlink must finish. */ int rq_timeout; /* service time estimate (secs) */ /* Multi-rpc bits */ @@ -503,9 +529,9 @@ static inline int lustre_rep_swabbed(struct ptlrpc_request *req, int index) } static inline const char * -ptlrpc_rqphase2str(const struct ptlrpc_request *req) +ptlrpc_phase2str(enum rq_phase phase) { - switch (req->rq_phase) { + switch (phase) { case RQ_PHASE_NEW: return "New"; case RQ_PHASE_RPC: @@ -516,11 +542,19 @@ ptlrpc_rqphase2str(const struct ptlrpc_request *req) return "Interpret"; case RQ_PHASE_COMPLETE: return "Complete"; + case RQ_PHASE_UNREGISTERING: + return "Unregistering"; default: return "?Phase?"; } } +static inline const char * +ptlrpc_rqphase2str(struct ptlrpc_request *req) +{ + return ptlrpc_phase2str(req->rq_phase); +} + /* Spare the preprocessor, spoil the bugs. */ #define FLAG(field, str) (field ? str : "") @@ -532,9 +566,9 @@ ptlrpc_rqphase2str(const struct ptlrpc_request *req) FLAG(req->rq_restart, "T"), FLAG(req->rq_replay, "P"), \ FLAG(req->rq_no_resend, "N"), \ FLAG(req->rq_waiting, "W"), \ - FLAG(req->rq_wait_ctx, "C") + FLAG(req->rq_wait_ctx, "C"), FLAG(req->rq_hp, "H") -#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s" +#define REQ_FLAGS_FMT "%s:%s%s%s%s%s%s%s%s%s%s%s" void _debug_req(struct ptlrpc_request *req, __u32 mask, struct libcfs_debug_msg_data *data, const char *fmt, ...) 
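[ptlrpc_hpreq_ops is how a service marks in-flight requests as high priority: hpreq_check classifies a request, while hpreq_lock_match apparently lets a conflicting lock reorder the request holding it (cf. ptlrpc_hpreq_reorder() later in this header). A sketch of the wiring; the handler names are hypothetical:

    static int my_hpreq_lock_match(struct ptlrpc_request *req,
                                   struct ldlm_lock *lock)
    {
            /* compare the lock handle carried in req against 'lock' */
            return 0;   /* stub: no match */
    }

    static int my_hpreq_check(struct ptlrpc_request *req)
    {
            return 0;   /* stub: normal priority */
    }

    static struct ptlrpc_hpreq_ops my_hpreq_ops = {
            .hpreq_lock_match = my_hpreq_lock_match,
            .hpreq_check      = my_hpreq_check,
    };

    /* in the request-in handler: req->rq_ops = &my_hpreq_ops; */
]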
@@ -609,13 +643,23 @@ struct ptlrpc_bulk_desc { }; struct ptlrpc_thread { - - struct list_head t_link; /* active threads in svc->srv_threads */ - - void *t_data; /* thread-private data (preallocated memory) */ + /** + * active threads in svc->srv_threads + */ + struct list_head t_link; + /** + * thread-private data (preallocated memory) + */ + void *t_data; __u32 t_flags; - - unsigned int t_id; /* service thread index, from ptlrpc_start_threads */ + /** + * service thread index, from ptlrpc_start_threads + */ + unsigned int t_id; + /** + * put watchdog in the structure per thread b=14840 + */ + struct lc_watchdog *t_watchdog; cfs_waitq_t t_ctl_waitq; struct lu_env *t_env; }; @@ -633,6 +677,9 @@ struct ptlrpc_request_buffer_desc { typedef int (*svc_handler_t)(struct ptlrpc_request *req); typedef void (*svcreq_printfn_t)(void *, struct ptlrpc_request *); +typedef int (*svc_hpreq_handler_t)(struct ptlrpc_request *); + +#define PTLRPC_SVC_HP_RATIO 10 struct ptlrpc_service { struct list_head srv_list; /* chain thru all services */ @@ -647,10 +694,12 @@ struct ptlrpc_service { int srv_threads_running; /* # running threads */ int srv_n_difficult_replies; /* # 'difficult' replies */ int srv_n_active_reqs; /* # reqs being served */ + int srv_n_hpreq; /* # HPreqs being served */ cfs_duration_t srv_rqbd_timeout; /* timeout before re-posting reqs, in tick */ int srv_watchdog_factor; /* soft watchdog timeout mutiplier */ unsigned srv_cpu_affinity:1; /* bind threads to CPUs */ unsigned srv_at_check:1; /* check early replies */ + unsigned srv_is_stopping:1; /* under unregister_service */ cfs_time_t srv_at_checktime; /* debug */ __u32 srv_req_portal; @@ -663,8 +712,11 @@ struct ptlrpc_service { cfs_timer_t srv_at_timer; /* early reply timer */ int srv_n_queued_reqs; /* # reqs in either of the queues below */ + int srv_hpreq_count; /* # hp requests handled */ + int srv_hpreq_ratio; /* # hp per lp reqs to handle */ struct list_head srv_req_in_queue; /* incoming reqs */ struct list_head srv_request_queue; /* reqs waiting for service */ + struct list_head srv_request_hpq; /* high priority queue */ struct list_head srv_request_history; /* request history */ __u64 srv_request_seq; /* next request sequence # */ @@ -689,6 +741,7 @@ struct ptlrpc_service { struct list_head srv_threads; /* service thread list */ svc_handler_t srv_handler; + svc_hpreq_handler_t srv_hpreq_handler; /* hp request handler */ char *srv_name; /* only statically allocated strings here; we don't clean them */ char *srv_thread_name; /* only statically allocated strings here; we don't clean them */ @@ -724,7 +777,7 @@ struct ptlrpc_service { struct ptlrpcd_ctl { /** - * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_STOP_FORCE) + * Ptlrpc thread control flags (LIOD_START, LIOD_STOP, LIOD_FORCE) */ unsigned long pc_flags; /** @@ -783,10 +836,15 @@ enum ptlrpcd_ctl_flags { */ LIOD_STOP = 1 << 1, /** - * Ptlrpc thread stop force flag. This will cause also - * aborting any inflight rpcs handled by thread. + * Ptlrpc thread force flag (only stop force so far). + * This will cause aborting any inflight rpcs handled + * by thread if LIOD_STOP is specified. */ - LIOD_STOP_FORCE = 1 << 2 + LIOD_FORCE = 1 << 2, + /** + * This is a recovery ptlrpc thread. 
+ */ + LIOD_RECOVERY = 1 << 3 }; /* ptlrpc/events.c */ @@ -814,16 +872,38 @@ extern lnet_pid_t ptl_get_pid(void); int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc); void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc); int ptlrpc_register_bulk(struct ptlrpc_request *req); -void ptlrpc_unregister_bulk (struct ptlrpc_request *req); +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async); + +static inline int ptlrpc_server_bulk_active(struct ptlrpc_bulk_desc *desc) +{ + int rc; + + LASSERT(desc != NULL); + + spin_lock(&desc->bd_lock); + rc = desc->bd_network_rw; + spin_unlock(&desc->bd_lock); + return rc; +} -static inline int ptlrpc_bulk_active (struct ptlrpc_bulk_desc *desc) +static inline int ptlrpc_client_bulk_active(struct ptlrpc_request *req) { - int rc; + struct ptlrpc_bulk_desc *desc = req->rq_bulk; + int rc; + + LASSERT(req != NULL); + + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + req->rq_bulk_deadline > cfs_time_current_sec()) + return 1; + + if (!desc) + return 0; spin_lock(&desc->bd_lock); rc = desc->bd_network_rw; spin_unlock(&desc->bd_lock); - return (rc); + return rc; } #define PTLRPC_REPLY_MAYBE_DIFFICULT 0x01 @@ -843,29 +923,9 @@ void ptlrpc_init_client(int req_portal, int rep_portal, char *name, void ptlrpc_cleanup_client(struct obd_import *imp); struct ptlrpc_connection *ptlrpc_uuid_to_connection(struct obd_uuid *uuid); -static inline int -ptlrpc_client_recv_or_unlink (struct ptlrpc_request *req) -{ - int rc; - - spin_lock(&req->rq_lock); - rc = req->rq_receiving_reply || req->rq_must_unlink; - spin_unlock(&req->rq_lock); - return (rc); -} - -static inline void -ptlrpc_wake_client_req (struct ptlrpc_request *req) -{ - if (req->rq_set == NULL) - cfs_waitq_signal(&req->rq_reply_waitq); - else - cfs_waitq_signal(&req->rq_set->set_waitq); -} - int ptlrpc_queue_wait(struct ptlrpc_request *req); int ptlrpc_replay_req(struct ptlrpc_request *req); -void ptlrpc_unregister_reply(struct ptlrpc_request *req); +int ptlrpc_unregister_reply(struct ptlrpc_request *req, int async); void ptlrpc_restart_req(struct ptlrpc_request *req); void ptlrpc_abort_inflight(struct obd_import *imp); void ptlrpc_abort_set(struct ptlrpc_request_set *set); @@ -944,7 +1004,7 @@ struct ptlrpc_service_conf { /* ptlrpc/service.c */ void ptlrpc_save_lock (struct ptlrpc_request *req, - struct lustre_handle *lock, int mode); + struct lustre_handle *lock, int mode, int no_ack); void ptlrpc_commit_replies (struct obd_device *obd); void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs); struct ptlrpc_service *ptlrpc_init_svc_conf(struct ptlrpc_service_conf *c, @@ -961,7 +1021,8 @@ struct ptlrpc_service *ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, cfs_proc_dir_entry_t *proc_entry, svcreq_printfn_t, int min_threads, int max_threads, - char *threadname, __u32 ctx_tags); + char *threadname, __u32 ctx_tags, + svc_hpreq_handler_t); void ptlrpc_stop_all_threads(struct ptlrpc_service *svc); int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc); @@ -970,6 +1031,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service); int liblustre_check_services (void *arg); void ptlrpc_daemonize(char *name); int ptlrpc_service_health_check(struct ptlrpc_service *); +void ptlrpc_hpreq_reorder(struct ptlrpc_request *req); struct ptlrpc_svc_data { @@ -1074,6 +1136,81 @@ lustre_shrink_reply(struct ptlrpc_request *req, int segment, } static inline void +ptlrpc_rqphase_move(struct ptlrpc_request *req, enum rq_phase new_phase) +{ + if (req->rq_phase 
== new_phase) + return; + + if (new_phase == RQ_PHASE_UNREGISTERING) { + req->rq_next_phase = req->rq_phase; + if (req->rq_import) + atomic_inc(&req->rq_import->imp_unregistering); + } + + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + if (req->rq_import) + atomic_dec(&req->rq_import->imp_unregistering); + } + + DEBUG_REQ(D_RPCTRACE, req, "move req \"%s\" -> \"%s\"", + ptlrpc_rqphase2str(req), ptlrpc_phase2str(new_phase)); + + req->rq_phase = new_phase; +} + +static inline int +ptlrpc_client_early(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 0; + return req->rq_early; +} + +static inline int +ptlrpc_client_replied(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 0; + return req->rq_replied; +} + +static inline int +ptlrpc_client_recv(struct ptlrpc_request *req) +{ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) + return 1; + return req->rq_receiving_reply; +} + +static inline int +ptlrpc_client_recv_or_unlink(struct ptlrpc_request *req) +{ + int rc; + + spin_lock(&req->rq_lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + req->rq_reply_deadline > cfs_time_current_sec()) { + spin_unlock(&req->rq_lock); + return 1; + } + rc = req->rq_receiving_reply || req->rq_must_unlink; + spin_unlock(&req->rq_lock); + return rc; +} + +static inline void +ptlrpc_client_wake_req(struct ptlrpc_request *req) +{ + if (req->rq_set == NULL) + cfs_waitq_signal(&req->rq_reply_waitq); + else + cfs_waitq_signal(&req->rq_set->set_waitq); +} + +static inline void ptlrpc_rs_addref(struct ptlrpc_reply_state *rs) { LASSERT(atomic_read(&rs->rs_refcount) > 0); @@ -1142,10 +1279,25 @@ void ping_evictor_stop(void); int ptlrpc_check_and_wait_suspend(struct ptlrpc_request *req); /* ptlrpc/ptlrpcd.c */ -int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc); + +/** + * Ptlrpcd scope is a set of two threads: ptlrpcd-foo and ptlrpcd-foo-rcv, + * these threads are used to asynchronously send requests queued with + * ptlrpcd_add_req(req, PCSOPE_FOO), and to handle completion call-backs for + * such requests. Multiple scopes are needed to avoid dead-locks. + */ +enum ptlrpcd_scope { + /** Scope of bulk read-write rpcs. */ + PSCOPE_BRW, + /** Everything else. 
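[ptlrpc_rqphase_move() stashes the current phase in rq_next_phase when a request enters RQ_PHASE_UNREGISTERING (and accounts it in imp_unregistering), so the sender can park the request while reply/bulk buffers are unlinked and then resume where it left off:

    ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING);
    /* ... wait until ptlrpc_client_recv_or_unlink(req) returns 0 ... */
    ptlrpc_rqphase_move(req, req->rq_next_phase);   /* back to RPC etc. */
]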
*/ + PSCOPE_OTHER, + PSCOPE_NR +}; + +int ptlrpcd_start(const char *name, struct ptlrpcd_ctl *pc); void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force); void ptlrpcd_wake(struct ptlrpc_request *req); -void ptlrpcd_add_req(struct ptlrpc_request *req); +void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope); int ptlrpcd_addref(void); void ptlrpcd_decref(void); diff --git a/lustre/include/lustre_param.h b/lustre/include/lustre_param.h index c8ce970..d106fa1 100644 --- a/lustre/include/lustre_param.h +++ b/lustre/include/lustre_param.h @@ -45,6 +45,7 @@ /* obd_config.c */ int class_find_param(char *buf, char *key, char **valp); +int class_get_next_param(char **params, char *copy); int class_match_param(char *buf, char *key, char **valp); int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh); /* obd_mount.c */ diff --git a/lustre/include/lustre_quota.h b/lustre/include/lustre_quota.h index 2b26e24..b0dc442 100644 --- a/lustre/include/lustre_quota.h +++ b/lustre/include/lustre_quota.h @@ -50,6 +50,8 @@ #include #include #include +#include +#include struct obd_device; struct client_obd; @@ -62,6 +64,64 @@ struct client_obd; #ifdef __KERNEL__ +#ifdef LPROCFS +enum { + LQUOTA_FIRST_STAT = 0, + /** @{ */ + /** + * these four are for measuring quota requests, for both of + * quota master and quota slaves + */ + LQUOTA_SYNC_ACQ = LQUOTA_FIRST_STAT, + LQUOTA_SYNC_REL, + LQUOTA_ASYNC_ACQ, + LQUOTA_ASYNC_REL, + /** }@ */ + /** @{ */ + /** + * these four measure how much time I/O threads spend on dealing + * with quota before and after writing data or creating files, + * only for quota slaves(lquota_chkquota and lquota_pending_commit) + */ + LQUOTA_WAIT_FOR_CHK_BLK, + LQUOTA_WAIT_FOR_CHK_INO, + LQUOTA_WAIT_FOR_COMMIT_BLK, + LQUOTA_WAIT_FOR_COMMIT_INO, + /** }@ */ + /** @{ */ + /** + * these two are for measuring time waiting return of quota reqs + * (qctxt_wait_pending_dqacq), only for quota salves + */ + LQUOTA_WAIT_PENDING_BLK_QUOTA, + LQUOTA_WAIT_PENDING_INO_QUOTA, + /** }@ */ + /** @{ */ + /** + * these two are for those when they are calling + * qctxt_wait_pending_dqacq, the quota req has returned already, + * only for quota salves + */ + LQUOTA_NOWAIT_PENDING_BLK_QUOTA, + LQUOTA_NOWAIT_PENDING_INO_QUOTA, + /** }@ */ + /** @{ */ + /** + * these are for quota ctl + */ + LQUOTA_QUOTA_CTL, + /** }@ */ + /** @{ */ + /** + * these are for adjust quota qunit, for both of + * quota master and quota slaves + */ + LQUOTA_ADJUST_QUNIT, + LQUOTA_LAST_STAT + /** }@ */ +}; +#endif /* LPROCFS */ + /* structures to access admin quotafile */ struct lustre_mem_dqinfo { unsigned int dqi_bgrace; @@ -75,28 +135,45 @@ struct lustre_mem_dqinfo { struct lustre_quota_info { struct file *qi_files[MAXQUOTAS]; struct lustre_mem_dqinfo qi_info[MAXQUOTAS]; + lustre_quota_version_t qi_version; }; #define DQ_STATUS_AVAIL 0x0 /* Available dquot */ #define DQ_STATUS_SET 0x01 /* Sombody is setting dquot */ #define DQ_STATUS_RECOVERY 0x02 /* dquot is in recovery */ +struct lustre_mem_dqblk { + __u64 dqb_bhardlimit; /**< absolute limit on disk blks alloc */ + __u64 dqb_bsoftlimit; /**< preferred limit on disk blks */ + __u64 dqb_curspace; /**< current used space */ + __u64 dqb_ihardlimit; /**< absolute limit on allocated inodes */ + __u64 dqb_isoftlimit; /**< preferred inode limit */ + __u64 dqb_curinodes; /**< current # allocated inodes */ + time_t dqb_btime; /**< time limit for excessive disk use */ + time_t dqb_itime; /**< time limit for excessive inode use */ +}; + struct lustre_dquot { - /* Hash list in 
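[With scoped ptlrpcd threads, a caller queueing an asynchronous RPC now picks the scope explicitly, keeping bulk I/O completions from deadlocking against other queued work; req and ping_req below are assumed to be already-packed requests:

    ptlrpcd_add_req(req, PSCOPE_BRW);           /* bulk read/write scope */
    ptlrpcd_add_req(ping_req, PSCOPE_OTHER);    /* everything else */
]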
memory, protect by dquot_hash_lock */ + /** Hash list in memory, protect by dquot_hash_lock */ struct list_head dq_hash; - /* Protect the data in lustre_dquot */ + /** Protect the data in lustre_dquot */ struct semaphore dq_sem; - /* Use count */ + /** Use count */ int dq_refcnt; - /* Pointer of quota info it belongs to */ + /** Pointer of quota info it belongs to */ struct lustre_quota_info *dq_info; - - loff_t dq_off; /* Offset of dquot on disk */ - unsigned int dq_id; /* ID this applies to (uid, gid) */ - int dq_type; /* Type fo quota (USRQUOTA, GRPQUOUTA) */ - unsigned short dq_status; /* See DQ_STATUS_ */ - unsigned long dq_flags; /* See DQ_ in quota.h */ - struct mem_dqblk dq_dqb; /* Diskquota usage */ + /** Offset of dquot on disk */ + loff_t dq_off; + /** ID this applies to (uid, gid) */ + unsigned int dq_id; + /** Type fo quota (USRQUOTA, GRPQUOUTA) */ + int dq_type; + /** See DQ_STATUS_ */ + unsigned short dq_status; + /** See DQ_ in quota.h */ + unsigned long dq_flags; + /** Diskquota usage */ + struct lustre_mem_dqblk dq_dqb; }; struct dquot_id { @@ -110,37 +187,234 @@ struct dquot_id { #define QFILE_INIT_INFO 4 #define QFILE_RD_DQUOT 5 #define QFILE_WR_DQUOT 6 +#define QFILE_CONVERT 7 /* admin quotafile operations */ +#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0) int lustre_check_quota_file(struct lustre_quota_info *lqi, int type); int lustre_read_quota_info(struct lustre_quota_info *lqi, int type); int lustre_write_quota_info(struct lustre_quota_info *lqi, int type); int lustre_read_dquot(struct lustre_dquot *dquot); int lustre_commit_dquot(struct lustre_dquot *dquot); int lustre_init_quota_info(struct lustre_quota_info *lqi, int type); -int lustre_get_qids(struct file *file, struct inode *inode, int type, +int lustre_get_qids(struct file *file, struct inode *inode, int type, struct list_head *list); +int lustre_quota_convert(struct lustre_quota_info *lqi, int type); +#else + +#ifndef DQ_FAKE_B +#define DQ_FAKE_B 6 +#endif + +static inline int lustre_check_quota_file(struct lustre_quota_info *lqi, + int type) +{ + return 0; +} +static inline int lustre_read_quota_info(struct lustre_quota_info *lqi, + int type) +{ + return 0; +} +static inline int lustre_write_quota_info(struct lustre_quota_info *lqi, + int type) +{ + return 0; +} +static inline int lustre_read_dquot(struct lustre_dquot *dquot) +{ + return 0; +} +static inline int lustre_commit_dquot(struct lustre_dquot *dquot) +{ + return 0; +} +static inline int lustre_init_quota_info(struct lustre_quota_info *lqi, + int type) +{ + return 0; +} +static inline int lustre_quota_convert(struct lustre_quota_info *lqi, + int type) +{ + return 0; +} +#endif /* KERNEL_VERSION(2,5,0) */ #define LL_DQUOT_OFF(sb) DQUOT_OFF(sb) typedef int (*dqacq_handler_t) (struct obd_device * obd, struct qunit_data * qd, int opc); + +/* user quota is turned on on filter */ +#define LQC_USRQUOTA_FLAG (1 << 0) +/* group quota is turned on on filter */ +#define LQC_GRPQUOTA_FLAG (1 << 1) + +#define UGQUOTA2LQC(id) ((Q_TYPEMATCH(id, USRQUOTA) ? LQC_USRQUOTA_FLAG : 0) | \ + (Q_TYPEMATCH(id, GRPQUOTA) ? 
LQC_GRPQUOTA_FLAG : 0)) + struct lustre_quota_ctxt { - struct super_block *lqc_sb; /* superblock this applies to */ - struct obd_import *lqc_import; /* import used to send dqacq/dqrel RPC */ - dqacq_handler_t lqc_handler; /* dqacq/dqrel RPC handler, only for quota master */ - unsigned long lqc_recovery:1, /* Doing recovery */ - lqc_atype:2, /* Turn on user/group quota at setup automatically, - * 0: none, 1: user quota, 2: group quota, 3: both */ - lqc_status:1; /* Quota status. 0:Off, 1:On */ - unsigned long lqc_iunit_sz; /* Unit size of file quota */ - unsigned long lqc_itune_sz; /* Trigger dqacq when available file quota less than - * this value, trigger dqrel when available file quota - * more than this value + 1 iunit */ - unsigned long lqc_bunit_sz; /* Unit size of block quota */ - unsigned long lqc_btune_sz; /* See comment of lqc_itune_sz */ + /** superblock this applies to */ + struct super_block *lqc_sb; + /** obd_device_target for obt_rwsem */ + struct obd_device_target *lqc_obt; + /** import used to send dqacq/dqrel RPC */ + struct obd_import *lqc_import; + /** dqacq/dqrel RPC handler, only for quota master */ + dqacq_handler_t lqc_handler; + /** quota flags */ + unsigned long lqc_flags; + /** @{ */ + unsigned long lqc_recovery:1, /** Doing recovery */ + lqc_switch_qs:1, /** + * the function of change qunit size + * 0:Off, 1:On + */ + lqc_valid:1, /** this qctxt is valid or not */ + lqc_setup:1; /** + * tell whether of not quota_type has + * been processed, so that the master + * knows when it can start processing + * incoming acq/rel quota requests + */ + /** }@ */ + /** + * original unit size of file quota and + * upper limitation for adjust file qunit + */ + unsigned long lqc_iunit_sz; + /** + * Trigger dqacq when available file + * quota less than this value, trigger + * dqrel when available file quota + * more than this value + 1 iunit + */ + unsigned long lqc_itune_sz; + /** + * original unit size of block quota and + * upper limitation for adjust block qunit + */ + unsigned long lqc_bunit_sz; + /** See comment of lqc_itune_sz */ + unsigned long lqc_btune_sz; + /** all lustre_qunit_size structures */ + struct lustre_hash *lqc_lqs_hash; + + /** @{ */ + /** + * the values below are relative to how master change its qunit sizes + */ + /** + * this affects the boundary of + * shrinking and enlarging qunit size. default=4 + */ + unsigned long lqc_cqs_boundary_factor; + /** the least value of block qunit */ + unsigned long lqc_cqs_least_bunit; + /** the least value of inode qunit */ + unsigned long lqc_cqs_least_iunit; + /** + * when enlarging, qunit size will + * mutilple it; when shrinking, + * qunit size will divide it + */ + unsigned long lqc_cqs_qs_factor; + /** + * avoid ping-pong effect of + * adjusting qunit size. 
How many + * seconds must be waited between + * enlarging and shinking qunit + */ + /** }@ */ + int lqc_switch_seconds; + /** + * when blk qunit reaches this value, + * later write reqs from client should be sync b=16642 + */ + int lqc_sync_blk; + /** guard lqc_imp_valid now */ + spinlock_t lqc_lock; + /** + * when mds isn't connected, threads + * on osts who send the quota reqs + * with wait==1 will be put here b=14840 + */ + cfs_waitq_t lqc_wait_for_qmaster; + struct proc_dir_entry *lqc_proc_dir; + /** lquota statistics */ + struct lprocfs_stats *lqc_stats; }; +#define QUOTA_MASTER_READY(qctxt) (qctxt)->lqc_setup = 1 +#define QUOTA_MASTER_UNREADY(qctxt) (qctxt)->lqc_setup = 0 + +struct lustre_qunit_size { + struct hlist_node lqs_hash; /** the hash entry */ + unsigned int lqs_id; /** id of user/group */ + unsigned long lqs_flags; /** is user/group; FULLBUF or LESSBUF */ + unsigned long lqs_iunit_sz; /** Unit size of file quota currently */ + /** + * Trigger dqacq when available file quota + * less than this value, trigger dqrel + * when more than this value + 1 iunit + */ + unsigned long lqs_itune_sz; + unsigned long lqs_bunit_sz; /** Unit size of block quota currently */ + unsigned long lqs_btune_sz; /** See comment of lqs itune sz */ + /** the blocks reached ost and don't finish */ + unsigned long lqs_bwrite_pending; + /** the inodes reached mds and don't finish */ + unsigned long lqs_iwrite_pending; + /** when inodes are allocated/released, this value will record it */ + long long lqs_ino_rec; + /** when blocks are allocated/released, this value will record it */ + long long lqs_blk_rec; + atomic_t lqs_refcount; + cfs_time_t lqs_last_bshrink; /** time of last block shrink */ + cfs_time_t lqs_last_ishrink; /** time of last inode shrink */ + spinlock_t lqs_lock; + struct quota_adjust_qunit lqs_key; /** hash key */ + struct lustre_quota_ctxt *lqs_ctxt; /** quota ctxt */ +}; + +#define LQS_IS_GRP(lqs) ((lqs)->lqs_flags & LQUOTA_FLAGS_GRP) +#define LQS_IS_ADJBLK(lqs) ((lqs)->lqs_flags & LQUOTA_FLAGS_ADJBLK) +#define LQS_IS_ADJINO(lqs) ((lqs)->lqs_flags & LQUOTA_FLAGS_ADJINO) + +#define LQS_SET_GRP(lqs) ((lqs)->lqs_flags |= LQUOTA_FLAGS_GRP) +#define LQS_SET_ADJBLK(lqs) ((lqs)->lqs_flags |= LQUOTA_FLAGS_ADJBLK) +#define LQS_SET_ADJINO(lqs) ((lqs)->lqs_flags |= LQUOTA_FLAGS_ADJINO) + +static inline void lqs_getref(struct lustre_qunit_size *lqs) +{ + atomic_inc(&lqs->lqs_refcount); + CDEBUG(D_QUOTA, "lqs=%p refcount %d\n", + lqs, atomic_read(&lqs->lqs_refcount)); +} + +static inline void lqs_putref(struct lustre_qunit_size *lqs) +{ + LASSERT(atomic_read(&lqs->lqs_refcount) > 0); + + /* killing last ref, let's let hash table kill it */ + if (atomic_read(&lqs->lqs_refcount) == 1) { + lustre_hash_del(lqs->lqs_ctxt->lqc_lqs_hash, + &lqs->lqs_key, &lqs->lqs_hash); + OBD_FREE_PTR(lqs); + } else { + atomic_dec(&lqs->lqs_refcount); + CDEBUG(D_QUOTA, "lqs=%p refcount %d\n", + lqs, atomic_read(&lqs->lqs_refcount)); + + } +} + +static inline void lqs_initref(struct lustre_qunit_size *lqs) +{ + atomic_set(&lqs->lqs_refcount, 0); +} + #else struct lustre_quota_info { @@ -149,6 +423,9 @@ struct lustre_quota_info { struct lustre_quota_ctxt { }; +#define QUOTA_MASTER_READY(qctxt) +#define QUOTA_MASTER_UNREADY(qctxt) + #endif /* !__KERNEL__ */ #else @@ -168,58 +445,109 @@ struct lustre_quota_ctxt { #define MIN_QLIMIT 1 struct quotacheck_thread_args { - struct obd_export *qta_exp; /* obd export */ - struct obd_quotactl qta_oqctl; /* obd_quotactl args */ - struct super_block *qta_sb; /* obd super block */ - 
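[lustre_qunit_size entries are reference counted, with the last lqs_putref() both unhashing from lqc_lqs_hash and freeing the entry. Typical usage around a per-id qunit adjustment; the lookup is assumed to take its own reference via lqs_getref:

    /* lqs obtained from a lookup in qctxt->lqc_lqs_hash */
    spin_lock(&lqs->lqs_lock);
    /* ... adjust lqs->lqs_bunit_sz / lqs->lqs_iunit_sz ... */
    spin_unlock(&lqs->lqs_lock);

    lqs_putref(lqs);    /* last ref: lustre_hash_del() + OBD_FREE_PTR() */
]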
atomic_t *qta_sem; /* obt_quotachecking */ + struct obd_export *qta_exp; /** obd export */ + struct obd_device *qta_obd; /** obd device */ + struct obd_quotactl qta_oqctl; /** obd_quotactl args */ + struct super_block *qta_sb; /** obd super block */ + atomic_t *qta_sem; /** obt_quotachecking */ }; +struct obd_trans_info; +typedef int (*quota_acquire)(struct obd_device *obd, unsigned int uid, + unsigned int gid, struct obd_trans_info *oti, + int isblk); + typedef struct { int (*quota_init) (void); int (*quota_exit) (void); int (*quota_setup) (struct obd_device *); int (*quota_cleanup) (struct obd_device *); - /* For quota master, close admin quota files */ + /** + * For quota master, close admin quota files + */ int (*quota_fs_cleanup) (struct obd_device *); - int (*quota_ctl) (struct obd_export *, struct obd_quotactl *); - int (*quota_check) (struct obd_export *, struct obd_quotactl *); + int (*quota_ctl) (struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*quota_check) (struct obd_device *, struct obd_export *, + struct obd_quotactl *); int (*quota_recovery) (struct obd_device *); - - /* For quota master/slave, adjust quota limit after fs operation */ - int (*quota_adjust) (struct obd_device *, unsigned int[], - unsigned int[], int, int); - - /* For quota slave, set import, trigger quota recovery */ - int (*quota_setinfo) (struct obd_export *, struct obd_device *); - - /* For quota slave, set proper thread resoure capability */ + + /** + * For quota master/slave, adjust quota limit after fs operation + */ + int (*quota_adjust) (struct obd_device *, unsigned int[], + unsigned int[], int, int); + + /** + * For quota slave, set import, trigger quota recovery, + * For quota master, set lqc_setup + */ + int (*quota_setinfo) (struct obd_device *, void *); + + /** + * For quota slave, clear import when relative import is invalid + */ + int (*quota_clearinfo) (struct obd_export *, struct obd_device *); + + /** + * For quota slave, set proper thread resoure capability + */ int (*quota_enforce) (struct obd_device *, unsigned int); - - /* For quota slave, check whether specified uid/gid is over quota */ + + /** + * For quota slave, check whether specified uid/gid is over quota + */ int (*quota_getflag) (struct obd_device *, struct obdo *); - - /* For quota slave, acquire/release quota from master if needed */ - int (*quota_acquire) (struct obd_device *, unsigned int, unsigned int); - - /* For quota slave, check whether specified uid/gid's remaining quota - * can finish a write rpc */ + + /** + * For quota slave, acquire/release quota from master if needed + */ + int (*quota_acquire) (struct obd_device *, unsigned int, unsigned int, + struct obd_trans_info *, int); + + /** + * For quota slave, check whether specified uid/gid's remaining quota + * can finish a block_write or inode_create rpc. 
It updates the pending + * record of block and inode, acquires quota if necessary + */ int (*quota_chkquota) (struct obd_device *, unsigned int, unsigned int, - int); + int, int *, quota_acquire, + struct obd_trans_info *, int); - /* For quota client, poll if the quota check done */ + /** + * For quota client, poll if the quota check done + */ int (*quota_poll_check) (struct obd_export *, struct if_quotacheck *); - - /* For quota client, check whether specified uid/gid is over quota */ + + /** + * For quota client, check whether specified uid/gid is over quota + */ int (*quota_chkdq) (struct client_obd *, unsigned int, unsigned int); - - /* For quota client, set over quota flag for specifed uid/gid */ + + /** + * For quota client, the actions after the pending write is committed + */ + int (*quota_pending_commit) (struct obd_device *, unsigned int, + unsigned int, int, int); + + /** + * For quota client, set over quota flag for specifed uid/gid + */ int (*quota_setdq) (struct client_obd *, unsigned int, unsigned int, obd_flag, obd_flag); + + /** + * For adjusting qunit size b=10600 + */ + int (*quota_adjust_qunit) (struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt); + } quota_interface_t; #define Q_COPY(out, in, member) (out)->member = (in)->member -#define QUOTA_OP(interface, op) interface->quota_ ## op +#define QUOTA_OP(interface, op) interface->quota_ ## op #define QUOTA_CHECK_OP(interface, op) \ do { \ @@ -235,39 +563,39 @@ static inline int lquota_init(quota_interface_t *interface) { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, init); rc = QUOTA_OP(interface, init)(); RETURN(rc); } -static inline int lquota_exit(quota_interface_t *interface) +static inline int lquota_exit(quota_interface_t *interface) { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, exit); rc = QUOTA_OP(interface, exit)(); RETURN(rc); } static inline int lquota_setup(quota_interface_t *interface, - struct obd_device *obd) + struct obd_device *obd) { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, setup); rc = QUOTA_OP(interface, setup)(obd); RETURN(rc); } static inline int lquota_cleanup(quota_interface_t *interface, - struct obd_device *obd) + struct obd_device *obd) { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, cleanup); rc = QUOTA_OP(interface, cleanup)(obd); RETURN(rc); @@ -278,32 +606,57 @@ static inline int lquota_fs_cleanup(quota_interface_t *interface, { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, fs_cleanup); rc = QUOTA_OP(interface, fs_cleanup)(obd); RETURN(rc); } static inline int lquota_recovery(quota_interface_t *interface, - struct obd_device *obd) -{ + struct obd_device *obd) +{ int rc; ENTRY; - + QUOTA_CHECK_OP(interface, recovery); rc = QUOTA_OP(interface, recovery)(obd); RETURN(rc); } +static inline int lquota_check(quota_interface_t *interface, + struct obd_device *obd, + struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + QUOTA_CHECK_OP(interface, check); + rc = QUOTA_OP(interface, check)(obd, exp, oqctl); + RETURN(rc); +} + +static inline int lquota_ctl(quota_interface_t *interface, + struct obd_device *obd, + struct obd_quotactl *oqctl) +{ + int rc; + ENTRY; + + QUOTA_CHECK_OP(interface, ctl); + rc = QUOTA_OP(interface, ctl)(obd, NULL, oqctl); + RETURN(rc); +} + static inline int lquota_adjust(quota_interface_t *interface, - struct obd_device *obd, - unsigned int qcids[], - unsigned int qpids[], - int rc, int opc) + struct obd_device *obd, + unsigned int qcids[], + unsigned int qpids[], + int rc, int opc) { int ret; ENTRY; - + 
QUOTA_CHECK_OP(interface, adjust); ret = QUOTA_OP(interface, adjust)(obd, qcids, qpids, rc, opc); RETURN(ret); @@ -315,7 +668,7 @@ static inline int lquota_chkdq(quota_interface_t *interface, { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, chkdq); rc = QUOTA_OP(interface, chkdq)(cli, uid, gid); RETURN(rc); @@ -328,7 +681,7 @@ static inline int lquota_setdq(quota_interface_t *interface, { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, setdq); rc = QUOTA_OP(interface, setdq)(cli, uid, gid, valid, flags); RETURN(rc); @@ -340,26 +693,37 @@ static inline int lquota_poll_check(quota_interface_t *interface, { int rc; ENTRY; - + QUOTA_CHECK_OP(interface, poll_check); rc = QUOTA_OP(interface, poll_check)(exp, qchk); RETURN(rc); } - static inline int lquota_setinfo(quota_interface_t *interface, - struct obd_export *exp, - struct obd_device *obd) + struct obd_device *obd, + void *data) { int rc; ENTRY; QUOTA_CHECK_OP(interface, setinfo); - rc = QUOTA_OP(interface, setinfo)(exp, obd); + rc = QUOTA_OP(interface, setinfo)(obd, data); + RETURN(rc); +} + +static inline int lquota_clearinfo(quota_interface_t *interface, + struct obd_export *exp, + struct obd_device *obd) +{ + int rc; + ENTRY; + + QUOTA_CHECK_OP(interface, clearinfo); + rc = QUOTA_OP(interface, clearinfo)(exp, obd); RETURN(rc); } -static inline int lquota_enforce(quota_interface_t *interface, +static inline int lquota_enforce(quota_interface_t *interface, struct obd_device *obd, unsigned int ignore) { @@ -381,57 +745,60 @@ static inline int lquota_getflag(quota_interface_t *interface, rc = QUOTA_OP(interface, getflag)(obd, oa); RETURN(rc); } - -static inline int lquota_acquire(quota_interface_t *interface, - struct obd_device *obd, - unsigned int uid, unsigned int gid) + +static inline int lquota_chkquota(quota_interface_t *interface, + struct obd_device *obd, + unsigned int uid, unsigned int gid, int count, + int *flag, struct obd_trans_info *oti, + int isblk) { int rc; ENTRY; + QUOTA_CHECK_OP(interface, chkquota); QUOTA_CHECK_OP(interface, acquire); - rc = QUOTA_OP(interface, acquire)(obd, uid, gid); + rc = QUOTA_OP(interface, chkquota)(obd, uid, gid, count, flag, + QUOTA_OP(interface, acquire), oti, + isblk); RETURN(rc); } -static inline int lquota_chkquota(quota_interface_t *interface, - struct obd_device *obd, - unsigned int uid, unsigned int gid, - int npage) +static inline int lquota_pending_commit(quota_interface_t *interface, + struct obd_device *obd, + unsigned int uid, unsigned int gid, + int npage, int isblk) { int rc; ENTRY; - - QUOTA_CHECK_OP(interface, chkquota); - rc = QUOTA_OP(interface, chkquota)(obd, uid, gid, npage); + + QUOTA_CHECK_OP(interface, pending_commit); + rc = QUOTA_OP(interface, pending_commit)(obd, uid, gid, npage, isblk); RETURN(rc); } -int lprocfs_rd_bunit(char *page, char **start, off_t off, int count, - int *eof, void *data); -int lprocfs_rd_iunit(char *page, char **start, off_t off, int count, - int *eof, void *data); -int lprocfs_wr_bunit(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_wr_iunit(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_rd_btune(char *page, char **start, off_t off, int count, - int *eof, void *data); -int lprocfs_rd_itune(char *page, char **start, off_t off, int count, - int *eof, void *data); -int lprocfs_wr_btune(struct file *file, const char *buffer, - unsigned long count, void *data); -int lprocfs_wr_itune(struct file *file, const char *buffer, - unsigned long count, void *data); -int 
lprocfs_rd_type(char *page, char **start, off_t off, int count, - int *eof, void *data); -int lprocfs_wr_type(struct file *file, const char *buffer, - unsigned long count, void *data); - #ifndef __KERNEL__ extern quota_interface_t osc_quota_interface; -extern quota_interface_t mdc_quota_interface; extern quota_interface_t lov_quota_interface; +extern quota_interface_t mdc_quota_interface; +extern quota_interface_t lmv_quota_interface; + +#ifndef MAXQUOTAS +#define MAXQUOTAS 2 +#endif + +#ifndef USRQUOTA +#define USRQUOTA 0 #endif +#ifndef GRPQUOTA +#define GRPQUOTA 1 +#endif + +#endif + +#define LUSTRE_ADMIN_QUOTAFILES_V2 {\ + "admin_quotafile_v2.usr", /** user admin quotafile */\ + "admin_quotafile_v2.grp" /** group admin quotafile */\ +} + #endif /* _LUSTRE_QUOTA_H */ diff --git a/lustre/include/lustre_req_layout.h b/lustre/include/lustre_req_layout.h index 1853571..4f0c777 100644 --- a/lustre/include/lustre_req_layout.h +++ b/lustre/include/lustre_req_layout.h @@ -177,6 +177,7 @@ extern const struct req_format RQF_OST_CONNECT; extern const struct req_format RQF_OST_DISCONNECT; extern const struct req_format RQF_OST_QUOTACHECK; extern const struct req_format RQF_OST_QUOTACTL; +extern const struct req_format RQF_OST_QUOTA_ADJUST_QUNIT; extern const struct req_format RQF_OST_GETATTR; extern const struct req_format RQF_OST_SETATTR; extern const struct req_format RQF_OST_CREATE; @@ -244,6 +245,7 @@ extern const struct req_msg_field RMF_CAPA1; extern const struct req_msg_field RMF_CAPA2; extern const struct req_msg_field RMF_OBD_QUOTACHECK; extern const struct req_msg_field RMF_OBD_QUOTACTL; +extern const struct req_msg_field RMF_QUOTA_ADJUST_QUNIT; extern const struct req_msg_field RMF_QUNIT_DATA; extern const struct req_msg_field RMF_STRING; diff --git a/lustre/include/lustre_sec.h b/lustre/include/lustre_sec.h index 00c20d2..57a58c7 100644 --- a/lustre/include/lustre_sec.h +++ b/lustre/include/lustre_sec.h @@ -162,7 +162,8 @@ enum sptlrpc_service_type { SPTLRPC_MECH_GSS_KRB5, \ SPTLRPC_SVC_PRIV) -#define SPTLRPC_FLVR_INVALID ((__u16) -1) +#define SPTLRPC_FLVR_ANY ((__u16) 0xf000) +#define SPTLRPC_FLVR_INVALID ((__u16) 0xffff) #define SPTLRPC_FLVR_DEFAULT SPTLRPC_FLVR_NULL @@ -192,10 +193,14 @@ enum lustre_sec_part { LUSTRE_SP_CLI = 0, LUSTRE_SP_MDT, LUSTRE_SP_OST, + LUSTRE_SP_MGC, LUSTRE_SP_MGS, LUSTRE_SP_ANY = 0xFF }; +const char *sptlrpc_part2name(enum lustre_sec_part sp); +enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); + struct sptlrpc_rule { __u32 sr_netid; /* LNET network ID */ __u8 sr_from; /* sec_part */ @@ -210,17 +215,7 @@ struct sptlrpc_rule_set { struct sptlrpc_rule *srs_rules; }; -#define SPTLRPC_CONF_LOG_MAX (64) - -struct sptlrpc_conf_log { - __u32 scl_max; /* maximum rules # */ - __u32 scl_nrule; /* rules # */ - __u8 scl_part; /* which part am i */ - __u8 scl_pad0; - __u16 scl_pad1; - __u32 scl_pad2; - struct sptlrpc_rule scl_rules[SPTLRPC_CONF_LOG_MAX]; -}; +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr); static inline void sptlrpc_rule_set_init(struct sptlrpc_rule_set *set) { @@ -232,31 +227,26 @@ int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *set, int expand); int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *set, struct sptlrpc_rule *rule, int expand); -int sptlrpc_rule_set_from_log(struct sptlrpc_rule_set *rset, - struct sptlrpc_conf_log *log); -void sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, - enum lustre_sec_part from, - lnet_nid_t nid, - struct sptlrpc_flavor *flavor); +int 
sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *set); -struct sptlrpc_conf_log *sptlrpc_conf_log_alloc(void); -void sptlrpc_conf_log_free(struct sptlrpc_conf_log *log); -int sptlrpc_conf_log_populate(struct sptlrpc_rule_set *gen, - struct sptlrpc_rule_set *tgt, - enum lustre_sec_part from, - enum lustre_sec_part to, - unsigned int fl_udesc, - struct sptlrpc_conf_log *log); -struct sptlrpc_conf_log *sptlrpc_conf_log_extract(struct lustre_cfg *lcfg); -void sptlrpc_conf_log_cleanup(struct sptlrpc_conf_log *log); -void sptlrpc_conf_log_dump(struct sptlrpc_conf_log *log); - -const char *sptlrpc_part2name(enum lustre_sec_part part); -enum lustre_sec_part sptlrpc_target_sec_part(struct obd_device *obd); - -int sptlrpc_cliobd_process_config(struct obd_device *obd, - struct lustre_cfg *lcfg); +int sptlrpc_process_config(struct lustre_cfg *lcfg); +void sptlrpc_conf_log_start(const char *logname); +void sptlrpc_conf_log_stop(const char *logname); +void sptlrpc_conf_log_update_begin(const char *logname); +void sptlrpc_conf_log_update_end(const char *logname); +void sptlrpc_conf_client_adapt(struct obd_device *obd); +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset, + int initial); +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *flavor); /* The maximum length of security payload. 1024 is enough for Kerberos 5, * and should be enough for other future mechanisms but not sure. @@ -779,5 +769,14 @@ int bulk_csum_svc(struct ptlrpc_bulk_desc *desc, int read, struct ptlrpc_bulk_sec_desc *bsdv, int vsize, struct ptlrpc_bulk_sec_desc *bsdr, int rsize); +#define CFS_CAP_CHOWN_MASK (1 << CFS_CAP_CHOWN) +#define CFS_CAP_SYS_RESOURCE_MASK (1 << CFS_CAP_SYS_RESOURCE) + +enum { + LUSTRE_SEC_NONE = 0, + LUSTRE_SEC_REMOTE = 1, + LUSTRE_SEC_SPECIFY = 2, + LUSTRE_SEC_ALL = 3 +}; #endif /* _LUSTRE_SEC_H_ */ diff --git a/lustre/include/md_object.h b/lustre/include/md_object.h index 556728d..640ad6d 100644 --- a/lustre/include/md_object.h +++ b/lustre/include/md_object.h @@ -62,7 +62,7 @@ struct md_device; struct md_device_operations; struct md_object; - +struct obd_export; enum { UCRED_INVALID = -1, @@ -73,26 +73,30 @@ enum { struct md_ucred { __u32 mu_valid; - __u32 mu_o_uid; - __u32 mu_o_gid; - __u32 mu_o_fsuid; - __u32 mu_o_fsgid; - __u32 mu_uid; - __u32 mu_gid; - __u32 mu_fsuid; - __u32 mu_fsgid; - __u32 mu_suppgids[2]; - cfs_cap_t mu_cap; - __u32 mu_umask; - struct group_info *mu_ginfo; + __u32 mu_o_uid; + __u32 mu_o_gid; + __u32 mu_o_fsuid; + __u32 mu_o_fsgid; + __u32 mu_uid; + __u32 mu_gid; + __u32 mu_fsuid; + __u32 mu_fsgid; + __u32 mu_suppgids[2]; + cfs_cap_t mu_cap; + __u32 mu_umask; + struct group_info *mu_ginfo; struct md_identity *mu_identity; }; -#define MD_CAPAINFO_MAX 5 +enum { + MD_CAPAINFO_MAX = 5 +}; /** there are at most 5 fids in one operation, see rename, NOTE the last one * is a temporary one used for is_subdir() */ struct md_capainfo { + __u32 mc_auth; + __u32 mc_padding; const struct lu_fid *mc_fid[MD_CAPAINFO_MAX]; struct lustre_capa *mc_capa[MD_CAPAINFO_MAX]; }; @@ -162,11 +166,12 @@ struct md_op_spec { struct md_spec_reg { /** lov objs exist already */ const struct lu_fid *fid; - int no_lov_create; const void *eadata; int eadatalen; } sp_ea; } u; + /** don't create lov objects or llog cookie - this replay 
*/ + int no_create; /** Create flag from client: such as MDS_OPEN_CREAT, and others. */ __u32 sp_cr_flags; @@ -179,6 +184,9 @@ struct md_op_spec { /** Check for split */ int sp_ck_split; + + /** to create directory */ + const struct dt_index_features *sp_feat; }; /** @@ -313,6 +321,80 @@ struct md_device_operations { int (*mdo_update_capa_key)(const struct lu_env *env, struct md_device *m, struct lustre_capa_key *key); + +#ifdef HAVE_QUOTA_SUPPORT + struct md_quota_operations { + int (*mqo_notify)(const struct lu_env *env, + struct md_device *m); + + int (*mqo_setup)(const struct lu_env *env, + struct md_device *m, + void *data); + + int (*mqo_cleanup)(const struct lu_env *env, + struct md_device *m); + + int (*mqo_recovery)(const struct lu_env *env, + struct md_device *m); + + int (*mqo_check)(const struct lu_env *env, + struct md_device *m, + struct obd_export *exp, + __u32 type); + + int (*mqo_on)(const struct lu_env *env, + struct md_device *m, + __u32 type); + + int (*mqo_off)(const struct lu_env *env, + struct md_device *m, + __u32 type); + + int (*mqo_setinfo)(const struct lu_env *env, + struct md_device *m, + __u32 type, + __u32 id, + struct obd_dqinfo *dqinfo); + + int (*mqo_getinfo)(const struct lu_env *env, + const struct md_device *m, + __u32 type, + __u32 id, + struct obd_dqinfo *dqinfo); + + int (*mqo_setquota)(const struct lu_env *env, + struct md_device *m, + __u32 type, + __u32 id, + struct obd_dqblk *dqblk); + + int (*mqo_getquota)(const struct lu_env *env, + const struct md_device *m, + __u32 type, + __u32 id, + struct obd_dqblk *dqblk); + + int (*mqo_getoinfo)(const struct lu_env *env, + const struct md_device *m, + __u32 type, + __u32 id, + struct obd_dqinfo *dqinfo); + + int (*mqo_getoquota)(const struct lu_env *env, + const struct md_device *m, + __u32 type, + __u32 id, + struct obd_dqblk *dqblk); + + int (*mqo_invalidate)(const struct lu_env *env, + struct md_device *m, + __u32 type); + + int (*mqo_finvalidate)(const struct lu_env *env, + struct md_device *m, + __u32 type); + } mdo_quota; +#endif }; enum md_upcall_event { @@ -335,9 +417,9 @@ struct md_upcall { }; struct md_device { - struct lu_device md_lu_dev; + struct lu_device md_lu_dev; const struct md_device_operations *md_ops; - struct md_upcall md_upcall; + struct md_upcall md_upcall; }; static inline void md_upcall_init(struct md_device *m, void *upcl) @@ -377,7 +459,7 @@ static inline int md_do_upcall(const struct lu_env *env, struct md_device *m, } struct md_object { - struct lu_object mo_lu; + struct lu_object mo_lu; const struct md_object_operations *mo_ops; const struct md_dir_operations *mo_dir_ops; }; @@ -454,12 +536,12 @@ static inline struct md_site *lu_site2md(const struct lu_site *s) static inline int md_device_init(struct md_device *md, struct lu_device_type *t) { - return lu_device_init(&md->md_lu_dev, t); + return lu_device_init(&md->md_lu_dev, t); } static inline void md_device_fini(struct md_device *md) { - lu_device_fini(&md->md_lu_dev); + lu_device_fini(&md->md_lu_dev); } static inline struct md_object *md_object_find_slice(const struct lu_env *env, @@ -722,6 +804,54 @@ static inline int mdo_rename_tgt(const struct lu_env *env, } } -/** @} md */ +struct dt_device; +/** + * Structure to hold object information. 
This is used to create object + * \pre llod_dir exist + */ +struct lu_local_obj_desc { + const char *llod_dir; + const char *llod_name; + __u32 llod_oid; + int llod_is_index; + const struct dt_index_features * llod_feat; + struct list_head llod_linkage; +}; +struct md_object *llo_store_resolve(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *path, + struct lu_fid *fid); + +struct md_object *llo_store_open(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + struct lu_fid *fid); + +struct md_object *llo_store_create_index(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + const struct lu_fid *fid, + const struct dt_index_features *feat); + +struct md_object *llo_store_create(const struct lu_env *env, + struct md_device *md, + struct dt_device *dt, + const char *dirname, + const char *objname, + const struct lu_fid *fid); + +void llo_local_obj_register(struct lu_local_obj_desc *); +void llo_local_obj_unregister(struct lu_local_obj_desc *); + +int llo_local_objects_setup(const struct lu_env *env, + struct md_device * md, + struct dt_device * dt); + +/** @} md */ #endif /* _LINUX_MD_OBJECT_H */ diff --git a/lustre/include/obd.h b/lustre/include/obd.h index b42e2c3..badcf8f 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -175,7 +175,7 @@ struct lov_stripe_md { struct obd_info; -typedef int (*obd_enqueue_update_f)(struct obd_info *oinfo, int rc); +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); /* obd info for a particular level (lov, osc). */ struct obd_info { @@ -239,52 +239,6 @@ struct brw_page { obd_flag flag; }; -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - page is added to an rpc */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - to give the caller a chance to update - or cancel the size of the io */ - ASYNC_GROUP_SYNC = 0x8, /* ap_completion will not be called, instead - the page is accounted for in the - obd_io_group given to - obd_queue_group_io */ -}; - -struct obd_async_page_ops { - int (*ap_make_ready)(void *data, int cmd); - int (*ap_refresh_count)(void *data, int cmd); - void (*ap_fill_obdo)(void *data, int cmd, struct obdo *oa); - void (*ap_update_obdo)(void *data, int cmd, struct obdo *oa, - obd_valid valid); - int (*ap_completion)(void *data, int cmd, struct obdo *oa, int rc); - struct obd_capa *(*ap_lookup_capa)(void *data, int cmd); -}; - -/* the `oig' is passed down from a caller of obd rw methods. the callee - * records enough state such that the caller can sleep on the oig and - * be woken when all the callees have finished their work */ -struct obd_io_group { - spinlock_t oig_lock; - atomic_t oig_refcount; - int oig_pending; - int oig_rc; - struct list_head oig_occ_list; - cfs_waitq_t oig_waitq; -}; - -/* the oig callback context lets the callee of obd rw methods register - * for callbacks from the caller. */ -struct oig_callback_context { - struct list_head occ_oig_item; - /* called when the caller has received a signal while sleeping. - * callees of this method are encouraged to abort their state - * in the oig. This may be called multiple times. 
*/ - void (*occ_interrupted)(struct oig_callback_context *occ); - unsigned long interrupted:1; -}; - /* Individual type definitions */ struct ost_server_data; @@ -294,13 +248,10 @@ struct obd_device_target { struct super_block *obt_sb; atomic_t obt_quotachecking; struct lustre_quota_ctxt obt_qctxt; + lustre_quota_version_t obt_qfmt; + struct rw_semaphore obt_rwsem; }; -typedef void (*obd_pin_extent_cb)(void *data); -typedef int (*obd_page_removal_cb_t)(void *data, int discard); -typedef int (*obd_lock_cancel_cb)(struct ldlm_lock *,struct ldlm_lock_desc *, - void *, int); - /* llog contexts */ enum llog_ctxt_id { LLOG_CONFIG_ORIG_CTXT = 0, @@ -413,6 +364,7 @@ struct filter_obd { struct list_head fo_capa_keys; struct hlist_head *fo_capa_hash; struct llog_commit_master *fo_lcm; + int fo_sec_level; }; #define OSC_MAX_RIF_DEFAULT 8 @@ -426,7 +378,6 @@ struct filter_obd { struct mdc_rpc_lock; struct obd_import; -struct lustre_cache; struct client_obd { struct rw_semaphore cl_sem; struct obd_uuid cl_target_uuid; @@ -438,9 +389,9 @@ struct client_obd { int cl_max_mds_easize; int cl_max_mds_cookiesize; - /* security configuration */ - struct sptlrpc_rule_set cl_sptlrpc_rset; - enum lustre_sec_part cl_sec_part; + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ //struct llog_canceld_ctxt *cl_llcd; /* it's included by obd_llog_ctxt */ void *cl_llcd_offset; @@ -448,6 +399,7 @@ struct client_obd { /* the grant values are protected by loi_list_lock below */ long cl_dirty; /* all _dirty_ in bytes */ long cl_dirty_max; /* allowed w/o rpc */ + long cl_dirty_transit; /* dirty synchronous */ long cl_avail_grant; /* bytes of credit for ost */ long cl_lost_grant; /* lost credits (trunc) */ struct list_head cl_cache_waiters; /* waiting for cache/grant */ @@ -521,10 +473,6 @@ struct client_obd { struct lu_client_seq *cl_seq; atomic_t cl_resends; /* resend count */ - - /* Cache of triples */ - struct lustre_cache *cl_cache; - obd_lock_cancel_cb cl_ext_lock_cancel_cb; }; #define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) @@ -572,12 +520,13 @@ struct mds_obd { __u32 mds_id; /* mark pages dirty for write. 
*/ - bitmap_t *mds_lov_page_dirty; + bitmap_t *mds_lov_page_dirty; /* array for store pages with obd_id */ - void **mds_lov_page_array; + void **mds_lov_page_array; /* file for store objid */ struct file *mds_lov_objid_filp; __u32 mds_lov_objid_count; + __u32 mds_lov_objid_max_index; __u32 mds_lov_objid_lastpage; __u32 mds_lov_objid_lastidx; @@ -592,13 +541,16 @@ struct mds_obd { mds_fl_acl:1, mds_evict_ost_nids:1, mds_fl_cfglog:1, - mds_fl_synced:1; + mds_fl_synced:1, + mds_quota:1, + mds_fl_target:1; /* mds have one or + * more targets */ struct upcall_cache *mds_identity_cache; /* for capability keys update */ struct lustre_capa_key *mds_capa_keys; - struct rw_semaphore mds_notify_lock; + struct rw_semaphore mds_notify_lock; }; /* lov objid */ @@ -639,6 +591,7 @@ struct echo_client_obd { struct obd_export *ec_exp; /* the local connection to osc/lov */ spinlock_t ec_lock; struct list_head ec_objects; + struct list_head ec_locks; int ec_nstripes; __u64 ec_unique; }; @@ -646,10 +599,11 @@ struct echo_client_obd { struct lov_qos_oss { struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ struct list_head lqo_oss_list; /* link to lov_qos */ - __u32 lqo_ost_count; /* number of osts on this oss */ __u64 lqo_bavail; /* total bytes avail on OSS */ __u64 lqo_penalty; /* current penalty */ __u64 lqo_penalty_per_obj; /* penalty decrease every obj*/ + time_t lqo_used; /* last used time, seconds */ + __u32 lqo_ost_count; /* number of osts on this oss */ }; struct ltd_qos { @@ -657,6 +611,7 @@ struct ltd_qos { __u64 ltq_penalty; /* current penalty */ __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ __u64 ltq_weight; /* net weighting */ + time_t ltq_used; /* last used time, seconds */ unsigned int ltq_usable:1; /* usable for striping */ }; @@ -734,13 +689,11 @@ struct lov_obd { __u32 lov_death_row;/* tgts scheduled to be deleted */ __u32 lov_tgt_size; /* size of tgts array */ int lov_connects; - obd_page_removal_cb_t lov_page_removal_cb; - obd_pin_extent_cb lov_page_pin_cb; - obd_lock_cancel_cb lov_lock_cancel_cb; int lov_pool_count; lustre_hash_t *lov_pools_hash_body; /* used for key access */ struct list_head lov_pool_list; /* used for sequential access */ cfs_proc_dir_entry_t *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; }; struct lmv_tgt_desc { @@ -801,8 +754,10 @@ struct niobuf_local { #define LUSTRE_CMM_NAME "cmm" #define LUSTRE_MDD_NAME "mdd" #define LUSTRE_OSD_NAME "osd" +#define LUSTRE_VVP_NAME "vvp" #define LUSTRE_LMV_NAME "lmv" #define LUSTRE_CMM_MDC_NAME "cmm-mdc" +#define LUSTRE_SLP_NAME "slp" /* obd device type names */ /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ @@ -843,7 +798,7 @@ struct obd_trans_info { int oti_numcookies; /* initial thread handling transaction */ - int oti_thread_id; + struct ptlrpc_thread * oti_thread; __u32 oti_conn_cnt; struct obd_uuid *oti_ost_uuid; @@ -863,7 +818,7 @@ static inline void oti_init(struct obd_trans_info *oti, if (req->rq_repmsg != NULL) oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); - oti->oti_thread_id = req->rq_svc_thread ? 
req->rq_svc_thread->t_id : -1; + oti->oti_thread = req->rq_svc_thread; if (req->rq_reqmsg != NULL) oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); } @@ -917,8 +872,9 @@ enum obd_notify_event { /* bit-mask flags for config events */ enum config_flags { - CONFIG_LOG = 0x1, /* finished processing config log */ - CONFIG_SYNC = 0x2 /* mdt synced 1 ost */ + CONFIG_LOG = 0x1, /* finished processing config log */ + CONFIG_SYNC = 0x2, /* mdt synced 1 ost */ + CONFIG_TARGET = 0x4 /* one target is added */ }; /* @@ -940,11 +896,40 @@ struct target_recovery_data { }; enum filter_groups { + FILTER_GROUP_MDS0 = 0, FILTER_GROUP_LLOG = 1, - FILTER_GROUP_ECHO, - FILTER_GROUP_MDS0 + FILTER_GROUP_ECHO = 2 , + FILTER_GROUP_MDS1_N_BASE = 3 }; +static inline __u64 obdo_mdsno(struct obdo *oa) +{ + if (oa->o_gr) + return oa->o_gr - FILTER_GROUP_MDS1_N_BASE; + return 0; +} + +static inline int mdt_to_obd_objgrp(int mdtid) +{ + if (mdtid) + return FILTER_GROUP_MDS1_N_BASE + mdtid; + return 0; +} + +/** + * In HEAD for CMD, the object is created in group number which is 3>= + * or indexing starts from 3. To test this assertions are added to disallow + * group 0. But to run 2.0 mds server on 1.8.x disk format (i.e. interop_mode) + * object in group 0 needs to be allowed. + * So for interop mode following changes needs to be done: + * 1. No need to assert on group 0 or allow group 0 + * 2. The group number indexing starts from 0 instead of 3 + */ + +#define CHECK_MDS_GROUP(group) (group == FILTER_GROUP_MDS0 || \ + group > FILTER_GROUP_MDS1_N_BASE) +#define LASSERT_MDS_GROUP(group) LASSERT(CHECK_MDS_GROUP(group)) + struct obd_llog_group { struct list_head olg_list; int olg_group; @@ -1124,6 +1109,8 @@ enum obd_cleanup_stage { #define KEY_BLOCKSIZE "blocksize" #define KEY_BLOCKSIZE_BITS "blocksize_bits" #define KEY_FIEMAP "FIEMAP" +#define KEY_SPTLRPC_CONF "sptlrpc_conf" +#define KEY_MGSSEC "mgssec" /* XXX unused ?*/ #define KEY_INTERMDS "inter_mds" #define KEY_ASYNC "async" @@ -1269,7 +1256,7 @@ struct obd_ops { struct lov_stripe_md **ea, struct obd_trans_info *oti); int (*o_destroy)(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_exp); + struct obd_export *md_exp, void *capa); int (*o_setattr)(struct obd_export *exp, struct obd_info *oinfo, struct obd_trans_info *oti); int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, @@ -1281,47 +1268,6 @@ struct obd_ops { int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo, obd_count oa_bufs, struct brw_page *pgarr, struct obd_trans_info *oti); - int (*o_brw_async)(int rw, struct obd_export *exp, - struct obd_info *oinfo, obd_count oa_bufs, - struct brw_page *pgarr, struct obd_trans_info *oti, - struct ptlrpc_request_set *); - int (*o_prep_async_page)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - cfs_page_t *page, obd_off offset, - struct obd_async_page_ops *ops, void *data, - void **res, int nocache, - struct lustre_handle *lockh); - int (*o_reget_short_lock)(struct obd_export *exp, - struct lov_stripe_md *lsm, - void **res, int rw, - obd_off start, obd_off end, - void **cookie); - int (*o_release_short_lock)(struct obd_export *exp, - struct lov_stripe_md *lsm, obd_off end, - void *cookie, int rw); - int (*o_queue_async_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags); - int (*o_queue_group_io)(struct 
obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, - void *cookie, int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags); - int (*o_trigger_group_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig); - int (*o_set_async_flags)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - obd_flag async_flags); - int (*o_teardown_async_page)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie); int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only); int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, @@ -1355,9 +1301,6 @@ struct obd_ops { int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo, struct ldlm_enqueue_info *einfo, struct ptlrpc_request_set *rqset); - int (*o_match)(struct obd_export *, struct lov_stripe_md *, __u32 type, - ldlm_policy_data_t *, __u32 mode, int *flags, void *data, - struct lustre_handle *lockh); int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *, ldlm_iterator_t it, void *data); int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md, @@ -1391,20 +1334,17 @@ struct obd_ops { struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); /* quota methods */ - int (*o_quotacheck)(struct obd_export *, struct obd_quotactl *); - int (*o_quotactl)(struct obd_export *, struct obd_quotactl *); + int (*o_quotacheck)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quota_adjust_qunit)(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt); + int (*o_ping)(struct obd_export *exp); - int (*o_register_page_removal_cb)(struct obd_export *exp, - obd_page_removal_cb_t cb, - obd_pin_extent_cb pin_cb); - int (*o_unregister_page_removal_cb)(struct obd_export *exp, - obd_page_removal_cb_t cb); - int (*o_register_lock_cancel_cb)(struct obd_export *exp, - obd_lock_cancel_cb cb); - int (*o_unregister_lock_cancel_cb)(struct obd_export *exp, - obd_lock_cancel_cb cb); /* pools methods */ int (*o_pool_new)(struct obd_device *obd, char *poolname); int (*o_pool_del)(struct obd_device *obd, char *poolname); @@ -1544,6 +1484,8 @@ struct md_ops { void *opaque); int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, renew_capa_cb_t cb); + int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, + const struct req_msg_field *, struct obd_capa **); int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, struct obd_capa *, __u32, @@ -1634,11 +1576,12 @@ static inline void init_obd_quota_ops(quota_interface_t *interface, LASSERT(obd_ops); obd_ops->o_quotacheck = QUOTA_OP(interface, check); obd_ops->o_quotactl = QUOTA_OP(interface, ctl); + obd_ops->o_quota_adjust_qunit = QUOTA_OP(interface, adjust_qunit); } static inline __u64 oinfo_mdsno(struct obd_info *oinfo) { - return oinfo->oi_oa->o_gr - FILTER_GROUP_MDS0; + return obdo_mdsno(oinfo->oi_oa); } static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo) diff --git a/lustre/include/obd_class.h b/lustre/include/obd_class.h index 045e3ff..1bc75e1 100644 --- a/lustre/include/obd_class.h +++ b/lustre/include/obd_class.h @@ -86,18 +86,11 @@ void class_obd_list(void); struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid, const char * typ_name, struct obd_uuid 
*grp_uuid); -struct obd_device * class_find_client_notype(struct obd_uuid *tgt_uuid, - struct obd_uuid *grp_uuid); struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next); struct obd_device * class_num2obd(int num); -int oig_init(struct obd_io_group **oig); -int oig_add_one(struct obd_io_group *oig, struct oig_callback_context *occ); -void oig_complete_one(struct obd_io_group *oig, - struct oig_callback_context *occ, int rc); -void oig_release(struct obd_io_group *oig); -int oig_wait(struct obd_io_group *oig); +int class_notify_sptlrpc_conf(const char *fsname, int namelen); char *obd_export_nid2str(struct obd_export *exp); @@ -107,6 +100,7 @@ int obd_export_evict_by_uuid(struct obd_device *obd, const char *uuid); int obd_zombie_impexp_init(void); void obd_zombie_impexp_stop(void); void obd_zombie_impexp_cull(void); +void obd_zombie_barrier(void); /* obd_config.c */ int class_process_config(struct lustre_cfg *lcfg); @@ -140,6 +134,7 @@ static inline void lprocfs_echo_init_vars(struct lprocfs_static_vars *lvars) /* Passed as data param to class_config_parse_llog */ struct config_llog_instance { char * cfg_instance; + char * cfg_obdname; struct super_block *cfg_sb; struct obd_uuid cfg_uuid; int cfg_last_idx; /* for partial llog processing */ @@ -157,9 +152,11 @@ struct config_llog_data { struct config_llog_instance cld_cfg; struct list_head cld_list_chain; atomic_t cld_refcount; + struct config_llog_data *cld_sptlrpc;/* depended sptlrpc log */ struct obd_export *cld_mgcexp; unsigned int cld_stopping:1, /* we were told to stop watching */ - cld_lostlock:1; /* lock not requeued */ + cld_lostlock:1, /* lock not requeued */ + cld_is_sptlrpc:1; }; struct lustre_profile { @@ -411,27 +408,45 @@ static inline int obd_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(rc); } -#ifdef __KERNEL__ +/* + * obd-lu integration. + * + * Functionality is being moved into new lu_device-based layering, but some + * pieces of configuration process are still based on obd devices. + * + * Specifically, lu_device_type_operations::ldto_device_alloc() methods fully + * subsume ->o_setup() methods of obd devices they replace. The same for + * lu_device_operations::ldo_process_config() and ->o_process_config(). As a + * result, obd_setup() and obd_process_config() branch and call one XOR + * another. + * + * Yet neither lu_device_type_operations::ldto_device_fini() nor + * lu_device_type_operations::ldto_device_free() fully implement the + * functionality of ->o_precleanup() and ->o_cleanup() they override. Hence, + * obd_precleanup() and obd_cleanup() call both lu_device and obd operations. 
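 + *
 + * Illustrative sketch only (not part of this patch): a device type opts
 + * into the lu path simply by publishing the allocation/teardown methods
 + * on its lu_device_type, and obd_setup() below then branches to them.
 + * The "foo_" names are hypothetical:
 + *
 + *     static const struct lu_device_type_operations foo_ldto = {
 + *             .ldto_device_alloc = foo_device_alloc,
 + *             .ldto_device_fini  = foo_device_fini,
 + *             .ldto_device_free  = foo_device_free,
 + *     };
 + *
 + * A type with foo_ldto wired into its obd_type::typ_lu takes the
 + * ldto_device_alloc() branch in obd_setup(); a legacy type with
 + * typ_lu == NULL falls through to its ->o_setup() method unchanged.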
+ */ + #define DECLARE_LU_VARS(ldt, d) \ - struct lu_device_type *ldt; \ + struct lu_device_type *ldt; \ struct lu_device *d -#else -#define DECLARE_LU_VARS(ldt, d) \ - extern void __placeholder_to_put_a_semicolon(void) -#endif + static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) { int rc; DECLARE_LU_VARS(ldt, d); ENTRY; -#ifdef __KERNEL__ ldt = obd->obd_type->typ_lu; if (ldt != NULL) { + struct lu_context session_ctx; struct lu_env env; + lu_context_init(&session_ctx, LCT_SESSION); + session_ctx.lc_thread = NULL; + lu_context_enter(&session_ctx); - rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags); + rc = lu_env_init(&env, ldt->ldt_ctx_tags); if (rc == 0) { + env.le_ses = &session_ctx; d = ldt->ldt_ops->ldto_device_alloc(&env, ldt, cfg); lu_env_fini(&env); if (!IS_ERR(d)) { @@ -441,9 +456,10 @@ static inline int obd_setup(struct obd_device *obd, struct lustre_cfg *cfg) } else rc = PTR_ERR(d); } - } else -#endif - { + lu_context_exit(&session_ctx); + lu_context_fini(&session_ctx); + + } else { OBD_CHECK_DT_OP(obd, setup, -EOPNOTSUPP); OBD_COUNTER_INCREMENT(obd, setup); rc = OBP(obd, setup)(obd, cfg); @@ -459,29 +475,23 @@ static inline int obd_precleanup(struct obd_device *obd, ENTRY; OBD_CHECK_DEV(obd); -#ifdef __KERNEL__ ldt = obd->obd_type->typ_lu; d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { if (cleanup_stage == OBD_CLEANUP_EXPORTS) { struct lu_env env; - rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags); + rc = lu_env_init(&env, ldt->ldt_ctx_tags); if (rc == 0) { ldt->ldt_ops->ldto_device_fini(&env, d); lu_env_fini(&env); } - } else { - rc = 0; } - } else -#endif - { - OBD_CHECK_DT_OP(obd, precleanup, 0); - rc = OBP(obd, precleanup)(obd, cleanup_stage); } - + OBD_CHECK_DT_OP(obd, precleanup, 0); OBD_COUNTER_INCREMENT(obd, precleanup); + + rc = OBP(obd, precleanup)(obd, cleanup_stage); RETURN(rc); } @@ -493,25 +503,22 @@ static inline int obd_cleanup(struct obd_device *obd) OBD_CHECK_DEV(obd); -#ifdef __KERNEL__ ldt = obd->obd_type->typ_lu; d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { struct lu_env env; - rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags); + rc = lu_env_init(&env, ldt->ldt_ctx_tags); if (rc == 0) { ldt->ldt_ops->ldto_device_free(&env, d); lu_env_fini(&env); obd->obd_lu_dev = NULL; } - } else -#endif - { - OBD_CHECK_DT_OP(obd, cleanup, 0); - rc = OBP(obd, cleanup)(obd); } + OBD_CHECK_DT_OP(obd, cleanup, 0); OBD_COUNTER_INCREMENT(obd, cleanup); + + rc = OBP(obd, cleanup)(obd); RETURN(rc); } @@ -524,20 +531,17 @@ obd_process_config(struct obd_device *obd, int datalen, void *data) OBD_CHECK_DEV(obd); -#ifdef __KERNEL__ ldt = obd->obd_type->typ_lu; d = obd->obd_lu_dev; if (ldt != NULL && d != NULL) { struct lu_env env; - rc = lu_env_init(&env, NULL, ldt->ldt_ctx_tags); + rc = lu_env_init(&env, ldt->ldt_ctx_tags); if (rc == 0) { rc = d->ld_ops->ldo_process_config(&env, d, data); lu_env_fini(&env); } - } else -#endif - { + } else { OBD_CHECK_DT_OP(obd, process_config, -EOPNOTSUPP); rc = OBP(obd, process_config)(obd, datalen, data); } @@ -624,9 +628,13 @@ static inline int obd_alloc_memmd(struct obd_export *exp, static inline int obd_free_memmd(struct obd_export *exp, struct lov_stripe_md **mem_tgt) { + int rc; + LASSERT(mem_tgt); LASSERT(*mem_tgt); - return obd_unpackmd(exp, mem_tgt, NULL, 0); + rc = obd_unpackmd(exp, mem_tgt, NULL, 0); + *mem_tgt = NULL; + return rc; } static inline int obd_checkmd(struct obd_export *exp, @@ -672,7 +680,7 @@ static inline int obd_create(struct obd_export *exp, struct obdo *obdo, static inline int 
obd_destroy(struct obd_export *exp, struct obdo *obdo, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_exp) + struct obd_export *md_exp, void *capa) { int rc; ENTRY; @@ -680,7 +688,7 @@ static inline int obd_destroy(struct obd_export *exp, struct obdo *obdo, EXP_CHECK_DT_OP(exp, destroy); EXP_COUNTER_INCREMENT(exp, destroy); - rc = OBP(exp->exp_obd, destroy)(exp, obdo, ea, oti, md_exp); + rc = OBP(exp->exp_obd, destroy)(exp, obdo, ea, oti, md_exp, capa); RETURN(rc); } @@ -811,10 +819,8 @@ static inline int obd_connect(const struct lu_env *env, void *localdata) { int rc; -#ifdef LIBCFS_DEBUG __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition * check */ -#endif ENTRY; OBD_CHECK_DEV_ACTIVE(obd); @@ -836,10 +842,8 @@ static inline int obd_reconnect(const struct lu_env *env, void *localdata) { int rc; -#ifdef LIBCFS_DEBUG __u64 ocf = d ? d->ocd_connect_flags : 0; /* for post-condition * check */ -#endif ENTRY; @@ -1200,230 +1204,6 @@ static inline int obd_brw(int cmd, struct obd_export *exp, RETURN(rc); } -static inline int obd_brw_async(int cmd, struct obd_export *exp, - struct obd_info *oinfo, obd_count oa_bufs, - struct brw_page *pg, struct obd_trans_info *oti, - struct ptlrpc_request_set *set) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, brw_async); - EXP_COUNTER_INCREMENT(exp, brw_async); - - if (!(cmd & OBD_BRW_RWMASK)) { - CERROR("obd_brw: cmd must be OBD_BRW_READ or OBD_BRW_WRITE\n"); - LBUG(); - } - - rc = OBP(exp->exp_obd, brw_async)(cmd, exp, oinfo, oa_bufs, pg,oti,set); - RETURN(rc); -} - -static inline int obd_brw_rqset(int cmd, struct obd_export *exp, - struct obdo *oa, struct lov_stripe_md *lsm, - obd_count oa_bufs, struct brw_page *pg, - struct obd_trans_info *oti, - struct obd_capa *ocapa) -{ - struct ptlrpc_request_set *set = NULL; - struct obd_info oinfo = { { { 0 } } }; - int rc = 0; - ENTRY; - - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); - - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - oinfo.oi_capa = ocapa; - rc = obd_brw_async(cmd, exp, &oinfo, oa_bufs, pg, oti, set); - if (rc == 0) { - rc = ptlrpc_set_wait(set); - if (rc) - CERROR("error from callback: rc = %d\n", rc); - } else { - CDEBUG(rc == -ENOSPC ? D_INODE : D_ERROR, - "error from obd_brw_async: rc = %d\n", rc); - } - ptlrpc_set_destroy(set); - RETURN(rc); -} - -static inline int obd_prep_async_page(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - cfs_page_t *page, obd_off offset, - struct obd_async_page_ops *ops, - void *data, void **res, int nocache, - struct lustre_handle *lockh) -{ - int ret; - ENTRY; - - EXP_CHECK_DT_OP(exp, prep_async_page); - EXP_COUNTER_INCREMENT(exp, prep_async_page); - - ret = OBP(exp->exp_obd, prep_async_page)(exp, lsm, loi, page, offset, - ops, data, res, nocache, - lockh); - RETURN(ret); -} - -/** - * Checks if requested extent lock is compatible with a lock under the page. - * - * Checks if the lock under \a page is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. 
- * - * \param exp obd export (lov or osc) - * \param lsm striping information for the file - * \param res async_page placeholder - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param start start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * - * \post result == 1, *cookie == context, appropriate lock is referenced or - * - * \retval 1 owned lock is reused for the request - * \retval 0 no lock reused for the request - * \retval -ENOTSUPP reget_short_lock is not exported at this layer - * - * \see obd_release_short_lock - */ -static inline int obd_reget_short_lock(struct obd_export *exp, - struct lov_stripe_md *lsm, - void **res, int rw, - obd_off start, obd_off end, - void **cookie) -{ - ENTRY; - - EXP_CHECK_DT_OP(exp, reget_short_lock); - EXP_COUNTER_INCREMENT(exp, reget_short_lock); - - RETURN(OBP(exp->exp_obd, reget_short_lock)(exp, lsm, res, rw, - start, end, cookie)); -} - - -/** - * Releases a reference to a lock taken in a "fast" way. - * - * Releases a read or write (specified by \a rw) lock - * referenced by \a cookie. - * - * \param exp obd export (lov or osc) - * \param lsm striping information for the file - * \param end end of the locked extent - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * - * \post appropriate lock is dereferenced - * - * \see obd_reget_short_lock - */ -static inline int obd_release_short_lock(struct obd_export *exp, - struct lov_stripe_md *lsm, obd_off end, - void *cookie, int rw) -{ - ENTRY; - - EXP_CHECK_DT_OP(exp, release_short_lock); - EXP_COUNTER_INCREMENT(exp, release_short_lock); - - RETURN(OBP(exp->exp_obd, release_short_lock)(exp, lsm, end, - cookie, rw)); -} - -static inline int obd_queue_async_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, queue_async_io); - EXP_COUNTER_INCREMENT(exp, queue_async_io); - LASSERT(cmd & OBD_BRW_RWMASK); - - rc = OBP(exp->exp_obd, queue_async_io)(exp, lsm, loi, cookie, cmd, off, - count, brw_flags, async_flags); - RETURN(rc); -} - -static inline int obd_set_async_flags(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - obd_flag async_flags) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, set_async_flags); - EXP_COUNTER_INCREMENT(exp, set_async_flags); - - rc = OBP(exp->exp_obd, set_async_flags)(exp, lsm, loi, cookie, - async_flags); - RETURN(rc); -} - -static inline int obd_queue_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, - void *cookie, int cmd, obd_off off, - int count, obd_flag brw_flags, - obd_flag async_flags) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, queue_group_io); - EXP_COUNTER_INCREMENT(exp, queue_group_io); - LASSERT(cmd & OBD_BRW_RWMASK); - - rc = OBP(exp->exp_obd, queue_group_io)(exp, lsm, loi, oig, cookie, - cmd, off, count, brw_flags, - async_flags); - RETURN(rc); -} - -static inline int obd_trigger_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, trigger_group_io); - EXP_COUNTER_INCREMENT(exp, trigger_group_io); - - rc = OBP(exp->exp_obd, 
trigger_group_io)(exp, lsm, loi, oig); - RETURN(rc); -} - -static inline int obd_teardown_async_page(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, teardown_async_page); - EXP_COUNTER_INCREMENT(exp, teardown_async_page); - - rc = OBP(exp->exp_obd, teardown_async_page)(exp, lsm, loi, cookie); - RETURN(rc); -} - static inline int obd_preprw(int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, struct niobuf_remote *remote, int *pages, @@ -1536,21 +1316,6 @@ static inline int obd_enqueue(struct obd_export *exp, RETURN(rc); } -static inline int obd_match(struct obd_export *exp, struct lov_stripe_md *ea, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *data, struct lustre_handle *lockh) -{ - int rc; - ENTRY; - - EXP_CHECK_DT_OP(exp, match); - EXP_COUNTER_INCREMENT(exp, match); - - rc = OBP(exp->exp_obd, match)(exp, ea, type, policy, mode, flags, data, - lockh); - RETURN(rc); -} - static inline int obd_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, ldlm_iterator_t it, void *data) @@ -1715,7 +1480,7 @@ static inline int obd_quotacheck(struct obd_export *exp, EXP_CHECK_DT_OP(exp, quotacheck); EXP_COUNTER_INCREMENT(exp, quotacheck); - rc = OBP(exp->exp_obd, quotacheck)(exp, oqctl); + rc = OBP(exp->exp_obd, quotacheck)(exp->exp_obd, exp, oqctl); RETURN(rc); } @@ -1728,7 +1493,39 @@ static inline int obd_quotactl(struct obd_export *exp, EXP_CHECK_DT_OP(exp, quotactl); EXP_COUNTER_INCREMENT(exp, quotactl); - rc = OBP(exp->exp_obd, quotactl)(exp, oqctl); + rc = OBP(exp->exp_obd, quotactl)(exp->exp_obd, exp, oqctl); + RETURN(rc); +} + +static inline int obd_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt) +{ +#if defined(LPROCFS) && defined(HAVE_QUOTA_SUPPORT) + struct timeval work_start; + struct timeval work_end; + long timediff; +#endif + int rc; + ENTRY; + +#if defined(LPROCFS) && defined(HAVE_QUOTA_SUPPORT) + if (qctxt) + do_gettimeofday(&work_start); +#endif + EXP_CHECK_DT_OP(exp, quota_adjust_qunit); + EXP_COUNTER_INCREMENT(exp, quota_adjust_qunit); + + rc = OBP(exp->exp_obd, quota_adjust_qunit)(exp, oqaq, qctxt); + +#if defined(LPROCFS) && defined(HAVE_QUOTA_SUPPORT) + if (qctxt) { + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, LQUOTA_ADJUST_QUNIT, + timediff); + } +#endif RETURN(rc); } @@ -1769,6 +1566,7 @@ static inline int obd_register_observer(struct obd_device *obd, RETURN(0); } +#if 0 static inline int obd_register_page_removal_cb(struct obd_export *exp, obd_page_removal_cb_t cb, obd_pin_extent_cb pin_cb) @@ -1821,6 +1619,7 @@ static inline int obd_unregister_lock_cancel_cb(struct obd_export *exp, rc = OBP(exp->exp_obd, unregister_lock_cancel_cb)(exp, cb); RETURN(rc); } +#endif /* metadata helpers */ static inline int md_getstatus(struct obd_export *exp, @@ -2176,6 +1975,19 @@ static inline int md_renew_capa(struct obd_export *exp, struct obd_capa *ocapa, RETURN(rc); } +static inline int md_unpack_capa(struct obd_export *exp, + struct ptlrpc_request *req, + const struct req_msg_field *field, + struct obd_capa **oc) +{ + int rc; + ENTRY; + EXP_CHECK_MD_OP(exp, unpack_capa); + EXP_MD_COUNTER_INCREMENT(exp, unpack_capa); + rc = MDP(exp->exp_obd, unpack_capa)(exp, req, field, oc); + RETURN(rc); +} + static inline int md_intent_getattr_async(struct obd_export *exp, struct 
md_enqueue_info *minfo, struct ldlm_enqueue_info *einfo) diff --git a/lustre/include/obd_lov.h b/lustre/include/obd_lov.h index da3ca51..b4de8d2 100644 --- a/lustre/include/obd_lov.h +++ b/lustre/include/obd_lov.h @@ -52,7 +52,6 @@ static inline int lov_mds_md_size(int stripes, int lmm_magic) stripes * sizeof(struct lov_ost_data_v1); } - #define IOC_LOV_TYPE 'g' #define IOC_LOV_MIN_NR 50 #define IOC_LOV_SET_OSC_ACTIVE _IOWR('g', 50, long) diff --git a/lustre/include/obd_ost.h b/lustre/include/obd_ost.h index e72400a..8ddb969 100644 --- a/lustre/include/obd_ost.h +++ b/lustre/include/obd_ost.h @@ -36,7 +36,7 @@ * lustre/include/obd_ost.h * * Data structures for object storage targets and client: OST & OSC's - * + * * See also lustre_idl.h for wire formats of requests. */ @@ -54,21 +54,35 @@ struct osc_brw_async_args { struct brw_page **aa_ppga; struct client_obd *aa_cli; struct list_head aa_oaps; + struct obd_capa *aa_ocapa; + struct cl_req *aa_clerq; }; struct osc_async_args { struct obd_info *aa_oi; }; +struct osc_punch_args { + struct obdo *pa_oa; + obd_enqueue_update_f pa_upcall; + void *pa_cookie; +}; + struct osc_enqueue_args { - struct obd_export *oa_exp; - struct obd_info *oa_oi; - struct ldlm_enqueue_info*oa_ei; + struct obd_export *oa_exp; + int *oa_flags; + obd_enqueue_update_f oa_upcall; + void *oa_cookie; + struct ost_lvb *oa_lvb; + struct lustre_handle *oa_lockh; + struct ldlm_enqueue_info *oa_ei; }; +#if 0 int osc_extent_blocking_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new, void *data, int flag); +#endif /** * Build DLM resource name from object id & group for osc-ost extent lock. diff --git a/lustre/include/obd_support.h b/lustre/include/obd_support.h index 0507ce6..bcb16c9 100644 --- a/lustre/include/obd_support.h +++ b/lustre/include/obd_support.h @@ -77,6 +77,7 @@ extern unsigned int ldlm_timeout; /* seconds */ extern unsigned int obd_sync_filter; extern unsigned int obd_max_dirty_pages; extern atomic_t obd_dirty_pages; +extern atomic_t obd_dirty_transit_pages; extern cfs_waitq_t obd_race_waitq; extern int obd_race_state; extern unsigned int obd_alloc_fail_rate; @@ -190,6 +191,9 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_MDS_LOV_SYNC_RACE 0x13e #define OBD_FAIL_MDS_CLOSE_NET_REP 0x13f #define OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT 0x140 +#define OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS 0x141 +#define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x142 +#define OBD_FAIL_MDS_DROP_QUOTA_REQ 0x143 #define OBD_FAIL_OST 0x200 #define OBD_FAIL_OST_CONNECT_NET 0x201 @@ -253,6 +257,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_LDLM_CLOSE_THREAD 0x313 #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 #define OBD_FAIL_LDLM_CP_CB_WAIT 0x315 +#define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 /* LOCKLESS IO */ #define OBD_FAIL_LDLM_SET_CONTENTION 0x315 @@ -286,6 +291,10 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_PTLRPC_PAUSE_REP 0x50c #define OBD_FAIL_PTLRPC_DUMP_LOG 0x50e +#define OBD_FAIL_PTLRPC_LONG_REPL_UNLINK 0x50f +#define OBD_FAIL_PTLRPC_LONG_BULK_UNLINK 0x510 +#define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511 +#define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512 #define OBD_FAIL_OBD_PING_NET 0x600 #define OBD_FAIL_OBD_LOG_CANCEL_NET 0x601 @@ -316,7 +325,7 @@ int obd_alloc_fail(const void *ptr, const char *name, const char *type, #define OBD_FAIL_MGS_PAUSE_REQ 0x904 #define OBD_FAIL_MGS_PAUSE_TARGET_REG 0x905 -#define OBD_FAIL_QUOTA_QD_COUNT_32BIT 0xA00 +#define 
OBD_FAIL_QUOTA_RET_QDATA 0xA02 #define OBD_FAIL_LPROC_REMOVE 0xB00 diff --git a/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch b/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch index ceaaa20..b88a2bd 100644 --- a/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch +++ b/lustre/kernel_patches/patches/2.6-rhel4-kgdb-ga.patch @@ -1573,7 +1573,7 @@ Index: linux/MAINTAINERS =================================================================== --- linux.orig/MAINTAINERS +++ linux/MAINTAINERS -@@ -1242,6 +1242,12 @@ W: http://sf.net/projects/kernel-janitor +@@ -1247,6 +1247,12 @@ W: http://sf.net/projects/kernel-janitor W: http://developer.osdl.org/rddunlap/kj-patches/ S: Maintained @@ -1590,7 +1590,7 @@ Index: linux/arch/i386/Kconfig =================================================================== --- linux.orig/arch/i386/Kconfig +++ linux/arch/i386/Kconfig -@@ -1250,6 +1250,14 @@ menu "Executable file formats" +@@ -1292,6 +1292,14 @@ menu "Executable file formats" source "fs/Kconfig.binfmt" @@ -1800,7 +1800,7 @@ Index: linux/arch/i386/Makefile =================================================================== --- linux.orig/arch/i386/Makefile +++ linux/arch/i386/Makefile -@@ -98,6 +98,9 @@ core-$(CONFIG_X86_ES7000) := arch/i386/m +@@ -102,6 +102,9 @@ core-$(CONFIG_X86_ES7000) := arch/i386/m # default subarch .h files mflags-y += -Iinclude/asm-i386/mach-default @@ -1855,7 +1855,7 @@ Index: linux/arch/i386/kernel/entry.S #define __RESTORE_INT_REGS \ popl %ebx; \ -@@ -357,6 +370,7 @@ need_resched: +@@ -361,6 +374,7 @@ need_resched: # sysenter call handler stub ENTRY(sysenter_entry) movl TSS_sysenter_esp0(%esp),%esp @@ -1863,7 +1863,7 @@ Index: linux/arch/i386/kernel/entry.S sysenter_past_esp: sti pushl $(__USER_DS) -@@ -437,6 +451,19 @@ syscall_exit: +@@ -441,6 +455,19 @@ syscall_exit: testw $_TIF_ALLWORK_MASK, %cx # current->work jne syscall_exit_work restore_all: @@ -4382,7 +4382,7 @@ Index: linux/arch/i386/kernel/traps.c if (notify_die(DIE_GPF, "general protection fault", regs, error_code, 13, SIGSEGV) == NOTIFY_STOP) return; -@@ -829,8 +874,18 @@ asmlinkage void do_debug(struct pt_regs +@@ -835,8 +880,18 @@ asmlinkage void do_debug(struct pt_regs * allowing programs to debug themselves without the ptrace() * interface. */ @@ -4401,7 +4401,7 @@ Index: linux/arch/i386/kernel/traps.c if ((tsk->ptrace & (PT_DTRACE|PT_PTRACED)) == PT_DTRACE) goto clear_TF; } -@@ -842,6 +897,17 @@ asmlinkage void do_debug(struct pt_regs +@@ -848,6 +903,17 @@ asmlinkage void do_debug(struct pt_regs info.si_errno = 0; info.si_code = TRAP_BRKPT; @@ -4419,7 +4419,7 @@ Index: linux/arch/i386/kernel/traps.c /* If this is a kernel mode trap, save the user PC on entry to * the kernel, that's what the debugger can make sense of. 
*/ -@@ -856,6 +922,7 @@ clear_dr7: +@@ -862,6 +928,7 @@ clear_dr7: __asm__("movl %0,%%db7" : /* no output */ : "r" (0)); @@ -4427,7 +4427,7 @@ Index: linux/arch/i386/kernel/traps.c return; debug_vm86: -@@ -1151,6 +1218,12 @@ static void __init set_task_gate(unsigne +@@ -1157,6 +1224,12 @@ static void __init set_task_gate(unsigne { _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); } @@ -4440,7 +4440,7 @@ Index: linux/arch/i386/kernel/traps.c void __init trap_init(void) -@@ -1169,7 +1242,11 @@ void __init trap_init(void) +@@ -1175,7 +1248,11 @@ void __init trap_init(void) set_trap_gate(0,÷_error); set_intr_gate(1,&debug); set_intr_gate(2,&nmi); @@ -5051,7 +5051,7 @@ Index: linux/drivers/serial/8250.c =================================================================== --- linux.orig/drivers/serial/8250.c +++ linux/drivers/serial/8250.c -@@ -880,7 +880,7 @@ receive_chars(struct uart_8250_port *up, +@@ -882,7 +882,7 @@ receive_chars(struct uart_8250_port *up, if (unlikely(tty->flip.count >= TTY_FLIPBUF_SIZE)) { tty->flip.work.func((void *)tty); if (tty->flip.count >= TTY_FLIPBUF_SIZE) @@ -5060,7 +5060,7 @@ Index: linux/drivers/serial/8250.c } ch = serial_inp(up, UART_RX); *tty->flip.char_buf_ptr = ch; -@@ -1241,12 +1241,21 @@ static void serial8250_break_ctl(struct +@@ -1245,12 +1245,21 @@ static void serial8250_break_ctl(struct spin_unlock_irqrestore(&up->port.lock, flags); } @@ -5082,7 +5082,7 @@ Index: linux/drivers/serial/8250.c up->capabilities = uart_config[up->port.type].flags; up->mcr = 0; -@@ -1877,6 +1886,10 @@ static void __init serial8250_register_p +@@ -1881,6 +1890,10 @@ static void __init serial8250_register_p for (i = 0; i < UART_NR; i++) { struct uart_8250_port *up = &serial8250_ports[i]; @@ -5093,7 +5093,7 @@ Index: linux/drivers/serial/8250.c up->port.line = i; up->port.ops = &serial8250_pops; init_timer(&up->timer); -@@ -2160,6 +2173,31 @@ void serial8250_resume_port(int line) +@@ -2181,6 +2194,31 @@ void serial8250_resume_port(int line) uart_resume_port(&serial8250_reg, &serial8250_ports[line].port); } @@ -6317,20 +6317,12 @@ Index: linux/include/linux/spinlock.h + SET_WHO(x, current) \ } while (0) - /* without debugging, spin_is_locked on UP always says -@@ -151,6 +162,7 @@ typedef struct { - (x)->lock = 1; \ - (x)->owner = __FILE__; \ - (x)->oline = __LINE__; \ -+ SET_WHO(x, current) \ - 1; \ - }) - + #define spin_is_locked(x) \ Index: linux/kernel/pid.c =================================================================== --- linux.orig/kernel/pid.c +++ linux/kernel/pid.c -@@ -276,6 +276,9 @@ int pid_alive(struct task_struct *p) +@@ -320,6 +320,9 @@ struct pid *find_ge_pid(int nr) * machine. From a minimum of 16 slots up to 4096 slots at one gigabyte or * more. 
*/ @@ -6340,7 +6332,7 @@ Index: linux/kernel/pid.c void __init pidhash_init(void) { int i, j, pidhash_size; -@@ -297,6 +300,9 @@ void __init pidhash_init(void) +@@ -341,6 +344,9 @@ void __init pidhash_init(void) for (j = 0; j < pidhash_size; j++) INIT_HLIST_HEAD(&pid_hash[i][j]); } @@ -6354,7 +6346,7 @@ Index: linux/kernel/sched.c =================================================================== --- linux.orig/kernel/sched.c +++ linux/kernel/sched.c -@@ -3190,6 +3190,13 @@ out_unlock: +@@ -3207,6 +3207,13 @@ out_unlock: EXPORT_SYMBOL(set_user_nice); diff --git a/lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch b/lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch new file mode 100644 index 0000000..6e38859 --- /dev/null +++ b/lustre/kernel_patches/patches/2.6-rhel5-kgdb-ga.patch @@ -0,0 +1,19200 @@ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/Documentation/DocBook/Makefile linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/Makefile +--- linux-2.6.18-53.1.14/Documentation/DocBook/Makefile 2008-03-06 05:54:50.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/Makefile 2008-06-10 15:37:25.000000000 +0400 +@@ -11,7 +11,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mc + procfs-guide.xml writing_usb_driver.xml \ + kernel-api.xml journal-api.xml lsm.xml utrace.xml usb.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ +- genericirq.xml ++ genericirq.xml kgdb.xml + + ### + # The build process is as follows (targets): +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/Documentation/DocBook/kgdb.tmpl linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/kgdb.tmpl +--- linux-2.6.18-53.1.14/Documentation/DocBook/kgdb.tmpl 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/Documentation/DocBook/kgdb.tmpl 2008-06-10 15:38:50.000000000 +0400 +@@ -0,0 +1,250 @@ ++ ++ ++ ++ ++ ++ KGDB Internals ++ ++ ++ ++ Tom ++ Rini ++ ++
++ trini@kernel.crashing.org ++
++
++
++
++ ++ ++ ++ Amit S. ++ Kale ++ ++
++ amitkale@linsyssoft.com ++
++
++
++
++ ++ ++ 2004-2005 ++ MontaVista Software, Inc. ++ ++ ++ 2004 ++ Amit S. Kale ++ ++ ++ ++ ++ This file is licensed under the terms of the GNU General Public License ++ version 2. This program is licensed "as is" without any warranty of any ++ kind, whether express or implied. ++ ++ ++ ++
++
++
++
++ Introduction
++
++ kgdb is a source level debugger for the linux kernel. It is used along
++ with gdb to debug a linux kernel. Kernel developers can debug a kernel
++ much as they would an application program with the use of kgdb. It makes it
++ possible to place breakpoints in kernel code, step through the code
++ and observe variables.
++
++
++ Two machines are required for using kgdb. One of these machines is a
++ development machine and the other is a test machine. The machines are
++ typically connected through a serial line, a null-modem cable which
++ connects their serial ports. It is also possible, however, to use an
++ ethernet connection between the machines. The kernel to be debugged
++ runs on the test machine. gdb runs on the development machine. The
++ serial line or ethernet connection is used by gdb to communicate to
++ the kernel being debugged.
++
++
++
++ Compiling a kernel
++
++ To enable CONFIG_KGDB, look under the "Kernel debugging"
++ and then select "KGDB: kernel debugging with remote gdb".
++
++
++ The first choice for I/O is CONFIG_KGDB_ONLY_MODULES.
++ This means that you will only be able to use KGDB after loading a
++ kernel module that defines how you want to be able to talk with
++ KGDB. There are two other choices (more on some architectures) that
++ can be enabled as modules later, if not picked here.
++
++ The first of these is CONFIG_KGDB_8250_NOMODULE.
++ This has sub-options such as CONFIG_KGDB_SIMPLE_SERIAL
++ which toggles choosing the serial port by ttyS number or by specifying
++ a port and IRQ number.
++
++
++ The second of these choices on most systems for I/O is
++ CONFIG_KGDBOE. This requires that the machine to be
++ debugged has an ethernet card which supports the netpoll API, such as
++ the cards supported by CONFIG_E100. There are no
++ sub-options for this, but a kernel command line option is required.
++
++
++
++ Booting the kernel
++
++ The kernel command line option kgdbwait makes kgdb
++ wait for a gdb connection during booting of a kernel. If the
++ CONFIG_KGDB_8250 driver is used (or if applicable,
++ another serial driver) this breakpoint will happen very early on, before
++ console output. If you wish to change serial port information and you
++ have enabled both CONFIG_KGDB_8250 and
++ CONFIG_KGDB_SIMPLE_SERIAL then you must pass the option
++ kgdb8250=<io or mmio>,<address>,<baud
++ rate>,<irq> before kgdbwait.
++ The values io or mmio refer to
++ whether the address being passed next needs to be memory mapped
++ (mmio) or not. The address must
++ be passed in hex and is the hardware address and will be remapped if
++ passed as mmio. The values
++ baud rate and irq are base-10.
++ The supported values for baud rate are
++ 9600, 19200,
++ 38400, 57600, and
++ 115200.
++
++
++ To have KGDB stop the kernel and wait, with the compiled values for the
++ serial driver, pass in: kgdbwait.
++
++
++ To specify the values of the SH SCI(F) serial port at boot:
++ kgdbsci=0,115200.
++
++
++ To specify the values of the serial port at boot:
++ kgdb8250=io,3f8,115200,3.
++ On IA64 this could also be:
++ kgdb8250=mmio,0xff5e0000,115200,74
++ And to have KGDB also stop the kernel and wait for GDB to connect, pass in
++ kgdbwait after this argument.
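++ Putting these together, a complete boot-line fragment for the common
++ serial case above would therefore be:
++ kgdb8250=io,3f8,115200,3 kgdbwait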
++
++
++ To configure the CONFIG_KGDBOE driver, pass in
++ kgdboe=[src-port]@<src-ip>/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]
++ where:
++
++ src-port (optional): source for UDP packets (defaults to 6443)
++ src-ip: source IP to use (interface address)
++ dev (optional): network interface (eth0)
++ tgt-port (optional): port GDB will use (defaults to 6442)
++ tgt-ip: IP address GDB will be connecting from
++ tgt-macaddr (optional): ethernet MAC address for logging agent (default is broadcast)
++
++
++
++ The CONFIG_KGDBOE driver can be reconfigured at run
++ time, if CONFIG_SYSFS and
++ CONFIG_MODULES are enabled, by echo'ing a new config string to
++ /sys/module/kgdboe/parameter/kgdboe. The
++ driver can be unconfigured with the special string
++ not_configured.
++
++
++
++ Connecting gdb
++
++ If you have used any of the methods to have KGDB stop and create
++ an initial breakpoint described in the previous chapter, kgdb prints
++ the message "Waiting for connection from remote gdb..." on the console
++ and waits for a connection from gdb. At this point you connect gdb to kgdb.
++
++
++ Example (serial):
++
++
++ % gdb ./vmlinux
++ (gdb) set remotebaud 115200
++ (gdb) target remote /dev/ttyS0
++
++
++ Example (ethernet):
++
++
++ % gdb ./vmlinux
++ (gdb) target remote udp:192.168.2.2:6443
++
++
++ Once connected, you can debug a kernel the way you would debug an
++ application program.
++
++
++
++ Architecture specific notes
++
++ SuperH: The NMI switch found on some boards can be used to trigger an
++ initial breakpoint. Subsequent triggers do nothing. If console
++ is enabled on the SCI(F) serial port, and that is the port being used
++ for KGDB, then you must trigger a breakpoint via sysrq, NMI, or
++ some other method prior to connecting, or echo a control-c to the
++ serial port. Also, to use the SCI(F) port for KGDB, the
++ CONFIG_SERIAL_SH_SCI driver must be enabled.
++
++
++
++ The common backend (required)
++
++ There are a few flags which must be set on every architecture in
++ their <asm/kgdb.h> file. These are:
++
++
++
++ NUMREGBYTES: The size in bytes of all of the registers, so
++ that we can ensure they will all fit into a packet.
++
++
++ BUFMAX: The size in bytes of the buffer GDB will read into.
++ This must be larger than NUMREGBYTES.
++
++
++ CACHE_FLUSH_IS_SAFE: Set to one if it is always safe to call
++ flush_cache_range or flush_icache_range. On some architectures,
++ these functions may not be safe to call on SMP since we keep other
++ CPUs in a holding pattern.
++
++
++
++
++
++ There are also the following functions for the common backend,
++ found in kernel/kgdb.c, that must be supplied by the
++ architecture-specific backend. No weak version of these is provided.
++
++!Iinclude/linux/kgdb.h
++
++
++ The common backend (optional)
++
++ These functions are part of the common backend, found in kernel/kgdb.c,
++ and are optionally implemented. Some functions (with _hw_ in the name)
++ end up being required on arches which use hardware breakpoints.
++
++!Ikernel/kgdb.c
++
++
++ Driver-Specific Functions
++
++ Some of the I/O drivers have additional functions that can be
++ called and that are specific to the driver. Calls from other places
++ to these functions must be wrapped in #ifdefs for the driver in
++ question.
++
++!Idrivers/serial/8250_kgdb.c
++
++
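As a concrete illustration of the three required flags above, a hypothetical <asm/kgdb.h> fragment could look like the following sketch; the numeric values are assumptions chosen for illustration, not values taken from this patch:

/* Hypothetical per-arch <asm/kgdb.h> sketch -- example values only. */
#ifndef _ASM_KGDB_H_
#define _ASM_KGDB_H_

/* Total size in bytes of all registers, so a 'g' packet is known to fit. */
#define NUMREGBYTES		64
/* GDB packet buffer; must be larger than NUMREGBYTES. */
#define BUFMAX			1024
/* 1 if flush_cache_range()/flush_icache_range() are safe to call while
 * the other CPUs are held in the KGDB holding pattern. */
#define CACHE_FLUSH_IS_SAFE	1

#endif /* _ASM_KGDB_H_ */

The i386 and ia64 code later in this patch shows the fuller picture: per-register numbering macros on top of these constants, plus an arch_kgdb_ops describing the breakpoint instruction.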
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/MAINTAINERS linux-2.6.18-53.1.14.kgdb/MAINTAINERS +--- linux-2.6.18-53.1.14/MAINTAINERS 2008-03-06 05:54:49.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/MAINTAINERS 2008-06-10 15:37:25.000000000 +0400 +@@ -1715,6 +1715,15 @@ L: linux-kernel@vger.kernel.org + L: fastboot@osdl.org + S: Maintained + ++KGDB ++P: Tom Rini ++P: Amit S. Kale ++M: trini@kernel.crashing.org ++M: amitkale@linsyssoft.com ++W: http://sourceforge.net/projects/kgdb ++L: kgdb-bugreport@lists.sourceforge.net ++S: Maintained ++ + KPROBES + P: Prasanna S Panchamukhi + M: prasanna@in.ibm.com +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/Makefile linux-2.6.18-53.1.14.kgdb/Makefile +--- linux-2.6.18-53.1.14/Makefile 2008-03-06 05:55:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/Makefile 2008-06-10 15:39:01.000000000 +0400 +@@ -992,6 +992,7 @@ MRPROPER_DIRS += include/config include + MRPROPER_FILES += .config .config.old include/asm .version .old_version \ + include/linux/autoconf.h include/linux/version.h \ + include/linux/utsrelease.h \ ++ include/linux/dwarf2-defs.h \ + Module.symvers tags TAGS cscope* + + # clean - Delete most, but leave enough to build external modules +@@ -1422,7 +1423,11 @@ clean := -f $(if $(KBUILD_SRC),$(srctree + endif # skip-makefile + + PHONY += FORCE +-FORCE: ++include/linux/dwarf2-defs.h: $(srctree)/include/linux/dwarf2.h $(srctree)/scripts/dwarfh.awk ++ mkdir -p include/linux/ ++ awk -f $(srctree)/scripts/dwarfh.awk $(srctree)/include/linux/dwarf2.h > include/linux/dwarf2-defs.h ++ ++FORCE: include/linux/dwarf2-defs.h + + + # Declare the contents of the .PHONY variable as phony. We keep that +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/arm/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/Makefile 2008-06-10 15:38:56.000000000 +0400 +@@ -20,6 +20,7 @@ obj-$(CONFIG_ISA_DMA) += dma-isa.o + obj-$(CONFIG_PCI) += bios32.o isa.o + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + + obj-$(CONFIG_CRUNCH) += crunch.o crunch-bits.o + AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/entry-armv.S linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/entry-armv.S +--- linux-2.6.18-53.1.14/arch/arm/kernel/entry-armv.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/entry-armv.S 2008-06-10 15:39:01.000000000 +0400 +@@ -15,6 +15,7 @@ + * it to save wrong values... Be aware! + */ + ++#include + #include + #include + #include +@@ -232,6 +233,7 @@ svc_preempt: + beq preempt_return @ go again + b 1b + #endif ++ CFI_END_FRAME(__irq_svc) + + .align 5 + __und_svc: +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb-jmp.S +--- linux-2.6.18-53.1.14/arch/arm/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb-jmp.S 2008-06-10 15:38:56.000000000 +0400 +@@ -0,0 +1,32 @@ ++/* ++ * arch/arm/kernel/kgdb-jmp.S ++ * ++ * Trivial setjmp and longjmp procedures to support bus error recovery ++ * which may occur during kgdb memory read/write operations. ++ * ++ * Author: MontaVista Software, Inc. ++ * source@mvista.com ++ * ++ * 2002-2005 (c) MontaVista Software, Inc. 
This file is licensed under the ++ * terms of the GNU General Public License version 2. This program as licensed ++ * "as is" without any warranty of any kind, whether express or implied. ++ */ ++#include ++ ++ENTRY (kgdb_fault_setjmp) ++ /* Save registers */ ++ stmia r0, {r0-r14} ++ str lr,[r0, #60] ++ mrs r1,cpsr ++ str r1,[r0,#64] ++ ldr r1,[r0,#4] ++ mov r0, #0 ++ mov pc,lr ++ ++ENTRY (kgdb_fault_longjmp) ++ /* Restore registers */ ++ mov r1,#1 ++ str r1,[r0] ++ ldr r1,[r0, #64] ++ msr spsr,r1 ++ ldmia r0,{r0-pc}^ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/arm/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/kgdb.c 2008-06-10 15:38:56.000000000 +0400 +@@ -0,0 +1,208 @@ ++/* ++ * arch/arm/kernel/kgdb.c ++ * ++ * ARM KGDB support ++ * ++ * Copyright (c) 2002-2004 MontaVista Software, Inc ++ * ++ * Authors: George Davis ++ * Deepak Saxena ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Make a local copy of the registers passed into the handler (bletch) */ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs) ++{ ++ int regno; ++ ++ /* Initialize all to zero (??) */ ++ for (regno = 0; regno < GDB_MAX_REGS; regno++) ++ gdb_regs[regno] = 0; ++ ++ gdb_regs[_R0] = kernel_regs->ARM_r0; ++ gdb_regs[_R1] = kernel_regs->ARM_r1; ++ gdb_regs[_R2] = kernel_regs->ARM_r2; ++ gdb_regs[_R3] = kernel_regs->ARM_r3; ++ gdb_regs[_R4] = kernel_regs->ARM_r4; ++ gdb_regs[_R5] = kernel_regs->ARM_r5; ++ gdb_regs[_R6] = kernel_regs->ARM_r6; ++ gdb_regs[_R7] = kernel_regs->ARM_r7; ++ gdb_regs[_R8] = kernel_regs->ARM_r8; ++ gdb_regs[_R9] = kernel_regs->ARM_r9; ++ gdb_regs[_R10] = kernel_regs->ARM_r10; ++ gdb_regs[_FP] = kernel_regs->ARM_fp; ++ gdb_regs[_IP] = kernel_regs->ARM_ip; ++ gdb_regs[_SP] = kernel_regs->ARM_sp; ++ gdb_regs[_LR] = kernel_regs->ARM_lr; ++ gdb_regs[_PC] = kernel_regs->ARM_pc; ++ gdb_regs[_CPSR] = kernel_regs->ARM_cpsr; ++} ++ ++/* Copy local gdb registers back to kgdb regs, for later copy to kernel */ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs) ++{ ++ kernel_regs->ARM_r0 = gdb_regs[_R0]; ++ kernel_regs->ARM_r1 = gdb_regs[_R1]; ++ kernel_regs->ARM_r2 = gdb_regs[_R2]; ++ kernel_regs->ARM_r3 = gdb_regs[_R3]; ++ kernel_regs->ARM_r4 = gdb_regs[_R4]; ++ kernel_regs->ARM_r5 = gdb_regs[_R5]; ++ kernel_regs->ARM_r6 = gdb_regs[_R6]; ++ kernel_regs->ARM_r7 = gdb_regs[_R7]; ++ kernel_regs->ARM_r8 = gdb_regs[_R8]; ++ kernel_regs->ARM_r9 = gdb_regs[_R9]; ++ kernel_regs->ARM_r10 = gdb_regs[_R10]; ++ kernel_regs->ARM_fp = gdb_regs[_FP]; ++ kernel_regs->ARM_ip = gdb_regs[_IP]; ++ kernel_regs->ARM_sp = gdb_regs[_SP]; ++ kernel_regs->ARM_lr = gdb_regs[_LR]; ++ kernel_regs->ARM_pc = gdb_regs[_PC]; ++ kernel_regs->ARM_cpsr = gdb_regs[GDB_MAX_REGS - 1]; ++} ++ ++static inline struct pt_regs *kgdb_get_user_regs(struct task_struct *task) ++{ ++ return (struct pt_regs *) ++ ((unsigned long)task->thread_info + THREAD_SIZE - ++ 8 - sizeof(struct pt_regs)); ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, ++ struct task_struct *task) ++{ ++ int regno; ++ struct pt_regs *thread_regs; ++ ++ /* Just making sure... 
*/ ++ if (task == NULL) ++ return; ++ ++ /* Initialize to zero */ ++ for (regno = 0; regno < GDB_MAX_REGS; regno++) ++ gdb_regs[regno] = 0; ++ ++ /* Otherwise, we have only some registers from switch_to() */ ++ thread_regs = kgdb_get_user_regs(task); ++ gdb_regs[_R0] = thread_regs->ARM_r0; /* Not really valid? */ ++ gdb_regs[_R1] = thread_regs->ARM_r1; /* " " */ ++ gdb_regs[_R2] = thread_regs->ARM_r2; /* " " */ ++ gdb_regs[_R3] = thread_regs->ARM_r3; /* " " */ ++ gdb_regs[_R4] = thread_regs->ARM_r4; ++ gdb_regs[_R5] = thread_regs->ARM_r5; ++ gdb_regs[_R6] = thread_regs->ARM_r6; ++ gdb_regs[_R7] = thread_regs->ARM_r7; ++ gdb_regs[_R8] = thread_regs->ARM_r8; ++ gdb_regs[_R9] = thread_regs->ARM_r9; ++ gdb_regs[_R10] = thread_regs->ARM_r10; ++ gdb_regs[_FP] = thread_regs->ARM_fp; ++ gdb_regs[_IP] = thread_regs->ARM_ip; ++ gdb_regs[_SP] = thread_regs->ARM_sp; ++ gdb_regs[_LR] = thread_regs->ARM_lr; ++ gdb_regs[_PC] = thread_regs->ARM_pc; ++ gdb_regs[_CPSR] = thread_regs->ARM_cpsr; ++} ++ ++static int compiled_break; ++ ++int kgdb_arch_handle_exception(int exception_vector, int signo, ++ int err_code, char *remcom_in_buffer, ++ char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ long addr; ++ char *ptr; ++ ++ switch (remcom_in_buffer[0]) { ++ case 'c': ++ kgdb_contthread = NULL; ++ ++ /* ++ * Try to read optional parameter, pc unchanged if no parm. ++ * If this was a compiled breakpoint, we need to move ++ * to the next instruction or we will just breakpoint ++ * over and over again. ++ */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) { ++ linux_regs->ARM_pc = addr; ++ } else if (compiled_break == 1) { ++ linux_regs->ARM_pc += 4; ++ } ++ ++ compiled_break = 0; ++ ++ return 0; ++ } ++ ++ return -1; ++} ++ ++static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr) ++{ ++ kgdb_handle_exception(1, SIGTRAP, 0, regs); ++ ++ return 0; ++} ++ ++static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr) ++{ ++ compiled_break = 1; ++ kgdb_handle_exception(1, SIGTRAP, 0, regs); ++ ++ return 0; ++} ++ ++static struct undef_hook kgdb_brkpt_hook = { ++ .instr_mask = 0xffffffff, ++ .instr_val = KGDB_BREAKINST, ++ .fn = kgdb_brk_fn ++}; ++ ++static struct undef_hook kgdb_compiled_brkpt_hook = { ++ .instr_mask = 0xffffffff, ++ .instr_val = KGDB_COMPILED_BREAK, ++ .fn = kgdb_compiled_brk_fn ++}; ++ ++/* ++ * Register our undef instruction hooks with ARM undef core. ++ * We regsiter a hook specifically looking for the KGB break inst ++ * and we handle the normal undef case within the do_undefinstr ++ * handler. 
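++ * (do_undefinstr fetches the faulting instruction and runs any
++ * matching hook from the undef_hook list, so both breakpoint
++ * encodings above reach their handlers before the normal
++ * undefined-instruction path is taken.)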
++ */ ++int kgdb_arch_init(void) ++{ ++ register_undef_hook(&kgdb_brkpt_hook); ++ register_undef_hook(&kgdb_compiled_brkpt_hook); ++ ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++#ifndef __ARMEB__ ++ .gdb_bpt_instr = {0xfe, 0xde, 0xff, 0xe7} ++#else ++ .gdb_bpt_instr = {0xe7, 0xff, 0xde, 0xfe} ++#endif ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/setup.c +--- linux-2.6.18-53.1.14/arch/arm/kernel/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/setup.c 2008-06-10 15:38:56.000000000 +0400 +@@ -829,6 +829,11 @@ void __init setup_arch(char **cmdline_p) + conswitchp = &dummy_con; + #endif + #endif ++ ++#if defined(CONFIG_KGDB) ++ extern void __init early_trap_init(void); ++ early_trap_init(); ++#endif + } + + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/traps.c +--- linux-2.6.18-53.1.14/arch/arm/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/kernel/traps.c 2008-06-10 15:38:56.000000000 +0400 +@@ -278,6 +278,7 @@ asmlinkage void do_undefinstr(struct pt_ + unsigned int instr; + struct undef_hook *hook; + siginfo_t info; ++ mm_segment_t fs; + void __user *pc; + + /* +@@ -287,12 +288,15 @@ asmlinkage void do_undefinstr(struct pt_ + */ + regs->ARM_pc -= correction; + ++ fs = get_fs(); ++ set_fs(KERNEL_DS); + pc = (void __user *)instruction_pointer(regs); + if (thumb_mode(regs)) { + get_user(instr, (u16 __user *)pc); + } else { + get_user(instr, (u32 __user *)pc); + } ++ set_fs(fs); + + spin_lock_irq(&undef_lock); + list_for_each_entry(hook, &undef_hook, node) { +@@ -684,6 +688,13 @@ EXPORT_SYMBOL(abort); + + void __init trap_init(void) + { ++#if defined(CONFIG_KGDB) ++ return; ++} ++ ++void __init early_trap_init(void) ++{ ++#endif + unsigned long vectors = CONFIG_VECTORS_BASE; + extern char __stubs_start[], __stubs_end[]; + extern char __vectors_start[], __vectors_end[]; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/core.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/core.c +--- linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/core.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/core.c 2008-06-10 15:38:56.000000000 +0400 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -184,6 +185,9 @@ static struct platform_device ixp2000_se + void __init ixp2000_uart_init(void) + { + platform_device_register(&ixp2000_serial_device); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &ixp2000_serial_port); ++#endif + } + + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/ixdp2x01.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c +--- linux-2.6.18-53.1.14/arch/arm/mach-ixp2000/ixdp2x01.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c 2008-06-10 15:38:56.000000000 +0400 +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -413,6 +414,11 @@ static void __init ixdp2x01_init_machine + platform_add_devices(ixdp2x01_devices, ARRAY_SIZE(ixdp2x01_devices)); + ixp2000_uart_init(); + ixdp2x01_uart_init(); ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &ixdp425_serial_ports[0]); ++ kgdb8250_add_port(1, &ixdp425_serial_ports[1]); ++#endif + } + + +diff -rupbBN -X ../client-cleanup/dontdiff 
linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/coyote-setup.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c +--- linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/coyote-setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c 2008-06-10 15:38:56.000000000 +0400 +@@ -96,6 +96,10 @@ static void __init coyote_init(void) + } + + platform_add_devices(coyote_devices, ARRAY_SIZE(coyote_devices)); ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &coyote_serial_port); ++#endif + } + + #ifdef CONFIG_ARCH_ADI_COYOTE +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/ixdp425-setup.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c +--- linux-2.6.18-53.1.14/arch/arm/mach-ixp4xx/ixdp425-setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c 2008-06-10 15:38:56.000000000 +0400 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + static struct flash_platform_data ixdp425_flash_data = { + .map_name = "cfi_probe", +@@ -76,7 +77,8 @@ static struct plat_serial8250_port ixdp4 + .mapbase = IXP4XX_UART1_BASE_PHYS, + .membase = (char *)IXP4XX_UART1_BASE_VIRT + REG_OFFSET, + .irq = IRQ_IXP4XX_UART1, +- .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST, ++ .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | ++ UPF_SHARE_IRQ, + .iotype = UPIO_MEM, + .regshift = 2, + .uartclk = IXP4XX_UART_XTAL, +@@ -85,7 +87,8 @@ static struct plat_serial8250_port ixdp4 + .mapbase = IXP4XX_UART2_BASE_PHYS, + .membase = (char *)IXP4XX_UART2_BASE_VIRT + REG_OFFSET, + .irq = IRQ_IXP4XX_UART2, +- .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST, ++ .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | ++ UPF_SHARE_IRQ, + .iotype = UPIO_MEM, + .regshift = 2, + .uartclk = IXP4XX_UART_XTAL, +@@ -116,6 +119,11 @@ static void __init ixdp425_init(void) + IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1; + + platform_add_devices(ixdp425_devices, ARRAY_SIZE(ixdp425_devices)); ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &ixdp425_serial_ports[0]); ++ kgdb8250_add_port(1, &ixdp425_serial_ports[1]); ++#endif + } + + #ifdef CONFIG_ARCH_IXDP425 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-omap1/serial.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-omap1/serial.c +--- linux-2.6.18-53.1.14/arch/arm/mach-omap1/serial.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-omap1/serial.c 2008-06-10 15:38:56.000000000 +0400 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -199,6 +200,9 @@ void __init omap_serial_init(void) + break; + } + omap_serial_reset(&serial_platform_data[i]); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_platform_port(i, &serial_platform_data[i]); ++#endif + } + } + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-pxa/Makefile linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/Makefile +--- linux-2.6.18-53.1.14/arch/arm/mach-pxa/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/Makefile 2008-06-10 15:38:56.000000000 +0400 +@@ -31,6 +31,7 @@ obj-$(CONFIG_LEDS) += $(led-y) + # Misc features + obj-$(CONFIG_PM) += pm.o sleep.o + obj-$(CONFIG_PXA_SSP) += ssp.o ++obj-$(CONFIG_KGDB_PXA_SERIAL) += kgdb-serial.o + + ifeq ($(CONFIG_PXA27x),y) + obj-$(CONFIG_PM) += standby.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-pxa/kgdb-serial.c 
linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/kgdb-serial.c +--- linux-2.6.18-53.1.14/arch/arm/mach-pxa/kgdb-serial.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-pxa/kgdb-serial.c 2008-06-10 15:38:56.000000000 +0400 +@@ -0,0 +1,98 @@ ++/* ++ * linux/arch/arm/mach-pxa/kgdb-serial.c ++ * ++ * Provides low level kgdb serial support hooks for PXA2xx boards ++ * ++ * Author: Nicolas Pitre ++ * Copyright: (C) 2002-2005 MontaVista Software Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if defined(CONFIG_KGDB_PXA_FFUART) ++ ++#define UART FFUART ++#define CKEN_UART CKEN6_FFUART ++#define GPIO_RX_MD GPIO34_FFRXD_MD ++#define GPIO_TX_MD GPIO39_FFTXD_MD ++ ++#elif defined(CONFIG_KGDB_PXA_BTUART) ++ ++#define UART BTUART ++#define CKEN_UART CKEN7_BTUART ++#define GPIO_RX_MD GPIO42_BTRXD_MD ++#define GPIO_TX_MD GPIO43_BTTXD_MD ++ ++#elif defined(CONFIG_KGDB_PXA_STUART) ++ ++#define UART STUART ++#define CKEN_UART CKEN5_STUART ++#define GPIO_RX_MD GPIO46_STRXD_MD ++#define GPIO_TX_MD GPIO47_STTXD_MD ++ ++#endif ++ ++#define UART_BAUDRATE (CONFIG_KGDB_BAUDRATE) ++ ++static volatile unsigned long *port = (unsigned long *)&UART; ++ ++static int kgdb_serial_init(void) ++{ ++ pxa_set_cken(CKEN_UART, 1); ++ pxa_gpio_mode(GPIO_RX_MD); ++ pxa_gpio_mode(GPIO_TX_MD); ++ ++ port[UART_IER] = 0; ++ port[UART_LCR] = LCR_DLAB; ++ port[UART_DLL] = ((921600 / UART_BAUDRATE) & 0xff); ++ port[UART_DLM] = ((921600 / UART_BAUDRATE) >> 8); ++ port[UART_LCR] = LCR_WLS1 | LCR_WLS0; ++ port[UART_MCR] = 0; ++ port[UART_IER] = IER_UUE; ++ port[UART_FCR] = FCR_ITL_16; ++ ++ return 0; ++} ++ ++static void kgdb_serial_putchar(int c) ++{ ++ if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE) ++ kgdb_serial_init(); ++ while (!(port[UART_LSR] & LSR_TDRQ)) ++ cpu_relax(); ++ port[UART_TX] = c; ++} ++ ++static void kgdb_serial_flush(void) ++{ ++ if ((CKEN & CKEN_UART) && (port[UART_IER] & IER_UUE)) ++ while (!(port[UART_LSR] & LSR_TEMT)) ++ cpu_relax(); ++} ++ ++static int kgdb_serial_getchar(void) ++{ ++ unsigned char c; ++ if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE) ++ kgdb_serial_init(); ++ while (!(port[UART_LSR] & UART_LSR_DR)) ++ cpu_relax(); ++ c = port[UART_RX]; ++ return c; ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .init = kgdb_serial_init, ++ .write_char = kgdb_serial_putchar, ++ .flush = kgdb_serial_flush, ++ .read_char = kgdb_serial_getchar, ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mach-versatile/kgdb_serial.c linux-2.6.18-53.1.14.kgdb/arch/arm/mach-versatile/kgdb_serial.c +--- linux-2.6.18-53.1.14/arch/arm/mach-versatile/kgdb_serial.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mach-versatile/kgdb_serial.c 2008-06-10 15:38:56.000000000 +0400 +@@ -0,0 +1,121 @@ ++/* ++ * arch/arm/mach-versatile/kgdb_serial.c ++ * ++ * Author: Manish Lachwani, mlachwani@mvista.com ++ * ++ * 2005 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ * ++ * Support for KGDB on ARM Versatile. 
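++ * The on-board AMBA UART at IO_ADDRESS(0x101F1000) is programmed
++ * directly through the register accessors defined below.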
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define ARM_BAUD_38400 23 ++/* ++ * Functions that will be used later ++ */ ++#define UART_GET_INT_STATUS(p) readb((p) + UART010_IIR) ++#define UART_GET_MIS(p) readw((p) + UART011_MIS) ++#define UART_PUT_ICR(p, c) writel((c), (p) + UART010_ICR) ++#define UART_GET_FR(p) readb((p) + UART01x_FR) ++#define UART_GET_CHAR(p) readb((p) + UART01x_DR) ++#define UART_PUT_CHAR(p, c) writel((c), (p) + UART01x_DR) ++#define UART_GET_RSR(p) readb((p) + UART01x_RSR) ++#define UART_GET_CR(p) readb((p) + UART010_CR) ++#define UART_PUT_CR(p,c) writel((c), (p) + UART010_CR) ++#define UART_GET_LCRL(p) readb((p) + UART010_LCRL) ++#define UART_PUT_LCRL(p,c) writel((c), (p) + UART010_LCRL) ++#define UART_GET_LCRM(p) readb((p) + UART010_LCRM) ++#define UART_PUT_LCRM(p,c) writel((c), (p) + UART010_LCRM) ++#define UART_GET_LCRH(p) readb((p) + UART010_LCRH) ++#define UART_PUT_LCRH(p,c) writel((c), (p) + UART010_LCRH) ++#define UART_RX_DATA(s) (((s) & UART01x_FR_RXFE) == 0) ++#define UART_TX_READY(s) (((s) & UART01x_FR_TXFF) == 0) ++#define UART_TX_EMPTY(p) ((UART_GET_FR(p) & UART01x_FR_TMSK) == 0) ++ ++/* ++ * KGDB IRQ ++ */ ++static int kgdb_irq = 12; ++static volatile unsigned char *port = NULL; ++ ++static int kgdb_serial_init(void) ++{ ++ int rate = ARM_BAUD_38400; ++ ++ port = IO_ADDRESS(0x101F1000); ++ UART_PUT_CR(port, 0); ++ ++ /* Set baud rate */ ++ UART_PUT_LCRM(port, ((rate & 0xf00) >> 8)); ++ UART_PUT_LCRL(port, (rate & 0xff)); ++ UART_PUT_LCRH(port, UART01x_LCRH_WLEN_8 | UART01x_LCRH_FEN); ++ UART_PUT_CR(port, UART01x_CR_UARTEN); ++ ++ return 0; ++} ++ ++static void kgdb_serial_putchar(int ch) ++{ ++ unsigned int status; ++ ++ do { ++ status = UART_GET_FR(port); ++ } while (!UART_TX_READY(status)); ++ ++ UART_PUT_CHAR(port, ch); ++} ++ ++static int kgdb_serial_getchar(void) ++{ ++ unsigned int status; ++ int ch; ++ ++ do { ++ status = UART_GET_FR(port); ++ } while (!UART_RX_DATA(status)); ++ ch = UART_GET_CHAR(port); ++ return ch; ++} ++ ++static struct uart_port kgdb_amba_port = { ++ .irq = 12, ++ .iobase = 0, ++ .iotype = UPIO_MEM, ++ .membase = (unsigned char *)IO_ADDRESS(0x101F1000), ++}; ++ ++static irqreturn_t kgdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ int status = UART_GET_MIS(port); ++ ++ if (irq != kgdb_irq) ++ return IRQ_NONE; ++ ++ if (status & 0x40) ++ breakpoint(); ++ ++ return IRQ_HANDLED; ++} ++ ++static void __init kgdb_hookup_irq(void) ++{ ++ request_irq(kgdb_irq, kgdb_interrupt, SA_SHIRQ, "GDB-stub", ++ &kgdb_amba_port); ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .init = kgdb_serial_init, ++ .write_char = kgdb_serial_putchar, ++ .read_char = kgdb_serial_getchar, ++ .late_init = kgdb_hookup_irq, ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/arm/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/arm/mm/extable.c +--- linux-2.6.18-53.1.14/arch/arm/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/arm/mm/extable.c 2008-06-10 15:38:56.000000000 +0400 +@@ -2,6 +2,7 @@ + * linux/arch/arm/mm/extable.c + */ + #include ++#include + #include + + int fixup_exception(struct pt_regs *regs) +@@ -11,6 +12,12 @@ int fixup_exception(struct pt_regs *regs + fixup = search_exception_tables(instruction_pointer(regs)); + if (fixup) + regs->ARM_pc = fixup->fixup; ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. 
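The jump buffer was filled by kgdb_fault_setjmp() just before the risky memory access, so this longjmp unwinds straight back to that point.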
*/ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. */ ++#endif + + return fixup != NULL; + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/i386/kernel/Makefile 2008-03-06 05:54:14.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/Makefile 2008-06-10 15:38:03.000000000 +0400 +@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86) += vm86.o + obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + obj-$(CONFIG_HPET_TIMER) += hpet.o + obj-$(CONFIG_K8_NB) += k8.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + + EXTRA_AFLAGS := -traditional + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/entry.S +--- linux-2.6.18-53.1.14/arch/i386/kernel/entry.S 2008-03-06 05:55:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/entry.S 2008-06-10 15:39:01.000000000 +0400 +@@ -201,7 +201,7 @@ VM_MASK = 0x00020000 + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP + +-ENTRY(ret_from_fork) ++KPROBE_ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 +@@ -659,7 +659,7 @@ ENTRY(simd_coprocessor_error) + jmp error_code + CFI_ENDPROC + +-ENTRY(device_not_available) ++KPROBE_ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 +@@ -916,7 +916,7 @@ ENTRY(machine_check) + CFI_ENDPROC + #endif + +-ENTRY(spurious_interrupt_bug) ++KPROBE_ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 +@@ -942,3 +942,108 @@ ENDPROC(kernel_thread_helper) + #include "syscall_table.S" + + syscall_table_size=(.-sys_call_table) ++ ++# Here we do call frames. We cheat a bit as we only really need ++# correct frames at locations we can actually look at from a ++# debugger. Since the break instruction trap actually goes thru ++# some of this code, we don't really need info on those areas, but ++# only after the fact. I.e. if we can not step or break in a ++# location or end up with a return address pointing at the ++# location, we don't need a correct call frame for it. ++ ++#ifdef CONFIG_KGDB ++ ++#include ++/* ++ * The register numbers as known by gdb ++ */ ++ ++#define _EAX 0 ++#define _ECX 1 ++#define _EDX 2 ++#define _EBX 3 ++#define _ESP 4 ++#define _EBP 5 ++#define _ESI 6 ++#define _EDI 7 ++#define _PC 8 ++#define _EIP 8 ++#define _PS 9 ++#define _EFLAGS 9 ++#define _CS 10 ++#define _SS 11 ++#define _DS 12 ++#define _ES 13 ++#define _FS 14 ++#define _GS 15 ++ /* ++ * This code uses macros defined in linux/dwarf2-lang.h ++ * They attempt to follow the dwarf2 naming conventions... sort of.. 
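++ * (CFI_preamble() opens a CIE, the CFA_* macros emit the individual
++ * call-frame instructions, and the FDE_* macros attach address ranges
++ * to it, so the unwind records below are written out by hand.)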
++ */ ++ENTRY(end_of_stack_stop_unwind_function) ++ .long end_of_stack_stop_unwind_function+1 ++ ++ .text ++ ++ CFI_preamble(c1,_PC,1,1) ++ CFA_define_reference(_ESP,OLDESP) /* Stack pointer */ ++ CFA_expression(_EIP) ++ CFA_exp_OP_dup /* copy old esp */ ++ CFA_exp_OP_consts(CS-OLDESP) /* offset to CS address */ ++ CFA_exp_OP_plus /* should be CS address */ ++ CFA_exp_OP_deref /* get the CS */ ++ CFA_exp_OP_const4s(VM_MASK|3) /* prepare to mask it */ ++ CFA_exp_OP_and /* mask it, zero means kernel */ ++ CFA_exp_OP_bra(eip_user_rtn) /* branch if user */ ++ CFA_exp_OP_const4s(EIP-OLDESP) /* offset to return address */ ++ CFA_exp_OP_plus /* add that in */ ++ CFA_exp_OP_skip(eip_end) /* done if kernel, skip out */ ++eip_user_rtn: ++ CFA_exp_OP_addr(end_of_stack_stop_unwind_function)/*dummy function */ ++eip_end: ++ CFA_expression_end ++ CFA_define_offset(_EBX,EBX-OLDESP) ++ CFA_define_offset(_ECX,ECX-OLDESP) ++ CFA_define_offset(_EDX,EDX-OLDESP) ++ CFA_define_offset(_ESI,ESI-OLDESP) ++ CFA_define_offset(_EDI,EDI-OLDESP) ++ CFA_define_offset(_EBP,EBP-OLDESP) ++ CFA_define_offset(_EAX,EAX-OLDESP) ++ CFA_define_offset(_EFLAGS,EFLAGS-OLDESP) ++ CFI_postamble() ++ ++/* ++ * This provides an uwind for our dummy end of unwind function. ++ * Current convention is to provied an undefined return address. ++ */ ++ CFI_preamble(c2,_PC,1,1) ++ CFA_define_reference(_ESP,0) /* Stack pointer */ ++ CFA_undefine_reg(_EIP) ++ CFI_postamble() ++ ++ FDE_preamble(c2,end_of_stack_stop_unwind_function, \ ++ end_of_stack_stop_unwind_function+5) ++ FDE_postamble() ++ /* ++ * This is VERY sloppy. At this point all we want to do is get ++ * the frame right for back tracing. It will not be good if ++ * you try to single step. We use already defined labels. ++ * We want to cover all call outs. ++ * We could also recode this as just one FDE, but this works and ++ * I want to get it out. ++ */ ++ FDE_preamble(c1,ret_from_fork,ret_from_exception) ++ CFA_define_cfa_offset(4) /* one extra word on stack */ ++ FDE_postamble() ++ ++ FDE_preamble(c1,ret_from_exception,device_not_available_emulate) ++ FDE_postamble() ++ ++ FDE_preamble(c1,device_not_available_emulate,debug) ++ CFA_define_cfa_offset(4) /* one extra word on stack */ ++ FDE_postamble() ++ ++ FDE_preamble(c1, debug,spurious_interrupt_bug) ++ FDE_postamble() ++ ++#endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/head.S linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/head.S +--- linux-2.6.18-53.1.14/arch/i386/kernel/head.S 2008-03-06 05:54:34.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/head.S 2008-06-10 15:39:01.000000000 +0400 +@@ -10,6 +10,7 @@ + .text + #include + #include ++#include + #include + #include + #include +@@ -336,6 +337,10 @@ is386: movl $2,%ecx # set MP + #endif /* CONFIG_SMP */ + jmp start_kernel + ++ /* This dwarf code tells gdb that this is the end of the unwind */ ++ /* This uses the CFA set up for pc=1 located in entry.S */ ++ CFI_END_FRAME(is386) ++ + /* + * We depend on ET to be correct. This checks for 287/387. 
+ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb-jmp.S +--- linux-2.6.18-53.1.14/arch/i386/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb-jmp.S 2008-06-10 15:38:03.000000000 +0400 +@@ -0,0 +1,74 @@ ++/* ++ * arch/i386/kernel/kgdb-jmp.S ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Author: George Anzinger ++ * ++ * Cribbed from glibc, which carries the following: ++ * Copyright (C) 1996, 1996, 1997, 2000, 2001 Free Software Foundation, Inc. ++ * Copyright (C) 2005 by MontaVista Software. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. ++ */ ++ ++#include ++ ++#define PCOFF 0 ++#define LINKAGE 4 /* just the return address */ ++#define PTR_SIZE 4 ++#define PARMS LINKAGE /* no space for saved regs */ ++#define JMPBUF PARMS ++#define VAL JMPBUF+PTR_SIZE ++ ++#define JB_BX 0 ++#define JB_SI 1 ++#define JB_DI 2 ++#define JB_BP 3 ++#define JB_SP 4 ++#define JB_PC 5 ++ ++/* This must be called prior to kgdb_fault_longjmp and ++ * kgdb_fault_longjmp must not be called outside of the context of the ++ * last call to kgdb_fault_setjmp. ++ * kgdb_fault_setjmp(int *jmp_buf[6]) ++ */ ++ENTRY(kgdb_fault_setjmp) ++ movl JMPBUF(%esp), %eax ++ ++ /* Save registers. */ ++ movl %ebx, (JB_BX*4)(%eax) ++ movl %esi, (JB_SI*4)(%eax) ++ movl %edi, (JB_DI*4)(%eax) ++ /* Save SP as it will be after we return. */ ++ leal JMPBUF(%esp), %ecx ++ movl %ecx, (JB_SP*4)(%eax) ++ movl PCOFF(%esp), %ecx /* Save PC we are returning to now. */ ++ movl %ecx, (JB_PC*4)(%eax) ++ movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer. */ ++ ++ /* Restore state so we can now try the access. */ ++ movl JMPBUF(%esp), %ecx /* User's jmp_buf in %ecx. */ ++ /* Save the return address now. */ ++ movl (JB_PC*4)(%ecx), %edx ++ /* Restore registers. */ ++ movl $0, %eax ++ movl (JB_SP*4)(%ecx), %esp ++ jmp *%edx /* Jump to saved PC. */ ++ ++/* kgdb_fault_longjmp(int *jmp_buf[6]) */ ++ENTRY(kgdb_fault_longjmp) ++ movl JMPBUF(%esp), %ecx /* User's jmp_buf in %ecx. */ ++ /* Save the return address now. */ ++ movl (JB_PC*4)(%ecx), %edx ++ /* Restore registers. */ ++ movl (JB_BX*4)(%ecx), %ebx ++ movl (JB_SI*4)(%ecx), %esi ++ movl (JB_DI*4)(%ecx), %edi ++ movl (JB_BP*4)(%ecx), %ebp ++ movl $1, %eax ++ movl (JB_SP*4)(%ecx), %esp ++ jmp *%edx /* Jump to saved PC. */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/i386/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/kgdb.c 2008-06-10 15:39:27.000000000 +0400 +@@ -0,0 +1,363 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. 
++ * ++ */ ++ ++/* ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ */ ++/* ++ * Contributor: Lake Stevens Instrument Division$ ++ * Written by: Glenn Engel $ ++ * Updated by: Amit Kale ++ * Updated by: Tom Rini ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Origianl kgdb, compatibility with 2.1.xx kernel by ++ * David Grothe ++ * Additional support from Tigran Aivazian ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++ ++#include "mach_ipi.h" ++ ++/* Put the error code here just in case the user cares. */ ++int gdb_i386errcode; ++/* Likewise, the vector number here (since GDB only gets the signal ++ number through the usual means, and that's not very specific). */ ++int gdb_i386vector = -1; ++ ++extern atomic_t cpu_doing_single_step; ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ gdb_regs[_EAX] = regs->eax; ++ gdb_regs[_EBX] = regs->ebx; ++ gdb_regs[_ECX] = regs->ecx; ++ gdb_regs[_EDX] = regs->edx; ++ gdb_regs[_ESI] = regs->esi; ++ gdb_regs[_EDI] = regs->edi; ++ gdb_regs[_EBP] = regs->ebp; ++ gdb_regs[_DS] = regs->xds; ++ gdb_regs[_ES] = regs->xes; ++ gdb_regs[_PS] = regs->eflags; ++ gdb_regs[_CS] = regs->xcs; ++ gdb_regs[_PC] = regs->eip; ++ gdb_regs[_ESP] = (int)(®s->esp); ++ gdb_regs[_SS] = __KERNEL_DS; ++ gdb_regs[_FS] = 0xFFFF; ++ gdb_regs[_GS] = 0xFFFF; ++} ++ ++/* ++ * Extracts ebp, esp and eip values understandable by gdb from the values ++ * saved by switch_to. ++ * thread.esp points to ebp. flags and ebp are pushed in switch_to hence esp ++ * prior to entering switch_to is 8 greater then the value that is saved. ++ * If switch_to changes, change following code appropriately. 
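++ * For illustration, with a made-up address: if thread.esp holds
++ * 0xc2345f00, then *(unsigned long *)0xc2345f00 is the saved ebp, and
++ * the esp in use just before switch_to pushed flags and ebp was
++ * 0xc2345f00 + 8.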
++ */ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ gdb_regs[_EAX] = 0; ++ gdb_regs[_EBX] = 0; ++ gdb_regs[_ECX] = 0; ++ gdb_regs[_EDX] = 0; ++ gdb_regs[_ESI] = 0; ++ gdb_regs[_EDI] = 0; ++ gdb_regs[_EBP] = *(unsigned long *)p->thread.esp; ++ gdb_regs[_DS] = __KERNEL_DS; ++ gdb_regs[_ES] = __KERNEL_DS; ++ gdb_regs[_PS] = 0; ++ gdb_regs[_CS] = __KERNEL_CS; ++ gdb_regs[_PC] = p->thread.eip; ++ gdb_regs[_ESP] = p->thread.esp; ++ gdb_regs[_SS] = __KERNEL_DS; ++ gdb_regs[_FS] = 0xFFFF; ++ gdb_regs[_GS] = 0xFFFF; ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ regs->eax = gdb_regs[_EAX]; ++ regs->ebx = gdb_regs[_EBX]; ++ regs->ecx = gdb_regs[_ECX]; ++ regs->edx = gdb_regs[_EDX]; ++ regs->esi = gdb_regs[_ESI]; ++ regs->edi = gdb_regs[_EDI]; ++ regs->ebp = gdb_regs[_EBP]; ++ regs->xds = gdb_regs[_DS]; ++ regs->xes = gdb_regs[_ES]; ++ regs->eflags = gdb_regs[_PS]; ++ regs->xcs = gdb_regs[_CS]; ++ regs->eip = gdb_regs[_PC]; ++} ++ ++static struct hw_breakpoint { ++ unsigned enabled; ++ unsigned type; ++ unsigned len; ++ unsigned addr; ++} breakinfo[4] = { ++ { .enabled = 0 }, ++ { .enabled = 0 }, ++ { .enabled = 0 }, ++ { .enabled = 0 }, ++}; ++ ++void kgdb_correct_hw_break(void) ++{ ++ int breakno; ++ int correctit; ++ int breakbit; ++ unsigned dr7; ++ ++ asm volatile ("movl %%db7, %0\n":"=r" (dr7) ++ :); ++ do { ++ unsigned addr0, addr1, addr2, addr3; ++ asm volatile ("movl %%db0, %0\n" ++ "movl %%db1, %1\n" ++ "movl %%db2, %2\n" ++ "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1), ++ "=r"(addr2), "=r"(addr3):); ++ } while (0); ++ correctit = 0; ++ for (breakno = 0; breakno < 3; breakno++) { ++ breakbit = 2 << (breakno << 1); ++ if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { ++ correctit = 1; ++ dr7 |= breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ dr7 |= (((breakinfo[breakno].len << 2) | ++ breakinfo[breakno].type) << 16) << ++ (breakno << 2); ++ switch (breakno) { ++ case 0: ++ asm volatile ("movl %0, %%dr0\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 1: ++ asm volatile ("movl %0, %%dr1\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 2: ++ asm volatile ("movl %0, %%dr2\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 3: ++ asm volatile ("movl %0, %%dr3\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ } ++ } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { ++ correctit = 1; ++ dr7 &= ~breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ } ++ } ++ if (correctit) ++ asm volatile ("movl %0, %%db7\n"::"r" (dr7)); ++} ++ ++int kgdb_remove_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (breakinfo[i].addr == addr && breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 0; ++ return 0; ++} ++ ++void kgdb_remove_all_hw_break(void) ++{ ++ int i; ++ ++ for (i = 0; i < 4; i++) { ++ if (breakinfo[i].enabled) { ++ /* Do what? 
*/ ++ ; ++ } ++ memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); ++ } ++} ++ ++int kgdb_set_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (!breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 1; ++ breakinfo[idx].type = 1; ++ breakinfo[idx].len = 1; ++ breakinfo[idx].addr = addr; ++ return 0; ++} ++ ++void kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++ /* Disable hardware debugging while we are in kgdb */ ++ asm volatile ("movl %0,%%db7": /* no output */ :"r" (0)); ++} ++ ++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code) ++{ ++ /* Master processor is completely in the debugger */ ++ gdb_i386vector = e_vector; ++ gdb_i386errcode = err_code; ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++ ++int kgdb_arch_handle_exception(int e_vector, int signo, ++ int err_code, char *remcom_in_buffer, ++ char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ long addr; ++ char *ptr; ++ int newPC, dr6; ++ ++ switch (remcom_in_buffer[0]) { ++ case 'c': ++ case 's': ++ /* try to read optional parameter, pc unchanged if no parm */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) ++ linux_regs->eip = addr; ++ newPC = linux_regs->eip; ++ ++ /* clear the trace bit */ ++ linux_regs->eflags &= ~TF_MASK; ++ atomic_set(&cpu_doing_single_step, -1); ++ ++ /* set the trace bit if we're stepping */ ++ if (remcom_in_buffer[0] == 's') { ++ linux_regs->eflags |= TF_MASK; ++ debugger_step = 1; ++ atomic_set(&cpu_doing_single_step,smp_processor_id()); ++ } ++ ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6)); ++ if (!(dr6 & 0x4000)) { ++ long breakno; ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno) && ++ breakinfo[breakno].type == 0) { ++ /* Set restore flag */ ++ linux_regs->eflags |= X86_EFLAGS_RF; ++ break; ++ } ++ } ++ } ++ kgdb_correct_hw_break(); ++ asm volatile ("movl %0, %%db6\n"::"r" (0)); ++ ++ return (0); ++ } /* switch */ ++ /* this means that we do not want to exit from the handler */ ++ return -1; ++} ++ ++/* Register KGDB with the i386die_chain so that we hook into all of the right ++ * spots. */ ++static int kgdb_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = ptr; ++ struct pt_regs *regs = args->regs; ++ ++ /* Bad memory access? */ ++ if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active) ++ && kgdb_may_fault) { ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ return NOTIFY_STOP; ++ } else if (cmd == DIE_PAGE_FAULT) ++ /* A normal page fault, ignore. */ ++ return NOTIFY_DONE; ++ else if ((cmd == DIE_NMI || cmd == DIE_NMI_IPI || ++ cmd == DIE_NMIWATCHDOG) && atomic_read(&debugger_active)) { ++ /* CPU roundup */ ++ kgdb_nmihook(smp_processor_id(), regs); ++ return NOTIFY_STOP; ++ } else if (cmd == DIE_NMI_IPI || cmd == DIE_NMI || user_mode(regs) || ++ (cmd == DIE_DEBUG && atomic_read(&debugger_active))) ++ /* Normal watchdog event or userspace debugging, or spurious ++ * debug exception, ignore. 
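Returning NOTIFY_DONE hands the event back to the rest of the die notifier chain instead of consuming it here.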
*/ ++ return NOTIFY_DONE; ++ ++ kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); ++ ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_notify, ++}; ++ ++int kgdb_arch_init(void) ++{ ++ atomic_notifier_chain_register(&i386die_chain, &kgdb_notifier); ++ return 0; ++} ++ ++/* ++ * Skip an int3 exception when it occurs after a breakpoint has been ++ * removed. Backtrack eip by 1 since the int3 would have caused it to ++ * increment by 1. ++ */ ++ ++int kgdb_skipexception(int exception, struct pt_regs *regs) ++{ ++ if (exception == 3 && kgdb_isremovedbreak(regs->eip - 1)) { ++ regs->eip -= 1; ++ return 1; ++ } ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0xcc}, ++ .flags = KGDB_HW_BREAKPOINT, ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/setup.c +--- linux-2.6.18-53.1.14/arch/i386/kernel/setup.c 2008-03-06 05:54:58.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/setup.c 2008-06-10 15:38:03.000000000 +0400 +@@ -148,6 +148,7 @@ EXPORT_SYMBOL(ist_info); + struct e820map e820; + + extern void early_cpu_init(void); ++extern void early_trap_init(void); + extern void generic_apic_probe(char *); + extern int root_mountflags; + +@@ -1470,6 +1471,7 @@ void __init setup_arch(char **cmdline_p) + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + pre_setup_arch_hook(); + early_cpu_init(); ++ early_trap_init(); + + /* + * FIXME: This isn't an official loader_type right +@@ -1526,6 +1528,7 @@ void __init setup_arch(char **cmdline_p) + data_resource.end = virt_to_phys(_edata)-1; + + parse_cmdline_early(cmdline_p); ++ parse_early_param(); + + #ifdef CONFIG_EARLY_PRINTK + { +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/smpboot.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/smpboot.c +--- linux-2.6.18-53.1.14/arch/i386/kernel/smpboot.c 2008-03-06 05:54:34.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/smpboot.c 2008-06-10 15:39:01.000000000 +0400 +@@ -592,6 +592,9 @@ void __devinit initialize_secondary(void + + asm volatile( + "movl %0,%%esp\n\t" ++#ifdef CONFIG_KGDB ++ "pushl end_of_stack_stop_unwind_function\n\t" ++#endif + "jmp *%1" + : + :"r" (current->thread.esp),"r" (current->thread.eip)); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/traps.c +--- linux-2.6.18-53.1.14/arch/i386/kernel/traps.c 2008-03-06 05:55:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/kernel/traps.c 2008-06-10 15:38:03.000000000 +0400 +@@ -964,6 +964,7 @@ fastcall void __kprobes do_debug(struct + */ + clear_dr7: + set_debugreg(0, 7); ++ notify_die(DIE_DEBUG, "debug2", regs, condition, error_code, SIGTRAP); + return; + + debug_vm86: +@@ -1268,6 +1269,12 @@ static void __init set_task_gate(unsigne + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + } + ++/* Some traps need to be set early. 
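KGDB must be able to take debug traps, breakpoints and page faults before trap_init() runs (kgdbwait can stop the boot very early), so these three gates are installed ahead of time.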
*/ ++void __init early_trap_init(void) { ++ set_intr_gate(1,&debug); ++ set_system_intr_gate(3, &int3); /* int3 can be called from all */ ++ set_intr_gate(14,&page_fault); ++} + + void __init trap_init(void) + { +@@ -1284,10 +1291,8 @@ void __init trap_init(void) + #endif + + set_trap_gate(0,÷_error); +- set_intr_gate(1,&debug); + set_intr_gate(2,&nmi); +- set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ +- set_system_gate(4,&overflow); ++ set_system_gate(4,&overflow); /* int4/5 can be called from all */ + set_trap_gate(5,&bounds); + set_trap_gate(6,&invalid_op); + set_trap_gate(7,&device_not_available); +@@ -1297,7 +1302,6 @@ void __init trap_init(void) + set_trap_gate(11,&segment_not_present); + set_trap_gate(12,&stack_segment); + set_trap_gate(13,&general_protection); +- set_intr_gate(14,&page_fault); + set_trap_gate(15,&spurious_interrupt_bug); + set_trap_gate(16,&coprocessor_error); + set_trap_gate(17,&alignment_check); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/i386/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/i386/mm/fault.c +--- linux-2.6.18-53.1.14/arch/i386/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/i386/mm/fault.c 2008-06-10 15:38:03.000000000 +0400 +@@ -539,6 +539,10 @@ no_context: + if (is_prefetch(regs, address, error_code)) + return; + ++ if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs, ++ error_code, 14, SIGSEGV) == NOTIFY_STOP) ++ return; ++ + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/ia64/kernel/Makefile 2008-03-06 05:54:11.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/Makefile 2008-06-10 15:38:32.000000000 +0400 +@@ -32,6 +32,7 @@ obj-$(CONFIG_KEXEC) += machine_kexec.o + obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o + obj-$(CONFIG_AUDIT) += audit.o + mca_recovery-y += mca_drv.o mca_drv_asm.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + + # The gate DSO image is built using a special linker script. 
+ targets += gate.so gate-syms.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/entry.S +--- linux-2.6.18-53.1.14/arch/ia64/kernel/entry.S 2008-03-06 05:54:43.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/entry.S 2008-06-10 15:39:39.000000000 +0400 +@@ -959,9 +959,9 @@ GLOBAL_ENTRY(__ia64_leave_kernel) + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + mov r16=ar.bsp // get existing backing store pointer +- addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 ++(pUStk) addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 + ;; +- ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 ++(pUStk) ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 + (pKStk) br.cond.dpnt skip_rbs_switch + + /* +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/ivt.S linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/ivt.S +--- linux-2.6.18-53.1.14/arch/ia64/kernel/ivt.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/ivt.S 2008-06-10 15:39:39.000000000 +0400 +@@ -52,6 +52,14 @@ + #include + #include + ++#ifdef CONFIG_KGDB ++#define KGDB_ENABLE_PSR_DB mov r31=psr;; movl r30=IA64_PSR_DB;; \ ++ or r31=r31,r30;; \ ++ mov psr.l=r31;; srlz.i;; ++#else ++#define KGDB_ENABLE_PSR_DB ++#endif ++ + #if 1 + # define PSR_DEFAULT_BITS psr.ac + #else +@@ -519,6 +527,7 @@ ENTRY(page_fault) + movl r14=ia64_leave_kernel + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + mov rp=r14 + ;; + adds out2=16,r12 // out2 = pointer to pt_regs +@@ -863,6 +872,7 @@ ENTRY(interrupt) + srlz.i // ensure everybody knows psr.ic is back on + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + ;; + MCA_RECOVER_RANGE(interrupt) + alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group +@@ -1110,6 +1120,7 @@ ENTRY(non_syscall) + movl r15=ia64_leave_kernel + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + mov rp=r15 + ;; + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr +@@ -1143,6 +1154,7 @@ ENTRY(dispatch_unaligned_handler) + adds r3=8,r2 // set up second base pointer + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + movl r14=ia64_leave_kernel + ;; + mov rp=r14 +@@ -1185,6 +1197,10 @@ ENTRY(dispatch_to_fault_handler) + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + SAVE_REST ++ cmp.eq p6,p0=29,out0 ++(p6) br.cond.spnt 1f;; // debug_vector ++ KGDB_ENABLE_PSR_DB ++1: + movl r14=ia64_leave_kernel + ;; + mov rp=r14 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb-jmp.S +--- linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb-jmp.S 2008-06-10 15:38:32.000000000 +0400 +@@ -0,0 +1,238 @@ ++/* setjmp() and longjmp() assembler support for kdb on ia64. ++ ++ This code was copied from glibc CVS as of 2001-06-27 and modified where ++ necessary to fit the kernel. ++ Keith Owens 2001-06-27 ++ */ ++ ++/* Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc. ++ Contributed by David Mosberger-Tang . ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Library General Public License as ++ published by the Free Software Foundation; either version 2 of the ++ License, or (at your option) any later version. 
++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ Library General Public License for more details. ++ ++ You should have received a copy of the GNU Library General Public ++ License along with the GNU C Library; see the file COPYING.LIB. If ++ not, write to the Free Software Foundation, Inc., ++ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++*/ ++ ++#include ++GLOBAL_ENTRY(kgdb_fault_setjmp) ++ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) ++ alloc loc1=ar.pfs,2,2,2,0 ++ mov r16=ar.unat ++ ;; ++ mov r17=ar.fpsr ++ mov r2=in0 ++ add r3=8,in0 ++ ;; ++.mem.offset 0,0; ++ st8.spill.nta [r2]=sp,16 // r12 (sp) ++.mem.offset 8,0; ++ st8.spill.nta [r3]=gp,16 // r1 (gp) ++ ;; ++ st8.nta [r2]=r16,16 // save caller's unat ++ st8.nta [r3]=r17,16 // save fpsr ++ add r8=0xa0,in0 ++ ;; ++.mem.offset 160,0; ++ st8.spill.nta [r2]=r4,16 // r4 ++.mem.offset 168,0; ++ st8.spill.nta [r3]=r5,16 // r5 ++ add r9=0xb0,in0 ++ ;; ++ stf.spill.nta [r8]=f2,32 ++ stf.spill.nta [r9]=f3,32 ++ mov loc0=rp ++ .body ++ ;; ++ stf.spill.nta [r8]=f4,32 ++ stf.spill.nta [r9]=f5,32 ++ mov r17=b1 ++ ;; ++ stf.spill.nta [r8]=f16,32 ++ stf.spill.nta [r9]=f17,32 ++ mov r18=b2 ++ ;; ++ stf.spill.nta [r8]=f18,32 ++ stf.spill.nta [r9]=f19,32 ++ mov r19=b3 ++ ;; ++ stf.spill.nta [r8]=f20,32 ++ stf.spill.nta [r9]=f21,32 ++ mov r20=b4 ++ ;; ++ stf.spill.nta [r8]=f22,32 ++ stf.spill.nta [r9]=f23,32 ++ mov r21=b5 ++ ;; ++ stf.spill.nta [r8]=f24,32 ++ stf.spill.nta [r9]=f25,32 ++ mov r22=ar.lc ++ ;; ++ stf.spill.nta [r8]=f26,32 ++ stf.spill.nta [r9]=f27,32 ++ mov r24=pr ++ ;; ++ stf.spill.nta [r8]=f28,32 ++ stf.spill.nta [r9]=f29,32 ++ ;; ++ stf.spill.nta [r8]=f30 ++ stf.spill.nta [r9]=f31 ++ ++.mem.offset 0,0; ++ st8.spill.nta [r2]=r6,16 // r6 ++.mem.offset 8,0; ++ st8.spill.nta [r3]=r7,16 // r7 ++ ;; ++ mov r23=ar.bsp ++ mov r25=ar.unat ++ st8.nta [r2]=loc0,16 // b0 ++ st8.nta [r3]=r17,16 // b1 ++ ;; ++ st8.nta [r2]=r18,16 // b2 ++ st8.nta [r3]=r19,16 // b3 ++ ;; ++ st8.nta [r2]=r20,16 // b4 ++ st8.nta [r3]=r21,16 // b5 ++ ;; ++ st8.nta [r2]=loc1,16 // ar.pfs ++ st8.nta [r3]=r22,16 // ar.lc ++ ;; ++ st8.nta [r2]=r24,16 // pr ++ st8.nta [r3]=r23,16 // ar.bsp ++ ;; ++ st8.nta [r2]=r25 // ar.unat ++ st8.nta [r3]=in0 // &__jmp_buf ++ mov r8=0 ++ mov rp=loc0 ++ mov ar.pfs=loc1 ++ br.ret.sptk.few rp ++END(kdba_setjmp) ++#define pPos p6 /* is rotate count positive? */ ++#define pNeg p7 /* is rotate count negative? 
*/ ++GLOBAL_ENTRY(kgdb_fault_longjmp) ++ alloc r8=ar.pfs,2,1,0,0 ++ mov r27=ar.rsc ++ add r2=0x98,in0 // r2 <- &jmpbuf.orig_jmp_buf_addr ++ ;; ++ ld8 r8=[r2],-16 // r8 <- orig_jmp_buf_addr ++ mov r10=ar.bsp ++ and r11=~0x3,r27 // clear ar.rsc.mode ++ ;; ++ flushrs // flush dirty regs to backing store (must be first in insn grp) ++ ld8 r23=[r2],8 // r23 <- jmpbuf.ar_bsp ++ sub r8=r8,in0 // r8 <- &orig_jmpbuf - &jmpbuf ++ ;; ++ ld8 r25=[r2] // r25 <- jmpbuf.ar_unat ++ extr.u r8=r8,3,6 // r8 <- (&orig_jmpbuf - &jmpbuf)/8 & 0x3f ++ ;; ++ cmp.lt pNeg,pPos=r8,r0 ++ mov r2=in0 ++ ;; ++(pPos) mov r16=r8 ++(pNeg) add r16=64,r8 ++(pPos) sub r17=64,r8 ++(pNeg) sub r17=r0,r8 ++ ;; ++ mov ar.rsc=r11 // put RSE in enforced lazy mode ++ shr.u r8=r25,r16 ++ add r3=8,in0 // r3 <- &jmpbuf.r1 ++ shl r9=r25,r17 ++ ;; ++ or r25=r8,r9 ++ ;; ++ mov r26=ar.rnat ++ mov ar.unat=r25 // setup ar.unat (NaT bits for r1, r4-r7, and r12) ++ ;; ++ ld8.fill.nta sp=[r2],16 // r12 (sp) ++ ld8.fill.nta gp=[r3],16 // r1 (gp) ++ dep r11=-1,r23,3,6 // r11 <- ia64_rse_rnat_addr(jmpbuf.ar_bsp) ++ ;; ++ ld8.nta r16=[r2],16 // caller's unat ++ ld8.nta r17=[r3],16 // fpsr ++ ;; ++ ld8.fill.nta r4=[r2],16 // r4 ++ ld8.fill.nta r5=[r3],16 // r5 (gp) ++ cmp.geu p8,p0=r10,r11 // p8 <- (ar.bsp >= jmpbuf.ar_bsp) ++ ;; ++ ld8.fill.nta r6=[r2],16 // r6 ++ ld8.fill.nta r7=[r3],16 // r7 ++ ;; ++ mov ar.unat=r16 // restore caller's unat ++ mov ar.fpsr=r17 // restore fpsr ++ ;; ++ ld8.nta r16=[r2],16 // b0 ++ ld8.nta r17=[r3],16 // b1 ++ ;; ++(p8) ld8 r26=[r11] // r26 <- *ia64_rse_rnat_addr(jmpbuf.ar_bsp) ++ mov ar.bspstore=r23 // restore ar.bspstore ++ ;; ++ ld8.nta r18=[r2],16 // b2 ++ ld8.nta r19=[r3],16 // b3 ++ ;; ++ ld8.nta r20=[r2],16 // b4 ++ ld8.nta r21=[r3],16 // b5 ++ ;; ++ ld8.nta r11=[r2],16 // ar.pfs ++ ld8.nta r22=[r3],56 // ar.lc ++ ;; ++ ld8.nta r24=[r2],32 // pr ++ mov b0=r16 ++ ;; ++ ldf.fill.nta f2=[r2],32 ++ ldf.fill.nta f3=[r3],32 ++ mov b1=r17 ++ ;; ++ ldf.fill.nta f4=[r2],32 ++ ldf.fill.nta f5=[r3],32 ++ mov b2=r18 ++ ;; ++ ldf.fill.nta f16=[r2],32 ++ ldf.fill.nta f17=[r3],32 ++ mov b3=r19 ++ ;; ++ ldf.fill.nta f18=[r2],32 ++ ldf.fill.nta f19=[r3],32 ++ mov b4=r20 ++ ;; ++ ldf.fill.nta f20=[r2],32 ++ ldf.fill.nta f21=[r3],32 ++ mov b5=r21 ++ ;; ++ ldf.fill.nta f22=[r2],32 ++ ldf.fill.nta f23=[r3],32 ++ mov ar.lc=r22 ++ ;; ++ ldf.fill.nta f24=[r2],32 ++ ldf.fill.nta f25=[r3],32 ++ cmp.eq p8,p9=0,in1 ++ ;; ++ ldf.fill.nta f26=[r2],32 ++ ldf.fill.nta f27=[r3],32 ++ mov ar.pfs=r11 ++ ;; ++ ldf.fill.nta f28=[r2],32 ++ ldf.fill.nta f29=[r3],32 ++ ;; ++ ldf.fill.nta f30=[r2] ++ ldf.fill.nta f31=[r3] ++(p8) mov r8=1 ++ ++ mov ar.rnat=r26 // restore ar.rnat ++ ;; ++ mov ar.rsc=r27 // restore ar.rsc ++(p9) mov r8=in1 ++ ++ invala // virt. -> phys. regnum mapping may change ++ mov pr=r24,-1 ++ br.ret.sptk.few rp ++END(kgdb_fault_longjmp) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/ia64/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/kgdb.c 2008-06-10 15:38:32.000000000 +0400 +@@ -0,0 +1,1131 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. 
++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ * (c) Copyright 2005 Hewlett-Packard Development Company, L.P. ++ * Bob Picco ++ */ ++/* ++ * Contributor: Lake Stevens Instrument Division$ ++ * Written by: Glenn Engel $ ++ * Updated by: Amit Kale ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Origianl kgdb, compatibility with 2.1.xx kernel by David Grothe ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define NUM_REGS 590 ++#define REGISTER_BYTES (NUM_REGS*8+128*8) ++#define REGISTER_BYTE(N) (((N) * 8) \ ++ + ((N) <= IA64_FR0_REGNUM ? \ ++ 0 : 8 * (((N) > IA64_FR127_REGNUM) ? 128 : (N) - IA64_FR0_REGNUM))) ++#define REGISTER_SIZE(N) \ ++ (((N) >= IA64_FR0_REGNUM && (N) <= IA64_FR127_REGNUM) ? 16 : 8) ++#define IA64_GR0_REGNUM 0 ++#define IA64_FR0_REGNUM 128 ++#define IA64_FR127_REGNUM (IA64_FR0_REGNUM+127) ++#define IA64_PR0_REGNUM 256 ++#define IA64_BR0_REGNUM 320 ++#define IA64_VFP_REGNUM 328 ++#define IA64_PR_REGNUM 330 ++#define IA64_IP_REGNUM 331 ++#define IA64_PSR_REGNUM 332 ++#define IA64_CFM_REGNUM 333 ++#define IA64_AR0_REGNUM 334 ++#define IA64_NAT0_REGNUM 462 ++#define IA64_NAT31_REGNUM (IA64_NAT0_REGNUM+31) ++#define IA64_NAT32_REGNUM (IA64_NAT0_REGNUM+32) ++#define IA64_RSC_REGNUM (IA64_AR0_REGNUM+16) ++#define IA64_BSP_REGNUM (IA64_AR0_REGNUM+17) ++#define IA64_BSPSTORE_REGNUM (IA64_AR0_REGNUM+18) ++#define IA64_RNAT_REGNUM (IA64_AR0_REGNUM+19) ++#define IA64_FCR_REGNUM (IA64_AR0_REGNUM+21) ++#define IA64_EFLAG_REGNUM (IA64_AR0_REGNUM+24) ++#define IA64_CSD_REGNUM (IA64_AR0_REGNUM+25) ++#define IA64_SSD_REGNUM (IA64_AR0_REGNUM+26) ++#define IA64_CFLG_REGNUM (IA64_AR0_REGNUM+27) ++#define IA64_FSR_REGNUM (IA64_AR0_REGNUM+28) ++#define IA64_FIR_REGNUM (IA64_AR0_REGNUM+29) ++#define IA64_FDR_REGNUM (IA64_AR0_REGNUM+30) ++#define IA64_CCV_REGNUM (IA64_AR0_REGNUM+32) ++#define IA64_UNAT_REGNUM (IA64_AR0_REGNUM+36) ++#define IA64_FPSR_REGNUM (IA64_AR0_REGNUM+40) ++#define IA64_ITC_REGNUM (IA64_AR0_REGNUM+44) ++#define IA64_PFS_REGNUM (IA64_AR0_REGNUM+64) ++#define IA64_LC_REGNUM (IA64_AR0_REGNUM+65) ++#define IA64_EC_REGNUM (IA64_AR0_REGNUM+66) ++ ++#define REGISTER_INDEX(N) (REGISTER_BYTE(N) / sizeof (unsigned long)) ++#define BREAK_INSTR_ALIGN (~0xfULL) ++ ++#define ptoff(V) ((unsigned int) &((struct pt_regs *)0x0)->V) ++struct reg_to_ptreg_index { ++ unsigned int reg; ++ unsigned int ptregoff; ++}; ++ ++static struct reg_to_ptreg_index gr_reg_to_ptreg_index[] = { ++ {IA64_GR0_REGNUM + 1, ptoff(r1)}, ++ {IA64_GR0_REGNUM + 2, ptoff(r2)}, ++ {IA64_GR0_REGNUM + 3, ptoff(r3)}, ++ {IA64_GR0_REGNUM + 8, ptoff(r8)}, ++ {IA64_GR0_REGNUM + 9, ptoff(r9)}, ++ {IA64_GR0_REGNUM + 10, ptoff(r10)}, ++ {IA64_GR0_REGNUM + 11, ptoff(r11)}, ++ {IA64_GR0_REGNUM + 12, ptoff(r12)}, ++ {IA64_GR0_REGNUM + 13, ptoff(r13)}, ++ {IA64_GR0_REGNUM + 14, ptoff(r14)}, ++ {IA64_GR0_REGNUM + 15, ptoff(r15)}, ++ {IA64_GR0_REGNUM + 16, ptoff(r16)}, ++ {IA64_GR0_REGNUM + 17, ptoff(r17)}, ++ {IA64_GR0_REGNUM + 18, ptoff(r18)}, ++ {IA64_GR0_REGNUM + 19, ptoff(r19)}, ++ {IA64_GR0_REGNUM + 20, ptoff(r20)}, ++ {IA64_GR0_REGNUM + 21, ptoff(r21)}, ++ {IA64_GR0_REGNUM + 22, 
ptoff(r22)}, ++ {IA64_GR0_REGNUM + 23, ptoff(r23)}, ++ {IA64_GR0_REGNUM + 24, ptoff(r24)}, ++ {IA64_GR0_REGNUM + 25, ptoff(r25)}, ++ {IA64_GR0_REGNUM + 26, ptoff(r26)}, ++ {IA64_GR0_REGNUM + 27, ptoff(r27)}, ++ {IA64_GR0_REGNUM + 28, ptoff(r28)}, ++ {IA64_GR0_REGNUM + 29, ptoff(r29)}, ++ {IA64_GR0_REGNUM + 30, ptoff(r30)}, ++ {IA64_GR0_REGNUM + 31, ptoff(r31)}, ++}; ++ ++static struct reg_to_ptreg_index br_reg_to_ptreg_index[] = { ++ {IA64_BR0_REGNUM, ptoff(b0)}, ++ {IA64_BR0_REGNUM + 6, ptoff(b6)}, ++ {IA64_BR0_REGNUM + 7, ptoff(b7)}, ++}; ++ ++static struct reg_to_ptreg_index ar_reg_to_ptreg_index[] = { ++ {IA64_PFS_REGNUM, ptoff(ar_pfs)}, ++ {IA64_UNAT_REGNUM, ptoff(ar_unat)}, ++ {IA64_RNAT_REGNUM, ptoff(ar_rnat)}, ++ {IA64_BSPSTORE_REGNUM, ptoff(ar_bspstore)}, ++ {IA64_RSC_REGNUM, ptoff(ar_rsc)}, ++ {IA64_CSD_REGNUM, ptoff(ar_csd)}, ++ {IA64_SSD_REGNUM, ptoff(ar_ssd)}, ++ {IA64_FPSR_REGNUM, ptoff(ar_fpsr)}, ++ {IA64_CCV_REGNUM, ptoff(ar_ccv)}, ++}; ++ ++extern atomic_t cpu_doing_single_step; ++ ++static int kgdb_gr_reg(int regnum, struct unw_frame_info *info, ++ unsigned long *reg, int rw) ++{ ++ char nat; ++ ++ if ((regnum >= IA64_GR0_REGNUM && regnum <= (IA64_GR0_REGNUM + 1)) || ++ (regnum >= (IA64_GR0_REGNUM + 4) && ++ regnum <= (IA64_GR0_REGNUM + 7))) ++ return !unw_access_gr(info, regnum - IA64_GR0_REGNUM, ++ reg, &nat, rw); ++ else ++ return 0; ++} ++static int kgdb_gr_ptreg(int regnum, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, int rw) ++{ ++ int i, result = 1; ++ char nat; ++ ++ if (!((regnum >= (IA64_GR0_REGNUM + 2) && ++ regnum <= (IA64_GR0_REGNUM + 3)) || ++ (regnum >= (IA64_GR0_REGNUM + 8) && ++ regnum <= (IA64_GR0_REGNUM + 15)) || ++ (regnum >= (IA64_GR0_REGNUM + 16) && ++ regnum <= (IA64_GR0_REGNUM + 31)))) ++ return 0; ++ else if (rw && ptregs) { ++ for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++) ++ if (gr_reg_to_ptreg_index[i].reg == regnum) { ++ *((unsigned long *)(((void *)ptregs) + ++ gr_reg_to_ptreg_index[i].ptregoff)) = *reg; ++ break; ++ } ++ } else if (!rw && ptregs) { ++ for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++) ++ if (gr_reg_to_ptreg_index[i].reg == regnum) { ++ *reg = *((unsigned long *) ++ (((void *)ptregs) + ++ gr_reg_to_ptreg_index[i].ptregoff)); ++ break; ++ } ++ } else ++ result = !unw_access_gr(info, regnum - IA64_GR0_REGNUM, ++ reg, &nat, rw); ++ return result; ++} ++ ++static int kgdb_br_reg(int regnum, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, int rw) ++{ ++ int i, result = 1; ++ ++ if (!(regnum >= IA64_BR0_REGNUM && regnum <= (IA64_BR0_REGNUM + 7))) ++ return 0; ++ ++ switch (regnum) { ++ case IA64_BR0_REGNUM: ++ case IA64_BR0_REGNUM + 6: ++ case IA64_BR0_REGNUM + 7: ++ if (rw) { ++ for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++) ++ if (br_reg_to_ptreg_index[i].reg == regnum) { ++ *((unsigned long *) ++ (((void *)ptregs) + ++ br_reg_to_ptreg_index[i].ptregoff)) = ++ *reg; ++ break; ++ } ++ } else ++ for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++) ++ if (br_reg_to_ptreg_index[i].reg == regnum) { ++ *reg = *((unsigned long *) ++ (((void *)ptregs) + ++ br_reg_to_ptreg_index[i]. 
++ ptregoff)); ++ break; ++ } ++ break; ++ case IA64_BR0_REGNUM + 1: ++ case IA64_BR0_REGNUM + 2: ++ case IA64_BR0_REGNUM + 3: ++ case IA64_BR0_REGNUM + 4: ++ case IA64_BR0_REGNUM + 5: ++ result = !unw_access_br(info, regnum - IA64_BR0_REGNUM, ++ reg, rw); ++ break; ++ } ++ ++ return result; ++} ++ ++static int kgdb_fr_reg(int regnum, char *inbuffer, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, ++ struct ia64_fpreg *freg, int rw) ++{ ++ int result = 1; ++ ++ if (!(regnum >= IA64_FR0_REGNUM && regnum <= (IA64_FR0_REGNUM + 127))) ++ return 0; ++ ++ switch (regnum) { ++ case IA64_FR0_REGNUM + 6: ++ case IA64_FR0_REGNUM + 7: ++ case IA64_FR0_REGNUM + 8: ++ case IA64_FR0_REGNUM + 9: ++ case IA64_FR0_REGNUM + 10: ++ case IA64_FR0_REGNUM + 11: ++ case IA64_FR0_REGNUM + 12: ++ if (rw) { ++ char *ptr = inbuffer; ++ ++ freg->u.bits[0] = *reg; ++ kgdb_hex2long(&ptr, &freg->u.bits[1]); ++ *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6))) = ++ *freg; ++ break; ++ } else if (!ptregs) ++ result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM, ++ freg, rw); ++ else ++ *freg = ++ *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6))); ++ break; ++ default: ++ if (!rw) ++ result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM, ++ freg, rw); ++ else ++ result = 0; ++ break; ++ } ++ ++ return result; ++} ++ ++static int kgdb_ar_reg(int regnum, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, int rw) ++{ ++ int result = 0, i; ++ ++ if (!(regnum >= IA64_AR0_REGNUM && regnum <= IA64_EC_REGNUM)) ++ return 0; ++ ++ if (rw && ptregs) { ++ for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++) ++ if (ar_reg_to_ptreg_index[i].reg == regnum) { ++ *((unsigned long *) (((void *)ptregs) + ++ ar_reg_to_ptreg_index[i].ptregoff)) = ++ *reg; ++ result = 1; ++ break; ++ } ++ } else if (ptregs) { ++ for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++) ++ if (ar_reg_to_ptreg_index[i].reg == regnum) { ++ *reg = *((unsigned long *) (((void *)ptregs) + ++ ar_reg_to_ptreg_index[i].ptregoff)); ++ result = 1; ++ break; ++ } ++ } ++ ++ if (result) ++ return result; ++ ++ result = 1; ++ ++ switch (regnum) { ++ case IA64_CSD_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_CSD, reg, rw); ++ break; ++ case IA64_SSD_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_SSD, reg, rw); ++ break; ++ case IA64_UNAT_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_RNAT_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_BSPSTORE_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_PFS_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_LC_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_LC, reg, rw); ++ break; ++ case IA64_EC_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_EC, reg, rw); ++ break; ++ case IA64_FPSR_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_FPSR, reg, rw); ++ break; ++ case IA64_RSC_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RSC, reg, rw); ++ break; ++ case IA64_CCV_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_CCV, reg, rw); ++ break; ++ default: ++ result = 0; ++ } ++ ++ return result; ++} ++ ++void kgdb_get_reg(char *outbuffer, int regnum, struct unw_frame_info *info, ++ struct pt_regs *ptregs) ++{ ++ unsigned long reg, size = 0, *mem = ® ++ struct ia64_fpreg freg; ++ ++ if (kgdb_gr_reg(regnum, info, ®, 0) || ++ kgdb_gr_ptreg(regnum, ptregs, info, ®, 0) || ++ kgdb_br_reg(regnum, ptregs, info, ®, 0) || ++ kgdb_ar_reg(regnum, 
ptregs, info, ®, 0)) ++ size = sizeof(reg); ++ else if (kgdb_fr_reg(regnum, NULL, ptregs, info, ®, &freg, 0)) { ++ size = sizeof(freg); ++ mem = (unsigned long *)&freg; ++ } else if (regnum == IA64_IP_REGNUM) { ++ if (!ptregs) { ++ unw_get_ip(info, ®); ++ size = sizeof(reg); ++ } else { ++ reg = ptregs->cr_iip; ++ size = sizeof(reg); ++ } ++ } else if (regnum == IA64_CFM_REGNUM) { ++ if (!ptregs) ++ unw_get_cfm(info, ®); ++ else ++ reg = ptregs->cr_ifs; ++ size = sizeof(reg); ++ } else if (regnum == IA64_PSR_REGNUM) { ++ if (!ptregs && kgdb_usethread) ++ ptregs = (struct pt_regs *) ++ ((unsigned long)kgdb_usethread + ++ IA64_STK_OFFSET) - 1; ++ if (ptregs) ++ reg = ptregs->cr_ipsr; ++ size = sizeof(reg); ++ } else if (regnum == IA64_PR_REGNUM) { ++ if (ptregs) ++ reg = ptregs->pr; ++ else ++ unw_access_pr(info, ®, 0); ++ size = sizeof(reg); ++ } else if (regnum == IA64_BSP_REGNUM) { ++ unw_get_bsp(info, ®); ++ size = sizeof(reg); ++ } ++ ++ if (size) { ++ kgdb_mem2hex((char *) mem, outbuffer, size); ++ outbuffer[size*2] = 0; ++ } ++ else ++ strcpy(outbuffer, "E0"); ++ ++ return; ++} ++ ++void kgdb_put_reg(char *inbuffer, char *outbuffer, int regnum, ++ struct unw_frame_info *info, struct pt_regs *ptregs) ++{ ++ unsigned long reg; ++ struct ia64_fpreg freg; ++ char *ptr = inbuffer; ++ ++ kgdb_hex2long(&ptr, ®); ++ strcpy(outbuffer, "OK"); ++ ++ if (kgdb_gr_reg(regnum, info, ®, 1) || ++ kgdb_gr_ptreg(regnum, ptregs, info, ®, 1) || ++ kgdb_br_reg(regnum, ptregs, info, ®, 1) || ++ kgdb_fr_reg(regnum, inbuffer, ptregs, info, ®, &freg, 1) || ++ kgdb_ar_reg(regnum, ptregs, info, ®, 1)) ; ++ else if (regnum == IA64_IP_REGNUM) ++ ptregs->cr_iip = reg; ++ else if (regnum == IA64_CFM_REGNUM) ++ ptregs->cr_ifs = reg; ++ else if (regnum == IA64_PSR_REGNUM) ++ ptregs->cr_ipsr = reg; ++ else if (regnum == IA64_PR_REGNUM) ++ ptregs->pr = reg; ++ else ++ strcpy(outbuffer, "E01"); ++ return; ++} ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ ++} ++ ++#define MAX_HW_BREAKPOINT (20) ++long hw_break_total_dbr, hw_break_total_ibr; ++#define HW_BREAKPOINT (hw_break_total_dbr + hw_break_total_ibr) ++#define WATCH_INSTRUCTION 0x0 ++#define WATCH_WRITE 0x1 ++#define WATCH_READ 0x2 ++#define WATCH_ACCESS 0x3 ++ ++#define HWCAP_DBR ((1 << WATCH_WRITE) | (1 << WATCH_READ)) ++#define HWCAP_IBR (1 << WATCH_INSTRUCTION) ++struct hw_breakpoint { ++ unsigned enabled; ++ unsigned long capable; ++ unsigned long type; ++ unsigned long mask; ++ unsigned long addr; ++} *breakinfo; ++ ++static struct hw_breakpoint hwbreaks[MAX_HW_BREAKPOINT]; ++ ++enum instruction_type { A, I, M, F, B, L, X, u }; ++ ++static enum instruction_type bundle_encoding[32][3] = { ++ {M, I, I}, /* 00 */ ++ {M, I, I}, /* 01 */ ++ {M, I, I}, /* 02 */ ++ {M, I, I}, /* 03 */ ++ {M, L, X}, /* 04 */ ++ {M, L, X}, /* 05 */ ++ {u, u, u}, /* 06 */ ++ {u, u, u}, /* 07 */ ++ {M, M, I}, /* 08 */ ++ {M, M, I}, /* 09 */ ++ {M, M, I}, /* 0A */ ++ {M, M, I}, /* 0B */ ++ {M, F, I}, /* 0C */ ++ {M, F, I}, /* 0D */ ++ {M, M, F}, /* 0E */ ++ {M, M, F}, /* 0F */ ++ {M, I, B}, /* 10 */ ++ {M, I, B}, /* 11 */ ++ {M, B, B}, /* 12 */ ++ {M, B, B}, /* 13 */ ++ {u, u, u}, /* 14 */ ++ {u, u, u}, /* 15 */ ++ {B, B, B}, /* 16 */ ++ {B, B, B}, /* 17 */ ++ {M, M, B}, /* 18 */ ++ {M, M, B}, /* 19 */ ++ {u, u, u}, /* 1A */ ++ {u, u, u}, /* 1B */ ++ {M, F, B}, /* 1C */ ++ {M, F, 
B}, /* 1D */ ++ {u, u, u}, /* 1E */ ++ {u, u, u}, /* 1F */ ++}; ++ ++int kgdb_validate_break_address(unsigned long addr) ++{ ++ int error; ++ char tmp_variable[BREAK_INSTR_SIZE]; ++ error = kgdb_get_mem((char *)(addr & BREAK_INSTR_ALIGN), tmp_variable, ++ BREAK_INSTR_SIZE); ++ return error; ++} ++ ++int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) ++{ ++ extern unsigned long _start[]; ++ unsigned long slot = addr & BREAK_INSTR_ALIGN, bundle_addr; ++ unsigned long template; ++ struct bundle { ++ struct { ++ unsigned long long template:5; ++ unsigned long long slot0:41; ++ unsigned long long slot1_p0:64 - 46; ++ } quad0; ++ struct { ++ unsigned long long slot1_p1:41 - (64 - 46); ++ unsigned long long slot2:41; ++ } quad1; ++ } bundle; ++ int ret; ++ ++ bundle_addr = addr & ~0xFULL; ++ ++ if (bundle_addr == (unsigned long)_start) ++ return 0; ++ ++ ret = kgdb_get_mem((char *)bundle_addr, (char *)&bundle, ++ BREAK_INSTR_SIZE); ++ if (ret < 0) ++ return ret; ++ ++ if (slot > 2) ++ slot = 0; ++ ++ memcpy(saved_instr, &bundle, BREAK_INSTR_SIZE); ++ template = bundle.quad0.template; ++ ++ if (slot == 1 && bundle_encoding[template][1] == L) ++ slot = 2; ++ ++ switch (slot) { ++ case 0: ++ bundle.quad0.slot0 = BREAKNUM; ++ break; ++ case 1: ++ bundle.quad0.slot1_p0 = BREAKNUM; ++ bundle.quad1.slot1_p1 = (BREAKNUM >> (64 - 46)); ++ break; ++ case 2: ++ bundle.quad1.slot2 = BREAKNUM; ++ break; ++ } ++ ++ return kgdb_set_mem((char *)bundle_addr, (char *)&bundle, ++ BREAK_INSTR_SIZE); ++} ++ ++int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) ++{ ++ extern unsigned long _start[]; ++ ++ addr = addr & BREAK_INSTR_ALIGN; ++ if (addr == (unsigned long)_start) ++ return 0; ++ return kgdb_set_mem((char *)addr, (char *)bundle, BREAK_INSTR_SIZE); ++} ++ ++static int hw_breakpoint_init; ++ ++void do_init_hw_break(void) ++{ ++ s64 status; ++ int i; ++ ++ hw_breakpoint_init = 1; ++ ++#ifdef CONFIG_IA64_HP_SIM ++ hw_break_total_ibr = 8; ++ hw_break_total_dbr = 8; ++ status = 0; ++#else ++ status = ia64_pal_debug_info(&hw_break_total_ibr, &hw_break_total_dbr); ++#endif ++ ++ if (status) { ++ printk(KERN_INFO "do_init_hw_break: pal call failed %d\n", ++ (int)status); ++ return; ++ } ++ ++ if (HW_BREAKPOINT > MAX_HW_BREAKPOINT) { ++ printk(KERN_INFO "do_init_hw_break: %d exceeds max %d\n", ++ (int)HW_BREAKPOINT, (int)MAX_HW_BREAKPOINT); ++ ++ while ((HW_BREAKPOINT > MAX_HW_BREAKPOINT) ++ && hw_break_total_ibr != 1) ++ hw_break_total_ibr--; ++ while (HW_BREAKPOINT > MAX_HW_BREAKPOINT) ++ hw_break_total_dbr--; ++ } ++ ++ breakinfo = hwbreaks; ++ ++ memset(breakinfo, 0, HW_BREAKPOINT * sizeof(struct hw_breakpoint)); ++ ++ for (i = 0; i < hw_break_total_dbr; i++) ++ breakinfo[i].capable = HWCAP_DBR; ++ ++ for (; i < HW_BREAKPOINT; i++) ++ breakinfo[i].capable = HWCAP_IBR; ++ ++ return; ++} ++ ++void kgdb_correct_hw_break(void) ++{ ++ int breakno; ++ ++ if (!breakinfo) ++ return; ++ ++ for (breakno = 0; breakno < HW_BREAKPOINT; breakno++) { ++ if (breakinfo[breakno].enabled) { ++ if (breakinfo[breakno].capable & HWCAP_IBR) { ++ int ibreakno = breakno - hw_break_total_dbr; ++ ia64_set_ibr(ibreakno << 1, ++ breakinfo[breakno].addr); ++ ia64_set_ibr((ibreakno << 1) + 1, ++ (~breakinfo[breakno].mask & ++ ((1UL << 56UL) - 1)) | ++ (1UL << 56UL) | (1UL << 63UL)); ++ } else { ++ ia64_set_dbr(breakno << 1, ++ breakinfo[breakno].addr); ++ ia64_set_dbr((breakno << 1) + 1, ++ (~breakinfo[breakno]. 
++ mask & ((1UL << 56UL) - 1)) | ++ (1UL << 56UL) | ++ (breakinfo[breakno].type << 62UL)); ++ } ++ } else { ++ if (breakinfo[breakno].capable & HWCAP_IBR) ++ ia64_set_ibr(((breakno - ++ hw_break_total_dbr) << 1) + 1, ++ 0); ++ else ++ ia64_set_dbr((breakno << 1) + 1, 0); ++ } ++ } ++ ++ return; ++} ++ ++int hardware_breakpoint(unsigned long addr, int length, int type, int action) ++{ ++ int breakno, found, watch; ++ unsigned long mask; ++ extern unsigned long _start[]; ++ ++ if (!hw_breakpoint_init) ++ do_init_hw_break(); ++ ++ if (!breakinfo) ++ return 0; ++ else if (addr == (unsigned long)_start) ++ return 1; ++ ++ if (type == WATCH_ACCESS) ++ mask = HWCAP_DBR; ++ else ++ mask = 1UL << type; ++ ++ for (watch = 0, found = 0, breakno = 0; breakno < HW_BREAKPOINT; ++ breakno++) { ++ if (action) { ++ if (breakinfo[breakno].enabled ++ || !(breakinfo[breakno].capable & mask)) ++ continue; ++ breakinfo[breakno].enabled = 1; ++ breakinfo[breakno].type = type; ++ breakinfo[breakno].mask = length - 1; ++ breakinfo[breakno].addr = addr; ++ watch = breakno; ++ } else if (breakinfo[breakno].enabled && ++ ((length < 0 && breakinfo[breakno].addr == addr) || ++ ((breakinfo[breakno].capable & mask) && ++ (breakinfo[breakno].mask == (length - 1)) && ++ (breakinfo[breakno].addr == addr)))) { ++ breakinfo[breakno].enabled = 0; ++ breakinfo[breakno].type = 0UL; ++ } else ++ continue; ++ found++; ++ if (type != WATCH_ACCESS) ++ break; ++ else if (found == 2) ++ break; ++ else ++ mask = HWCAP_IBR; ++ } ++ ++ if (type == WATCH_ACCESS && found == 1) { ++ breakinfo[watch].enabled = 0; ++ found = 0; ++ } ++ ++ mb(); ++ return found; ++} ++ ++int kgdb_arch_set_hw_breakpoint(unsigned long addr, int len, ++ enum kgdb_bptype type) ++{ ++ return hardware_breakpoint(addr, len, type - '1', 1); ++} ++ ++int kgdb_arch_remove_hw_breakpoint(unsigned long addr, int len, ++ enum kgdb_bptype type) ++{ ++ return hardware_breakpoint(addr, len, type - '1', 0); ++} ++ ++int kgdb_remove_hw_break(unsigned long addr) ++{ ++ return hardware_breakpoint(addr, 8, WATCH_INSTRUCTION, 0); ++ ++} ++ ++void kgdb_remove_all_hw_break(void) ++{ ++ int i; ++ ++ for (i = 0; i < HW_BREAKPOINT; i++) ++ memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); ++} ++ ++int kgdb_set_hw_break(unsigned long addr) ++{ ++ return hardware_breakpoint(addr, 8, WATCH_INSTRUCTION, 1); ++} ++ ++void kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++ unsigned long hw_breakpoint_status; ++ ++ hw_breakpoint_status = ia64_getreg(_IA64_REG_PSR); ++ if (hw_breakpoint_status & IA64_PSR_DB) ++ ia64_setreg(_IA64_REG_PSR_L, ++ hw_breakpoint_status ^ IA64_PSR_DB); ++} ++ ++volatile static struct smp_unw { ++ struct unw_frame_info *unw; ++ struct task_struct *task; ++} smp_unw[NR_CPUS]; ++ ++static int inline kgdb_get_blocked_state(struct task_struct *p, ++ struct unw_frame_info *unw) ++{ ++ unsigned long ip; ++ int count = 0; ++ ++ unw_init_from_blocked_task(unw, p); ++ ip = 0UL; ++ do { ++ if (unw_unwind(unw) < 0) ++ return -1; ++ unw_get_ip(unw, &ip); ++ if (!in_sched_functions(ip)) ++ break; ++ } while (count++ < 16); ++ ++ if (!ip) ++ return -1; ++ else ++ return 0; ++} ++ ++static void inline kgdb_wait(struct pt_regs *regs) ++{ ++ unsigned long hw_breakpoint_status = ia64_getreg(_IA64_REG_PSR); ++ if (hw_breakpoint_status & IA64_PSR_DB) ++ ia64_setreg(_IA64_REG_PSR_L, ++ hw_breakpoint_status ^ IA64_PSR_DB); ++ kgdb_nmihook(smp_processor_id(), regs); ++ if (hw_breakpoint_status & IA64_PSR_DB) ++ ia64_setreg(_IA64_REG_PSR_L, hw_breakpoint_status); ++ ++ return; ++} ++ 
++static void inline normalize(struct unw_frame_info *running, ++ struct pt_regs *regs) ++{ ++ unsigned long sp; ++ ++ do { ++ unw_get_sp(running, &sp); ++ if ((sp + 0x10) >= (unsigned long)regs) ++ break; ++ } while (unw_unwind(running) >= 0); ++ ++ return; ++} ++ ++static void kgdb_init_running(struct unw_frame_info *unw, void *data) ++{ ++ struct pt_regs *regs; ++ ++ regs = data; ++ normalize(unw, regs); ++ smp_unw[smp_processor_id()].unw = unw; ++ kgdb_wait(regs); ++} ++ ++void kgdb_wait_ipi(struct pt_regs *regs) ++{ ++ struct unw_frame_info unw; ++ ++ smp_unw[smp_processor_id()].task = current; ++ ++ if (user_mode(regs)) { ++ smp_unw[smp_processor_id()].unw = (struct unw_frame_info *)1; ++ kgdb_wait(regs); ++ } else { ++ if (current->state == TASK_RUNNING) ++ unw_init_running(kgdb_init_running, regs); ++ else { ++ if (kgdb_get_blocked_state(current, &unw)) ++ smp_unw[smp_processor_id()].unw = ++ (struct unw_frame_info *)1; ++ else ++ smp_unw[smp_processor_id()].unw = &unw; ++ kgdb_wait(regs); ++ } ++ } ++ ++ smp_unw[smp_processor_id()].unw = NULL; ++ return; ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ if (num_online_cpus() > 1) ++ smp_send_nmi_allbutself(); ++} ++ ++static volatile int kgdb_hwbreak_sstep[NR_CPUS]; ++ ++static int kgdb_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = ptr; ++ struct pt_regs *regs = args->regs; ++ unsigned long err = args->err; ++ ++ switch (cmd) { ++ default: ++ return NOTIFY_DONE; ++ case DIE_PAGE_FAULT_NO_CONTEXT: ++ if (atomic_read(&debugger_active) && kgdb_may_fault) { ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ return NOTIFY_STOP; ++ } ++ break; ++ case DIE_BREAK: ++ if (user_mode(regs) || err == 0x80001) ++ return NOTIFY_DONE; ++ break; ++ case DIE_FAULT: ++ if (user_mode(regs)) ++ return NOTIFY_DONE; ++ else if (err == 36 && kgdb_hwbreak_sstep[smp_processor_id()]) { ++ kgdb_hwbreak_sstep[smp_processor_id()] = 0; ++ regs->cr_ipsr &= ~IA64_PSR_SS; ++ return NOTIFY_STOP; ++ } ++ case DIE_MCA_MONARCH_PROCESS: ++ case DIE_INIT_MONARCH_PROCESS: ++ break; ++ } ++ ++ kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_notify, ++}; ++ ++int kgdb_arch_init(void) ++{ ++ atomic_notifier_chain_register(&ia64die_chain, &kgdb_notifier); ++ return 0; ++} ++ ++static void do_kgdb_handle_exception(struct unw_frame_info *, void *data); ++ ++struct kgdb_state { ++ int e_vector; ++ int signo; ++ unsigned long err_code; ++ struct pt_regs *regs; ++ struct unw_frame_info *unw; ++ char *inbuf; ++ char *outbuf; ++ int unwind; ++ int ret; ++}; ++ ++static void inline kgdb_pc(struct pt_regs *regs, unsigned long pc) ++{ ++ regs->cr_iip = pc & ~0xf; ++ ia64_psr(regs)->ri = pc & 0x3; ++ return; ++} ++ ++int kgdb_arch_handle_exception(int e_vector, int signo, ++ int err_code, char *remcom_in_buffer, ++ char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ struct kgdb_state info; ++ ++ info.e_vector = e_vector; ++ info.signo = signo; ++ info.err_code = err_code; ++ info.unw = (void *)0; ++ info.inbuf = remcom_in_buffer; ++ info.outbuf = remcom_out_buffer; ++ info.unwind = 0; ++ info.ret = -1; ++ ++ if (remcom_in_buffer[0] == 'c' || remcom_in_buffer[0] == 's') { ++ info.regs = linux_regs; ++ do_kgdb_handle_exception(NULL, &info); ++ } else if (kgdb_usethread == current) { ++ info.regs = linux_regs; ++ info.unwind = 1; ++ unw_init_running(do_kgdb_handle_exception, &info); ++ } else if 
(kgdb_usethread->state != TASK_RUNNING) { ++ struct unw_frame_info unw_info; ++ ++ if (kgdb_get_blocked_state(kgdb_usethread, &unw_info)) { ++ info.ret = 1; ++ goto bad; ++ } ++ info.regs = NULL; ++ do_kgdb_handle_exception(&unw_info, &info); ++ } else { ++ int i; ++ ++ for (i = 0; i < NR_CPUS; i++) ++ if (smp_unw[i].task == kgdb_usethread && smp_unw[i].unw ++ && smp_unw[i].unw != (struct unw_frame_info *)1) { ++ info.regs = NULL; ++ do_kgdb_handle_exception(smp_unw[i].unw, &info); ++ break; ++ } else { ++ info.ret = 1; ++ goto bad; ++ } ++ } ++ ++ bad: ++ if (info.ret != -1 && remcom_in_buffer[0] == 'p') { ++ unsigned long bad = 0xbad4badbadbadbadUL; ++ ++ printk("kgdb_arch_handle_exception: p packet bad (%s)\n", ++ remcom_in_buffer); ++ kgdb_mem2hex((char *)&bad, remcom_out_buffer, sizeof(bad)); ++ remcom_out_buffer[sizeof(bad) * 2] = 0; ++ info.ret = -1; ++ } ++ return info.ret; ++} ++ ++/* ++ * This is done because I evidently made an incorrect 'p' encoding ++ * when my patch for gdb was committed. It was later corrected. This ++ * check supports both my wrong encoding of the register number and ++ * the correct encoding. Eventually this should be eliminated and ++ * kgdb_hex2long should be demarshalling the regnum. ++ */ ++static inline int check_packet(unsigned int regnum, char *packet) ++{ ++ static int check_done, swap; ++ unsigned long reglong; ++ ++ if (likely(check_done)) { ++ if (swap) { ++ kgdb_hex2long(&packet, ®long); ++ regnum = (int) reglong; ++ } ++ ++ } else { ++ if (regnum > NUM_REGS) { ++ kgdb_hex2long(&packet, ®long); ++ regnum = (int) reglong; ++ swap = 1; ++ } ++ check_done = 1; ++ } ++ return regnum; ++} ++ ++static void do_kgdb_handle_exception(struct unw_frame_info *unw_info, ++ void *data) ++{ ++ long addr; ++ char *ptr; ++ unsigned long newPC; ++ int e_vector, signo; ++ unsigned long err_code; ++ struct pt_regs *linux_regs; ++ struct kgdb_state *info; ++ char *remcom_in_buffer, *remcom_out_buffer; ++ ++ info = data; ++ info->unw = unw_info; ++ e_vector = info->e_vector; ++ signo = info->signo; ++ err_code = info->err_code; ++ remcom_in_buffer = info->inbuf; ++ remcom_out_buffer = info->outbuf; ++ linux_regs = info->regs; ++ ++ if (info->unwind) ++ normalize(unw_info, linux_regs); ++ ++ switch (remcom_in_buffer[0]) { ++ case 'p': ++ { ++ unsigned int regnum; ++ ++ kgdb_hex2mem(&remcom_in_buffer[1], (char *)®num, ++ sizeof(regnum)); ++ regnum = check_packet(regnum, &remcom_in_buffer[1]); ++ if (regnum >= NUM_REGS) { ++ remcom_out_buffer[0] = 'E'; ++ remcom_out_buffer[1] = 0; ++ } else ++ kgdb_get_reg(remcom_out_buffer, regnum, ++ unw_info, linux_regs); ++ break; ++ } ++ case 'P': ++ { ++ unsigned int regno; ++ long v; ++ char *ptr; ++ ++ ptr = &remcom_in_buffer[1]; ++ if ((!kgdb_usethread || kgdb_usethread == current) && ++ kgdb_hex2long(&ptr, &v) && ++ *ptr++ == '=' && (v >= 0)) { ++ regno = (unsigned int)v; ++ regno = (regno >= NUM_REGS ? 
0 : regno); ++ kgdb_put_reg(ptr, remcom_out_buffer, regno, ++ unw_info, linux_regs); ++ } else ++ strcpy(remcom_out_buffer, "E01"); ++ break; ++ } ++ case 'c': ++ case 's': ++ if (e_vector == TRAP_BRKPT && err_code == KGDBBREAKNUM) { ++ if (ia64_psr(linux_regs)->ri < 2) ++ kgdb_pc(linux_regs, linux_regs->cr_iip + ++ ia64_psr(linux_regs)->ri + 1); ++ else ++ kgdb_pc(linux_regs, linux_regs->cr_iip + 16); ++ } ++ ++ /* try to read optional parameter, pc unchanged if no parm */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) { ++ linux_regs->cr_iip = addr; ++ } ++ newPC = linux_regs->cr_iip; ++ ++ /* clear the trace bit */ ++ linux_regs->cr_ipsr &= ~IA64_PSR_SS; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ ++ /* set the trace bit if we're stepping or took a hardware break */ ++ if (remcom_in_buffer[0] == 's' || e_vector == TRAP_HWBKPT) { ++ linux_regs->cr_ipsr |= IA64_PSR_SS; ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ } ++ ++ kgdb_correct_hw_break(); ++ ++ /* if not hardware breakpoint, then reenable them */ ++ if (e_vector != TRAP_HWBKPT) ++ linux_regs->cr_ipsr |= IA64_PSR_DB; ++ else { ++ kgdb_hwbreak_sstep[smp_processor_id()] = 1; ++ linux_regs->cr_ipsr &= ~IA64_PSR_DB; ++ } ++ ++ info->ret = 0; ++ break; ++ default: ++ break; ++ } ++ ++ return; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++ .set_hw_breakpoint = kgdb_arch_set_hw_breakpoint, ++ .remove_hw_breakpoint = kgdb_arch_remove_hw_breakpoint, ++ .gdb_bpt_instr = {0xcc}, ++ .flags = KGDB_HW_BREAKPOINT, ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/process.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/process.c +--- linux-2.6.18-53.1.14/arch/ia64/kernel/process.c 2008-03-06 05:55:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/process.c 2008-06-10 15:39:39.000000000 +0400 +@@ -463,6 +463,9 @@ copy_thread (int nr, unsigned long clone + */ + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); ++#ifdef CONFIG_KGDB ++ child_ptregs->cr_ipsr |= IA64_PSR_DB; ++#endif + + /* + * NOTE: The calling convention considers all floating point +@@ -691,6 +694,9 @@ kernel_thread (int (*fn)(void *), void * + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; ++#ifdef CONFIG_KGDB ++ regs.pt.cr_ipsr |= IA64_PSR_DB; ++#endif + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/smp.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/smp.c +--- linux-2.6.18-53.1.14/arch/ia64/kernel/smp.c 2008-03-06 05:54:27.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/smp.c 2008-06-10 15:38:32.000000000 +0400 +@@ -48,6 +48,7 @@ + #include + #include + #include ++#include + + /* + * Structure and data for smp_call_function(). This is designed to minimise static memory +@@ -68,6 +69,9 @@ static volatile struct call_data_struct + #define IPI_CALL_FUNC 0 + #define IPI_CPU_STOP 1 + #define IPI_KDUMP_CPU_STOP 3 ++#ifdef CONFIG_KGDB ++#define IPI_KGDB_INTERRUPT 2 ++#endif + + /* This needs to be cacheline aligned because it is written to by *other* CPUs. 
*/ + static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; +@@ -185,6 +189,11 @@ handle_IPI (int irq, void *dev_id, struc + case IPI_CPU_STOP: + stop_this_cpu(); + break; ++#ifdef CONFIG_KGDB ++ case IPI_KGDB_INTERRUPT: ++ kgdb_wait_ipi(regs); ++ break; ++#endif + #ifdef CONFIG_CRASH_DUMP + case IPI_KDUMP_CPU_STOP: + unw_init_running(kdump_cpu_freeze, NULL); +@@ -359,6 +368,14 @@ smp_call_function_single (int cpuid, voi + } + EXPORT_SYMBOL(smp_call_function_single); + ++#ifdef CONFIG_KGDB ++void ++smp_send_nmi_allbutself(void) ++{ ++ send_IPI_allbutself(IPI_KGDB_INTERRUPT); ++} ++#endif ++ + /* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/traps.c +--- linux-2.6.18-53.1.14/arch/ia64/kernel/traps.c 2008-03-06 05:54:44.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/traps.c 2008-06-10 15:38:32.000000000 +0400 +@@ -200,8 +200,12 @@ __kprobes ia64_bad_break (unsigned long + break; + + default: +- if (break_num < 0x40000 || break_num > 0x100000) ++ if (break_num < 0x40000 || break_num > 0x100000) { ++ if (notify_die(DIE_BREAK, "bad break", regs, ++ break_num, TRAP_BRKPT, SIGTRAP) == NOTIFY_STOP) ++ return; + die_if_kernel("Bad break", regs, break_num); ++ } + + if (break_num < 0x80000) { + sig = SIGILL; code = __ILL_BREAK; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/kernel/unwind.c linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/unwind.c +--- linux-2.6.18-53.1.14/arch/ia64/kernel/unwind.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/kernel/unwind.c 2008-06-10 15:39:39.000000000 +0400 +@@ -72,10 +72,68 @@ + # define STAT(x...) + #endif + ++#ifdef CONFIG_KGDB ++#define KGDB_EARLY_SIZE 100 ++static struct unw_reg_state __initdata kgdb_reg_state[KGDB_EARLY_SIZE]; ++static struct unw_labeled_state __initdata kgdb_labeled_state[KGDB_EARLY_SIZE]; ++void __initdata *kgdb_reg_state_free, __initdata *kgdb_labeled_state_free; ++ ++static void __init ++kgdb_malloc_init(void) ++{ ++ int i; ++ ++ kgdb_reg_state_free = kgdb_reg_state; ++ for (i = 1; i < KGDB_EARLY_SIZE; i++) { ++ *((unsigned long *) &kgdb_reg_state[i]) = (unsigned long) kgdb_reg_state_free; ++ kgdb_reg_state_free = &kgdb_reg_state[i]; ++ } ++ ++ kgdb_labeled_state_free = kgdb_labeled_state; ++ for (i = 1; i < KGDB_EARLY_SIZE; i++) { ++ *((unsigned long *) &kgdb_labeled_state[i]) = ++ (unsigned long) kgdb_labeled_state_free; ++ kgdb_labeled_state_free = &kgdb_labeled_state[i]; ++ } ++ ++} ++ ++static void * __init ++kgdb_malloc(void **mem) ++{ ++ void *p; ++ ++ p = *mem; ++ *mem = *((void **) p); ++ return p; ++} ++ ++static void __init ++kgdb_free(void **mem, void *p) ++{ ++ *((void **)p) = *mem; ++ *mem = p; ++} ++ ++#define alloc_reg_state() (!malloc_sizes[0].cs_cachep ? \ ++ kgdb_malloc(&kgdb_reg_state_free) : \ ++ kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)) ++#define free_reg_state(usr) (!malloc_sizes[0].cs_cachep ? \ ++ kgdb_free(&kgdb_reg_state_free, usr) : \ ++ kfree(usr)) ++#define alloc_labeled_state() (!malloc_sizes[0].cs_cachep ? \ ++ kgdb_malloc(&kgdb_labeled_state_free) : \ ++ kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)) ++#define free_labeled_state(usr) (!malloc_sizes[0].cs_cachep ? 
\ ++ kgdb_free(&kgdb_labeled_state_free, usr) : \ ++ kfree(usr)) ++ ++#else + #define alloc_reg_state() kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC) + #define free_reg_state(usr) kfree(usr) + #define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC) + #define free_labeled_state(usr) kfree(usr) ++#endif + + typedef unsigned long unw_word; + typedef unsigned char unw_hash_index_t; +@@ -238,6 +296,24 @@ static struct { + #endif + }; + ++#ifdef CONFIG_KGDB ++/* ++ * This makes it safe to call breakpoint() very early ++ * in setup_arch providing: ++ * 1) breakpoint isn't called between lines in cpu_init ++ * where init_mm.mm_count is incremented and ia64_mmu_init ++ * is called. Otherwise the test below is invalid. ++ * 2) the memory examined doesn't result in tlbmiss. ++ */ ++static unsigned long inline kgdb_unimpl_va_mask(void) ++{ ++ if (atomic_read(&init_mm.mm_count) > 1) ++ return local_cpu_data->unimpl_va_mask; ++ else ++ return 0UL; ++} ++#endif ++ + static inline int + read_only (void *addr) + { +@@ -1786,7 +1862,11 @@ run_script (struct unw_script *script, s + + case UNW_INSN_LOAD: + #ifdef UNW_DEBUG ++#ifdef CONFIG_KGDB ++ if ((s[val] & (kgdb_unimpl_va_mask() | 0x7)) != 0 ++#else + if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0 ++#endif + || s[val] < TASK_SIZE) + { + UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n", +@@ -1821,7 +1901,11 @@ find_save_locs (struct unw_frame_info *i + struct unw_script *scr; + unsigned long flags = 0; + ++#ifdef CONFIG_KGDB ++ if ((info->ip & (kgdb_unimpl_va_mask() | 0xf)) || info->ip < TASK_SIZE) { ++#else + if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) { ++#endif + /* don't let obviously bad addresses pollute the cache */ + /* FIXME: should really be level 0 but it occurs too often. KAO */ + UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip); +@@ -2249,6 +2333,9 @@ unw_init (void) + + init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp, + __start_unwind, __end_unwind); ++#ifdef CONFIG_KGDB ++ kgdb_malloc_init(); ++#endif + } + + /* +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/extable.c +--- linux-2.6.18-53.1.14/arch/ia64/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/extable.c 2008-06-10 15:38:32.000000000 +0400 +@@ -6,6 +6,7 @@ + */ + + #include ++#include + + #include + #include +@@ -73,6 +74,11 @@ search_extable (const struct exception_t + else + last = mid - 1; + } ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. 
*/ ++#endif + return NULL; + } + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ia64/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/fault.c +--- linux-2.6.18-53.1.14/arch/ia64/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ia64/mm/fault.c 2008-06-10 15:38:32.000000000 +0400 +@@ -266,6 +266,10 @@ ia64_do_page_fault (unsigned long addres + */ + bust_spinlocks(1); + ++ if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs, ++ isr, 14, SIGSEGV) == NOTIFY_STOP) ++ return; ++ + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address); + else +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/mips/Kconfig.debug +--- linux-2.6.18-53.1.14/arch/mips/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/Kconfig.debug 2008-06-10 15:38:24.000000000 +0400 +@@ -37,25 +37,6 @@ config DEBUG_STACK_USAGE + + This option will slow down process creation somewhat. + +-config KGDB +- bool "Remote GDB kernel debugging" +- depends on DEBUG_KERNEL +- select DEBUG_INFO +- help +- If you say Y here, it will be possible to remotely debug the MIPS +- kernel using gdb. This enlarges your kernel image disk size by +- several megabytes and requires a machine with more than 16 MB, +- better 32 MB RAM to avoid excessive linking time. This is only +- useful for kernel hackers. If unsure, say N. +- +-config GDB_CONSOLE +- bool "Console output to GDB" +- depends on KGDB +- help +- If you are using GDB for remote debugging over a serial port and +- would like kernel messages to be formatted into GDB $O packets so +- that GDB prints them as program output, say 'Y'. +- + config SB1XXX_CORELIS + bool "Corelis Debugger" + depends on SIBYTE_SB1xxx_SOC +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/mips/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/Makefile 2008-06-10 15:38:24.000000000 +0400 +@@ -59,7 +59,8 @@ obj-$(CONFIG_MIPS32_COMPAT) += linux32.o + obj-$(CONFIG_MIPS32_N32) += binfmt_elfn32.o scall64-n32.o signal_n32.o + obj-$(CONFIG_MIPS32_O32) += binfmt_elfo32.o scall64-o32.o ptrace32.o + +-obj-$(CONFIG_KGDB) += gdb-low.o gdb-stub.o ++obj-$(CONFIG_KGDB) += kgdb_handler.o kgdb.o kgdb-jmp.o \ ++ kgdb-setjmp.o + obj-$(CONFIG_PROC_FS) += proc.o + + obj-$(CONFIG_64BIT) += cpu-bugs64.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/gdb-low.S linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-low.S +--- linux-2.6.18-53.1.14/arch/mips/kernel/gdb-low.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-low.S 1970-01-01 03:00:00.000000000 +0300 +@@ -1,394 +0,0 @@ +-/* +- * gdb-low.S contains the low-level trap handler for the GDB stub. +- * +- * Copyright (C) 1995 Andreas Busse +- */ +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_32BIT +-#define DMFC0 mfc0 +-#define DMTC0 mtc0 +-#define LDC1 lwc1 +-#define SDC1 lwc1 +-#endif +-#ifdef CONFIG_64BIT +-#define DMFC0 dmfc0 +-#define DMTC0 dmtc0 +-#define LDC1 ldc1 +-#define SDC1 ldc1 +-#endif +- +-/* +- * [jsun] We reserves about 2x GDB_FR_SIZE in stack. The lower (addressed) +- * part is used to store registers and passed to exception handler. 
+- * The upper part is reserved for "call func" feature where gdb client +- * saves some of the regs, setups call frame and passes args. +- * +- * A trace shows about 200 bytes are used to store about half of all regs. +- * The rest should be big enough for frame setup and passing args. +- */ +- +-/* +- * The low level trap handler +- */ +- .align 5 +- NESTED(trap_low, GDB_FR_SIZE, sp) +- .set noat +- .set noreorder +- +- mfc0 k0, CP0_STATUS +- sll k0, 3 /* extract cu0 bit */ +- bltz k0, 1f +- move k1, sp +- +- /* +- * Called from user mode, go somewhere else. +- */ +- mfc0 k0, CP0_CAUSE +- andi k0, k0, 0x7c +-#ifdef CONFIG_64BIT +- dsll k0, k0, 1 +-#endif +- PTR_L k1, saved_vectors(k0) +- jr k1 +- nop +-1: +- move k0, sp +- PTR_SUBU sp, k1, GDB_FR_SIZE*2 # see comment above +- LONG_S k0, GDB_FR_REG29(sp) +- LONG_S $2, GDB_FR_REG2(sp) +- +-/* +- * First save the CP0 and special registers +- */ +- +- mfc0 v0, CP0_STATUS +- LONG_S v0, GDB_FR_STATUS(sp) +- mfc0 v0, CP0_CAUSE +- LONG_S v0, GDB_FR_CAUSE(sp) +- DMFC0 v0, CP0_EPC +- LONG_S v0, GDB_FR_EPC(sp) +- DMFC0 v0, CP0_BADVADDR +- LONG_S v0, GDB_FR_BADVADDR(sp) +- mfhi v0 +- LONG_S v0, GDB_FR_HI(sp) +- mflo v0 +- LONG_S v0, GDB_FR_LO(sp) +- +-/* +- * Now the integer registers +- */ +- +- LONG_S zero, GDB_FR_REG0(sp) /* I know... */ +- LONG_S $1, GDB_FR_REG1(sp) +- /* v0 already saved */ +- LONG_S $3, GDB_FR_REG3(sp) +- LONG_S $4, GDB_FR_REG4(sp) +- LONG_S $5, GDB_FR_REG5(sp) +- LONG_S $6, GDB_FR_REG6(sp) +- LONG_S $7, GDB_FR_REG7(sp) +- LONG_S $8, GDB_FR_REG8(sp) +- LONG_S $9, GDB_FR_REG9(sp) +- LONG_S $10, GDB_FR_REG10(sp) +- LONG_S $11, GDB_FR_REG11(sp) +- LONG_S $12, GDB_FR_REG12(sp) +- LONG_S $13, GDB_FR_REG13(sp) +- LONG_S $14, GDB_FR_REG14(sp) +- LONG_S $15, GDB_FR_REG15(sp) +- LONG_S $16, GDB_FR_REG16(sp) +- LONG_S $17, GDB_FR_REG17(sp) +- LONG_S $18, GDB_FR_REG18(sp) +- LONG_S $19, GDB_FR_REG19(sp) +- LONG_S $20, GDB_FR_REG20(sp) +- LONG_S $21, GDB_FR_REG21(sp) +- LONG_S $22, GDB_FR_REG22(sp) +- LONG_S $23, GDB_FR_REG23(sp) +- LONG_S $24, GDB_FR_REG24(sp) +- LONG_S $25, GDB_FR_REG25(sp) +- LONG_S $26, GDB_FR_REG26(sp) +- LONG_S $27, GDB_FR_REG27(sp) +- LONG_S $28, GDB_FR_REG28(sp) +- /* sp already saved */ +- LONG_S $30, GDB_FR_REG30(sp) +- LONG_S $31, GDB_FR_REG31(sp) +- +- CLI /* disable interrupts */ +- TRACE_IRQS_OFF +- +-/* +- * Followed by the floating point registers +- */ +- mfc0 v0, CP0_STATUS /* FPU enabled? 
*/ +- srl v0, v0, 16 +- andi v0, v0, (ST0_CU1 >> 16) +- +- beqz v0,2f /* disabled, skip */ +- nop +- +- SDC1 $0, GDB_FR_FPR0(sp) +- SDC1 $1, GDB_FR_FPR1(sp) +- SDC1 $2, GDB_FR_FPR2(sp) +- SDC1 $3, GDB_FR_FPR3(sp) +- SDC1 $4, GDB_FR_FPR4(sp) +- SDC1 $5, GDB_FR_FPR5(sp) +- SDC1 $6, GDB_FR_FPR6(sp) +- SDC1 $7, GDB_FR_FPR7(sp) +- SDC1 $8, GDB_FR_FPR8(sp) +- SDC1 $9, GDB_FR_FPR9(sp) +- SDC1 $10, GDB_FR_FPR10(sp) +- SDC1 $11, GDB_FR_FPR11(sp) +- SDC1 $12, GDB_FR_FPR12(sp) +- SDC1 $13, GDB_FR_FPR13(sp) +- SDC1 $14, GDB_FR_FPR14(sp) +- SDC1 $15, GDB_FR_FPR15(sp) +- SDC1 $16, GDB_FR_FPR16(sp) +- SDC1 $17, GDB_FR_FPR17(sp) +- SDC1 $18, GDB_FR_FPR18(sp) +- SDC1 $19, GDB_FR_FPR19(sp) +- SDC1 $20, GDB_FR_FPR20(sp) +- SDC1 $21, GDB_FR_FPR21(sp) +- SDC1 $22, GDB_FR_FPR22(sp) +- SDC1 $23, GDB_FR_FPR23(sp) +- SDC1 $24, GDB_FR_FPR24(sp) +- SDC1 $25, GDB_FR_FPR25(sp) +- SDC1 $26, GDB_FR_FPR26(sp) +- SDC1 $27, GDB_FR_FPR27(sp) +- SDC1 $28, GDB_FR_FPR28(sp) +- SDC1 $29, GDB_FR_FPR29(sp) +- SDC1 $30, GDB_FR_FPR30(sp) +- SDC1 $31, GDB_FR_FPR31(sp) +- +-/* +- * FPU control registers +- */ +- +- cfc1 v0, CP1_STATUS +- LONG_S v0, GDB_FR_FSR(sp) +- cfc1 v0, CP1_REVISION +- LONG_S v0, GDB_FR_FIR(sp) +- +-/* +- * Current stack frame ptr +- */ +- +-2: +- LONG_S sp, GDB_FR_FRP(sp) +- +-/* +- * CP0 registers (R4000/R4400 unused registers skipped) +- */ +- +- mfc0 v0, CP0_INDEX +- LONG_S v0, GDB_FR_CP0_INDEX(sp) +- mfc0 v0, CP0_RANDOM +- LONG_S v0, GDB_FR_CP0_RANDOM(sp) +- DMFC0 v0, CP0_ENTRYLO0 +- LONG_S v0, GDB_FR_CP0_ENTRYLO0(sp) +- DMFC0 v0, CP0_ENTRYLO1 +- LONG_S v0, GDB_FR_CP0_ENTRYLO1(sp) +- DMFC0 v0, CP0_CONTEXT +- LONG_S v0, GDB_FR_CP0_CONTEXT(sp) +- mfc0 v0, CP0_PAGEMASK +- LONG_S v0, GDB_FR_CP0_PAGEMASK(sp) +- mfc0 v0, CP0_WIRED +- LONG_S v0, GDB_FR_CP0_WIRED(sp) +- DMFC0 v0, CP0_ENTRYHI +- LONG_S v0, GDB_FR_CP0_ENTRYHI(sp) +- mfc0 v0, CP0_PRID +- LONG_S v0, GDB_FR_CP0_PRID(sp) +- +- .set at +- +-/* +- * Continue with the higher level handler +- */ +- +- move a0,sp +- +- jal handle_exception +- nop +- +-/* +- * Restore all writable registers, in reverse order +- */ +- +- .set noat +- +- LONG_L v0, GDB_FR_CP0_ENTRYHI(sp) +- LONG_L v1, GDB_FR_CP0_WIRED(sp) +- DMTC0 v0, CP0_ENTRYHI +- mtc0 v1, CP0_WIRED +- LONG_L v0, GDB_FR_CP0_PAGEMASK(sp) +- LONG_L v1, GDB_FR_CP0_ENTRYLO1(sp) +- mtc0 v0, CP0_PAGEMASK +- DMTC0 v1, CP0_ENTRYLO1 +- LONG_L v0, GDB_FR_CP0_ENTRYLO0(sp) +- LONG_L v1, GDB_FR_CP0_INDEX(sp) +- DMTC0 v0, CP0_ENTRYLO0 +- LONG_L v0, GDB_FR_CP0_CONTEXT(sp) +- mtc0 v1, CP0_INDEX +- DMTC0 v0, CP0_CONTEXT +- +- +-/* +- * Next, the floating point registers +- */ +- mfc0 v0, CP0_STATUS /* check if the FPU is enabled */ +- srl v0, v0, 16 +- andi v0, v0, (ST0_CU1 >> 16) +- +- beqz v0, 3f /* disabled, skip */ +- nop +- +- LDC1 $31, GDB_FR_FPR31(sp) +- LDC1 $30, GDB_FR_FPR30(sp) +- LDC1 $29, GDB_FR_FPR29(sp) +- LDC1 $28, GDB_FR_FPR28(sp) +- LDC1 $27, GDB_FR_FPR27(sp) +- LDC1 $26, GDB_FR_FPR26(sp) +- LDC1 $25, GDB_FR_FPR25(sp) +- LDC1 $24, GDB_FR_FPR24(sp) +- LDC1 $23, GDB_FR_FPR23(sp) +- LDC1 $22, GDB_FR_FPR22(sp) +- LDC1 $21, GDB_FR_FPR21(sp) +- LDC1 $20, GDB_FR_FPR20(sp) +- LDC1 $19, GDB_FR_FPR19(sp) +- LDC1 $18, GDB_FR_FPR18(sp) +- LDC1 $17, GDB_FR_FPR17(sp) +- LDC1 $16, GDB_FR_FPR16(sp) +- LDC1 $15, GDB_FR_FPR15(sp) +- LDC1 $14, GDB_FR_FPR14(sp) +- LDC1 $13, GDB_FR_FPR13(sp) +- LDC1 $12, GDB_FR_FPR12(sp) +- LDC1 $11, GDB_FR_FPR11(sp) +- LDC1 $10, GDB_FR_FPR10(sp) +- LDC1 $9, GDB_FR_FPR9(sp) +- LDC1 $8, GDB_FR_FPR8(sp) +- LDC1 $7, GDB_FR_FPR7(sp) +- LDC1 $6, GDB_FR_FPR6(sp) +- LDC1 $5, GDB_FR_FPR5(sp) +- LDC1 $4, 
GDB_FR_FPR4(sp) +- LDC1 $3, GDB_FR_FPR3(sp) +- LDC1 $2, GDB_FR_FPR2(sp) +- LDC1 $1, GDB_FR_FPR1(sp) +- LDC1 $0, GDB_FR_FPR0(sp) +- +-/* +- * Now the CP0 and integer registers +- */ +- +-3: +-#ifdef CONFIG_MIPS_MT_SMTC +- /* Read-modify write of Status must be atomic */ +- mfc0 t2, CP0_TCSTATUS +- ori t1, t2, TCSTATUS_IXMT +- mtc0 t1, CP0_TCSTATUS +- andi t2, t2, TCSTATUS_IXMT +- _ehb +- DMT 9 # dmt t1 +- jal mips_ihb +- nop +-#endif /* CONFIG_MIPS_MT_SMTC */ +- mfc0 t0, CP0_STATUS +- ori t0, 0x1f +- xori t0, 0x1f +- mtc0 t0, CP0_STATUS +-#ifdef CONFIG_MIPS_MT_SMTC +- andi t1, t1, VPECONTROL_TE +- beqz t1, 9f +- nop +- EMT # emt +-9: +- mfc0 t1, CP0_TCSTATUS +- xori t1, t1, TCSTATUS_IXMT +- or t1, t1, t2 +- mtc0 t1, CP0_TCSTATUS +- _ehb +-#endif /* CONFIG_MIPS_MT_SMTC */ +- LONG_L v0, GDB_FR_STATUS(sp) +- LONG_L v1, GDB_FR_EPC(sp) +- mtc0 v0, CP0_STATUS +- DMTC0 v1, CP0_EPC +- LONG_L v0, GDB_FR_HI(sp) +- LONG_L v1, GDB_FR_LO(sp) +- mthi v0 +- mtlo v1 +- LONG_L $31, GDB_FR_REG31(sp) +- LONG_L $30, GDB_FR_REG30(sp) +- LONG_L $28, GDB_FR_REG28(sp) +- LONG_L $27, GDB_FR_REG27(sp) +- LONG_L $26, GDB_FR_REG26(sp) +- LONG_L $25, GDB_FR_REG25(sp) +- LONG_L $24, GDB_FR_REG24(sp) +- LONG_L $23, GDB_FR_REG23(sp) +- LONG_L $22, GDB_FR_REG22(sp) +- LONG_L $21, GDB_FR_REG21(sp) +- LONG_L $20, GDB_FR_REG20(sp) +- LONG_L $19, GDB_FR_REG19(sp) +- LONG_L $18, GDB_FR_REG18(sp) +- LONG_L $17, GDB_FR_REG17(sp) +- LONG_L $16, GDB_FR_REG16(sp) +- LONG_L $15, GDB_FR_REG15(sp) +- LONG_L $14, GDB_FR_REG14(sp) +- LONG_L $13, GDB_FR_REG13(sp) +- LONG_L $12, GDB_FR_REG12(sp) +- LONG_L $11, GDB_FR_REG11(sp) +- LONG_L $10, GDB_FR_REG10(sp) +- LONG_L $9, GDB_FR_REG9(sp) +- LONG_L $8, GDB_FR_REG8(sp) +- LONG_L $7, GDB_FR_REG7(sp) +- LONG_L $6, GDB_FR_REG6(sp) +- LONG_L $5, GDB_FR_REG5(sp) +- LONG_L $4, GDB_FR_REG4(sp) +- LONG_L $3, GDB_FR_REG3(sp) +- LONG_L $2, GDB_FR_REG2(sp) +- LONG_L $1, GDB_FR_REG1(sp) +-#if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX) +- LONG_L k0, GDB_FR_EPC(sp) +- LONG_L $29, GDB_FR_REG29(sp) /* Deallocate stack */ +- jr k0 +- rfe +-#else +- LONG_L sp, GDB_FR_REG29(sp) /* Deallocate stack */ +- +- .set mips3 +- eret +- .set mips0 +-#endif +- .set at +- .set reorder +- END(trap_low) +- +-LEAF(kgdb_read_byte) +-4: lb t0, (a0) +- sb t0, (a1) +- li v0, 0 +- jr ra +- .section __ex_table,"a" +- PTR 4b, kgdbfault +- .previous +- END(kgdb_read_byte) +- +-LEAF(kgdb_write_byte) +-5: sb a0, (a1) +- li v0, 0 +- jr ra +- .section __ex_table,"a" +- PTR 5b, kgdbfault +- .previous +- END(kgdb_write_byte) +- +- .type kgdbfault@function +- .ent kgdbfault +- +-kgdbfault: li v0, -EFAULT +- jr ra +- .end kgdbfault +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/gdb-stub.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-stub.c +--- linux-2.6.18-53.1.14/arch/mips/kernel/gdb-stub.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/gdb-stub.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,1154 +0,0 @@ +-/* +- * arch/mips/kernel/gdb-stub.c +- * +- * Originally written by Glenn Engel, Lake Stevens Instrument Division +- * +- * Contributed by HP Systems +- * +- * Modified for SPARC by Stu Grossman, Cygnus Support. +- * +- * Modified for Linux/MIPS (and MIPS in general) by Andreas Busse +- * Send complaints, suggestions etc. to +- * +- * Copyright (C) 1995 Andreas Busse +- * +- * Copyright (C) 2003 MontaVista Software Inc. +- * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net +- */ +- +-/* +- * To enable debugger support, two things need to happen. 
One, a +- * call to set_debug_traps() is necessary in order to allow any breakpoints +- * or error conditions to be properly intercepted and reported to gdb. +- * Two, a breakpoint needs to be generated to begin communication. This +- * is most easily accomplished by a call to breakpoint(). Breakpoint() +- * simulates a breakpoint by executing a BREAK instruction. +- * +- * +- * The following gdb commands are supported: +- * +- * command function Return value +- * +- * g return the value of the CPU registers hex data or ENN +- * G set the value of the CPU registers OK or ENN +- * +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN +- * +- * c Resume at current address SNN ( signal NN) +- * cAA..AA Continue at address AA..AA SNN +- * +- * s Step one instruction SNN +- * sAA..AA Step one instruction from AA..AA SNN +- * +- * k kill +- * +- * ? What was the last sigval ? SNN (signal NN) +- * +- * bBB..BB Set baud rate to BB..BB OK or BNN, then sets +- * baud rate +- * +- * All commands and responses are sent with a packet which includes a +- * checksum. A packet consists of +- * +- * $#. +- * +- * where +- * :: +- * :: < two hex digits computed as modulo 256 sum of > +- * +- * When a packet is received, it is first acknowledged with either '+' or '-'. +- * '+' indicates a successful transfer. '-' indicates a failed transfer. +- * +- * Example: +- * +- * Host: Reply: +- * $m0,10#2a +$00010203040506070809101112131415#42 +- * +- * +- * ============== +- * MORE EXAMPLES: +- * ============== +- * +- * For reference -- the following are the steps that one +- * company took (RidgeRun Inc) to get remote gdb debugging +- * going. In this scenario the host machine was a PC and the +- * target platform was a Galileo EVB64120A MIPS evaluation +- * board. +- * +- * Step 1: +- * First download gdb-5.0.tar.gz from the internet. +- * and then build/install the package. +- * +- * Example: +- * $ tar zxf gdb-5.0.tar.gz +- * $ cd gdb-5.0 +- * $ ./configure --target=mips-linux-elf +- * $ make +- * $ install +- * $ which mips-linux-elf-gdb +- * /usr/local/bin/mips-linux-elf-gdb +- * +- * Step 2: +- * Configure linux for remote debugging and build it. +- * +- * Example: +- * $ cd ~/linux +- * $ make menuconfig +- * $ make +- * +- * Step 3: +- * Download the kernel to the remote target and start +- * the kernel running. It will promptly halt and wait +- * for the host gdb session to connect. It does this +- * since the "Kernel Hacking" option has defined +- * CONFIG_KGDB which in turn enables your calls +- * to: +- * set_debug_traps(); +- * breakpoint(); +- * +- * Step 4: +- * Start the gdb session on the host. +- * +- * Example: +- * $ mips-linux-elf-gdb vmlinux +- * (gdb) set remotebaud 115200 +- * (gdb) target remote /dev/ttyS1 +- * ...at this point you are connected to +- * the remote target and can use gdb +- * in the normal fasion. Setting +- * breakpoints, single stepping, +- * printing variables, etc. 
+- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * external low-level support routines +- */ +- +-extern int putDebugChar(char c); /* write a single character */ +-extern char getDebugChar(void); /* read and return a single char */ +-extern void trap_low(void); +- +-/* +- * breakpoint and test functions +- */ +-extern void breakpoint(void); +-extern void breakinst(void); +-extern void async_breakpoint(void); +-extern void async_breakinst(void); +-extern void adel(void); +- +-/* +- * local prototypes +- */ +- +-static void getpacket(char *buffer); +-static void putpacket(char *buffer); +-static int computeSignal(int tt); +-static int hex(unsigned char ch); +-static int hexToInt(char **ptr, int *intValue); +-static int hexToLong(char **ptr, long *longValue); +-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault); +-void handle_exception(struct gdb_regs *regs); +- +-int kgdb_enabled; +- +-/* +- * spin locks for smp case +- */ +-static DEFINE_SPINLOCK(kgdb_lock); +-static raw_spinlock_t kgdb_cpulock[NR_CPUS] = { +- [0 ... NR_CPUS-1] = __RAW_SPIN_LOCK_UNLOCKED, +-}; +- +-/* +- * BUFMAX defines the maximum number of characters in inbound/outbound buffers +- * at least NUMREGBYTES*2 are needed for register packets +- */ +-#define BUFMAX 2048 +- +-static char input_buffer[BUFMAX]; +-static char output_buffer[BUFMAX]; +-static int initialized; /* !0 means we've been initialized */ +-static int kgdb_started; +-static const char hexchars[]="0123456789abcdef"; +- +-/* Used to prevent crashes in memory access. Note that they'll crash anyway if +- we haven't set up fault handlers yet... */ +-int kgdb_read_byte(unsigned char *address, unsigned char *dest); +-int kgdb_write_byte(unsigned char val, unsigned char *dest); +- +-/* +- * Convert ch from a hex digit to an int +- */ +-static int hex(unsigned char ch) +-{ +- if (ch >= 'a' && ch <= 'f') +- return ch-'a'+10; +- if (ch >= '0' && ch <= '9') +- return ch-'0'; +- if (ch >= 'A' && ch <= 'F') +- return ch-'A'+10; +- return -1; +-} +- +-/* +- * scan for the sequence $# +- */ +-static void getpacket(char *buffer) +-{ +- unsigned char checksum; +- unsigned char xmitcsum; +- int i; +- int count; +- unsigned char ch; +- +- do { +- /* +- * wait around for the start character, +- * ignore all other characters +- */ +- while ((ch = (getDebugChar() & 0x7f)) != '$') ; +- +- checksum = 0; +- xmitcsum = -1; +- count = 0; +- +- /* +- * now, read until a # or end of buffer is found +- */ +- while (count < BUFMAX) { +- ch = getDebugChar(); +- if (ch == '#') +- break; +- checksum = checksum + ch; +- buffer[count] = ch; +- count = count + 1; +- } +- +- if (count >= BUFMAX) +- continue; +- +- buffer[count] = 0; +- +- if (ch == '#') { +- xmitcsum = hex(getDebugChar() & 0x7f) << 4; +- xmitcsum |= hex(getDebugChar() & 0x7f); +- +- if (checksum != xmitcsum) +- putDebugChar('-'); /* failed checksum */ +- else { +- putDebugChar('+'); /* successful transfer */ +- +- /* +- * if a sequence char is present, +- * reply the sequence ID +- */ +- if (buffer[2] == ':') { +- putDebugChar(buffer[0]); +- putDebugChar(buffer[1]); +- +- /* +- * remove sequence chars from buffer +- */ +- count = strlen(buffer); +- for (i=3; i <= count; i++) +- buffer[i-3] = buffer[i]; +- } +- } +- } +- } +- while (checksum != xmitcsum); +-} +- +-/* +- * send the packet in buffer. 
+- */
+-static void putpacket(char *buffer)
+-{
+-	unsigned char checksum;
+-	int count;
+-	unsigned char ch;
+-
+-	/*
+-	 * $<packet info>#<checksum>.
+-	 */
+-
+-	do {
+-		putDebugChar('$');
+-		checksum = 0;
+-		count = 0;
+-
+-		while ((ch = buffer[count]) != 0) {
+-			if (!(putDebugChar(ch)))
+-				return;
+-			checksum += ch;
+-			count += 1;
+-		}
+-
+-		putDebugChar('#');
+-		putDebugChar(hexchars[checksum >> 4]);
+-		putDebugChar(hexchars[checksum & 0xf]);
+-
+-	}
+-	while ((getDebugChar() & 0x7f) != '+');
+-}
+-
+-
+-/*
+- * Convert the memory pointed to by mem into hex, placing result in buf.
+- * Return a pointer to the last char put in buf (null); in case of a
+- * memory fault, return 0.
+- * may_fault is non-zero if we are reading from arbitrary memory, but is
+- * currently not used.
+- */
+-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault)
+-{
+-	unsigned char ch;
+-
+-	while (count-- > 0) {
+-		if (kgdb_read_byte(mem++, &ch) != 0)
+-			return 0;
+-		*buf++ = hexchars[ch >> 4];
+-		*buf++ = hexchars[ch & 0xf];
+-	}
+-
+-	*buf = 0;
+-
+-	return buf;
+-}
+-
+-/*
+- * convert the hex array pointed to by buf into binary to be placed in mem
+- * return a pointer to the character AFTER the last byte written
+- * may_fault is non-zero if we are reading from arbitrary memory, but is
+- * currently not used.
+- */
+-static char *hex2mem(char *buf, char *mem, int count, int binary, int may_fault)
+-{
+-	int i;
+-	unsigned char ch;
+-
+-	for (i=0; i<count; i++)
+-	{
+-		if (binary) {
+-			ch = *buf++;
+-			if (ch == 0x7d)
+-				ch = 0x20 ^ *buf++;
+-		}
+-		else {
+-			ch = hex(*buf++) << 4;
+-			ch |= hex(*buf++);
+-		}
+-		if (kgdb_write_byte(ch, mem++) != 0)
+-			return 0;
+-	}
+-
+-	return mem;
+-}
+-
+-struct hard_trap_info {
+-	unsigned char tt;	/* Trap type code for MIPS R3xxx and R4xxx */
+-	unsigned char signo;	/* Signal that we map this trap into */
+-} hard_trap_info[] = {
+-	{ 6, SIGBUS },		/* instruction bus error */
+-	{ 7, SIGBUS },		/* data bus error */
+-	{ 9, SIGTRAP },		/* break */
+-/*	{ 11, SIGILL },	*/	/* CPU unusable */
+-	{ 12, SIGFPE },		/* overflow */
+-	{ 13, SIGTRAP },	/* trap */
+-	{ 14, SIGSEGV },	/* virtual instruction cache coherency */
+-	{ 15, SIGFPE },		/* floating point exception */
+-	{ 23, SIGSEGV },	/* watch */
+-	{ 31, SIGSEGV },	/* virtual data cache coherency */
+-	{ 0, 0}			/* Must be last */
+-};
+-
+-/* Save the normal trap handlers for user-mode traps. */
+-void *saved_vectors[32];
+-
+-/*
+- * Set up exception handlers for tracing and breakpoints
+- */
+-void set_debug_traps(void)
+-{
+-	struct hard_trap_info *ht;
+-	unsigned long flags;
+-	unsigned char c;
+-
+-	local_irq_save(flags);
+-	for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-		saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low);
+-
+-	putDebugChar('+'); /* 'hello world' */
+-	/*
+-	 * In case GDB is started before us, ack any packets
+-	 * (presumably "$?#xx") sitting there.
+-	 */
+-	while((c = getDebugChar()) != '$');
+-	while((c = getDebugChar()) != '#');
+-	c = getDebugChar(); /* eat first csum byte */
+-	c = getDebugChar(); /* eat second csum byte */
+-	putDebugChar('+'); /* ack it */
+-
+-	initialized = 1;
+-	local_irq_restore(flags);
+-}
+-
+-void restore_debug_traps(void)
+-{
+-	struct hard_trap_info *ht;
+-	unsigned long flags;
+-
+-	local_irq_save(flags);
+-	for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-		set_except_vector(ht->tt, saved_vectors[ht->tt]);
+-	local_irq_restore(flags);
+-}
+-
+-/*
+- * Convert the MIPS hardware trap type code to a Unix signal number.
+- */
+-static int computeSignal(int tt)
+-{
+-	struct hard_trap_info *ht;
+-
+-	for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
+-		if (ht->tt == tt)
+-			return ht->signo;
+-
+-	return SIGHUP;		/* default for things we don't know about */
+-}
+-
+-/*
+- * While we find nice hex chars, build an int.
+- * Return number of chars processed.
+- */
+-static int hexToInt(char **ptr, int *intValue)
+-{
+-	int numChars = 0;
+-	int hexValue;
+-
+-	*intValue = 0;
+-
+-	while (**ptr) {
+-		hexValue = hex(**ptr);
+-		if (hexValue < 0)
+-			break;
+-
+-		*intValue = (*intValue << 4) | hexValue;
+-		numChars++;
+-
+-		(*ptr)++;
+-	}
+-
+-	return (numChars);
+-}
+-
+-static int hexToLong(char **ptr, long *longValue)
+-{
+-	int numChars = 0;
+-	int hexValue;
+-
+-	*longValue = 0;
+-
+-	while (**ptr) {
+-		hexValue = hex(**ptr);
+-		if (hexValue < 0)
+-			break;
+-
+-		*longValue = (*longValue << 4) | hexValue;
+-		numChars++;
+-
+-		(*ptr)++;
+-	}
+-
+-	return numChars;
+-}
+-
+-
+-#if 0
+-/*
+- * Print registers (on target console)
+- * Used only to debug the stub...
+- */ +-void show_gdbregs(struct gdb_regs * regs) +-{ +- /* +- * Saved main processor registers +- */ +- printk("$0 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg0, regs->reg1, regs->reg2, regs->reg3, +- regs->reg4, regs->reg5, regs->reg6, regs->reg7); +- printk("$8 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg8, regs->reg9, regs->reg10, regs->reg11, +- regs->reg12, regs->reg13, regs->reg14, regs->reg15); +- printk("$16: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg16, regs->reg17, regs->reg18, regs->reg19, +- regs->reg20, regs->reg21, regs->reg22, regs->reg23); +- printk("$24: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg24, regs->reg25, regs->reg26, regs->reg27, +- regs->reg28, regs->reg29, regs->reg30, regs->reg31); +- +- /* +- * Saved cp0 registers +- */ +- printk("epc : %08lx\nStatus: %08lx\nCause : %08lx\n", +- regs->cp0_epc, regs->cp0_status, regs->cp0_cause); +-} +-#endif /* dead code */ +- +-/* +- * We single-step by setting breakpoints. When an exception +- * is handled, we need to restore the instructions hoisted +- * when the breakpoints were set. +- * +- * This is where we save the original instructions. +- */ +-static struct gdb_bp_save { +- unsigned long addr; +- unsigned int val; +-} step_bp[2]; +- +-#define BP 0x0000000d /* break opcode */ +- +-/* +- * Set breakpoint instructions for single stepping. +- */ +-static void single_step(struct gdb_regs *regs) +-{ +- union mips_instruction insn; +- unsigned long targ; +- int is_branch, is_cond, i; +- +- targ = regs->cp0_epc; +- insn.word = *(unsigned int *)targ; +- is_branch = is_cond = 0; +- +- switch (insn.i_format.opcode) { +- /* +- * jr and jalr are in r_format format. +- */ +- case spec_op: +- switch (insn.r_format.func) { +- case jalr_op: +- case jr_op: +- targ = *(®s->reg0 + insn.r_format.rs); +- is_branch = 1; +- break; +- } +- break; +- +- /* +- * This group contains: +- * bltz_op, bgez_op, bltzl_op, bgezl_op, +- * bltzal_op, bgezal_op, bltzall_op, bgezall_op. +- */ +- case bcond_op: +- is_branch = is_cond = 1; +- targ += 4 + (insn.i_format.simmediate << 2); +- break; +- +- /* +- * These are unconditional and in j_format. +- */ +- case jal_op: +- case j_op: +- is_branch = 1; +- targ += 4; +- targ >>= 28; +- targ <<= 28; +- targ |= (insn.j_format.target << 2); +- break; +- +- /* +- * These are conditional. +- */ +- case beq_op: +- case beql_op: +- case bne_op: +- case bnel_op: +- case blez_op: +- case blezl_op: +- case bgtz_op: +- case bgtzl_op: +- case cop0_op: +- case cop1_op: +- case cop2_op: +- case cop1x_op: +- is_branch = is_cond = 1; +- targ += 4 + (insn.i_format.simmediate << 2); +- break; +- } +- +- if (is_branch) { +- i = 0; +- if (is_cond && targ != (regs->cp0_epc + 8)) { +- step_bp[i].addr = regs->cp0_epc + 8; +- step_bp[i++].val = *(unsigned *)(regs->cp0_epc + 8); +- *(unsigned *)(regs->cp0_epc + 8) = BP; +- } +- step_bp[i].addr = targ; +- step_bp[i].val = *(unsigned *)targ; +- *(unsigned *)targ = BP; +- } else { +- step_bp[0].addr = regs->cp0_epc + 4; +- step_bp[0].val = *(unsigned *)(regs->cp0_epc + 4); +- *(unsigned *)(regs->cp0_epc + 4) = BP; +- } +-} +- +-/* +- * If asynchronously interrupted by gdb, then we need to set a breakpoint +- * at the interrupted instruction so that we wind up stopped with a +- * reasonable stack frame. +- */ +-static struct gdb_bp_save async_bp; +- +-/* +- * Swap the interrupted EPC with our asynchronous breakpoint routine. 
+- * This is safer than stuffing the breakpoint in-place, since no cache
+- * flushes (or resulting smp_call_functions) are required.  The
+- * assumption is that only one CPU will be handling asynchronous bp's,
+- * and only one can be active at a time.
+- */
+-extern spinlock_t smp_call_lock;
+-
+-void set_async_breakpoint(unsigned long *epc)
+-{
+-	/* skip breaking into userland */
+-	if ((*epc & 0x80000000) == 0)
+-		return;
+-
+-#ifdef CONFIG_SMP
+-	/* avoid deadlock if someone else is making an IPC call */
+-	if (spin_is_locked(&smp_call_lock))
+-		return;
+-#endif
+-
+-	async_bp.addr = *epc;
+-	*epc = (unsigned long)async_breakpoint;
+-}
+-
+-static void kgdb_wait(void *arg)
+-{
+-	unsigned long flags;
+-	int cpu = smp_processor_id();
+-
+-	local_irq_save(flags);
+-
+-	__raw_spin_lock(&kgdb_cpulock[cpu]);
+-	__raw_spin_unlock(&kgdb_cpulock[cpu]);
+-
+-	local_irq_restore(flags);
+-}
+-
+-/*
+- * The GDB stub needs to call kgdb_wait on all processors with interrupts
+- * disabled, so it uses its own special variant of smp_call_function().
+- */
+-static int kgdb_smp_call_kgdb_wait(void)
+-{
+-#ifdef CONFIG_SMP
+-	struct call_data_struct data;
+-	int i, cpus = num_online_cpus() - 1;
+-	int cpu = smp_processor_id();
+-
+-	/*
+-	 * Can die spectacularly if this CPU isn't yet marked online
+-	 */
+-	BUG_ON(!cpu_online(cpu));
+-
+-	if (!cpus)
+-		return 0;
+-
+-	if (spin_is_locked(&smp_call_lock)) {
+-		/*
+-		 * Some other processor is trying to make us do something
+-		 * but we're not going to respond... give up
+-		 */
+-		return -1;
+-	}
+-
+-	/*
+-	 * We will continue here, accepting the fact that
+-	 * the kernel may deadlock if another CPU attempts
+-	 * to call smp_call_function now...
+-	 */
+-
+-	data.func = kgdb_wait;
+-	data.info = NULL;
+-	atomic_set(&data.started, 0);
+-	data.wait = 0;
+-
+-	spin_lock(&smp_call_lock);
+-	call_data = &data;
+-	mb();
+-
+-	/* Send a message to all other CPUs and wait for them to respond */
+-	for (i = 0; i < NR_CPUS; i++)
+-		if (cpu_online(i) && i != cpu)
+-			core_send_ipi(i, SMP_CALL_FUNCTION);
+-
+-	/* Wait for response */
+-	/* FIXME: lock-up detection, backtrace on lock-up */
+-	while (atomic_read(&data.started) != cpus)
+-		barrier();
+-
+-	call_data = NULL;
+-	spin_unlock(&smp_call_lock);
+-#endif
+-
+-	return 0;
+-}
+-
+-/*
+- * This function does all command processing for interfacing to gdb.
+- */
+-void handle_exception (struct gdb_regs *regs)
+-{
+-	int trap;			/* Trap type */
+-	int sigval;
+-	long addr;
+-	int length;
+-	char *ptr;
+-	unsigned long *stack;
+-	int i;
+-	int bflag = 0;
+-
+-	kgdb_started = 1;
+-
+-	/*
+-	 * acquire the big kgdb spinlock
+-	 */
+-	if (!spin_trylock(&kgdb_lock)) {
+-		/*
+-		 * some other CPU has the lock, we should go back to
+-		 * receive the gdb_wait IPC
+-		 */
+-		return;
+-	}
+-
+-	/*
+-	 * If we're in async_breakpoint(), restore the real EPC from
+-	 * the breakpoint.
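+-	 *
+-	 * Editor's illustration: set_async_breakpoint() above performed
+-	 *
+-	 *   async_bp.addr = *epc;                     (remember the real EPC)
+-	 *   *epc = (unsigned long)async_breakpoint;   (divert to our break)
+-	 *
+-	 * so when the diverted break traps at async_breakinst, the check
+-	 * below swaps the saved address back before gdb ever sees the
+-	 * registers.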
+- */ +- if (regs->cp0_epc == (unsigned long)async_breakinst) { +- regs->cp0_epc = async_bp.addr; +- async_bp.addr = 0; +- } +- +- /* +- * acquire the CPU spinlocks +- */ +- for (i = num_online_cpus()-1; i >= 0; i--) +- if (__raw_spin_trylock(&kgdb_cpulock[i]) == 0) +- panic("kgdb: couldn't get cpulock %d\n", i); +- +- /* +- * force other cpus to enter kgdb +- */ +- kgdb_smp_call_kgdb_wait(); +- +- /* +- * If we're in breakpoint() increment the PC +- */ +- trap = (regs->cp0_cause & 0x7c) >> 2; +- if (trap == 9 && regs->cp0_epc == (unsigned long)breakinst) +- regs->cp0_epc += 4; +- +- /* +- * If we were single_stepping, restore the opcodes hoisted +- * for the breakpoint[s]. +- */ +- if (step_bp[0].addr) { +- *(unsigned *)step_bp[0].addr = step_bp[0].val; +- step_bp[0].addr = 0; +- +- if (step_bp[1].addr) { +- *(unsigned *)step_bp[1].addr = step_bp[1].val; +- step_bp[1].addr = 0; +- } +- } +- +- stack = (long *)regs->reg29; /* stack ptr */ +- sigval = computeSignal(trap); +- +- /* +- * reply to host that an exception has occurred +- */ +- ptr = output_buffer; +- +- /* +- * Send trap type (converted to signal) +- */ +- *ptr++ = 'T'; +- *ptr++ = hexchars[sigval >> 4]; +- *ptr++ = hexchars[sigval & 0xf]; +- +- /* +- * Send Error PC +- */ +- *ptr++ = hexchars[REG_EPC >> 4]; +- *ptr++ = hexchars[REG_EPC & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->cp0_epc, ptr, sizeof(long), 0); +- *ptr++ = ';'; +- +- /* +- * Send frame pointer +- */ +- *ptr++ = hexchars[REG_FP >> 4]; +- *ptr++ = hexchars[REG_FP & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->reg30, ptr, sizeof(long), 0); +- *ptr++ = ';'; +- +- /* +- * Send stack pointer +- */ +- *ptr++ = hexchars[REG_SP >> 4]; +- *ptr++ = hexchars[REG_SP & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->reg29, ptr, sizeof(long), 0); +- *ptr++ = ';'; +- +- *ptr++ = 0; +- putpacket(output_buffer); /* send it off... 
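+- *
+- * Editor's note (illustrative): the reply built above is gdb's "T" stop
+- * packet.  For SIGTRAP (5) it has the shape
+- *
+- *   $T05<REG_EPC>:<pc>;<REG_FP>:<fp>;<REG_SP>:<sp>;#<checksum>
+- *
+- * where each register number (REG_EPC, REG_FP, REG_SP) is printed as
+- * two hex digits and each value is the raw register bytes encoded by
+- * mem2hex().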
*/ +- +- /* +- * Wait for input from remote GDB +- */ +- while (1) { +- output_buffer[0] = 0; +- getpacket(input_buffer); +- +- switch (input_buffer[0]) +- { +- case '?': +- output_buffer[0] = 'S'; +- output_buffer[1] = hexchars[sigval >> 4]; +- output_buffer[2] = hexchars[sigval & 0xf]; +- output_buffer[3] = 0; +- break; +- +- /* +- * Detach debugger; let CPU run +- */ +- case 'D': +- putpacket(output_buffer); +- goto finish_kgdb; +- break; +- +- case 'd': +- /* toggle debug flag */ +- break; +- +- /* +- * Return the value of the CPU registers +- */ +- case 'g': +- ptr = output_buffer; +- ptr = mem2hex((char *)®s->reg0, ptr, 32*sizeof(long), 0); /* r0...r31 */ +- ptr = mem2hex((char *)®s->cp0_status, ptr, 6*sizeof(long), 0); /* cp0 */ +- ptr = mem2hex((char *)®s->fpr0, ptr, 32*sizeof(long), 0); /* f0...31 */ +- ptr = mem2hex((char *)®s->cp1_fsr, ptr, 2*sizeof(long), 0); /* cp1 */ +- ptr = mem2hex((char *)®s->frame_ptr, ptr, 2*sizeof(long), 0); /* frp */ +- ptr = mem2hex((char *)®s->cp0_index, ptr, 16*sizeof(long), 0); /* cp0 */ +- break; +- +- /* +- * set the value of the CPU registers - return OK +- */ +- case 'G': +- { +- ptr = &input_buffer[1]; +- hex2mem(ptr, (char *)®s->reg0, 32*sizeof(long), 0, 0); +- ptr += 32*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->cp0_status, 6*sizeof(long), 0, 0); +- ptr += 6*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->fpr0, 32*sizeof(long), 0, 0); +- ptr += 32*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->cp1_fsr, 2*sizeof(long), 0, 0); +- ptr += 2*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->frame_ptr, 2*sizeof(long), 0, 0); +- ptr += 2*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->cp0_index, 16*sizeof(long), 0, 0); +- strcpy(output_buffer,"OK"); +- } +- break; +- +- /* +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA +- */ +- case 'm': +- ptr = &input_buffer[1]; +- +- if (hexToLong(&ptr, &addr) +- && *ptr++ == ',' +- && hexToInt(&ptr, &length)) { +- if (mem2hex((char *)addr, output_buffer, length, 1)) +- break; +- strcpy (output_buffer, "E03"); +- } else +- strcpy(output_buffer,"E01"); +- break; +- +- /* +- * XAA..AA,LLLL: Write LLLL escaped binary bytes at address AA.AA +- */ +- case 'X': +- bflag = 1; +- /* fall through */ +- +- /* +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK +- */ +- case 'M': +- ptr = &input_buffer[1]; +- +- if (hexToLong(&ptr, &addr) +- && *ptr++ == ',' +- && hexToInt(&ptr, &length) +- && *ptr++ == ':') { +- if (hex2mem(ptr, (char *)addr, length, bflag, 1)) +- strcpy(output_buffer, "OK"); +- else +- strcpy(output_buffer, "E03"); +- } +- else +- strcpy(output_buffer, "E02"); +- break; +- +- /* +- * cAA..AA Continue at address AA..AA(optional) +- */ +- case 'c': +- /* try to read optional parameter, pc unchanged if no parm */ +- +- ptr = &input_buffer[1]; +- if (hexToLong(&ptr, &addr)) +- regs->cp0_epc = addr; +- +- goto exit_kgdb_exception; +- break; +- +- /* +- * kill the program; let us try to restart the machine +- * Reset the whole machine. +- */ +- case 'k': +- case 'r': +- machine_restart("kgdb restarts machine"); +- break; +- +- /* +- * Step to next instruction +- */ +- case 's': +- /* +- * There is no single step insn in the MIPS ISA, so we +- * use breakpoints and continue, instead. 
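+- *
+- * Editor's sketch (illustrative, not part of the original stub):
+- * planting a software breakpoint means saving the original opcode
+- * and writing the BP break instruction over it:
+- *
+- *   static void plant_bp(struct gdb_bp_save *bp, unsigned long addr)
+- *   {
+- *           bp->addr = addr;
+- *           bp->val = *(unsigned int *)addr;    (save original opcode)
+- *           *(unsigned int *)addr = BP;         (0x0000000d: break)
+- *   }
+- *
+- * single_step() above does this inline at the branch target and, for
+- * conditional branches, at the fall-through address past the delay
+- * slot (epc + 8); handle_exception() restores the saved words on the
+- * next trap.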
+- */ +- single_step(regs); +- goto exit_kgdb_exception; +- /* NOTREACHED */ +- break; +- +- /* +- * Set baud rate (bBB) +- * FIXME: Needs to be written +- */ +- case 'b': +- { +-#if 0 +- int baudrate; +- extern void set_timer_3(); +- +- ptr = &input_buffer[1]; +- if (!hexToInt(&ptr, &baudrate)) +- { +- strcpy(output_buffer,"B01"); +- break; +- } +- +- /* Convert baud rate to uart clock divider */ +- +- switch (baudrate) +- { +- case 38400: +- baudrate = 16; +- break; +- case 19200: +- baudrate = 33; +- break; +- case 9600: +- baudrate = 65; +- break; +- default: +- baudrate = 0; +- strcpy(output_buffer,"B02"); +- goto x1; +- } +- +- if (baudrate) { +- putpacket("OK"); /* Ack before changing speed */ +- set_timer_3(baudrate); /* Set it */ +- } +-#endif +- } +- break; +- +- } /* switch */ +- +- /* +- * reply to the request +- */ +- +- putpacket(output_buffer); +- +- } /* while */ +- +- return; +- +-finish_kgdb: +- restore_debug_traps(); +- +-exit_kgdb_exception: +- /* release locks so other CPUs can go */ +- for (i = num_online_cpus()-1; i >= 0; i--) +- __raw_spin_unlock(&kgdb_cpulock[i]); +- spin_unlock(&kgdb_lock); +- +- __flush_cache_all(); +- return; +-} +- +-/* +- * This function will generate a breakpoint exception. It is used at the +- * beginning of a program to sync up with a debugger and can be used +- * otherwise as a quick means to stop program execution and "break" into +- * the debugger. +- */ +-void breakpoint(void) +-{ +- if (!initialized) +- return; +- +- __asm__ __volatile__( +- ".globl breakinst\n\t" +- ".set\tnoreorder\n\t" +- "nop\n" +- "breakinst:\tbreak\n\t" +- "nop\n\t" +- ".set\treorder" +- ); +-} +- +-/* Nothing but the break; don't pollute any registers */ +-void async_breakpoint(void) +-{ +- __asm__ __volatile__( +- ".globl async_breakinst\n\t" +- ".set\tnoreorder\n\t" +- "nop\n" +- "async_breakinst:\tbreak\n\t" +- "nop\n\t" +- ".set\treorder" +- ); +-} +- +-void adel(void) +-{ +- __asm__ __volatile__( +- ".globl\tadel\n\t" +- "lui\t$8,0x8000\n\t" +- "lw\t$9,1($8)\n\t" +- ); +-} +- +-/* +- * malloc is needed by gdb client in "call func()", even a private one +- * will make gdb happy +- */ +-static void * __attribute_used__ malloc(size_t size) +-{ +- return kmalloc(size, GFP_ATOMIC); +-} +- +-static void __attribute_used__ free (void *where) +-{ +- kfree(where); +-} +- +-#ifdef CONFIG_GDB_CONSOLE +- +-void gdb_putsn(const char *str, int l) +-{ +- char outbuf[18]; +- +- if (!kgdb_started) +- return; +- +- outbuf[0]='O'; +- +- while(l) { +- int i = (l>8)?8:l; +- mem2hex((char *)str, &outbuf[1], i, 0); +- outbuf[(i*2)+1]=0; +- putpacket(outbuf); +- str += i; +- l -= i; +- } +-} +- +-static void gdb_console_write(struct console *con, const char *s, unsigned n) +-{ +- gdb_putsn(s, n); +-} +- +-static struct console gdb_console = { +- .name = "gdb", +- .write = gdb_console_write, +- .flags = CON_PRINTBUFFER, +- .index = -1 +-}; +- +-static int __init register_gdb_console(void) +-{ +- register_console(&gdb_console); +- +- return 0; +-} +- +-console_initcall(register_gdb_console); +- +-#endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/irq.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/irq.c +--- linux-2.6.18-53.1.14/arch/mips/kernel/irq.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/irq.c 2008-06-10 15:38:24.000000000 +0400 +@@ -25,6 +25,10 @@ + #include + #include + #include ++#include ++ ++/* Keep track of if we've done certain initialization already or not. 
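++ *
++ * Editor's note (illustrative): this flag implements a simple run-once
++ * guard; both init_IRQ() below and kgdb_arch_init() in the new kgdb.c
++ * check it, so whichever runs first does the real work:
++ *
++ *   if (kgdb_early_setup)
++ *           return;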
*/ ++int kgdb_early_setup; + + /* + * 'what should we do if we get a hw irq event on an illegal vector'. +@@ -115,23 +119,13 @@ asmlinkage void spurious_interrupt(struc + atomic_inc(&irq_err_count); + } + +-#ifdef CONFIG_KGDB +-extern void breakpoint(void); +-extern void set_debug_traps(void); +- +-static int kgdb_flag = 1; +-static int __init nokgdb(char *str) +-{ +- kgdb_flag = 0; +- return 1; +-} +-__setup("nokgdb", nokgdb); +-#endif +- + void __init init_IRQ(void) + { + int i; + ++ if (kgdb_early_setup) ++ return; ++ + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; +@@ -144,12 +138,12 @@ void __init init_IRQ(void) + } + + arch_init_irq(); +- + #ifdef CONFIG_KGDB +- if (kgdb_flag) { +- printk("Wait for gdb client connection ...\n"); +- set_debug_traps(); +- breakpoint(); +- } ++ /* ++ * We have been called before kgdb_arch_init(). Hence, ++ * we dont want the traps to be reinitialized ++ */ ++ if (kgdb_early_setup == 0) ++ kgdb_early_setup = 1; + #endif + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-jmp.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-jmp.c +--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-jmp.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-jmp.c 2008-06-10 15:38:24.000000000 +0400 +@@ -0,0 +1,116 @@ ++/* ++ * arch/mips/kernel/kgdb-jmp.c ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Author: Tom Rini ++ * Author: Manish Lachwani ++ * ++ * Cribbed from glibc, which carries the following: ++ * Copyright (C) 1996, 1997, 2000, 2002, 2003 Free Software Foundation, Inc. ++ * Copyright (C) 2005 by MontaVista Software. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. 
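++ *
++ * Editor's sketch of the intended use (kgdb_may_fault and
++ * kgdb_fault_jmp_regs appear in the mm/extable.c hunk later in this
++ * patch; the asm wrapper kgdb_fault_setjmp is in kgdb-setjmp.S):
++ *
++ *   kgdb_may_fault = 1;
++ *   if (kgdb_fault_setjmp(kgdb_fault_jmp_regs) == 0) {
++ *           ... touch possibly-unmapped memory ...
++ *   } else {
++ *           ... we arrived here via kgdb_fault_longjmp(),
++ *               which makes the setjmp return 1 ...
++ *   }
++ *   kgdb_may_fault = 0;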
++ */ ++ ++#include ++#include ++ ++#ifdef CONFIG_MIPS64 ++/* ++ * MIPS 64-bit ++ */ ++ ++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp) ++{ ++ __asm__ __volatile__ ("sd $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__ ("sd $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__ ("sd $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__ ("sd $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__ ("sd $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__ ("sd $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__ ("sd $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__ ("sd $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__ ("sd $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__ ("sd $31, %0" : : "m" (curr_context[9])); ++ curr_context[10] = (long *)sp; ++ curr_context[11] = (long *)fp; ++ ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++ unsigned long sp_val, fp_val; ++ ++ __asm__ __volatile__ ("ld $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__ ("ld $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__ ("ld $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__ ("ld $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__ ("ld $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__ ("ld $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__ ("ld $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__ ("ld $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__ ("ld $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__ ("ld $25, %0" : : "m" (curr_context[9])); ++ sp_val = curr_context[10]; ++ fp_val = curr_context[11]; ++ __asm__ __volatile__ ("ld $29, %0\n\t" ++ "ld $30, %1\n\t" : : "m" (sp_val), "m" (fp_val)); ++ ++ __asm__ __volatile__ ("dli $2, 1"); ++ __asm__ __volatile__ ("j $25"); ++ ++ for (;;); ++} ++#else ++/* ++ * MIPS 32-bit ++ */ ++ ++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp) ++{ ++ __asm__ __volatile__("sw $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__("sw $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__("sw $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__("sw $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__("sw $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__("sw $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__("sw $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__("sw $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__("sw $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__("sw $31, %0" : : "m" (curr_context[9])); ++ curr_context[10] = (long *)sp; ++ curr_context[11] = (long *)fp; ++ ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++ unsigned long sp_val, fp_val; ++ ++ __asm__ __volatile__("lw $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__("lw $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__("lw $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__("lw $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__("lw $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__("lw $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__("lw $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__("lw $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__("lw $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__("lw $25, %0" : : "m" (curr_context[9])); ++ sp_val = curr_context[10]; 
++ fp_val = curr_context[11]; ++ __asm__ __volatile__("lw $29, %0\n\t" ++ "lw $30, %1\n\t" : : "m" (sp_val), "m" (fp_val)); ++ ++ __asm__ __volatile__("li $2, 1"); ++ __asm__ __volatile__("jr $25"); ++ ++ for (;;); ++} ++#endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-setjmp.S linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-setjmp.S +--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb-setjmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb-setjmp.S 2008-06-10 15:38:24.000000000 +0400 +@@ -0,0 +1,28 @@ ++/* ++ * arch/mips/kernel/kgdb-jmp.c ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Copyright (C) 2005 by MontaVista Software. ++ * Author: Manish Lachwani (mlachwani@mvista.com) ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ .ent kgdb_fault_setjmp,0 ++ENTRY (kgdb_fault_setjmp) ++ move a1, sp ++ move a2, fp ++#ifdef CONFIG_MIPS64 ++ nop ++#endif ++ j kgdb_fault_setjmp_aux ++ .end kgdb_fault_setjmp +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb.c 2008-06-10 15:38:24.000000000 +0400 +@@ -0,0 +1,297 @@ ++/* ++ * arch/mips/kernel/kgdb.c ++ * ++ * Originally written by Glenn Engel, Lake Stevens Instrument Division ++ * ++ * Contributed by HP Systems ++ * ++ * Modified for SPARC by Stu Grossman, Cygnus Support. ++ * ++ * Modified for Linux/MIPS (and MIPS in general) by Andreas Busse ++ * Send complaints, suggestions etc. to ++ * ++ * Copyright (C) 1995 Andreas Busse ++ * ++ * Copyright (C) 2003 MontaVista Software Inc. ++ * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net ++ * ++ * Copyright (C) 2004-2005 MontaVista Software Inc. ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct hard_trap_info { ++ unsigned char tt; /* Trap type code for MIPS R3xxx and R4xxx */ ++ unsigned char signo; /* Signal that we map this trap into */ ++} hard_trap_info[] = { ++ { 6, SIGBUS }, /* instruction bus error */ ++ { 7, SIGBUS }, /* data bus error */ ++ { 9, SIGTRAP }, /* break */ ++/* { 11, SIGILL }, */ /* CPU unusable */ ++ { 12, SIGFPE }, /* overflow */ ++ { 13, SIGTRAP }, /* trap */ ++ { 14, SIGSEGV }, /* virtual instruction cache coherency */ ++ { 15, SIGFPE }, /* floating point exception */ ++ { 23, SIGSEGV }, /* watch */ ++ { 31, SIGSEGV }, /* virtual data cache coherency */ ++ { 0, 0} /* Must be last */ ++}; ++ ++/* Save the normal trap handlers for user-mode traps. 
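++ *
++ * Editor's note: trap_low (kgdb_handler.S, later in this patch) indexes
++ * this array to bounce user-mode traps back to the kernel's original
++ * handlers:
++ *
++ *   mfc0  k1, CP0_CAUSE
++ *   andi  k1, k1, 0x7c
++ *   PTR_L k0, saved_vectors(k1)
++ *   jr    k0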
*/ ++void *saved_vectors[32]; ++ ++extern void trap_low(void); ++extern void breakinst(void); ++extern void init_IRQ(void); ++ ++void kgdb_call_nmi_hook(void *ignored) ++{ ++ kgdb_nmihook(smp_processor_id(), (void *)0); ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ local_irq_restore(flags); ++ smp_call_function(kgdb_call_nmi_hook, 0, 0, 0); ++ local_irq_save(flags); ++} ++ ++static int compute_signal(int tt) ++{ ++ struct hard_trap_info *ht; ++ ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ if (ht->tt == tt) ++ return ht->signo; ++ ++ return SIGHUP; /* default for things we don't know about */ ++} ++ ++/* ++ * Set up exception handlers for tracing and breakpoints ++ */ ++void handle_exception(struct pt_regs *regs) ++{ ++ int trap = (regs->cp0_cause & 0x7c) >> 2; ++ ++ if (fixup_exception(regs)) { ++ return; ++ } ++ ++ if (atomic_read(&debugger_active)) ++ kgdb_nmihook(smp_processor_id(), regs); ++ ++ if (atomic_read(&kgdb_setting_breakpoint)) ++ if ((trap == 9) && (regs->cp0_epc == (unsigned long)breakinst)) ++ regs->cp0_epc += 4; ++ ++ kgdb_handle_exception(0, compute_signal(trap), 0, regs); ++ ++ /* In SMP mode, __flush_cache_all does IPI */ ++ __flush_cache_all(); ++} ++ ++void set_debug_traps(void) ++{ ++ struct hard_trap_info *ht; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low); ++ ++ local_irq_restore(flags); ++} ++ ++#if 0 ++/* This should be called before we exit kgdb_handle_exception() I believe. ++ * -- Tom ++ */ ++void restore_debug_traps(void) ++{ ++ struct hard_trap_info *ht; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ set_except_vector(ht->tt, saved_vectors[ht->tt]); ++ local_irq_restore(flags); ++} ++#endif ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ int reg; ++ gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs; ++ ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ *(ptr++) = regs->cp0_status; ++ *(ptr++) = regs->lo; ++ *(ptr++) = regs->hi; ++ *(ptr++) = regs->cp0_badvaddr; ++ *(ptr++) = regs->cp0_cause; ++ *(ptr++) = regs->cp0_epc; ++ ++ return; ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ ++ int reg; ++ const gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs; ++ ++ for (reg = 0; reg < 32; reg++) ++ regs->regs[reg] = *(ptr++); ++ ++ regs->cp0_status = *(ptr++); ++ regs->lo = *(ptr++); ++ regs->hi = *(ptr++); ++ regs->cp0_badvaddr = *(ptr++); ++ regs->cp0_cause = *(ptr++); ++ regs->cp0_epc = *(ptr++); ++ ++ return; ++} ++ ++/* ++ * Similar to regs_to_gdb_regs() except that process is sleeping and so ++ * we may not be able to get all the info. 
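++ *
++ * Editor's note (illustrative): the gdb register file assumed here,
++ * matching regs_to_gdb_regs() above, is laid out as
++ *
++ *   0..31  $0..$31 (general purpose)
++ *   32     cp0_status
++ *   33     lo
++ *   34     hi
++ *   35     cp0_badvaddr
++ *   36     cp0_cause
++ *   37     cp0_epc
++ *
++ * For a sleeping thread the scratch registers 24..27 are simply
++ * reported as zero below.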
++ */ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ int reg; ++ struct thread_info *ti = p->thread_info; ++ unsigned long ksp = (unsigned long)ti + THREAD_SIZE - 32; ++ struct pt_regs *regs = (struct pt_regs *)ksp - 1; ++ gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs; ++ ++ for (reg = 0; reg < 16; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ /* S0 - S7 */ ++ for (reg = 16; reg < 24; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ for (reg = 24; reg < 28; reg++) ++ *(ptr++) = 0; ++ ++ /* GP, SP, FP, RA */ ++ for (reg = 28; reg < 32; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ *(ptr++) = regs->cp0_status; ++ *(ptr++) = regs->lo; ++ *(ptr++) = regs->hi; ++ *(ptr++) = regs->cp0_badvaddr; ++ *(ptr++) = regs->cp0_cause; ++ *(ptr++) = regs->cp0_epc; ++ ++ return; ++} ++ ++/* ++ * Calls linux_debug_hook before the kernel dies. If KGDB is enabled, ++ * then try to fall into the debugger ++ */ ++static int kgdb_mips_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = (struct die_args *)ptr; ++ struct pt_regs *regs = args->regs; ++ int trap = (regs->cp0_cause & 0x7c) >> 2; ++ ++ /* See if KGDB is interested. */ ++ if (user_mode(regs)) ++ /* Userpace events, ignore. */ ++ return NOTIFY_DONE; ++ ++ kgdb_handle_exception(trap, compute_signal(trap), 0, regs); ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_mips_notify, ++}; ++ ++/* ++ * Handle the 's' and 'c' commands ++ */ ++int kgdb_arch_handle_exception(int vector, int signo, int err_code, ++ char *remcom_in_buffer, char *remcom_out_buffer, ++ struct pt_regs *regs) ++{ ++ char *ptr; ++ unsigned long address; ++ int cpu = smp_processor_id(); ++ ++ switch (remcom_in_buffer[0]) { ++ case 's': ++ case 'c': ++ /* handle the optional parameter */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &address)) ++ regs->cp0_epc = address; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ if (remcom_in_buffer[0] == 's') ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, cpu); ++ ++ return 0; ++ } ++ ++ return -1; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++#ifdef CONFIG_CPU_LITTLE_ENDIAN ++ .gdb_bpt_instr = {0xd}, ++#else ++ .gdb_bpt_instr = {0x00, 0x00, 0x00, 0x0d}, ++#endif ++}; ++ ++/* ++ * We use kgdb_early_setup so that functions we need to call now don't ++ * cause trouble when called again later. ++ */ ++int kgdb_arch_init(void) ++{ ++ /* Board-specifics. */ ++ /* Force some calls to happen earlier. */ ++ if (kgdb_early_setup == 0) { ++ trap_init(); ++ init_IRQ(); ++ kgdb_early_setup = 1; ++ } ++ ++ /* Set our traps. */ ++ /* This needs to be done more finely grained again, paired in ++ * a before/after in kgdb_handle_exception(...) -- Tom */ ++ set_debug_traps(); ++ notifier_chain_register(&mips_die_chain, &kgdb_notifier); ++ ++ return 0; ++} +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/kgdb_handler.S linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb_handler.S +--- linux-2.6.18-53.1.14/arch/mips/kernel/kgdb_handler.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/kgdb_handler.S 2008-06-10 15:38:24.000000000 +0400 +@@ -0,0 +1,57 @@ ++/* ++ * arch/mips/kernel/kgdb_handler.S ++ * ++ * Copyright (C) 2004-2005 MontaVista Software Inc. ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * version 2. 
This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++/* ++ * Trap Handler for the new KGDB framework. The main KGDB handler is ++ * handle_exception that will be called from here ++ * ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++ .align 5 ++ NESTED(trap_low, PT_SIZE, sp) ++ .set noat ++ .set noreorder ++ ++ /* ++ * Check for privileged instructions in user mode. For ++ * this, check the cu0 bit in the CPU status register. ++ */ ++ mfc0 k0, CP0_STATUS ++ sll k0, 3 ++ bltz k0, 1f ++ move k1, sp ++ ++ /* ++ * GDB userland from within KGDB. If a user mode address ++ * then jump to the saved exception handler ++ */ ++ mfc0 k1, CP0_CAUSE ++ andi k1, k1, 0x7c ++ PTR_L k0, saved_vectors(k1) ++ jr k0 ++ nop ++1: ++ SAVE_ALL ++ .set at ++ .set reorder ++ move a0, sp ++ jal handle_exception ++ j ret_from_exception ++ END(trap_low) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/traps.c +--- linux-2.6.18-53.1.14/arch/mips/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/kernel/traps.c 2008-06-10 15:38:24.000000000 +0400 +@@ -10,6 +10,8 @@ + * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com + * Copyright (C) 2000, 01 MIPS Technologies, Inc. + * Copyright (C) 2002, 2003, 2004, 2005 Maciej W. Rozycki ++ * ++ * KGDB specific changes - Manish Lachwani (mlachwani@mvista.com) + */ + #include + #include +@@ -20,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -40,6 +43,7 @@ + #include + #include + #include ++#include + + extern asmlinkage void handle_int(void); + extern asmlinkage void handle_tlbm(void); +@@ -78,6 +82,21 @@ void (*board_bind_eic_interrupt)(int irq + */ + #define MODULE_RANGE (8*1024*1024) + ++struct notifier_block *mips_die_chain; ++static spinlock_t die_notifier_lock = SPIN_LOCK_UNLOCKED; ++ ++int register_die_notifier(struct notifier_block *nb) ++{ ++ int err = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&die_notifier_lock, flags); ++ err = notifier_chain_register(&mips_die_chain, nb); ++ spin_unlock_irqrestore(&die_notifier_lock, flags); ++ ++ return err; ++} ++ + /* + * This routine abuses get_user()/put_user() to reference pointers + * with at least a bit of error checking ... 
+@@ -1387,6 +1406,11 @@ void __init trap_init(void) + extern char except_vec4; + unsigned long i; + ++#if defined(CONFIG_KGDB) ++ if (kgdb_early_setup) ++ return; /* Already done */ ++#endif ++ + if (cpu_has_veic || cpu_has_vint) + ebase = (unsigned long) alloc_bootmem_low_pages (0x200 + VECTORSPACING*64); + else +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/Makefile +--- linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/Makefile 2008-06-10 15:38:24.000000000 +0400 +@@ -21,6 +21,5 @@ + obj-y := reset.o display.o init.o memory.o printf.o \ + cmdline.o time.o + obj-$(CONFIG_PCI) += pci.o +-obj-$(CONFIG_KGDB) += gdb_hook.o + + EXTRA_AFLAGS := $(CFLAGS) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/init.c linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/init.c +--- linux-2.6.18-53.1.14/arch/mips/mips-boards/generic/init.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/generic/init.c 2008-06-10 15:38:24.000000000 +0400 +@@ -37,15 +37,6 @@ + + #include + +-#ifdef CONFIG_KGDB +-extern int rs_kgdb_hook(int, int); +-extern int rs_putDebugChar(char); +-extern char rs_getDebugChar(void); +-extern int saa9730_kgdb_hook(int); +-extern int saa9730_putDebugChar(char); +-extern char saa9730_getDebugChar(void); +-#endif +- + int prom_argc; + int *_prom_argv, *_prom_envp; + +@@ -172,58 +163,6 @@ static void __init console_config(void) + } + #endif + +-#ifdef CONFIG_KGDB +-void __init kgdb_config (void) +-{ +- extern int (*generic_putDebugChar)(char); +- extern char (*generic_getDebugChar)(void); +- char *argptr; +- int line, speed; +- +- argptr = prom_getcmdline(); +- if ((argptr = strstr(argptr, "kgdb=ttyS")) != NULL) { +- argptr += strlen("kgdb=ttyS"); +- if (*argptr != '0' && *argptr != '1') +- printk("KGDB: Unknown serial line /dev/ttyS%c, " +- "falling back to /dev/ttyS1\n", *argptr); +- line = *argptr == '0' ? 0 : 1; +- printk("KGDB: Using serial line /dev/ttyS%d for session\n", line); +- +- speed = 0; +- if (*++argptr == ',') +- { +- int c; +- while ((c = *++argptr) && ('0' <= c && c <= '9')) +- speed = speed * 10 + c - '0'; +- } +-#ifdef CONFIG_MIPS_ATLAS +- if (line == 1) { +- speed = saa9730_kgdb_hook(speed); +- generic_putDebugChar = saa9730_putDebugChar; +- generic_getDebugChar = saa9730_getDebugChar; +- } +- else +-#endif +- { +- speed = rs_kgdb_hook(line, speed); +- generic_putDebugChar = rs_putDebugChar; +- generic_getDebugChar = rs_getDebugChar; +- } +- +- prom_printf("KGDB: Using serial line /dev/ttyS%d at %d for session, " +- "please connect your debugger\n", line ? 
1 : 0, speed); +- +- { +- char *s; +- for (s = "Please connect GDB to this port\r\n"; *s; ) +- generic_putDebugChar (*s++); +- } +- +- /* Breakpoint is invoked after interrupts are initialised */ +- } +-} +-#endif +- + void __init mips_nmi_setup (void) + { + void *base; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mips-boards/malta/malta_setup.c linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/malta/malta_setup.c +--- linux-2.6.18-53.1.14/arch/mips/mips-boards/malta/malta_setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mips-boards/malta/malta_setup.c 2008-06-10 15:38:24.000000000 +0400 +@@ -46,10 +46,6 @@ extern void mips_reboot_setup(void); + extern void mips_time_init(void); + extern unsigned long mips_rtc_get_time(void); + +-#ifdef CONFIG_KGDB +-extern void kgdb_config(void); +-#endif +- + struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY }, + { .name = "timer", .start = 0x40, .end = 0x5f, .flags = IORESOURCE_BUSY }, +@@ -124,10 +120,6 @@ void __init plat_mem_setup(void) + */ + enable_dma(4); + +-#ifdef CONFIG_KGDB +- kgdb_config (); +-#endif +- + if ((mips_revision_corid == MIPS_REVISION_CORID_BONITO64) || + (mips_revision_corid == MIPS_REVISION_CORID_CORE_20K) || + (mips_revision_corid == MIPS_REVISION_CORID_CORE_EMUL_BON)) { +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/mips/mm/extable.c +--- linux-2.6.18-53.1.14/arch/mips/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/mm/extable.c 2008-06-10 15:38:24.000000000 +0400 +@@ -3,6 +3,7 @@ + */ + #include + #include ++#include + #include + #include + +@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs + + return 1; + } ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. */ ++#endif + + return 0; + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/cfe/setup.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/cfe/setup.c +--- linux-2.6.18-53.1.14/arch/mips/sibyte/cfe/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/cfe/setup.c 2008-06-10 15:38:24.000000000 +0400 +@@ -58,10 +58,6 @@ int cfe_cons_handle; + extern unsigned long initrd_start, initrd_end; + #endif + +-#ifdef CONFIG_KGDB +-extern int kgdb_port; +-#endif +- + static void ATTRIB_NORET cfe_linux_exit(void *arg) + { + int warm = *(int *)arg; +@@ -242,9 +238,6 @@ void __init prom_init(void) + int argc = fw_arg0; + char **envp = (char **) fw_arg2; + int *prom_vec = (int *) fw_arg3; +-#ifdef CONFIG_KGDB +- char *arg; +-#endif + + _machine_restart = cfe_linux_restart; + _machine_halt = cfe_linux_halt; +@@ -308,13 +301,6 @@ void __init prom_init(void) + } + } + +-#ifdef CONFIG_KGDB +- if ((arg = strstr(arcs_cmdline,"kgdb=duart")) != NULL) +- kgdb_port = (arg[10] == '0') ? 
0 : 1; +- else +- kgdb_port = 1; +-#endif +- + #ifdef CONFIG_BLK_DEV_INITRD + { + char *ptr; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/Makefile +--- linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/Makefile 2008-06-10 15:38:24.000000000 +0400 +@@ -4,5 +4,6 @@ obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_SIBYTE_TBPROF) += bcm1250_tbprof.o + obj-$(CONFIG_SIBYTE_STANDALONE) += prom.o + obj-$(CONFIG_SIBYTE_BUS_WATCHER) += bus_watcher.o ++obj-$(CONFIG_KGDB_SIBYTE) += kgdb_sibyte.o + + EXTRA_AFLAGS := $(CFLAGS) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/irq.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/irq.c +--- linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/irq.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/irq.c 2008-06-10 15:38:24.000000000 +0400 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -59,16 +60,6 @@ static void sb1250_set_affinity(unsigned + extern unsigned long ldt_eoi_space; + #endif + +-#ifdef CONFIG_KGDB +-static int kgdb_irq; +- +-/* Default to UART1 */ +-int kgdb_port = 1; +-#ifdef CONFIG_SIBYTE_SB1250_DUART +-extern char sb1250_duart_present[]; +-#endif +-#endif +- + static struct irq_chip sb1250_irq_type = { + .typename = "SB1250-IMR", + .startup = startup_sb1250_irq, +@@ -324,6 +315,11 @@ void __init arch_init_irq(void) + unsigned int imask = STATUSF_IP4 | STATUSF_IP3 | STATUSF_IP2 | + STATUSF_IP1 | STATUSF_IP0; + ++#ifdef CONFIG_KGDB ++ if (kgdb_early_setup) ++ return; ++#endif ++ + /* Default everything to IP2 */ + for (i = 0; i < SB1250_NR_IRQS; i++) { /* was I0 */ + __raw_writeq(IMR_IP2_VAL, +@@ -375,50 +371,6 @@ void __init arch_init_irq(void) + /* Enable necessary IPs, disable the rest */ + change_c0_status(ST0_IM, imask); + +-#ifdef CONFIG_KGDB +- if (kgdb_flag) { +- kgdb_irq = K_INT_UART_0 + kgdb_port; +- +-#ifdef CONFIG_SIBYTE_SB1250_DUART +- sb1250_duart_present[kgdb_port] = 0; +-#endif +- /* Setup uart 1 settings, mapper */ +- __raw_writeq(M_DUART_IMR_BRK, +- IOADDR(A_DUART_IMRREG(kgdb_port))); +- +- sb1250_steal_irq(kgdb_irq); +- __raw_writeq(IMR_IP6_VAL, +- IOADDR(A_IMR_REGISTER(0, +- R_IMR_INTERRUPT_MAP_BASE) + +- (kgdb_irq << 3))); +- sb1250_unmask_irq(0, kgdb_irq); +- } +-#endif +-} +- +-#ifdef CONFIG_KGDB +- +-#include +- +-#define duart_out(reg, val) csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +-#define duart_in(reg) csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +- +-static void sb1250_kgdb_interrupt(struct pt_regs *regs) +-{ +- /* +- * Clear break-change status (allow some time for the remote +- * host to stop the break, since we would see another +- * interrupt on the end-of-break too) +- */ +- kstat_this_cpu.irqs[kgdb_irq]++; +- mdelay(500); +- duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT | +- M_DUART_RX_EN | M_DUART_TX_EN); +- set_async_breakpoint(®s->cp0_epc); +-} +- +-#endif /* CONFIG_KGDB */ +- + static inline int dclz(unsigned long long x) + { + int lz; +@@ -473,7 +425,7 @@ asmlinkage void plat_irq_dispatch(struct + sb1250_mailbox_interrupt(regs); + #endif + +-#ifdef CONFIG_KGDB ++#ifdef CONFIG_KGDB_SIBYTE + else if (pending & CAUSEF_IP6) /* KGDB (uart 1) */ + sb1250_kgdb_interrupt(regs); + #endif +diff -rupbBN -X ../client-cleanup/dontdiff 
linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/kgdb_sibyte.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c +--- linux-2.6.18-53.1.14/arch/mips/sibyte/sb1250/kgdb_sibyte.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c 2008-06-10 15:38:24.000000000 +0400 +@@ -0,0 +1,164 @@ ++/* ++ * arch/mips/sibyte/sb1250/kgdb_sibyte.c ++ * ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * 2004 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++ ++/* ++ * Support for KGDB on the Broadcom Sibyte. The SWARM board ++ * for example does not have a 8250/16550 compatible serial ++ * port. Hence, we need to have a driver for the serial ++ * ports to handle KGDB. This board needs nothing in addition ++ * to what is normally provided by the gdb portion of the stub. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int kgdb_port = 1; ++static int kgdb_irq; ++ ++extern char sb1250_duart_present[]; ++extern int sb1250_steal_irq(int irq); ++ ++/* Forward declarations. */ ++static void kgdbsibyte_init_duart(void); ++static int kgdb_init_io(void); ++ ++#define IMR_IP6_VAL K_INT_MAP_I4 ++#define duart_out(reg, val) csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg))) ++#define duart_in(reg) csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg))) ++ ++static void kgdb_swarm_write_char(int c) ++{ ++ while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0) ; ++ duart_out(R_DUART_TX_HOLD, c); ++} ++ ++static int kgdb_swarm_read_char(void) ++{ ++ int ret_char; ++ unsigned int status; ++ ++ status = duart_in(R_DUART_STATUS); ++ while ((status & M_DUART_RX_RDY) == 0) { ++ status = duart_in(R_DUART_STATUS); ++ } ++ ++ /* ++ * Check for framing error ++ */ ++ if (status & M_DUART_FRM_ERR) { ++ kgdbsibyte_init_duart(); ++ kgdb_swarm_write_char('-'); ++ return '-'; ++ } ++ ++ ret_char = duart_in(R_DUART_RX_HOLD); ++ ++ return ret_char; ++} ++ ++void sb1250_kgdb_interrupt(struct pt_regs *regs) ++{ ++ int kgdb_irq = K_INT_UART_0 + kgdb_port; ++ /* ++ * Clear break-change status (allow some time for the remote ++ * host to stop the break, since we would see another ++ * interrupt on the end-of-break too) ++ */ ++ kstat_this_cpu.irqs[kgdb_irq]++; ++ mdelay(500); ++ duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT | ++ M_DUART_RX_EN | M_DUART_TX_EN); ++ if (kgdb_io_ops.init != kgdb_init_io) { ++ /* Throw away the data if another I/O routine is ++ * active. ++ */ ++ unsigned int status; ++ ++ status = duart_in(R_DUART_STATUS); ++ while ((status & M_DUART_RX_RDY) == 0) { ++ status = duart_in(R_DUART_STATUS); ++ } ++ /* ++ * Check for framing error ++ */ ++ if (status & M_DUART_FRM_ERR) { ++ kgdbsibyte_init_duart(); ++ } ++ duart_in(R_DUART_RX_HOLD); ++ } else ++ breakpoint(); ++ ++} ++ ++/* ++ * We use port #1 and we set it for 115200 BAUD, 8n1. ++ */ ++static void kgdbsibyte_init_duart(void) ++{ ++ /* Set 8n1. */ ++ duart_out(R_DUART_MODE_REG_1, ++ V_DUART_BITS_PER_CHAR_8 | V_DUART_PARITY_MODE_NONE); ++ duart_out(R_DUART_MODE_REG_2, M_DUART_STOP_BIT_LEN_1); ++ /* Set baud rate of 115200. 
*/ ++ duart_out(R_DUART_CLK_SEL, V_DUART_BAUD_RATE(115200)); ++ /* Enable rx and tx */ ++ duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN); ++} ++ ++static int kgdb_init_io(void) ++{ ++#ifdef CONFIG_SIBYTE_SB1250_DUART ++ sb1250_duart_present[kgdb_port] = 0; ++#endif ++ ++ kgdbsibyte_init_duart(); ++ ++ return 0; ++} ++ ++/* ++ * Hookup our IRQ line. We will already have been initialized a ++ * this point. ++ */ ++static void __init kgdbsibyte_hookup_irq(void) ++{ ++ /* Steal the IRQ. */ ++ kgdb_irq = K_INT_UART_0 + kgdb_port; ++ ++ /* Setup uart 1 settings, mapper */ ++ __raw_writeq(M_DUART_IMR_BRK, IOADDR(A_DUART_IMRREG(kgdb_port))); ++ ++ sb1250_steal_irq(kgdb_irq); ++ ++ __raw_writeq(IMR_IP6_VAL, ++ IOADDR(A_IMR_REGISTER(0, R_IMR_INTERRUPT_MAP_BASE) + ++ (kgdb_irq << 3))); ++ ++ sb1250_unmask_irq(0, kgdb_irq); ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .read_char = kgdb_swarm_read_char, ++ .write_char = kgdb_swarm_write_char, ++ .init = kgdb_init_io, ++ .late_init = kgdbsibyte_hookup_irq, ++ .pre_exception = NULL, ++ .post_exception = NULL ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/Makefile +--- linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/Makefile 2008-06-10 15:38:24.000000000 +0400 +@@ -1,3 +1 @@ + lib-y = setup.o rtc_xicor1241.o rtc_m41t81.o +- +-lib-$(CONFIG_KGDB) += dbg_io.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/dbg_io.c linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/dbg_io.c +--- linux-2.6.18-53.1.14/arch/mips/sibyte/swarm/dbg_io.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/sibyte/swarm/dbg_io.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,76 +0,0 @@ +-/* +- * kgdb debug routines for SiByte boards. +- * +- * Copyright (C) 2001 MontaVista Software Inc. +- * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net +- * +- * This program is free software; you can redistribute it and/or modify it +- * under the terms of the GNU General Public License as published by the +- * Free Software Foundation; either version 2 of the License, or (at your +- * option) any later version. +- * +- */ +- +-/* -------------------- BEGINNING OF CONFIG --------------------- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * We use the second serial port for kgdb traffic. +- * 115200, 8, N, 1. 
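+- *
+- * Editor's note (illustrative): "115200, 8, N, 1" maps directly onto
+- * the four duart_init() arguments defined below:
+- *
+- *   duart_init(V_DUART_BAUD_RATE(115200),    (clock divisor)
+- *              V_DUART_BITS_PER_CHAR_8,      (8 data bits)
+- *              V_DUART_PARITY_MODE_NONE,     (no parity)
+- *              M_DUART_STOP_BIT_LEN_1);      (1 stop bit)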
+- */ +- +-#define BAUD_RATE 115200 +-#define CLK_DIVISOR V_DUART_BAUD_RATE(BAUD_RATE) +-#define DATA_BITS V_DUART_BITS_PER_CHAR_8 /* or 7 */ +-#define PARITY V_DUART_PARITY_MODE_NONE /* or even */ +-#define STOP_BITS M_DUART_STOP_BIT_LEN_1 /* or 2 */ +- +-static int duart_initialized = 0; /* 0: need to be init'ed by kgdb */ +- +-/* -------------------- END OF CONFIG --------------------- */ +-extern int kgdb_port; +- +-#define duart_out(reg, val) csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +-#define duart_in(reg) csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +- +-void putDebugChar(unsigned char c); +-unsigned char getDebugChar(void); +-static void +-duart_init(int clk_divisor, int data, int parity, int stop) +-{ +- duart_out(R_DUART_MODE_REG_1, data | parity); +- duart_out(R_DUART_MODE_REG_2, stop); +- duart_out(R_DUART_CLK_SEL, clk_divisor); +- +- duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN); /* enable rx and tx */ +-} +- +-void +-putDebugChar(unsigned char c) +-{ +- if (!duart_initialized) { +- duart_initialized = 1; +- duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS); +- } +- while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0); +- duart_out(R_DUART_TX_HOLD, c); +-} +- +-unsigned char +-getDebugChar(void) +-{ +- if (!duart_initialized) { +- duart_initialized = 1; +- duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS); +- } +- while ((duart_in(R_DUART_STATUS) & M_DUART_RX_RDY) == 0) ; +- return duart_in(R_DUART_RX_HOLD); +-} +- +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/mips/tx4938/common/Makefile linux-2.6.18-53.1.14.kgdb/arch/mips/tx4938/common/Makefile +--- linux-2.6.18-53.1.14/arch/mips/tx4938/common/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/mips/tx4938/common/Makefile 2008-06-10 15:38:24.000000000 +0400 +@@ -7,5 +7,5 @@ + # + + obj-y += prom.o setup.o irq.o rtc_rx5c348.o +-obj-$(CONFIG_KGDB) += dbgio.o ++obj-$(CONFIG_KGDB_8250) += dbgio.o + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/powerpc/Kconfig.debug +--- linux-2.6.18-53.1.14/arch/powerpc/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/Kconfig.debug 2008-06-10 15:38:14.000000000 +0400 +@@ -18,52 +18,9 @@ config DEBUG_STACK_USAGE + + This option will slow down process creation somewhat. + +-config DEBUGGER +- bool "Enable debugger hooks" +- depends on DEBUG_KERNEL +- help +- Include in-kernel hooks for kernel debuggers. Unless you are +- intending to debug the kernel, say N here. +- +-config KGDB +- bool "Include kgdb kernel debugger" +- depends on DEBUGGER && (BROKEN || PPC_GEN550 || 4xx) +- select DEBUG_INFO +- help +- Include in-kernel hooks for kgdb, the Linux kernel source level +- debugger. See for more information. +- Unless you are intending to debug the kernel, say N here. +- +-choice +- prompt "Serial Port" +- depends on KGDB +- default KGDB_TTYS1 +- +-config KGDB_TTYS0 +- bool "ttyS0" +- +-config KGDB_TTYS1 +- bool "ttyS1" +- +-config KGDB_TTYS2 +- bool "ttyS2" +- +-config KGDB_TTYS3 +- bool "ttyS3" +- +-endchoice +- +-config KGDB_CONSOLE +- bool "Enable serial console thru kgdb port" +- depends on KGDB && 8xx || CPM2 +- help +- If you enable this, all serial console messages will be sent +- over the gdb stub. +- If unsure, say N. 
+- + config XMON + bool "Include xmon kernel debugger" +- depends on DEBUGGER && !PPC_ISERIES ++ depends on DEBUG_KERNEL && !PPC_ISERIES + help + Include in-kernel hooks for the xmon kernel monitor/debugger. + Unless you are intending to debug the kernel, say N here. +@@ -82,6 +39,11 @@ config XMON_DEFAULT + xmon is normally disabled unless booted with 'xmon=on'. + Use 'xmon=off' to disable xmon init during runtime. + ++config DEBUGGER ++ bool ++ depends on KGDB || XMON ++ default y ++ + config IRQSTACKS + bool "Use separate kernel stacks when processing interrupts" + depends on PPC64 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/powerpc/kernel/Makefile 2008-03-06 05:54:47.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/Makefile 2008-06-10 15:38:14.000000000 +0400 +@@ -59,6 +59,7 @@ obj-$(CONFIG_PPC64) += misc_64.o dma_64 + obj-$(CONFIG_PPC_MULTIPLATFORM) += prom_init.o + obj-$(CONFIG_MODULES) += ppc_ksyms.o + obj-$(CONFIG_BOOTX_TEXT) += btext.o ++obj-$(CONFIG_KGDB) += kgdb.o + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_KPROBES) += kprobes.o + obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/powerpc/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/kgdb.c 2008-06-10 15:38:14.000000000 +0400 +@@ -0,0 +1,568 @@ ++/* ++ * arch/powerpc/kernel/kgdb.c ++ * ++ * PowerPC backend to the KGDB stub. ++ * ++ * Maintainer: Tom Rini ++ * ++ * Copied from arch/ppc/kernel/kgdb.c, updated for ppc64 ++ * ++ * Copyright (C) 1996 Paul Mackerras (setjmp/longjmp) ++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu) ++ * Copyright (C) 2003 Timesys Corporation. ++ * Copyright (C) 2004-2006 MontaVista Software, Inc. ++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com) ++ * PPC32 support restored by Vitaly Wool and ++ * Sergei Shtylyov ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * This table contains the mapping between PowerPC hardware trap types, and ++ * signals, which are primarily what GDB understands. GDB and the kernel ++ * don't always agree on values, so we use constants taken from gdb-6.2. 
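++ *
++ * Editor's worked example: a program check exception (trap 0x0700)
++ * walks the table below until tt == 0x0700 and maps to 0x05, so a
++ * kernel breakpoint is reported to the host as SIGTRAP; any trap type
++ * not listed falls through to SIGHUP in computeSignal().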
++ */
++static struct hard_trap_info
++{
++ unsigned int tt; /* Trap type code for powerpc */
++ unsigned char signo; /* Signal that we map this trap into */
++} hard_trap_info[] = {
++ { 0x0100, 0x02 /* SIGINT */ }, /* system reset */
++ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */
++ { 0x0300, 0x0b /* SIGSEGV */ }, /* data access */
++ { 0x0400, 0x0b /* SIGSEGV */ }, /* instruction access */
++ { 0x0500, 0x02 /* SIGINT */ }, /* external interrupt */
++ { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */
++ { 0x0700, 0x05 /* SIGTRAP */ }, /* program check */
++ { 0x0800, 0x08 /* SIGFPE */ }, /* fp unavailable */
++ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */
++ { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++ { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */
++#if defined(CONFIG_FSL_BOOKE)
++ { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */
++ { 0x2020, 0x08 /* SIGFPE */ }, /* spe unavailable */
++ { 0x2030, 0x08 /* SIGFPE */ }, /* spe fp data */
++ { 0x2040, 0x08 /* SIGFPE */ }, /* spe fp data */
++ { 0x2050, 0x08 /* SIGFPE */ }, /* spe fp round */
++ { 0x2060, 0x0e /* SIGILL */ }, /* performance monitor */
++ { 0x2900, 0x08 /* SIGFPE */ }, /* apu unavailable */
++ { 0x3100, 0x0e /* SIGALRM */ }, /* fixed interval timer */
++ { 0x3200, 0x02 /* SIGINT */ }, /* watchdog */
++#else
++ { 0x1000, 0x0e /* SIGALRM */ }, /* programmable interval timer */
++ { 0x1010, 0x0e /* SIGALRM */ }, /* fixed interval timer */
++ { 0x1020, 0x02 /* SIGINT */ }, /* watchdog */
++ { 0x2010, 0x08 /* SIGFPE */ }, /* fp unavailable */
++ { 0x2020, 0x08 /* SIGFPE */ }, /* ap unavailable */
++#endif
++#else
++ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */
++#if defined(CONFIG_8xx)
++ { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */
++#else
++ { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */
++ { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */
++ { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */
++#if defined(CONFIG_PPC64)
++ { 0x1200, 0x05 /* SIGILL */ }, /* system error */
++ { 0x1500, 0x04 /* SIGILL */ }, /* soft patch */
++ { 0x1600, 0x04 /* SIGILL */ }, /* maintenance */
++ { 0x1700, 0x08 /* SIGFPE */ }, /* altivec assist */
++ { 0x1800, 0x04 /* SIGILL */ }, /* thermal */
++#else
++ { 0x1400, 0x02 /* SIGINT */ }, /* SMI */
++ { 0x1600, 0x08 /* SIGFPE */ }, /* altivec assist */
++ { 0x1700, 0x04 /* SIGILL */ }, /* TAU */
++ { 0x2000, 0x05 /* SIGTRAP */ }, /* run mode */
++#endif
++#endif
++#endif
++ { 0x0000, 0x00 } /* Must be last */
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int computeSignal(unsigned int tt)
++{
++ struct hard_trap_info *ht;
++
++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++ if (ht->tt == tt)
++ return ht->signo;
++
++ return SIGHUP; /* default for things we don't know about */
++}
++
++static int kgdb_call_nmi_hook(struct pt_regs *regs)
++{
++ kgdb_nmihook(smp_processor_id(), regs);
++ return 0;
++}
++
++#ifdef CONFIG_SMP
++void kgdb_roundup_cpus(unsigned long flags)
++{
++ smp_send_debugger_break(MSG_ALL_BUT_SELF);
++}
++#endif
++
++/* KGDB functions to use existing PowerPC64 hooks. */
++static int kgdb_debugger(struct pt_regs *regs)
++{
++ return kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++}
++
++static int kgdb_breakpoint(struct pt_regs *regs)
++{
++ if (user_mode(regs))
++ return 0;
++
++ kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++ if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
++ regs->nip += 4;
++
++ return 1;
++}
++
++static int kgdb_singlestep(struct pt_regs *regs)
++{
++ struct thread_info *thread_info, *exception_thread_info;
++ if (user_mode(regs))
++ return 0;
++ /*
++ * On Book E and perhaps other processors, singlestep is handled on
++ * the critical exception stack. This causes current_thread_info()
++ * to fail, since it locates the thread_info by masking off
++ * the low bits of the current stack pointer. We work around
++ * this issue by copying the thread_info from the kernel stack
++ * before calling kgdb_handle_exception, and copying it back
++ * afterwards. On most processors the copy is avoided since
++ * exception_thread_info == thread_info.
++ */
++ thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1));
++ exception_thread_info = current_thread_info();
++
++ if (thread_info != exception_thread_info)
++ memcpy(exception_thread_info, thread_info, sizeof *thread_info);
++
++ kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++ if (thread_info != exception_thread_info)
++ memcpy(thread_info, exception_thread_info, sizeof *thread_info);
++
++ return 1;
++}
++
++int kgdb_iabr_match(struct pt_regs *regs)
++{
++ if (user_mode(regs))
++ return 0;
++
++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++ return 1;
++}
++
++int kgdb_dabr_match(struct pt_regs *regs)
++{
++ if (user_mode(regs))
++ return 0;
++
++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++ return 1;
++}
++
++#define PACK64(ptr,src) do { *(ptr++) = (src); } while(0)
++
++#define PACK32(ptr,src) do { \
++ u32 *ptr32; \
++ ptr32 = (u32 *)ptr; \
++ *(ptr32++) = (src); \
++ ptr = (unsigned long *)ptr32; \
++ } while(0)
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++ unsigned long *ptr = gdb_regs;
++ int reg;
++
++ memset(gdb_regs, 0, NUMREGBYTES);
++
++ for (reg = 0; reg < 32; reg++)
++ PACK64(ptr, regs->gpr[reg]);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++ for (reg = 0; reg < 32; reg++)
++ PACK64(ptr, current->thread.evr[reg]);
++#else
++ ptr += 32;
++#endif
++#else
++ /* fp registers not used by kernel, leave zero */
++ ptr += 32 * 8 / sizeof(long);
++#endif
++
++ PACK64(ptr, regs->nip);
++ PACK64(ptr, regs->msr);
++ PACK32(ptr, regs->ccr);
++ PACK64(ptr, regs->link);
++ PACK64(ptr, regs->ctr);
++ PACK32(ptr, regs->xer);
++
++#if 0
++ Following are in struct thread_struct, not struct pt_regs,
++ ignoring for now since kernel does not use them. Would it
++ make sense to get them from the thread that kgdb is set to?
++
++ If this code is enabled, update the definition of NUMREGBYTES to
++ include the vector registers and vector state registers.
++ ++ PACK32(ptr, current->thread->fpscr); ++ ++ /* vr registers not used by kernel, leave zero */ ++ ptr += 32 * 16 / sizeof(long); ++ ++#ifdef CONFIG_ALTIVEC ++ PACK32(ptr, current->thread->vscr); ++ PACK32(ptr, current->thread->vrsave); ++#else ++ ptr += 2 * 4 / sizeof(long); ++#endif ++#else ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ PACK32(ptr, current->thread.acc >> 32); ++ PACK32(ptr, current->thread.acc & 0xffffffff); ++ PACK64(ptr, current->thread.spefscr); ++#else ++ ptr += 2 + 1; ++#endif ++#else ++ /* fpscr not used by kernel, leave zero */ ++ PACK32(ptr, 0); ++#endif ++#endif ++ ++ BUG_ON((unsigned long)ptr > ++ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES)); ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + ++ STACK_FRAME_OVERHEAD); ++ unsigned long *ptr = gdb_regs; ++ int reg; ++ ++ memset(gdb_regs, 0, NUMREGBYTES); ++ ++ /* Regs GPR0-2 */ ++ for (reg = 0; reg < 3; reg++) ++ PACK64(ptr, regs->gpr[reg]); ++ ++ /* Regs GPR3-13 are caller saved, not in regs->gpr[] */ ++ ptr += 11; ++ ++ /* Regs GPR14-31 */ ++ for (reg = 14; reg < 32; reg++) ++ PACK64(ptr, regs->gpr[reg]); ++ ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ for (reg = 0; reg < 32; reg++) ++ PACK64(ptr, p->thread.evr[reg]); ++#else ++ ptr += 32; ++#endif ++#else ++ /* fp registers not used by kernel, leave zero */ ++ ptr += 32 * 8 / sizeof(long); ++#endif ++ PACK64(ptr, regs->nip); ++ PACK64(ptr, regs->msr); ++ PACK32(ptr, regs->ccr); ++ PACK64(ptr, regs->link); ++ PACK64(ptr, regs->ctr); ++ PACK32(ptr, regs->xer); ++ ++#if 0 ++ Following are in struct thread_struct, not struct pt_regs, ++ ignoring for now since kernel does not use them. Would it ++ make sense to get them from the thread that kgdb is set to? ++ ++ If this code is enabled, update the definition of NUMREGBYTES to ++ include the vector registers and vector state registers. 
++
++ PACK32(ptr, p->thread->fpscr);
++
++ /* vr registers not used by kernel, leave zero */
++ ptr += 32 * 16 / sizeof(long);
++
++#ifdef CONFIG_ALTIVEC
++ PACK32(ptr, p->thread->vscr);
++ PACK32(ptr, p->thread->vrsave);
++#else
++ ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++ /* u64 acc */
++ PACK32(ptr, p->thread.acc >> 32);
++ PACK32(ptr, p->thread.acc & 0xffffffff);
++ PACK64(ptr, p->thread.spefscr);
++#else
++ ptr += 2 + 1;
++#endif
++#else
++ /* fpscr not used by kernel, leave zero */
++ PACK32(ptr, 0);
++#endif
++#endif
++
++ BUG_ON((unsigned long)ptr >
++ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++#define UNPACK64(dest,ptr) do { dest = *(ptr++); } while(0)
++
++#define UNPACK32(dest,ptr) do { \
++ u32 *ptr32; \
++ ptr32 = (u32 *)ptr; \
++ dest = *(ptr32++); \
++ ptr = (unsigned long *)ptr32; \
++ } while(0)
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++ unsigned long *ptr = gdb_regs;
++ int reg;
++
++#ifdef CONFIG_SPE
++ union {
++ u32 v32[2];
++ u64 v64;
++ } acc;
++#endif
++ for (reg = 0; reg < 32; reg++)
++ UNPACK64(regs->gpr[reg], ptr);
++
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++ for (reg = 0; reg < 32; reg++)
++ UNPACK64(current->thread.evr[reg], ptr);
++#else
++ ptr += 32;
++#endif
++#else
++ /* fp registers not used by kernel, leave zero */
++ ptr += 32 * 8 / sizeof(int);
++#endif
++ UNPACK64(regs->nip, ptr);
++ UNPACK64(regs->msr, ptr);
++ UNPACK32(regs->ccr, ptr);
++ UNPACK64(regs->link, ptr);
++ UNPACK64(regs->ctr, ptr);
++ UNPACK32(regs->xer, ptr);
++
++#if 0
++ Following are in struct thread_struct, not struct pt_regs,
++ ignoring for now since kernel does not use them. Would it
++ make sense to get them from the thread that kgdb is set to?
++
++ If this code is enabled, update the definition of NUMREGBYTES to
++ include the vector registers and vector state registers.
++
++ /* fpscr, vscr, vrsave not used by kernel, leave unchanged */
++
++ UNPACK32(current->thread->fpscr, ptr);
++
++ /* vr registers not used by kernel, leave zero */
++ ptr += 32 * 16 / sizeof(long);
++
++ #ifdef CONFIG_ALTIVEC
++ UNPACK32(current->thread->vscr, ptr);
++ UNPACK32(current->thread->vrsave, ptr);
++#else
++ ptr += 2 * 4 / sizeof(long);
++#endif
++#else
++#ifdef CONFIG_FSL_BOOKE
++#ifdef CONFIG_SPE
++ /* u64 acc */
++ UNPACK32(acc.v32[0], ptr);
++ UNPACK32(acc.v32[1], ptr);
++ current->thread.acc = acc.v64;
++ UNPACK64(current->thread.spefscr, ptr);
++#else
++ ptr += 2 + 1;
++#endif
++#endif
++#endif
++
++ BUG_ON((unsigned long)ptr >
++ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES));
++}
++
++/*
++ * This function does PowerPC specific processing for interfacing to gdb.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++ char *remcom_in_buffer, char *remcom_out_buffer,
++ struct pt_regs *linux_regs)
++{
++ char *ptr = &remcom_in_buffer[1];
++ unsigned long addr;
++
++ switch (remcom_in_buffer[0]) {
++ /*
++ * sAA..AA Step one instruction from AA..AA
++ * This will return an error to gdb ..
++ */ ++ case 's': ++ case 'c': ++ /* handle the optional parameter */ ++ if (kgdb_hex2long(&ptr, &addr)) ++ linux_regs->nip = addr; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ /* set the trace bit if we're stepping */ ++ if (remcom_in_buffer[0] == 's') { ++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) ++ mtspr(SPRN_DBCR0, ++ mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); ++ linux_regs->msr |= MSR_DE; ++#else ++ linux_regs->msr |= MSR_SE; ++#endif ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ } ++ return 0; ++ } ++ ++ return -1; ++} ++ ++int kgdb_fault_setjmp(unsigned long *curr_context) ++{ ++#ifdef CONFIG_PPC32 ++ __asm__ __volatile__("mflr 0; stw 0,0(%0);\n\ ++ stw 1,4(%0); stw 2,8(%0);\n\ ++ mfcr 0; stw 0,12(%0);\n\ ++ stmw 13,16(%0)\n" : : "r" (curr_context)); ++#else ++ __asm__ __volatile__("mflr 0; std 0,0(%0)\n\ ++ std 1,8(%0)\n\ ++ std 2,16(%0)\n\ ++ mfcr 0; std 0,24(%0)\n\ ++ std 13,32(%0)\n\ ++ std 14,40(%0)\n\ ++ std 15,48(%0)\n\ ++ std 16,56(%0)\n\ ++ std 17,64(%0)\n\ ++ std 18,72(%0)\n\ ++ std 19,80(%0)\n\ ++ std 20,88(%0)\n\ ++ std 21,96(%0)\n\ ++ std 22,104(%0)\n\ ++ std 23,112(%0)\n\ ++ std 24,120(%0)\n\ ++ std 25,128(%0)\n\ ++ std 26,136(%0)\n\ ++ std 27,144(%0)\n\ ++ std 28,152(%0)\n\ ++ std 29,160(%0)\n\ ++ std 30,168(%0)\n\ ++ std 31,176(%0)\n" : : "r" (curr_context)); ++#endif ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++#ifdef CONFIG_PPC32 ++ __asm__ __volatile__("lmw 13,16(%0);\n\ ++ lwz 0,12(%0); mtcrf 0x38,0;\n\ ++ lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);\n\ ++ mtlr 0; mr 3,1\n" : : "r" (curr_context)); ++#else ++ __asm__ __volatile__("ld 13,32(%0)\n\ ++ ld 14,40(%0)\n\ ++ ld 15,48(%0)\n\ ++ ld 16,56(%0)\n\ ++ ld 17,64(%0)\n\ ++ ld 18,72(%0)\n\ ++ ld 19,80(%0)\n\ ++ ld 20,88(%0)\n\ ++ ld 21,96(%0)\n\ ++ ld 22,104(%0)\n\ ++ ld 23,112(%0)\n\ ++ ld 24,120(%0)\n\ ++ ld 25,128(%0)\n\ ++ ld 26,136(%0)\n\ ++ ld 27,144(%0)\n\ ++ ld 28,152(%0)\n\ ++ ld 29,160(%0)\n\ ++ ld 30,168(%0)\n\ ++ ld 31,176(%0)\n\ ++ ld 0,24(%0)\n\ ++ mtcrf 0x38,0\n\ ++ ld 0,0(%0)\n\ ++ ld 1,8(%0)\n\ ++ ld 2,16(%0)\n\ ++ mtlr 0\n\ ++ mr 3,1\n" : : "r" (curr_context)); ++#endif ++} ++ ++/* ++ * Global data ++ */ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, ++}; ++ ++int kgdb_not_implemented(struct pt_regs *regs) ++{ ++ return 0; ++} ++ ++int kgdb_arch_init(void) ++{ ++#ifdef CONFIG_XMON ++#error Both XMON and KGDB selected in .config. Unselect one of them. 
++#endif ++ ++ __debugger_ipi = kgdb_call_nmi_hook; ++ __debugger = kgdb_debugger; ++ __debugger_bpt = kgdb_breakpoint; ++ __debugger_sstep = kgdb_singlestep; ++ __debugger_iabr_match = kgdb_iabr_match; ++ __debugger_dabr_match = kgdb_dabr_match; ++ __debugger_fault_handler = kgdb_not_implemented; ++ ++ return 0; ++} ++ ++arch_initcall(kgdb_arch_init); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/legacy_serial.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/legacy_serial.c +--- linux-2.6.18-53.1.14/arch/powerpc/kernel/legacy_serial.c 2008-03-06 05:54:47.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/legacy_serial.c 2008-06-10 15:38:14.000000000 +0400 +@@ -11,6 +11,9 @@ + #include + #include + #include ++#ifdef CONFIG_KGDB_8250 ++#include ++#endif + + #undef DEBUG + +@@ -485,6 +488,9 @@ static int __init serial_dev_init(void) + fixup_port_pio(i, np, port); + if ((port->iotype == UPIO_MEM) || (port->iotype == UPIO_TSI)) + fixup_port_mmio(i, np, port); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_platform_port(i, port); ++#endif + } + + DBG("Registering platform serial ports\n"); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/kernel/setup_32.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/setup_32.c +--- linux-2.6.18-53.1.14/arch/powerpc/kernel/setup_32.c 2008-03-06 05:54:45.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/kernel/setup_32.c 2008-06-10 15:38:14.000000000 +0400 +@@ -45,10 +45,6 @@ + + #define DBG(fmt...) + +-#if defined CONFIG_KGDB +-#include +-#endif +- + extern void bootx_init(unsigned long r4, unsigned long phys); + + struct ide_machdep_calls ppc_ide_md; +@@ -251,18 +247,6 @@ void __init setup_arch(char **cmdline_p) + + xmon_setup(); + +-#if defined(CONFIG_KGDB) +- if (ppc_md.kgdb_map_scc) +- ppc_md.kgdb_map_scc(); +- set_debug_traps(); +- if (strstr(cmd_line, "gdb")) { +- if (ppc_md.progress) +- ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000); +- printk("kgdb breakpoint activated\n"); +- breakpoint(); +- } +-#endif +- + /* + * Set cache line size based on type of cpu as a default. + * Systems with OF can look in the properties on the cpu node(s) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/mm/fault.c +--- linux-2.6.18-53.1.14/arch/powerpc/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/mm/fault.c 2008-06-10 15:38:14.000000000 +0400 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -424,6 +425,13 @@ void bad_page_fault(struct pt_regs *regs + return; + } + ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. 
*/
++#endif
++
+ /* kernel has accessed a bad area */
+
+ printk(KERN_ALERT "Unable to handle kernel paging request for ");
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/powerpc/platforms/powermac/setup.c linux-2.6.18-53.1.14.kgdb/arch/powerpc/platforms/powermac/setup.c
+--- linux-2.6.18-53.1.14/arch/powerpc/platforms/powermac/setup.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/powerpc/platforms/powermac/setup.c 2008-06-10 15:38:14.000000000 +0400
+@@ -98,8 +98,6 @@ extern struct machdep_calls pmac_md;
+ int sccdbg;
+ #endif
+
+-extern void zs_kgdb_hook(int tty_num);
+-
+ sys_ctrler_t sys_ctrler = SYS_CTRLER_UNKNOWN;
+ EXPORT_SYMBOL(sys_ctrler);
+
+@@ -319,10 +317,6 @@ static void __init pmac_setup_arch(void)
+ l2cr_init();
+ #endif /* CONFIG_PPC32 */
+
+-#ifdef CONFIG_KGDB
+- zs_kgdb_hook(0);
+-#endif
+-
+ find_via_cuda();
+ find_via_pmu();
+ smu_init();
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/ppc/Kconfig.debug
+--- linux-2.6.18-53.1.14/arch/ppc/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/Kconfig.debug 2008-06-10 15:38:14.000000000 +0400
+@@ -2,42 +2,6 @@ menu "Kernel hacking"
+
+ source "lib/Kconfig.debug"
+
+-config KGDB
+- bool "Include kgdb kernel debugger"
+- depends on DEBUG_KERNEL && (BROKEN || PPC_GEN550 || 4xx)
+- select DEBUG_INFO
+- help
+- Include in-kernel hooks for kgdb, the Linux kernel source level
+- debugger. See for more information.
+- Unless you are intending to debug the kernel, say N here.
+-
+-choice
+- prompt "Serial Port"
+- depends on KGDB
+- default KGDB_TTYS1
+-
+-config KGDB_TTYS0
+- bool "ttyS0"
+-
+-config KGDB_TTYS1
+- bool "ttyS1"
+-
+-config KGDB_TTYS2
+- bool "ttyS2"
+-
+-config KGDB_TTYS3
+- bool "ttyS3"
+-
+-endchoice
+-
+-config KGDB_CONSOLE
+- bool "Enable serial console thru kgdb port"
+- depends on KGDB && 8xx || CPM2
+- help
+- If you enable this, all serial console messages will be sent
+- over the gdb stub.
+- If unsure, say N.
+-
+ config XMON
+ bool "Include xmon kernel debugger"
+ depends on DEBUG_KERNEL
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/kgdb.c
+--- linux-2.6.18-53.1.14/arch/ppc/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/kgdb.c 2008-06-10 15:39:34.000000000 +0400
+@@ -0,0 +1,350 @@
++/*
++ * arch/ppc/kernel/kgdb.c
++ *
++ * PowerPC backend to the KGDB stub.
++ *
++ * Maintainer: Tom Rini
++ *
++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu)
++ * Copyright (C) 2003 Timesys Corporation.
++ * 2004 (c) MontaVista Software, Inc.
++ *
++ * This file is licensed under the terms of the GNU General Public License
++ * version 2. This program is licensed "as is" without any warranty of any
++ * kind, whether express or implied.
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++/*
++ * This table contains the mapping between PowerPC hardware trap types, and
++ * signals, which are primarily what GDB understands. GDB and the kernel
++ * don't always agree on values, so we use constants taken from gdb-6.2.
++ */
++static struct hard_trap_info
++{
++ unsigned int tt; /* Trap type code for powerpc */
++ unsigned char signo; /* Signal that we map this trap into */
++} hard_trap_info[] = {
++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE)
++ { 0x0100, 0x02 /* SIGINT */ }, /* critical input interrupt */
++ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */
++ { 0x0300, 0x0b /* SIGSEGV */ }, /* data storage */
++ { 0x0400, 0x0a /* SIGBUS */ }, /* instruction storage */
++ { 0x0500, 0x02 /* SIGINT */ }, /* interrupt */
++ { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */
++ { 0x0700, 0x04 /* SIGILL */ }, /* program */
++ { 0x0800, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0900, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0a00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0b00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0c00, 0x14 /* SIGCHLD */ }, /* syscall */
++ { 0x0d00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0e00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0f00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x2002, 0x05 /* SIGTRAP */}, /* debug */
++#else
++ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */
++ { 0x0300, 0x0b /* SIGSEGV */ }, /* address error (store) */
++ { 0x0400, 0x0a /* SIGBUS */ }, /* instruction bus error */
++ { 0x0500, 0x02 /* SIGINT */ }, /* interrupt */
++ { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */
++ { 0x0700, 0x05 /* SIGTRAP */ }, /* breakpoint trap */
++ { 0x0800, 0x08 /* SIGFPE */}, /* fpu unavail */
++ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */
++ { 0x0a00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0b00, 0x04 /* SIGILL */ }, /* reserved */
++ { 0x0c00, 0x14 /* SIGCHLD */ }, /* syscall */
++ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step/watch */
++ { 0x0e00, 0x08 /* SIGFPE */ }, /* fp assist */
++#endif
++ { 0x0000, 0x000 } /* Must be last */
++};
++
++extern atomic_t cpu_doing_single_step;
++
++static int computeSignal(unsigned int tt)
++{
++ struct hard_trap_info *ht;
++
++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++)
++ if (ht->tt == tt)
++ return ht->signo;
++
++ return SIGHUP; /* default for things we don't know about */
++}
++
++/* KGDB functions to use existing PowerPC hooks. */
++static void kgdb_debugger(struct pt_regs *regs)
++{
++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);
++}
++
++static int kgdb_breakpoint(struct pt_regs *regs)
++{
++ if (user_mode(regs))
++ return 0;
++
++ kgdb_handle_exception(0, SIGTRAP, 0, regs);
++
++ if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr))
++ regs->nip += 4;
++
++ return 1;
++}
++
++static int kgdb_singlestep(struct pt_regs *regs)
++{
++ struct thread_info *thread_info, *exception_thread_info;
++
++ if (user_mode(regs))
++ return 0;
++ /*
++ * On Book E and perhaps other processors, singlestep is handled on
++ * the critical exception stack. This causes current_thread_info()
++ * to fail, since it locates the thread_info by masking off
++ * the low bits of the current stack pointer. We work around
++ * this issue by copying the thread_info from the kernel stack
++ * before calling kgdb_handle_exception, and copying it back
++ * afterwards. On most processors the copy is avoided since
++ * exception_thread_info == thread_info.
++ */ ++ thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); ++ exception_thread_info = current_thread_info(); ++ ++ if (thread_info != exception_thread_info) ++ memcpy(exception_thread_info, thread_info, sizeof *thread_info); ++ ++ kgdb_handle_exception(0, SIGTRAP, 0, regs); ++ ++ if (thread_info != exception_thread_info) ++ memcpy(thread_info, exception_thread_info, sizeof *thread_info); ++ ++ return 1; ++} ++ ++int kgdb_iabr_match(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++ return 1; ++} ++ ++int kgdb_dabr_match(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++ return 1; ++} ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ int reg; ++ unsigned long *ptr = gdb_regs; ++ ++ memset(gdb_regs, 0, MAXREG * 4); ++ ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = regs->gpr[reg]; ++ ++#ifndef CONFIG_E500 ++ for (reg = 0; reg < 64; reg++) ++ *(ptr++) = 0; ++#else ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = current->thread.evr[reg]; ++#endif ++ ++ *(ptr++) = regs->nip; ++ *(ptr++) = regs->msr; ++ *(ptr++) = regs->ccr; ++ *(ptr++) = regs->link; ++ *(ptr++) = regs->ctr; ++ *(ptr++) = regs->xer; ++ ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ *(ptr++) = (current->thread.acc >> 32); ++ *(ptr++) = (current->thread.acc & 0xffffffff); ++ *(ptr++) = current->thread.spefscr; ++#endif ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + ++ STACK_FRAME_OVERHEAD); ++ int reg; ++ unsigned long *ptr = gdb_regs; ++ ++ memset(gdb_regs, 0, MAXREG * 4); ++ ++ /* Regs GPR0-2 */ ++ for (reg = 0; reg < 3; reg++) ++ *(ptr++) = regs->gpr[reg]; ++ ++ /* Regs GPR3-13 are not saved */ ++ for (reg = 3; reg < 14; reg++) ++ *(ptr++) = 0; ++ ++ /* Regs GPR14-31 */ ++ for (reg = 14; reg < 32; reg++) ++ *(ptr++) = regs->gpr[reg]; ++ ++#ifndef CONFIG_E500 ++ for (reg = 0; reg < 64; reg++) ++ *(ptr++) = 0; ++#else ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = current->thread.evr[reg]; ++#endif ++ ++ *(ptr++) = regs->nip; ++ *(ptr++) = regs->msr; ++ *(ptr++) = regs->ccr; ++ *(ptr++) = regs->link; ++ *(ptr++) = regs->ctr; ++ *(ptr++) = regs->xer; ++ ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ *(ptr++) = (current->thread.acc >> 32); ++ *(ptr++) = (current->thread.acc & 0xffffffff); ++ *(ptr++) = current->thread.spefscr; ++#endif ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ int reg; ++ unsigned long *ptr = gdb_regs; ++#ifdef CONFIG_SPE ++ union { ++ u32 v32[2]; ++ u64 v64; ++ } u; ++#endif ++ ++ for (reg = 0; reg < 32; reg++) ++ regs->gpr[reg] = *(ptr++); ++ ++#ifndef CONFIG_E500 ++ for (reg = 0; reg < 64; reg++) ++ ptr++; ++#else ++ for (reg = 0; reg < 32; reg++) ++ current->thread.evr[reg] = *(ptr++); ++#endif ++ ++ regs->nip = *(ptr++); ++ regs->msr = *(ptr++); ++ regs->ccr = *(ptr++); ++ regs->link = *(ptr++); ++ regs->ctr = *(ptr++); ++ regs->xer = *(ptr++); ++ ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ u.v32[0] = *(ptr++); ++ u.v32[1] = *(ptr++); ++ current->thread.acc = u.v64; ++ current->thread.spefscr = *(ptr++); ++#endif ++} ++ ++/* ++ * Save/restore state in case a memory access causes a fault. 
++ */
++int kgdb_fault_setjmp(unsigned long *curr_context)
++{
++ __asm__ __volatile__("mflr 0; stw 0,0(%0);"
++ "stw 1,4(%0); stw 2,8(%0);"
++ "mfcr 0; stw 0,12(%0);"
++ "stmw 13,16(%0)"::"r"(curr_context));
++ return 0;
++}
++
++void kgdb_fault_longjmp(unsigned long *curr_context)
++{
++ __asm__ __volatile__("lmw 13,16(%0);"
++ "lwz 0,12(%0); mtcrf 0x38,0;"
++ "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);"
++ "mtlr 0; mr 3,1"::"r"(curr_context));
++}
++
++/*
++ * This function does PowerPC specific processing for interfacing to gdb.
++ */
++int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++ char *remcom_in_buffer, char *remcom_out_buffer,
++ struct pt_regs *linux_regs)
++{
++ char *ptr = &remcom_in_buffer[1];
++ unsigned long addr;
++
++ switch (remcom_in_buffer[0])
++ {
++ /*
++ * sAA..AA Step one instruction from AA..AA
++ * This will return an error to gdb ..
++ */
++ case 's':
++ case 'c':
++ /* handle the optional parameter */
++ if (kgdb_hex2long (&ptr, &addr))
++ linux_regs->nip = addr;
++
++ atomic_set(&cpu_doing_single_step, -1);
++ /* set the trace bit if we're stepping */
++ if (remcom_in_buffer[0] == 's') {
++#if defined (CONFIG_40x) || defined(CONFIG_BOOKE)
++ mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) |
++ DBCR0_IC | DBCR0_IDM);
++ linux_regs->msr |= MSR_DE;
++#else
++ linux_regs->msr |= MSR_SE;
++#endif
++ debugger_step = 1;
++ if (kgdb_contthread)
++ atomic_set(&cpu_doing_single_step,
++ smp_processor_id());
++ }
++ return 0;
++ }
++
++ return -1;
++}
++
++/*
++ * Global data
++ */
++struct kgdb_arch arch_kgdb_ops = {
++ .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08},
++};
++
++int kgdb_arch_init(void)
++{
++ debugger = kgdb_debugger;
++ debugger_bpt = kgdb_breakpoint;
++ debugger_sstep = kgdb_singlestep;
++ debugger_iabr_match = kgdb_iabr_match;
++ debugger_dabr_match = kgdb_dabr_match;
++
++ return 0;
++}
++
++arch_initcall(kgdb_arch_init);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/kernel/ppc-stub.c linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/ppc-stub.c
+--- linux-2.6.18-53.1.14/arch/ppc/kernel/ppc-stub.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/ppc-stub.c 1970-01-01 03:00:00.000000000 +0300
+@@ -1,866 +0,0 @@
+-/*
+- * ppc-stub.c: KGDB support for the Linux kernel.
+- *
+- * adapted from arch/sparc/kernel/sparc-stub.c for the PowerPC
+- * some stuff borrowed from Paul Mackerras' xmon
+- * Copyright (C) 1998 Michael AK Tesch (tesch@cs.wisc.edu)
+- *
+- * Modifications to run under Linux
+- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
+- *
+- * This file originally came from the gdb sources, and the
+- * copyright notices have been retained below.
+- */
+-
+-/****************************************************************************
+-
+- THIS SOFTWARE IS NOT COPYRIGHTED
+-
+- HP offers the following for use in the public domain. HP makes no
+- warranty with regard to the software or its performance and the
+- user accepts the software "AS IS" with all faults.
+-
+- HP DISCLAIMS ANY WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD
+- TO THIS SOFTWARE INCLUDING BUT NOT LIMITED TO THE WARRANTIES
+- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE.
+- +-****************************************************************************/ +- +-/**************************************************************************** +- * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ +- * +- * Module name: remcom.c $ +- * Revision: 1.34 $ +- * Date: 91/03/09 12:29:49 $ +- * Contributor: Lake Stevens Instrument Division$ +- * +- * Description: low level support for gdb debugger. $ +- * +- * Considerations: only works on target hardware $ +- * +- * Written by: Glenn Engel $ +- * ModuleState: Experimental $ +- * +- * NOTES: See Below $ +- * +- * Modified for SPARC by Stu Grossman, Cygnus Support. +- * +- * This code has been extensively tested on the Fujitsu SPARClite demo board. +- * +- * To enable debugger support, two things need to happen. One, a +- * call to set_debug_traps() is necessary in order to allow any breakpoints +- * or error conditions to be properly intercepted and reported to gdb. +- * Two, a breakpoint needs to be generated to begin communication. This +- * is most easily accomplished by a call to breakpoint(). Breakpoint() +- * simulates a breakpoint by executing a trap #1. +- * +- ************* +- * +- * The following gdb commands are supported: +- * +- * command function Return value +- * +- * g return the value of the CPU registers hex data or ENN +- * G set the value of the CPU registers OK or ENN +- * qOffsets Get section offsets. Reply is Text=xxx;Data=yyy;Bss=zzz +- * +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN +- * +- * c Resume at current address SNN ( signal NN) +- * cAA..AA Continue at address AA..AA SNN +- * +- * s Step one instruction SNN +- * sAA..AA Step one instruction from AA..AA SNN +- * +- * k kill +- * +- * ? What was the last sigval ? SNN (signal NN) +- * +- * bBB..BB Set baud rate to BB..BB OK or BNN, then sets +- * baud rate +- * +- * All commands and responses are sent with a packet which includes a +- * checksum. A packet consists of +- * +- * $#. +- * +- * where +- * :: +- * :: > +- * +- * When a packet is received, it is first acknowledged with either '+' or '-'. +- * '+' indicates a successful transfer. '-' indicates a failed transfer. 
+- * +- * Example: +- * +- * Host: Reply: +- * $m0,10#2a +$00010203040506070809101112131415#42 +- * +- ****************************************************************************/ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-void breakinst(void); +- +-/* +- * BUFMAX defines the maximum number of characters in inbound/outbound buffers +- * at least NUMREGBYTES*2 are needed for register packets +- */ +-#define BUFMAX 2048 +-static char remcomInBuffer[BUFMAX]; +-static char remcomOutBuffer[BUFMAX]; +- +-static int initialized; +-static int kgdb_active; +-static int kgdb_started; +-static u_int fault_jmp_buf[100]; +-static int kdebug; +- +- +-static const char hexchars[]="0123456789abcdef"; +- +-/* Place where we save old trap entries for restoration - sparc*/ +-/* struct tt_entry kgdb_savettable[256]; */ +-/* typedef void (*trapfunc_t)(void); */ +- +-static void kgdb_fault_handler(struct pt_regs *regs); +-static int handle_exception (struct pt_regs *regs); +- +-#if 0 +-/* Install an exception handler for kgdb */ +-static void exceptionHandler(int tnum, unsigned int *tfunc) +-{ +- /* We are dorking with a live trap table, all irqs off */ +-} +-#endif +- +-int +-kgdb_setjmp(long *buf) +-{ +- asm ("mflr 0; stw 0,0(%0);" +- "stw 1,4(%0); stw 2,8(%0);" +- "mfcr 0; stw 0,12(%0);" +- "stmw 13,16(%0)" +- : : "r" (buf)); +- /* XXX should save fp regs as well */ +- return 0; +-} +-void +-kgdb_longjmp(long *buf, int val) +-{ +- if (val == 0) +- val = 1; +- asm ("lmw 13,16(%0);" +- "lwz 0,12(%0); mtcrf 0x38,0;" +- "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);" +- "mtlr 0; mr 3,%1" +- : : "r" (buf), "r" (val)); +-} +-/* Convert ch from a hex digit to an int */ +-static int +-hex(unsigned char ch) +-{ +- if (ch >= 'a' && ch <= 'f') +- return ch-'a'+10; +- if (ch >= '0' && ch <= '9') +- return ch-'0'; +- if (ch >= 'A' && ch <= 'F') +- return ch-'A'+10; +- return -1; +-} +- +-/* Convert the memory pointed to by mem into hex, placing result in buf. +- * Return a pointer to the last char put in buf (null), in case of mem fault, +- * return 0. +- */ +-static unsigned char * +-mem2hex(const char *mem, char *buf, int count) +-{ +- unsigned char ch; +- unsigned short tmp_s; +- unsigned long tmp_l; +- +- if (kgdb_setjmp((long*)fault_jmp_buf) == 0) { +- debugger_fault_handler = kgdb_fault_handler; +- +- /* Accessing 16 bit and 32 bit objects in a single +- ** load instruction is required to avoid bad side +- ** effects for some IO registers. 
+- */ +- +- if ((count == 2) && (((long)mem & 1) == 0)) { +- tmp_s = *(unsigned short *)mem; +- mem += 2; +- *buf++ = hexchars[(tmp_s >> 12) & 0xf]; +- *buf++ = hexchars[(tmp_s >> 8) & 0xf]; +- *buf++ = hexchars[(tmp_s >> 4) & 0xf]; +- *buf++ = hexchars[tmp_s & 0xf]; +- +- } else if ((count == 4) && (((long)mem & 3) == 0)) { +- tmp_l = *(unsigned int *)mem; +- mem += 4; +- *buf++ = hexchars[(tmp_l >> 28) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 24) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 20) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 16) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 12) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 8) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 4) & 0xf]; +- *buf++ = hexchars[tmp_l & 0xf]; +- +- } else { +- while (count-- > 0) { +- ch = *mem++; +- *buf++ = hexchars[ch >> 4]; +- *buf++ = hexchars[ch & 0xf]; +- } +- } +- +- } else { +- /* error condition */ +- } +- debugger_fault_handler = NULL; +- *buf = 0; +- return buf; +-} +- +-/* convert the hex array pointed to by buf into binary to be placed in mem +- * return a pointer to the character AFTER the last byte written. +-*/ +-static char * +-hex2mem(char *buf, char *mem, int count) +-{ +- unsigned char ch; +- int i; +- char *orig_mem; +- unsigned short tmp_s; +- unsigned long tmp_l; +- +- orig_mem = mem; +- +- if (kgdb_setjmp((long*)fault_jmp_buf) == 0) { +- debugger_fault_handler = kgdb_fault_handler; +- +- /* Accessing 16 bit and 32 bit objects in a single +- ** store instruction is required to avoid bad side +- ** effects for some IO registers. +- */ +- +- if ((count == 2) && (((long)mem & 1) == 0)) { +- tmp_s = hex(*buf++) << 12; +- tmp_s |= hex(*buf++) << 8; +- tmp_s |= hex(*buf++) << 4; +- tmp_s |= hex(*buf++); +- +- *(unsigned short *)mem = tmp_s; +- mem += 2; +- +- } else if ((count == 4) && (((long)mem & 3) == 0)) { +- tmp_l = hex(*buf++) << 28; +- tmp_l |= hex(*buf++) << 24; +- tmp_l |= hex(*buf++) << 20; +- tmp_l |= hex(*buf++) << 16; +- tmp_l |= hex(*buf++) << 12; +- tmp_l |= hex(*buf++) << 8; +- tmp_l |= hex(*buf++) << 4; +- tmp_l |= hex(*buf++); +- +- *(unsigned long *)mem = tmp_l; +- mem += 4; +- +- } else { +- for (i=0; i# */ +-static void +-getpacket(char *buffer) +-{ +- unsigned char checksum; +- unsigned char xmitcsum; +- int i; +- int count; +- unsigned char ch; +- +- do { +- /* wait around for the start character, ignore all other +- * characters */ +- while ((ch = (getDebugChar() & 0x7f)) != '$') ; +- +- checksum = 0; +- xmitcsum = -1; +- +- count = 0; +- +- /* now, read until a # or end of buffer is found */ +- while (count < BUFMAX) { +- ch = getDebugChar() & 0x7f; +- if (ch == '#') +- break; +- checksum = checksum + ch; +- buffer[count] = ch; +- count = count + 1; +- } +- +- if (count >= BUFMAX) +- continue; +- +- buffer[count] = 0; +- +- if (ch == '#') { +- xmitcsum = hex(getDebugChar() & 0x7f) << 4; +- xmitcsum |= hex(getDebugChar() & 0x7f); +- if (checksum != xmitcsum) +- putDebugChar('-'); /* failed checksum */ +- else { +- putDebugChar('+'); /* successful transfer */ +- /* if a sequence char is present, reply the ID */ +- if (buffer[2] == ':') { +- putDebugChar(buffer[0]); +- putDebugChar(buffer[1]); +- /* remove sequence chars from buffer */ +- count = strlen(buffer); +- for (i=3; i <= count; i++) +- buffer[i-3] = buffer[i]; +- } +- } +- } +- } while (checksum != xmitcsum); +-} +- +-/* send the packet in buffer. */ +-static void putpacket(unsigned char *buffer) +-{ +- unsigned char checksum; +- int count; +- unsigned char ch, recv; +- +- /* $#. 
*/ +- do { +- putDebugChar('$'); +- checksum = 0; +- count = 0; +- +- while ((ch = buffer[count])) { +- putDebugChar(ch); +- checksum += ch; +- count += 1; +- } +- +- putDebugChar('#'); +- putDebugChar(hexchars[checksum >> 4]); +- putDebugChar(hexchars[checksum & 0xf]); +- recv = getDebugChar(); +- } while ((recv & 0x7f) != '+'); +-} +- +-static void kgdb_flush_cache_all(void) +-{ +- flush_instruction_cache(); +-} +- +-/* Set up exception handlers for tracing and breakpoints +- * [could be called kgdb_init()] +- */ +-void set_debug_traps(void) +-{ +-#if 0 +- unsigned char c; +- +- save_and_cli(flags); +- +- /* In case GDB is started before us, ack any packets (presumably +- * "$?#xx") sitting there. +- * +- * I've found this code causes more problems than it solves, +- * so that's why it's commented out. GDB seems to work fine +- * now starting either before or after the kernel -bwb +- */ +- +- while((c = getDebugChar()) != '$'); +- while((c = getDebugChar()) != '#'); +- c = getDebugChar(); /* eat first csum byte */ +- c = getDebugChar(); /* eat second csum byte */ +- putDebugChar('+'); /* ack it */ +-#endif +- debugger = kgdb; +- debugger_bpt = kgdb_bpt; +- debugger_sstep = kgdb_sstep; +- debugger_iabr_match = kgdb_iabr_match; +- debugger_dabr_match = kgdb_dabr_match; +- +- initialized = 1; +-} +- +-static void kgdb_fault_handler(struct pt_regs *regs) +-{ +- kgdb_longjmp((long*)fault_jmp_buf, 1); +-} +- +-int kgdb_bpt(struct pt_regs *regs) +-{ +- return handle_exception(regs); +-} +- +-int kgdb_sstep(struct pt_regs *regs) +-{ +- return handle_exception(regs); +-} +- +-void kgdb(struct pt_regs *regs) +-{ +- handle_exception(regs); +-} +- +-int kgdb_iabr_match(struct pt_regs *regs) +-{ +- printk(KERN_ERR "kgdb doesn't support iabr, what?!?\n"); +- return handle_exception(regs); +-} +- +-int kgdb_dabr_match(struct pt_regs *regs) +-{ +- printk(KERN_ERR "kgdb doesn't support dabr, what?!?\n"); +- return handle_exception(regs); +-} +- +-/* Convert the hardware trap type code to a unix signal number. */ +-/* +- * This table contains the mapping between PowerPC hardware trap types, and +- * signals, which are primarily what GDB understands. 
+- */ +-static struct hard_trap_info +-{ +- unsigned int tt; /* Trap type code for powerpc */ +- unsigned char signo; /* Signal that we map this trap into */ +-} hard_trap_info[] = { +-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +- { 0x100, SIGINT }, /* critical input interrupt */ +- { 0x200, SIGSEGV }, /* machine check */ +- { 0x300, SIGSEGV }, /* data storage */ +- { 0x400, SIGBUS }, /* instruction storage */ +- { 0x500, SIGINT }, /* interrupt */ +- { 0x600, SIGBUS }, /* alignment */ +- { 0x700, SIGILL }, /* program */ +- { 0x800, SIGILL }, /* reserved */ +- { 0x900, SIGILL }, /* reserved */ +- { 0xa00, SIGILL }, /* reserved */ +- { 0xb00, SIGILL }, /* reserved */ +- { 0xc00, SIGCHLD }, /* syscall */ +- { 0xd00, SIGILL }, /* reserved */ +- { 0xe00, SIGILL }, /* reserved */ +- { 0xf00, SIGILL }, /* reserved */ +- /* +- ** 0x1000 PIT +- ** 0x1010 FIT +- ** 0x1020 watchdog +- ** 0x1100 data TLB miss +- ** 0x1200 instruction TLB miss +- */ +- { 0x2002, SIGTRAP}, /* debug */ +-#else +- { 0x200, SIGSEGV }, /* machine check */ +- { 0x300, SIGSEGV }, /* address error (store) */ +- { 0x400, SIGBUS }, /* instruction bus error */ +- { 0x500, SIGINT }, /* interrupt */ +- { 0x600, SIGBUS }, /* alingment */ +- { 0x700, SIGTRAP }, /* breakpoint trap */ +- { 0x800, SIGFPE }, /* fpu unavail */ +- { 0x900, SIGALRM }, /* decrementer */ +- { 0xa00, SIGILL }, /* reserved */ +- { 0xb00, SIGILL }, /* reserved */ +- { 0xc00, SIGCHLD }, /* syscall */ +- { 0xd00, SIGTRAP }, /* single-step/watch */ +- { 0xe00, SIGFPE }, /* fp assist */ +-#endif +- { 0, 0} /* Must be last */ +- +-}; +- +-static int computeSignal(unsigned int tt) +-{ +- struct hard_trap_info *ht; +- +- for (ht = hard_trap_info; ht->tt && ht->signo; ht++) +- if (ht->tt == tt) +- return ht->signo; +- +- return SIGHUP; /* default for things we don't know about */ +-} +- +-#define PC_REGNUM 64 +-#define SP_REGNUM 1 +- +-/* +- * This function does all command processing for interfacing to gdb. +- */ +-static int +-handle_exception (struct pt_regs *regs) +-{ +- int sigval; +- int addr; +- int length; +- char *ptr; +- unsigned int msr; +- +- /* We don't handle user-mode breakpoints. */ +- if (user_mode(regs)) +- return 0; +- +- if (debugger_fault_handler) { +- debugger_fault_handler(regs); +- panic("kgdb longjump failed!\n"); +- } +- if (kgdb_active) { +- printk(KERN_ERR "interrupt while in kgdb, returning\n"); +- return 0; +- } +- +- kgdb_active = 1; +- kgdb_started = 1; +- +-#ifdef KGDB_DEBUG +- printk("kgdb: entering handle_exception; trap [0x%x]\n", +- (unsigned int)regs->trap); +-#endif +- +- kgdb_interruptible(0); +- lock_kernel(); +- msr = mfmsr(); +- mtmsr(msr & ~MSR_EE); /* disable interrupts */ +- +- if (regs->nip == (unsigned long)breakinst) { +- /* Skip over breakpoint trap insn */ +- regs->nip += 4; +- } +- +- /* reply to host that an exception has occurred */ +- sigval = computeSignal(regs->trap); +- ptr = remcomOutBuffer; +- +- *ptr++ = 'T'; +- *ptr++ = hexchars[sigval >> 4]; +- *ptr++ = hexchars[sigval & 0xf]; +- *ptr++ = hexchars[PC_REGNUM >> 4]; +- *ptr++ = hexchars[PC_REGNUM & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->nip, ptr, 4); +- *ptr++ = ';'; +- *ptr++ = hexchars[SP_REGNUM >> 4]; +- *ptr++ = hexchars[SP_REGNUM & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex(((char *)regs) + SP_REGNUM*4, ptr, 4); +- *ptr++ = ';'; +- *ptr++ = 0; +- +- putpacket(remcomOutBuffer); +- if (kdebug) +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- +- /* XXX We may want to add some features dealing with poking the +- * XXX page tables, ... 
(look at sparc-stub.c for more info) +- * XXX also required hacking to the gdb sources directly... +- */ +- +- while (1) { +- remcomOutBuffer[0] = 0; +- +- getpacket(remcomInBuffer); +- switch (remcomInBuffer[0]) { +- case '?': /* report most recent signal */ +- remcomOutBuffer[0] = 'S'; +- remcomOutBuffer[1] = hexchars[sigval >> 4]; +- remcomOutBuffer[2] = hexchars[sigval & 0xf]; +- remcomOutBuffer[3] = 0; +- break; +-#if 0 +- case 'q': /* this screws up gdb for some reason...*/ +- { +- extern long _start, sdata, __bss_start; +- +- ptr = &remcomInBuffer[1]; +- if (strncmp(ptr, "Offsets", 7) != 0) +- break; +- +- ptr = remcomOutBuffer; +- sprintf(ptr, "Text=%8.8x;Data=%8.8x;Bss=%8.8x", +- &_start, &sdata, &__bss_start); +- break; +- } +-#endif +- case 'd': +- /* toggle debug flag */ +- kdebug ^= 1; +- break; +- +- case 'g': /* return the value of the CPU registers. +- * some of them are non-PowerPC names :( +- * they are stored in gdb like: +- * struct { +- * u32 gpr[32]; +- * f64 fpr[32]; +- * u32 pc, ps, cnd, lr; (ps=msr) +- * u32 cnt, xer, mq; +- * } +- */ +- { +- int i; +- ptr = remcomOutBuffer; +- /* General Purpose Regs */ +- ptr = mem2hex((char *)regs, ptr, 32 * 4); +- /* Floating Point Regs - FIXME */ +- /*ptr = mem2hex((char *), ptr, 32 * 8);*/ +- for(i=0; i<(32*8*2); i++) { /* 2chars/byte */ +- ptr[i] = '0'; +- } +- ptr += 32*8*2; +- /* pc, msr, cr, lr, ctr, xer, (mq is unused) */ +- ptr = mem2hex((char *)®s->nip, ptr, 4); +- ptr = mem2hex((char *)®s->msr, ptr, 4); +- ptr = mem2hex((char *)®s->ccr, ptr, 4); +- ptr = mem2hex((char *)®s->link, ptr, 4); +- ptr = mem2hex((char *)®s->ctr, ptr, 4); +- ptr = mem2hex((char *)®s->xer, ptr, 4); +- } +- break; +- +- case 'G': /* set the value of the CPU registers */ +- { +- ptr = &remcomInBuffer[1]; +- +- /* +- * If the stack pointer has moved, you should pray. +- * (cause only god can help you). +- */ +- +- /* General Purpose Regs */ +- hex2mem(ptr, (char *)regs, 32 * 4); +- +- /* Floating Point Regs - FIXME?? */ +- /*ptr = hex2mem(ptr, ??, 32 * 8);*/ +- ptr += 32*8*2; +- +- /* pc, msr, cr, lr, ctr, xer, (mq is unused) */ +- ptr = hex2mem(ptr, (char *)®s->nip, 4); +- ptr = hex2mem(ptr, (char *)®s->msr, 4); +- ptr = hex2mem(ptr, (char *)®s->ccr, 4); +- ptr = hex2mem(ptr, (char *)®s->link, 4); +- ptr = hex2mem(ptr, (char *)®s->ctr, 4); +- ptr = hex2mem(ptr, (char *)®s->xer, 4); +- +- strcpy(remcomOutBuffer,"OK"); +- } +- break; +- case 'H': +- /* don't do anything, yet, just acknowledge */ +- hexToInt(&ptr, &addr); +- strcpy(remcomOutBuffer,"OK"); +- break; +- +- case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ +- /* Try to read %x,%x. */ +- +- ptr = &remcomInBuffer[1]; +- +- if (hexToInt(&ptr, &addr) && *ptr++ == ',' +- && hexToInt(&ptr, &length)) { +- if (mem2hex((char *)addr, remcomOutBuffer, +- length)) +- break; +- strcpy(remcomOutBuffer, "E03"); +- } else +- strcpy(remcomOutBuffer, "E01"); +- break; +- +- case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ +- /* Try to read '%x,%x:'. 
*/ +- +- ptr = &remcomInBuffer[1]; +- +- if (hexToInt(&ptr, &addr) && *ptr++ == ',' +- && hexToInt(&ptr, &length) +- && *ptr++ == ':') { +- if (hex2mem(ptr, (char *)addr, length)) +- strcpy(remcomOutBuffer, "OK"); +- else +- strcpy(remcomOutBuffer, "E03"); +- flush_icache_range(addr, addr+length); +- } else +- strcpy(remcomOutBuffer, "E02"); +- break; +- +- +- case 'k': /* kill the program, actually just continue */ +- case 'c': /* cAA..AA Continue; address AA..AA optional */ +- /* try to read optional parameter, pc unchanged if no parm */ +- +- ptr = &remcomInBuffer[1]; +- if (hexToInt(&ptr, &addr)) +- regs->nip = addr; +- +-/* Need to flush the instruction cache here, as we may have deposited a +- * breakpoint, and the icache probably has no way of knowing that a data ref to +- * some location may have changed something that is in the instruction cache. +- */ +- kgdb_flush_cache_all(); +- mtmsr(msr); +- +- kgdb_interruptible(1); +- unlock_kernel(); +- kgdb_active = 0; +- if (kdebug) { +- printk("remcomInBuffer: %s\n", remcomInBuffer); +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- } +- return 1; +- +- case 's': +- kgdb_flush_cache_all(); +-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +- mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC); +- regs->msr |= MSR_DE; +-#else +- regs->msr |= MSR_SE; +-#endif +- unlock_kernel(); +- kgdb_active = 0; +- if (kdebug) { +- printk("remcomInBuffer: %s\n", remcomInBuffer); +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- } +- return 1; +- +- case 'r': /* Reset (if user process..exit ???)*/ +- panic("kgdb reset."); +- break; +- } /* switch */ +- if (remcomOutBuffer[0] && kdebug) { +- printk("remcomInBuffer: %s\n", remcomInBuffer); +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- } +- /* reply to the request */ +- putpacket(remcomOutBuffer); +- } /* while(1) */ +-} +- +-/* This function will generate a breakpoint exception. It is used at the +- beginning of a program to sync up with a debugger and can be used +- otherwise as a quick means to stop program execution and "break" into +- the debugger. */ +- +-void +-breakpoint(void) +-{ +- if (!initialized) { +- printk("breakpoint() called b4 kgdb init\n"); +- return; +- } +- +- asm(" .globl breakinst \n\ +- breakinst: .long 0x7d821008"); +-} +- +-#ifdef CONFIG_KGDB_CONSOLE +-/* Output string in GDB O-packet format if GDB has connected. If nothing +- output, returns 0 (caller must then handle output). */ +-int +-kgdb_output_string (const char* s, unsigned int count) +-{ +- char buffer[512]; +- +- if (!kgdb_started) +- return 0; +- +- count = (count <= (sizeof(buffer) / 2 - 2)) +- ? 
count : (sizeof(buffer) / 2 - 2);
+-
+- buffer[0] = 'O';
+- mem2hex (s, &buffer[1], count);
+- putpacket(buffer);
+-
+- return 1;
+-}
+-#endif
+-
+-static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs,
+- struct tty_struct *tty)
+-{
+- printk("Entering GDB stub\n");
+- breakpoint();
+-}
+-static struct sysrq_key_op sysrq_gdb_op = {
+- .handler = sysrq_handle_gdb,
+- .help_msg = "Gdb",
+- .action_msg = "GDB",
+-};
+-
+-static int gdb_register_sysrq(void)
+-{
+- printk("Registering GDB sysrq handler\n");
+- register_sysrq_key('g', &sysrq_gdb_op);
+- return 0;
+-}
+-module_init(gdb_register_sysrq);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/setup.c
+--- linux-2.6.18-53.1.14/arch/ppc/kernel/setup.c 2008-03-06 05:54:43.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/kernel/setup.c 2008-06-10 15:38:14.000000000 +0400
+@@ -48,10 +48,6 @@
+ #include
+ #endif
+
+-#if defined CONFIG_KGDB
+-#include
+-#endif
+-
+ extern void platform_init(unsigned long r3, unsigned long r4,
+ unsigned long r5, unsigned long r6, unsigned long r7);
+ extern void reloc_got2(unsigned long offset);
+@@ -506,18 +502,6 @@ void __init setup_arch(char **cmdline_p)
+ #endif /* CONFIG_XMON */
+ if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab);
+
+-#if defined(CONFIG_KGDB)
+- if (ppc_md.kgdb_map_scc)
+- ppc_md.kgdb_map_scc();
+- set_debug_traps();
+- if (strstr(cmd_line, "gdb")) {
+- if (ppc_md.progress)
+- ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000);
+- printk("kgdb breakpoint activated\n");
+- breakpoint();
+- }
+-#endif
+-
+ /*
+ * Set cache line size based on type of cpu as a default.
+ * Systems with OF can look in the properties on the cpu node(s)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/ppc/mm/fault.c
+--- linux-2.6.18-53.1.14/arch/ppc/mm/fault.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/mm/fault.c 2008-06-10 15:38:14.000000000 +0400
+@@ -25,6 +25,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #include
+ #include
+@@ -329,6 +330,14 @@ bad_page_fault(struct pt_regs *regs, uns
+ return;
+ }
+
++#ifdef CONFIG_KGDB
++ if (atomic_read(&debugger_active) && kgdb_may_fault) {
++ /* Restore our previous state. */
++ kgdb_fault_longjmp(kgdb_fault_jmp_regs);
++ /* Not reached. */
++ }
++#endif
++
+ /* kernel has accessed a bad area */
+ #if defined(CONFIG_XMON) || defined(CONFIG_KGDB)
+ if (debugger_kernel_faults)
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/bubinga.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/bubinga.c
+--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/bubinga.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/bubinga.c 2008-06-10 15:38:14.000000000 +0400
+@@ -4,7 +4,7 @@
+ * Author: SAW (IBM), derived from walnut.c.
+ * Maintained by MontaVista Software
+ *
+- * 2003 (c) MontaVista Softare Inc. This file is licensed under the
++ * 2003-2004 (c) MontaVista Software Inc. This file is licensed under the
+ * terms of the GNU General Public License version 2. This program is
+ * licensed "as is" without any warranty of any kind, whether express
+ * or implied.
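The board conversions that follow (bubinga, ebony, ocotea, ml300, sbc8560, spruce) all make the same mechanical change: each early UART is handed independently to the 8250 console driver and to the kgdb8250 debugger, each registration behind its own config option, instead of sharing one CONFIG_KGDB/CONFIG_SERIAL_8250 block. The shape, sketched against a hypothetical board (uart_port field setup elided; early_serial_setup() returns 0 on success, and kgdb8250_add_port() is the registration hook this patch series introduces):

static void __init myboard_early_serial_map(void)
{
	struct uart_port port;

	memset(&port, 0, sizeof(port));
	/* ... fill in membase, irq, uartclk, flags for the board ... */
	port.line = 0;

#ifdef CONFIG_SERIAL_8250
	if (early_serial_setup(&port) != 0)
		printk("Early serial init of port 0 failed\n");
#endif
#ifdef CONFIG_KGDB_8250
	kgdb8250_add_port(0, &port);
#endif
}

Keeping the two registrations independent is what lets a kernel carry the debugger on a serial port without also building the full 8250 console driver in.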
+@@ -100,17 +100,26 @@ bubinga_early_serial_map(void) + port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST; + port.line = 0; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 0 failed\n"); +- } ++#endif ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + + port.membase = (void*)ACTING_UART1_IO_BASE; + port.irq = ACTING_UART1_INT; + port.line = 1; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 1 failed\n"); +- } ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &port); ++#endif + } + + void __init +@@ -255,8 +264,4 @@ platform_init(unsigned long r3, unsigned + ppc_md.nvram_read_val = todc_direct_read_val; + ppc_md.nvram_write_val = todc_direct_write_val; + #endif +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = bubinga_early_serial_map; +-#endif + } +- +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ebony.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ebony.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ebony.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ebony.c 2008-06-10 15:38:14.000000000 +0400 +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -226,14 +227,20 @@ ebony_early_serial_map(void) + port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST; + port.line = 0; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 0 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(0, &port); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + /* Purge TLB entry added in head_44x.S for early serial access */ + _tlbie(UART0_IO_BASE); + #endif +@@ -243,14 +250,18 @@ ebony_early_serial_map(void) + port.uartclk = clocks.uart1; + port.line = 1; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 1) + printk("Early serial init of port 1 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(1, &port); + #endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &port); ++#endif + } + + static void __init +@@ -327,8 +338,4 @@ void __init platform_init(unsigned long + + ppc_md.nvram_read_val = todc_direct_read_val; + ppc_md.nvram_write_val = todc_direct_write_val; +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = ebony_early_serial_map; +-#endif + } +- +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ocotea.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ocotea.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/ocotea.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/ocotea.c 2008-06-10 15:38:14.000000000 +0400 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -249,14 +250,20 @@ ocotea_early_serial_map(void) + port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST; + port.line = 0; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if 
(early_serial_setup(&port) != 0) + printk("Early serial init of port 0 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(0, &port); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + /* Purge TLB entry added in head_44x.S for early serial access */ + _tlbie(UART0_IO_BASE); + #endif +@@ -266,14 +273,18 @@ ocotea_early_serial_map(void) + port.uartclk = clocks.uart1; + port.line = 1; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 1) + printk("Early serial init of port 1 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(1, &port); + #endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &port); ++#endif + } + + static void __init +@@ -343,8 +354,5 @@ void __init platform_init(unsigned long + + ppc_md.nvram_read_val = todc_direct_read_val; + ppc_md.nvram_write_val = todc_direct_write_val; +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = ocotea_early_serial_map; +-#endif + ppc_md.init = ocotea_init; + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/xilinx_ml300.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/4xx/xilinx_ml300.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c 2008-06-10 15:38:14.000000000 +0400 +@@ -41,9 +41,6 @@ + * ppc4xx_map_io arch/ppc/syslib/ppc4xx_setup.c + * start_kernel init/main.c + * setup_arch arch/ppc/kernel/setup.c +- * #if defined(CONFIG_KGDB) +- * *ppc_md.kgdb_map_scc() == gen550_kgdb_map_scc +- * #endif + * *ppc_md.setup_arch == ml300_setup_arch this file + * ppc4xx_setup_arch arch/ppc/syslib/ppc4xx_setup.c + * ppc4xx_find_bridges arch/ppc/syslib/ppc405_pci.c +@@ -117,7 +114,6 @@ ml300_early_serial_init(int num, struct + void __init + ml300_early_serial_map(void) + { +-#ifdef CONFIG_SERIAL_8250 + struct plat_serial8250_port *pdata; + int i = 0; + +@@ -129,7 +125,14 @@ ml300_early_serial_map(void) + pdata++; + i++; + } +-#endif /* CONFIG_SERIAL_8250 */ ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) ++ printk("Early serial init of port %d failed\n", i); ++#endif ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(i, &port) ++#endif + } + + void __init +@@ -165,9 +168,4 @@ platform_init(unsigned long r3, unsigned + #if defined(XPAR_POWER_0_POWERDOWN_BASEADDR) + ppc_md.power_off = xilinx_power_off; + #endif +- +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = ml300_early_serial_map; +-#endif + } +- +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/85xx/sbc8560.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/85xx/sbc8560.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/85xx/sbc8560.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/85xx/sbc8560.c 2008-06-10 15:38:14.000000000 +0400 +@@ -50,7 +50,6 @@ + #include + #include + +-#ifdef CONFIG_SERIAL_8250 + static void __init + sbc8560_early_serial_map(void) + { +@@ -66,12 +65,16 @@ sbc8560_early_serial_map(void) + uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART0_SIZE); + uart_req.type = PORT_16650; + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || 
defined(CONFIG_KGDB) +- gen550_init(0, &uart_req); +-#endif +- ++#ifdef CONFIG_SERIAL_8250 + if (early_serial_setup(&uart_req) != 0) + printk("Early serial init of port 0 failed\n"); ++#endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(0, &uart_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &uart_req); ++#endif + + /* Assume early_serial_setup() doesn't modify uart_req */ + uart_req.line = 1; +@@ -79,14 +82,17 @@ + uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART1_SIZE); + uart_req.irq = MPC85xx_IRQ_EXT10; + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) +- gen550_init(1, &uart_req); +-#endif +- ++#ifdef CONFIG_SERIAL_8250 + if (early_serial_setup(&uart_req) != 0) +- printk("Early serial init of port 1 failed\n"); +-} ++ printk("Early serial init of port 1 failed\n"); + #endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(1, &uart_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &uart_req); ++#endif ++} + + /* ************************************************************************ + * +@@ -115,9 +121,7 @@ sbc8560_setup_arch(void) + /* setup PCI host bridges */ + mpc85xx_setup_hose(); + #endif +-#ifdef CONFIG_SERIAL_8250 + sbc8560_early_serial_map(); +-#endif + #ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Invalidate the entry we stole earlier the serial ports + * should be properly mapped */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/chestnut.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/chestnut.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/chestnut.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/chestnut.c 2008-06-10 15:38:14.000000000 +0400 +@@ -492,7 +492,7 @@ chestnut_power_off(void) + static void __init + chestnut_map_io(void) + { +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + io_block_mapping(CHESTNUT_UART_BASE, CHESTNUT_UART_BASE, 0x100000, + _PAGE_IO); + #endif +@@ -566,9 +566,6 @@ platform_init(unsigned long r3, unsigned + #if defined(CONFIG_SERIAL_TEXT_DEBUG) + ppc_md.progress = gen550_progress; + #endif +-#if defined(CONFIG_KGDB) +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + + if (ppc_md.progress) + ppc_md.progress("chestnut_init(): exit", 0); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/pplus.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/pplus.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/pplus.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/pplus.c 2008-06-10 15:38:14.000000000 +0400 +@@ -893,9 +893,6 @@ platform_init(unsigned long r3, unsigned + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + #ifdef CONFIG_SMP + smp_ops = &pplus_smp_ops; + #endif /* CONFIG_SMP */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/sandpoint.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/sandpoint.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/sandpoint.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/sandpoint.c 2008-06-10 15:38:14.000000000 +0400 +@@ -730,9 +730,6 @@ platform_init(unsigned long r3, unsigned + ppc_md.nvram_read_val = todc_mc146818_read_val; + ppc_md.nvram_write_val = todc_mc146818_write_val; + +-#ifdef CONFIG_KGDB +-
ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/platforms/spruce.c linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/spruce.c +--- linux-2.6.18-53.1.14/arch/ppc/platforms/spruce.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/platforms/spruce.c 2008-06-10 15:38:14.000000000 +0400 +@@ -178,26 +178,32 @@ spruce_early_serial_map(void) + serial_req.membase = (u_char *)UART0_IO_BASE; + serial_req.regshift = 0; + +-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG) +- gen550_init(0, &serial_req); +-#endif + #ifdef CONFIG_SERIAL_8250 + if (early_serial_setup(&serial_req) != 0) + printk("Early serial init of port 0 failed\n"); + #endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(0, &serial_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &serial_req); ++#endif + + /* Assume early_serial_setup() doesn't modify serial_req */ + serial_req.line = 1; + serial_req.irq = UART1_INT; + serial_req.membase = (u_char *)UART1_IO_BASE; + +-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG) +- gen550_init(1, &serial_req); +-#endif + #ifdef CONFIG_SERIAL_8250 + if (early_serial_setup(&serial_req) != 0) + printk("Early serial init of port 1 failed\n"); + #endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(1, &serial_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &serial_req); ++#endif + } + + TODC_ALLOC(); +@@ -316,7 +322,4 @@ platform_init(unsigned long r3, unsigned + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/Makefile linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/Makefile +--- linux-2.6.18-53.1.14/arch/ppc/syslib/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/Makefile 2008-06-10 15:38:14.000000000 +0400 +@@ -76,7 +76,6 @@ obj-$(CONFIG_PCI_8260) += m82xx_pci.o p + obj-$(CONFIG_8260_PCI9) += m8260_pci_erratum9.o + obj-$(CONFIG_CPM2) += cpm2_common.o cpm2_pic.o + ifeq ($(CONFIG_PPC_GEN550),y) +-obj-$(CONFIG_KGDB) += gen550_kgdb.o gen550_dbg.o + obj-$(CONFIG_SERIAL_TEXT_DEBUG) += gen550_dbg.o + endif + ifeq ($(CONFIG_SERIAL_MPSC_CONSOLE),y) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/gen550.h linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/gen550.h +--- linux-2.6.18-53.1.14/arch/ppc/syslib/gen550.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/gen550.h 2008-06-10 15:38:14.000000000 +0400 +@@ -11,4 +11,3 @@ + + extern void gen550_progress(char *, unsigned short); + extern void gen550_init(int, struct uart_port *); +-extern void gen550_kgdb_map_scc(void); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/ibm44x_common.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ibm44x_common.c +--- linux-2.6.18-53.1.14/arch/ppc/syslib/ibm44x_common.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ibm44x_common.c 2008-06-10 15:38:14.000000000 +0400 +@@ -192,9 +192,6 @@ void __init ibm44x_platform_init(unsigne + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + + /* + *
The Abatron BDI JTAG debugger does not tolerate others +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60.c +--- linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60.c 2008-06-10 15:38:14.000000000 +0400 +@@ -241,6 +241,12 @@ static struct resource mv64x60_mpsc0_res + .end = MV64x60_IRQ_SDMA_0, + .flags = IORESOURCE_IRQ, + }, ++ [4] = { ++ .name = "mpsc 0 irq", ++ .start = MV64x60_IRQ_MPSC_0, ++ .end = MV64x60_IRQ_MPSC_0, ++ .flags = IORESOURCE_IRQ, ++ }, + }; + + static struct platform_device mpsc0_device = { +@@ -298,6 +304,12 @@ static struct resource mv64x60_mpsc1_res + .end = MV64360_IRQ_SDMA_1, + .flags = IORESOURCE_IRQ, + }, ++ [4] = { ++ .name = "mpsc 1 irq", ++ .start = MV64360_IRQ_MPSC_1, ++ .end = MV64360_IRQ_MPSC_1, ++ .flags = IORESOURCE_IRQ, ++ }, + }; + + static struct platform_device mpsc1_device = { +@@ -1426,12 +1438,46 @@ mv64x60_pd_fixup(struct mv64x60_handle * + static int __init + mv64x60_add_pds(void) + { +- return platform_add_devices(mv64x60_pd_devs, +- ARRAY_SIZE(mv64x60_pd_devs)); ++ int i, ret = 0; ++ ++ for (i = 0; i < ARRAY_SIZE(mv64x60_pd_devs); i++) { ++ if (mv64x60_pd_devs[i]) { ++ ret = platform_device_register(mv64x60_pd_devs[i]); ++ } ++ if (ret) { ++ while (--i >= 0) ++ platform_device_unregister(mv64x60_pd_devs[i]); ++ break; ++ } ++ } ++ return ret; + } + arch_initcall(mv64x60_add_pds); + + /* ++ * mv64x60_early_get_pdev_data() ++ * ++ * Get the data associated with a platform device by name and number. ++ */ ++struct platform_device * __init ++mv64x60_early_get_pdev_data(const char *name, int id, int remove) ++{ ++ int i; ++ struct platform_device *pdev; ++ ++ for (i = 0; i < ARRAY_SIZE(mv64x60_pd_devs); i++) { ++ if ((pdev = mv64x60_pd_devs[i]) && pdev->id == id && ++ !strcmp(pdev->name, name)) { ++ if (remove) ++ mv64x60_pd_devs[i] = NULL; ++ return pdev; ++ } ++ } ++ return NULL; ++} ++ ++/* + ***************************************************************************** + * + * GT64260-Specific Routines +@@ -1764,6 +1810,11 @@ gt64260a_chip_specific_init(struct mv64x + r->start = MV64x60_IRQ_SDMA_0; + r->end = MV64x60_IRQ_SDMA_0; + } ++ if ((r = platform_get_resource(&mpsc1_device, IORESOURCE_IRQ, 1)) ++ != NULL) { ++ r->start = GT64260_IRQ_MPSC_1; ++ r->end = GT64260_IRQ_MPSC_1; ++ } + #endif + } + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60_dbg.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60_dbg.c +--- linux-2.6.18-53.1.14/arch/ppc/syslib/mv64x60_dbg.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/mv64x60_dbg.c 2008-06-10 15:38:14.000000000 +0400 +@@ -34,7 +34,7 @@ static struct mv64x60_handle mv64x60_dbg + void + mv64x60_progress_init(u32 base) + { +- mv64x60_dbg_bh.v_base = base; ++ mv64x60_dbg_bh.v_base = (void*)base; + return; + } + +@@ -69,53 +69,3 @@ mv64x60_mpsc_progress(char *s, unsigned + return; + } + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +- +- +-#if defined(CONFIG_KGDB) +- +-#if defined(CONFIG_KGDB_TTYS0) +-#define KGDB_PORT 0 +-#elif defined(CONFIG_KGDB_TTYS1) +-#define KGDB_PORT 1 +-#else +-#error "Invalid kgdb_tty port" +-#endif +- +-void +-putDebugChar(unsigned char c) +-{ +- mv64x60_polled_putc(KGDB_PORT, (char)c); +-} +- +-int +-getDebugChar(void) +-{ +- unsigned char c; +- +- while (!mv64x60_polled_getc(KGDB_PORT, &c)); +- return (int)c; +-} +- +-void +-putDebugString(char* str) +-{ +- while (*str != '\0') { +- putDebugChar(*str); +- str++; +- } +-
putDebugChar('\r'); +- return; +-} +- +-void +-kgdb_interruptible(int enable) +-{ +-} +- +-void +-kgdb_map_scc(void) +-{ +- if (ppc_md.early_serial_map) +- ppc_md.early_serial_map(); +-} +-#endif /* CONFIG_KGDB */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/ppc/syslib/ppc85xx_setup.c linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ppc85xx_setup.c +--- linux-2.6.18-53.1.14/arch/ppc/syslib/ppc85xx_setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/ppc/syslib/ppc85xx_setup.c 2008-06-10 15:38:14.000000000 +0400 +@@ -69,7 +69,6 @@ mpc85xx_calibrate_decr(void) + mtspr(SPRN_TCR, TCR_DIE); + } + +-#ifdef CONFIG_SERIAL_8250 + void __init + mpc85xx_early_serial_map(void) + { +@@ -85,7 +84,7 @@ mpc85xx_early_serial_map(void) + pdata[0].mapbase += binfo->bi_immr_base; + pdata[0].membase = ioremap(pdata[0].mapbase, MPC85xx_UART0_SIZE); + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + memset(&serial_req, 0, sizeof (serial_req)); + serial_req.iotype = UPIO_MEM; + serial_req.mapbase = pdata[0].mapbase; +@@ -93,18 +92,24 @@ mpc85xx_early_serial_map(void) + serial_req.regshift = 0; + + gen550_init(0, &serial_req); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &serial_req); ++#endif + #endif + + pdata[1].uartclk = binfo->bi_busfreq; + pdata[1].mapbase += binfo->bi_immr_base; + pdata[1].membase = ioremap(pdata[1].mapbase, MPC85xx_UART0_SIZE); + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + /* Assume gen550_init() doesn't modify serial_req */ + serial_req.mapbase = pdata[1].mapbase; + serial_req.membase = pdata[1].membase; + + gen550_init(1, &serial_req); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &serial_req); ++#endif + #endif + } + #endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/sh/Kconfig.debug +--- linux-2.6.18-53.1.14/arch/sh/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/Kconfig.debug 2008-06-10 15:38:50.000000000 +0400 +@@ -29,96 +29,4 @@ config EARLY_PRINTK + This option is only useful porting the kernel to a new machine, + when the kernel may crash or hang before the serial console is + initialised. If unsure, say N. +- +-config KGDB +- bool "Include KGDB kernel debugger" +- help +- Include in-kernel hooks for kgdb, the Linux kernel source level +- debugger. See for more information. +- Unless you are intending to debug the kernel, say N here. +- +-menu "KGDB configuration options" +- depends on KGDB +- +-config MORE_COMPILE_OPTIONS +- bool "Add any additional compile options" +- help +- If you want to add additional CFLAGS to the kernel build, enable this +- option and then enter what you would like to add in the next question. +- Note however that -g is already appended with the selection of KGDB. 
+- +-config COMPILE_OPTIONS +- string "Additional compile arguments" +- depends on MORE_COMPILE_OPTIONS +- +-config KGDB_NMI +- bool "Enter KGDB on NMI" +- default n +- +-config KGDB_THREAD +- bool "Include KGDB thread support" +- default y +- +-config SH_KGDB_CONSOLE +- bool "Console messages through GDB" +- default n +- +-config KGDB_SYSRQ +- bool "Allow SysRq 'G' to enter KGDB" +- default y +- +-config KGDB_KERNEL_ASSERTS +- bool "Include KGDB kernel assertions" +- default n +- +-comment "Serial port setup" +- +-config KGDB_DEFPORT +- int "Port number (ttySCn)" +- default "1" +- +-config KGDB_DEFBAUD +- int "Baud rate" +- default "115200" +- +-choice +- prompt "Parity" +- depends on KGDB +- default KGDB_DEFPARITY_N +- +-config KGDB_DEFPARITY_N +- bool "None" +- +-config KGDB_DEFPARITY_E +- bool "Even" +- +-config KGDB_DEFPARITY_O +- bool "Odd" +- +-endchoice +- +-choice +- prompt "Data bits" +- depends on KGDB +- default KGDB_DEFBITS_8 +- +-config KGDB_DEFBITS_8 +- bool "8" +- +-config KGDB_DEFBITS_7 +- bool "7" +- +-endchoice +- +-endmenu +- +-config FRAME_POINTER +- bool "Compile the kernel with frame pointers" +- default y if KGDB +- help +- If you say Y here the resulting kernel image will be slightly larger +- and slower, but it will give very useful debugging information. +- If you don't debug the kernel, you can say N, but we may not be able +- to solve problems without frame pointers. +- + endmenu +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/Makefile linux-2.6.18-53.1.14.kgdb/arch/sh/Makefile +--- linux-2.6.18-53.1.14/arch/sh/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/Makefile 2008-06-10 15:38:50.000000000 +0400 +@@ -43,7 +43,6 @@ cflags-$(CONFIG_CPU_SH4) += -m4 \ + cflags-$(CONFIG_CPU_SH4A) += $(call cc-option,-m4a-nofpu,) + + cflags-$(CONFIG_SH_DSP) += -Wa,-dsp +-cflags-$(CONFIG_SH_KGDB) += -g + + cflags-$(CONFIG_MORE_COMPILE_OPTIONS) += \ + $(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g') +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/boards/se/7751/setup.c linux-2.6.18-53.1.14.kgdb/arch/sh/boards/se/7751/setup.c +--- linux-2.6.18-53.1.14/arch/sh/boards/se/7751/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/boards/se/7751/setup.c 2008-06-10 15:38:50.000000000 +0400 +@@ -17,10 +17,6 @@ + #include + #include + +-#ifdef CONFIG_SH_KGDB +-#include +-#endif +- + /* + * Configure the Super I/O chip + */ +@@ -82,12 +78,6 @@ const char *get_system_type(void) + return "7751 SolutionEngine"; + } + +-#ifdef CONFIG_SH_KGDB +-static int kgdb_uart_setup(void); +-static struct kgdb_sermap kgdb_uart_sermap = +-{ "ttyS", 0, kgdb_uart_setup, NULL }; +-#endif +- + /* + * Initialize the board + */ +@@ -95,133 +85,4 @@ void __init platform_setup(void) + { + /* Call init_smsc() replacement to set up SuperIO. */ + /* XXX: RTC setting comes here */ +-#ifdef CONFIG_SH_KGDB +- kgdb_register_sermap(&kgdb_uart_sermap); +-#endif +-} +- +-/********************************************************************* +- * Currently a hack (e.g. does not interact well w/serial.c, lots of * +- * hardcoded stuff) but may be useful if SCI/F needs debugging. * +- * Mostly copied from x86 code (see files asm-i386/kgdb_local.h and * +- * arch/i386/lib/kgdb_serial.c). 
* +- *********************************************************************/ +- +-#ifdef CONFIG_SH_KGDB +-#include +-#include +-#include +-#include +- +-#define COM1_PORT 0x3f8 /* Base I/O address */ +-#define COM1_IRQ 4 /* IRQ not used yet */ +-#define COM2_PORT 0x2f8 /* Base I/O address */ +-#define COM2_IRQ 3 /* IRQ not used yet */ +- +-#define SB_CLOCK 1843200 /* Serial baud clock */ +-#define SB_BASE (SB_CLOCK/16) +-#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS +- +-struct uart_port { +- int base; +-}; +-#define UART_NPORTS 2 +-struct uart_port uart_ports[] = { +- { COM1_PORT }, +- { COM2_PORT }, +-}; +-struct uart_port *kgdb_uart_port; +- +-#define UART_IN(reg) inb_p(kgdb_uart_port->base + reg) +-#define UART_OUT(reg,v) outb_p((v), kgdb_uart_port->base + reg) +- +-/* Basic read/write functions for the UART */ +-#define UART_LSR_RXCERR (UART_LSR_BI | UART_LSR_FE | UART_LSR_PE) +-static int kgdb_uart_getchar(void) +-{ +- int lsr; +- int c = -1; +- +- while (c == -1) { +- lsr = UART_IN(UART_LSR); +- if (lsr & UART_LSR_DR) +- c = UART_IN(UART_RX); +- if ((lsr & UART_LSR_RXCERR)) +- c = -1; +- } +- return c; +-} +- +-static void kgdb_uart_putchar(int c) +-{ +- while ((UART_IN(UART_LSR) & UART_LSR_THRE) == 0) +- ; +- UART_OUT(UART_TX, c); +-} +- +-/* +- * Initialize UART to configured/requested values. +- * (But we don't interrupts yet, or interact w/serial.c) +- */ +-static int kgdb_uart_setup(void) +-{ +- int port; +- int lcr = 0; +- int bdiv = 0; +- +- if (kgdb_portnum >= UART_NPORTS) { +- KGDB_PRINTK("uart port %d invalid.\n", kgdb_portnum); +- return -1; +- } +- +- kgdb_uart_port = &uart_ports[kgdb_portnum]; +- +- /* Init sequence from gdb_hook_interrupt */ +- UART_IN(UART_RX); +- UART_OUT(UART_IER, 0); +- +- UART_IN(UART_RX); /* Serial driver comments say */ +- UART_IN(UART_IIR); /* this clears interrupt regs */ +- UART_IN(UART_MSR); +- +- /* Figure basic LCR values */ +- switch (kgdb_bits) { +- case '7': +- lcr |= UART_LCR_WLEN7; +- break; +- default: case '8': +- lcr |= UART_LCR_WLEN8; +- break; +- } +- switch (kgdb_parity) { +- case 'O': +- lcr |= UART_LCR_PARITY; +- break; +- case 'E': +- lcr |= (UART_LCR_PARITY | UART_LCR_EPAR); +- break; +- default: break; +- } +- +- /* Figure the baud rate divisor */ +- bdiv = (SB_BASE/kgdb_baud); +- +- /* Set the baud rate and LCR values */ +- UART_OUT(UART_LCR, (lcr | UART_LCR_DLAB)); +- UART_OUT(UART_DLL, (bdiv & 0xff)); +- UART_OUT(UART_DLM, ((bdiv >> 8) & 0xff)); +- UART_OUT(UART_LCR, lcr); +- +- /* Set the MCR */ +- UART_OUT(UART_MCR, SB_MCR); +- +- /* Turn off FIFOs for now */ +- UART_OUT(UART_FCR, 0); +- +- /* Setup complete: initialize function pointers */ +- kgdb_getchar = kgdb_uart_getchar; +- kgdb_putchar = kgdb_uart_putchar; +- +- return 0; + } +-#endif /* CONFIG_SH_KGDB */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/sh/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/Makefile 2008-06-10 15:38:50.000000000 +0400 +@@ -13,7 +13,7 @@ obj-y += cpu/ timers/ + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_CF_ENABLER) += cf-enabler.o + obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o +-obj-$(CONFIG_SH_KGDB) += kgdb_stub.o kgdb_jmp.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + obj-$(CONFIG_SH_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_MODULES) += module.o + obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +diff -rupbBN -X ../client-cleanup/dontdiff 
linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh3/ex.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh3/ex.S +--- linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh3/ex.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh3/ex.S 2008-06-10 15:38:50.000000000 +0400 +@@ -42,7 +42,7 @@ ENTRY(exception_handling_table) + .long exception_error ! reserved_instruction (filled by trap_init) /* 180 */ + .long exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/ + ENTRY(nmi_slot) +-#if defined (CONFIG_KGDB_NMI) ++#if defined (CONFIG_KGDB) + .long debug_enter /* 1C0 */ ! Allow trap to debugger + #else + .long exception_none /* 1C0 */ ! Not implemented yet +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh4/ex.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh4/ex.S +--- linux-2.6.18-53.1.14/arch/sh/kernel/cpu/sh4/ex.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/cpu/sh4/ex.S 2008-06-10 15:38:50.000000000 +0400 +@@ -46,7 +46,7 @@ ENTRY(exception_handling_table) + .long exception_error ! reserved_instruction (filled by trap_init) /* 180 */ + .long exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/ + ENTRY(nmi_slot) +-#if defined (CONFIG_KGDB_NMI) ++#if defined (CONFIG_KGDB) + .long debug_enter /* 1C0 */ ! Allow trap to debugger + #else + .long exception_none /* 1C0 */ ! Not implemented yet +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/entry.S +--- linux-2.6.18-53.1.14/arch/sh/kernel/entry.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/entry.S 2008-06-10 15:38:50.000000000 +0400 +@@ -75,7 +75,7 @@ + ENOSYS = 38 + EINVAL = 22 + +-#if defined(CONFIG_KGDB_NMI) ++#if defined(CONFIG_KGDB) + NMI_VEC = 0x1c0 ! Must catch early for debounce + #endif + +@@ -227,31 +227,33 @@ call_dae: + 2: .long do_address_error + #endif /* CONFIG_MMU */ + +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) ++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB) + ! Handle kernel debug if either kgdb (SW) or gdb-stub (FW) is present. + ! If both are configured, handle the debug traps (breakpoints) in SW, + ! but still allow BIOS traps to FW. 
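++ ! (debug_trap leaves the TRA value in r8; a BIOS call is trapa #0x3f, i.e. TRA == 0x3f << 2, which the compare below routes to FW before falling through to the kgdb entry.)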
+ + .align 2 + debug_kernel: +-#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_SH_KGDB) ++#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_KGDB) + /* Force BIOS call to FW (debug_trap put TRA in r8) */ + mov r8,r0 + shlr2 r0 + cmp/eq #0x3f,r0 + bt debug_kernel_fw +-#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_KGDB */ + ++ .align 2 ++ .globl debug_enter + debug_enter: +-#if defined(CONFIG_SH_KGDB) ++#if defined(CONFIG_KGDB) + /* Jump to kgdb, pass stacked regs as arg */ + debug_kernel_sw: + mov.l 3f, r0 + jmp @r0 + mov r15, r4 + .align 2 +-3: .long kgdb_handle_exception +-#endif /* CONFIG_SH_KGDB */ ++3: .long kgdb_exception_handler ++#endif /* CONFIG_KGDB */ + + #if defined(CONFIG_SH_STANDARD_BIOS) + /* Unwind the stack and jmp to the debug entry */ +@@ -293,12 +295,12 @@ debug_kernel_fw: + 2: .long gdb_vbr_vector + #endif /* CONFIG_SH_STANDARD_BIOS */ + +-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB */ + + + .align 2 + debug_trap: +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) ++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB) + mov #OFF_SR, r0 + mov.l @(r0,r15), r0 ! get status register + shll r0 +@@ -642,7 +644,7 @@ skip_restore: + 6: or k0, k2 ! Set the IMASK-bits + ldc k2, ssr + ! +-#if defined(CONFIG_KGDB_NMI) ++#if defined(CONFIG_KGDB) + ! Clear in_nmi + mov.l 4f, k0 + mov #0, k1 +@@ -694,7 +696,7 @@ tlb_miss: + interrupt: + mov.l 2f, k2 + mov.l 3f, k3 +-#if defined(CONFIG_KGDB_NMI) ++#if defined(CONFIG_KGDB) + ! Debounce (filter nested NMI) + mov.l @k2, k0 + mov.l 5f, k1 +@@ -709,7 +711,7 @@ interrupt: + 5: .long NMI_VEC + 6: .long in_nmi + 0: +-#endif /* defined(CONFIG_KGDB_NMI) */ ++#endif /* defined(CONFIG_KGDB) */ + bra handle_exception + mov.l @k2, k2 + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb-jmp.S +--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb-jmp.S 2008-06-10 15:38:50.000000000 +0400 +@@ -0,0 +1,32 @@ ++#include ++ ++ENTRY(kgdb_fault_setjmp) ++ add #(9*4), r4 ++ sts.l pr, @-r4 ++ mov.l r15, @-r4 ++ mov.l r14, @-r4 ++ mov.l r13, @-r4 ++ mov.l r12, @-r4 ++ mov.l r11, @-r4 ++ mov.l r10, @-r4 ++ mov.l r9, @-r4 ++ mov.l r8, @-r4 ++ rts ++ mov #0, r0 ++ ++ENTRY(kgdb_fault_longjmp) ++ mov.l @r4+, r8 ++ mov.l @r4+, r9 ++ mov.l @r4+, r10 ++ mov.l @r4+, r11 ++ mov.l @r4+, r12 ++ mov.l @r4+, r13 ++ mov.l @r4+, r14 ++ mov.l @r4+, r15 ++ lds.l @r4+, pr ++ mov r5, r0 ++ tst r0, r0 ++ bf 1f ++ mov #1, r0 ++1: rts ++ nop +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb.c 2008-06-10 15:38:50.000000000 +0400 +@@ -0,0 +1,363 @@ ++/* ++ * arch/sh/kernel/kgdb.c ++ * ++ * Contains SH-specific low-level support for KGDB. ++ * ++ * Contains extracts from code by Glenn Engel, Jim Kingdon, ++ * David Grothe , Tigran Aivazian , ++ * Amit S. Kale , William Gatliff , ++ * Ben Lee, Steve Chamberlain and Benoit Miller , ++ * Henry Bell and Jeremy Siegel ++ * ++ * Maintainer: Tom Rini ++ * ++ * 2004 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2.
This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern void per_cpu_trap_init(void); ++extern atomic_t cpu_doing_single_step; ++ ++/* Function pointers for linkage */ ++static struct kgdb_regs trap_registers; ++ ++/* Globals. */ ++char in_nmi; /* Set during NMI to prevent reentry */ ++ ++/* TRA differs sh3/4 */ ++#if defined(CONFIG_CPU_SH3) ++#define TRA 0xffffffd0 ++#elif defined(CONFIG_CPU_SH4) ++#define TRA 0xff000020 ++#endif ++ ++/* Macros for single step instruction identification */ ++#define OPCODE_BT(op) (((op) & 0xff00) == 0x8900) ++#define OPCODE_BF(op) (((op) & 0xff00) == 0x8b00) ++#define OPCODE_BTF_DISP(op) (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \ ++ (((op) & 0x7f ) << 1)) ++#define OPCODE_BFS(op) (((op) & 0xff00) == 0x8f00) ++#define OPCODE_BTS(op) (((op) & 0xff00) == 0x8d00) ++#define OPCODE_BRA(op) (((op) & 0xf000) == 0xa000) ++#define OPCODE_BRA_DISP(op) (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \ ++ (((op) & 0x7ff) << 1)) ++#define OPCODE_BRAF(op) (((op) & 0xf0ff) == 0x0023) ++#define OPCODE_BRAF_REG(op) (((op) & 0x0f00) >> 8) ++#define OPCODE_BSR(op) (((op) & 0xf000) == 0xb000) ++#define OPCODE_BSR_DISP(op) (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \ ++ (((op) & 0x7ff) << 1)) ++#define OPCODE_BSRF(op) (((op) & 0xf0ff) == 0x0003) ++#define OPCODE_BSRF_REG(op) (((op) >> 8) & 0xf) ++#define OPCODE_JMP(op) (((op) & 0xf0ff) == 0x402b) ++#define OPCODE_JMP_REG(op) (((op) >> 8) & 0xf) ++#define OPCODE_JSR(op) (((op) & 0xf0ff) == 0x400b) ++#define OPCODE_JSR_REG(op) (((op) >> 8) & 0xf) ++#define OPCODE_RTS(op) ((op) == 0xb) ++#define OPCODE_RTE(op) ((op) == 0x2b) ++ ++#define SR_T_BIT_MASK 0x1 ++#define STEP_OPCODE 0xc320 ++#define BIOS_CALL_TRAP 0x3f ++ ++/* Exception codes as per SH-4 core manual */ ++#define ADDRESS_ERROR_LOAD_VEC 7 ++#define ADDRESS_ERROR_STORE_VEC 8 ++#define TRAP_VEC 11 ++#define INVALID_INSN_VEC 12 ++#define INVALID_SLOT_VEC 13 ++#define NMI_VEC 14 ++#define SERIAL_BREAK_VEC 58 ++ ++/* Misc static */ ++static int stepped_address; ++static short stepped_opcode; ++ ++/* Translate SH-3/4 exception numbers to unix-like signal values */ ++static int compute_signal(const int excep_code) ++{ ++ switch (excep_code) { ++ case INVALID_INSN_VEC: ++ case INVALID_SLOT_VEC: ++ return SIGILL; ++ case ADDRESS_ERROR_LOAD_VEC: ++ case ADDRESS_ERROR_STORE_VEC: ++ return SIGSEGV; ++ case SERIAL_BREAK_VEC: ++ case NMI_VEC: ++ return SIGINT; ++ default: ++ /* Act like it was a break/trap. */ ++ return SIGTRAP; ++ } ++} ++ ++/* ++ * Translate the registers of the system into the format that GDB wants. Since ++ * we use a local structure to store things, instead of getting them out ++ * of pt_regs, we can just do a memcpy. 
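++ * (trap_registers is that local structure; kgdb_exception_handler below fills ++ * it from the stacked pt_regs before the generic stub calls back into here.)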
++ */ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *ign) ++{ ++ memcpy(gdb_regs, &trap_registers, sizeof(trap_registers)); ++} ++ ++/* ++ * On SH we save: r1 (prev->thread.sp) r2 (prev->thread.pc) r4 (prev) r5 (next) ++ * r6 (next->thread.sp) r7 (next->thread.pc) ++ */ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ int count; ++ ++ for (count = 0; count < 16; count++) ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = p->thread.pc; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++} ++ ++/* ++ * Translate the registers values that GDB has given us back into the ++ * format of the system. See the comment above about memcpy. ++ */ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *ign) ++{ ++ memcpy(&trap_registers, gdb_regs, sizeof(trap_registers)); ++} ++ ++/* Calculate the new address for after a step */ ++static short *get_step_address(void) ++{ ++ short op = *(short *)trap_registers.pc; ++ long addr; ++ ++ /* BT */ ++ if (OPCODE_BT(op)) { ++ if (trap_registers.sr & SR_T_BIT_MASK) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 2; ++ } ++ ++ /* BTS */ ++ else if (OPCODE_BTS(op)) { ++ if (trap_registers.sr & SR_T_BIT_MASK) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 4; /* Not in delay slot */ ++ } ++ ++ /* BF */ ++ else if (OPCODE_BF(op)) { ++ if (!(trap_registers.sr & SR_T_BIT_MASK)) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 2; ++ } ++ ++ /* BFS */ ++ else if (OPCODE_BFS(op)) { ++ if (!(trap_registers.sr & SR_T_BIT_MASK)) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 4; /* Not in delay slot */ ++ } ++ ++ /* BRA */ ++ else if (OPCODE_BRA(op)) ++ addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op); ++ ++ /* BRAF */ ++ else if (OPCODE_BRAF(op)) ++ addr = trap_registers.pc + 4 ++ + trap_registers.regs[OPCODE_BRAF_REG(op)]; ++ ++ /* BSR */ ++ else if (OPCODE_BSR(op)) ++ addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op); ++ ++ /* BSRF */ ++ else if (OPCODE_BSRF(op)) ++ addr = trap_registers.pc + 4 ++ + trap_registers.regs[OPCODE_BSRF_REG(op)]; ++ ++ /* JMP */ ++ else if (OPCODE_JMP(op)) ++ addr = trap_registers.regs[OPCODE_JMP_REG(op)]; ++ ++ /* JSR */ ++ else if (OPCODE_JSR(op)) ++ addr = trap_registers.regs[OPCODE_JSR_REG(op)]; ++ ++ /* RTS */ ++ else if (OPCODE_RTS(op)) ++ addr = trap_registers.pr; ++ ++ /* RTE */ ++ else if (OPCODE_RTE(op)) ++ addr = trap_registers.regs[15]; ++ ++ /* Other */ ++ else ++ addr = trap_registers.pc + 2; ++ ++ kgdb_flush_icache_range(addr, addr + 2); ++ return (short *)addr; ++} ++ ++/* The command loop, read and act on requests */ ++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ++ char *remcom_in_buffer, char *remcom_out_buffer, ++ struct pt_regs *ign) ++{ ++ unsigned long addr; ++ char *ptr = &remcom_in_buffer[1]; ++ ++ /* Examine first char of buffer to see what we need to do */ ++ switch (remcom_in_buffer[0]) { ++ case 'c': /* Continue at address AA..AA (optional) */ ++ case 's': /* Step one instruction from AA..AA */ ++ /* Try to read optional parameter, PC unchanged if none */ ++ if (kgdb_hex2long(&ptr, &addr)) ++ trap_registers.pc = addr; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ if (remcom_in_buffer[0] == 's') { ++ /* Replace the instruction immediately after the ++ * current instruction 
(i.e. next in the expected ++ * flow of control) with a trap instruction, so that ++ * returning will cause only a single instruction to ++ * be executed. Note that this model is slightly ++ * broken for instructions with delay slots ++ * (e.g. B[TF]S, BSR, BRA etc), where both the branch ++ * and the instruction in the delay slot will be ++ * executed. ++ */ ++ /* Determine where the target instruction will send ++ * us to */ ++ unsigned short *next_addr = get_step_address(); ++ stepped_address = (int)next_addr; ++ ++ /* Replace it */ ++ stepped_opcode = *(short *)next_addr; ++ *next_addr = STEP_OPCODE; ++ ++ /* Flush and return */ ++ kgdb_flush_icache_range((long)next_addr, ++ (long)next_addr + 2); ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ } ++ return 0; ++ } ++ return -1; ++} ++ ++/* ++ * When an exception has occurred, we are called. We need to set things ++ * up so that we can call kgdb_handle_exception to handle requests from ++ * the remote GDB. ++ */ ++void kgdb_exception_handler(struct pt_regs *regs) ++{ ++ int excep_code, vbr_val; ++ int count; ++ ++ /* Copy kernel regs (from stack) */ ++ for (count = 0; count < 16; count++) ++ trap_registers.regs[count] = regs->regs[count]; ++ trap_registers.pc = regs->pc; ++ trap_registers.pr = regs->pr; ++ trap_registers.sr = regs->sr; ++ trap_registers.gbr = regs->gbr; ++ trap_registers.mach = regs->mach; ++ trap_registers.macl = regs->macl; ++ ++ __asm__ __volatile__("stc vbr, %0":"=r"(vbr_val)); ++ trap_registers.vbr = vbr_val; ++ ++ /* Get the exception code. */ ++ __asm__ __volatile__("stc r2_bank, %0":"=r"(excep_code)); ++ ++ excep_code >>= 5; ++ ++ /* If we got an NMI, and KGDB is not yet initialized, call ++ * breakpoint() to try and initialize everything for us. */ ++ if (excep_code == NMI_VEC && !kgdb_initialized) { ++ breakpoint(); ++ return; ++ } ++ ++ /* TRAP_VEC exception indicates a software trap inserted in place of ++ * code by GDB so back up PC by one instruction, as this instruction ++ * will later be replaced by its original one. Do NOT do this for ++ * trap 0xff, since that indicates a compiled-in breakpoint which ++ * will not be replaced (and we would retake the trap forever) */ ++ if (excep_code == TRAP_VEC && ++ (*(volatile unsigned long *)TRA != (0xff << 2))) ++ trap_registers.pc -= 2; ++ ++ /* If we have been single-stepping, put back the old instruction. ++ * We use stepped_address in case we have stopped more than one ++ * instruction away. */ ++ if (stepped_opcode != 0) { ++ *(short *)stepped_address = stepped_opcode; ++ kgdb_flush_icache_range(stepped_address, stepped_address + 2); ++ } ++ stepped_opcode = 0; ++ ++ /* Call the stub to do the processing. Note that not everything we ++ * need to send back and forth lives in pt_regs.
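++ * (vbr, for example, lives only in trap_registers; it is read with ++ * "stc vbr" on entry and written back with "ldc" on the way out.)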
*/ ++ kgdb_handle_exception(excep_code, compute_signal(excep_code), 0, regs); ++ ++ /* Copy back the (maybe modified) registers */ ++ for (count = 0; count < 16; count++) ++ regs->regs[count] = trap_registers.regs[count]; ++ regs->pc = trap_registers.pc; ++ regs->pr = trap_registers.pr; ++ regs->sr = trap_registers.sr; ++ regs->gbr = trap_registers.gbr; ++ regs->mach = trap_registers.mach; ++ regs->macl = trap_registers.macl; ++ ++ vbr_val = trap_registers.vbr; ++ __asm__ __volatile__("ldc %0, vbr": :"r"(vbr_val)); ++} ++ ++int __init kgdb_arch_init(void) ++{ ++ per_cpu_trap_init(); ++ ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++#ifdef CONFIG_CPU_LITTLE_ENDIAN ++ .gdb_bpt_instr = {0xff, 0xc3}, ++#else ++ .gdb_bpt_instr = {0xc3, 0xff}, ++#endif ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_jmp.S linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_jmp.S +--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_jmp.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_jmp.S 1970-01-01 03:00:00.000000000 +0300 +@@ -1,33 +0,0 @@ +-#include +- +-ENTRY(setjmp) +- add #(9*4), r4 +- sts.l pr, @-r4 +- mov.l r15, @-r4 +- mov.l r14, @-r4 +- mov.l r13, @-r4 +- mov.l r12, @-r4 +- mov.l r11, @-r4 +- mov.l r10, @-r4 +- mov.l r9, @-r4 +- mov.l r8, @-r4 +- rts +- mov #0, r0 +- +-ENTRY(longjmp) +- mov.l @r4+, r8 +- mov.l @r4+, r9 +- mov.l @r4+, r10 +- mov.l @r4+, r11 +- mov.l @r4+, r12 +- mov.l @r4+, r13 +- mov.l @r4+, r14 +- mov.l @r4+, r15 +- lds.l @r4+, pr +- mov r5, r0 +- tst r0, r0 +- bf 1f +- mov #1, r0 ! in case val==0 +-1: rts +- nop +- +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_stub.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_stub.c +--- linux-2.6.18-53.1.14/arch/sh/kernel/kgdb_stub.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/kgdb_stub.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,1491 +0,0 @@ +-/* +- * May be copied or modified under the terms of the GNU General Public +- * License. See linux/COPYING for more information. +- * +- * Containes extracts from code by Glenn Engel, Jim Kingdon, +- * David Grothe , Tigran Aivazian , +- * Amit S. Kale , William Gatliff , +- * Ben Lee, Steve Chamberlain and Benoit Miller . +- * +- * This version by Henry Bell +- * Minor modifications by Jeremy Siegel +- * +- * Contains low-level support for remote debug using GDB. +- * +- * To enable debugger support, two things need to happen. A call to +- * set_debug_traps() is necessary in order to allow any breakpoints +- * or error conditions to be properly intercepted and reported to gdb. +- * A breakpoint also needs to be generated to begin communication. This +- * is most easily accomplished by a call to breakpoint() which does +- * a trapa if the initialisation phase has been successfully completed. +- * +- * In this case, set_debug_traps() is not used to "take over" exceptions; +- * other kernel code is modified instead to enter the kgdb functions here +- * when appropriate (see entry.S for breakpoint traps and NMI interrupts, +- * see traps.c for kernel error exceptions). 
+- * +- * The following gdb commands are supported: +- * +- * Command Function Return value +- * +- * g return the value of the CPU registers hex data or ENN +- * G set the value of the CPU registers OK or ENN +- * +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN +- * XAA..AA,LLLL: Same, but data is binary (not hex) OK or ENN +- * +- * c Resume at current address SNN ( signal NN) +- * cAA..AA Continue at address AA..AA SNN +- * CNN; Resume at current address with signal SNN +- * CNN;AA..AA Resume at address AA..AA with signal SNN +- * +- * s Step one instruction SNN +- * sAA..AA Step one instruction from AA..AA SNN +- * SNN; Step one instruction with signal SNN +- * SNNAA..AA Step one instruction from AA..AA w/NN SNN +- * +- * k kill (Detach GDB) +- * +- * d Toggle debug flag +- * D Detach GDB +- * +- * Hct Set thread t for operations, OK or ENN +- * c = 'c' (step, cont), c = 'g' (other +- * operations) +- * +- * qC Query current thread ID QCpid +- * qfThreadInfo Get list of current threads (first) m +- * qsThreadInfo " " " " " (subsequent) +- * qOffsets Get section offsets Text=x;Data=y;Bss=z +- * +- * TXX Find if thread XX is alive OK or ENN +- * ? What was the last sigval ? SNN (signal NN) +- * O Output to GDB console +- * +- * Remote communication protocol. +- * +- * A debug packet whose contents are is encapsulated for +- * transmission in the form: +- * +- * $ # CSUM1 CSUM2 +- * +- * must be ASCII alphanumeric and cannot include characters +- * '$' or '#'. If starts with two characters followed by +- * ':', then the existing stubs interpret this as a sequence number. +- * +- * CSUM1 and CSUM2 are ascii hex representation of an 8-bit +- * checksum of , the most significant nibble is sent first. +- * the hex digits 0-9,a-f are used. +- * +- * Receiver responds with: +- * +- * + - if CSUM is correct and ready for next packet +- * - - if CSUM is incorrect +- * +- * Responses can be run-length encoded to save space. A '*' means that +- * the next character is an ASCII encoding giving a repeat count which +- * stands for that many repititions of the character preceding the '*'. +- * The encoding is n+29, yielding a printable character where n >=3 +- * (which is where RLE starts to win). Don't use an n > 126. +- * +- * So "0* " means the same as "0000". +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_SH_KGDB_CONSOLE +-#include +-#endif +- +-/* Function pointers for linkage */ +-kgdb_debug_hook_t *kgdb_debug_hook; +-kgdb_bus_error_hook_t *kgdb_bus_err_hook; +- +-int (*kgdb_getchar)(void); +-void (*kgdb_putchar)(int); +- +-static void put_debug_char(int c) +-{ +- if (!kgdb_putchar) +- return; +- (*kgdb_putchar)(c); +-} +-static int get_debug_char(void) +-{ +- if (!kgdb_getchar) +- return -1; +- return (*kgdb_getchar)(); +-} +- +-/* Num chars in in/out bound buffers, register packets need NUMREGBYTES * 2 */ +-#define BUFMAX 1024 +-#define NUMREGBYTES (MAXREG*4) +-#define OUTBUFMAX (NUMREGBYTES*2+512) +- +-enum regs { +- R0 = 0, R1, R2, R3, R4, R5, R6, R7, +- R8, R9, R10, R11, R12, R13, R14, R15, +- PC, PR, GBR, VBR, MACH, MACL, SR, +- /* */ +- MAXREG +-}; +- +-static unsigned int registers[MAXREG]; +-struct kgdb_regs trap_registers; +- +-char kgdb_in_gdb_mode; +-char in_nmi; /* Set during NMI to prevent reentry */ +-int kgdb_nofault; /* Boolean to ignore bus errs (i.e. 
in GDB) */ +-int kgdb_enabled = 1; /* Default to enabled, cmdline can disable */ +-int kgdb_halt; +- +-/* Exposed for user access */ +-struct task_struct *kgdb_current; +-unsigned int kgdb_g_imask; +-int kgdb_trapa_val; +-int kgdb_excode; +- +-/* Default values for SCI (can override via kernel args in setup.c) */ +-#ifndef CONFIG_KGDB_DEFPORT +-#define CONFIG_KGDB_DEFPORT 1 +-#endif +- +-#ifndef CONFIG_KGDB_DEFBAUD +-#define CONFIG_KGDB_DEFBAUD 115200 +-#endif +- +-#if defined(CONFIG_KGDB_DEFPARITY_E) +-#define CONFIG_KGDB_DEFPARITY 'E' +-#elif defined(CONFIG_KGDB_DEFPARITY_O) +-#define CONFIG_KGDB_DEFPARITY 'O' +-#else /* CONFIG_KGDB_DEFPARITY_N */ +-#define CONFIG_KGDB_DEFPARITY 'N' +-#endif +- +-#ifdef CONFIG_KGDB_DEFBITS_7 +-#define CONFIG_KGDB_DEFBITS '7' +-#else /* CONFIG_KGDB_DEFBITS_8 */ +-#define CONFIG_KGDB_DEFBITS '8' +-#endif +- +-/* SCI/UART settings, used in kgdb_console_setup() */ +-int kgdb_portnum = CONFIG_KGDB_DEFPORT; +-int kgdb_baud = CONFIG_KGDB_DEFBAUD; +-char kgdb_parity = CONFIG_KGDB_DEFPARITY; +-char kgdb_bits = CONFIG_KGDB_DEFBITS; +- +-/* Jump buffer for setjmp/longjmp */ +-static jmp_buf rem_com_env; +- +-/* TRA differs sh3/4 */ +-#if defined(CONFIG_CPU_SH3) +-#define TRA 0xffffffd0 +-#elif defined(CONFIG_CPU_SH4) +-#define TRA 0xff000020 +-#endif +- +-/* Macros for single step instruction identification */ +-#define OPCODE_BT(op) (((op) & 0xff00) == 0x8900) +-#define OPCODE_BF(op) (((op) & 0xff00) == 0x8b00) +-#define OPCODE_BTF_DISP(op) (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \ +- (((op) & 0x7f ) << 1)) +-#define OPCODE_BFS(op) (((op) & 0xff00) == 0x8f00) +-#define OPCODE_BTS(op) (((op) & 0xff00) == 0x8d00) +-#define OPCODE_BRA(op) (((op) & 0xf000) == 0xa000) +-#define OPCODE_BRA_DISP(op) (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \ +- (((op) & 0x7ff) << 1)) +-#define OPCODE_BRAF(op) (((op) & 0xf0ff) == 0x0023) +-#define OPCODE_BRAF_REG(op) (((op) & 0x0f00) >> 8) +-#define OPCODE_BSR(op) (((op) & 0xf000) == 0xb000) +-#define OPCODE_BSR_DISP(op) (((op) & 0x800) ? 
(((op) | 0xfffff800) << 1) : \ +- (((op) & 0x7ff) << 1)) +-#define OPCODE_BSRF(op) (((op) & 0xf0ff) == 0x0003) +-#define OPCODE_BSRF_REG(op) (((op) >> 8) & 0xf) +-#define OPCODE_JMP(op) (((op) & 0xf0ff) == 0x402b) +-#define OPCODE_JMP_REG(op) (((op) >> 8) & 0xf) +-#define OPCODE_JSR(op) (((op) & 0xf0ff) == 0x400b) +-#define OPCODE_JSR_REG(op) (((op) >> 8) & 0xf) +-#define OPCODE_RTS(op) ((op) == 0xb) +-#define OPCODE_RTE(op) ((op) == 0x2b) +- +-#define SR_T_BIT_MASK 0x1 +-#define STEP_OPCODE 0xc320 +-#define BIOS_CALL_TRAP 0x3f +- +-/* Exception codes as per SH-4 core manual */ +-#define ADDRESS_ERROR_LOAD_VEC 7 +-#define ADDRESS_ERROR_STORE_VEC 8 +-#define TRAP_VEC 11 +-#define INVALID_INSN_VEC 12 +-#define INVALID_SLOT_VEC 13 +-#define NMI_VEC 14 +-#define USER_BREAK_VEC 15 +-#define SERIAL_BREAK_VEC 58 +- +-/* Misc static */ +-static int stepped_address; +-static short stepped_opcode; +-static const char hexchars[] = "0123456789abcdef"; +-static char in_buffer[BUFMAX]; +-static char out_buffer[OUTBUFMAX]; +- +-static void kgdb_to_gdb(const char *s); +- +-#ifdef CONFIG_KGDB_THREAD +-static struct task_struct *trapped_thread; +-static struct task_struct *current_thread; +-typedef unsigned char threadref[8]; +-#define BUF_THREAD_ID_SIZE 16 +-#endif +- +-/* Return addr as a real volatile address */ +-static inline unsigned int ctrl_inl(const unsigned long addr) +-{ +- return *(volatile unsigned long *) addr; +-} +- +-/* Correctly set *addr using volatile */ +-static inline void ctrl_outl(const unsigned int b, unsigned long addr) +-{ +- *(volatile unsigned long *) addr = b; +-} +- +-/* Get high hex bits */ +-static char highhex(const int x) +-{ +- return hexchars[(x >> 4) & 0xf]; +-} +- +-/* Get low hex bits */ +-static char lowhex(const int x) +-{ +- return hexchars[x & 0xf]; +-} +- +-/* Convert ch to hex */ +-static int hex(const char ch) +-{ +- if ((ch >= 'a') && (ch <= 'f')) +- return (ch - 'a' + 10); +- if ((ch >= '0') && (ch <= '9')) +- return (ch - '0'); +- if ((ch >= 'A') && (ch <= 'F')) +- return (ch - 'A' + 10); +- return (-1); +-} +- +-/* Convert the memory pointed to by mem into hex, placing result in buf. +- Returns a pointer to the last char put in buf (null) */ +-static char *mem_to_hex(const char *mem, char *buf, const int count) +-{ +- int i; +- int ch; +- unsigned short s_val; +- unsigned long l_val; +- +- /* Check for 16 or 32 */ +- if (count == 2 && ((long) mem & 1) == 0) { +- s_val = *(unsigned short *) mem; +- mem = (char *) &s_val; +- } else if (count == 4 && ((long) mem & 3) == 0) { +- l_val = *(unsigned long *) mem; +- mem = (char *) &l_val; +- } +- for (i = 0; i < count; i++) { +- ch = *mem++; +- *buf++ = highhex(ch); +- *buf++ = lowhex(ch); +- } +- *buf = 0; +- return (buf); +-} +- +-/* Convert the hex array pointed to by buf into binary, to be placed in mem. 
+- Return a pointer to the character after the last byte written */ +-static char *hex_to_mem(const char *buf, char *mem, const int count) +-{ +- int i; +- unsigned char ch; +- +- for (i = 0; i < count; i++) { +- ch = hex(*buf++) << 4; +- ch = ch + hex(*buf++); +- *mem++ = ch; +- } +- return (mem); +-} +- +-/* While finding valid hex chars, convert to an integer, then return it */ +-static int hex_to_int(char **ptr, int *int_value) +-{ +- int num_chars = 0; +- int hex_value; +- +- *int_value = 0; +- +- while (**ptr) { +- hex_value = hex(**ptr); +- if (hex_value >= 0) { +- *int_value = (*int_value << 4) | hex_value; +- num_chars++; +- } else +- break; +- (*ptr)++; +- } +- return num_chars; +-} +- +-/* Copy the binary array pointed to by buf into mem. Fix $, #, +- and 0x7d escaped with 0x7d. Return a pointer to the character +- after the last byte written. */ +-static char *ebin_to_mem(const char *buf, char *mem, int count) +-{ +- for (; count > 0; count--, buf++) { +- if (*buf == 0x7d) +- *mem++ = *(++buf) ^ 0x20; +- else +- *mem++ = *buf; +- } +- return mem; +-} +- +-/* Pack a hex byte */ +-static char *pack_hex_byte(char *pkt, int byte) +-{ +- *pkt++ = hexchars[(byte >> 4) & 0xf]; +- *pkt++ = hexchars[(byte & 0xf)]; +- return pkt; +-} +- +-#ifdef CONFIG_KGDB_THREAD +- +-/* Pack a thread ID */ +-static char *pack_threadid(char *pkt, threadref * id) +-{ +- char *limit; +- unsigned char *altid; +- +- altid = (unsigned char *) id; +- +- limit = pkt + BUF_THREAD_ID_SIZE; +- while (pkt < limit) +- pkt = pack_hex_byte(pkt, *altid++); +- return pkt; +-} +- +-/* Convert an integer into our threadref */ +-static void int_to_threadref(threadref * id, const int value) +-{ +- unsigned char *scan = (unsigned char *) id; +- int i = 4; +- +- while (i--) +- *scan++ = 0; +- +- *scan++ = (value >> 24) & 0xff; +- *scan++ = (value >> 16) & 0xff; +- *scan++ = (value >> 8) & 0xff; +- *scan++ = (value & 0xff); +-} +- +-/* Return a task structure ptr for a particular pid */ +-static struct task_struct *get_thread(int pid) +-{ +- struct task_struct *thread; +- +- /* Use PID_MAX w/gdb for pid 0 */ +- if (pid == PID_MAX) pid = 0; +- +- /* First check via PID */ +- thread = find_task_by_pid(pid); +- +- if (thread) +- return thread; +- +- /* Start at the start */ +- thread = init_tasks[0]; +- +- /* Walk along the linked list of tasks */ +- do { +- if (thread->pid == pid) +- return thread; +- thread = thread->next_task; +- } while (thread != init_tasks[0]); +- +- return NULL; +-} +- +-#endif /* CONFIG_KGDB_THREAD */ +- +-/* Scan for the start char '$', read the packet and check the checksum */ +-static void get_packet(char *buffer, int buflen) +-{ +- unsigned char checksum; +- unsigned char xmitcsum; +- int i; +- int count; +- char ch; +- +- do { +- /* Ignore everything until the start character */ +- while ((ch = get_debug_char()) != '$'); +- +- checksum = 0; +- xmitcsum = -1; +- count = 0; +- +- /* Now, read until a # or end of buffer is found */ +- while (count < (buflen - 1)) { +- ch = get_debug_char(); +- +- if (ch == '#') +- break; +- +- checksum = checksum + ch; +- buffer[count] = ch; +- count = count + 1; +- } +- +- buffer[count] = 0; +- +- /* Continue to read checksum following # */ +- if (ch == '#') { +- xmitcsum = hex(get_debug_char()) << 4; +- xmitcsum += hex(get_debug_char()); +- +- /* Checksum */ +- if (checksum != xmitcsum) +- put_debug_char('-'); /* Failed checksum */ +- else { +- /* Ack successful transfer */ +- put_debug_char('+'); +- +- /* If a sequence char is present, reply +- the sequence ID */ +- 
if (buffer[2] == ':') { +- put_debug_char(buffer[0]); +- put_debug_char(buffer[1]); +- +- /* Remove sequence chars from buffer */ +- count = strlen(buffer); +- for (i = 3; i <= count; i++) +- buffer[i - 3] = buffer[i]; +- } +- } +- } +- } +- while (checksum != xmitcsum); /* Keep trying while we fail */ +-} +- +-/* Send the packet in the buffer with run-length encoding */ +-static void put_packet(char *buffer) +-{ +- int checksum; +- char *src; +- int runlen; +- int encode; +- +- do { +- src = buffer; +- put_debug_char('$'); +- checksum = 0; +- +- /* Continue while we still have chars left */ +- while (*src) { +- /* Check for runs up to 99 chars long */ +- for (runlen = 1; runlen < 99; runlen++) { +- if (src[0] != src[runlen]) +- break; +- } +- +- if (runlen > 3) { +- /* Got a useful amount, send encoding */ +- encode = runlen + ' ' - 4; +- put_debug_char(*src); checksum += *src; +- put_debug_char('*'); checksum += '*'; +- put_debug_char(encode); checksum += encode; +- src += runlen; +- } else { +- /* Otherwise just send the current char */ +- put_debug_char(*src); checksum += *src; +- src += 1; +- } +- } +- +- /* '#' Separator, put high and low components of checksum */ +- put_debug_char('#'); +- put_debug_char(highhex(checksum)); +- put_debug_char(lowhex(checksum)); +- } +- while ((get_debug_char()) != '+'); /* While no ack */ +-} +- +-/* A bus error has occurred - perform a longjmp to return execution and +- allow handling of the error */ +-static void kgdb_handle_bus_error(void) +-{ +- longjmp(rem_com_env, 1); +-} +- +-/* Translate SH-3/4 exception numbers to unix-like signal values */ +-static int compute_signal(const int excep_code) +-{ +- int sigval; +- +- switch (excep_code) { +- +- case INVALID_INSN_VEC: +- case INVALID_SLOT_VEC: +- sigval = SIGILL; +- break; +- case ADDRESS_ERROR_LOAD_VEC: +- case ADDRESS_ERROR_STORE_VEC: +- sigval = SIGSEGV; +- break; +- +- case SERIAL_BREAK_VEC: +- case NMI_VEC: +- sigval = SIGINT; +- break; +- +- case USER_BREAK_VEC: +- case TRAP_VEC: +- sigval = SIGTRAP; +- break; +- +- default: +- sigval = SIGBUS; /* "software generated" */ +- break; +- } +- +- return (sigval); +-} +- +-/* Make a local copy of the registers passed into the handler (bletch) */ +-static void kgdb_regs_to_gdb_regs(const struct kgdb_regs *regs, +- int *gdb_regs) +-{ +- gdb_regs[R0] = regs->regs[R0]; +- gdb_regs[R1] = regs->regs[R1]; +- gdb_regs[R2] = regs->regs[R2]; +- gdb_regs[R3] = regs->regs[R3]; +- gdb_regs[R4] = regs->regs[R4]; +- gdb_regs[R5] = regs->regs[R5]; +- gdb_regs[R6] = regs->regs[R6]; +- gdb_regs[R7] = regs->regs[R7]; +- gdb_regs[R8] = regs->regs[R8]; +- gdb_regs[R9] = regs->regs[R9]; +- gdb_regs[R10] = regs->regs[R10]; +- gdb_regs[R11] = regs->regs[R11]; +- gdb_regs[R12] = regs->regs[R12]; +- gdb_regs[R13] = regs->regs[R13]; +- gdb_regs[R14] = regs->regs[R14]; +- gdb_regs[R15] = regs->regs[R15]; +- gdb_regs[PC] = regs->pc; +- gdb_regs[PR] = regs->pr; +- gdb_regs[GBR] = regs->gbr; +- gdb_regs[MACH] = regs->mach; +- gdb_regs[MACL] = regs->macl; +- gdb_regs[SR] = regs->sr; +- gdb_regs[VBR] = regs->vbr; +-} +- +-/* Copy local gdb registers back to kgdb regs, for later copy to kernel */ +-static void gdb_regs_to_kgdb_regs(const int *gdb_regs, +- struct kgdb_regs *regs) +-{ +- regs->regs[R0] = gdb_regs[R0]; +- regs->regs[R1] = gdb_regs[R1]; +- regs->regs[R2] = gdb_regs[R2]; +- regs->regs[R3] = gdb_regs[R3]; +- regs->regs[R4] = gdb_regs[R4]; +- regs->regs[R5] = gdb_regs[R5]; +- regs->regs[R6] = gdb_regs[R6]; +- regs->regs[R7] = gdb_regs[R7]; +- regs->regs[R8] = 
gdb_regs[R8]; +- regs->regs[R9] = gdb_regs[R9]; +- regs->regs[R10] = gdb_regs[R10]; +- regs->regs[R11] = gdb_regs[R11]; +- regs->regs[R12] = gdb_regs[R12]; +- regs->regs[R13] = gdb_regs[R13]; +- regs->regs[R14] = gdb_regs[R14]; +- regs->regs[R15] = gdb_regs[R15]; +- regs->pc = gdb_regs[PC]; +- regs->pr = gdb_regs[PR]; +- regs->gbr = gdb_regs[GBR]; +- regs->mach = gdb_regs[MACH]; +- regs->macl = gdb_regs[MACL]; +- regs->sr = gdb_regs[SR]; +- regs->vbr = gdb_regs[VBR]; +-} +- +-#ifdef CONFIG_KGDB_THREAD +-/* Make a local copy of registers from the specified thread */ +-asmlinkage void ret_from_fork(void); +-static void thread_regs_to_gdb_regs(const struct task_struct *thread, +- int *gdb_regs) +-{ +- int regno; +- int *tregs; +- +- /* Initialize to zero */ +- for (regno = 0; regno < MAXREG; regno++) +- gdb_regs[regno] = 0; +- +- /* Just making sure... */ +- if (thread == NULL) +- return; +- +- /* A new fork has pt_regs on the stack from a fork() call */ +- if (thread->thread.pc == (unsigned long)ret_from_fork) { +- +- int vbr_val; +- struct pt_regs *kregs; +- kregs = (struct pt_regs*)thread->thread.sp; +- +- gdb_regs[R0] = kregs->regs[R0]; +- gdb_regs[R1] = kregs->regs[R1]; +- gdb_regs[R2] = kregs->regs[R2]; +- gdb_regs[R3] = kregs->regs[R3]; +- gdb_regs[R4] = kregs->regs[R4]; +- gdb_regs[R5] = kregs->regs[R5]; +- gdb_regs[R6] = kregs->regs[R6]; +- gdb_regs[R7] = kregs->regs[R7]; +- gdb_regs[R8] = kregs->regs[R8]; +- gdb_regs[R9] = kregs->regs[R9]; +- gdb_regs[R10] = kregs->regs[R10]; +- gdb_regs[R11] = kregs->regs[R11]; +- gdb_regs[R12] = kregs->regs[R12]; +- gdb_regs[R13] = kregs->regs[R13]; +- gdb_regs[R14] = kregs->regs[R14]; +- gdb_regs[R15] = kregs->regs[R15]; +- gdb_regs[PC] = kregs->pc; +- gdb_regs[PR] = kregs->pr; +- gdb_regs[GBR] = kregs->gbr; +- gdb_regs[MACH] = kregs->mach; +- gdb_regs[MACL] = kregs->macl; +- gdb_regs[SR] = kregs->sr; +- +- asm("stc vbr, %0":"=r"(vbr_val)); +- gdb_regs[VBR] = vbr_val; +- return; +- } +- +- /* Otherwise, we have only some registers from switch_to() */ +- tregs = (int *)thread->thread.sp; +- gdb_regs[R15] = (int)tregs; +- gdb_regs[R14] = *tregs++; +- gdb_regs[R13] = *tregs++; +- gdb_regs[R12] = *tregs++; +- gdb_regs[R11] = *tregs++; +- gdb_regs[R10] = *tregs++; +- gdb_regs[R9] = *tregs++; +- gdb_regs[R8] = *tregs++; +- gdb_regs[PR] = *tregs++; +- gdb_regs[GBR] = *tregs++; +- gdb_regs[PC] = thread->thread.pc; +-} +-#endif /* CONFIG_KGDB_THREAD */ +- +-/* Calculate the new address for after a step */ +-static short *get_step_address(void) +-{ +- short op = *(short *) trap_registers.pc; +- long addr; +- +- /* BT */ +- if (OPCODE_BT(op)) { +- if (trap_registers.sr & SR_T_BIT_MASK) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 2; +- } +- +- /* BTS */ +- else if (OPCODE_BTS(op)) { +- if (trap_registers.sr & SR_T_BIT_MASK) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 4; /* Not in delay slot */ +- } +- +- /* BF */ +- else if (OPCODE_BF(op)) { +- if (!(trap_registers.sr & SR_T_BIT_MASK)) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 2; +- } +- +- /* BFS */ +- else if (OPCODE_BFS(op)) { +- if (!(trap_registers.sr & SR_T_BIT_MASK)) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 4; /* Not in delay slot */ +- } +- +- /* BRA */ +- else if (OPCODE_BRA(op)) +- addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op); +- +- /* BRAF */ +- else if (OPCODE_BRAF(op)) +- addr = 
trap_registers.pc + 4 +- + trap_registers.regs[OPCODE_BRAF_REG(op)]; +- +- /* BSR */ +- else if (OPCODE_BSR(op)) +- addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op); +- +- /* BSRF */ +- else if (OPCODE_BSRF(op)) +- addr = trap_registers.pc + 4 +- + trap_registers.regs[OPCODE_BSRF_REG(op)]; +- +- /* JMP */ +- else if (OPCODE_JMP(op)) +- addr = trap_registers.regs[OPCODE_JMP_REG(op)]; +- +- /* JSR */ +- else if (OPCODE_JSR(op)) +- addr = trap_registers.regs[OPCODE_JSR_REG(op)]; +- +- /* RTS */ +- else if (OPCODE_RTS(op)) +- addr = trap_registers.pr; +- +- /* RTE */ +- else if (OPCODE_RTE(op)) +- addr = trap_registers.regs[15]; +- +- /* Other */ +- else +- addr = trap_registers.pc + 2; +- +- kgdb_flush_icache_range(addr, addr + 2); +- return (short *) addr; +-} +- +-/* Set up a single-step. Replace the instruction immediately after the +- current instruction (i.e. next in the expected flow of control) with a +- trap instruction, so that returning will cause only a single instruction +- to be executed. Note that this model is slightly broken for instructions +- with delay slots (e.g. B[TF]S, BSR, BRA etc), where both the branch +- and the instruction in the delay slot will be executed. */ +-static void do_single_step(void) +-{ +- unsigned short *addr = 0; +- +- /* Determine where the target instruction will send us to */ +- addr = get_step_address(); +- stepped_address = (int)addr; +- +- /* Replace it */ +- stepped_opcode = *(short *)addr; +- *addr = STEP_OPCODE; +- +- /* Flush and return */ +- kgdb_flush_icache_range((long) addr, (long) addr + 2); +- return; +-} +- +-/* Undo a single step */ +-static void undo_single_step(void) +-{ +- /* If we have stepped, put back the old instruction */ +- /* Use stepped_address in case we stopped elsewhere */ +- if (stepped_opcode != 0) { +- *(short*)stepped_address = stepped_opcode; +- kgdb_flush_icache_range(stepped_address, stepped_address + 2); +- } +- stepped_opcode = 0; +-} +- +-/* Send a signal message */ +-static void send_signal_msg(const int signum) +-{ +-#ifndef CONFIG_KGDB_THREAD +- out_buffer[0] = 'S'; +- out_buffer[1] = highhex(signum); +- out_buffer[2] = lowhex(signum); +- out_buffer[3] = 0; +- put_packet(out_buffer); +-#else /* CONFIG_KGDB_THREAD */ +- int threadid; +- threadref thref; +- char *out = out_buffer; +- const char *tstring = "thread"; +- +- *out++ = 'T'; +- *out++ = highhex(signum); +- *out++ = lowhex(signum); +- +- while (*tstring) { +- *out++ = *tstring++; +- } +- *out++ = ':'; +- +- threadid = trapped_thread->pid; +- if (threadid == 0) threadid = PID_MAX; +- int_to_threadref(&thref, threadid); +- pack_threadid(out, &thref); +- out += BUF_THREAD_ID_SIZE; +- *out++ = ';'; +- +- *out = 0; +- put_packet(out_buffer); +-#endif /* CONFIG_KGDB_THREAD */ +-} +- +-/* Reply that all was well */ +-static void send_ok_msg(void) +-{ +- strcpy(out_buffer, "OK"); +- put_packet(out_buffer); +-} +- +-/* Reply that an error occurred */ +-static void send_err_msg(void) +-{ +- strcpy(out_buffer, "E01"); +- put_packet(out_buffer); +-} +- +-/* Empty message indicates unrecognised command */ +-static void send_empty_msg(void) +-{ +- put_packet(""); +-} +- +-/* Read memory due to 'm' message */ +-static void read_mem_msg(void) +-{ +- char *ptr; +- int addr; +- int length; +- +- /* Jmp, disable bus error handler */ +- if (setjmp(rem_com_env) == 0) { +- +- kgdb_nofault = 1; +- +- /* Walk through, have m, */ +- ptr = &in_buffer[1]; +- if (hex_to_int(&ptr, &addr) && (*ptr++ == ',')) +- if (hex_to_int(&ptr, &length)) { +- ptr = 0; +- if (length * 2 > 
OUTBUFMAX) +- length = OUTBUFMAX / 2; +- mem_to_hex((char *) addr, out_buffer, length); +- } +- if (ptr) +- send_err_msg(); +- else +- put_packet(out_buffer); +- } else +- send_err_msg(); +- +- /* Restore bus error handler */ +- kgdb_nofault = 0; +-} +- +-/* Write memory due to 'M' or 'X' message */ +-static void write_mem_msg(int binary) +-{ +- char *ptr; +- int addr; +- int length; +- +- if (setjmp(rem_com_env) == 0) { +- +- kgdb_nofault = 1; +- +- /* Walk through, have M,: */ +- ptr = &in_buffer[1]; +- if (hex_to_int(&ptr, &addr) && (*ptr++ == ',')) +- if (hex_to_int(&ptr, &length) && (*ptr++ == ':')) { +- if (binary) +- ebin_to_mem(ptr, (char*)addr, length); +- else +- hex_to_mem(ptr, (char*)addr, length); +- kgdb_flush_icache_range(addr, addr + length); +- ptr = 0; +- send_ok_msg(); +- } +- if (ptr) +- send_err_msg(); +- } else +- send_err_msg(); +- +- /* Restore bus error handler */ +- kgdb_nofault = 0; +-} +- +-/* Continue message */ +-static void continue_msg(void) +-{ +- /* Try to read optional parameter, PC unchanged if none */ +- char *ptr = &in_buffer[1]; +- int addr; +- +- if (hex_to_int(&ptr, &addr)) +- trap_registers.pc = addr; +-} +- +-/* Continue message with signal */ +-static void continue_with_sig_msg(void) +-{ +- int signal; +- char *ptr = &in_buffer[1]; +- int addr; +- +- /* Report limitation */ +- kgdb_to_gdb("Cannot force signal in kgdb, continuing anyway.\n"); +- +- /* Signal */ +- hex_to_int(&ptr, &signal); +- if (*ptr == ';') +- ptr++; +- +- /* Optional address */ +- if (hex_to_int(&ptr, &addr)) +- trap_registers.pc = addr; +-} +- +-/* Step message */ +-static void step_msg(void) +-{ +- continue_msg(); +- do_single_step(); +-} +- +-/* Step message with signal */ +-static void step_with_sig_msg(void) +-{ +- continue_with_sig_msg(); +- do_single_step(); +-} +- +-/* Send register contents */ +-static void send_regs_msg(void) +-{ +-#ifdef CONFIG_KGDB_THREAD +- if (!current_thread) +- kgdb_regs_to_gdb_regs(&trap_registers, registers); +- else +- thread_regs_to_gdb_regs(current_thread, registers); +-#else +- kgdb_regs_to_gdb_regs(&trap_registers, registers); +-#endif +- +- mem_to_hex((char *) registers, out_buffer, NUMREGBYTES); +- put_packet(out_buffer); +-} +- +-/* Set register contents - currently can't set other thread's registers */ +-static void set_regs_msg(void) +-{ +-#ifdef CONFIG_KGDB_THREAD +- if (!current_thread) { +-#endif +- kgdb_regs_to_gdb_regs(&trap_registers, registers); +- hex_to_mem(&in_buffer[1], (char *) registers, NUMREGBYTES); +- gdb_regs_to_kgdb_regs(registers, &trap_registers); +- send_ok_msg(); +-#ifdef CONFIG_KGDB_THREAD +- } else +- send_err_msg(); +-#endif +-} +- +- +-#ifdef CONFIG_KGDB_THREAD +- +-/* Set the status for a thread */ +-void set_thread_msg(void) +-{ +- int threadid; +- struct task_struct *thread = NULL; +- char *ptr; +- +- switch (in_buffer[1]) { +- +- /* To select which thread for gG etc messages, i.e. supported */ +- case 'g': +- +- ptr = &in_buffer[2]; +- hex_to_int(&ptr, &threadid); +- thread = get_thread(threadid); +- +- /* If we haven't found it */ +- if (!thread) { +- send_err_msg(); +- break; +- } +- +- /* Set current_thread (or not) */ +- if (thread == trapped_thread) +- current_thread = NULL; +- else +- current_thread = thread; +- send_ok_msg(); +- break; +- +- /* To select which thread for cCsS messages, i.e. unsupported */ +- case 'c': +- send_ok_msg(); +- break; +- +- default: +- send_empty_msg(); +- break; +- } +-} +- +-/* Is a thread alive? 
*/ +-static void thread_status_msg(void) +-{ +- char *ptr; +- int threadid; +- struct task_struct *thread = NULL; +- +- ptr = &in_buffer[1]; +- hex_to_int(&ptr, &threadid); +- thread = get_thread(threadid); +- if (thread) +- send_ok_msg(); +- else +- send_err_msg(); +-} +-/* Send the current thread ID */ +-static void thread_id_msg(void) +-{ +- int threadid; +- threadref thref; +- +- out_buffer[0] = 'Q'; +- out_buffer[1] = 'C'; +- +- if (current_thread) +- threadid = current_thread->pid; +- else if (trapped_thread) +- threadid = trapped_thread->pid; +- else /* Impossible, but just in case! */ +- { +- send_err_msg(); +- return; +- } +- +- /* Translate pid 0 to PID_MAX for gdb */ +- if (threadid == 0) threadid = PID_MAX; +- +- int_to_threadref(&thref, threadid); +- pack_threadid(out_buffer + 2, &thref); +- out_buffer[2 + BUF_THREAD_ID_SIZE] = '\0'; +- put_packet(out_buffer); +-} +- +-/* Send thread info */ +-static void thread_info_msg(void) +-{ +- struct task_struct *thread = NULL; +- int threadid; +- char *pos; +- threadref thref; +- +- /* Start with 'm' */ +- out_buffer[0] = 'm'; +- pos = &out_buffer[1]; +- +- /* For all possible thread IDs - this will overrun if > 44 threads! */ +- /* Start at 1 and include PID_MAX (since GDB won't use pid 0...) */ +- for (threadid = 1; threadid <= PID_MAX; threadid++) { +- +- read_lock(&tasklist_lock); +- thread = get_thread(threadid); +- read_unlock(&tasklist_lock); +- +- /* If it's a valid thread */ +- if (thread) { +- int_to_threadref(&thref, threadid); +- pack_threadid(pos, &thref); +- pos += BUF_THREAD_ID_SIZE; +- *pos++ = ','; +- } +- } +- *--pos = 0; /* Lose final comma */ +- put_packet(out_buffer); +- +-} +- +-/* Return printable info for gdb's 'info threads' command */ +-static void thread_extra_info_msg(void) +-{ +- int threadid; +- struct task_struct *thread = NULL; +- char buffer[20], *ptr; +- int i; +- +- /* Extract thread ID */ +- ptr = &in_buffer[17]; +- hex_to_int(&ptr, &threadid); +- thread = get_thread(threadid); +- +- /* If we don't recognise it, say so */ +- if (thread == NULL) +- strcpy(buffer, "(unknown)"); +- else +- strcpy(buffer, thread->comm); +- +- /* Construct packet */ +- for (i = 0, ptr = out_buffer; buffer[i]; i++) +- ptr = pack_hex_byte(ptr, buffer[i]); +- +- if (thread->thread.pc == (unsigned long)ret_from_fork) { +- strcpy(buffer, ""); +- for (i = 0; buffer[i]; i++) +- ptr = pack_hex_byte(ptr, buffer[i]); +- } +- +- *ptr = '\0'; +- put_packet(out_buffer); +-} +- +-/* Handle all qFooBarBaz messages - have to use an if statement as +- opposed to a switch because q messages can have > 1 char id. */ +-static void query_msg(void) +-{ +- const char *q_start = &in_buffer[1]; +- +- /* qC = return current thread ID */ +- if (strncmp(q_start, "C", 1) == 0) +- thread_id_msg(); +- +- /* qfThreadInfo = query all threads (first) */ +- else if (strncmp(q_start, "fThreadInfo", 11) == 0) +- thread_info_msg(); +- +- /* qsThreadInfo = query all threads (subsequent). We know we have sent +- them all after the qfThreadInfo message, so there are no to send */ +- else if (strncmp(q_start, "sThreadInfo", 11) == 0) +- put_packet("l"); /* el = last */ +- +- /* qThreadExtraInfo = supply printable information per thread */ +- else if (strncmp(q_start, "ThreadExtraInfo", 15) == 0) +- thread_extra_info_msg(); +- +- /* Unsupported - empty message as per spec */ +- else +- send_empty_msg(); +-} +-#endif /* CONFIG_KGDB_THREAD */ +- +-/* +- * Bring up the ports.. 
+- */ +-static int kgdb_serial_setup(void) +-{ +- extern int kgdb_console_setup(struct console *co, char *options); +- struct console dummy; +- +- kgdb_console_setup(&dummy, 0); +- +- return 0; +-} +- +-/* The command loop, read and act on requests */ +-static void kgdb_command_loop(const int excep_code, const int trapa_value) +-{ +- int sigval; +- +- if (excep_code == NMI_VEC) { +-#ifndef CONFIG_KGDB_NMI +- KGDB_PRINTK("Ignoring unexpected NMI?\n"); +- return; +-#else /* CONFIG_KGDB_NMI */ +- if (!kgdb_enabled) { +- kgdb_enabled = 1; +- kgdb_init(); +- } +-#endif /* CONFIG_KGDB_NMI */ +- } +- +- /* Ignore if we're disabled */ +- if (!kgdb_enabled) +- return; +- +-#ifdef CONFIG_KGDB_THREAD +- /* Until GDB specifies a thread */ +- current_thread = NULL; +- trapped_thread = current; +-#endif +- +- /* Enter GDB mode (e.g. after detach) */ +- if (!kgdb_in_gdb_mode) { +- /* Do serial setup, notify user, issue preemptive ack */ +- kgdb_serial_setup(); +- KGDB_PRINTK("Waiting for GDB (on %s%d at %d baud)\n", +- (kgdb_porttype ? kgdb_porttype->name : ""), +- kgdb_portnum, kgdb_baud); +- kgdb_in_gdb_mode = 1; +- put_debug_char('+'); +- } +- +- /* Reply to host that an exception has occurred */ +- sigval = compute_signal(excep_code); +- send_signal_msg(sigval); +- +- /* TRAP_VEC exception indicates a software trap inserted in place of +- code by GDB so back up PC by one instruction, as this instruction +- will later be replaced by its original one. Do NOT do this for +- trap 0xff, since that indicates a compiled-in breakpoint which +- will not be replaced (and we would retake the trap forever) */ +- if ((excep_code == TRAP_VEC) && (trapa_value != (0xff << 2))) { +- trap_registers.pc -= 2; +- } +- +- /* Undo any stepping we may have done */ +- undo_single_step(); +- +- while (1) { +- +- out_buffer[0] = 0; +- get_packet(in_buffer, BUFMAX); +- +- /* Examine first char of buffer to see what we need to do */ +- switch (in_buffer[0]) { +- +- case '?': /* Send which signal we've received */ +- send_signal_msg(sigval); +- break; +- +- case 'g': /* Return the values of the CPU registers */ +- send_regs_msg(); +- break; +- +- case 'G': /* Set the value of the CPU registers */ +- set_regs_msg(); +- break; +- +- case 'm': /* Read LLLL bytes address AA..AA */ +- read_mem_msg(); +- break; +- +- case 'M': /* Write LLLL bytes address AA..AA, ret OK */ +- write_mem_msg(0); /* 0 = data in hex */ +- break; +- +- case 'X': /* Write LLLL bytes esc bin address AA..AA */ +- if (kgdb_bits == '8') +- write_mem_msg(1); /* 1 = data in binary */ +- else +- send_empty_msg(); +- break; +- +- case 'C': /* Continue, signum included, we ignore it */ +- continue_with_sig_msg(); +- return; +- +- case 'c': /* Continue at address AA..AA (optional) */ +- continue_msg(); +- return; +- +- case 'S': /* Step, signum included, we ignore it */ +- step_with_sig_msg(); +- return; +- +- case 's': /* Step one instruction from AA..AA */ +- step_msg(); +- return; +- +-#ifdef CONFIG_KGDB_THREAD +- +- case 'H': /* Task related */ +- set_thread_msg(); +- break; +- +- case 'T': /* Query thread status */ +- thread_status_msg(); +- break; +- +- case 'q': /* Handle query - currently thread-related */ +- query_msg(); +- break; +-#endif +- +- case 'k': /* 'Kill the program' with a kernel ? */ +- break; +- +- case 'D': /* Detach from program, send reply OK */ +- kgdb_in_gdb_mode = 0; +- send_ok_msg(); +- get_debug_char(); +- return; +- +- default: +- send_empty_msg(); +- break; +- } +- } +-} +- +-/* There has been an exception, most likely a breakpoint. 
*/ +-void kgdb_handle_exception(struct pt_regs *regs) +-{ +- int excep_code, vbr_val; +- int count; +- int trapa_value = ctrl_inl(TRA); +- +- /* Copy kernel regs (from stack) */ +- for (count = 0; count < 16; count++) +- trap_registers.regs[count] = regs->regs[count]; +- trap_registers.pc = regs->pc; +- trap_registers.pr = regs->pr; +- trap_registers.sr = regs->sr; +- trap_registers.gbr = regs->gbr; +- trap_registers.mach = regs->mach; +- trap_registers.macl = regs->macl; +- +- asm("stc vbr, %0":"=r"(vbr_val)); +- trap_registers.vbr = vbr_val; +- +- /* Get excode for command loop call, user access */ +- asm("stc r2_bank, %0":"=r"(excep_code)); +- kgdb_excode = excep_code; +- +- /* Other interesting environment items for reference */ +- asm("stc r6_bank, %0":"=r"(kgdb_g_imask)); +- kgdb_current = current; +- kgdb_trapa_val = trapa_value; +- +- /* Act on the exception */ +- kgdb_command_loop(excep_code >> 5, trapa_value); +- +- kgdb_current = NULL; +- +- /* Copy back the (maybe modified) registers */ +- for (count = 0; count < 16; count++) +- regs->regs[count] = trap_registers.regs[count]; +- regs->pc = trap_registers.pc; +- regs->pr = trap_registers.pr; +- regs->sr = trap_registers.sr; +- regs->gbr = trap_registers.gbr; +- regs->mach = trap_registers.mach; +- regs->macl = trap_registers.macl; +- +- vbr_val = trap_registers.vbr; +- asm("ldc %0, vbr": :"r"(vbr_val)); +- +- return; +-} +- +-/* Trigger a breakpoint by function */ +-void breakpoint(void) +-{ +- if (!kgdb_enabled) { +- kgdb_enabled = 1; +- kgdb_init(); +- } +- BREAKPOINT(); +-} +- +-/* Initialise the KGDB data structures and serial configuration */ +-int kgdb_init(void) +-{ +- if (!kgdb_enabled) +- return 1; +- +- in_nmi = 0; +- kgdb_nofault = 0; +- stepped_opcode = 0; +- kgdb_in_gdb_mode = 0; +- +- if (kgdb_serial_setup() != 0) { +- KGDB_PRINTK("serial setup error\n"); +- return -1; +- } +- +- /* Init ptr to exception handler */ +- kgdb_debug_hook = kgdb_handle_exception; +- kgdb_bus_err_hook = kgdb_handle_bus_error; +- +- /* Enter kgdb now if requested, or just report init done */ +- if (kgdb_halt) { +- kgdb_in_gdb_mode = 1; +- put_debug_char('+'); +- breakpoint(); +- } +- else +- { +- KGDB_PRINTK("stub is initialized.\n"); +- } +- +- return 0; +-} +- +-/* Make function available for "user messages"; console will use it too. */ +- +-char gdbmsgbuf[BUFMAX]; +-#define MAXOUT ((BUFMAX-2)/2) +- +-static void kgdb_msg_write(const char *s, unsigned count) +-{ +- int i; +- int wcount; +- char *bufptr; +- +- /* 'O'utput */ +- gdbmsgbuf[0] = 'O'; +- +- /* Fill and send buffers... */ +- while (count > 0) { +- bufptr = gdbmsgbuf + 1; +- +- /* Calculate how many this time */ +- wcount = (count > MAXOUT) ? 
MAXOUT : count; +- +- /* Pack in hex chars */ +- for (i = 0; i < wcount; i++) +- bufptr = pack_hex_byte(bufptr, s[i]); +- *bufptr = '\0'; +- +- /* Move up */ +- s += wcount; +- count -= wcount; +- +- /* Write packet */ +- put_packet(gdbmsgbuf); +- } +-} +- +-static void kgdb_to_gdb(const char *s) +-{ +- kgdb_msg_write(s, strlen(s)); +-} +- +-#ifdef CONFIG_SH_KGDB_CONSOLE +-void kgdb_console_write(struct console *co, const char *s, unsigned count) +-{ +- /* Bail if we're not talking to GDB */ +- if (!kgdb_in_gdb_mode) +- return; +- +- kgdb_msg_write(s, count); +-} +-#endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/setup.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/setup.c +--- linux-2.6.18-53.1.14/arch/sh/kernel/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/setup.c 2008-06-10 15:38:50.000000000 +0400 +@@ -28,10 +28,6 @@ + #include + #include + +-#ifdef CONFIG_SH_KGDB +-#include +-static int kgdb_parse_options(char *options); +-#endif + extern void * __rd_start, * __rd_end; + /* + * Machine setup.. +@@ -528,93 +524,3 @@ struct seq_operations cpuinfo_op = { + .show = show_cpuinfo, + }; + #endif /* CONFIG_PROC_FS */ +- +-#ifdef CONFIG_SH_KGDB +-/* +- * Parse command-line kgdb options. By default KGDB is enabled, +- * entered on error (or other action) using default serial info. +- * The command-line option can include a serial port specification +- * and an action to override default or configured behavior. +- */ +-struct kgdb_sermap kgdb_sci_sermap = +-{ "ttySC", 5, kgdb_sci_setup, NULL }; +- +-struct kgdb_sermap *kgdb_serlist = &kgdb_sci_sermap; +-struct kgdb_sermap *kgdb_porttype = &kgdb_sci_sermap; +- +-void kgdb_register_sermap(struct kgdb_sermap *map) +-{ +- struct kgdb_sermap *last; +- +- for (last = kgdb_serlist; last->next; last = last->next) +- ; +- last->next = map; +- if (!map->namelen) { +- map->namelen = strlen(map->name); +- } +-} +- +-static int __init kgdb_parse_options(char *options) +-{ +- char c; +- int baud; +- +- /* Check for port spec (or use default) */ +- +- /* Determine port type and instance */ +- if (!memcmp(options, "tty", 3)) { +- struct kgdb_sermap *map = kgdb_serlist; +- +- while (map && memcmp(options, map->name, map->namelen)) +- map = map->next; +- +- if (!map) { +- KGDB_PRINTK("unknown port spec in %s\n", options); +- return -1; +- } +- +- kgdb_porttype = map; +- kgdb_serial_setup = map->setup_fn; +- kgdb_portnum = options[map->namelen] - '0'; +- options += map->namelen + 1; +- +- options = (*options == ',') ? options+1 : options; +- +- /* Read optional parameters (baud/parity/bits) */ +- baud = simple_strtoul(options, &options, 10); +- if (baud != 0) { +- kgdb_baud = baud; +- +- c = toupper(*options); +- if (c == 'E' || c == 'O' || c == 'N') { +- kgdb_parity = c; +- options++; +- } +- +- c = *options; +- if (c == '7' || c == '8') { +- kgdb_bits = c; +- options++; +- } +- options = (*options == ',') ? 
options+1 : options; +- } +- } +- +- /* Check for action specification */ +- if (!memcmp(options, "halt", 4)) { +- kgdb_halt = 1; +- options += 4; +- } else if (!memcmp(options, "disabled", 8)) { +- kgdb_enabled = 0; +- options += 8; +- } +- +- if (*options) { +- KGDB_PRINTK("ignored unknown options: %s\n", options); +- return 0; +- } +- return 1; +-} +-__setup("kgdb=", kgdb_parse_options); +-#endif /* CONFIG_SH_KGDB */ +- +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/time.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/time.c +--- linux-2.6.18-53.1.14/arch/sh/kernel/time.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/time.c 2008-06-10 15:38:50.000000000 +0400 +@@ -184,12 +184,4 @@ void __init time_init(void) + */ + sys_timer = get_sys_timer(); + printk(KERN_INFO "Using %s for system timer\n", sys_timer->name); +- +-#if defined(CONFIG_SH_KGDB) +- /* +- * Set up kgdb as requested. We do it here because the serial +- * init uses the timer vars we just set up for figuring baud. +- */ +- kgdb_init(); +-#endif + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/kernel/traps.c linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/traps.c +--- linux-2.6.18-53.1.14/arch/sh/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/kernel/traps.c 2008-06-10 15:38:50.000000000 +0400 +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -34,17 +35,8 @@ + #include + #include + +-#ifdef CONFIG_SH_KGDB +-#include +-#define CHK_REMOTE_DEBUG(regs) \ +-{ \ +- if ((kgdb_debug_hook != (kgdb_debug_hook_t *) NULL) && (!user_mode(regs))) \ +- { \ +- (*kgdb_debug_hook)(regs); \ +- } \ +-} +-#else +-#define CHK_REMOTE_DEBUG(regs) ++#ifndef CONFIG_KGDB ++#define kgdb_handle_exception(t, s, e, r) + #endif + + #define DO_ERROR(trapnr, signr, str, name, tsk) \ +@@ -65,7 +57,7 @@ asmlinkage void do_##name(unsigned long + local_irq_enable(); \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ +- CHK_REMOTE_DEBUG(®s); \ ++ kgdb_handle_exception(trapnr, signr, error_code, ®s); \ + force_sig(signr, tsk); \ + die_if_no_fixup(str,®s,error_code); \ + } +@@ -92,10 +84,12 @@ void die(const char * str, struct pt_reg + { + static int die_counter; + ++#ifdef CONFIG_KGDB ++ kgdb_handle_exception(1, SIGTRAP, err, regs); ++#endif + console_verbose(); + spin_lock_irq(&die_lock); + printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); +- CHK_REMOTE_DEBUG(regs); + show_regs(regs); + spin_unlock_irq(&die_lock); + do_exit(SIGSEGV); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/mm/extable.c linux-2.6.18-53.1.14.kgdb/arch/sh/mm/extable.c +--- linux-2.6.18-53.1.14/arch/sh/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/mm/extable.c 2008-06-10 15:38:50.000000000 +0400 +@@ -5,6 +5,7 @@ + */ + + #include ++#include + #include + + int fixup_exception(struct pt_regs *regs) +@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs + regs->pc = fixup->fixup; + return 1; + } ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Never reached. 
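++ kgdb_fault_longjmp() unwinds straight back to the state saved by
++ kgdb_fault_setjmp(), so a fault taken while the stub probes memory
++ on GDB's behalf resumes in the stub instead of oopsing the kernel.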
*/ ++#endif + + return 0; + } +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/mm/fault-nommu.c linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault-nommu.c +--- linux-2.6.18-53.1.14/arch/sh/mm/fault-nommu.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault-nommu.c 2008-06-10 15:38:50.000000000 +0400 +@@ -29,10 +29,6 @@ + #include + #include + +-#if defined(CONFIG_SH_KGDB) +-#include +-#endif +- + extern void die(const char *,struct pt_regs *,long); + + /* +@@ -43,11 +39,6 @@ extern void die(const char *,struct pt_r + asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, + unsigned long address) + { +-#if defined(CONFIG_SH_KGDB) +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. +@@ -69,11 +60,6 @@ asmlinkage void do_page_fault(struct pt_ + asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, + unsigned long address) + { +-#if defined(CONFIG_SH_KGDB) +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + if (address >= TASK_SIZE) + return 1; + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/sh/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault.c +--- linux-2.6.18-53.1.14/arch/sh/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/sh/mm/fault.c 2008-06-10 15:38:50.000000000 +0400 +@@ -28,7 +28,6 @@ + #include + #include + #include +-#include + + extern void die(const char *,struct pt_regs *,long); + +@@ -45,11 +44,6 @@ asmlinkage void do_page_fault(struct pt_ + struct vm_area_struct * vma; + unsigned long page; + +-#ifdef CONFIG_SH_KGDB +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + tsk = current; + mm = tsk->mm; + +@@ -153,6 +147,7 @@ no_context: + } + die("Oops", regs, writeaccess); + do_exit(SIGKILL); ++ dump_stack(); + + /* + * We ran out of memory, or some other thing happened to us that made +@@ -202,11 +197,6 @@ asmlinkage int __do_page_fault(struct pt + spinlock_t *ptl; + int ret = 1; + +-#ifdef CONFIG_SH_KGDB +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + #ifdef CONFIG_SH_STORE_QUEUES + addrmax = P4SEG_STORE_QUE + 0x04000000; + #endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/Kconfig.debug linux-2.6.18-53.1.14.kgdb/arch/x86_64/Kconfig.debug +--- linux-2.6.18-53.1.14/arch/x86_64/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/Kconfig.debug 2008-06-10 15:38:44.000000000 +0400 +@@ -55,7 +55,4 @@ config DEBUG_STACK_USAGE + + This option will slow down process creation somewhat. 
+ +-#config X86_REMOTE_DEBUG +-# bool "kgdb debugging stub" +- + endmenu +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/Makefile linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/Makefile +--- linux-2.6.18-53.1.14/arch/x86_64/kernel/Makefile 2008-03-06 05:54:48.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/Makefile 2008-06-10 15:38:44.000000000 +0400 +@@ -35,6 +35,7 @@ obj-$(CONFIG_IOMMU) += pci-gart.o apert + obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o + obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o + obj-$(CONFIG_KPROBES) += kprobes.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o + obj-$(CONFIG_X86_VSMP) += vsmp.o + obj-$(CONFIG_K8_NB) += k8.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/entry.S linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/entry.S +--- linux-2.6.18-53.1.14/arch/x86_64/kernel/entry.S 2008-03-06 05:54:50.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/entry.S 2008-06-10 15:39:01.000000000 +0400 +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + .code64 + +@@ -887,6 +888,7 @@ error_exit: + RESTORE_ARGS 0,8,0 + jmp iret_label + CFI_ENDPROC ++ CFI_END_FRAME(kernel_thread) + + error_kernelspace: + incl %ebx +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb-jmp.S linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb-jmp.S +--- linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb-jmp.S 2008-06-10 15:38:44.000000000 +0400 +@@ -0,0 +1,65 @@ ++/* ++ * arch/x86_64/kernel/kgdb-jmp.S ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Author: Tom Rini ++ * ++ * Cribbed from glibc, which carries the following: ++ * Copyright (C) 2001, 2003, 2004 Free Software Foundation, Inc. ++ * Copyright (C) 2005 by MontaVista Software. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. ++ */ ++ ++#include ++ ++#define JB_RBX 0 ++#define JB_RBP 1 ++#define JB_R12 2 ++#define JB_R13 3 ++#define JB_R14 4 ++#define JB_R15 5 ++#define JB_RSP 6 ++#define JB_PC 7 ++ ++ .code64 ++ ++/* This must be called prior to kgdb_fault_longjmp and ++ * kgdb_fault_longjmp must not be called outside of the context of the ++ * last call to kgdb_fault_setjmp. ++ */ ++ENTRY(kgdb_fault_setjmp) ++ /* Save registers. */ ++ movq %rbx, (JB_RBX*8)(%rdi) ++ movq %rbp, (JB_RBP*8)(%rdi) ++ movq %r12, (JB_R12*8)(%rdi) ++ movq %r13, (JB_R13*8)(%rdi) ++ movq %r14, (JB_R14*8)(%rdi) ++ movq %r15, (JB_R15*8)(%rdi) ++ leaq 8(%rsp), %rdx /* Save SP as it will be after we return. */ ++ movq %rdx, (JB_RSP*8)(%rdi) ++ movq (%rsp), %rax /* Save PC we are returning to now. */ ++ movq %rax, (JB_PC*8)(%rdi) ++ /* Set return value for setjmp. */ ++ mov $0,%eax ++ movq (JB_PC*8)(%rdi),%rdx ++ movq (JB_RSP*8)(%rdi),%rsp ++ jmpq *%rdx ++ ++ENTRY(kgdb_fault_longjmp) ++ /* Restore registers. */ ++ movq (JB_RBX*8)(%rdi),%rbx ++ movq (JB_RBP*8)(%rdi),%rbp ++ movq (JB_R12*8)(%rdi),%r12 ++ movq (JB_R13*8)(%rdi),%r13 ++ movq (JB_R14*8)(%rdi),%r14 ++ movq (JB_R15*8)(%rdi),%r15 ++ /* Set return value for setjmp. 
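++ Loading %eax with 1 below and jumping through the saved PC makes
++ the original kgdb_fault_setjmp() call appear to return a second
++ time, now with a non-zero value that tells the caller a fault was
++ caught (the setjmp path above returns 0 through the same jump).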
*/ ++ movq (JB_PC*8)(%rdi),%rdx ++ movq (JB_RSP*8)(%rdi),%rsp ++ mov $1,%eax ++ jmpq *%rdx +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb.c +--- linux-2.6.18-53.1.14/arch/x86_64/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/kernel/kgdb.c 2008-06-10 15:38:44.000000000 +0400 +@@ -0,0 +1,474 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (C) 2004 Amit S. Kale ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ * Copyright (C) 2002 Andi Kleen, SuSE Labs ++ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd. ++ */ ++/**************************************************************************** ++ * Contributor: Lake Stevens Instrument Division$ ++ * Written by: Glenn Engel $ ++ * Updated by: Amit Kale ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Origianl kgdb, compatibility with 2.1.xx kernel by ++ * David Grothe ++ * Integrated into 2.2.5 kernel by Tigran Aivazian ++ * X86_64 changes from Andi Kleen's patch merged by Jim Houston ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Put the error code here just in case the user cares. */ ++int gdb_x86_64errcode; ++/* Likewise, the vector number here (since GDB only gets the signal ++ number through the usual means, and that's not very specific). 
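++ For example, a debug exception (vector 1) and an int3 breakpoint
++ (vector 3) both reach GDB as SIGTRAP, so saving the raw vector here
++ lets a watchpoint hit be told apart from a software breakpoint.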
*/ ++int gdb_x86_64vector = -1; ++ ++extern atomic_t cpu_doing_single_step; ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ gdb_regs[_RAX] = regs->rax; ++ gdb_regs[_RBX] = regs->rbx; ++ gdb_regs[_RCX] = regs->rcx; ++ gdb_regs[_RDX] = regs->rdx; ++ gdb_regs[_RSI] = regs->rsi; ++ gdb_regs[_RDI] = regs->rdi; ++ gdb_regs[_RBP] = regs->rbp; ++ gdb_regs[_PS] = regs->eflags; ++ gdb_regs[_PC] = regs->rip; ++ gdb_regs[_R8] = regs->r8; ++ gdb_regs[_R9] = regs->r9; ++ gdb_regs[_R10] = regs->r10; ++ gdb_regs[_R11] = regs->r11; ++ gdb_regs[_R12] = regs->r12; ++ gdb_regs[_R13] = regs->r13; ++ gdb_regs[_R14] = regs->r14; ++ gdb_regs[_R15] = regs->r15; ++ gdb_regs[_RSP] = regs->rsp; ++} ++ ++extern void thread_return(void); ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ gdb_regs[_RAX] = 0; ++ gdb_regs[_RBX] = 0; ++ gdb_regs[_RCX] = 0; ++ gdb_regs[_RDX] = 0; ++ gdb_regs[_RSI] = 0; ++ gdb_regs[_RDI] = 0; ++ gdb_regs[_RBP] = *(unsigned long *)p->thread.rsp; ++ gdb_regs[_PS] = *(unsigned long *)(p->thread.rsp + 8); ++ gdb_regs[_PC] = (unsigned long)&thread_return; ++ gdb_regs[_R8] = 0; ++ gdb_regs[_R9] = 0; ++ gdb_regs[_R10] = 0; ++ gdb_regs[_R11] = 0; ++ gdb_regs[_R12] = 0; ++ gdb_regs[_R13] = 0; ++ gdb_regs[_R14] = 0; ++ gdb_regs[_R15] = 0; ++ gdb_regs[_RSP] = p->thread.rsp; ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ regs->rax = gdb_regs[_RAX]; ++ regs->rbx = gdb_regs[_RBX]; ++ regs->rcx = gdb_regs[_RCX]; ++ regs->rdx = gdb_regs[_RDX]; ++ regs->rsi = gdb_regs[_RSI]; ++ regs->rdi = gdb_regs[_RDI]; ++ regs->rbp = gdb_regs[_RBP]; ++ regs->eflags = gdb_regs[_PS]; ++ regs->rip = gdb_regs[_PC]; ++ regs->r8 = gdb_regs[_R8]; ++ regs->r9 = gdb_regs[_R9]; ++ regs->r10 = gdb_regs[_R10]; ++ regs->r11 = gdb_regs[_R11]; ++ regs->r12 = gdb_regs[_R12]; ++ regs->r13 = gdb_regs[_R13]; ++ regs->r14 = gdb_regs[_R14]; ++ regs->r15 = gdb_regs[_R15]; ++#if 0 /* can't change these */ ++ regs->rsp = gdb_regs[_RSP]; ++ regs->ss = gdb_regs[_SS]; ++ regs->fs = gdb_regs[_FS]; ++ regs->gs = gdb_regs[_GS]; ++#endif ++ ++} /* gdb_regs_to_regs */ ++ ++struct hw_breakpoint { ++ unsigned enabled; ++ unsigned type; ++ unsigned len; ++ unsigned long addr; ++} breakinfo[4] = { { ++enabled:0}, { ++enabled:0}, { ++enabled:0}, { ++enabled:0}}; ++ ++void kgdb_correct_hw_break(void) ++{ ++ int breakno; ++ int correctit; ++ int breakbit; ++ unsigned long dr7; ++ ++ asm volatile ("movq %%db7, %0\n":"=r" (dr7):); ++ do { ++ unsigned long addr0, addr1, addr2, addr3; ++ asm volatile ("movq %%db0, %0\n" ++ "movq %%db1, %1\n" ++ "movq %%db2, %2\n" ++ "movq %%db3, %3\n":"=r" (addr0), "=r"(addr1), ++ "=r"(addr2), "=r"(addr3):); ++ } while (0); ++ correctit = 0; ++ for (breakno = 0; breakno < 3; breakno++) { ++ breakbit = 2 << (breakno << 1); ++ if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { ++ correctit = 1; ++ dr7 |= breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ dr7 |= (((breakinfo[breakno].len << 2) | ++ breakinfo[breakno].type) << 16) << ++ (breakno << 2); ++ switch (breakno) { ++ case 0: ++ asm volatile ("movq %0, %%dr0\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 1: ++ asm volatile ("movq %0, %%dr1\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 2: ++ asm volatile ("movq %0, %%dr2\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 3: ++ asm volatile ("movq %0, %%dr3\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ } ++ } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { ++ 
correctit = 1; ++ dr7 &= ~breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ } ++ } ++ if (correctit) { ++ asm volatile ("movq %0, %%db7\n"::"r" (dr7)); ++ } ++} ++ ++int kgdb_remove_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (breakinfo[i].addr == addr && breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 0; ++ return 0; ++} ++ ++int kgdb_set_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (!breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 1; ++ breakinfo[idx].type = 1; ++ breakinfo[idx].len = 1; ++ breakinfo[idx].addr = addr; ++ return 0; ++} ++ ++int remove_hw_break(unsigned breakno) ++{ ++ if (!breakinfo[breakno].enabled) { ++ return -1; ++ } ++ breakinfo[breakno].enabled = 0; ++ return 0; ++} ++ ++int set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) ++{ ++ if (breakinfo[breakno].enabled) { ++ return -1; ++ } ++ breakinfo[breakno].enabled = 1; ++ breakinfo[breakno].type = type; ++ breakinfo[breakno].len = len; ++ breakinfo[breakno].addr = addr; ++ return 0; ++} ++ ++void kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++ /* Disable hardware debugging while we are in kgdb */ ++ asm volatile ("movq %0,%%db7": /* no output */ :"r" (0UL)); ++} ++ ++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code) ++{ ++ /* Master processor is completely in the debugger */ ++ gdb_x86_64vector = e_vector; ++ gdb_x86_64errcode = err_code; ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++ ++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ++ char *remcomInBuffer, char *remcomOutBuffer, ++ struct pt_regs *linux_regs) ++{ ++ unsigned long addr, length; ++ unsigned long breakno, breaktype; ++ char *ptr; ++ int newPC; ++ unsigned long dr6; ++ ++ switch (remcomInBuffer[0]) { ++ case 'c': ++ case 's': ++ /* try to read optional parameter, pc unchanged if no parm */ ++ ptr = &remcomInBuffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) ++ linux_regs->rip = addr; ++ newPC = linux_regs->rip; ++ ++ /* clear the trace bit */ ++ linux_regs->eflags &= ~TF_MASK; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ /* set the trace bit if we're stepping */ ++ if (remcomInBuffer[0] == 's') { ++ linux_regs->eflags |= TF_MASK; ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ ++ } ++ ++ asm volatile ("movq %%db6, %0\n":"=r" (dr6)); ++ if (!(dr6 & 0x4000)) { ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno)) { ++ if (breakinfo[breakno].type == 0) { ++ /* Set restore flag */ ++ linux_regs->eflags |= ++ X86_EFLAGS_RF; ++ break; ++ } ++ } ++ } ++ } ++ kgdb_correct_hw_break(); ++ asm volatile ("movq %0, %%db6\n"::"r" (0UL)); ++ ++ return (0); ++ ++ case 'Y': ++ ptr = &remcomInBuffer[1]; ++ kgdb_hex2long(&ptr, &breakno); ++ ptr++; ++ kgdb_hex2long(&ptr, &breaktype); ++ ptr++; ++ kgdb_hex2long(&ptr, &length); ++ ptr++; ++ kgdb_hex2long(&ptr, &addr); ++ if (set_hw_break(breakno & 0x3, breaktype & 0x3, ++ length & 0x3, addr) == 0) ++ strcpy(remcomOutBuffer, "OK"); ++ else ++ strcpy(remcomOutBuffer, "ERROR"); ++ break; ++ ++ /* Remove hardware breakpoint */ ++ case 'y': ++ ptr = &remcomInBuffer[1]; ++ kgdb_hex2long(&ptr, &breakno); ++ if (remove_hw_break(breakno & 0x3) == 0) ++ strcpy(remcomOutBuffer, "OK"); ++ else ++ 
strcpy(remcomOutBuffer, "ERROR"); ++ break; ++ ++ } /* switch */ ++ return -1; ++} ++ ++static struct pt_regs *in_interrupt_stack(unsigned long rsp, int cpu) ++{ ++ struct pt_regs *regs; ++ unsigned long end = (unsigned long)cpu_pda(cpu)->irqstackptr; ++ if (rsp <= end && rsp >= end - IRQSTACKSIZE + 8) { ++ regs = *(((struct pt_regs **)end) - 1); ++ return regs; ++ } ++ return NULL; ++} ++ ++static struct pt_regs *in_exception_stack(unsigned long rsp, int cpu) ++{ ++ int i; ++ struct tss_struct *init_tss = &__get_cpu_var(init_tss); ++ for (i = 0; i < N_EXCEPTION_STACKS; i++) ++ if (rsp >= init_tss[cpu].ist[i] && ++ rsp <= init_tss[cpu].ist[i] + EXCEPTION_STKSZ) { ++ struct pt_regs *r = ++ (void *)init_tss[cpu].ist[i] + EXCEPTION_STKSZ; ++ return r - 1; ++ } ++ return NULL; ++} ++ ++void kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid) ++{ ++ static char intr_desc[] = "Stack at interrupt entrypoint"; ++ static char exc_desc[] = "Stack at exception entrypoint"; ++ struct pt_regs *stregs; ++ int cpu = hard_smp_processor_id(); ++ ++ if ((stregs = in_interrupt_stack(regs->rsp, cpu))) ++ kgdb_mem2hex(intr_desc, buffer, strlen(intr_desc)); ++ else if ((stregs = in_exception_stack(regs->rsp, cpu))) ++ kgdb_mem2hex(exc_desc, buffer, strlen(exc_desc)); ++} ++ ++struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs, int threadid) ++{ ++ struct pt_regs *stregs; ++ int cpu = hard_smp_processor_id(); ++ ++ if ((stregs = in_interrupt_stack(regs->rsp, cpu))) ++ return current; ++ else if ((stregs = in_exception_stack(regs->rsp, cpu))) ++ return current; ++ ++ return NULL; ++} ++ ++struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid) ++{ ++ struct pt_regs *stregs; ++ int cpu = hard_smp_processor_id(); ++ ++ if ((stregs = in_interrupt_stack(regs->rsp, cpu))) ++ return stregs; ++ else if ((stregs = in_exception_stack(regs->rsp, cpu))) ++ return stregs; ++ ++ return NULL; ++} ++ ++/* Register KGDB with the die_chain so that we hook into all of the right ++ * spots. */ ++static int kgdb_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = ptr; ++ struct pt_regs *regs = args->regs; ++ ++ if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active) ++ && kgdb_may_fault) { ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ return NOTIFY_STOP; ++ /* CPU roundup? */ ++ } else if (atomic_read(&debugger_active) && cmd == DIE_NMI_IPI) { ++ kgdb_nmihook(smp_processor_id(), regs); ++ return NOTIFY_STOP; ++ /* See if KGDB is interested. */ ++ } else if (cmd == DIE_PAGE_FAULT || user_mode(regs) || ++ cmd == DIE_NMI_IPI || (cmd == DIE_DEBUG && ++ atomic_read(&debugger_active))) ++ /* Userpace events, normal watchdog event, or spurious ++ * debug exception. Ignore. */ ++ return NOTIFY_DONE; ++ ++ kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); ++ ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_notify, ++ .priority = 0x7fffffff, /* we need to notified first */ ++}; ++ ++int kgdb_arch_init(void) ++{ ++ atomic_notifier_chain_register(&die_chain, &kgdb_notifier); ++ return 0; ++} ++/* ++ * Skip an int3 exception when it occurs after a breakpoint has been ++ * removed. Backtrack eip by 1 since the int3 would have caused it to ++ * increment by 1. 
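++ * Example: if the 0xcc planted at address A is removed while its trap
++ * is already in flight, the exception still reports rip == A + 1;
++ * rewinding rip to A re-executes the restored original byte at A.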
++ */ ++ ++int kgdb_skipexception(int exception, struct pt_regs *regs) ++{ ++ if (exception == 3 && kgdb_isremovedbreak(regs->rip - 1)) { ++ regs->rip -= 1; ++ return 1; ++ } ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0xcc}, ++ .flags = KGDB_HW_BREAKPOINT, ++ .shadowth = 1, ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/arch/x86_64/mm/fault.c linux-2.6.18-53.1.14.kgdb/arch/x86_64/mm/fault.c +--- linux-2.6.18-53.1.14/arch/x86_64/mm/fault.c 2008-03-06 05:54:27.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/arch/x86_64/mm/fault.c 2008-06-10 15:38:41.000000000 +0400 +@@ -557,6 +557,10 @@ no_context: + if (is_errata93(regs, address)) + return; + ++ if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs, ++ error_code, 14, SIGSEGV) == NOTIFY_STOP) ++ return; ++ + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/char/keyboard.c linux-2.6.18-53.1.14.kgdb/drivers/char/keyboard.c +--- linux-2.6.18-53.1.14/drivers/char/keyboard.c 2008-03-06 05:54:23.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/char/keyboard.c 2008-06-10 15:39:11.000000000 +0400 +@@ -1174,6 +1174,7 @@ static void kbd_keycode(unsigned int key + sysrq_down = 0; + if (sysrq_down && down && !rep) { + handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); ++ sysrq_down = 0; /* In case we miss the 'up' event. */ + return; + } + #endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/net/Makefile linux-2.6.18-53.1.14.kgdb/drivers/net/Makefile +--- linux-2.6.18-53.1.14/drivers/net/Makefile 2008-03-06 05:54:59.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/net/Makefile 2008-06-10 15:37:55.000000000 +0400 +@@ -221,6 +221,7 @@ obj-$(CONFIG_ETRAX_ETHERNET) += cris/ + obj-$(CONFIG_ENP2611_MSF_NET) += ixp2000/ + + obj-$(CONFIG_NETCONSOLE) += netconsole.o ++obj-$(CONFIG_KGDBOE) += kgdboe.o + + obj-$(CONFIG_FS_ENET) += fs_enet/ + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/net/kgdboe.c linux-2.6.18-53.1.14.kgdb/drivers/net/kgdboe.c +--- linux-2.6.18-53.1.14/drivers/net/kgdboe.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/net/kgdboe.c 2008-06-10 15:37:55.000000000 +0400 +@@ -0,0 +1,294 @@ ++/* ++ * drivers/net/kgdboe.c ++ * ++ * A network interface for GDB. ++ * Based upon 'gdbserial' by David Grothe ++ * and Scott Foehner ++ * ++ * Maintainers: Amit S. Kale and ++ * Tom Rini ++ * ++ * 2004 (c) Amit S. Kale ++ * 2004-2005 (c) MontaVista Software, Inc. ++ * 2005 (c) Wind River Systems, Inc. ++ * ++ * Contributors at various stages not listed above: ++ * San Mehat , Robert Walsh , ++ * wangdi , Matt Mackall , ++ * Pavel Machek , Jason Wessel ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define IN_BUF_SIZE 512 /* power of 2, please */ ++#define NOT_CONFIGURED_STRING "not_configured" ++#define OUT_BUF_SIZE 30 /* We don't want to send too big of a packet. */ ++#define MAX_KGDBOE_CONFIG_STR 256 ++ ++static char in_buf[IN_BUF_SIZE], out_buf[OUT_BUF_SIZE]; ++static int in_head, in_tail, out_count; ++static atomic_t in_count; ++/* 0 = unconfigured, 1 = netpoll options parsed, 2 = fully configured. 
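++ Only state 2 means netpoll_setup() has actually run, which is why
++ the reconfiguration path below calls netpoll_cleanup() from state 2
++ alone.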
*/ ++static int configured; ++static struct kgdb_io local_kgdb_io_ops; ++static int use_dynamic_mac; ++ ++MODULE_DESCRIPTION("KGDB driver for network interfaces"); ++MODULE_LICENSE("GPL"); ++static char config[MAX_KGDBOE_CONFIG_STR] = NOT_CONFIGURED_STRING; ++static struct kparam_string kps = { ++ .string = config, ++ .maxlen = MAX_KGDBOE_CONFIG_STR, ++}; ++ ++static void rx_hook(struct netpoll *np, int port, char *msg, int len, ++ struct sk_buff *skb) ++{ ++ int i; ++ ++ np->remote_port = port; ++ ++ /* Copy the MAC address if we need to. */ ++ if (use_dynamic_mac) { ++ memcpy(np->remote_mac, eth_hdr(skb)->h_source, ++ sizeof(np->remote_mac)); ++ use_dynamic_mac = 0; ++ } ++ ++ /* ++ * This could be GDB trying to attach. But it could also be GDB ++ * finishing up a session, with kgdb_connected=0 but GDB sending ++ * an ACK for the final packet. To make sure we don't try and ++ * make a breakpoint when GDB is leaving, make sure that if ++ * !kgdb_connected the only len == 1 packet we allow is ^C. ++ */ ++ if (!kgdb_connected && (len != 1 || msg[0] == 3) && ++ !atomic_read(&kgdb_setting_breakpoint)) { ++ tasklet_schedule(&kgdb_tasklet_breakpoint); ++ } ++ ++ for (i = 0; i < len; i++) { ++ if (msg[i] == 3) ++ tasklet_schedule(&kgdb_tasklet_breakpoint); ++ ++ if (atomic_read(&in_count) >= IN_BUF_SIZE) { ++ /* buffer overflow, clear it */ ++ in_head = in_tail = 0; ++ atomic_set(&in_count, 0); ++ break; ++ } ++ in_buf[in_head++] = msg[i]; ++ in_head &= (IN_BUF_SIZE - 1); ++ atomic_inc(&in_count); ++ } ++} ++ ++static struct netpoll np = { ++ .dev_name = "eth0", ++ .name = "kgdboe", ++ .rx_hook = rx_hook, ++ .local_port = 6443, ++ .remote_port = 6442, ++ .remote_mac = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ++}; ++ ++static void eth_pre_exception_handler(void) ++{ ++ /* Increment the module count when the debugger is active */ ++ if (!kgdb_connected) ++ try_module_get(THIS_MODULE); ++ netpoll_set_trap(1); ++} ++ ++static void eth_post_exception_handler(void) ++{ ++ /* decrement the module count when the debugger detaches */ ++ if (!kgdb_connected) ++ module_put(THIS_MODULE); ++ netpoll_set_trap(0); ++} ++ ++static int eth_get_char(void) ++{ ++ int chr; ++ ++ while (atomic_read(&in_count) == 0) ++ netpoll_poll(&np); ++ ++ chr = in_buf[in_tail++]; ++ in_tail &= (IN_BUF_SIZE - 1); ++ atomic_dec(&in_count); ++ return chr; ++} ++ ++static void eth_flush_buf(void) ++{ ++ if (out_count && np.dev) { ++ netpoll_send_udp(&np, out_buf, out_count); ++ memset(out_buf, 0, sizeof(out_buf)); ++ out_count = 0; ++ } ++} ++ ++static void eth_put_char(u8 chr) ++{ ++ out_buf[out_count++] = chr; ++ if (out_count == OUT_BUF_SIZE) ++ eth_flush_buf(); ++} ++ ++static int option_setup(char *opt) ++{ ++ char opt_scratch[MAX_KGDBOE_CONFIG_STR]; ++ ++ /* If we're being given a new configuration, copy it in. */ ++ if (opt != config) ++ strcpy(config, opt); ++ /* But work on a copy as netpoll_parse_options will eat it. */ ++ strcpy(opt_scratch, opt); ++ configured = !netpoll_parse_options(&np, opt_scratch); ++ ++ use_dynamic_mac = 1; ++ ++ return 0; ++} ++__setup("kgdboe=", option_setup); ++ ++/* With our config string set by some means, configure kgdboe. */ ++static int configure_kgdboe(void) ++{ ++ /* Try out the string. */ ++ option_setup(config); ++ ++ if (!configured) { ++ printk(KERN_ERR "kgdboe: configuration incorrect - kgdboe not " ++ "loaded.\n"); ++ printk(KERN_ERR " Usage: kgdboe=[src-port]@[src-ip]/[dev]," ++ "[tgt-port]@/\n"); ++ return -EINVAL; ++ } ++ ++ /* Bring it up. 
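++ netpoll_setup() binds to the configured interface and arms rx_hook;
++ it typically fails when the named device does not exist or is not
++ up yet, in which case kgdboe is left not fully configured.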
*/ ++ if (netpoll_setup(&np)) { ++ printk(KERN_ERR "kgdboe: netpoll_setup failed kgdboe failed\n"); ++ return -EINVAL; ++ } ++ ++ if (kgdb_register_io_module(&local_kgdb_io_ops)) { ++ netpoll_cleanup(&np); ++ return -EINVAL; ++ } ++ ++ configured = 2; ++ ++ return 0; ++} ++ ++static int init_kgdboe(void) ++{ ++ int ret; ++ ++ /* Already done? */ ++ if (configured == 2) ++ return 0; ++ ++ /* OK, go ahead and do it. */ ++ ret = configure_kgdboe(); ++ ++ if (configured == 2) ++ printk(KERN_INFO "kgdboe: debugging over ethernet enabled\n"); ++ ++ return ret; ++} ++ ++static void cleanup_kgdboe(void) ++{ ++ netpoll_cleanup(&np); ++ configured = 0; ++ kgdb_unregister_io_module(&local_kgdb_io_ops); ++} ++ ++static int param_set_kgdboe_var(const char *kmessage, struct kernel_param *kp) ++{ ++ char kmessage_save[MAX_KGDBOE_CONFIG_STR]; ++ int msg_len = strlen(kmessage); ++ ++ if (msg_len + 1 > MAX_KGDBOE_CONFIG_STR) { ++ printk(KERN_ERR "%s: string doesn't fit in %u chars.\n", ++ kp->name, MAX_KGDBOE_CONFIG_STR - 1); ++ return -ENOSPC; ++ } ++ ++ if (kgdb_connected) { ++ printk(KERN_ERR "kgdboe: Cannot reconfigure while KGDB is " ++ "connected.\n"); ++ return 0; ++ } ++ ++ /* Start the reconfiguration process by saving the old string */ ++ strncpy(kmessage_save, config, sizeof(kmessage_save)); ++ ++ ++ /* Copy in the new param and strip out invalid characters so we ++ * can optionally specify the MAC. ++ */ ++ strncpy(config, kmessage, sizeof(config)); ++ msg_len--; ++ while (msg_len > 0 && ++ (config[msg_len] < ',' || config[msg_len] > 'f')) { ++ config[msg_len] = '\0'; ++ msg_len--; ++ } ++ ++ /* Check to see if we are unconfiguring the io module and that it ++ * was in a fully configured state, as this is the only time that ++ * netpoll_cleanup should get called ++ */ ++ if (configured == 2 && strcmp(config, NOT_CONFIGURED_STRING) == 0) { ++ printk(KERN_INFO "kgdboe: reverting to unconfigured state\n"); ++ cleanup_kgdboe(); ++ return 0; ++ } else ++ /* Go and configure with the new params. */ ++ configure_kgdboe(); ++ ++ if (configured == 2) ++ return 0; ++ ++ /* If the new string was invalid, revert to the previous state, which ++ * is at a minimum not_configured. */ ++ strncpy(config, kmessage_save, sizeof(config)); ++ if (strcmp(kmessage_save, NOT_CONFIGURED_STRING) != 0) { ++ printk(KERN_INFO "kgdboe: reverting to prior configuration\n"); ++ /* revert back to the original config */ ++ strncpy(config, kmessage_save, sizeof(config)); ++ configure_kgdboe(); ++ } ++ return 0; ++} ++ ++static struct kgdb_io local_kgdb_io_ops = { ++ .read_char = eth_get_char, ++ .write_char = eth_put_char, ++ .init = init_kgdboe, ++ .flush = eth_flush_buf, ++ .pre_exception = eth_pre_exception_handler, ++ .post_exception = eth_post_exception_handler ++}; ++ ++module_init(init_kgdboe); ++module_exit(cleanup_kgdboe); ++module_param_call(kgdboe, param_set_kgdboe_var, param_get_string, &kps, 0644); ++MODULE_PARM_DESC(kgdboe, " kgdboe=[src-port]@[src-ip]/[dev]," ++ "[tgt-port]@/\n"); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/8250.c linux-2.6.18-53.1.14.kgdb/drivers/serial/8250.c +--- linux-2.6.18-53.1.14/drivers/serial/8250.c 2008-03-06 05:54:43.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/8250.c 2008-06-10 15:37:43.000000000 +0400 +@@ -2656,6 +2656,25 @@ void serial8250_unregister_port(int line + } + EXPORT_SYMBOL(serial8250_unregister_port); + ++/** ++ * serial8250_unregister_by_port - remove a 16x50 serial port ++ * at runtime. 
++ * @port: A &struct uart_port that describes the port to remove. ++ * ++ * Remove one serial port. This may not be called from interrupt ++ * context. We hand the port back to the our control. ++ */ ++void serial8250_unregister_by_port(struct uart_port *port) ++{ ++ struct uart_8250_port *uart; ++ ++ uart = serial8250_find_match_or_unused(port); ++ ++ if (uart) ++ serial8250_unregister_port(uart->port.line); ++} ++EXPORT_SYMBOL(serial8250_unregister_by_port); ++ + static int __init serial8250_init(void) + { + int ret, i; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/8250_kgdb.c linux-2.6.18-53.1.14.kgdb/drivers/serial/8250_kgdb.c +--- linux-2.6.18-53.1.14/drivers/serial/8250_kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/8250_kgdb.c 2008-06-10 15:37:43.000000000 +0400 +@@ -0,0 +1,516 @@ ++/* ++ * 8250 interface for kgdb. ++ * ++ * This is a merging of many different drivers, and all of the people have ++ * had an impact in some form or another: ++ * ++ * 2004-2005 (c) MontaVista Software, Inc. ++ * 2005-2006 (c) Wind River Systems, Inc. ++ * ++ * Amit Kale , David Grothe , ++ * Scott Foehner , George Anzinger , ++ * Robert Walsh , wangdi , ++ * San Mehat, Tom Rini , ++ * Jason Wessel ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include /* For BASE_BAUD and SERIAL_PORT_DFNS */ ++ ++#include "8250.h" ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++MODULE_DESCRIPTION("KGDB driver for the 8250"); ++MODULE_LICENSE("GPL"); ++/* These will conflict with early_param otherwise. */ ++#ifdef CONFIG_KGDB_8250_MODULE ++static char config[256]; ++module_param_string(kgdb8250, config, 256, 0); ++MODULE_PARM_DESC(kgdb8250, ++ " kgdb8250=,
,,\n"); ++static struct kgdb_io local_kgdb_io_ops; ++#endif /* CONFIG_KGDB_8250_MODULE */ ++ ++/* Speed of the UART. */ ++static int kgdb8250_baud; ++ ++/* Flag for if we need to call request_mem_region */ ++static int kgdb8250_needs_request_mem_region; ++ ++static char kgdb8250_buf[GDB_BUF_SIZE]; ++static atomic_t kgdb8250_buf_in_cnt; ++static int kgdb8250_buf_out_inx; ++ ++/* Old-style serial definitions, if existant, and a counter. */ ++#ifdef CONFIG_KGDB_SIMPLE_SERIAL ++static int __initdata should_copy_rs_table = 1; ++static struct serial_state old_rs_table[] __initdata = { ++#ifdef SERIAL_PORT_DFNS ++ SERIAL_PORT_DFNS ++#endif ++}; ++#endif ++ ++/* Our internal table of UARTS. */ ++#define UART_NR CONFIG_SERIAL_8250_NR_UARTS ++static struct uart_port kgdb8250_ports[UART_NR]; ++ ++static struct uart_port *current_port; ++ ++/* Base of the UART. */ ++static void *kgdb8250_addr; ++ ++/* Forward declarations. */ ++static int kgdb8250_uart_init(void); ++static int __init kgdb_init_io(void); ++static int __init kgdb8250_opt(char *str); ++ ++/* These are much shorter calls to ioread8/iowrite8 that take into ++ * account our shifts, etc. */ ++static inline unsigned int kgdb_ioread(u8 mask) ++{ ++ return ioread8(kgdb8250_addr + (mask << current_port->regshift)); ++} ++ ++static inline void kgdb_iowrite(u8 val, u8 mask) ++{ ++ iowrite8(val, kgdb8250_addr + (mask << current_port->regshift)); ++} ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void kgdb_put_debug_char(u8 chr) ++{ ++ while (!(kgdb_ioread(UART_LSR) & UART_LSR_THRE)) ; ++ ++ kgdb_iowrite(chr, UART_TX); ++} ++ ++/* ++ * Get a byte from the hardware data buffer and return it ++ */ ++static int read_data_bfr(void) ++{ ++ char it = kgdb_ioread(UART_LSR); ++ ++ if (it & UART_LSR_DR) ++ return kgdb_ioread(UART_RX); ++ ++ /* ++ * If we have a framing error assume somebody messed with ++ * our uart. Reprogram it and send '-' both ways... ++ */ ++ if (it & 0xc) { ++ kgdb8250_uart_init(); ++ kgdb_put_debug_char('-'); ++ return '-'; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ */ ++static int kgdb_get_debug_char(void) ++{ ++ int retchr; ++ ++ /* intr routine has q'd chars */ ++ if (atomic_read(&kgdb8250_buf_in_cnt) != 0) { ++ retchr = kgdb8250_buf[kgdb8250_buf_out_inx++]; ++ kgdb8250_buf_out_inx &= (GDB_BUF_SIZE - 1); ++ atomic_dec(&kgdb8250_buf_in_cnt); ++ return retchr; ++ } ++ ++ do { ++ retchr = read_data_bfr(); ++ } while (retchr < 0); ++ ++ return retchr; ++} ++ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * All that we need to do is verify that the interrupt happened on the ++ * line we're in charge of. If this is true, schedule a breakpoint and ++ * return. ++ */ ++static irqreturn_t ++kgdb8250_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ if (kgdb_ioread(UART_IIR) & UART_IIR_RDI) { ++ /* Throw away the data if another I/O routine is active. */ ++ if (kgdb_io_ops.read_char != kgdb_get_debug_char && ++ (kgdb_ioread(UART_LSR) & UART_LSR_DR)) ++ kgdb_ioread(UART_RX); ++ else ++ breakpoint(); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++/* ++ * Initializes the UART. ++ * Returns: ++ * 0 on success, 1 on failure. ++ */ ++static int ++kgdb8250_uart_init (void) ++{ ++ unsigned int ier, base_baud = current_port->uartclk ? 
++ current_port->uartclk / 16 : BASE_BAUD; ++ ++ /* test uart existence */ ++ if (kgdb_ioread(UART_LSR) == 0xff) ++ return -1; ++ ++ /* disable interrupts */ ++ kgdb_iowrite(0, UART_IER); ++ ++#if defined(CONFIG_ARCH_OMAP1510) ++ /* Workaround to enable 115200 baud on OMAP1510 internal ports */ ++ if (cpu_is_omap1510() && is_omap_port((void *)kgdb8250_addr)) { ++ if (kgdb8250_baud == 115200) { ++ base_baud = 1; ++ kgdb8250_baud = 1; ++ kgdb_iowrite(1, UART_OMAP_OSC_12M_SEL); ++ } else ++ kgdb_iowrite(0, UART_OMAP_OSC_12M_SEL); ++ } ++#endif ++ /* set DLAB */ ++ kgdb_iowrite(UART_LCR_DLAB, UART_LCR); ++ ++ /* set baud */ ++ kgdb_iowrite((base_baud / kgdb8250_baud) & 0xff, UART_DLL); ++ kgdb_iowrite((base_baud / kgdb8250_baud) >> 8, UART_DLM); ++ ++ /* reset DLAB, set LCR */ ++ kgdb_iowrite(UART_LCR_WLEN8, UART_LCR); ++ ++ /* set DTR and RTS */ ++ kgdb_iowrite(UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS, UART_MCR); ++ ++ /* setup fifo */ ++ kgdb_iowrite(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR ++ | UART_FCR_CLEAR_XMIT | UART_FCR_TRIGGER_8, ++ UART_FCR); ++ ++ /* clear pending interrupts */ ++ kgdb_ioread(UART_IIR); ++ kgdb_ioread(UART_RX); ++ kgdb_ioread(UART_LSR); ++ kgdb_ioread(UART_MSR); ++ ++ /* turn on RX interrupt only */ ++ kgdb_iowrite(UART_IER_RDI, UART_IER); ++ ++ /* ++ * Borrowed from the main 8250 driver. ++ * Try writing and reading the UART_IER_UUE bit (b6). ++ * If it works, this is probably one of the Xscale platform's ++ * internal UARTs. ++ * We're going to explicitly set the UUE bit to 0 before ++ * trying to write and read a 1 just to make sure it's not ++ * already a 1 and maybe locked there before we even start. ++ */ ++ ier = kgdb_ioread(UART_IER); ++ kgdb_iowrite(ier & ~UART_IER_UUE, UART_IER); ++ if (!(kgdb_ioread(UART_IER) & UART_IER_UUE)) { ++ /* ++ * OK it's in a known zero state, try writing and reading ++ * without disturbing the current state of the other bits. ++ */ ++ kgdb_iowrite(ier | UART_IER_UUE, UART_IER); ++ if (kgdb_ioread(UART_IER) & UART_IER_UUE) ++ /* ++ * It's an Xscale. ++ */ ++ ier |= UART_IER_UUE | UART_IER_RTOIE; ++ } ++ kgdb_iowrite(ier, UART_IER); ++ return 0; ++} ++ ++/* ++ * Copy the old serial_state table to our uart_port table if we haven't ++ * had values specifically configured in. We need to make sure this only ++ * happens once. ++ */ ++static void __init kgdb8250_copy_rs_table(void) ++{ ++#ifdef CONFIG_KGDB_SIMPLE_SERIAL ++ int i; ++ ++ if (!should_copy_rs_table) ++ return; ++ ++ for (i = 0; i < ARRAY_SIZE(old_rs_table); i++) { ++ kgdb8250_ports[i].iobase = old_rs_table[i].port; ++ kgdb8250_ports[i].irq = irq_canonicalize(old_rs_table[i].irq); ++ kgdb8250_ports[i].uartclk = old_rs_table[i].baud_base * 16; ++ kgdb8250_ports[i].membase = old_rs_table[i].iomem_base; ++ kgdb8250_ports[i].iotype = old_rs_table[i].io_type; ++ kgdb8250_ports[i].regshift = old_rs_table[i].iomem_reg_shift; ++ kgdb8250_ports[i].line = i; ++ } ++ ++ should_copy_rs_table = 0; ++#endif ++} ++ ++/* ++ * Hook up our IRQ line now that it is safe to do so, after we grab any ++ * memory regions we might need to. If we haven't been initialized yet, ++ * go ahead and copy the old_rs_table in. ++ */ ++static void __init kgdb8250_late_init(void) ++{ ++ /* Try and copy the old_rs_table. */ ++ kgdb8250_copy_rs_table(); ++ ++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE) ++ /* Take the port away from the main driver. */ ++ serial8250_unregister_by_port(current_port); ++ ++ /* Now reinit the port as the above has disabled things.
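
The kgdb8250_uart_init() routine above is the classic 16550 divisor-latch sequence. As a minimal standalone sketch of just the baud programming (assuming a memory-mapped port with no regshift, and the register constants from <linux/serial_reg.h>; uart16550_set_baud() is a hypothetical helper, not part of this patch):

	#include <linux/io.h>
	#include <linux/serial_reg.h>

	static void uart16550_set_baud(void __iomem *base, unsigned int uartclk,
				       unsigned int baud)
	{
		/* 16x oversampling: divisor = input clock / (16 * baud) */
		unsigned int div = uartclk / 16 / baud;

		iowrite8(UART_LCR_DLAB, base + UART_LCR);	/* expose divisor latch */
		iowrite8(div & 0xff, base + UART_DLL);		/* divisor low byte */
		iowrite8(div >> 8, base + UART_DLM);		/* divisor high byte */
		iowrite8(UART_LCR_WLEN8, base + UART_LCR);	/* 8N1, DLAB off again */
	}

The driver above computes the same divisor as base_baud / kgdb8250_baud, with base_baud already pre-divided by 16.
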
*/ ++ kgdb8250_uart_init(); ++#endif ++ /* We may need to call request_mem_region() first. */ ++ if (kgdb8250_needs_request_mem_region) ++ request_mem_region(current_port->mapbase, ++ 8 << current_port->regshift, "kgdb"); ++ if (request_irq(current_port->irq, kgdb8250_interrupt, SA_SHIRQ, ++ "GDB-stub", current_port) < 0) ++ printk(KERN_ERR "KGDB failed to request the serial IRQ (%d)\n", ++ current_port->irq); ++} ++ ++static __init int kgdb_init_io(void) ++{ ++ /* Give us the basic table of uarts. */ ++ kgdb8250_copy_rs_table(); ++ ++ /* We're either a module and parse a config string, or we have a ++ * semi-static config. */ ++#ifdef CONFIG_KGDB_8250_MODULE ++ if (strlen(config)) { ++ if (kgdb8250_opt(config)) ++ return -EINVAL; ++ } else { ++ printk(KERN_ERR "kgdb8250: argument error, usage: " ++ "kgdb8250=,
,,\n"); ++ return -EINVAL; ++ } ++#elif defined(CONFIG_KGDB_SIMPLE_SERIAL) ++ kgdb8250_baud = CONFIG_KGDB_BAUDRATE; ++ ++ /* Setup our pointer to the serial port now. */ ++ current_port = &kgdb8250_ports[CONFIG_KGDB_PORT_NUM]; ++#else ++ if (kgdb8250_opt(CONFIG_KGDB_8250_CONF_STRING)) ++ return -EINVAL; ++#endif ++ ++ ++ /* Internal driver setup. */ ++ switch (current_port->iotype) { ++ case UPIO_MEM: ++ if (current_port->mapbase) ++ kgdb8250_needs_request_mem_region = 1; ++ if (current_port->flags & UPF_IOREMAP) { ++ current_port->membase = ioremap(current_port->mapbase, ++ 8 << current_port->regshift); ++ if (!current_port->membase) ++ return -EIO; /* Failed. */ ++ } ++ kgdb8250_addr = current_port->membase; ++ break; ++ case UPIO_PORT: ++ default: ++ kgdb8250_addr = ioport_map(current_port->iobase, ++ 8 << current_port->regshift); ++ if (!kgdb8250_addr) ++ return -EIO; /* Failed. */ ++ } ++ ++ if (kgdb8250_uart_init() == -1) { ++ printk(KERN_ERR "kgdb8250: init failed\n"); ++ return -EIO; ++ } ++#ifdef CONFIG_KGDB_8250_MODULE ++ /* Attach the kgdb irq. When this is built into the kernel, it ++ * is called as a part of late_init sequence. ++ */ ++ kgdb8250_late_init(); ++ if (kgdb_register_io_module(&local_kgdb_io_ops)) ++ return -EINVAL; ++ ++ printk(KERN_INFO "kgdb8250: debugging enabled\n"); ++#endif /* CONFIG_KGD_8250_MODULE */ ++ ++ return 0; ++} ++ ++#ifdef CONFIG_KGDB_8250_MODULE ++/* If it is a module the kgdb_io_ops should be a static which ++ * is passed to the KGDB I/O initialization ++ */ ++static struct kgdb_io local_kgdb_io_ops = { ++#else /* ! CONFIG_KGDB_8250_MODULE */ ++struct kgdb_io kgdb_io_ops = { ++#endif /* ! CONFIG_KGD_8250_MODULE */ ++ .read_char = kgdb_get_debug_char, ++ .write_char = kgdb_put_debug_char, ++ .init = kgdb_init_io, ++ .late_init = kgdb8250_late_init, ++}; ++ ++/** ++ * kgdb8250_add_port - Define a serial port for use with KGDB ++ * @i: The index of the port being added ++ * @serial_req: The &struct uart_port describing the port ++ * ++ * On platforms where we must register the serial device ++ * dynamically, this is the best option if a platform also normally ++ * calls early_serial_setup(). ++ */ ++void __init kgdb8250_add_port(int i, struct uart_port *serial_req) ++{ ++ /* Make sure we've got the built-in data before we override. */ ++ kgdb8250_copy_rs_table(); ++ ++ /* Copy the whole thing over. */ ++ if (current_port != &kgdb8250_ports[i]) ++ memcpy(&kgdb8250_ports[i], serial_req, sizeof(struct uart_port)); ++} ++ ++/** ++ * kgdb8250_add_platform_port - Define a serial port for use with KGDB ++ * @i: The index of the port being added ++ * @p: The &struct plat_serial8250_port describing the port ++ * ++ * On platforms where we must register the serial device ++ * dynamically, this is the best option if a platform normally ++ * handles uart setup with an array of &struct plat_serial8250_port. ++ */ ++void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *p) ++{ ++ /* Make sure we've got the built-in data before we override. */ ++ kgdb8250_copy_rs_table(); ++ ++ kgdb8250_ports[i].iobase = p->iobase; ++ kgdb8250_ports[i].membase = p->membase; ++ kgdb8250_ports[i].irq = p->irq; ++ kgdb8250_ports[i].uartclk = p->uartclk; ++ kgdb8250_ports[i].regshift = p->regshift; ++ kgdb8250_ports[i].iotype = p->iotype; ++ kgdb8250_ports[i].flags = p->flags; ++ kgdb8250_ports[i].mapbase = p->mapbase; ++} ++ ++/* ++ * Syntax for this cmdline option is: ++ * kgdb8250=,
,," ++ */ ++static int __init kgdb8250_opt(char *str) ++{ ++ /* We'll fill out and use the first slot. */ ++ current_port = &kgdb8250_ports[0]; ++ ++ if (!strncmp(str, "io", 2)) { ++ current_port->iotype = UPIO_PORT; ++ str += 2; ++ } else if (!strncmp(str, "mmap", 4)) { ++ current_port->iotype = UPIO_MEM; ++ current_port->flags |= UPF_IOREMAP; ++ str += 4; ++ } else if (!strncmp(str, "mmio", 4)) { ++ current_port->iotype = UPIO_MEM; ++ current_port->flags &= ~UPF_IOREMAP; ++ str += 4; ++ } else ++ goto errout; ++ ++ if (*str != ',') ++ goto errout; ++ str++; ++ ++ if (current_port->iotype == UPIO_PORT) ++ current_port->iobase = simple_strtoul(str, &str, 16); ++ else { ++ if (current_port->flags & UPF_IOREMAP) ++ current_port->mapbase = ++ (unsigned long) simple_strtoul(str, &str, 16); ++ else ++ current_port->membase = ++ (void *) simple_strtoul(str, &str, 16); ++ } ++ ++ if (*str != ',') ++ goto errout; ++ str++; ++ ++ kgdb8250_baud = simple_strtoul(str, &str, 10); ++ if (!kgdb8250_baud) ++ goto errout; ++ ++ if (*str != ',') ++ goto errout; ++ str++; ++ ++ current_port->irq = simple_strtoul(str, &str, 10); ++ ++#ifdef CONFIG_KGDB_SIMPLE_SERIAL ++ should_copy_rs_table = 0; ++#endif ++ ++ return 0; ++ ++ errout: ++ printk(KERN_ERR "Invalid syntax for option kgdb8250=\n"); ++ return 1; ++} ++ ++#ifdef CONFIG_KGDB_8250_MODULE ++static void cleanup_kgdb8250(void) ++{ ++ kgdb_unregister_io_module(&local_kgdb_io_ops); ++ ++ /* Clean up the irq and memory */ ++ free_irq(current_port->irq, current_port); ++ ++ if (kgdb8250_needs_request_mem_region) ++ release_mem_region(current_port->mapbase, ++ 8 << current_port->regshift); ++ /* Hook up the serial port back to what it was previously ++ * hooked up to. ++ */ ++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE) ++ /* Give the port back to the 8250 driver. */ ++ serial8250_register_port(current_port); ++#endif ++} ++ ++module_init(kgdb_init_io); ++module_exit(cleanup_kgdb8250); ++#else /* ! CONFIG_KGDB_8250_MODULE */ ++early_param("kgdb8250", kgdb8250_opt); ++#endif /* ! 
CONFIG_KGDB_8250_MODULE */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/Kconfig linux-2.6.18-53.1.14.kgdb/drivers/serial/Kconfig +--- linux-2.6.18-53.1.14/drivers/serial/Kconfig 2008-03-06 05:54:47.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/Kconfig 2008-06-10 15:37:43.000000000 +0400 +@@ -107,7 +107,7 @@ config SERIAL_8250_CS + + config SERIAL_8250_NR_UARTS + int "Maximum number of 8250/16550 serial ports" +- depends on SERIAL_8250 ++ depends on SERIAL_8250 || KGDB_8250 + default "4" + help + Set this to the number of serial ports you want the driver +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/Makefile linux-2.6.18-53.1.14.kgdb/drivers/serial/Makefile +--- linux-2.6.18-53.1.14/drivers/serial/Makefile 2008-03-06 05:54:47.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/Makefile 2008-06-10 15:38:14.000000000 +0400 +@@ -47,6 +47,7 @@ obj-$(CONFIG_SERIAL_IMX) += imx.o + obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o + obj-$(CONFIG_SERIAL_ICOM) += icom.o + obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o ++obj-$(CONFIG_KGDB_MPSC) += mpsc_kgdb.o + obj-$(CONFIG_SERIAL_MPSC) += mpsc.o + obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o + obj-$(CONFIG_SERIAL_JSM) += jsm/ +@@ -57,3 +58,4 @@ obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_se + obj-$(CONFIG_SERIAL_AT91) += at91_serial.o + obj-$(CONFIG_SERIAL_NETX) += netx-serial.o + obj-$(CONFIG_SERIAL_OF_PLATFORM) += of_serial.o ++obj-$(CONFIG_KGDB_8250) += 8250_kgdb.o +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/amba-pl011.c linux-2.6.18-53.1.14.kgdb/drivers/serial/amba-pl011.c +--- linux-2.6.18-53.1.14/drivers/serial/amba-pl011.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/amba-pl011.c 2008-06-10 15:38:56.000000000 +0400 +@@ -340,7 +340,7 @@ static int pl011_startup(struct uart_por + /* + * Allocate the IRQ + */ +- retval = request_irq(uap->port.irq, pl011_int, 0, "uart-pl011", uap); ++ retval = request_irq(uap->port.irq, pl011_int, SA_SHIRQ, "uart-pl011", uap); + if (retval) + goto clk_dis; + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/Makefile linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/Makefile +--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/Makefile 2008-06-10 15:38:14.000000000 +0400 +@@ -7,5 +7,6 @@ obj-$(CONFIG_SERIAL_CPM) += cpm_uart.o + # Select the correct platform objects. + cpm_uart-objs-$(CONFIG_CPM2) += cpm_uart_cpm2.o + cpm_uart-objs-$(CONFIG_8xx) += cpm_uart_cpm1.o ++cpm_uart-objs-$(CONFIG_KGDB_CPM_UART) += cpm_uart_kgdb.o + + cpm_uart-objs := cpm_uart_core.o $(cpm_uart-objs-y) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart.h linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart.h +--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart.h 2008-03-06 05:54:12.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart.h 2008-06-10 15:38:14.000000000 +0400 +@@ -50,6 +50,39 @@ + + #define SCC_WAIT_CLOSING 100 + ++#ifdef CONFIG_KGDB_CPM_UART ++ ++/* Speed of the debug UART. 
*/ ++#if defined(CONFIG_KGDB_9600BAUD) ++#define KGDB_BAUD B9600 ++#elif defined(CONFIG_KGDB_19200BAUD) ++#define KGDB_BAUD B19200 ++#elif defined(CONFIG_KGDB_38400BAUD) ++#define KGDB_BAUD B38400 ++#elif defined(CONFIG_KGDB_57600BAUD) ++#define KGDB_BAUD B57600 ++#else ++#define KGDB_BAUD B115200 /* Start with this if not given */ ++#endif ++ ++#ifdef CONFIG_KGDB_CPM_UART_SCC1 ++#define KGDB_PINFO_INDEX UART_SCC1 ++#elif CONFIG_KGDB_CPM_UART_SCC2 ++#define KGDB_PINFO_INDEX UART_SCC2 ++#elif CONFIG_KGDB_CPM_UART_SCC3 ++#define KGDB_PINFO_INDEX UART_SCC3 ++#elif CONFIG_KGDB_CPM_UART_SCC4 ++#define KGDB_PINFO_INDEX UART_SCC4 ++#elif CONFIG_KGDB_CPM_UART_SMC1 ++#define KGDB_PINFO_INDEX UART_SMC1 ++#elif CONFIG_KGDB_CPM_UART_SMC2 ++#define KGDB_PINFO_INDEX UART_SMC2 ++#else ++#error The S(M)CC for kgdb console is undefined ++#endif ++ ++#endif /* CONFIG_KGDB_CPM_UART */ ++ + struct uart_cpm_port { + struct uart_port port; + u16 rx_nrfifos; +@@ -86,6 +119,9 @@ extern int cpm_uart_port_map[UART_NR]; + extern int cpm_uart_nr; + extern struct uart_cpm_port cpm_uart_ports[UART_NR]; + ++void cpm_uart_early_write(int index, const char *s, u_int count); ++int cpm_uart_early_setup(int index,int early); ++ + /* these are located in their respective files */ + void cpm_line_cr_cmd(int line, int cmd); + int cpm_uart_init_portdesc(void); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_core.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c +--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_core.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c 2008-06-10 15:38:14.000000000 +0400 +@@ -1070,22 +1070,17 @@ int cpm_uart_drv_get_platform_data(struc + return 0; + } + +-#ifdef CONFIG_SERIAL_CPM_CONSOLE +-/* +- * Print a string to the serial port trying not to disturb +- * any possible real use of the port... +- * +- * Note that this is called with interrupts already disabled +- */ +-static void cpm_uart_console_write(struct console *co, const char *s, ++void cpm_uart_early_write(int index, const char *s, + u_int count) + { +- struct uart_cpm_port *pinfo = +- &cpm_uart_ports[cpm_uart_port_map[co->index]]; ++ struct uart_cpm_port *pinfo; + unsigned int i; + volatile cbd_t *bdp, *bdbase; + volatile unsigned char *cp; + ++ BUG_ON(index>UART_NR); ++ pinfo = &cpm_uart_ports[index]; ++ + /* Get the address of the host memory buffer. 
+ */ + bdp = pinfo->tx_cur; +@@ -1149,16 +1144,11 @@ static void cpm_uart_console_write(struc + pinfo->tx_cur = (volatile cbd_t *) bdp; + } + +- +-static int __init cpm_uart_console_setup(struct console *co, char *options) ++int cpm_uart_early_setup(int index, int early) + { ++ int ret; + struct uart_port *port; + struct uart_cpm_port *pinfo; +- int baud = 38400; +- int bits = 8; +- int parity = 'n'; +- int flow = 'n'; +- int ret; + + struct fs_uart_platform_info *pdata; + struct platform_device* pdev = early_uart_get_pdev(co->index); +@@ -1169,8 +1159,9 @@ static int __init cpm_uart_console_setup + cpm_uart_init_portdesc(); + } + ++ BUG_ON(index>UART_NR); + port = +- (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]]; ++ (struct uart_port *)&cpm_uart_ports[index]; + pinfo = (struct uart_cpm_port *)port; + if (!pdev) { + if (pinfo->set_lineif) +@@ -1184,19 +1175,6 @@ static int __init cpm_uart_console_setup + cpm_uart_drv_get_platform_data(pdev, 1); + } + +- pinfo->flags |= FLAG_CONSOLE; +- +- if (options) { +- uart_parse_options(options, &baud, &parity, &bits, &flow); +- } else { +- bd_t *bd = (bd_t *) __res; +- +- if (bd->bi_baudrate) +- baud = bd->bi_baudrate; +- else +- baud = 9600; +- } +- + if (IS_SMC(pinfo)) { + pinfo->smcp->smc_smcm &= ~(SMCM_RX | SMCM_TX); + pinfo->smcp->smc_smcmr &= ~(SMCMR_REN | SMCMR_TEN); +@@ -1204,8 +1182,7 @@ static int __init cpm_uart_console_setup + pinfo->sccp->scc_sccm &= ~(UART_SCCM_TX | UART_SCCM_RX); + pinfo->sccp->scc_gsmrl &= ~(SCC_GSMRL_ENR | SCC_GSMRL_ENT); + } +- +- ret = cpm_uart_allocbuf(pinfo, 1); ++ ret = cpm_uart_allocbuf(pinfo, early); + + if (ret) + return ret; +@@ -1217,6 +1194,56 @@ static int __init cpm_uart_console_setup + else + cpm_uart_init_scc(pinfo); + ++ return 0; ++} ++ ++#ifdef CONFIG_SERIAL_CPM_CONSOLE ++/* ++ * Print a string to the serial port trying not to disturb ++ * any possible real use of the port... ++ * ++ * Note that this is called with interrupts already disabled ++ */ ++ ++static void cpm_uart_console_write(struct console *co, const char *s, ++ u_int count) ++{ ++ cpm_uart_early_write(cpm_uart_port_map[co->index],s,count); ++} ++ ++/* ++ * Setup console. Be careful is called early ! 
++ */ ++static int __init cpm_uart_console_setup(struct console *co, char *options) ++{ ++ struct uart_port *port; ++ struct uart_cpm_port *pinfo; ++ int baud = 115200; ++ int bits = 8; ++ int parity = 'n'; ++ int flow = 'n'; ++ int ret; ++ ++ port = ++ (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]]; ++ pinfo = (struct uart_cpm_port *)port; ++ ++ pinfo->flags |= FLAG_CONSOLE; ++ ++ if (options) { ++ uart_parse_options(options, &baud, &parity, &bits, &flow); ++ } else { ++ bd_t *bd = (bd_t *) __res; ++ ++ if (bd->bi_baudrate) ++ baud = bd->bi_baudrate; ++ else ++ baud = 9600; ++ } ++ ++ ret = cpm_uart_early_setup(cpm_uart_port_map[co->index], 1); ++ if(ret) ++ return ret; + uart_set_options(port, co, baud, parity, bits, flow); + + return 0; +@@ -1364,6 +1391,12 @@ static int cpm_uart_init(void) { + + for (i = 0; i < cpm_uart_nr; i++) { + int con = cpm_uart_port_map[i]; ++ ++#ifdef CONFIG_KGDB_CPM_UART ++ /* We are not interested in ports yet utilized by kgdb */ ++ if(con == KGDB_PINFO_INDEX) ++ continue; ++#endif + cpm_uart_ports[con].port.line = i; + cpm_uart_ports[con].port.flags = UPF_BOOT_AUTOCONF; + uart_add_one_port(&cpm_reg, &cpm_uart_ports[con].port); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm1.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c +--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm1.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c 2008-06-10 15:38:14.000000000 +0400 +@@ -52,6 +52,7 @@ void cpm_line_cr_cmd(int line, int cmd) + { + ushort val; + volatile cpm8xx_t *cp = cpmp; ++ unsigned *bcsr_io; + + switch (line) { + case UART_SMC1: +@@ -94,12 +95,35 @@ void scc1_lineif(struct uart_cpm_port *p + { + /* XXX SCC1: insert port configuration here */ + pinfo->brg = 1; ++ ++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS) ++ bcsr_io = ioremap(BCSR1, sizeof(unsigned long)); ++ ++ if (bcsr_io == NULL) { ++ printk(KERN_CRIT "Could not remap BCSR\n"); ++ return; ++ } ++ out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_1); ++ iounmap(bcsr_io); ++#endif + } + + void scc2_lineif(struct uart_cpm_port *pinfo) + { + /* XXX SCC2: insert port configuration here */ + pinfo->brg = 2; ++ unsigned *bcsr_io; ++ ++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS) ++ bcsr_io = ioremap(BCSR1, sizeof(unsigned long)); ++ ++ if (bcsr_io == NULL) { ++ printk(KERN_CRIT "Could not remap BCSR\n"); ++ return; ++ } ++ out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_2); ++ iounmap(bcsr_io); ++#endif + } + + void scc3_lineif(struct uart_cpm_port *pinfo) +@@ -188,6 +212,10 @@ int cpm_uart_init_portdesc(void) + { + pr_debug("CPM uart[-]:init portdesc\n"); + ++ /* Check if we have called this yet. This may happen if early kgdb ++ breakpoint is on */ ++ if(cpm_uart_nr) ++ return 0; + cpm_uart_nr = 0; + #ifdef CONFIG_SERIAL_CPM_SMC1 + cpm_uart_ports[UART_SMC1].smcp = &cpmp->cp_smc[0]; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm2.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c +--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_cpm2.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c 2008-06-10 15:38:14.000000000 +0400 +@@ -256,6 +256,10 @@ int cpm_uart_init_portdesc(void) + { + pr_debug("CPM uart[-]:init portdesc\n"); + ++ /* Check if we have called this yet. 
This may happen if early kgdb ++ breakpoint is on */ ++ if(cpm_uart_nr) ++ return 0; + cpm_uart_nr = 0; + #ifdef CONFIG_SERIAL_CPM_SMC1 + cpm_uart_ports[UART_SMC1].smcp = (smc_t *) & cpm2_immr->im_smc[0]; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_kgdb.c linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c +--- linux-2.6.18-53.1.14/drivers/serial/cpm_uart/cpm_uart_kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c 2008-06-10 15:38:14.000000000 +0400 +@@ -0,0 +1,195 @@ ++/* ++ * drivers/serial/cpm_uart/cpm_uart_kgdb.c ++ * ++ * CPM UART interface for kgdb. ++ * ++ * Author: Vitaly Bordug ++ * ++ * Used some bits from drivers/serial/kgdb_8250.c as a template ++ * ++ * 2005 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include /* For BASE_BAUD and SERIAL_PORT_DFNS */ ++ ++#include "cpm_uart.h" ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++ ++static char kgdb_buf[GDB_BUF_SIZE], *kgdbp; ++static int kgdb_chars; ++ ++/* Forward declarations. */ ++ ++/* ++ * Receive character from the serial port. This only works well ++ * before the port is initialize for real use. ++ */ ++static int kgdb_wait_key(char *obuf) ++{ ++ struct uart_cpm_port *pinfo; ++ ++ u_char c, *cp; ++ volatile cbd_t *bdp; ++ int i; ++ ++ pinfo = &cpm_uart_ports[KGDB_PINFO_INDEX]; ++ ++ /* Get the address of the host memory buffer. ++ */ ++ bdp = pinfo->rx_cur; ++ while (bdp->cbd_sc & BD_SC_EMPTY); ++ ++ /* If the buffer address is in the CPM DPRAM, don't ++ * convert it. ++ */ ++ cp = cpm2cpu_addr(bdp->cbd_bufaddr); ++ ++ if (obuf) { ++ i = c = bdp->cbd_datlen; ++ while (i-- > 0) ++ { ++ *obuf++ = *cp++; ++ } ++ } else { ++ c = *cp; ++ } ++ bdp->cbd_sc |= BD_SC_EMPTY; ++ ++ if (bdp->cbd_sc & BD_SC_WRAP) { ++ bdp = pinfo->rx_bd_base; ++ } else { ++ bdp++; ++ } ++ pinfo->rx_cur = (cbd_t *)bdp; ++ ++ return((int)c); ++} ++ ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void ++kgdb_put_debug_char(int chr) ++{ ++ static char ch[2]; ++ ch[0]=(char)chr; ++ cpm_uart_early_write(KGDB_PINFO_INDEX, ch, 1); ++} ++ ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ */ ++static int ++kgdb_get_debug_char(void) ++{ ++ if (kgdb_chars<=0) { ++ kgdb_chars = kgdb_wait_key(kgdb_buf); ++ kgdbp = kgdb_buf; ++ } ++ kgdb_chars--; ++ ++ return (*kgdbp++); ++} ++ ++static void termios_set_options(int index, ++ int baud, int parity, int bits, int flow) ++{ ++ struct termios termios; ++ struct uart_port *port; ++ struct uart_cpm_port *pinfo; ++ ++ BUG_ON(index>UART_NR); ++ ++ port = ++ (struct uart_port *)&cpm_uart_ports[index]; ++ pinfo = (struct uart_cpm_port *)port; ++ ++ /* ++ * Ensure that the serial console lock is initialised ++ * early. 
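
Both kgdb_get_debug_char() here and the 8250 version earlier follow the same batching pattern: pull in a whole chunk of received bytes once, then feed them to the stub one at a time. Reduced to its skeleton (hw_read_batch() is a hypothetical stand-in for kgdb_wait_key() or an IRQ-filled ring buffer):

	static char rx_buf[512];
	static char *rx_pos;
	static int rx_left;

	/* back-end primitive: blocks until data arrives, fills buf,
	 * returns the number of bytes read */
	static int hw_read_batch(char *buf, int len);

	static int debug_char_get(void)
	{
		if (rx_left <= 0) {		/* buffer drained: refill it */
			rx_left = hw_read_batch(rx_buf, sizeof(rx_buf));
			rx_pos = rx_buf;
		}
		rx_left--;
		return *rx_pos++;
	}
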
++ */ ++ spin_lock_init(&port->lock); ++ ++ memset(&termios, 0, sizeof(struct termios)); ++ ++ termios.c_cflag = CREAD | HUPCL | CLOCAL; ++ ++ termios.c_cflag |= baud; ++ ++ if (bits == 7) ++ termios.c_cflag |= CS7; ++ else ++ termios.c_cflag |= CS8; ++ ++ switch (parity) { ++ case 'o': case 'O': ++ termios.c_cflag |= PARODD; ++ /*fall through*/ ++ case 'e': case 'E': ++ termios.c_cflag |= PARENB; ++ break; ++ } ++ ++ if (flow == 'r') ++ termios.c_cflag |= CRTSCTS; ++ ++ port->ops->set_termios(port, &termios, NULL); ++} ++ ++/* ++ * Returns: ++ * 0 on success, 1 on failure. ++ */ ++static int kgdb_init(void) ++{ ++ struct uart_port *port; ++ struct uart_cpm_port *pinfo; ++ ++ int use_bootmem = 0; /* use dma by default */ ++ ++ if(!cpm_uart_nr) ++ { ++ use_bootmem = 1; ++ cpm_uart_init_portdesc(); ++ } ++ port = (struct uart_port *)&cpm_uart_ports[KGDB_PINFO_INDEX]; ++ pinfo = (struct uart_cpm_port *)port; ++ ++ if (cpm_uart_early_setup(KGDB_PINFO_INDEX, use_bootmem)) ++ return 1; ++ ++ termios_set_options(KGDB_PINFO_INDEX, KGDB_BAUD,'n',8,'n'); ++ if (IS_SMC(pinfo)) ++ pinfo->smcp->smc_smcm |= SMCM_TX; ++ else ++ pinfo->sccp->scc_sccm |= UART_SCCM_TX; ++ ++ return 0; ++} ++ ++ ++struct kgdb_io kgdb_io_ops = { ++ .read_char = kgdb_get_debug_char, ++ .write_char = kgdb_put_debug_char, ++ .init = kgdb_init, ++}; ++ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/mpsc.c linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc.c +--- linux-2.6.18-53.1.14/drivers/serial/mpsc.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc.c 2008-06-10 15:38:14.000000000 +0400 +@@ -242,6 +242,11 @@ struct mpsc_port_info *mpsc_device_remov + #define MPSC_RCRR 0x0004 + #define MPSC_TCRR 0x0008 + ++/* MPSC Interrupt registers (offset from MV64x60_SDMA_INTR_OFFSET) */ ++#define MPSC_INTR_CAUSE 0x0004 ++#define MPSC_INTR_MASK 0x0084 ++#define MPSC_INTR_CAUSE_RCC (1<<6) ++ + /* Serial DMA Controller Interface Registers */ + #define SDMA_SDC 0x0000 + #define SDMA_SDCM 0x0008 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/mpsc_kgdb.c linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc_kgdb.c +--- linux-2.6.18-53.1.14/drivers/serial/mpsc_kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/mpsc_kgdb.c 2008-06-10 15:38:14.000000000 +0400 +@@ -0,0 +1,299 @@ ++/* ++ * drivers/serial/mpsc_kgdb.c ++ * ++ * KGDB driver for the Marvell MultiProtocol Serial Controller (MPCS) ++ * ++ * Based on the polled boot loader driver by Ajit Prem (ajit.prem@motorola.com) ++ * ++ * Author: Randy Vinson ++ * ++ * 2005 (c) MontaVista Software, Inc. ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "mpsc.h" ++ ++/* Speed of the UART. */ ++static int kgdbmpsc_baud = CONFIG_KGDB_BAUDRATE; ++ ++/* Index of the UART, matches ttyMX naming. 
*/ ++static int kgdbmpsc_ttyMM = CONFIG_KGDB_PORT_NUM; ++ ++#define MPSC_INTR_REG_SELECT(x) ((x) + (8 * kgdbmpsc_ttyMM)) ++ ++static int kgdbmpsc_init(void); ++ ++static struct platform_device mpsc_dev, shared_dev; ++ ++static void __iomem *mpsc_base; ++static void __iomem *brg_base; ++static void __iomem *routing_base; ++static void __iomem *sdma_base; ++ ++static unsigned int mpsc_irq; ++ ++static void kgdb_write_debug_char(int c) ++{ ++ u32 data; ++ ++ data = readl(mpsc_base + MPSC_MPCR); ++ writeb(c, mpsc_base + MPSC_CHR_1); ++ mb(); ++ data = readl(mpsc_base + MPSC_CHR_2); ++ data |= MPSC_CHR_2_TTCS; ++ writel(data, mpsc_base + MPSC_CHR_2); ++ mb(); ++ ++ while (readl(mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_TTCS) ; ++} ++ ++static int kgdb_get_debug_char(void) ++{ ++ unsigned char c; ++ ++ while (!(readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) & ++ MPSC_INTR_CAUSE_RCC)) ; ++ ++ c = readb(mpsc_base + MPSC_CHR_10 + (1 << 1)); ++ mb(); ++ writeb(c, mpsc_base + MPSC_CHR_10 + (1 << 1)); ++ mb(); ++ writel(~MPSC_INTR_CAUSE_RCC, sdma_base + ++ MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)); ++ return (c); ++} ++ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * All that we need to do is verify that the interrupt happened on the ++ * line we're in charge of. If this is true, schedule a breakpoint and ++ * return. ++ */ ++static irqreturn_t ++kgdbmpsc_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ if (irq != mpsc_irq) ++ return IRQ_NONE; ++ /* ++ * If there is some other CPU in KGDB then this is a ++ * spurious interrupt. so return without even checking a byte ++ */ ++ if (atomic_read(&debugger_active)) ++ return IRQ_NONE; ++ ++ if (readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) & ++ MPSC_INTR_CAUSE_RCC) ++ breakpoint(); ++ ++ return IRQ_HANDLED; ++} ++ ++static int __init kgdbmpsc_init(void) ++{ ++ struct mpsc_pdata *pdata; ++ u32 cdv; ++ ++ if (!brg_base || !mpsc_base || !routing_base || !sdma_base) ++ return -1; ++ ++ /* Set MPSC Routing to enable both ports */ ++ writel(0x0, routing_base + MPSC_MRR); ++ ++ /* MPSC 0/1 Rx & Tx get clocks BRG0/1 */ ++ writel(0x00000100, routing_base + MPSC_RCRR); ++ writel(0x00000100, routing_base + MPSC_TCRR); ++ ++ /* Disable all MPSC interrupts and clear any pending interrupts */ ++ writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK)); ++ writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)); ++ ++ pdata = (struct mpsc_pdata *)mpsc_dev.dev.platform_data; ++ ++ /* cdv = (clock/(2*16*baud rate)) for 16X mode. 
*/ ++ cdv = ((pdata->brg_clk_freq / (32 * kgdbmpsc_baud)) - 1); ++ writel((pdata->brg_clk_src << 18) | (1 << 16) | cdv, ++ brg_base + BRG_BCR); ++ ++ /* Put MPSC into UART mode, no null modem, 16x clock mode */ ++ writel(0x000004c4, mpsc_base + MPSC_MMCRL); ++ writel(0x04400400, mpsc_base + MPSC_MMCRH); ++ ++ writel(0, mpsc_base + MPSC_CHR_1); ++ writel(0, mpsc_base + MPSC_CHR_9); ++ writel(0, mpsc_base + MPSC_CHR_10); ++ writel(4, mpsc_base + MPSC_CHR_3); ++ writel(0x20000000, mpsc_base + MPSC_CHR_4); ++ writel(0x9000, mpsc_base + MPSC_CHR_5); ++ writel(0, mpsc_base + MPSC_CHR_6); ++ writel(0, mpsc_base + MPSC_CHR_7); ++ writel(0, mpsc_base + MPSC_CHR_8); ++ ++ /* 8 data bits, 1 stop bit */ ++ writel((3 << 12), mpsc_base + MPSC_MPCR); ++ ++ /* Enter "hunt" mode */ ++ writel((1 << 31), mpsc_base + MPSC_CHR_2); ++ ++ udelay(100); ++ return 0; ++} ++ ++static void __iomem *__init ++kgdbmpsc_map_resource(struct platform_device *pd, int type, int num) ++{ ++ void __iomem *base = NULL; ++ struct resource *r; ++ ++ if ((r = platform_get_resource(pd, IORESOURCE_MEM, num))) ++ base = ioremap(r->start, r->end - r->start + 1); ++ return base; ++} ++ ++static void __iomem *__init ++kgdbmpsc_unmap_resource(struct platform_device *pd, int type, int num, ++ void __iomem * base) ++{ ++ if (base) ++ iounmap(base); ++ return NULL; ++} ++ ++static void __init ++kgdbmpsc_reserve_resource(struct platform_device *pd, int type, int num) ++{ ++ struct resource *r; ++ ++ if ((r = platform_get_resource(pd, IORESOURCE_MEM, num))) ++ request_mem_region(r->start, r->end - r->start + 1, "kgdb"); ++} ++ ++static int __init kgdbmpsc_local_init(void) ++{ ++ if (!mpsc_dev.num_resources || !shared_dev.num_resources) ++ return 1; /* failure */ ++ ++ mpsc_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BASE_ORDER); ++ brg_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BRG_BASE_ORDER); ++ ++ /* get the platform data for the shared registers and get them mapped */ ++ routing_base = kgdbmpsc_map_resource(&shared_dev, ++ IORESOURCE_MEM, ++ MPSC_ROUTING_BASE_ORDER); ++ sdma_base = ++ kgdbmpsc_map_resource(&shared_dev, IORESOURCE_MEM, ++ MPSC_SDMA_INTR_BASE_ORDER); ++ ++ mpsc_irq = platform_get_irq(&mpsc_dev, 1); ++ ++ if (mpsc_base && brg_base && routing_base && sdma_base) ++ return 0; /* success */ ++ ++ return 1; /* failure */ ++} ++ ++static void __init kgdbmpsc_local_exit(void) ++{ ++ if (sdma_base) ++ sdma_base = kgdbmpsc_unmap_resource(&shared_dev, IORESOURCE_MEM, ++ MPSC_SDMA_INTR_BASE_ORDER, ++ sdma_base); ++ if (routing_base) ++ routing_base = kgdbmpsc_unmap_resource(&shared_dev, ++ IORESOURCE_MEM, ++ MPSC_ROUTING_BASE_ORDER, ++ routing_base); ++ if (brg_base) ++ brg_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BRG_BASE_ORDER, ++ brg_base); ++ if (mpsc_base) ++ mpsc_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BASE_ORDER, mpsc_base); ++} ++ ++static void __init kgdbmpsc_update_pdata(struct platform_device *pdev) ++{ ++ ++ snprintf(pdev->dev.bus_id, BUS_ID_SIZE, "%s%u", pdev->name, pdev->id); ++} ++ ++static int __init kgdbmpsc_pdev_init(void) ++{ ++ struct platform_device *pdev; ++ ++ /* get the platform data for the specified port. */ ++ pdev = mv64x60_early_get_pdev_data(MPSC_CTLR_NAME, kgdbmpsc_ttyMM, 1); ++ if (pdev) { ++ memcpy(&mpsc_dev, pdev, sizeof(struct platform_device)); ++ if (platform_notify) { ++ kgdbmpsc_update_pdata(&mpsc_dev); ++ platform_notify(&mpsc_dev.dev); ++ } ++ ++ /* get the platform data for the shared registers. 
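
kgdbmpsc_map_resource() above is the stock "look up a MEM resource, then ioremap it" idiom; stripped of the wrapper it is just (a sketch using the standard platform-device API):

	#include <linux/platform_device.h>
	#include <linux/ioport.h>
	#include <asm/io.h>

	static void __iomem *map_mem_resource(struct platform_device *pd, int num)
	{
		struct resource *r = platform_get_resource(pd, IORESOURCE_MEM, num);

		/* a resource's end address is inclusive, hence the +1 */
		return r ? ioremap(r->start, r->end - r->start + 1) : NULL;
	}
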
*/ ++ pdev = mv64x60_early_get_pdev_data(MPSC_SHARED_NAME, 0, 0); ++ if (pdev) { ++ memcpy(&shared_dev, pdev, ++ sizeof(struct platform_device)); ++ if (platform_notify) { ++ kgdbmpsc_update_pdata(&shared_dev); ++ platform_notify(&shared_dev.dev); ++ } ++ } ++ } ++ return 0; ++} ++ ++postcore_initcall(kgdbmpsc_pdev_init); ++ ++static int __init kgdbmpsc_init_io(void) ++{ ++ ++ kgdbmpsc_pdev_init(); ++ ++ if (kgdbmpsc_local_init()) { ++ kgdbmpsc_local_exit(); ++ return -1; ++ } ++ ++ if (kgdbmpsc_init() == -1) ++ return -1; ++ return 0; ++} ++ ++static void __init kgdbmpsc_hookup_irq(void) ++{ ++ unsigned int msk; ++ if (!request_irq(mpsc_irq, kgdbmpsc_interrupt, 0, "kgdb mpsc", NULL)) { ++ /* Enable interrupt */ ++ msk = readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK)); ++ msk |= MPSC_INTR_CAUSE_RCC; ++ writel(msk, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK)); ++ ++ kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BASE_ORDER); ++ kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BRG_BASE_ORDER); ++ } ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .read_char = kgdb_get_debug_char, ++ .write_char = kgdb_write_debug_char, ++ .init = kgdbmpsc_init_io, ++ .late_init = kgdbmpsc_hookup_irq, ++}; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/pxa.c linux-2.6.18-53.1.14.kgdb/drivers/serial/pxa.c +--- linux-2.6.18-53.1.14/drivers/serial/pxa.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/pxa.c 2008-06-10 15:38:56.000000000 +0400 +@@ -42,6 +42,9 @@ + #include + #include + #include ++#ifdef CONFIG_KGDB_CONSOLE ++#include ++#endif + + #include + #include +@@ -692,6 +695,8 @@ serial_pxa_console_init(void) + console_initcall(serial_pxa_console_init); + + #define PXA_CONSOLE &serial_pxa_console ++#elif defined(CONFIG_KGDB_CONSOLE) ++#define PXA_CONSOLE &kgdbcons + #else + #define PXA_CONSOLE NULL + #endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/serial_core.c linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_core.c +--- linux-2.6.18-53.1.14/drivers/serial/serial_core.c 2008-03-06 05:54:07.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_core.c 2008-06-10 15:37:43.000000000 +0400 +@@ -33,6 +33,7 @@ + #include /* for serial_state and serial_icounter_struct */ + #include + #include ++#include + + #include + #include +@@ -65,6 +66,12 @@ static struct lock_class_key port_lock_k + #define uart_console(port) (0) + #endif + ++#ifdef CONFIG_KGDB_CONSOLE ++#define uart_kgdb(port) (port->cons && !strcmp(port->cons->name, "kgdb")) ++#else ++#define uart_kgdb(port) (0) ++#endif ++ + static void uart_change_speed(struct uart_state *state, struct termios *old_termios); + static void uart_wait_until_sent(struct tty_struct *tty, int timeout); + static void uart_change_pm(struct uart_state *state, int pm_state); +@@ -1673,6 +1680,9 @@ static int uart_line_info(char *buf, str + port->iotype == UPIO_MEM ? 
port->mapbase : + (unsigned long) port->iobase, + port->irq); ++ if (port->iotype == UPIO_MEM) ++ ret += sprintf(buf+ret, " membase 0x%08lX", ++ (unsigned long) port->membase); + + if (port->type == PORT_UNKNOWN) { + strcat(buf, "\n"); +@@ -2043,7 +2053,8 @@ uart_report_port(struct uart_driver *drv + case UPIO_AU: + case UPIO_TSI: + snprintf(address, sizeof(address), +- "MMIO 0x%lx", port->mapbase); ++ "MMIO map 0x%lx mem 0x%lx", port->mapbase, ++ (unsigned long) port->membase); + break; + default: + strlcpy(address, "*unknown*", sizeof(address)); +@@ -2095,9 +2106,9 @@ uart_configure_port(struct uart_driver * + + /* + * Power down all ports by default, except the +- * console if we have one. ++ * console (real or kgdb) if we have one. + */ +- if (!uart_console(port)) ++ if (!uart_console(port) && !uart_kgdb(port)) + uart_change_pm(state, 3); + } + } +@@ -2289,6 +2300,12 @@ int uart_add_one_port(struct uart_driver + */ + port->flags &= ~UPF_DEAD; + ++#if defined(CONFIG_KGDB_8250) ++ /* Add any 8250-like ports we find later. */ ++ if (port->type <= PORT_MAX_8250) ++ kgdb8250_add_port(port->line, port); ++#endif ++ + out: + mutex_unlock(&state->mutex); + mutex_unlock(&port_mutex); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/serial_txx9.c linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_txx9.c +--- linux-2.6.18-53.1.14/drivers/serial/serial_txx9.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/serial_txx9.c 2008-06-10 15:38:24.000000000 +0400 +@@ -1164,6 +1164,96 @@ static struct pci_driver serial_txx9_pci + MODULE_DEVICE_TABLE(pci, serial_txx9_pci_tbl); + #endif /* ENABLE_SERIAL_TXX9_PCI */ + ++/******************************************************************************/ ++/* BEG: KDBG Routines */ ++/******************************************************************************/ ++ ++#ifdef CONFIG_KGDB ++int kgdb_init_count = 0; ++ ++void txx9_sio_kgdb_hook(unsigned int port, unsigned int baud_rate) ++{ ++ static struct resource kgdb_resource; ++ int ret; ++ struct uart_txx9_port *up = &serial_txx9_ports[port]; ++ ++ /* prevent initialization by driver */ ++ kgdb_resource.name = "serial_txx9(debug)"; ++ kgdb_resource.start = (unsigned long)up->port.membase; ++ kgdb_resource.end = (unsigned long)(up->port.membase + 36 - 1); ++ kgdb_resource.flags = IORESOURCE_MEM | IORESOURCE_BUSY; ++ ++ ret = request_resource(&iomem_resource, &kgdb_resource); ++ if(ret == -EBUSY) ++ printk(" serial_txx9(debug): request_resource failed\n"); ++ ++ return; ++} ++void ++txx9_sio_kdbg_init( unsigned int port_number ) ++{ ++ if (port_number == 1) { ++ txx9_sio_kgdb_hook(port_number, 38400); ++ } else { ++ printk("Bad Port Number [%u] != [1]\n",port_number); ++ } ++ return; ++} ++ ++u8 ++txx9_sio_kdbg_rd( void ) ++{ ++ unsigned int status,ch; ++ struct uart_txx9_port *up = &serial_txx9_ports[1]; ++ ++ if (kgdb_init_count == 0) { ++ txx9_sio_kdbg_init(1); ++ kgdb_init_count = 1; ++ } ++ ++ while (1) { ++ status = sio_in(up, TXX9_SIDISR); ++ if ( status & 0x1f ) { ++ ch = sio_in(up, TXX9_SIRFIFO ); ++ break; ++ } ++ } ++ ++ return (ch); ++} ++ ++int ++txx9_sio_kdbg_wr( u8 ch ) ++{ ++ unsigned int status; ++ struct uart_txx9_port *up = &serial_txx9_ports[1]; ++ ++ if (kgdb_init_count == 0) { ++ txx9_sio_kdbg_init(1); ++ kgdb_init_count = 1; ++ } ++ ++ while (1) { ++ status = sio_in(up, TXX9_SICISR); ++ if (status & TXX9_SICISR_TRDY) { ++ if ( ch == '\n' ) { ++ txx9_sio_kdbg_wr( '\r' ); ++ } ++ sio_out(up, TXX9_SITFIFO, (u32)ch ); ++ ++ break; ++ 
} ++ } ++ ++ return (1); ++} ++#endif /* CONFIG_KGDB */ ++ ++ ++/******************************************************************************/ ++/* END: KDBG Routines */ ++/******************************************************************************/ ++ + static int __init serial_txx9_init(void) + { + int ret; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/drivers/serial/sh-sci.c linux-2.6.18-53.1.14.kgdb/drivers/serial/sh-sci.c +--- linux-2.6.18-53.1.14/drivers/serial/sh-sci.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/drivers/serial/sh-sci.c 2008-06-10 15:38:50.000000000 +0400 +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_CPU_FREQ + #include +@@ -67,14 +68,16 @@ + + #include "sh-sci.h" + +-#ifdef CONFIG_SH_KGDB +-#include +- +-static int kgdb_get_char(struct sci_port *port); +-static void kgdb_put_char(struct sci_port *port, char c); +-static void kgdb_handle_error(struct sci_port *port); +-static struct sci_port *kgdb_sci_port; +-#endif /* CONFIG_SH_KGDB */ ++#ifdef CONFIG_KGDB_SH_SCI ++/* Speed of the UART. */ ++static int kgdbsci_baud = CONFIG_KGDB_BAUDRATE ++ ++/* Index of the UART, matches ttySCX naming. */ ++static int kgdbsci_ttySC = CONFIG_KGDB_PORT_NUM; ++ ++/* Make life easier on us. */ ++#define KGDBPORT sci_ports[kgdbsci_ttySC] ++#endif /* CONFIG_KGDB_SH_SCI */ + + #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE + static struct sci_port *serial_console_port = 0; +@@ -87,20 +90,17 @@ static void sci_start_rx(struct uart_por + static void sci_stop_rx(struct uart_port *port); + static int sci_request_irq(struct sci_port *port); + static void sci_free_irq(struct sci_port *port); ++static void sci_set_termios(struct uart_port *port, struct termios *termios, ++ struct termios *old); ++static int kgdbsci_init(void); + + static struct sci_port sci_ports[]; + static struct uart_driver sci_uart_driver; + + #define SCI_NPORTS sci_uart_driver.nr + +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) +- +-static void handle_error(struct uart_port *port) +-{ /* Clear error flags */ +- sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port)); +-} +- +-static int get_char(struct uart_port *port) ++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB_SH_SCI) ++static int get_char_for_gdb(struct uart_port *port) + { + unsigned long flags; + unsigned short status; +@@ -110,7 +110,8 @@ static int get_char(struct uart_port *po + do { + status = sci_in(port, SCxSR); + if (status & SCxSR_ERRORS(port)) { +- handle_error(port); ++ /* Clear error flags. */ ++ sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port)); + continue; + } + } while (!(status & SCxSR_RDxF(port))); +@@ -121,21 +122,7 @@ static int get_char(struct uart_port *po + + return c; + } +- +-/* Taken from sh-stub.c of GDB 4.18 */ +-static const char hexchars[] = "0123456789abcdef"; +- +-static __inline__ char highhex(int x) +-{ +- return hexchars[(x >> 4) & 0xf]; +-} +- +-static __inline__ char lowhex(int x) +-{ +- return hexchars[x & 0xf]; +-} +- +-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB_SH_SCI */ + + /* + * Send the packet in buffer. The host gets one chance to read it. 
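
For reference, the framing put_string() implements in the hunk below is the GDB remote-protocol shape: '$', a hex-encoded payload, '#', then a two-digit checksum, resent until the host acknowledges with '+'. A compact sketch using the driver's polled primitives (and, like the code below, summing the ASCII hex digits to form the checksum):

	static void put_gdb_packet(struct sci_port *port, const char *p, int count)
	{
		static const char hexchars[] = "0123456789abcdef";
		int i, checksum;

		do {
			checksum = 0;
			put_char(port, '$');
			for (i = 0; i < count; i++) {
				char h = hexchars[(p[i] >> 4) & 0xf];
				char l = hexchars[p[i] & 0xf];

				put_char(port, h);
				put_char(port, l);
				checksum += h + l;
			}
			put_char(port, '#');
			put_char(port, hexchars[(checksum >> 4) & 0xf]);
			put_char(port, hexchars[checksum & 0xf]);
		} while (get_char(port) != '+');
	}
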
+@@ -167,21 +154,14 @@ static void put_string(struct sci_port * + const unsigned char *p = buffer; + int i; + +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) ++#ifdef CONFIG_SH_STANDARD_BIOS + int checksum; +- int usegdb=0; ++ const char hexchars[] = "0123456789abcdef"; + +-#ifdef CONFIG_SH_STANDARD_BIOS + /* This call only does a trap the first time it is + * called, and so is safe to do here unconditionally + */ +- usegdb |= sh_bios_in_gdb_mode(); +-#endif +-#ifdef CONFIG_SH_KGDB +- usegdb |= (kgdb_in_gdb_mode && (port == kgdb_sci_port)); +-#endif +- +- if (usegdb) { ++ if (sh_bios_in_gdb_mode()) { + /* $#. */ + do { + unsigned char c; +@@ -193,18 +173,18 @@ static void put_string(struct sci_port * + int h, l; + + c = *p++; +- h = highhex(c); +- l = lowhex(c); ++ h = hexchars[c >> 4]; ++ l = hexchars[c % 16]; + put_char(port, h); + put_char(port, l); + checksum += h + l; + } + put_char(port, '#'); +- put_char(port, highhex(checksum)); +- put_char(port, lowhex(checksum)); ++ put_char(port, hexchars[checksum >> 4]); ++ put_char(port, hexchars[checksum % 16]); + } while (get_char(port) != '+'); + } else +-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS */ + for (i=0; i SCI_NPORTS + '0') ++ goto errout; ++ kgdbsci_ttySC = *str - '0'; ++ str++; ++ if (*str != ',') ++ goto errout; ++ str++; ++ kgdbsci_baud = simple_strtoul(str, &str, 10); ++ if (kgdbsci_baud != 9600 && kgdbsci_baud != 19200 && ++ kgdbsci_baud != 38400 && kgdbsci_baud != 57600 && ++ kgdbsci_baud != 115200) ++ goto errout; ++ ++ return 0; ++ ++errout: ++ printk(KERN_ERR "Invalid syntax for option kgdbsci=\n"); ++ return 1; ++} ++__setup("kgdbsci", kgdbsci_opt); ++#endif /* CONFIG_KGDB_SH_SCI */ + + #if defined(__H8300S__) + enum { sci_disable, sci_enable }; +@@ -555,6 +608,16 @@ static inline void sci_receive_chars(str + continue; + } + ++#ifdef CONFIG_KGDB_SH_SCI ++ /* We assume that a ^C on the port KGDB ++ * is using means that KGDB wants to ++ * interrupt the running system. ++ */ ++ if (port->line == KGDBPORT.port.line && ++ c == 3) ++ breakpoint(); ++#endif ++ + /* Store data and status */ + if (status&SCxSR_FER(port)) { + flag = TTY_FRAME; +@@ -1618,6 +1681,7 @@ static int __init sci_console_init(void) + console_initcall(sci_console_init); + #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */ + ++#if 0 + #ifdef CONFIG_SH_KGDB + /* + * FIXME: Most of this can go away.. at the moment, we rely on +@@ -1663,30 +1727,9 @@ int __init kgdb_console_setup(struct con + return uart_set_options(port, co, baud, parity, bits, flow); + } + #endif /* CONFIG_SH_KGDB */ ++#endif /* 0 */ + +-#ifdef CONFIG_SH_KGDB_CONSOLE +-static struct console kgdb_console = { +- .name = "ttySC", +- .write = kgdb_console_write, +- .setup = kgdb_console_setup, +- .flags = CON_PRINTBUFFER | CON_ENABLED, +- .index = -1, +- .data = &sci_uart_driver, +-}; +- +-/* Register the KGDB console so we get messages (d'oh!) 
*/ +-static int __init kgdb_console_init(void) +-{ +- register_console(&kgdb_console); +- return 0; +-} +- +-console_initcall(kgdb_console_init); +-#endif /* CONFIG_SH_KGDB_CONSOLE */ +- +-#if defined(CONFIG_SH_KGDB_CONSOLE) +-#define SCI_CONSOLE &kgdb_console +-#elif defined(CONFIG_SERIAL_SH_SCI_CONSOLE) ++#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE + #define SCI_CONSOLE &serial_console + #else + #define SCI_CONSOLE 0 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-arm/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-arm/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-arm/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-arm/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,92 @@ ++/* ++ * include/asm-arm/kgdb.h ++ * ++ * ARM KGDB support ++ * ++ * Author: Deepak Saxena ++ * ++ * Copyright (C) 2002 MontaVista Software Inc. ++ * ++ */ ++ ++#ifndef __ASM_KGDB_H__ ++#define __ASM_KGDB_H__ ++ ++#include ++#include ++#include ++ ++ ++/* ++ * GDB assumes that we're a user process being debugged, so ++ * it will send us an SWI command to write into memory as the ++ * debug trap. When an SWI occurs, the next instruction addr is ++ * placed into R14_svc before jumping to the vector trap. ++ * This doesn't work for kernel debugging as we are already in SVC ++ * we would loose the kernel's LR, which is a bad thing. This ++ * is bad thing. ++ * ++ * By doing this as an undefined instruction trap, we force a mode ++ * switch from SVC to UND mode, allowing us to save full kernel state. ++ * ++ * We also define a KGDB_COMPILED_BREAK which can be used to compile ++ * in breakpoints. This is important for things like sysrq-G and for ++ * the initial breakpoint from trap_init(). ++ * ++ * Note to ARM HW designers: Add real trap support like SH && PPC to ++ * make our lives much much simpler. :) ++ */ ++#define BREAK_INSTR_SIZE 4 ++#define GDB_BREAKINST 0xef9f0001 ++#define KGDB_BREAKINST 0xe7ffdefe ++#define KGDB_COMPILED_BREAK 0xe7ffdeff ++#define CACHE_FLUSH_IS_SAFE 1 ++ ++#ifndef __ASSEMBLY__ ++ ++#define BREAKPOINT() asm(".word 0xe7ffdeff") ++ ++ ++extern void kgdb_handle_bus_error(void); ++extern int kgdb_fault_expected; ++#endif /* !__ASSEMBLY__ */ ++ ++/* ++ * From Amit S. Kale: ++ * ++ * In the register packet, words 0-15 are R0 to R10, FP, IP, SP, LR, PC. But ++ * Register 16 isn't cpsr. GDB passes CPSR in word 25. There are 9 words in ++ * between which are unused. Passing only 26 words to gdb is sufficient. ++ * GDB can figure out that floating point registers are not passed. ++ * GDB_MAX_REGS should be 26. ++ */ ++#define GDB_MAX_REGS (26) ++ ++#define KGDB_MAX_NO_CPUS 1 ++#define BUFMAX 400 ++#define NUMREGBYTES (GDB_MAX_REGS << 2) ++#define NUMCRITREGBYTES (32 << 2) ++ ++#define _R0 0 ++#define _R1 1 ++#define _R2 2 ++#define _R3 3 ++#define _R4 4 ++#define _R5 5 ++#define _R6 6 ++#define _R7 7 ++#define _R8 8 ++#define _R9 9 ++#define _R10 10 ++#define _FP 11 ++#define _IP 12 ++#define _SP 13 ++#define _LR 14 ++#define _PC 15 ++#define _CPSR (GDB_MAX_REGS - 1) ++ ++/* So that we can denote the end of a frame for tracing, in the simple ++ * case. 
*/ ++#define CFI_END_FRAME(func) __CFI_END_FRAME(_PC,_SP,func) ++ ++#endif /* __ASM_KGDB_H__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-arm/system.h linux-2.6.18-53.1.14.kgdb/include/asm-arm/system.h +--- linux-2.6.18-53.1.14/include/asm-arm/system.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-arm/system.h 2008-06-10 15:38:56.000000000 +0400 +@@ -444,6 +444,47 @@ static inline unsigned long __xchg(unsig + extern void disable_hlt(void); + extern void enable_hlt(void); + ++#define __HAVE_ARCH_CMPXCHG 1 ++ ++#include ++ ++static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old, ++ unsigned long new) ++{ ++ u32 retval; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ retval = *m; ++ if (retval == old) ++ *m = new; ++ local_irq_restore(flags); /* implies memory barrier */ ++ ++ return retval; ++} ++ ++/* This function doesn't exist, so you'll get a linker error ++ if something tries to do an invalid cmpxchg(). */ ++extern void __cmpxchg_called_with_bad_pointer(void); ++ ++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, ++ unsigned long new, int size) ++{ ++ switch (size) { ++ case 4: ++ return __cmpxchg_u32(ptr, old, new); ++ } ++ __cmpxchg_called_with_bad_pointer(); ++ return old; ++} ++ ++#define cmpxchg(ptr,o,n) \ ++ ({ \ ++ __typeof__(*(ptr)) _o_ = (o); \ ++ __typeof__(*(ptr)) _n_ = (n); \ ++ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ ++ (unsigned long)_n_, sizeof(*(ptr))); \ ++ }) + #endif /* __ASSEMBLY__ */ + + #define arch_align_stack(x) (x) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-generic/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-generic/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-generic/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-generic/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,34 @@ ++/* ++ * include/asm-generic/kgdb.h ++ * ++ * This provides the assembly level information so that KGDB can provide ++ * a GDB that has been patched with enough information to know to stop ++ * trying to unwind the function. ++ * ++ * Author: Tom Rini ++ * ++ * 2005 (c) MontaVista Software, Inc. This file is licensed under the terms ++ * of the GNU General Public License version 2. This program is licensed ++ * "as is" without any warranty of any kind, whether express or implied. 
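
The ARM cmpxchg() added above is an interrupts-off emulation: it is only atomic with respect to the local CPU (consistent with asm-arm/kgdb.h pinning KGDB_MAX_NO_CPUS to 1), not across processors. A small usage sketch of the macro (the variable and function names are illustrative):

	static int owner = -1;			/* -1 means "unclaimed" */

	static int try_claim(int cpu)
	{
		/* store cpu only if owner was still -1; cmpxchg()
		 * returns the value it found there */
		return cmpxchg(&owner, -1, cpu) == -1;
	}
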
++ */ ++ ++#ifndef __ASM_GENERIC_KGDB_H__ ++#define __ASM_GENERIC_KGDB_H__ ++ ++#include ++#ifdef __ASSEMBLY__ ++#ifdef CONFIG_KGDB ++/* This MUST be put at the end of a given assembly function */ ++#define __CFI_END_FRAME(pc,sp,func) \ ++CAT3(.Lend_,func,:) \ ++ CFI_preamble(func,pc,0x1,-DATA_ALIGN_FACTOR) \ ++ CFA_define_reference(sp, 0) \ ++ CFA_undefine_reg(pc) \ ++ CFI_postamble() \ ++ FDE_preamble(func,func,CAT3(.Lend,_,func)) \ ++ FDE_postamble() ++#else ++#define __CFI_END_FRAME(pc,sp,fn) ++#endif /* CONFIG_KGDB */ ++#endif /* __ASSEMBLY__ */ ++#endif /* __ASM_GENERIC_KGDB_H__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-i386/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-i386/kdebug.h +--- linux-2.6.18-53.1.14/include/asm-i386/kdebug.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-i386/kdebug.h 2008-06-10 15:38:03.000000000 +0400 +@@ -39,6 +39,7 @@ enum die_val { + DIE_CALL, + DIE_NMI_IPI, + DIE_PAGE_FAULT, ++ DIE_PAGE_FAULT_NO_CONTEXT, + }; + + static inline int notify_die(enum die_val val, const char *str, +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-i386/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-i386/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-i386/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-i386/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,58 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++/* ++ * Copyright (C) 2001-2004 Amit S. Kale ++ */ ++ ++#include ++ ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. ++ */ ++#define _EAX 0 ++#define _ECX 1 ++#define _EDX 2 ++#define _EBX 3 ++#define _ESP 4 ++#define _EBP 5 ++#define _ESI 6 ++#define _EDI 7 ++#define _PC 8 ++#define _EIP 8 ++#define _PS 9 ++#define _EFLAGS 9 ++#define _CS 10 ++#define _SS 11 ++#define _DS 12 ++#define _ES 13 ++#define _FS 14 ++#define _GS 15 ++ ++/* So that we can denote the end of a frame for tracing, in the simple ++ * case. */ ++#define CFI_END_FRAME(func) __CFI_END_FRAME(_EIP,_ESP,func) ++ ++#ifndef __ASSEMBLY__ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 1024 ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES 64 ++/* Number of bytes of registers we need to save for a setjmp/longjmp. 
*/ ++#define NUMCRITREGBYTES 24 ++ ++#define BREAKPOINT() asm(" int $3"); ++#define BREAK_INSTR_SIZE 1 ++#define CACHE_FLUSH_IS_SAFE 1 ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ia64/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kdebug.h +--- linux-2.6.18-53.1.14/include/asm-ia64/kdebug.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kdebug.h 2008-06-10 15:38:32.000000000 +0400 +@@ -72,6 +72,7 @@ enum die_val { + DIE_KDEBUG_LEAVE, + DIE_KDUMP_ENTER, + DIE_KDUMP_LEAVE, ++ DIE_PAGE_FAULT_NO_CONTEXT, + }; + + static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs, +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ia64/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-ia64/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-ia64/kgdb.h 2008-06-10 15:38:32.000000000 +0400 +@@ -0,0 +1,36 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++/* ++ * Copyright (C) 2001-2004 Amit S. Kale ++ */ ++ ++#include ++ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 1024 ++ ++/* Number of bytes of registers. We set this to 0 so that certain GDB ++ * packets will fail, forcing the use of others, which are more friendly ++ * on ia64. */ ++#define NUMREGBYTES 0 ++ ++#define NUMCRITREGBYTES (70*8) ++#define JMP_REGS_ALIGNMENT __attribute__ ((aligned (16))) ++ ++#define BREAKNUM 0x00003333300LL ++#define KGDBBREAKNUM 0x6665UL ++#define BREAKPOINT() asm volatile ("break.m 0x6665") ++#define BREAK_INSTR_SIZE 16 ++#define CACHE_FLUSH_IS_SAFE 1 ++ ++struct pt_regs; ++extern volatile int kgdb_hwbreak_sstep[NR_CPUS]; ++extern void smp_send_nmi_allbutself(void); ++extern void kgdb_wait_ipi(struct pt_regs *); ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-mips/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-mips/kdebug.h +--- linux-2.6.18-53.1.14/include/asm-mips/kdebug.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-mips/kdebug.h 2008-06-10 15:38:24.000000000 +0400 +@@ -0,0 +1,47 @@ ++/* ++ * ++ * Copyright (C) 2004 MontaVista Software Inc. ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ * ++ */ ++#ifndef _MIPS_KDEBUG_H ++#define _MIPS_KDEBUG_H ++ ++#include ++ ++struct pt_regs; ++ ++struct die_args { ++ struct pt_regs *regs; ++ const char *str; ++ long err; ++}; ++ ++int register_die_notifier(struct notifier_block *nb); ++extern struct notifier_block *mips_die_chain; ++ ++enum die_val { ++ DIE_OOPS = 1, ++ DIE_PANIC, ++ DIE_DIE, ++ DIE_KERNELDEBUG, ++ DIE_TRAP, ++ DIE_PAGE_FAULT, ++}; ++ ++/* ++ * trap number can be computed from regs and signr can be computed using ++ * compute_signal() ++ */ ++static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err) ++{ ++ struct die_args args = { .regs=regs, .str=str, .err=err }; ++ return notifier_call_chain(&mips_die_chain, val, &args); ++} ++ ++#endif /* _MIPS_KDEBUG_H */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-mips/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-mips/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-mips/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-mips/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,34 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++#ifndef __ASSEMBLY__ ++#if (_MIPS_ISA == _MIPS_ISA_MIPS1) || (_MIPS_ISA == _MIPS_ISA_MIPS2) ++typedef u32 gdb_reg_t; ++#elif (_MIPS_ISA == _MIPS_ISA_MIPS3) || (_MIPS_ISA == _MIPS_ISA_MIPS4) ++typedef u64 gdb_reg_t; ++#else ++#error need to do ++#endif /* _MIPS_ISA */ ++ ++#include ++ ++#ifndef __ASSEMBLY__ ++#define BUFMAX 2048 ++#define NUMREGBYTES (90*sizeof(gdb_reg_t)) ++#define NUMCRITREGBYTES (12*sizeof(gdb_reg_t)) ++#define BREAK_INSTR_SIZE 4 ++#define BREAKPOINT() __asm__ __volatile__( \ ++ ".globl breakinst\n\t" \ ++ ".set\tnoreorder\n\t" \ ++ "nop\n" \ ++ "breakinst:\tbreak\n\t" \ ++ "nop\n\t" \ ++ ".set\treorder") ++#define CACHE_FLUSH_IS_SAFE 0 ++ ++extern int kgdb_early_setup; ++ ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-powerpc/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-powerpc/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-powerpc/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-powerpc/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,74 @@ ++/* ++ * include/asm-powerpc/kgdb.h ++ * ++ * The PowerPC (32/64) specific defines / externs for KGDB. Based on ++ * the previous 32bit and 64bit specific files, which had the following ++ * copyrights: ++ * ++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com) ++ * PPC Mods (C) 2004 Tom Rini (trini@mvista.com) ++ * PPC Mods (C) 2003 John Whitney (john.whitney@timesys.com) ++ * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu) ++ * ++ * ++ * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) ++ * Author: Tom Rini ++ * ++ * 2006 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++#ifdef __KERNEL__ ++#ifndef __POWERPC_KGDB_H__ ++#define __POWERPC_KGDB_H__ ++ ++#include ++#ifndef __ASSEMBLY__ ++ ++#define BREAK_INSTR_SIZE 4 ++#define BUFMAX ((NUMREGBYTES * 2) + 512) ++#define OUTBUFMAX ((NUMREGBYTES * 2) + 512) ++#define BREAKPOINT() asm(".long 0x7d821008"); /* twge r2, r2 */ ++#define CACHE_FLUSH_IS_SAFE 1 ++ ++/* The number bytes of registers we have to save depends on a few ++ * things. 
For 64bit we default to not including vector registers and ++ * vector state registers. */ ++#ifdef CONFIG_PPC64 ++/* ++ * 64 bit (8 byte) registers: ++ * 32 gpr, 32 fpr, nip, msr, link, ctr ++ * 32 bit (4 byte) registers: ++ * ccr, xer, fpscr ++ */ ++#define NUMREGBYTES ((68 * 8) + (3 * 4)) ++#if 0 ++/* The following adds in vector registers and vector state registers. */ ++/* 128 bit (16 byte) registers: ++ * 32 vr ++ * 64 bit (8 byte) registers: ++ * 32 gpr, 32 fpr, nip, msr, link, ctr ++ * 32 bit (4 byte) registers: ++ * ccr, xer, fpscr, vscr, vrsave ++ */ ++#define NUMREGBYTES ((128 * 16) + (68 * 8) + (5 * 4)) ++#endif ++#define NUMCRITREGBYTES 184 ++#else /* CONFIG_PPC32 */ ++/* On non-E500 family PPC32 we determine the size by picking the last ++ * register we need, but on E500 we skip sections so we list what we ++ * need to store, and add it up. */ ++#ifndef CONFIG_E500 ++#define MAXREG (PT_FPSCR+1) ++#else ++/* 32 GPRs (8 bytes), nip, msr, ccr, link, ctr, xer, acc (8 bytes), spefscr*/ ++#define MAXREG ((32*2)+6+2+1) ++#endif ++#define NUMREGBYTES (MAXREG * sizeof(int)) ++/* CR/LR, R1, R2, R13-R31 inclusive. */ ++#define NUMCRITREGBYTES (23 * sizeof(int)) ++#endif /* 32/64 */ ++#endif /* !(__ASSEMBLY__) */ ++#endif /* !__POWERPC_KGDB_H__ */ ++#endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-ppc/kgdb.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/kgdb.h 2008-06-10 15:38:14.000000000 +0400 +@@ -1,57 +1,18 @@ +-/* +- * kgdb.h: Defines and declarations for serial line source level +- * remote debugging of the Linux kernel using gdb. +- * +- * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu) +- * +- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) +- */ + #ifdef __KERNEL__ +-#ifndef _PPC_KGDB_H +-#define _PPC_KGDB_H +- ++#ifndef __PPC_KGDB_H__ ++#define __PPC_KGDB_H__ ++#include + #ifndef __ASSEMBLY__ +- +-/* Things specific to the gen550 backend. */ +-struct uart_port; +- +-extern void gen550_progress(char *, unsigned short); +-extern void gen550_kgdb_map_scc(void); +-extern void gen550_init(int, struct uart_port *); +- +-/* Things specific to the pmac backend. */ +-extern void zs_kgdb_hook(int tty_num); +- +-/* To init the kgdb engine. (called by serial hook)*/ +-extern void set_debug_traps(void); +- +-/* To enter the debugger explicitly. 
*/ +-extern void breakpoint(void); +- +-/* For taking exceptions ++ /* For taking exceptions + * these are defined in traps.c + */ +-extern int (*debugger)(struct pt_regs *regs); ++struct pt_regs; ++extern void (*debugger)(struct pt_regs *regs); + extern int (*debugger_bpt)(struct pt_regs *regs); + extern int (*debugger_sstep)(struct pt_regs *regs); + extern int (*debugger_iabr_match)(struct pt_regs *regs); + extern int (*debugger_dabr_match)(struct pt_regs *regs); + extern void (*debugger_fault_handler)(struct pt_regs *regs); +- +-/* What we bring to the party */ +-int kgdb_bpt(struct pt_regs *regs); +-int kgdb_sstep(struct pt_regs *regs); +-void kgdb(struct pt_regs *regs); +-int kgdb_iabr_match(struct pt_regs *regs); +-int kgdb_dabr_match(struct pt_regs *regs); +- +-/* +- * external low-level support routines (ie macserial.c) +- */ +-extern void kgdb_interruptible(int); /* control interrupts from serial */ +-extern void putDebugChar(char); /* write a single character */ +-extern char getDebugChar(void); /* read and return a single char */ +- +-#endif /* !(__ASSEMBLY__) */ +-#endif /* !(_PPC_KGDB_H) */ ++#endif /* !__ASSEMBLY__ */ ++#endif /* __PPC_KGDB_H__ */ + #endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/machdep.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/machdep.h +--- linux-2.6.18-53.1.14/include/asm-ppc/machdep.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/machdep.h 2008-06-10 15:38:14.000000000 +0400 +@@ -72,9 +72,7 @@ struct machdep_calls { + unsigned long (*find_end_of_memory)(void); + void (*setup_io_mappings)(void); + +- void (*early_serial_map)(void); + void (*progress)(char *, unsigned short); +- void (*kgdb_map_scc)(void); + + unsigned char (*nvram_read_val)(int addr); + void (*nvram_write_val)(int addr, unsigned char val); +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/mv64x60.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60.h +--- linux-2.6.18-53.1.14/include/asm-ppc/mv64x60.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60.h 2008-06-10 15:38:14.000000000 +0400 +@@ -348,6 +348,8 @@ u32 mv64x60_calc_mem_size(struct mv64x60 + + void mv64x60_progress_init(u32 base); + void mv64x60_mpsc_progress(char *s, unsigned short hex); ++struct platform_device * mv64x60_early_get_pdev_data(const char *name, ++ int id, int remove); + + extern struct mv64x60_32bit_window + gt64260_32bit_windows[MV64x60_32BIT_WIN_COUNT]; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-ppc/mv64x60_defs.h linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60_defs.h +--- linux-2.6.18-53.1.14/include/asm-ppc/mv64x60_defs.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-ppc/mv64x60_defs.h 2008-06-10 15:38:14.000000000 +0400 +@@ -57,7 +57,8 @@ + #define MV64x60_IRQ_I2C 37 + #define MV64x60_IRQ_BRG 39 + #define MV64x60_IRQ_MPSC_0 40 +-#define MV64x60_IRQ_MPSC_1 42 ++#define MV64360_IRQ_MPSC_1 41 ++#define GT64260_IRQ_MPSC_1 42 + #define MV64x60_IRQ_COMM 43 + #define MV64x60_IRQ_P0_GPP_0_7 56 + #define MV64x60_IRQ_P0_GPP_8_15 57 +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-sh/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-sh/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-sh/kgdb.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-sh/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -2,94 +2,40 @@ + * May be copied or modified under 
the terms of the GNU General Public + * License. See linux/COPYING for more information. + * +- * Based on original code by Glenn Engel, Jim Kingdon, +- * David Grothe , Tigran Aivazian, and +- * Amit S. Kale ++ * Based on a file that was modified or based on files by: Glenn Engel, ++ * Jim Kingdon, David Grothe , Tigran Aivazian , ++ * Amit S. Kale , sh-stub.c from Ben Lee and ++ * Steve Chamberlain, Henry Bell + * +- * Super-H port based on sh-stub.c (Ben Lee and Steve Chamberlain) by +- * Henry Bell +- * +- * Header file for low-level support for remote debug using GDB. ++ * Maintainer: Tom Rini + * + */ + + #ifndef __KGDB_H + #define __KGDB_H + +-#include +- +-struct console; ++#include ++/* Based on sh-gdb.c from gdb-6.1, Glenn ++ Engel at HP Ben Lee and Steve Chamberlain */ ++#define NUMREGBYTES 112 /* 92 */ ++#define NUMCRITREGBYTES (9 << 2) ++#define BUFMAX 400 + +-/* Same as pt_regs but has vbr in place of syscall_nr */ ++#ifndef __ASSEMBLY__ + struct kgdb_regs { + unsigned long regs[16]; + unsigned long pc; + unsigned long pr; +- unsigned long sr; + unsigned long gbr; ++ unsigned long vbr; + unsigned long mach; + unsigned long macl; +- unsigned long vbr; +-}; +- +-/* State info */ +-extern char kgdb_in_gdb_mode; +-extern int kgdb_done_init; +-extern int kgdb_enabled; +-extern int kgdb_nofault; /* Ignore bus errors (in gdb mem access) */ +-extern int kgdb_halt; /* Execute initial breakpoint at startup */ +-extern char in_nmi; /* Debounce flag to prevent NMI reentry*/ +- +-/* SCI */ +-extern int kgdb_portnum; +-extern int kgdb_baud; +-extern char kgdb_parity; +-extern char kgdb_bits; +-extern int kgdb_console_setup(struct console *, char *); +- +-/* Init and interface stuff */ +-extern int kgdb_init(void); +-extern int (*kgdb_serial_setup)(void); +-extern int (*kgdb_getchar)(void); +-extern void (*kgdb_putchar)(int); +- +-struct kgdb_sermap { +- char *name; +- int namelen; +- int (*setup_fn)(struct console *, char *); +- struct kgdb_sermap *next; ++ unsigned long sr; + }; +-extern void kgdb_register_sermap(struct kgdb_sermap *map); +-extern struct kgdb_sermap *kgdb_porttype; + +-/* Trap functions */ +-typedef void (kgdb_debug_hook_t)(struct pt_regs *regs); +-typedef void (kgdb_bus_error_hook_t)(void); +-extern kgdb_debug_hook_t *kgdb_debug_hook; +-extern kgdb_bus_error_hook_t *kgdb_bus_err_hook; +- +-extern void breakpoint(void); +- +-/* Console */ +-struct console; +-void kgdb_console_write(struct console *co, const char *s, unsigned count); +-void kgdb_console_init(void); +- +-/* Prototypes for jmp fns */ +-#define _JBLEN 9 +-typedef int jmp_buf[_JBLEN]; +-extern void longjmp(jmp_buf __jmpb, int __retval); +-extern int setjmp(jmp_buf __jmpb); +- +-/* Variadic macro to print our own message to the console */ +-#define KGDB_PRINTK(...) 
printk("KGDB: " __VA_ARGS__) +- +-/* Forced breakpoint */ +-#define BREAKPOINT() do { \ +- if (kgdb_enabled) { \ +- asm volatile("trapa #0xff"); \ +- } \ +-} while (0) ++#define BREAKPOINT() asm("trapa #0xff"); ++#define BREAK_INSTR_SIZE 2 ++#define CACHE_FLUSH_IS_SAFE 1 + + /* KGDB should be able to flush all kernel text space */ + #if defined(CONFIG_CPU_SH4) +@@ -102,30 +48,5 @@ extern int setjmp(jmp_buf __jmpb); + #else + #define kgdb_flush_icache_range(start, end) do { } while (0) + #endif +- +-/* Kernel assert macros */ +-#ifdef CONFIG_KGDB_KERNEL_ASSERTS +- +-/* Predefined conditions */ +-#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) +-#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) +-#define KA_VALID_KPTR(ptr) (!(ptr) || \ +- ((void *)(ptr) >= (void *)PAGE_OFFSET && \ +- (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) +-#define KA_VALID_PTRORERR(errptr) \ +- (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) +-#define KA_HELD_GKL() (current->lock_depth >= 0) +- +-/* The actual assert */ +-#define KGDB_ASSERT(condition, message) do { \ +- if (!(condition) && (kgdb_enabled)) { \ +- KGDB_PRINTK("Assertion failed at %s:%d: %s\n", \ +- __FILE__, __LINE__, message);\ +- BREAKPOINT(); \ +- } \ +-} while (0) +-#else +-#define KGDB_ASSERT(condition, message) +-#endif +- ++#endif /* !__ASSEMBLY__ */ + #endif +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-sh/system.h linux-2.6.18-53.1.14.kgdb/include/asm-sh/system.h +--- linux-2.6.18-53.1.14/include/asm-sh/system.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-sh/system.h 2008-06-10 15:38:50.000000000 +0400 +@@ -6,6 +6,7 @@ + * Copyright (C) 2002 Paul Mundt + */ + ++#include + + /* + * switch_to() should switch tasks to task nr n, first +@@ -260,6 +261,45 @@ static __inline__ unsigned long __xchg(u + return x; + } + ++static inline unsigned long __cmpxchg_u32(volatile int * m, unsigned long old, ++ unsigned long new) ++{ ++ __u32 retval; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ retval = *m; ++ if (retval == old) ++ *m = new; ++ local_irq_restore(flags); /* implies memory barrier */ ++ return retval; ++} ++ ++/* This function doesn't exist, so you'll get a linker error ++ * if something tries to do an invalid cmpxchg(). 
*/ ++extern void __cmpxchg_called_with_bad_pointer(void); ++ ++#define __HAVE_ARCH_CMPXCHG 1 ++ ++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, ++ unsigned long new, int size) ++{ ++ switch (size) { ++ case 4: ++ return __cmpxchg_u32(ptr, old, new); ++ } ++ __cmpxchg_called_with_bad_pointer(); ++ return old; ++} ++ ++#define cmpxchg(ptr,o,n) \ ++ ({ \ ++ __typeof__(*(ptr)) _o_ = (o); \ ++ __typeof__(*(ptr)) _n_ = (n); \ ++ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ ++ (unsigned long)_n_, sizeof(*(ptr))); \ ++ }) ++ + /* XXX + * disable hlt during certain critical i/o operations + */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-x86_64/kdebug.h linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kdebug.h +--- linux-2.6.18-53.1.14/include/asm-x86_64/kdebug.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kdebug.h 2008-06-10 15:38:41.000000000 +0400 +@@ -34,6 +34,7 @@ enum die_val { + DIE_CALL, + DIE_NMI_IPI, + DIE_PAGE_FAULT, ++ DIE_PAGE_FAULT_NO_CONTEXT, + }; + + static inline int notify_die(enum die_val val, const char *str, +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-x86_64/kgdb.h linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kgdb.h +--- linux-2.6.18-53.1.14/include/asm-x86_64/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/kgdb.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,54 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++/* ++ * Copyright (C) 2001-2004 Amit S. Kale ++ */ ++ ++#include ++ ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. ++ */ ++#define _RAX 0 ++#define _RDX 1 ++#define _RCX 2 ++#define _RBX 3 ++#define _RSI 4 ++#define _RDI 5 ++#define _RBP 6 ++#define _RSP 7 ++#define _R8 8 ++#define _R9 9 ++#define _R10 10 ++#define _R11 11 ++#define _R12 12 ++#define _R13 13 ++#define _R14 14 ++#define _R15 15 ++#define _PC 16 ++#define _PS 17 ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES ((_PS+1)*8) ++#define NUMCRITREGBYTES (8 * 8) /* 8 registers. */ ++ ++/* Help GDB to know when to stop backtracing. */ ++#define CFI_END_FRAME(func) __CFI_END_FRAME(_PC,_RSP,func) ++#ifndef __ASSEMBLY__ ++/* BUFMAX defines the maximum number of characters in inbound/outbound ++ * buffers at least NUMREGBYTES*2 are needed for register packets, and ++ * a longer buffer is needed to list all threads. 
*/ ++#define BUFMAX 1024 ++#define BREAKPOINT() asm(" int $3"); ++#define CHECK_EXCEPTION_STACK() ((&__get_cpu_var(init_tss))[0].ist[0]) ++#define BREAK_INSTR_SIZE 1 ++#define CACHE_FLUSH_IS_SAFE 1 ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/asm-x86_64/system.h linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/system.h +--- linux-2.6.18-53.1.14/include/asm-x86_64/system.h 2008-03-06 05:54:38.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/asm-x86_64/system.h 2008-06-10 15:38:44.000000000 +0400 +@@ -22,7 +22,9 @@ + + /* Save restore flags to clear handle leaking NT */ + #define switch_to(prev,next,last) \ +- asm volatile(SAVE_CONTEXT \ ++ asm volatile(".globl __switch_to_begin\n\t" \ ++ "__switch_to_begin:\n\t" \ ++ SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ +@@ -34,6 +36,8 @@ + "movq %%rax,%%rdi\n\t" \ + "jc ret_from_fork\n\t" \ + RESTORE_CONTEXT \ ++ ".globl __switch_to_end\n\t" \ ++ "__switch_to_end:\n\t" \ + : "=a" (last) \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/dwarf2-lang.h linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2-lang.h +--- linux-2.6.18-53.1.14/include/linux/dwarf2-lang.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2-lang.h 2008-06-10 15:39:01.000000000 +0400 +@@ -0,0 +1,300 @@ ++#ifndef DWARF2_LANG ++#define DWARF2_LANG ++ ++/* ++ * This is free software; you can redistribute it and/or modify it under ++ * the terms of the GNU General Public License as published by the Free ++ * Software Foundation; either version 2, or (at your option) any later ++ * version. ++ */ ++/* ++ * This file defines macros that allow generation of DWARF debug records ++ * for asm files. This file is platform independent. Register numbers ++ * (which are about the only thing that is platform dependent) are to be ++ * supplied by a platform defined file. ++ */ ++/* ++ * We need this to work for both asm and C. In asm we are using the ++ * old comment trick to concatenate while C uses the new ANSI thing. ++ * Here we have concat macro... The multi level thing is to allow and ++ * macros used in the names to be resolved prior to the cat (at which ++ * time they are no longer the same string). ++ */ ++#define CAT3(a,b,c) _CAT3(a,b,c) ++#define _CAT3(a,b,c) __CAT3(a,b,c) ++#ifndef __STDC__ ++#define __CAT3(a,b,c) a/**/b/**/c ++#else ++#define __CAT3(a,b,c) a##b##c ++#endif ++#ifdef __ASSEMBLY__ ++#define IFC(a) ++#define IFN_C(a) a ++#define NL ; ++#define QUOTE_THIS(a) a ++#define DWARF_preamble .section .debug_frame,"",%progbits; ++#else ++#define IFC(a) a ++#define IFN_C(a) ++#define NL \n\t ++#define QUOTE_THIS(a) _QUOTE_THIS(a) ++#define _QUOTE_THIS(a) #a ++/* Don't let CPP see the " and , \042=" \054=, */ ++#define DWARF_preamble .section .debug_frame \054\042\042\054%progbits ++#endif ++ ++#ifdef CONFIG_64BIT ++#define DATA_ALIGN_FACTOR 8 ++#define ADDR_LOC .quad ++#else ++#define DATA_ALIGN_FACTOR 4 ++#define ADDR_LOC .long ++#endif ++ ++#include ++/* ++ * This macro starts a debug frame section. The debug_frame describes ++ * where to find the registers that the enclosing function saved on ++ * entry. 
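A quick aside on the CAT3() helper defined near the top of this dwarf2-lang.h hunk: the extra levels of indirection exist so that macro arguments are expanded before the tokens are pasted. A standalone illustration, duplicating the header's definitions purely for demonstration:

#define DIRECT3(a,b,c)  a##b##c
#define CAT3(a,b,c)     _CAT3(a,b,c)
#define _CAT3(a,b,c)    __CAT3(a,b,c)
#define __CAT3(a,b,c)   a##b##c

#define ORD 7
/* DIRECT3(frame,_,ORD) yields the token frame_ORD, because operands of
 * ## are pasted without expansion.  CAT3(frame,_,ORD) expands ORD on
 * the way through the wrappers and yields frame_7, which is what the
 * label generation in the CFI_preamble/FDE_preamble macros below
 * relies on. */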
++ *
++ * ORD is used by the label generator and should be the same as what is
++ * passed to CFI_postamble.
++ *
++ * pc, pc register gdb ordinal.
++ *
++ * code_align this is the factor used to define locations or regions
++ * where the given definitions apply. If you use labels to define these,
++ * this should be 1.
++ *
++ * data_align this is the factor used to define register offsets. If
++ * you use struct offset, this should be the size of the register in
++ * bytes or the negative of that. This is how it is used: you will
++ * define a register as the reference register, say the stack pointer,
++ * then you will say where a register is located relative to this
++ * reference register's value, say 40 for register 3 (the gdb register
++ * number). The <40> will be multiplied by <data_align> to define the
++ * byte offset of the given register (3, in this example). So if your
++ * <40> is the byte offset and the reference register points at the
++ * beginning, you would want 1 for the data_align. If <40> was the 40th
++ * 4-byte element in that structure you would want 4. And if your
++ * reference register points at the end of the structure you would want
++ * a negative data_align value (and you would have to do other math as
++ * well).
++ */
++
++#define CFI_preamble(ORD, pc, code_align, data_align) \
++ DWARF_preamble NL \
++ .align DATA_ALIGN_FACTOR NL \
++ .globl CAT3(frame,_,ORD) NL \
++CAT3(frame,_,ORD): NL \
++ .long 7f-6f NL \
++6: \
++ .long DW_CIE_ID NL \
++ .byte DW_CIE_VERSION NL \
++ .byte 0 NL \
++ .uleb128 code_align NL \
++ .sleb128 data_align NL \
++ .byte pc NL
++
++/*
++ * After the above macro and prior to the CFI_postamble, you need to
++ * define the initial state. This starts with defining the reference
++ * register and, usually, the pc. Here are some helper macros:
++ */
++
++#define CFA_define_reference(reg, offset) \
++ .byte DW_CFA_def_cfa NL \
++ .uleb128 reg NL \
++ .uleb128 (offset) NL
++
++#define CFA_define_offset(reg, offset) \
++ .byte (DW_CFA_offset + reg) NL \
++ .uleb128 (offset) NL
++
++#define CFA_restore(reg) \
++ .byte (DW_CFA_restore + reg) NL
++
++#define CFI_postamble() \
++ .align DATA_ALIGN_FACTOR NL \
++7: NL \
++.previous NL
++
++/*
++ * So now your code pushes stuff on the stack, and you need a new location
++ * and the rules for what to do. This starts a running description of
++ * the call frame. You need to describe what changes with respect to
++ * the call registers as the location of the pc moves through the code.
++ * The following builds an FDE (frame description entry). Like the
++ * above, it has a preamble and a postamble. It also is tied to the CFI
++ * above.
++ * The preamble macro is tied to the CFI through the first parameter. The
++ * second is the code start address and then the code end address+1.
++ */
++#define FDE_preamble(ORD, initial_address, end_address) \
++ DWARF_preamble NL \
++ .align DATA_ALIGN_FACTOR NL \
++ .long 9f-8f NL \
++8: \
++ .long CAT3(frame,_,ORD) NL \
++ ADDR_LOC initial_address NL \
++ ADDR_LOC (end_address - initial_address) NL
++
++#define FDE_postamble() \
++ .align DATA_ALIGN_FACTOR NL \
++9: NL \
++.previous NL
++
++/*
++ * That done, you can now add registers, subtract registers, move the
++ * reference and even change the reference. You can also define a new
++ * area of code the info applies to. For discontinuous bits you should
++ * start a new FDE. You may have as many as you like.
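For readers new to DWARF, the .uleb128/.sleb128 directives these macros emit use the spec's variable-length integer encoding: seven data bits per byte, with the high bit set on every byte except the last. A small C equivalent of what the assembler produces, for reference only (not part of the patch):

#include <stdint.h>
#include <stddef.h>

/* Emit the ULEB128 encoding of value into out; returns bytes written.
 * This mirrors what a .uleb128 directive assembles to. */
static size_t uleb128_encode(uint64_t value, uint8_t *out)
{
        size_t n = 0;

        do {
                uint8_t byte = value & 0x7f;

                value >>= 7;
                if (value)
                        byte |= 0x80;   /* more bytes follow */
                out[n++] = byte;
        } while (value);
        return n;
}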
++ */ ++ ++/* ++ * To advance the stack address by (0x3f max) ++ */ ++ ++#define CFA_advance_loc(bytes) \ ++ .byte DW_CFA_advance_loc+bytes NL ++ ++/* ++ * This one is good for 0xff or 255 ++ */ ++#define CFA_advance_loc1(bytes) \ ++ .byte DW_CFA_advance_loc1 NL \ ++ .byte bytes NL ++ ++#define CFA_undefine_reg(reg) \ ++ .byte DW_CFA_undefined NL \ ++ .uleb128 reg NL ++/* ++ * With the above you can define all the register locations. But ++ * suppose the reference register moves... Takes the new offset NOT an ++ * increment. This is how esp is tracked if it is not saved. ++ */ ++ ++#define CFA_define_cfa_offset(offset) \ ++ .byte DW_CFA_def_cfa_offset NL \ ++ .uleb128 (offset) NL ++/* ++ * Or suppose you want to use a different reference register... ++ */ ++#define CFA_define_cfa_register(reg) \ ++ .byte DW_CFA_def_cfa_register NL \ ++ .uleb128 reg NL ++ ++/* ++ * If you want to mess with the stack pointer, here is the expression. ++ * The stack starts empty. ++ */ ++#define CFA_def_cfa_expression \ ++ .byte DW_CFA_def_cfa_expression NL \ ++ .uleb128 20f-10f NL \ ++10: NL ++/* ++ * This expression is to be used for other regs. The stack starts with the ++ * stack address. ++ */ ++ ++#define CFA_expression(reg) \ ++ .byte DW_CFA_expression NL \ ++ .uleb128 reg NL \ ++ .uleb128 20f-10f NL \ ++10: NL ++/* ++ * Here we do the expression stuff. You should code the above followed ++ * by expression OPs followed by CFA_expression_end. ++ */ ++ ++ ++#define CFA_expression_end \ ++20: NL ++ ++#define CFA_exp_OP_const4s(a) \ ++ .byte DW_OP_const4s NL \ ++ .long a NL ++ ++#define CFA_exp_OP_swap .byte DW_OP_swap NL ++#define CFA_exp_OP_dup .byte DW_OP_dup NL ++#define CFA_exp_OP_drop .byte DW_OP_drop NL ++/* ++ * All these work on the top two elements on the stack, replacing them ++ * with the result. Top comes first where it matters. True is 1, false 0. ++ */ ++#define CFA_exp_OP_deref .byte DW_OP_deref NL ++#define CFA_exp_OP_and .byte DW_OP_and NL ++#define CFA_exp_OP_div .byte DW_OP_div NL ++#define CFA_exp_OP_minus .byte DW_OP_minus NL ++#define CFA_exp_OP_mod .byte DW_OP_mod NL ++#define CFA_exp_OP_neg .byte DW_OP_neg NL ++#define CFA_exp_OP_plus .byte DW_OP_plus NL ++#define CFA_exp_OP_not .byte DW_OP_not NL ++#define CFA_exp_OP_or .byte DW_OP_or NL ++#define CFA_exp_OP_xor .byte DW_OP_xor NL ++#define CFA_exp_OP_le .byte DW_OP_le NL ++#define CFA_exp_OP_ge .byte DW_OP_ge NL ++#define CFA_exp_OP_eq .byte DW_OP_eq NL ++#define CFA_exp_OP_lt .byte DW_OP_lt NL ++#define CFA_exp_OP_gt .byte DW_OP_gt NL ++#define CFA_exp_OP_ne .byte DW_OP_ne NL ++/* ++ * These take a parameter as noted ++ */ ++/* ++ * Unconditional skip to loc. 
loc is a label (loc:)
++ */
++#define CFA_exp_OP_skip(loc) \
++ .byte DW_OP_skip NL \
++ .hword loc-.-2 NL
++/*
++ * Conditional skip to loc (TOS != 0, TOS--) (loc is a label)
++ */
++#define CFA_exp_OP_bra(loc) \
++ .byte DW_OP_bra NL \
++ .hword loc-.-2 NL
++
++/*
++ * TOS += no (an unsigned number)
++ */
++#define CFA_exp_OP_plus_uconst(no) \
++ .byte DW_OP_plus_uconst NL \
++ .uleb128 no NL
++
++/*
++ * ++TOS = no (an unsigned number)
++ */
++#define CFA_exp_OP_constu(no) \
++ .byte DW_OP_constu NL \
++ .uleb128 no NL
++/*
++ * ++TOS = no (a signed number)
++ */
++#define CFA_exp_OP_consts(no) \
++ .byte DW_OP_consts NL \
++ .sleb128 no NL
++/*
++ * ++TOS = no (an unsigned byte)
++ */
++#define CFA_exp_OP_const1u(no) \
++ .byte DW_OP_const1u NL \
++ .byte no NL
++
++
++/*
++ * ++TOS = no (an address)
++ */
++#define CFA_exp_OP_addr(no) \
++ .byte DW_OP_addr NL \
++ .long no NL
++
++/*
++ * Push the current frame's value for "reg" + offset
++ * We take advantage of the opcode assignments to make this a literal reg
++ * rather than use the DW_OP_bregx opcode.
++ */
++
++#define CFA_exp_OP_breg(reg,offset) \
++ .byte DW_OP_breg0+reg NL \
++ .sleb128 offset NL
++#endif
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/dwarf2.h linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2.h
+--- linux-2.6.18-53.1.14/include/linux/dwarf2.h 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/include/linux/dwarf2.h 2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,775 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++ debugging information format.
++ Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002,
++ 2003 Free Software Foundation, Inc.
++
++ Written by Gary Funck (gary@intrepid.com). The Ada Joint Program
++ Office (AJPO), Florida State University and Silicon Graphics Inc.
++ provided support for this effort -- June 21, 1995.
++
++ Derived from the DWARF 1 implementation written by Ron Guilmette
++ (rfg@netcom.com), November 1990.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it under
++ the terms of the GNU General Public License as published by the Free
++ Software Foundation; either version 2, or (at your option) any later
++ version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING. If not, write to the Free
++ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++ 02111-1307, USA. */
++
++/* This file is derived from the DWARF specification (a public document)
++ Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++ Programming Languages Special Interest Group (UI/PLSIG) and distributed
++ by UNIX International. Copies of this specification are available from
++ UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++ This file also now contains definitions from the DWARF 3 specification. */
++
++/* This file is shared between GCC and GDB, and should not contain
++ prototypes. */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section.
*/ ++typedef struct ++{ ++ unsigned char li_length [4]; ++ unsigned char li_version [2]; ++ unsigned char li_prologue_length [4]; ++ unsigned char li_min_insn_length [1]; ++ unsigned char li_default_is_stmt [1]; ++ unsigned char li_line_base [1]; ++ unsigned char li_line_range [1]; ++ unsigned char li_opcode_base [1]; ++} ++DWARF2_External_LineInfo; ++ ++typedef struct ++{ ++ unsigned long li_length; ++ unsigned short li_version; ++ unsigned int li_prologue_length; ++ unsigned char li_min_insn_length; ++ unsigned char li_default_is_stmt; ++ int li_line_base; ++ unsigned char li_line_range; ++ unsigned char li_opcode_base; ++} ++DWARF2_Internal_LineInfo; ++ ++/* Structure found in .debug_pubnames section. */ ++typedef struct ++{ ++ unsigned char pn_length [4]; ++ unsigned char pn_version [2]; ++ unsigned char pn_offset [4]; ++ unsigned char pn_size [4]; ++} ++DWARF2_External_PubNames; ++ ++typedef struct ++{ ++ unsigned long pn_length; ++ unsigned short pn_version; ++ unsigned long pn_offset; ++ unsigned long pn_size; ++} ++DWARF2_Internal_PubNames; ++ ++/* Structure found in .debug_info section. */ ++typedef struct ++{ ++ unsigned char cu_length [4]; ++ unsigned char cu_version [2]; ++ unsigned char cu_abbrev_offset [4]; ++ unsigned char cu_pointer_size [1]; ++} ++DWARF2_External_CompUnit; ++ ++typedef struct ++{ ++ unsigned long cu_length; ++ unsigned short cu_version; ++ unsigned long cu_abbrev_offset; ++ unsigned char cu_pointer_size; ++} ++DWARF2_Internal_CompUnit; ++ ++typedef struct ++{ ++ unsigned char ar_length [4]; ++ unsigned char ar_version [2]; ++ unsigned char ar_info_offset [4]; ++ unsigned char ar_pointer_size [1]; ++ unsigned char ar_segment_size [1]; ++} ++DWARF2_External_ARange; ++ ++typedef struct ++{ ++ unsigned long ar_length; ++ unsigned short ar_version; ++ unsigned long ar_info_offset; ++ unsigned char ar_pointer_size; ++ unsigned char ar_segment_size; ++} ++DWARF2_Internal_ARange; ++ ++ ++/* Tag names and codes. */ ++enum dwarf_tag ++ { ++ DW_TAG_padding = 0x00, ++ DW_TAG_array_type = 0x01, ++ DW_TAG_class_type = 0x02, ++ DW_TAG_entry_point = 0x03, ++ DW_TAG_enumeration_type = 0x04, ++ DW_TAG_formal_parameter = 0x05, ++ DW_TAG_imported_declaration = 0x08, ++ DW_TAG_label = 0x0a, ++ DW_TAG_lexical_block = 0x0b, ++ DW_TAG_member = 0x0d, ++ DW_TAG_pointer_type = 0x0f, ++ DW_TAG_reference_type = 0x10, ++ DW_TAG_compile_unit = 0x11, ++ DW_TAG_string_type = 0x12, ++ DW_TAG_structure_type = 0x13, ++ DW_TAG_subroutine_type = 0x15, ++ DW_TAG_typedef = 0x16, ++ DW_TAG_union_type = 0x17, ++ DW_TAG_unspecified_parameters = 0x18, ++ DW_TAG_variant = 0x19, ++ DW_TAG_common_block = 0x1a, ++ DW_TAG_common_inclusion = 0x1b, ++ DW_TAG_inheritance = 0x1c, ++ DW_TAG_inlined_subroutine = 0x1d, ++ DW_TAG_module = 0x1e, ++ DW_TAG_ptr_to_member_type = 0x1f, ++ DW_TAG_set_type = 0x20, ++ DW_TAG_subrange_type = 0x21, ++ DW_TAG_with_stmt = 0x22, ++ DW_TAG_access_declaration = 0x23, ++ DW_TAG_base_type = 0x24, ++ DW_TAG_catch_block = 0x25, ++ DW_TAG_const_type = 0x26, ++ DW_TAG_constant = 0x27, ++ DW_TAG_enumerator = 0x28, ++ DW_TAG_file_type = 0x29, ++ DW_TAG_friend = 0x2a, ++ DW_TAG_namelist = 0x2b, ++ DW_TAG_namelist_item = 0x2c, ++ DW_TAG_packed_type = 0x2d, ++ DW_TAG_subprogram = 0x2e, ++ DW_TAG_template_type_param = 0x2f, ++ DW_TAG_template_value_param = 0x30, ++ DW_TAG_thrown_type = 0x31, ++ DW_TAG_try_block = 0x32, ++ DW_TAG_variant_part = 0x33, ++ DW_TAG_variable = 0x34, ++ DW_TAG_volatile_type = 0x35, ++ /* DWARF 3. 
*/ ++ DW_TAG_dwarf_procedure = 0x36, ++ DW_TAG_restrict_type = 0x37, ++ DW_TAG_interface_type = 0x38, ++ DW_TAG_namespace = 0x39, ++ DW_TAG_imported_module = 0x3a, ++ DW_TAG_unspecified_type = 0x3b, ++ DW_TAG_partial_unit = 0x3c, ++ DW_TAG_imported_unit = 0x3d, ++ /* SGI/MIPS Extensions. */ ++ DW_TAG_MIPS_loop = 0x4081, ++ /* HP extensions. See: ftp://ftp.hp.com/pub/lang/tools/WDB/wdb-4.0.tar.gz . */ ++ DW_TAG_HP_array_descriptor = 0x4090, ++ /* GNU extensions. */ ++ DW_TAG_format_label = 0x4101, /* For FORTRAN 77 and Fortran 90. */ ++ DW_TAG_function_template = 0x4102, /* For C++. */ ++ DW_TAG_class_template = 0x4103, /* For C++. */ ++ DW_TAG_GNU_BINCL = 0x4104, ++ DW_TAG_GNU_EINCL = 0x4105, ++ /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ ++ DW_TAG_upc_shared_type = 0x8765, ++ DW_TAG_upc_strict_type = 0x8766, ++ DW_TAG_upc_relaxed_type = 0x8767, ++ /* PGI (STMicroelectronics) extensions. No documentation available. */ ++ DW_TAG_PGI_kanji_type = 0xA000, ++ DW_TAG_PGI_interface_block = 0xA020 ++ }; ++ ++#define DW_TAG_lo_user 0x4080 ++#define DW_TAG_hi_user 0xffff ++ ++/* Flag that tells whether entry has a child or not. */ ++#define DW_children_no 0 ++#define DW_children_yes 1 ++ ++/* Form names and codes. */ ++enum dwarf_form ++ { ++ DW_FORM_addr = 0x01, ++ DW_FORM_block2 = 0x03, ++ DW_FORM_block4 = 0x04, ++ DW_FORM_data2 = 0x05, ++ DW_FORM_data4 = 0x06, ++ DW_FORM_data8 = 0x07, ++ DW_FORM_string = 0x08, ++ DW_FORM_block = 0x09, ++ DW_FORM_block1 = 0x0a, ++ DW_FORM_data1 = 0x0b, ++ DW_FORM_flag = 0x0c, ++ DW_FORM_sdata = 0x0d, ++ DW_FORM_strp = 0x0e, ++ DW_FORM_udata = 0x0f, ++ DW_FORM_ref_addr = 0x10, ++ DW_FORM_ref1 = 0x11, ++ DW_FORM_ref2 = 0x12, ++ DW_FORM_ref4 = 0x13, ++ DW_FORM_ref8 = 0x14, ++ DW_FORM_ref_udata = 0x15, ++ DW_FORM_indirect = 0x16 ++ }; ++ ++/* Attribute names and codes. 
*/ ++enum dwarf_attribute ++ { ++ DW_AT_sibling = 0x01, ++ DW_AT_location = 0x02, ++ DW_AT_name = 0x03, ++ DW_AT_ordering = 0x09, ++ DW_AT_subscr_data = 0x0a, ++ DW_AT_byte_size = 0x0b, ++ DW_AT_bit_offset = 0x0c, ++ DW_AT_bit_size = 0x0d, ++ DW_AT_element_list = 0x0f, ++ DW_AT_stmt_list = 0x10, ++ DW_AT_low_pc = 0x11, ++ DW_AT_high_pc = 0x12, ++ DW_AT_language = 0x13, ++ DW_AT_member = 0x14, ++ DW_AT_discr = 0x15, ++ DW_AT_discr_value = 0x16, ++ DW_AT_visibility = 0x17, ++ DW_AT_import = 0x18, ++ DW_AT_string_length = 0x19, ++ DW_AT_common_reference = 0x1a, ++ DW_AT_comp_dir = 0x1b, ++ DW_AT_const_value = 0x1c, ++ DW_AT_containing_type = 0x1d, ++ DW_AT_default_value = 0x1e, ++ DW_AT_inline = 0x20, ++ DW_AT_is_optional = 0x21, ++ DW_AT_lower_bound = 0x22, ++ DW_AT_producer = 0x25, ++ DW_AT_prototyped = 0x27, ++ DW_AT_return_addr = 0x2a, ++ DW_AT_start_scope = 0x2c, ++ DW_AT_stride_size = 0x2e, ++ DW_AT_upper_bound = 0x2f, ++ DW_AT_abstract_origin = 0x31, ++ DW_AT_accessibility = 0x32, ++ DW_AT_address_class = 0x33, ++ DW_AT_artificial = 0x34, ++ DW_AT_base_types = 0x35, ++ DW_AT_calling_convention = 0x36, ++ DW_AT_count = 0x37, ++ DW_AT_data_member_location = 0x38, ++ DW_AT_decl_column = 0x39, ++ DW_AT_decl_file = 0x3a, ++ DW_AT_decl_line = 0x3b, ++ DW_AT_declaration = 0x3c, ++ DW_AT_discr_list = 0x3d, ++ DW_AT_encoding = 0x3e, ++ DW_AT_external = 0x3f, ++ DW_AT_frame_base = 0x40, ++ DW_AT_friend = 0x41, ++ DW_AT_identifier_case = 0x42, ++ DW_AT_macro_info = 0x43, ++ DW_AT_namelist_items = 0x44, ++ DW_AT_priority = 0x45, ++ DW_AT_segment = 0x46, ++ DW_AT_specification = 0x47, ++ DW_AT_static_link = 0x48, ++ DW_AT_type = 0x49, ++ DW_AT_use_location = 0x4a, ++ DW_AT_variable_parameter = 0x4b, ++ DW_AT_virtuality = 0x4c, ++ DW_AT_vtable_elem_location = 0x4d, ++ /* DWARF 3 values. */ ++ DW_AT_allocated = 0x4e, ++ DW_AT_associated = 0x4f, ++ DW_AT_data_location = 0x50, ++ DW_AT_stride = 0x51, ++ DW_AT_entry_pc = 0x52, ++ DW_AT_use_UTF8 = 0x53, ++ DW_AT_extension = 0x54, ++ DW_AT_ranges = 0x55, ++ DW_AT_trampoline = 0x56, ++ DW_AT_call_column = 0x57, ++ DW_AT_call_file = 0x58, ++ DW_AT_call_line = 0x59, ++ /* SGI/MIPS extensions. */ ++ DW_AT_MIPS_fde = 0x2001, ++ DW_AT_MIPS_loop_begin = 0x2002, ++ DW_AT_MIPS_tail_loop_begin = 0x2003, ++ DW_AT_MIPS_epilog_begin = 0x2004, ++ DW_AT_MIPS_loop_unroll_factor = 0x2005, ++ DW_AT_MIPS_software_pipeline_depth = 0x2006, ++ DW_AT_MIPS_linkage_name = 0x2007, ++ DW_AT_MIPS_stride = 0x2008, ++ DW_AT_MIPS_abstract_name = 0x2009, ++ DW_AT_MIPS_clone_origin = 0x200a, ++ DW_AT_MIPS_has_inlines = 0x200b, ++ /* HP extensions. */ ++ DW_AT_HP_block_index = 0x2000, ++ DW_AT_HP_unmodifiable = 0x2001, /* Same as DW_AT_MIPS_fde. */ ++ DW_AT_HP_actuals_stmt_list = 0x2010, ++ DW_AT_HP_proc_per_section = 0x2011, ++ DW_AT_HP_raw_data_ptr = 0x2012, ++ DW_AT_HP_pass_by_reference = 0x2013, ++ DW_AT_HP_opt_level = 0x2014, ++ DW_AT_HP_prof_version_id = 0x2015, ++ DW_AT_HP_opt_flags = 0x2016, ++ DW_AT_HP_cold_region_low_pc = 0x2017, ++ DW_AT_HP_cold_region_high_pc = 0x2018, ++ DW_AT_HP_all_variables_modifiable = 0x2019, ++ DW_AT_HP_linkage_name = 0x201a, ++ DW_AT_HP_prof_flags = 0x201b, /* In comp unit of procs_info for -g. */ ++ /* GNU extensions. */ ++ DW_AT_sf_names = 0x2101, ++ DW_AT_src_info = 0x2102, ++ DW_AT_mac_info = 0x2103, ++ DW_AT_src_coords = 0x2104, ++ DW_AT_body_begin = 0x2105, ++ DW_AT_body_end = 0x2106, ++ DW_AT_GNU_vector = 0x2107, ++ /* VMS extensions. */ ++ DW_AT_VMS_rtnbeg_pd_address = 0x2201, ++ /* UPC extension. 
*/ ++ DW_AT_upc_threads_scaled = 0x3210, ++ /* PGI (STMicroelectronics) extensions. */ ++ DW_AT_PGI_lbase = 0x3a00, ++ DW_AT_PGI_soffset = 0x3a01, ++ DW_AT_PGI_lstride = 0x3a02 ++ }; ++ ++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ ++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ ++ ++/* Location atom names and codes. */ ++enum dwarf_location_atom ++ { ++ DW_OP_addr = 0x03, ++ DW_OP_deref = 0x06, ++ DW_OP_const1u = 0x08, ++ DW_OP_const1s = 0x09, ++ DW_OP_const2u = 0x0a, ++ DW_OP_const2s = 0x0b, ++ DW_OP_const4u = 0x0c, ++ DW_OP_const4s = 0x0d, ++ DW_OP_const8u = 0x0e, ++ DW_OP_const8s = 0x0f, ++ DW_OP_constu = 0x10, ++ DW_OP_consts = 0x11, ++ DW_OP_dup = 0x12, ++ DW_OP_drop = 0x13, ++ DW_OP_over = 0x14, ++ DW_OP_pick = 0x15, ++ DW_OP_swap = 0x16, ++ DW_OP_rot = 0x17, ++ DW_OP_xderef = 0x18, ++ DW_OP_abs = 0x19, ++ DW_OP_and = 0x1a, ++ DW_OP_div = 0x1b, ++ DW_OP_minus = 0x1c, ++ DW_OP_mod = 0x1d, ++ DW_OP_mul = 0x1e, ++ DW_OP_neg = 0x1f, ++ DW_OP_not = 0x20, ++ DW_OP_or = 0x21, ++ DW_OP_plus = 0x22, ++ DW_OP_plus_uconst = 0x23, ++ DW_OP_shl = 0x24, ++ DW_OP_shr = 0x25, ++ DW_OP_shra = 0x26, ++ DW_OP_xor = 0x27, ++ DW_OP_bra = 0x28, ++ DW_OP_eq = 0x29, ++ DW_OP_ge = 0x2a, ++ DW_OP_gt = 0x2b, ++ DW_OP_le = 0x2c, ++ DW_OP_lt = 0x2d, ++ DW_OP_ne = 0x2e, ++ DW_OP_skip = 0x2f, ++ DW_OP_lit0 = 0x30, ++ DW_OP_lit1 = 0x31, ++ DW_OP_lit2 = 0x32, ++ DW_OP_lit3 = 0x33, ++ DW_OP_lit4 = 0x34, ++ DW_OP_lit5 = 0x35, ++ DW_OP_lit6 = 0x36, ++ DW_OP_lit7 = 0x37, ++ DW_OP_lit8 = 0x38, ++ DW_OP_lit9 = 0x39, ++ DW_OP_lit10 = 0x3a, ++ DW_OP_lit11 = 0x3b, ++ DW_OP_lit12 = 0x3c, ++ DW_OP_lit13 = 0x3d, ++ DW_OP_lit14 = 0x3e, ++ DW_OP_lit15 = 0x3f, ++ DW_OP_lit16 = 0x40, ++ DW_OP_lit17 = 0x41, ++ DW_OP_lit18 = 0x42, ++ DW_OP_lit19 = 0x43, ++ DW_OP_lit20 = 0x44, ++ DW_OP_lit21 = 0x45, ++ DW_OP_lit22 = 0x46, ++ DW_OP_lit23 = 0x47, ++ DW_OP_lit24 = 0x48, ++ DW_OP_lit25 = 0x49, ++ DW_OP_lit26 = 0x4a, ++ DW_OP_lit27 = 0x4b, ++ DW_OP_lit28 = 0x4c, ++ DW_OP_lit29 = 0x4d, ++ DW_OP_lit30 = 0x4e, ++ DW_OP_lit31 = 0x4f, ++ DW_OP_reg0 = 0x50, ++ DW_OP_reg1 = 0x51, ++ DW_OP_reg2 = 0x52, ++ DW_OP_reg3 = 0x53, ++ DW_OP_reg4 = 0x54, ++ DW_OP_reg5 = 0x55, ++ DW_OP_reg6 = 0x56, ++ DW_OP_reg7 = 0x57, ++ DW_OP_reg8 = 0x58, ++ DW_OP_reg9 = 0x59, ++ DW_OP_reg10 = 0x5a, ++ DW_OP_reg11 = 0x5b, ++ DW_OP_reg12 = 0x5c, ++ DW_OP_reg13 = 0x5d, ++ DW_OP_reg14 = 0x5e, ++ DW_OP_reg15 = 0x5f, ++ DW_OP_reg16 = 0x60, ++ DW_OP_reg17 = 0x61, ++ DW_OP_reg18 = 0x62, ++ DW_OP_reg19 = 0x63, ++ DW_OP_reg20 = 0x64, ++ DW_OP_reg21 = 0x65, ++ DW_OP_reg22 = 0x66, ++ DW_OP_reg23 = 0x67, ++ DW_OP_reg24 = 0x68, ++ DW_OP_reg25 = 0x69, ++ DW_OP_reg26 = 0x6a, ++ DW_OP_reg27 = 0x6b, ++ DW_OP_reg28 = 0x6c, ++ DW_OP_reg29 = 0x6d, ++ DW_OP_reg30 = 0x6e, ++ DW_OP_reg31 = 0x6f, ++ DW_OP_breg0 = 0x70, ++ DW_OP_breg1 = 0x71, ++ DW_OP_breg2 = 0x72, ++ DW_OP_breg3 = 0x73, ++ DW_OP_breg4 = 0x74, ++ DW_OP_breg5 = 0x75, ++ DW_OP_breg6 = 0x76, ++ DW_OP_breg7 = 0x77, ++ DW_OP_breg8 = 0x78, ++ DW_OP_breg9 = 0x79, ++ DW_OP_breg10 = 0x7a, ++ DW_OP_breg11 = 0x7b, ++ DW_OP_breg12 = 0x7c, ++ DW_OP_breg13 = 0x7d, ++ DW_OP_breg14 = 0x7e, ++ DW_OP_breg15 = 0x7f, ++ DW_OP_breg16 = 0x80, ++ DW_OP_breg17 = 0x81, ++ DW_OP_breg18 = 0x82, ++ DW_OP_breg19 = 0x83, ++ DW_OP_breg20 = 0x84, ++ DW_OP_breg21 = 0x85, ++ DW_OP_breg22 = 0x86, ++ DW_OP_breg23 = 0x87, ++ DW_OP_breg24 = 0x88, ++ DW_OP_breg25 = 0x89, ++ DW_OP_breg26 = 0x8a, ++ DW_OP_breg27 = 0x8b, ++ DW_OP_breg28 = 0x8c, ++ DW_OP_breg29 = 0x8d, ++ DW_OP_breg30 = 0x8e, ++ DW_OP_breg31 = 0x8f, ++ 
DW_OP_regx = 0x90, ++ DW_OP_fbreg = 0x91, ++ DW_OP_bregx = 0x92, ++ DW_OP_piece = 0x93, ++ DW_OP_deref_size = 0x94, ++ DW_OP_xderef_size = 0x95, ++ DW_OP_nop = 0x96, ++ /* DWARF 3 extensions. */ ++ DW_OP_push_object_address = 0x97, ++ DW_OP_call2 = 0x98, ++ DW_OP_call4 = 0x99, ++ DW_OP_call_ref = 0x9a, ++ /* GNU extensions. */ ++ DW_OP_GNU_push_tls_address = 0xe0, ++ /* HP extensions. */ ++ DW_OP_HP_unknown = 0xe0, /* Ouch, the same as GNU_push_tls_address. */ ++ DW_OP_HP_is_value = 0xe1, ++ DW_OP_HP_fltconst4 = 0xe2, ++ DW_OP_HP_fltconst8 = 0xe3, ++ DW_OP_HP_mod_range = 0xe4, ++ DW_OP_HP_unmod_range = 0xe5, ++ DW_OP_HP_tls = 0xe6 ++ }; ++ ++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ ++#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ ++ ++/* Type encodings. */ ++enum dwarf_type ++ { ++ DW_ATE_void = 0x0, ++ DW_ATE_address = 0x1, ++ DW_ATE_boolean = 0x2, ++ DW_ATE_complex_float = 0x3, ++ DW_ATE_float = 0x4, ++ DW_ATE_signed = 0x5, ++ DW_ATE_signed_char = 0x6, ++ DW_ATE_unsigned = 0x7, ++ DW_ATE_unsigned_char = 0x8, ++ /* DWARF 3. */ ++ DW_ATE_imaginary_float = 0x9, ++ /* HP extensions. */ ++ DW_ATE_HP_float80 = 0x80, /* Floating-point (80 bit). */ ++ DW_ATE_HP_complex_float80 = 0x81, /* Complex floating-point (80 bit). */ ++ DW_ATE_HP_float128 = 0x82, /* Floating-point (128 bit). */ ++ DW_ATE_HP_complex_float128 = 0x83, /* Complex floating-point (128 bit). */ ++ DW_ATE_HP_floathpintel = 0x84, /* Floating-point (82 bit IA64). */ ++ DW_ATE_HP_imaginary_float80 = 0x85, ++ DW_ATE_HP_imaginary_float128 = 0x86 ++ }; ++ ++#define DW_ATE_lo_user 0x80 ++#define DW_ATE_hi_user 0xff ++ ++/* Array ordering names and codes. */ ++enum dwarf_array_dim_ordering ++ { ++ DW_ORD_row_major = 0, ++ DW_ORD_col_major = 1 ++ }; ++ ++/* Access attribute. */ ++enum dwarf_access_attribute ++ { ++ DW_ACCESS_public = 1, ++ DW_ACCESS_protected = 2, ++ DW_ACCESS_private = 3 ++ }; ++ ++/* Visibility. */ ++enum dwarf_visibility_attribute ++ { ++ DW_VIS_local = 1, ++ DW_VIS_exported = 2, ++ DW_VIS_qualified = 3 ++ }; ++ ++/* Virtuality. */ ++enum dwarf_virtuality_attribute ++ { ++ DW_VIRTUALITY_none = 0, ++ DW_VIRTUALITY_virtual = 1, ++ DW_VIRTUALITY_pure_virtual = 2 ++ }; ++ ++/* Case sensitivity. */ ++enum dwarf_id_case ++ { ++ DW_ID_case_sensitive = 0, ++ DW_ID_up_case = 1, ++ DW_ID_down_case = 2, ++ DW_ID_case_insensitive = 3 ++ }; ++ ++/* Calling convention. */ ++enum dwarf_calling_convention ++ { ++ DW_CC_normal = 0x1, ++ DW_CC_program = 0x2, ++ DW_CC_nocall = 0x3 ++ }; ++ ++#define DW_CC_lo_user 0x40 ++#define DW_CC_hi_user 0xff ++ ++/* Inline attribute. */ ++enum dwarf_inline_attribute ++ { ++ DW_INL_not_inlined = 0, ++ DW_INL_inlined = 1, ++ DW_INL_declared_not_inlined = 2, ++ DW_INL_declared_inlined = 3 ++ }; ++ ++/* Discriminant lists. */ ++enum dwarf_discrim_list ++ { ++ DW_DSC_label = 0, ++ DW_DSC_range = 1 ++ }; ++ ++/* Line number opcodes. */ ++enum dwarf_line_number_ops ++ { ++ DW_LNS_extended_op = 0, ++ DW_LNS_copy = 1, ++ DW_LNS_advance_pc = 2, ++ DW_LNS_advance_line = 3, ++ DW_LNS_set_file = 4, ++ DW_LNS_set_column = 5, ++ DW_LNS_negate_stmt = 6, ++ DW_LNS_set_basic_block = 7, ++ DW_LNS_const_add_pc = 8, ++ DW_LNS_fixed_advance_pc = 9, ++ /* DWARF 3. */ ++ DW_LNS_set_prologue_end = 10, ++ DW_LNS_set_epilogue_begin = 11, ++ DW_LNS_set_isa = 12 ++ }; ++ ++/* Line number extended opcodes. */ ++enum dwarf_line_number_x_ops ++ { ++ DW_LNE_end_sequence = 1, ++ DW_LNE_set_address = 2, ++ DW_LNE_define_file = 3, ++ /* HP extensions. 
*/ ++ DW_LNE_HP_negate_is_UV_update = 0x11, ++ DW_LNE_HP_push_context = 0x12, ++ DW_LNE_HP_pop_context = 0x13, ++ DW_LNE_HP_set_file_line_column = 0x14, ++ DW_LNE_HP_set_routine_name = 0x15, ++ DW_LNE_HP_set_sequence = 0x16, ++ DW_LNE_HP_negate_post_semantics = 0x17, ++ DW_LNE_HP_negate_function_exit = 0x18, ++ DW_LNE_HP_negate_front_end_logical = 0x19, ++ DW_LNE_HP_define_proc = 0x20 ++ }; ++ ++/* Call frame information. */ ++enum dwarf_call_frame_info ++ { ++ DW_CFA_advance_loc = 0x40, ++ DW_CFA_offset = 0x80, ++ DW_CFA_restore = 0xc0, ++ DW_CFA_nop = 0x00, ++ DW_CFA_set_loc = 0x01, ++ DW_CFA_advance_loc1 = 0x02, ++ DW_CFA_advance_loc2 = 0x03, ++ DW_CFA_advance_loc4 = 0x04, ++ DW_CFA_offset_extended = 0x05, ++ DW_CFA_restore_extended = 0x06, ++ DW_CFA_undefined = 0x07, ++ DW_CFA_same_value = 0x08, ++ DW_CFA_register = 0x09, ++ DW_CFA_remember_state = 0x0a, ++ DW_CFA_restore_state = 0x0b, ++ DW_CFA_def_cfa = 0x0c, ++ DW_CFA_def_cfa_register = 0x0d, ++ DW_CFA_def_cfa_offset = 0x0e, ++ /* DWARF 3. */ ++ DW_CFA_def_cfa_expression = 0x0f, ++ DW_CFA_expression = 0x10, ++ DW_CFA_offset_extended_sf = 0x11, ++ DW_CFA_def_cfa_sf = 0x12, ++ DW_CFA_def_cfa_offset_sf = 0x13, ++ /* SGI/MIPS specific. */ ++ DW_CFA_MIPS_advance_loc8 = 0x1d, ++ /* GNU extensions. */ ++ DW_CFA_GNU_window_save = 0x2d, ++ DW_CFA_GNU_args_size = 0x2e, ++ DW_CFA_GNU_negative_offset_extended = 0x2f ++ }; ++ ++#define DW_CIE_ID 0xffffffff ++#define DW_CIE_VERSION 1 ++ ++#define DW_CFA_extended 0 ++#define DW_CFA_lo_user 0x1c ++#define DW_CFA_hi_user 0x3f ++ ++#define DW_CHILDREN_no 0x00 ++#define DW_CHILDREN_yes 0x01 ++ ++#define DW_ADDR_none 0 ++ ++/* Source language names and codes. */ ++enum dwarf_source_language ++ { ++ DW_LANG_C89 = 0x0001, ++ DW_LANG_C = 0x0002, ++ DW_LANG_Ada83 = 0x0003, ++ DW_LANG_C_plus_plus = 0x0004, ++ DW_LANG_Cobol74 = 0x0005, ++ DW_LANG_Cobol85 = 0x0006, ++ DW_LANG_Fortran77 = 0x0007, ++ DW_LANG_Fortran90 = 0x0008, ++ DW_LANG_Pascal83 = 0x0009, ++ DW_LANG_Modula2 = 0x000a, ++ DW_LANG_Java = 0x000b, ++ /* DWARF 3. */ ++ DW_LANG_C99 = 0x000c, ++ DW_LANG_Ada95 = 0x000d, ++ DW_LANG_Fortran95 = 0x000e, ++ /* MIPS. */ ++ DW_LANG_Mips_Assembler = 0x8001, ++ /* UPC. */ ++ DW_LANG_Upc = 0x8765 ++ }; ++ ++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */ ++#define DW_LANG_hi_user 0xffff /* Implementation-defined range start. */ ++ ++/* Names and codes for macro information. */ ++enum dwarf_macinfo_record_type ++ { ++ DW_MACINFO_define = 1, ++ DW_MACINFO_undef = 2, ++ DW_MACINFO_start_file = 3, ++ DW_MACINFO_end_file = 4, ++ DW_MACINFO_vendor_ext = 255 ++ }; ++ ++/* @@@ For use with GNU frame unwind information. 
*/ ++ ++#define DW_EH_PE_absptr 0x00 ++#define DW_EH_PE_omit 0xff ++ ++#define DW_EH_PE_uleb128 0x01 ++#define DW_EH_PE_udata2 0x02 ++#define DW_EH_PE_udata4 0x03 ++#define DW_EH_PE_udata8 0x04 ++#define DW_EH_PE_sleb128 0x09 ++#define DW_EH_PE_sdata2 0x0A ++#define DW_EH_PE_sdata4 0x0B ++#define DW_EH_PE_sdata8 0x0C ++#define DW_EH_PE_signed 0x08 ++ ++#define DW_EH_PE_pcrel 0x10 ++#define DW_EH_PE_textrel 0x20 ++#define DW_EH_PE_datarel 0x30 ++#define DW_EH_PE_funcrel 0x40 ++#define DW_EH_PE_aligned 0x50 ++ ++#define DW_EH_PE_indirect 0x80 ++ ++#endif /* _ELF_DWARF2_H */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/kgdb.h linux-2.6.18-53.1.14.kgdb/include/linux/kgdb.h +--- linux-2.6.18-53.1.14/include/linux/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/linux/kgdb.h 2008-06-10 15:39:21.000000000 +0400 +@@ -0,0 +1,279 @@ ++/* ++ * include/linux/kgdb.h ++ * ++ * This provides the hooks and functions that KGDB needs to share between ++ * the core, I/O and arch-specific portions. ++ * ++ * Author: Amit Kale and ++ * Tom Rini ++ * ++ * 2001-2004 (c) Amit S. Kale and 2003-2005 (c) MontaVista Software, Inc. ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++#ifdef __KERNEL__ ++#ifndef _KGDB_H_ ++#define _KGDB_H_ ++ ++#include ++ ++#ifdef CONFIG_KGDB ++#include ++#include ++#include ++#include ++ ++#ifndef CHECK_EXCEPTION_STACK ++#define CHECK_EXCEPTION_STACK() 1 ++#endif ++ ++struct tasklet_struct; ++struct pt_regs; ++struct task_struct; ++struct uart_port; ++ ++#ifdef CONFIG_KGDB_CONSOLE ++extern struct console kgdbcons; ++#endif ++ ++/* To enter the debugger explicitly. */ ++extern void breakpoint(void); ++extern int kgdb_connected; ++extern int kgdb_may_fault; ++extern struct tasklet_struct kgdb_tasklet_breakpoint; ++ ++extern atomic_t kgdb_setting_breakpoint; ++extern atomic_t cpu_doing_single_step; ++extern atomic_t kgdb_sync_softlockup[NR_CPUS]; ++ ++extern struct task_struct *kgdb_usethread, *kgdb_contthread; ++ ++enum kgdb_bptype { ++ bp_breakpoint = '0', ++ bp_hardware_breakpoint, ++ bp_write_watchpoint, ++ bp_read_watchpoint, ++ bp_access_watchpoint ++}; ++ ++enum kgdb_bpstate { ++ bp_none = 0, ++ bp_removed, ++ bp_set, ++ bp_active ++}; ++ ++struct kgdb_bkpt { ++ unsigned long bpt_addr; ++ unsigned char saved_instr[BREAK_INSTR_SIZE]; ++ enum kgdb_bptype type; ++ enum kgdb_bpstate state; ++}; ++ ++/* The maximum number of KGDB I/O modules that can be loaded */ ++#define MAX_KGDB_IO_HANDLERS 3 ++ ++#ifndef MAX_BREAKPOINTS ++#define MAX_BREAKPOINTS 1000 ++#endif ++ ++#define KGDB_HW_BREAKPOINT 1 ++ ++/* Required functions. */ ++/** ++ * regs_to_gdb_regs - Convert ptrace regs to GDB regs ++ * @gdb_regs: A pointer to hold the registers in the order GDB wants. ++ * @regs: The &struct pt_regs of the current process. ++ * ++ * Convert the pt_regs in @regs into the format for registers that ++ * GDB expects, stored in @gdb_regs. ++ */ ++extern void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs); ++ ++/** ++ * sleeping_regs_to_gdb_regs - Convert ptrace regs to GDB regs ++ * @gdb_regs: A pointer to hold the registers in the order GDB wants. ++ * @p: The &struct task_struct of the desired process. ++ * ++ * Convert the register values of the sleeping process in @p to ++ * the format that GDB expects. 
++ * This function is called when kgdb does not have access to the
++ * &struct pt_regs and therefore it should fill the gdb registers
++ * @gdb_regs with what has been saved in &struct thread_struct
++ * thread field during switch_to.
++ */
++extern void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
++ struct task_struct *p);
++
++/**
++ * gdb_regs_to_regs - Convert GDB regs to ptrace regs.
++ * @gdb_regs: A pointer to hold the registers we've received from GDB.
++ * @regs: A pointer to a &struct pt_regs to hold these values in.
++ *
++ * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
++ * in @regs.
++ */
++extern void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs);
++
++/**
++ * kgdb_arch_handle_exception - Handle architecture specific GDB packets.
++ * @vector: The error vector of the exception that happened.
++ * @signo: The signal number of the exception that happened.
++ * @err_code: The error code of the exception that happened.
++ * @remcom_in_buffer: The buffer of the packet we have read.
++ * @remcom_out_buffer: The buffer, of size %BUFMAX, to write a packet into.
++ * @regs: The &struct pt_regs of the current process.
++ *
++ * This function MUST handle the 'c' and 's' command packets,
++ * as well as packets to set / remove a hardware breakpoint, if used.
++ * If there are additional packets which the hardware needs to handle,
++ * they are handled here. The code should return -1 if it wants to
++ * process more packets, and a %0 or %1 if it wants to exit from the
++ * kgdb hook.
++ */
++extern int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++ char *remcom_in_buffer,
++ char *remcom_out_buffer,
++ struct pt_regs *regs);
++
++#ifndef JMP_REGS_ALIGNMENT
++#define JMP_REGS_ALIGNMENT
++#endif
++
++extern unsigned long kgdb_fault_jmp_regs[];
++
++/**
++ * kgdb_fault_setjmp - Store state in case we fault.
++ * @curr_context: An array to store state into.
++ *
++ * Certain functions may try to access memory, and in doing so may
++ * cause a fault. When this happens, we trap it, restore state to
++ * this call, and let ourselves know that something bad has happened.
++ */
++extern asmlinkage int kgdb_fault_setjmp(unsigned long *curr_context);
++
++/**
++ * kgdb_fault_longjmp - Restore state when we have faulted.
++ * @curr_context: The previously stored state.
++ *
++ * When something bad does happen, this function is called to
++ * restore the known good state, and set the return value to 1, so
++ * we know something bad happened.
++ */
++extern asmlinkage void kgdb_fault_longjmp(unsigned long *curr_context);
++
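To make the required hooks above concrete, here is an illustrative, i386-flavored regs_to_gdb_regs() written against the _EAX.._GS ordinals from the asm-i386/kgdb.h hunk earlier in this patch. It is a sketch under 2.6.18-era pt_regs field names, not the patch's actual arch code, which additionally fixes up ESP/SS for kernel-mode traps and reads the live FS/GS selectors:

/* Sketch only: map an i386 pt_regs into the GDB register ordering. */
void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
{
        gdb_regs[_EAX] = regs->eax;
        gdb_regs[_ECX] = regs->ecx;
        gdb_regs[_EDX] = regs->edx;
        gdb_regs[_EBX] = regs->ebx;
        gdb_regs[_ESP] = (unsigned long)&regs->esp; /* stack at trap time */
        gdb_regs[_EBP] = regs->ebp;
        gdb_regs[_ESI] = regs->esi;
        gdb_regs[_EDI] = regs->edi;
        gdb_regs[_PC]  = regs->eip;
        gdb_regs[_PS]  = regs->eflags;
        gdb_regs[_CS]  = regs->xcs;
        gdb_regs[_SS]  = regs->xss;  /* valid for user-mode traps only */
        gdb_regs[_DS]  = regs->xds;
        gdb_regs[_ES]  = regs->xes;
        gdb_regs[_FS]  = 0;          /* real code: savesegment(fs, ...) */
        gdb_regs[_GS]  = 0;          /* real code: savesegment(gs, ...) */
}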
*/ ++extern int kgdb_arch_init(void); ++extern void kgdb_disable_hw_debug(struct pt_regs *regs); ++extern void kgdb_post_master_code(struct pt_regs *regs, int e_vector, ++ int err_code); ++extern void kgdb_roundup_cpus(unsigned long flags); ++extern int kgdb_set_hw_break(unsigned long addr); ++extern int kgdb_remove_hw_break(unsigned long addr); ++extern void kgdb_remove_all_hw_break(void); ++extern void kgdb_correct_hw_break(void); ++extern void kgdb_shadowinfo(struct pt_regs *regs, char *buffer, ++ unsigned threadid); ++extern struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs, ++ int threadid); ++extern struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid); ++extern int kgdb_validate_break_address(unsigned long addr); ++extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr); ++extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle); ++ ++/** ++ * struct kgdb_arch - Desribe architecture specific values. ++ * @gdb_bpt_instr: The instruction to trigger a breakpoint. ++ * @flags: Flags for the breakpoint, currently just %KGDB_HW_BREAKPOINT. ++ * @shadowth: A value of %1 indicates we shadow information on processes. ++ * @set_breakpoint: Allow an architecture to specify how to set a software ++ * breakpoint. ++ * @remove_breakpoint: Allow an architecture to specify how to remove a ++ * software breakpoint. ++ * @set_hw_breakpoint: Allow an architecture to specify how to set a hardware ++ * breakpoint. ++ * @remove_hw_breakpoint: Allow an architecture to specify how to remove a ++ * hardware breakpoint. ++ * ++ * The @shadowth flag is an option to shadow information not retrievable by ++ * gdb otherwise. This is deprecated in favor of a binutils which supports ++ * CFI macros. ++ */ ++struct kgdb_arch { ++ unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE]; ++ unsigned long flags; ++ unsigned shadowth; ++ int (*set_breakpoint) (unsigned long, char *); ++ int (*remove_breakpoint)(unsigned long, char *); ++ int (*set_hw_breakpoint)(unsigned long, int, enum kgdb_bptype); ++ int (*remove_hw_breakpoint)(unsigned long, int, enum kgdb_bptype); ++}; ++ ++/* Thread reference */ ++typedef unsigned char threadref[8]; ++ ++/** ++ * struct kgdb_io - Desribe the interface for an I/O driver to talk with KGDB. ++ * @read_char: Pointer to a function that will return one char. ++ * @write_char: Pointer to a function that will write one char. ++ * @flush: Pointer to a function that will flush any pending writes. ++ * @init: Pointer to a function that will initialize the device. ++ * @late_init: Pointer to a function that will do any setup that has ++ * other dependencies. ++ * @pre_exception: Pointer to a function that will do any prep work for ++ * the I/O driver. ++ * @post_exception: Pointer to a function that will do any cleanup work ++ * for the I/O driver. ++ * ++ * The @init and @late_init function pointers allow for an I/O driver ++ * such as a serial driver to fully initialize the port with @init and ++ * be called very early, yet safely call request_irq() later in the boot ++ * sequence. ++ * ++ * @init is allowed to return a non-0 return value to indicate failure. ++ * If this is called early on, then KGDB will try again when it would call ++ * @late_init. If it has failed later in boot as well, the user will be ++ * notified. 
++ */ ++struct kgdb_io { ++ int (*read_char) (void); ++ void (*write_char) (u8); ++ void (*flush) (void); ++ int (*init) (void); ++ void (*late_init) (void); ++ void (*pre_exception) (void); ++ void (*post_exception) (void); ++}; ++ ++extern struct kgdb_io kgdb_io_ops; ++extern struct kgdb_arch arch_kgdb_ops; ++extern int kgdb_initialized; ++ ++extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops); ++extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops); ++ ++extern void __init kgdb8250_add_port(int i, struct uart_port *serial_req); ++extern void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *serial_req); ++ ++extern int kgdb_hex2long(char **ptr, long *long_val); ++extern char *kgdb_mem2hex(char *mem, char *buf, int count); ++extern char *kgdb_hex2mem(char *buf, char *mem, int count); ++extern int kgdb_get_mem(char *addr, unsigned char *buf, int count); ++extern int kgdb_set_mem(char *addr, unsigned char *buf, int count); ++ ++int kgdb_isremovedbreak(unsigned long addr); ++int kgdb_skipexception(int exception, struct pt_regs *regs); ++ ++extern int kgdb_handle_exception(int ex_vector, int signo, int err_code, ++ struct pt_regs *regs); ++extern void kgdb_nmihook(int cpu, void *regs); ++extern int debugger_step; ++extern atomic_t debugger_active; ++extern struct kgdb_arch *kgdb_ops; ++#else ++/* Stubs for when KGDB is not set. */ ++static const atomic_t debugger_active = ATOMIC_INIT(0); ++#endif /* CONFIG_KGDB */ ++#endif /* _KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/module.h linux-2.6.18-53.1.14.kgdb/include/linux/module.h +--- linux-2.6.18-53.1.14/include/linux/module.h 2008-03-06 05:54:41.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/include/linux/module.h 2008-06-10 15:39:15.000000000 +0400 +@@ -229,8 +229,17 @@ enum module_state + MODULE_STATE_LIVE, + MODULE_STATE_COMING, + MODULE_STATE_GOING, ++ MODULE_STATE_GONE, + }; + ++#ifdef CONFIG_KGDB ++#define MAX_SECTNAME 31 ++struct mod_section { ++ void *address; ++ char name[MAX_SECTNAME + 1]; ++}; ++#endif ++ + /* Similar stuff for section attributes. */ + #define MODULE_SECT_NAME_LEN 32 + struct module_sect_attr +@@ -258,6 +267,13 @@ struct module + /* Unique handle for this module */ + char name[MODULE_NAME_LEN]; + ++#ifdef CONFIG_KGDB ++ /* keep kgdb info at the begining so that gdb doesn't have a chance to ++ * miss out any fields */ ++ unsigned long num_sections; ++ struct mod_section *mod_sections; ++#endif ++ + /* Sysfs stuff. 
*/ + struct module_kobject mkobj; + struct module_param_attrs *param_attrs; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/netpoll.h linux-2.6.18-53.1.14.kgdb/include/linux/netpoll.h +--- linux-2.6.18-53.1.14/include/linux/netpoll.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/linux/netpoll.h 2008-06-10 15:37:49.000000000 +0400 +@@ -17,7 +17,7 @@ struct netpoll; + struct netpoll { + struct net_device *dev; + char dev_name[16], *name; +- void (*rx_hook)(struct netpoll *, int, char *, int); ++ void (*rx_hook)(struct netpoll *, int, char *, int, struct sk_buff *); + void (*drop)(struct sk_buff *skb); + u32 local_ip, remote_ip; + u16 local_port, remote_port; +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/include/linux/serial_8250.h linux-2.6.18-53.1.14.kgdb/include/linux/serial_8250.h +--- linux-2.6.18-53.1.14/include/linux/serial_8250.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18-53.1.14.kgdb/include/linux/serial_8250.h 2008-06-10 15:37:43.000000000 +0400 +@@ -56,6 +56,7 @@ struct uart_port; + + int serial8250_register_port(struct uart_port *); + void serial8250_unregister_port(int line); ++void serial8250_unregister_by_port(struct uart_port *port); + void serial8250_suspend_port(int line); + void serial8250_resume_port(int line); + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/Makefile linux-2.6.18-53.1.14.kgdb/kernel/Makefile +--- linux-2.6.18-53.1.14/kernel/Makefile 2008-03-06 05:54:50.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/kernel/Makefile 2008-06-10 15:37:25.000000000 +0400 +@@ -42,6 +42,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machi + obj-$(CONFIG_AUDIT) += audit.o auditfilter.o + obj-$(CONFIG_AUDITSYSCALL) += auditsc.o audit_tree.o + obj-$(CONFIG_KPROBES) += kprobes.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdbarchlib.o + obj-$(CONFIG_SYSFS) += ksysfs.o + obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o + obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/kgdb.c linux-2.6.18-53.1.14.kgdb/kernel/kgdb.c +--- linux-2.6.18-53.1.14/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/kernel/kgdb.c 2008-06-10 15:39:21.000000000 +0400 +@@ -0,0 +1,1778 @@ ++/* ++ * kernel/kgdb.c ++ * ++ * Maintainer: Tom Rini ++ * ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ * Copyright (C) 2002-2004 Timesys Corporation ++ * Copyright (C) 2003-2004 Amit S. Kale ++ * Copyright (C) 2004 Pavel Machek ++ * Copyright (C) 2004-2005 Tom Rini ++ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. ++ * Copyright (C) 2005 Wind River Systems, Inc. ++ * ++ * Contributors at various stages not listed above: ++ * Jason Wessel ( jason.wessel@windriver.com ) ++ * George Anzinger ++ * Anurekh Saxena (anurekh.saxena@timesys.com) ++ * Lake Stevens Instrument Division (Glenn Engel) ++ * Jim Kingdon, Cygnus Support. ++ * ++ * Original KGDB stub: David Grothe , ++ * Tigran Aivazian ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. 
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++
++extern int pid_max;
++/* How many times to count all of the waiting CPUs */
++#define ROUNDUP_WAIT 640000 /* Arbitrary, increase if needed. */
++#define BUF_THREAD_ID_SIZE 16
++
++/*
++ * kgdb_initialized with a value of 1 indicates that kgdb is setup and is
++ * all ready to serve breakpoints and other kernel exceptions. A value of
++ * -1 indicates that we have tried to initialize early, and need to try
++ * again later.
++ */
++int kgdb_initialized;
++/* Is a host GDB connected to us? */
++int kgdb_connected;
++/* Could we be about to try and access a bad memory location? If so we
++ * also need to flag that this has happened. */
++int kgdb_may_fault;
++/* All the KGDB handlers are installed */
++int kgdb_from_module_registered = 0;
++
++/* We provide a kgdb_io_ops structure that may be overridden. */
++struct kgdb_io __attribute__ ((weak)) kgdb_io_ops;
++
++static struct kgdb_io kgdb_io_ops_prev[MAX_KGDB_IO_HANDLERS];
++static int kgdb_io_handler_cnt = 0;
++
++/* Export the following symbols for use with kernel modules */
++EXPORT_SYMBOL(kgdb_io_ops);
++EXPORT_SYMBOL(kgdb_tasklet_breakpoint);
++EXPORT_SYMBOL(kgdb_connected);
++EXPORT_SYMBOL(kgdb_register_io_module);
++EXPORT_SYMBOL(kgdb_unregister_io_module);
++EXPORT_SYMBOL(debugger_active);
++
++/*
++ * Holds information about breakpoints in a kernel. These breakpoints are
++ * added and removed by gdb.
++ */
++struct kgdb_bkpt kgdb_break[MAX_BREAKPOINTS];
++
++static const char hexchars[] = "0123456789abcdef";
++
++static spinlock_t slavecpulocks[NR_CPUS];
++static atomic_t procindebug[NR_CPUS];
++atomic_t kgdb_setting_breakpoint;
++EXPORT_SYMBOL(kgdb_setting_breakpoint);
++struct task_struct *kgdb_usethread, *kgdb_contthread;
++
++int debugger_step;
++atomic_t debugger_active;
++
++/* Our I/O buffers. */
++static char remcom_in_buffer[BUFMAX];
++static char remcom_out_buffer[BUFMAX];
++/* Storage for the registers, in GDB format. */
++static unsigned long gdb_regs[(NUMREGBYTES + sizeof(unsigned long) - 1) /
++ sizeof(unsigned long)];
++/* Storage of registers for handling a fault. */
++unsigned long kgdb_fault_jmp_regs[NUMCRITREGBYTES / sizeof(unsigned long)]
++ JMP_REGS_ALIGNMENT;
++static int kgdb_notify_reboot(struct notifier_block *this,
++ unsigned long code, void *x);
++struct debuggerinfo_struct {
++ void *debuggerinfo;
++ struct task_struct *task;
++} kgdb_info[NR_CPUS];
++
++/* To keep track of the CPU which is doing the single stepping */
++atomic_t cpu_doing_single_step = ATOMIC_INIT(-1);
++
++atomic_t kgdb_sync_softlockup[NR_CPUS] = {ATOMIC_INIT(0)};
++
++/* reboot notifier block */
++static struct notifier_block kgdb_reboot_notifier = {
++ .notifier_call = kgdb_notify_reboot,
++ .next = NULL,
++ .priority = INT_MAX,
++};
++
++static int hex(char ch)
++{
++ if ((ch >= 'a') && (ch <= 'f'))
++ return (ch - 'a' + 10);
++ if ((ch >= '0') && (ch <= '9'))
++ return (ch - '0');
++ if ((ch >= 'A') && (ch <= 'F'))
++ return (ch - 'A' + 10);
++ return (-1);
++}
++
++/* scan for the sequence $<data>#<checksum> */
++static void get_packet(char *buffer)
++{
++ unsigned char checksum;
++ unsigned char xmitcsum;
++ int count;
++ char ch;
++ if (!kgdb_io_ops.read_char)
++ return;
++ do {
++ /* Spin and wait around for the start character, ignore all
++ * other characters */
++ while ((ch = (kgdb_io_ops.read_char())) != '$') ;
++ kgdb_connected = 1;
++ checksum = 0;
++ xmitcsum = -1;
++
++ count = 0;
++
++ /* now, read until a # or end of buffer is found */
++ while (count < (BUFMAX - 1)) {
++ ch = kgdb_io_ops.read_char();
++ if (ch == '#')
++ break;
++ checksum = checksum + ch;
++ buffer[count] = ch;
++ count = count + 1;
++ }
++ buffer[count] = 0;
++
++ if (ch == '#') {
++ xmitcsum = hex(kgdb_io_ops.read_char()) << 4;
++ xmitcsum += hex(kgdb_io_ops.read_char());
++
++ if (checksum != xmitcsum)
++ /* failed checksum */
++ kgdb_io_ops.write_char('-');
++ else
++ /* successful transfer */
++ kgdb_io_ops.write_char('+');
++ if (kgdb_io_ops.flush)
++ kgdb_io_ops.flush();
++ }
++ } while (checksum != xmitcsum);
++}
++
++/*
++ * Send the packet in buffer.
++ * Check for gdb connection if asked for.
++ */
++static void put_packet(char *buffer)
++{
++ unsigned char checksum;
++ int count;
++ char ch;
++
++ if (!kgdb_io_ops.write_char)
++ return;
++ /* $<packet info>#<checksum>. */
++ while (1) {
++ kgdb_io_ops.write_char('$');
++ checksum = 0;
++ count = 0;
++
++ while ((ch = buffer[count])) {
++ kgdb_io_ops.write_char(ch);
++ checksum += ch;
++ count++;
++ }
++
++ kgdb_io_ops.write_char('#');
++ kgdb_io_ops.write_char(hexchars[checksum >> 4]);
++ kgdb_io_ops.write_char(hexchars[checksum % 16]);
++ if (kgdb_io_ops.flush)
++ kgdb_io_ops.flush();
++
++ /* Now see what we get in reply. */
++ ch = kgdb_io_ops.read_char();
++
++ if (ch == 3)
++ ch = kgdb_io_ops.read_char();
++
++ /* If we get an ACK, we are done. */
++ if (ch == '+')
++ return;
++
++ /* If we get the start of another packet, this means
++ * that GDB is attempting to reconnect. We will NAK
++ * the packet being sent, and stop trying to send this
++ * packet. */
++ if (ch == '$') {
++ kgdb_io_ops.write_char('-');
++ if (kgdb_io_ops.flush)
++ kgdb_io_ops.flush();
++ return;
++ }
++ }
++}
++
++/*
++ * convert the memory pointed to by mem into hex, placing result in buf
++ * return a pointer to the last char put in buf (null). May return an error.
++ */ ++char *kgdb_mem2hex(char *mem, char *buf, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return ERR_PTR(-EINVAL); ++ } ++ /* Accessing some registers in a single load instruction is ++ * required to avoid bad side effects for some I/O registers. ++ */ ++ if ((count == 2) && (((long)mem & 1) == 0)) { ++ unsigned short tmp_s = *(unsigned short *)mem; ++ mem += 2; ++#ifdef __BIG_ENDIAN ++ *buf++ = hexchars[(tmp_s >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_s & 0xf]; ++#else ++ *buf++ = hexchars[(tmp_s >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_s & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 8) & 0xf]; ++#endif ++ } else if ((count == 4) && (((long)mem & 3) == 0)) { ++ unsigned long tmp_l = *(unsigned int *)mem; ++ mem += 4; ++#ifdef __BIG_ENDIAN ++ *buf++ = hexchars[(tmp_l >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 24) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_l & 0xf]; ++#else ++ *buf++ = hexchars[(tmp_l >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_l & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 24) & 0xf]; ++#endif ++#ifdef CONFIG_64BIT ++ } else if ((count == 8) && (((long)mem & 7) == 0)) { ++ unsigned long long tmp_ll = *(unsigned long long *)mem; ++ mem += 8; ++#ifdef __BIG_ENDIAN ++ *buf++ = hexchars[(tmp_ll >> 60) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 56) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 52) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 48) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 44) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 40) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 36) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 32) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 24) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_ll & 0xf]; ++#else ++ *buf++ = hexchars[(tmp_ll >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_ll & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 24) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 36) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 32) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 44) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 40) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 52) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 48) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 60) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 56) & 0xf]; ++#endif ++#endif ++ } else { ++ while (count-- > 0) { ++ unsigned char ch = *mem++; ++ *buf++ = hexchars[ch >> 4]; ++ *buf++ = hexchars[ch & 0xf]; ++ } ++ } ++ kgdb_may_fault = 0; ++ *buf = 0; ++ return (buf); ++} ++ ++/* ++ * Copy the binary array pointed to by buf into mem. Fix $, #, and ++ * 0x7d escaped with 0x7d. 
Return a pointer to the character after ++ * the last byte written. ++ */ ++static char *kgdb_ebin2mem(char *buf, char *mem, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return ERR_PTR(-EINVAL); ++ } ++ for (; count > 0; count--, buf++) { ++ if (*buf == 0x7d) ++ *mem++ = *(++buf) ^ 0x20; ++ else ++ *mem++ = *buf; ++ } ++ kgdb_may_fault = 0; ++ return mem; ++} ++ ++/* ++ * convert the hex array pointed to by buf into binary to be placed in mem ++ * return a pointer to the character AFTER the last byte written ++ * May return an error. ++ */ ++char *kgdb_hex2mem(char *buf, char *mem, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return ERR_PTR(-EINVAL); ++ } ++ if ((count == 2) && (((long)mem & 1) == 0)) { ++ unsigned short tmp_s = 0; ++#ifdef __BIG_ENDIAN ++ tmp_s |= hex(*buf++) << 12; ++ tmp_s |= hex(*buf++) << 8; ++ tmp_s |= hex(*buf++) << 4; ++ tmp_s |= hex(*buf++); ++#else ++ tmp_s |= hex(*buf++) << 4; ++ tmp_s |= hex(*buf++); ++ tmp_s |= hex(*buf++) << 12; ++ tmp_s |= hex(*buf++) << 8; ++#endif ++ *(unsigned short *)mem = tmp_s; ++ mem += 2; ++ } else if ((count == 4) && (((long)mem & 3) == 0)) { ++ unsigned long tmp_l = 0; ++#ifdef __BIG_ENDIAN ++ tmp_l |= hex(*buf++) << 28; ++ tmp_l |= hex(*buf++) << 24; ++ tmp_l |= hex(*buf++) << 20; ++ tmp_l |= hex(*buf++) << 16; ++ tmp_l |= hex(*buf++) << 12; ++ tmp_l |= hex(*buf++) << 8; ++ tmp_l |= hex(*buf++) << 4; ++ tmp_l |= hex(*buf++); ++#else ++ tmp_l |= hex(*buf++) << 4; ++ tmp_l |= hex(*buf++); ++ tmp_l |= hex(*buf++) << 12; ++ tmp_l |= hex(*buf++) << 8; ++ tmp_l |= hex(*buf++) << 20; ++ tmp_l |= hex(*buf++) << 16; ++ tmp_l |= hex(*buf++) << 28; ++ tmp_l |= hex(*buf++) << 24; ++#endif ++ *(unsigned long *)mem = tmp_l; ++ mem += 4; ++ } else { ++ int i; ++ for (i = 0; i < count; i++) { ++ unsigned char ch = hex(*buf++) << 4; ++ ch |= hex(*buf++); ++ *mem++ = ch; ++ } ++ } ++ kgdb_may_fault = 0; ++ return (mem); ++} ++ ++/* ++ * While we find nice hex chars, build a long_val. ++ * Return number of chars processed. ++ */ ++int kgdb_hex2long(char **ptr, long *long_val) ++{ ++ int hex_val, num = 0; ++ ++ *long_val = 0; ++ ++ while (**ptr) { ++ hex_val = hex(**ptr); ++ if (hex_val >= 0) { ++ *long_val = (*long_val << 4) | hex_val; ++ num++; ++ } else ++ break; ++ ++ (*ptr)++; ++ } ++ ++ return (num); ++} ++ ++/* Write memory due to an 'M' or 'X' packet. 
*/ ++static char *write_mem_msg(int binary) ++{ ++ char *ptr = &remcom_in_buffer[1]; ++ unsigned long addr, length; ++ ++ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && ++ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { ++ if (binary) ++ ptr = kgdb_ebin2mem(ptr, (char *)addr, length); ++ else ++ ptr = kgdb_hex2mem(ptr, (char *)addr, length); ++ if (CACHE_FLUSH_IS_SAFE) ++ flush_icache_range(addr, addr + length + 1); ++ if (IS_ERR(ptr)) ++ return ptr; ++ return NULL; ++ } ++ ++ return ERR_PTR(-EINVAL); ++} ++ ++static inline char *pack_hex_byte(char *pkt, int byte) ++{ ++ *pkt++ = hexchars[(byte >> 4) & 0xf]; ++ *pkt++ = hexchars[(byte & 0xf)]; ++ return pkt; ++} ++ ++static inline void error_packet(char *pkt, int error) ++{ ++ error = -error; ++ pkt[0] = 'E'; ++ pkt[1] = hexchars[(error / 10)]; ++ pkt[2] = hexchars[(error % 10)]; ++ pkt[3] = '\0'; ++} ++ ++static char *pack_threadid(char *pkt, threadref * id) ++{ ++ char *limit; ++ unsigned char *altid; ++ ++ altid = (unsigned char *)id; ++ limit = pkt + BUF_THREAD_ID_SIZE; ++ while (pkt < limit) ++ pkt = pack_hex_byte(pkt, *altid++); ++ ++ return pkt; ++} ++ ++void int_to_threadref(threadref * id, int value) ++{ ++ unsigned char *scan; ++ int i = 4; ++ ++ scan = (unsigned char *)id; ++ while (i--) ++ *scan++ = 0; ++ *scan++ = (value >> 24) & 0xff; ++ *scan++ = (value >> 16) & 0xff; ++ *scan++ = (value >> 8) & 0xff; ++ *scan++ = (value & 0xff); ++} ++ ++static struct task_struct *getthread(struct pt_regs *regs, int tid) ++{ ++ if (last_pid == 0) ++ return current; ++ ++ if (num_online_cpus() && ++ (tid >= pid_max + num_online_cpus() + kgdb_ops->shadowth)) ++ return NULL; ++ ++ if (kgdb_ops->shadowth && (tid >= pid_max + num_online_cpus())) ++ return kgdb_get_shadow_thread(regs, tid - pid_max - ++ num_online_cpus()); ++ ++ if (tid >= pid_max) ++ return idle_task(tid - pid_max); ++ ++ if (!tid) ++ return NULL; ++ ++ return find_task_by_pid(tid); ++} ++ ++#ifdef CONFIG_SMP ++static void kgdb_wait(struct pt_regs *regs) ++{ ++ unsigned long flags; ++ int processor; ++ ++ local_irq_save(flags); ++ processor = smp_processor_id(); ++ kgdb_info[processor].debuggerinfo = regs; ++ kgdb_info[processor].task = current; ++ atomic_set(&procindebug[processor], 1); ++ atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1); ++ ++ /* Wait till master processor goes completely into the debugger. 
++ * FIXME: this looks racy */ ++ while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) { ++ int i = 10; /* an arbitrary number */ ++ ++ while (--i) ++ cpu_relax(); ++ } ++ ++ /* Wait till master processor is done with debugging */ ++ spin_lock(&slavecpulocks[processor]); ++ ++ /* This has been taken from x86 kgdb implementation and ++ * will be needed by architectures that have SMP support ++ */ ++ kgdb_correct_hw_break(); ++ ++ kgdb_info[processor].debuggerinfo = NULL; ++ kgdb_info[processor].task = NULL; ++ ++ /* Signal the master processor that we are done */ ++ atomic_set(&procindebug[processor], 0); ++ spin_unlock(&slavecpulocks[processor]); ++ local_irq_restore(flags); ++} ++#endif ++ ++int kgdb_get_mem(char *addr, unsigned char *buf, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return -EINVAL; ++ } ++ while (count) { ++ if ((unsigned long)addr < TASK_SIZE) ++ return -EINVAL; ++ *buf++ = *addr++; ++ count--; ++ } ++ kgdb_may_fault = 0; ++ return 0; ++} ++ ++int kgdb_set_mem(char *addr, unsigned char *buf, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return -EINVAL; ++ } ++ while (count) { ++ if ((unsigned long)addr < TASK_SIZE) ++ return -EINVAL; ++ *addr++ = *buf++; ++ count--; ++ } ++ kgdb_may_fault = 0; ++ return 0; ++} ++int kgdb_activate_sw_breakpoints(void) ++{ ++ int i; ++ int error = 0; ++ unsigned long addr; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state != bp_set) ++ continue; ++ addr = kgdb_break[i].bpt_addr; ++ if ((error = kgdb_arch_set_breakpoint(addr, ++ kgdb_break[i].saved_instr))) ++ return error; ++ ++ if (CACHE_FLUSH_IS_SAFE) { ++ if (current->mm && addr < TASK_SIZE) ++ flush_cache_range(current->mm->mmap_cache, ++ addr, addr + BREAK_INSTR_SIZE); ++ else ++ flush_icache_range(addr, addr + ++ BREAK_INSTR_SIZE); ++ } ++ ++ kgdb_break[i].state = bp_active; ++ } ++ return 0; ++} ++ ++static int kgdb_set_sw_break(unsigned long addr) ++{ ++ int i, breakno = -1; ++ int error = 0; ++ if ((error = kgdb_validate_break_address(addr)) < 0) ++ return error; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if ((kgdb_break[i].state == bp_set) && ++ (kgdb_break[i].bpt_addr == addr)) ++ return -EEXIST; ++ } ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state == bp_removed && ++ kgdb_break[i].bpt_addr == addr) { ++ breakno = i; ++ break; ++ } ++ } ++ ++ if (breakno == -1) { ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state == bp_none) { ++ breakno = i; ++ break; ++ } ++ } ++ } ++ if (breakno == -1) ++ return -E2BIG; ++ ++ kgdb_break[breakno].state = bp_set; ++ kgdb_break[breakno].type = bp_breakpoint; ++ kgdb_break[breakno].bpt_addr = addr; ++ ++ return 0; ++} ++ ++int kgdb_deactivate_sw_breakpoints(void) ++{ ++ int i; ++ int error = 0; ++ unsigned long addr; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state != bp_active) ++ continue; ++ addr = kgdb_break[i].bpt_addr; ++ if ((error = kgdb_arch_remove_breakpoint(addr, ++ kgdb_break[i].saved_instr))) ++ return error; ++ ++ if (CACHE_FLUSH_IS_SAFE && current->mm && ++ addr < TASK_SIZE) ++ flush_cache_range(current->mm->mmap_cache, ++ addr, addr + BREAK_INSTR_SIZE); ++ else if (CACHE_FLUSH_IS_SAFE) ++ flush_icache_range(addr, ++ addr + BREAK_INSTR_SIZE); ++ kgdb_break[i].state = bp_set; ++ } ++ return 0; ++} ++ ++static int kgdb_remove_sw_break(unsigned long addr) ++{ ++ int i; ++ ++ for (i = 0; 
i < MAX_BREAKPOINTS; i++) { ++ if ((kgdb_break[i].state == bp_set) && ++ (kgdb_break[i].bpt_addr == addr)) { ++ kgdb_break[i].state = bp_removed; ++ return 0; ++ } ++ } ++ return -ENOENT; ++} ++ ++int kgdb_isremovedbreak(unsigned long addr) ++{ ++ int i; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if ((kgdb_break[i].state == bp_removed) && ++ (kgdb_break[i].bpt_addr == addr)) { ++ return 1; ++ } ++ } ++ return 0; ++} ++ ++int remove_all_break(void) ++{ ++ int i; ++ int error; ++ unsigned long addr; ++ ++ /* Clear memory breakpoints. */ ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state != bp_set) ++ continue; ++ addr = kgdb_break[i].bpt_addr; ++ if ((error = kgdb_arch_remove_breakpoint(addr, ++ kgdb_break[i].saved_instr))) ++ return error; ++ kgdb_break[i].state = bp_removed; ++ } ++ ++ /* Clear hardware breakpoints. */ ++ kgdb_remove_all_hw_break(); ++ ++ return 0; ++} ++ ++static inline int shadow_pid(int realpid) ++{ ++ if (realpid) { ++ return realpid; ++ } ++ return pid_max + smp_processor_id(); ++} ++ ++static char gdbmsgbuf[BUFMAX + 1]; ++static void kgdb_msg_write(const char *s, int len) ++{ ++ int i; ++ int wcount; ++ char *bufptr; ++ ++ /* 'O'utput */ ++ gdbmsgbuf[0] = 'O'; ++ ++ /* Fill and send buffers... */ ++ while (len > 0) { ++ bufptr = gdbmsgbuf + 1; ++ ++ /* Calculate how many this time */ ++ if ((len << 1) > (BUFMAX - 2)) ++ wcount = (BUFMAX - 2) >> 1; ++ else ++ wcount = len; ++ ++ /* Pack in hex chars */ ++ for (i = 0; i < wcount; i++) ++ bufptr = pack_hex_byte(bufptr, s[i]); ++ *bufptr = '\0'; ++ ++ /* Move up */ ++ s += wcount; ++ len -= wcount; ++ ++ /* Write packet */ ++ put_packet(gdbmsgbuf); ++ } ++} ++ ++/* ++ * This function does all command procesing for interfacing to gdb. ++ * ++ * Locking hierarchy: ++ * interface locks, if any (begin_session) ++ * kgdb lock (debugger_active) ++ * ++ * Note that since we can be in here prior to our cpumask being filled ++ * out, we err on the side of caution and loop over NR_CPUS instead ++ * of a for_each_online_cpu. ++ * ++ */ ++int kgdb_handle_exception(int ex_vector, int signo, int err_code, ++ struct pt_regs *linux_regs) ++{ ++ unsigned long length, addr; ++ char *ptr; ++ unsigned long flags; ++ unsigned i; ++ long threadid; ++ threadref thref; ++ struct task_struct *thread = NULL; ++ unsigned procid; ++ int numshadowth = num_online_cpus() + kgdb_ops->shadowth; ++ long kgdb_usethreadid = 0; ++ int error = 0, all_cpus_synced = 0; ++ struct pt_regs *shadowregs; ++ int processor = smp_processor_id(); ++ void *local_debuggerinfo; ++ ++ /* Panic on recursive debugger calls. */ ++ if (atomic_read(&debugger_active) == smp_processor_id() + 1) ++ return 0; ++ ++ acquirelock: ++ ++ /* Call the I/O drivers pre_exception routine if the I/O ++ * driver defined one ++ */ ++ if (kgdb_io_ops.pre_exception) ++ kgdb_io_ops.pre_exception(); ++ ++ /* ++ * Interrupts will be restored by the 'trap return' code, except when ++ * single stepping. ++ */ ++ local_irq_save(flags); ++ ++ /* Hold debugger_active */ ++ procid = smp_processor_id(); ++ ++ while (cmpxchg(&atomic_read(&debugger_active), 0, (procid + 1)) != 0) { ++ int i = 25; /* an arbitrary number */ ++ ++ while (--i) ++ cpu_relax(); ++ ++ if (atomic_read(&cpu_doing_single_step) != -1 && ++ atomic_read(&cpu_doing_single_step) != procid) ++ udelay(1); ++ } ++ ++ atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1); ++ ++ /* ++ * Don't enter if the last instance of the exception handler wanted to ++ * come into the debugger again. 
++ */ ++ if (atomic_read(&cpu_doing_single_step) != -1 && ++ atomic_read(&cpu_doing_single_step) != procid) { ++ atomic_set(&debugger_active, 0); ++ local_irq_restore(flags); ++ goto acquirelock; ++ } ++ ++ /* ++ * Don't enter if we have hit a removed breakpoint. ++ */ ++ if (kgdb_skipexception(ex_vector, linux_regs)) ++ goto kgdb_restore; ++ ++ kgdb_info[processor].debuggerinfo = linux_regs; ++ kgdb_info[processor].task = current; ++ ++ kgdb_disable_hw_debug(linux_regs); ++ ++ if (!debugger_step || !kgdb_contthread) ++ for (i = 0; i < NR_CPUS; i++) ++ spin_lock(&slavecpulocks[i]); ++ ++ /* Make sure we get the other CPUs */ ++ if (!debugger_step || !kgdb_contthread) ++ kgdb_roundup_cpus(flags); ++ ++ /* spin_lock code is good enough as a barrier so we don't ++ * need one here */ ++ atomic_set(&procindebug[processor], 1); ++ ++ /* Wait a reasonable time for the other CPUs to be notified and ++ * be waiting for us. Very early on this could be imperfect ++ * as num_online_cpus() could be 0.*/ ++ for (i = 0; i < ROUNDUP_WAIT; i++) { ++ int cpu, num = 0; ++ for (cpu = 0; cpu < NR_CPUS; cpu++) { ++ if (atomic_read(&procindebug[cpu])) ++ num++; ++ } ++ if (num >= num_online_cpus()) { ++ all_cpus_synced = 1; ++ break; ++ } ++ } ++ ++ /* Clear the out buffer. */ ++ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); ++ ++ /* Master processor is completely in the debugger */ ++ kgdb_post_master_code(linux_regs, ex_vector, err_code); ++ kgdb_deactivate_sw_breakpoints(); ++ debugger_step = 0; ++ kgdb_contthread = NULL; ++ ++ if (kgdb_connected) { ++ /* If we're still unable to roundup all of the CPUs, ++ * send an 'O' packet informing the user again. */ ++ if (!all_cpus_synced) ++ kgdb_msg_write("Not all CPUs have been synced for " ++ "KGDB\n", 39); ++ /* Reply to host that an exception has occurred */ ++ ptr = remcom_out_buffer; ++ *ptr++ = 'T'; ++ *ptr++ = hexchars[(signo >> 4) % 16]; ++ *ptr++ = hexchars[signo % 16]; ++ ptr += strlen(strcpy(ptr, "thread:")); ++ int_to_threadref(&thref, shadow_pid(current->pid)); ++ ptr = pack_threadid(ptr, &thref); ++ *ptr++ = ';'; ++ ++ put_packet(remcom_out_buffer); ++ } ++ ++ kgdb_usethread = kgdb_info[processor].task; ++ kgdb_usethreadid = shadow_pid(kgdb_info[processor].task->pid); ++ ++ while (kgdb_io_ops.read_char) { ++ char *bpt_type; ++ error = 0; ++ ++ /* Clear the out buffer. */ ++ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer)); ++ ++ get_packet(remcom_in_buffer); ++ ++ switch (remcom_in_buffer[0]) { ++ case '?': ++ /* We know that this packet is only sent ++ * during initial connect. So to be safe, ++ * we clear out our breakpoints now incase ++ * GDB is reconnecting. */ ++ remove_all_break(); ++ /* Also, if we haven't been able to roundup all ++ * CPUs, send an 'O' packet informing the user ++ * as much. Only need to do this once. */ ++ if (!all_cpus_synced) ++ kgdb_msg_write("Not all CPUs have been " ++ "synced for KGDB\n", 39); ++ remcom_out_buffer[0] = 'S'; ++ remcom_out_buffer[1] = hexchars[signo >> 4]; ++ remcom_out_buffer[2] = hexchars[signo % 16]; ++ break; ++ ++ case 'g': /* return the value of the CPU registers */ ++ thread = kgdb_usethread; ++ ++ if (!thread) { ++ thread = kgdb_info[processor].task; ++ local_debuggerinfo = ++ kgdb_info[processor].debuggerinfo; ++ } else { ++ local_debuggerinfo = NULL; ++ for (i = 0; i < NR_CPUS; i++) { ++ /* Try to find the task on some other ++ * or possibly this node if we do not ++ * find the matching task then we try ++ * to approximate the results. 
++ */ ++ if (thread == kgdb_info[i].task) ++ local_debuggerinfo = ++ kgdb_info[i].debuggerinfo; ++ } ++ } ++ ++ /* All threads that don't have debuggerinfo should be ++ * in __schedule() sleeping, since all other CPUs ++ * are in kgdb_wait, and thus have debuggerinfo. */ ++ if (kgdb_ops->shadowth && ++ kgdb_usethreadid >= pid_max + num_online_cpus()) { ++ shadowregs = kgdb_shadow_regs(linux_regs, ++ kgdb_usethreadid - ++ pid_max - ++ num_online_cpus ++ ()); ++ if (!shadowregs) { ++ error_packet(remcom_out_buffer, ++ -EINVAL); ++ break; ++ } ++ regs_to_gdb_regs(gdb_regs, shadowregs); ++ } else if (local_debuggerinfo) ++ regs_to_gdb_regs(gdb_regs, local_debuggerinfo); ++ else { ++ /* Pull stuff saved during ++ * switch_to; nothing else is ++ * accessible (or even particularly relevant). ++ * This should be enough for a stack trace. */ ++ sleeping_thread_to_gdb_regs(gdb_regs, thread); ++ } ++ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, ++ NUMREGBYTES); ++ break; ++ ++ /* set the value of the CPU registers - return OK */ ++ case 'G': ++ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, ++ NUMREGBYTES); ++ ++ if (kgdb_usethread && kgdb_usethread != current) ++ error_packet(remcom_out_buffer, -EINVAL); ++ else { ++ gdb_regs_to_regs(gdb_regs, linux_regs); ++ strcpy(remcom_out_buffer, "OK"); ++ } ++ break; ++ ++ /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ ++ case 'm': ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && ++ kgdb_hex2long(&ptr, &length) > 0) { ++ if (IS_ERR(ptr = kgdb_mem2hex((char *)addr, ++ remcom_out_buffer, ++ length))) ++ error_packet(remcom_out_buffer, ++ PTR_ERR(ptr)); ++ } else ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ ++ /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ ++ case 'M': ++ if (IS_ERR(ptr = write_mem_msg(0))) ++ error_packet(remcom_out_buffer, PTR_ERR(ptr)); ++ else ++ strcpy(remcom_out_buffer, "OK"); ++ break; ++ /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ ++ case 'X': ++ if (IS_ERR(ptr = write_mem_msg(1))) ++ error_packet(remcom_out_buffer, PTR_ERR(ptr)); ++ else ++ strcpy(remcom_out_buffer, "OK"); ++ break; ++ ++ /* kill or detach. KGDB should treat this like a ++ * continue. ++ */ ++ case 'D': ++ if ((error = remove_all_break()) < 0) { ++ error_packet(remcom_out_buffer, error); ++ } else { ++ strcpy(remcom_out_buffer, "OK"); ++ kgdb_connected = 0; ++ } ++ put_packet(remcom_out_buffer); ++ goto default_handle; ++ ++ case 'k': ++ /* Don't care about error from remove_all_break */ ++ remove_all_break(); ++ kgdb_connected = 0; ++ goto default_handle; ++ ++ /* Reboot */ ++ case 'R': ++ /* For now, only honor R0 */ ++ if (strcmp(remcom_in_buffer, "R0") == 0) { ++ printk(KERN_CRIT "Executing reboot\n"); ++ strcpy(remcom_out_buffer, "OK"); ++ put_packet(remcom_out_buffer); ++ emergency_sync(); ++ /* Execution should not return from ++ * machine_restart() ++ */ ++ machine_restart(NULL); ++ kgdb_connected = 0; ++ goto default_handle; ++ } ++ ++ /* query */ ++ case 'q': ++ switch (remcom_in_buffer[1]) { ++ case 's': ++ case 'f': ++ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", ++ 10)) { ++ error_packet(remcom_out_buffer, ++ -EINVAL); ++ break; ++ } ++ ++ /* ++ * If we have not yet completed in ++ * pidhash_init() there isn't much we ++ * can give back. 
++ */
++ if (last_pid == 0) {
++ if (remcom_in_buffer[1] == 'f')
++ strcpy(remcom_out_buffer,
++ "m0000000000000001");
++ break;
++ }
++
++ if (remcom_in_buffer[1] == 'f') {
++ threadid = 1;
++ }
++ remcom_out_buffer[0] = 'm';
++ ptr = remcom_out_buffer + 1;
++ for (i = 0; i < 17 && threadid < pid_max +
++ numshadowth; threadid++) {
++ thread = getthread(linux_regs,
++ threadid);
++ if (thread) {
++ int_to_threadref(&thref,
++ threadid);
++ pack_threadid(ptr, &thref);
++ ptr += 16;
++ *(ptr++) = ',';
++ i++;
++ }
++ }
++ *(--ptr) = '\0';
++ break;
++
++ case 'C':
++ /* Current thread id */
++ strcpy(remcom_out_buffer, "QC");
++
++ threadid = shadow_pid(current->pid);
++
++ int_to_threadref(&thref, threadid);
++ pack_threadid(remcom_out_buffer + 2, &thref);
++ break;
++ case 'T':
++ if (memcmp(remcom_in_buffer + 1,
++ "ThreadExtraInfo,", 16)) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ threadid = 0;
++ ptr = remcom_in_buffer + 17;
++ kgdb_hex2long(&ptr, &threadid);
++ if (!getthread(linux_regs, threadid)) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ if (threadid < pid_max) {
++ kgdb_mem2hex(getthread(linux_regs,
++ threadid)->comm,
++ remcom_out_buffer, 16);
++ } else if (threadid >= pid_max +
++ num_online_cpus()) {
++ kgdb_shadowinfo(linux_regs,
++ remcom_out_buffer,
++ threadid - pid_max -
++ num_online_cpus());
++ } else {
++ static char tmpstr[23 +
++ BUF_THREAD_ID_SIZE];
++ sprintf(tmpstr, "Shadow task %d"
++ " for pid 0",
++ (int)(threadid - pid_max));
++ kgdb_mem2hex(tmpstr, remcom_out_buffer,
++ strlen(tmpstr));
++ }
++ break;
++ }
++ break;
++
++ /* task related */
++ case 'H':
++ switch (remcom_in_buffer[1]) {
++ case 'g':
++ ptr = &remcom_in_buffer[2];
++ kgdb_hex2long(&ptr, &threadid);
++ thread = getthread(linux_regs, threadid);
++ if (!thread && threadid > 0) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ kgdb_usethread = thread;
++ kgdb_usethreadid = threadid;
++ strcpy(remcom_out_buffer, "OK");
++ break;
++
++ case 'c':
++ ptr = &remcom_in_buffer[2];
++ kgdb_hex2long(&ptr, &threadid);
++ if (!threadid) {
++ kgdb_contthread = NULL;
++ } else {
++ thread = getthread(linux_regs,
++ threadid);
++ if (!thread && threadid > 0) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ kgdb_contthread = thread;
++ }
++ strcpy(remcom_out_buffer, "OK");
++ break;
++ }
++ break;
++
++ /* Query thread status */
++ case 'T':
++ ptr = &remcom_in_buffer[1];
++ kgdb_hex2long(&ptr, &threadid);
++ thread = getthread(linux_regs, threadid);
++ if (thread)
++ strcpy(remcom_out_buffer, "OK");
++ else
++ error_packet(remcom_out_buffer, -EINVAL);
++ break;
++ /* Since GDB-5.3, it's been drafted that '0' is a software
++ * breakpoint, '1' is a hardware breakpoint, so let's do
++ * that.
++ */
++ case 'z':
++ case 'Z':
++ bpt_type = &remcom_in_buffer[1];
++ ptr = &remcom_in_buffer[2];
++
++ if (kgdb_ops->set_hw_breakpoint && *bpt_type >= '1') {
++ /* Unsupported */
++ if (*bpt_type > '4')
++ break;
++ } else if (*bpt_type != '0' && *bpt_type != '1')
++ /* Unsupported. */
++ break;
++ /* Test if this is a hardware breakpoint, and
++ * if we support it. */
++ if (*bpt_type == '1' &&
++ !(kgdb_ops->flags & KGDB_HW_BREAKPOINT))
++ /* Unsupported.
*/ ++ break; ++ ++ if (*(ptr++) != ',') { ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ } else if (kgdb_hex2long(&ptr, &addr)) { ++ if (*(ptr++) != ',' || ++ !kgdb_hex2long(&ptr, &length)) { ++ error_packet(remcom_out_buffer, ++ -EINVAL); ++ break; ++ } ++ } else { ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ } ++ ++ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') ++ error = kgdb_set_sw_break(addr); ++ else if (remcom_in_buffer[0] == 'Z' && *bpt_type == '1') ++ error = kgdb_set_hw_break(addr); ++ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') ++ error = kgdb_remove_sw_break(addr); ++ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '1') ++ error = kgdb_remove_hw_break(addr); ++ else if (remcom_in_buffer[0] == 'Z') ++ error = kgdb_ops->set_hw_breakpoint(addr, ++ (int)length, ++ *bpt_type); ++ else if (remcom_in_buffer[0] == 'z') ++ error = kgdb_ops->remove_hw_breakpoint(addr, ++ (int) ++ length, ++ *bpt_type); ++ ++ if (error == 0) ++ strcpy(remcom_out_buffer, "OK"); ++ else ++ error_packet(remcom_out_buffer, error); ++ ++ break; ++ case 'c': ++ case 's': ++ if (kgdb_contthread && kgdb_contthread != current) { ++ /* Can't switch threads in kgdb */ ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ } ++ kgdb_activate_sw_breakpoints(); ++ /* Followthrough to default processing */ ++ default: ++ default_handle: ++ error = kgdb_arch_handle_exception(ex_vector, signo, ++ err_code, ++ remcom_in_buffer, ++ remcom_out_buffer, ++ linux_regs); ++ ++ if (error >= 0 || remcom_in_buffer[0] == 'D' || ++ remcom_in_buffer[0] == 'k') ++ goto kgdb_exit; ++ ++ } /* switch */ ++ ++ /* reply to the request */ ++ put_packet(remcom_out_buffer); ++ } ++ ++ kgdb_exit: ++ /* Call the I/O driver's post_exception routine if the I/O ++ * driver defined one. ++ */ ++ if (kgdb_io_ops.post_exception) ++ kgdb_io_ops.post_exception(); ++ ++ kgdb_info[processor].debuggerinfo = NULL; ++ kgdb_info[processor].task = NULL; ++ atomic_set(&procindebug[processor], 0); ++ ++ if (!debugger_step || !kgdb_contthread) { ++ for (i = 0; i < NR_CPUS; i++) ++ spin_unlock(&slavecpulocks[i]); ++ /* Wait till all the processors have quit ++ * from the debugger. */ ++ for (i = 0; i < NR_CPUS; i++) { ++ while (atomic_read(&procindebug[i])) { ++ int j = 10; /* an arbitrary number */ ++ ++ while (--j) ++ cpu_relax(); ++ } ++ } ++ } ++ ++#ifdef CONFIG_SMP ++ /* This delay has a real purpose. The problem is that if you ++ * are single-stepping, you are sending an NMI to all the ++ * other processors to stop them. Interrupts come in, but ++ * don't get handled. Then you let them go just long enough ++ * to get into their interrupt routines and use up some stack. ++ * You stop them again, and then do the same thing. After a ++ * while you blow the stack on the other processors. This ++ * delay gives some time for interrupts to be cleared out on ++ * the other processors. ++ */ ++ if (debugger_step) ++ mdelay(2); ++#endif ++kgdb_restore: ++ /* Free debugger_active */ ++ atomic_set(&debugger_active, 0); ++ local_irq_restore(flags); ++ ++ return error; ++} ++ ++/* ++ * GDB places a breakpoint at this function to know dynamically ++ * loaded objects. It's not defined static so that only one instance with this ++ * name exists in the kernel. 
++ */ ++ ++int module_event(struct notifier_block *self, unsigned long val, void *data) ++{ ++ return 0; ++} ++ ++static struct notifier_block kgdb_module_load_nb = { ++ .notifier_call = module_event, ++}; ++ ++void kgdb_nmihook(int cpu, void *regs) ++{ ++#ifdef CONFIG_SMP ++ if (!atomic_read(&procindebug[cpu]) && atomic_read(&debugger_active) != (cpu + 1)) ++ kgdb_wait((struct pt_regs *)regs); ++#endif ++} ++ ++/* ++ * This is called when a panic happens. All we need to do is ++ * breakpoint(). ++ */ ++static int kgdb_panic_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ breakpoint(); ++ ++ return 0; ++} ++ ++static struct notifier_block kgdb_panic_notifier = { ++ .notifier_call = kgdb_panic_notify, ++}; ++ ++/* ++ * Initialization that needs to be done in either of our entry points. ++ */ ++static void __init kgdb_internal_init(void) ++{ ++ int i; ++ ++ /* Initialize our spinlocks. */ ++ for (i = 0; i < NR_CPUS; i++) ++ spin_lock_init(&slavecpulocks[i]); ++ ++ for (i = 0; i < MAX_BREAKPOINTS; i++) ++ kgdb_break[i].state = bp_none; ++ ++ /* Initialize the I/O handles */ ++ memset(&kgdb_io_ops_prev, 0, sizeof(kgdb_io_ops_prev)); ++ ++ /* We can't do much if this fails */ ++ register_module_notifier(&kgdb_module_load_nb); ++ ++ kgdb_initialized = 1; ++} ++ ++static void kgdb_register_for_panic(void) ++{ ++ /* Register for panics(). */ ++ /* The registration is done in the kgdb_register_for_panic ++ * routine because KGDB should not try to handle a panic when ++ * there are no kgdb_io_ops setup. It is assumed that the ++ * kgdb_io_ops are setup at the time this method is called. ++ */ ++ if (!kgdb_from_module_registered) { ++ atomic_notifier_chain_register(&panic_notifier_list, ++ &kgdb_panic_notifier); ++ kgdb_from_module_registered = 1; ++ } ++} ++ ++static void kgdb_unregister_for_panic(void) ++{ ++ /* When this routine is called KGDB should unregister from the ++ * panic handler and clean up, making sure it is not handling any ++ * break exceptions at the time. ++ */ ++ if (kgdb_from_module_registered) { ++ kgdb_from_module_registered = 0; ++ atomic_notifier_chain_unregister(&panic_notifier_list, ++ &kgdb_panic_notifier); ++ } ++} ++ ++int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops) ++{ ++ ++ if (kgdb_connected) { ++ printk(KERN_ERR "kgdb: Cannot load I/O module while KGDB " ++ "connected.\n"); ++ return -EINVAL; ++ } ++ ++ /* Save the old values so they can be restored */ ++ if (kgdb_io_handler_cnt >= MAX_KGDB_IO_HANDLERS) { ++ printk(KERN_ERR "kgdb: No more I/O handles available.\n"); ++ return -EINVAL; ++ } ++ ++ /* Check to see if there is an existing driver and if so save its ++ * values. Also check to make sure the same driver was not trying ++ * to re-register. ++ */ ++ if (kgdb_io_ops.read_char != NULL && ++ kgdb_io_ops.read_char != local_kgdb_io_ops->read_char) { ++ memcpy(&kgdb_io_ops_prev[kgdb_io_handler_cnt], ++ &kgdb_io_ops, sizeof(struct kgdb_io)); ++ kgdb_io_handler_cnt++; ++ } ++ ++ /* Initialize the io values for this module */ ++ memcpy(&kgdb_io_ops, local_kgdb_io_ops, sizeof(struct kgdb_io)); ++ ++ /* Make the call to register kgdb if is not initialized */ ++ kgdb_register_for_panic(); ++ ++ return 0; ++} ++ ++void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops) ++{ ++ int i; ++ ++ /* Unregister KGDB if there were no other prior io hooks, else ++ * restore the io hooks. 
++ */
++ if (kgdb_io_handler_cnt > 0 && kgdb_io_ops_prev[0].read_char != NULL) {
++ /* First check if the hook that is in use is the one being
++ * removed */
++ if (kgdb_io_ops.read_char == local_kgdb_io_ops->read_char) {
++ /* Set 'i' to the value of where the list should be
++ * shifted */
++ i = kgdb_io_handler_cnt - 1;
++ memcpy(&kgdb_io_ops, &kgdb_io_ops_prev[i],
++ sizeof(struct kgdb_io));
++ } else {
++ /* Simple case to remove an entry for an I/O handler
++ * that is not in use */
++ for (i = 0; i < kgdb_io_handler_cnt; i++) {
++ if (kgdb_io_ops_prev[i].read_char ==
++ local_kgdb_io_ops->read_char)
++ break;
++ }
++ }
++
++ /* Shift all the entries in the handler array so it is
++ * ordered from oldest to newest.
++ */
++ kgdb_io_handler_cnt--;
++ for (; i < kgdb_io_handler_cnt; i++) {
++ memcpy(&kgdb_io_ops_prev[i], &kgdb_io_ops_prev[i + 1],
++ sizeof(struct kgdb_io));
++ }
++ /* Handle the case if we are on the last element and set it
++ * to NULL; */
++ memset(&kgdb_io_ops_prev[kgdb_io_handler_cnt], 0,
++ sizeof(struct kgdb_io));
++
++ if (kgdb_connected)
++ printk(KERN_ERR "kgdb: WARNING: I/O method changed "
++ "while kgdb was connected.\n");
++ } else {
++ /* KGDB is no longer able to communicate out, so
++ * unregister our hooks and reset state. */
++ kgdb_unregister_for_panic();
++ if (kgdb_connected) {
++ printk(KERN_CRIT "kgdb: I/O module was unloaded while "
++ "a debugging session was running. "
++ "KGDB will be reset.\n");
++ if (remove_all_break() < 0)
++ printk(KERN_CRIT "kgdb: Reset failed.\n");
++ kgdb_connected = 0;
++ }
++ memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io));
++ }
++}
++
++/*
++ * There are times we need to call a tasklet to cause a breakpoint
++ * as calling breakpoint() at that point might be fatal. We have to
++ * check that the exception stack is setup, as tasklets may be scheduled
++ * prior to this. When that happens, it is up to the architecture to
++ * schedule this when it is safe to run.
++ */
++static void kgdb_tasklet_bpt(unsigned long ing)
++{
++ if (CHECK_EXCEPTION_STACK())
++ breakpoint();
++}
++
++DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0);
++
++/*
++ * This function can be called very early, either via early_param() or
++ * an explicit breakpoint() early on.
++ */
++static void __init kgdb_early_entry(void)
++{
++ /*
++ * Don't try and do anything until the architecture is able to
++ * setup the exception stack. In this case, it is up to the
++ * architecture to hook in and look at us when they are ready.
++ */
++ if (!CHECK_EXCEPTION_STACK()) {
++ kgdb_initialized = -1;
++ tasklet_schedule(&kgdb_tasklet_breakpoint);
++ return;
++ }
++
++ /* Let the architecture do any setup that it needs to. */
++ kgdb_arch_init();
++
++ /* Now try the I/O. */
++ /* For early entry kgdb_io_ops.init must be defined */
++ if (!kgdb_io_ops.init || kgdb_io_ops.init()) {
++ /* Try again later. */
++ kgdb_initialized = -1;
++ return;
++ }
++
++ /* Finish up. */
++ kgdb_internal_init();
++
++ /* KGDB can assume that if kgdb_io_ops.init was defined that the
++ * panic registration should be performed at this time. This means
++ * kgdb_io_ops.init did not come from a kernel module and was
++ * initialized statically by a built in.
++ */
++ if (kgdb_io_ops.init)
++ kgdb_register_for_panic();
++}
++
++/*
++ * This function will always be invoked to make sure that KGDB will grab
++ * what it needs to so that if something happens while the system is
++ * running, KGDB will get involved. If kgdb_early_entry() has already
++ * been invoked, there is little we need to do.
++ */
++static int __init kgdb_late_entry(void)
++{
++ int need_break = 0;
++
++ /* If kgdb_initialized is -1 then we were passed kgdbwait. */
++ if (kgdb_initialized == -1)
++ need_break = 1;
++
++ /*
++ * If we haven't tried to initialize KGDB yet, we need to call
++ * kgdb_arch_init before moving onto the I/O.
++ */
++ if (!kgdb_initialized)
++ kgdb_arch_init();
++
++ if (kgdb_initialized != 1) {
++ if (kgdb_io_ops.init && kgdb_io_ops.init()) {
++ /* When KGDB allows I/O via modules and the core
++ * I/O init fails KGDB must default to deferring the
++ * I/O setup, and appropriately print an error about
++ * it.
++ */
++ printk(KERN_ERR "kgdb: Could not setup core I/O "
++ "for KGDB.\n");
++ printk(KERN_INFO "kgdb: Deferring I/O setup to kernel "
++ "module.\n");
++ memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io));
++ }
++
++ kgdb_internal_init();
++
++ /* KGDB can assume that if kgdb_io_ops.init was defined that
++ * panic registration should be performed at this time. This means
++ * kgdb_io_ops.init did not come from a kernel module and was
++ * initialized statically by a built in.
++ */
++ if (kgdb_io_ops.init)
++ kgdb_register_for_panic();
++ }
++
++ /* Register with the reboot notifier list. */
++ register_reboot_notifier(&kgdb_reboot_notifier);
++
++ /* Now do any late init of the I/O. */
++ if (kgdb_io_ops.late_init)
++ kgdb_io_ops.late_init();
++
++ if (need_break) {
++ printk(KERN_CRIT "kgdb: Waiting for connection from remote"
++ " gdb...\n");
++ breakpoint();
++ }
++
++ return 0;
++}
++
++late_initcall(kgdb_late_entry);
++
++/*
++ * This function will generate a breakpoint exception. It is used at the
++ * beginning of a program to sync up with a debugger and can be used
++ * otherwise as a quick means to stop program execution and "break" into
++ * the debugger.
++ */
++void breakpoint(void)
++{
++ if (kgdb_initialized != 1) {
++ kgdb_early_entry();
++ if (kgdb_initialized == 1)
++ printk(KERN_CRIT "Waiting for connection from remote "
++ "gdb...\n");
++ else {
++ printk(KERN_CRIT "KGDB cannot initialize I/O yet.\n");
++ return;
++ }
++ }
++
++ atomic_set(&kgdb_setting_breakpoint, 1);
++ wmb();
++ BREAKPOINT();
++ wmb();
++ atomic_set(&kgdb_setting_breakpoint, 0);
++}
++
++EXPORT_SYMBOL(breakpoint);
++
++#ifdef CONFIG_MAGIC_SYSRQ
++static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs,
++ struct tty_struct *tty)
++{
++ printk("Entering GDB stub\n");
++ breakpoint();
++}
++static struct sysrq_key_op sysrq_gdb_op = {
++ .handler = sysrq_handle_gdb,
++ .help_msg = "Gdb",
++ .action_msg = "GDB",
++};
++
++static int gdb_register_sysrq(void)
++{
++ printk("Registering GDB sysrq handler\n");
++ register_sysrq_key('g', &sysrq_gdb_op);
++ return 0;
++}
++
++module_init(gdb_register_sysrq);
++#endif
++
++static int kgdb_notify_reboot(struct notifier_block *this,
++ unsigned long code, void *x)
++{
++ unsigned long flags;
++
++ /* If we're debugging, or KGDB has not connected, don't try
++ * and print. */
++ if (!kgdb_connected || atomic_read(&debugger_active) != 0)
++ return 0;
++ if ((code == SYS_RESTART) || (code == SYS_HALT) || (code == SYS_POWER_OFF)) {
++ local_irq_save(flags);
++ put_packet("X00");
++ local_irq_restore(flags);
++ }
++ return NOTIFY_DONE;
++}
++
++#ifdef CONFIG_KGDB_CONSOLE
++void kgdb_console_write(struct console *co, const char *s, unsigned count)
++{
++ unsigned long flags;
++
++ /* If we're debugging, or KGDB has not connected, don't try
++ * and print. */
++ if (!kgdb_connected || atomic_read(&debugger_active) != 0)
++ return;
++
++ local_irq_save(flags);
++ kgdb_msg_write(s, count);
++ local_irq_restore(flags);
++}
++
++struct console kgdbcons = {
++ .name = "kgdb",
++ .write = kgdb_console_write,
++ .flags = CON_PRINTBUFFER | CON_ENABLED,
++};
++static int __init kgdb_console_init(void)
++{
++ register_console(&kgdbcons);
++ return 0;
++}
++
++console_initcall(kgdb_console_init);
++#endif
++
++static int __init opt_kgdb_enter(char *str)
++{
++ /* We've already done this by an explicit breakpoint() call. */
++ if (kgdb_initialized)
++ return 0;
++
++ /* Call breakpoint() which will take care of init. */
++ breakpoint();
++
++ return 0;
++}
++
++early_param("kgdbwait", opt_kgdb_enter);
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/kgdbarchlib.c linux-2.6.18-53.1.14.kgdb/kernel/kgdbarchlib.c
+--- linux-2.6.18-53.1.14/kernel/kgdbarchlib.c 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/kgdbarchlib.c 2008-06-10 15:37:25.000000000 +0400
+@@ -0,0 +1,198 @@
++#include
++
++struct kgdb_arch *kgdb_ops = &arch_kgdb_ops;
++
++/**
++ * kgdb_arch_init - Perform any architecture specific initialization.
++ *
++ * RETURN:
++ * The return value is ignored.
++ *
++ * This function will handle the initialization of any architecture
++ * specific hooks.
++ */
++int __attribute__ ((weak))
++ kgdb_arch_init(void)
++{
++ return 0;
++}
++
++/**
++ * kgdb_disable_hw_debug - Disable hardware debugging while we are in kgdb.
++ * @regs: Current &struct pt_regs.
++ *
++ * This function will be called if the particular architecture must
++ * disable hardware debugging while it is processing gdb packets or
++ * handling an exception.
++ */
++void __attribute__ ((weak))
++ kgdb_disable_hw_debug(struct pt_regs *regs)
++{
++}
++
++/*
++ * Skip an int3 exception when it occurs after a breakpoint has been
++ * removed. Backtrack eip by 1 since the int3 would have caused it to
++ * increment by 1.
++ */
++int __attribute__ ((weak))
++ kgdb_skipexception(int exception, struct pt_regs *regs)
++{
++ return 0;
++}
++
++/**
++ * kgdb_set_hw_break - Set a hardware breakpoint at @addr.
++ * @addr: The address to set a hardware breakpoint at.
++ */
++int __attribute__ ((weak))
++ kgdb_set_hw_break(unsigned long addr)
++{
++ return 0;
++}
++
++/**
++ * kgdb_remove_hw_break - Remove a hardware breakpoint at @addr.
++ * @addr: The address to remove a hardware breakpoint from.
++ */
++int __attribute__ ((weak))
++ kgdb_remove_hw_break(unsigned long addr)
++{
++ return 0;
++}
++
++/**
++ * kgdb_remove_all_hw_break - Clear all hardware breakpoints.
++ */
++void __attribute__ ((weak))
++ kgdb_remove_all_hw_break(void)
++{
++}
++
++/**
++ * kgdb_correct_hw_break - Correct hardware breakpoints.
++ *
++ * A hook to allow for changes to the hardware breakpoint, called
++ * after a single step (s) or continue (c) packet, and once we're about
++ * to let the kernel continue running.
++ *
++ * This is used to set the hardware breakpoint registers for all the
++ * slave cpus on an SMP configuration. This must be called after any
++ * changes are made to the hardware breakpoints (such as by a single
++ * step (s) or continue (c) packet. This is only required on
++ * architectures that support SMP and every processor has its own set
++ * of breakpoint registers.
++ */
++void __attribute__ ((weak))
++ kgdb_correct_hw_break(void)
++{
++}
++
++/**
++ * kgdb_post_master_code - Save error vector/code numbers.
++ * @regs: Original pt_regs.
++ * @e_vector: Original error vector.
++ * @err_code: Original error code.
++ *
++ * This is needed on architectures which support SMP and KGDB.
++ * This function is called after all the slave cpus have been put
++ * to a known spin state and the master CPU has control over KGDB.
++ */
++
++void __attribute__ ((weak))
++ kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++}
++
++/**
++ * kgdb_roundup_cpus - Get other CPUs into a holding pattern
++ * @flags: Current IRQ state
++ *
++ * On SMP systems, we need to get the attention of the other CPUs
++ * and get them to be in a known state. This should do what is needed
++ * to get the other CPUs to call kgdb_wait(). Note that on some arches,
++ * the NMI approach is not used for rounding up all the CPUs. For example,
++ * in case of MIPS, smp_call_function() is used to round up CPUs. In
++ * this case, we have to make sure that interrupts are enabled before
++ * calling smp_call_function(). The argument to this function is
++ * the flags that will be used when restoring the interrupts. There is
++ * a local_irq_save() call before kgdb_roundup_cpus().
++ */
++void __attribute__ ((weak))
++ kgdb_roundup_cpus(unsigned long flags)
++{
++}
++
++/**
++ * kgdb_shadowinfo - Get shadowed information on @threadid.
++ * @regs: The &struct pt_regs of the current process.
++ * @buffer: A buffer of %BUFMAX size.
++ * @threadid: The thread id of the shadowed process to get information on.
++ */
++void __attribute__ ((weak))
++ kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid)
++{
++}
++
++/**
++ * kgdb_get_shadow_thread - Get the shadowed &task_struct of @threadid.
++ * @regs: The &struct pt_regs of the current thread.
++ * @threadid: The thread id of the shadowed process to get information on.
++ *
++ * RETURN:
++ * This returns a pointer to the &struct task_struct of the shadowed
++ * thread, @threadid.
++ */
++struct task_struct __attribute__ ((weak))
++ * kgdb_get_shadow_thread(struct pt_regs *regs, int threadid)
++{
++ return NULL;
++}
++
++/**
++ * kgdb_shadow_regs - Return the shadowed registers of @threadid.
++ * @regs: The &struct pt_regs of the current thread.
++ * @threadid: The thread id we want the &struct pt_regs for.
++ *
++ * RETURN:
++ * A pointer to the &struct pt_regs of the shadowed thread @threadid.
++ */ ++struct pt_regs __attribute__ ((weak)) ++ * kgdb_shadow_regs(struct pt_regs *regs, int threadid) ++{ ++ return NULL; ++} ++ ++int __attribute__ ((weak)) ++ kgdb_validate_break_address(unsigned long addr) ++{ ++ int error = 0; ++ char tmp_variable[BREAK_INSTR_SIZE]; ++ error = kgdb_get_mem((char *)addr, tmp_variable, BREAK_INSTR_SIZE); ++ return error; ++} ++ ++int __attribute__ ((weak)) ++ kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) ++{ ++ int error = 0; ++ if ((error = kgdb_get_mem((char *)addr, ++ saved_instr, BREAK_INSTR_SIZE)) < 0) ++ return error; ++ ++ if ((error = kgdb_set_mem((char *)addr, kgdb_ops->gdb_bpt_instr, ++ BREAK_INSTR_SIZE)) < 0) ++ return error; ++ return 0; ++} ++ ++int __attribute__ ((weak)) ++ kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) ++{ ++ ++ int error = 0; ++ if ((error =kgdb_set_mem((char *)addr, (char *)bundle, ++ BREAK_INSTR_SIZE)) < 0) ++ return error; ++ return 0; ++} +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/module.c linux-2.6.18-53.1.14.kgdb/kernel/module.c +--- linux-2.6.18-53.1.14/kernel/module.c 2008-03-06 05:54:13.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/kernel/module.c 2008-06-10 15:39:15.000000000 +0400 +@@ -65,6 +65,7 @@ static DEFINE_SPINLOCK(modlist_lock); + /* List of modules, protected by module_mutex AND modlist_lock */ + static DEFINE_MUTEX(module_mutex); + static LIST_HEAD(modules); ++static DECLARE_MUTEX(notify_mutex); + + static BLOCKING_NOTIFIER_HEAD(module_notify_list); + +@@ -701,6 +702,12 @@ sys_delete_module(const char __user *nam + if (ret != 0) + goto out; + ++ down(¬ify_mutex); ++ blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, ++ mod); ++ up(¬ify_mutex); ++ ++ + /* Never wait if forced. */ + if (!forced && module_refcount(mod) != 0) + wait_for_zero_refcount(mod); +@@ -713,6 +720,11 @@ sys_delete_module(const char __user *nam + } + free_module(mod); + ++ down(¬ify_mutex); ++ blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GONE, ++ NULL); ++ up(¬ify_mutex); ++ + out: + mutex_unlock(&module_mutex); + return ret; +@@ -1119,6 +1131,11 @@ static void free_module(struct module *m + /* Arch-specific cleanup. 
*/ + module_arch_cleanup(mod); + ++#ifdef CONFIG_KGDB ++ /* kgdb info */ ++ vfree(mod->mod_sections); ++#endif ++ + /* Module unload stuff */ + module_unload_free(mod); + +@@ -1378,6 +1395,31 @@ static void setup_modinfo(struct module + } + } + ++#ifdef CONFIG_KGDB ++int add_modsects (struct module *mod, Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const ++ char *secstrings) ++{ ++ int i; ++ ++ mod->num_sections = hdr->e_shnum - 1; ++ mod->mod_sections = vmalloc((hdr->e_shnum - 1)* ++ sizeof (struct mod_section)); ++ ++ if (mod->mod_sections == NULL) { ++ return -ENOMEM; ++ } ++ ++ for (i = 1; i < hdr->e_shnum; i++) { ++ mod->mod_sections[i - 1].address = (void *)sechdrs[i].sh_addr; ++ strncpy(mod->mod_sections[i - 1].name, secstrings + ++ sechdrs[i].sh_name, MAX_SECTNAME); ++ mod->mod_sections[i - 1].name[MAX_SECTNAME] = '\0'; ++ } ++ ++ return 0; ++} ++#endif ++ + #ifdef CONFIG_KALLSYMS + int is_exported(const char *name, const struct module *mod) + { +@@ -1796,6 +1838,12 @@ static struct module *load_module(void _ + + add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + ++#ifdef CONFIG_KGDB ++ if ((err = add_modsects(mod, hdr, sechdrs, secstrings)) < 0) { ++ goto nomodsectinfo; ++ } ++#endif ++ + err = module_finalize(hdr, sechdrs, mod); + if (err < 0) + goto cleanup; +@@ -1856,6 +1904,11 @@ static struct module *load_module(void _ + arch_cleanup: + module_arch_cleanup(mod); + cleanup: ++ ++#ifdef CONFIG_KGDB ++nomodsectinfo: ++ vfree(mod->mod_sections); ++#endif + module_unload_free(mod); + module_free(mod, mod->module_init); + free_core: +@@ -1927,6 +1980,10 @@ sys_init_module(void __user *umod, + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; ++ down(¬ify_mutex); ++ blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, ++ mod); ++ up(¬ify_mutex); + synchronize_sched(); + if (mod->unsafe) + printk(KERN_ERR "%s: module is now stuck!\n", +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/sched.c linux-2.6.18-53.1.14.kgdb/kernel/sched.c +--- linux-2.6.18-53.1.14/kernel/sched.c 2008-03-06 05:54:44.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/kernel/sched.c 2008-06-10 15:37:25.000000000 +0400 +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -6835,6 +6836,9 @@ void __might_sleep(char *file, int line) + #ifdef in_atomic + static unsigned long prev_jiffy; /* ratelimiting */ + ++ if (atomic_read(&debugger_active)) ++ return; ++ + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/softlockup.c linux-2.6.18-53.1.14.kgdb/kernel/softlockup.c +--- linux-2.6.18-53.1.14/kernel/softlockup.c 2008-03-06 05:54:44.000000000 +0300 ++++ linux-2.6.18-53.1.14.kgdb/kernel/softlockup.c 2008-06-10 15:39:21.000000000 +0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + static DEFINE_SPINLOCK(print_lock); + +@@ -37,6 +38,9 @@ static struct notifier_block panic_block + void touch_softlockup_watchdog(void) + { + __raw_get_cpu_var(touch_timestamp) = jiffies; ++#ifdef CONFIG_KGDB ++ atomic_set(&kgdb_sync_softlockup[raw_smp_processor_id()], 0); ++#endif + } + EXPORT_SYMBOL(touch_softlockup_watchdog); + +diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/kernel/timer.c linux-2.6.18-53.1.14.kgdb/kernel/timer.c +--- linux-2.6.18-53.1.14/kernel/timer.c 2008-03-06 
05:54:50.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/kernel/timer.c	2008-06-10 15:39:21.000000000 +0400
+@@ -34,6 +34,7 @@
+ #include
+ #include
+ #include
++#include <linux/kgdb.h>
+
+ #include
+ #include
+@@ -1385,7 +1386,11 @@ static void run_timer_softirq(struct sof
+  */
+ void run_local_timers(void)
+ {
++	int this_cpu = smp_processor_id();
+ 	raise_softirq(TIMER_SOFTIRQ);
++#ifdef CONFIG_KGDB
++	if(!atomic_read(&kgdb_sync_softlockup[this_cpu]))
++#endif
+ 	softlockup_tick();
+ }
+
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/lib/Kconfig.debug linux-2.6.18-53.1.14.kgdb/lib/Kconfig.debug
+--- linux-2.6.18-53.1.14/lib/Kconfig.debug	2008-03-06 05:54:32.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/lib/Kconfig.debug	2008-06-10 15:38:56.000000000 +0400
+@@ -324,7 +324,7 @@ config DEBUG_LIST
+
+ config FRAME_POINTER
+ 	bool "Compile the kernel with frame pointers"
+-	depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390)
++	depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || SUPERH)
+ 	default y if DEBUG_INFO && UML
+ 	help
+ 	  If you say Y here the resulting kernel image will be slightly larger
+@@ -377,3 +377,158 @@ config RCU_TORTURE_TEST
+ 	  at boot time (you probably don't).
+ 	  Say M if you want the RCU torture tests to build as a module.
+ 	  Say N if you are unsure.
++
++config WANT_EXTRA_DEBUG_INFORMATION
++	bool
++	select DEBUG_INFO
++	select FRAME_POINTER if X86 || SUPERH
++	default n
++
++config KGDB
++	bool "KGDB: kernel debugging with remote gdb"
++	select WANT_EXTRA_DEBUG_INFORMATION
++	depends on DEBUG_KERNEL && (ARM || X86 || MIPS || (SUPERH && !SUPERH64) || IA64 || X86_64 || PPC)
++	help
++	  If you say Y here, it will be possible to remotely debug the
++	  kernel using gdb. It is strongly suggested that you enable
++	  DEBUG_INFO, and if available on your platform, FRAME_POINTER.
++	  Documentation of the kernel debugger is available at
++	  http://kgdb.sourceforge.net as well as in DocBook form
++	  in Documentation/DocBook/. If unsure, say N.
++
++config KGDB_CONSOLE
++	bool "KGDB: Console messages through gdb"
++	depends on KGDB
++	help
++	  If you say Y here, console messages will appear through gdb.
++	  Other consoles such as tty or ttyS will continue to work as usual.
++	  Note that if you use this in conjunction with KGDB_ETH, if the
++	  ethernet driver runs into an error condition during use with KGDB
++	  it is possible to hit an infinite recursion, causing the kernel
++	  to crash, and typically reboot. For this reason, it is preferable
++	  to use NETCONSOLE in conjunction with KGDB_ETH instead of
++	  KGDB_CONSOLE.
++
++choice
++	prompt "Method for KGDB communication"
++	depends on KGDB
++	default KGDB_8250_NOMODULE
++	default KGDB_MPSC if SERIAL_MPSC
++	default KGDB_CPM_UART if (8xx || 8260)
++	default KGDB_SIBYTE if SIBYTE_SB1xxx_SOC
++	help
++	  There are a number of different ways in which you can communicate
++	  with KGDB. The most common is via serial, with the 8250 driver
++	  (should your hardware have an 8250, or ns1655x style uart).
++	  Another option is to use the NETPOLL framework and UDP, should
++	  your ethernet card support this. Other options may exist.
++	  You can elect to have one core I/O driver that is built into the
++	  kernel for debugging as the kernel is booting, or to use only
++	  kernel modules.
++
++config KGDB_ONLY_MODULES
++	bool "KGDB: Use only kernel modules for I/O"
++	depends on MODULES
++	help
++	  Use only kernel modules to configure KGDB I/O after the
++	  kernel is booted.
++
++config KGDB_8250_NOMODULE
++	bool "KGDB: On generic serial port (8250)"
++	select KGDB_8250
++	help
++	  Uses generic serial port (8250) to communicate with the host
++	  GDB. This is independent of the normal (SERIAL_8250) driver
++	  for this chipset.
++
++config KGDBOE_NOMODULE
++	bool "KGDB: On ethernet - in kernel"
++	select KGDBOE
++	select NETPOLL
++	select NETPOLL_TRAP
++	select NETPOLL_RX
++	help
++	  Uses the NETPOLL API to communicate with the host GDB via UDP.
++	  In order for this to work, the ethernet interface specified must
++	  support the NETPOLL API, and this must be initialized at boot.
++	  See the documentation for syntax.
++
++config KGDB_MPSC
++	bool "KGDB on MV64x60 MPSC"
++	depends on SERIAL_MPSC
++	help
++	  Uses a Marvell GT64260B or MV64x60 Multi-Purpose Serial
++	  Controller (MPSC) channel. Note that the GT64260A is not
++	  supported.
++
++config KGDB_CPM_UART
++	bool "KGDB: On CPM UART"
++	depends on PPC && (CPM2 || 8xx)
++	help
++	  Uses CPM UART to communicate with the host GDB.
++
++config KGDB_SIBYTE
++	bool "KGDB: On the Broadcom SWARM serial port"
++	depends on MIPS && SIBYTE_SB1xxx_SOC
++endchoice
++
++config KGDBOE
++	tristate "KGDB: On ethernet" if !KGDBOE_NOMODULE
++	depends on m && KGDB
++	select NETPOLL
++	select NETPOLL_TRAP
++	select NETPOLL_RX
++	help
++	  Uses the NETPOLL API to communicate with the host GDB via UDP.
++	  In order for this to work, the ethernet interface specified must
++	  support the NETPOLL API, and this must be initialized at boot.
++	  See the documentation for syntax.
++
++config KGDB_8250
++	tristate "KGDB: On generic serial port (8250)" if !KGDB_8250_NOMODULE
++	depends on m && KGDB_ONLY_MODULES
++	help
++	  Uses generic serial port (8250) to communicate with the host
++	  GDB. This is independent of the normal (SERIAL_8250) driver
++	  for this chipset.
++
++config KGDB_SIMPLE_SERIAL
++	bool "Simple selection of KGDB serial port"
++	depends on KGDB_8250_NOMODULE
++	default y
++	help
++	  If you say Y here, you will only have to pick the baud rate
++	  and port number that you wish to use for KGDB. Note that this
++	  only works on architectures that register known serial ports
++	  early on. If you say N, you will have to provide, either here
++	  or on the command line, the type (I/O or MMIO), IRQ and
++	  address to use. If in doubt, say Y.
++
++config KGDB_BAUDRATE
++	int "Debug serial port baud rate"
++	depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL)
++	default "115200"
++	help
++	  gdb and the kernel stub need to agree on the baud rate to be
++	  used. Standard rates from 9600 to 115200 are allowed, and this
++	  may be overridden via the commandline.
++
++config KGDB_PORT_NUM
++	int "Serial port number for KGDB"
++	range 0 1 if KGDB_MPSC
++	range 0 3
++	depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL) || KGDB_MPSC
++	default "1"
++	help
++	  Pick the port number (0 based) for KGDB to use.
++
++config KGDB_8250_CONF_STRING
++	string "Configuration string for KGDB"
++	depends on KGDB_8250_NOMODULE && !KGDB_SIMPLE_SERIAL
++	default "io,2f8,115200,3" if X86
++	help
++	  The format of this string should be <io or mmio>,<address>,
++	  <baud rate>,<irq>. For example, to use the
++	  serial port on an i386 box located at 0x2f8 and 115200 baud
++	  on IRQ 3, use:
++	  io,2f8,115200,3
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/net/core/netpoll.c linux-2.6.18-53.1.14.kgdb/net/core/netpoll.c
+--- linux-2.6.18-53.1.14/net/core/netpoll.c	2008-03-06 05:54:27.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/net/core/netpoll.c	2008-06-10 15:37:49.000000000 +0400
+@@ -525,7 +525,8 @@ int __netpoll_rx(struct sk_buff *skb)
+
+ 	np->rx_hook(np, ntohs(uh->source),
+ 		    (char *)(uh+1),
+-		    ulen - sizeof(struct udphdr));
++		    ulen - sizeof(struct udphdr),
++		    skb);
+
+ 	kfree_skb(skb);
+ 	return 1;
+diff -rupbBN -X ../client-cleanup/dontdiff linux-2.6.18-53.1.14/scripts/dwarfh.awk linux-2.6.18-53.1.14.kgdb/scripts/dwarfh.awk
+--- linux-2.6.18-53.1.14/scripts/dwarfh.awk	1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18-53.1.14.kgdb/scripts/dwarfh.awk	2008-06-10 15:39:01.000000000 +0400
+@@ -0,0 +1,19 @@
++BEGIN {
++	print "#ifndef _ELF_DWARF_H"
++	print "/* Machine generated from dwarf2.h by scripts/dwarfh.awk */"
++}
++$2 == "=" {
++	gsub(/,/, "", $3)
++	print "#define " $1 "\t " $3
++}
++$1 == "#define" {
++	print $0
++	while( index($0,"\\") == length($0)){
++		getline
++		print $0
++	}
++}
++/.*/ {}
++END {
++	print "#endif"
++}
diff --git a/lustre/kernel_patches/patches/8kstack-2.6.12.patch b/lustre/kernel_patches/patches/8kstack-2.6.12.patch
index f3a2160..26b0f3f 100644
--- a/lustre/kernel_patches/patches/8kstack-2.6.12.patch
+++ b/lustre/kernel_patches/patches/8kstack-2.6.12.patch
@@ -1,3 +1,6 @@
+Increase the stack size to 8kB. This is only required for the i386 arch,
+because kernels on other architectures already use stacks of at least 8kB.
+
 Index: linux-2.6.9-5.0.3.EL/include/asm-i386/thread_info.h
 ===================================================================
 --- linux-2.6.9-5.0.3.EL.orig/include/asm-i386/thread_info.h	2005-02-25 10:25:33.000000000 +0200
diff --git a/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch b/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch
index a6e7351f..e7e40ac 100644
--- a/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch
+++ b/lustre/kernel_patches/patches/dev_read_only-2.6.22-vanilla.patch
@@ -1,3 +1,11 @@
+Set the underlying block device "read only" and silently discard writes
+to the device at the block layer. This allows the block device queue
+to drain quickly for controlled failback of the device.
+
+At one time it was required to avoid crashes in the JBD layer during
+failover, but it may also be possible to just allow the inflight IO to
+complete and have Lustre handle this more gracefully.
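+
+As a rough sketch of the mechanism (illustrative only; the hunks below
+are authoritative, and dev_check_rdonly() is assumed to be the helper
+this patch provides), the write-discard test in the bio submission path
+looks something like:
+
+	/* illustrative sketch, not the actual hunk */
+	if (bio_rw(bio) == WRITE && dev_check_rdonly(bio->bi_bdev)) {
+		bio_endio(bio, bio->bi_size, 0);	/* claim success, drop the write */
+		return;
+	}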
+ diff -urp linux-2.6.18.1.orig/block/ll_rw_blk.c linux-2.6.18.1/block/ll_rw_blk.c --- linux-2.6.18.1.orig/block/ll_rw_blk.c 2006-10-14 06:34:03.000000000 +0300 +++ linux-2.6.18.1/block/ll_rw_blk.c 2007-05-29 14:50:46.000000000 +0300 diff --git a/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch index a6813e6..9727ea4 100644 --- a/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch +++ b/lustre/kernel_patches/patches/export-2.6.18-vanilla.patch @@ -1,15 +1,8 @@ -Index: linux-2.6/fs/open.c -=================================================================== ---- linux-2.6.orig/fs/open.c 2006-07-15 16:10:37.000000000 +0800 -+++ linux-2.6/fs/open.c 2006-07-15 16:22:04.000000000 +0800 -@@ -808,7 +808,6 @@ asmlinkage long sys_lchown(const char __ - return error; - } - -- - asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group) - { - struct file * file; +Allow starting the commit of a journal transaction, without waiting for +it to complete. This is a performance enhancement for OST IO so that +the journal commit can run concurrently with the file IO. It isn't +necessary if the client can handle bulk IO recovery (bug 16919). + Index: linux-2.6/fs/jbd/journal.c =================================================================== --- linux-2.6.orig/fs/jbd/journal.c 2006-07-15 16:13:50.000000000 +0800 diff --git a/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch index 834c886..a01aaaf 100644 --- a/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch +++ b/lustre/kernel_patches/patches/export-show_task-2.6.18-vanilla.patch @@ -1,3 +1,8 @@ +Export the show_task() function in order to get better process stacks. +It will also print the current process stack, which is useful. + +This is a nice-to-have but not required for Lustre to work. + Index: linux-2.6/kernel/sched.c =================================================================== --- linux-2.6.orig/kernel/sched.c 2006-07-15 11:51:46.000000000 +0800 diff --git a/lustre/kernel_patches/patches/i_filter_data.patch b/lustre/kernel_patches/patches/i_filter_data.patch index 8a21a9e..663b5f8 100644 --- a/lustre/kernel_patches/patches/i_filter_data.patch +++ b/lustre/kernel_patches/patches/i_filter_data.patch @@ -1,3 +1,7 @@ +The i_filterdata is currently only used by the size-on-mds to store the +epoch number for the inode. This could be moved to another field in +ldiskfs or elsewhere in the inode that isn't used by Lustre callers. 
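+
+Conceptually the change is tiny; as a sketch (the hunk below is
+authoritative, and only the i_filterdata name is taken from this patch),
+it reserves one opaque pointer in struct inode for the filter layer:
+
+	/* illustrative sketch, not the actual hunk */
+	struct inode {
+		/* ... all the existing 2.6.18 fields ... */
+		void	*i_filterdata;	/* opaque; Lustre's size-on-mds epoch lives here */
+	};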
+ Index: linux-2.6.18.8/include/linux/fs.h =================================================================== --- linux-2.6.18.8.orig/include/linux/fs.h 2007-06-05 12:55:19.000000000 +0200 diff --git a/lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch new file mode 100644 index 0000000..4d157ad --- /dev/null +++ b/lustre/kernel_patches/patches/kgdb-2.6.18-vanilla.patch @@ -0,0 +1,19778 @@ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/Documentation/DocBook/Makefile linux-2.6.18.kgdb/Documentation/DocBook/Makefile +--- linux-2.6.18/Documentation/DocBook/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/Documentation/DocBook/Makefile 2008-06-10 16:18:58.000000000 +0400 +@@ -11,7 +11,7 @@ DOCBOOKS := wanbook.xml z8530book.xml mc + procfs-guide.xml writing_usb_driver.xml \ + kernel-api.xml journal-api.xml lsm.xml usb.xml \ + gadget.xml libata.xml mtdnand.xml librs.xml rapidio.xml \ +- genericirq.xml ++ genericirq.xml kgdb.xml + + ### + # The build process is as follows (targets): +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/Documentation/DocBook/kgdb.tmpl linux-2.6.18.kgdb/Documentation/DocBook/kgdb.tmpl +--- linux-2.6.18/Documentation/DocBook/kgdb.tmpl 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/Documentation/DocBook/kgdb.tmpl 2008-06-10 16:19:47.000000000 +0400 +@@ -0,0 +1,250 @@ ++ ++ ++ ++ ++ ++ KGDB Internals ++ ++ ++ ++ Tom ++ Rini ++ ++
++     <email>trini@kernel.crashing.org</email>
++    </address>
++   </affiliation>
++  </author>
++  <author>
++   <firstname>Amit S.</firstname>
++   <surname>Kale</surname>
++   <affiliation>
++    <address>
++     <email>amitkale@linsyssoft.com</email>
++    </address>
++   </affiliation>
++  </author>
++ </authorgroup>
++
++ <copyright>
++  <year>2004-2005</year>
++  <holder>MontaVista Software, Inc.</holder>
++ </copyright>
++ <copyright>
++  <year>2004</year>
++  <holder>Amit S. Kale</holder>
++ </copyright>
++
++ <legalnotice>
++  <para>
++  This file is licensed under the terms of the GNU General Public License
++  version 2. This program is licensed "as is" without any warranty of any
++  kind, whether express or implied.
++  </para>
++ </legalnotice>
++
++ Introduction
++
++ kgdb is a source-level debugger for the Linux kernel. It is used along
++ with gdb to debug the Linux kernel. With kgdb, kernel developers can
++ debug a kernel much as they would debug an application program. It
++ makes it possible to place breakpoints in kernel code, step through
++ the code and observe variables.
++
++ Two machines are required for using kgdb. One of these machines is a
++ development machine and the other is a test machine. The machines are
++ typically connected through a serial line, a null-modem cable which
++ connects their serial ports. It is also possible, however, to use an
++ ethernet connection between the machines. The kernel to be debugged
++ runs on the test machine. gdb runs on the development machine. The
++ serial line or ethernet connection is used by gdb to communicate to
++ the kernel being debugged.
++
++ Compiling a kernel
++
++ To enable CONFIG_KGDB, look under the "Kernel debugging"
++ menu and then select "KGDB: kernel debugging with remote gdb".
++
++ The first choice for I/O is CONFIG_KGDB_ONLY_MODULES.
++ This means that you will only be able to use KGDB after loading a
++ kernel module that defines how you want to be able to talk with
++ KGDB. There are two other choices (more on some architectures) that
++ can be enabled as modules later, if not picked here.
++
++ The first of these is CONFIG_KGDB_8250_NOMODULE.
++ This has sub-options such as CONFIG_KGDB_SIMPLE_SERIAL
++ which toggles choosing the serial port by ttyS number or by specifying
++ a port and IRQ number.
++
++ The second of these choices on most systems for I/O is
++ CONFIG_KGDBOE. This requires that the machine to be
++ debugged has an ethernet card which supports the netpoll API, such as
++ the cards supported by CONFIG_E100. There are no
++ sub-options for this, but a kernel command line option is required.
++
++ Booting the kernel
++
++ The kernel command line option kgdbwait makes kgdb
++ wait for a gdb connection during booting of the kernel. If the
++ CONFIG_KGDB_8250 driver is used (or if applicable,
++ another serial driver) this breakpoint will happen very early on, before
++ console output. If you wish to change serial port information and you
++ have enabled both CONFIG_KGDB_8250 and
++ CONFIG_KGDB_SIMPLE_SERIAL then you must pass the option
++ kgdb8250=<io or mmio>,<address>,<baud rate>,<irq>
++ before kgdbwait.
++ The values io or mmio refer to
++ whether the address being passed next needs to be memory mapped
++ (mmio) or not. The address must
++ be passed in hex; it is the hardware address, and it will be remapped
++ if passed as mmio. The values
++ baud rate and irq are base-10.
++ The supported values for baud rate are
++ 9600, 19200,
++ 38400, 57600, and
++ 115200.
++
++ To have KGDB stop the kernel and wait, with the compiled values for the
++ serial driver, pass in: kgdbwait.
++
++ To specify the values of the SH SCI(F) serial port at boot:
++ kgdbsci=0,115200.
++
++ To specify the values of the serial port at boot:
++ kgdb8250=io,3f8,115200,3.
++ On IA64 this could also be:
++ kgdb8250=mmio,0xff5e0000,115200,74
++ And to have KGDB also stop the kernel and wait for GDB to connect, pass in
++ kgdbwait after this argument.
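++
++ Putting the pieces together, a complete set of boot options for a
++ common x86 serial setup (the values are illustrative, combining the
++ examples above) would be:
++ kgdb8250=io,3f8,115200,3 kgdbwait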
++
++ To configure the CONFIG_KGDBOE driver, pass in
++ kgdboe=[src-port]@<src-ip>/[dev],[tgt-port]@<tgt-ip>/[tgt-macaddr]
++ where:
++
++ src-port (optional): source for UDP packets (defaults to 6443)
++ src-ip: source IP to use (interface address)
++ dev (optional): network interface (eth0)
++ tgt-port (optional): port GDB will use (defaults to 6442)
++ tgt-ip: IP address GDB will be connecting from
++ tgt-macaddr (optional): ethernet MAC address for logging agent (default is broadcast)
++
++ The CONFIG_KGDBOE driver can be reconfigured at run
++ time, if CONFIG_SYSFS and
++ CONFIG_MODULES are enabled, by echo'ing a new config string to
++ /sys/module/kgdboe/parameters/kgdboe. The
++ driver can be unconfigured with the special string
++ not_configured.
++
++ Connecting gdb
++
++ If you have used any of the methods to have KGDB stop and create
++ an initial breakpoint described in the previous chapter, kgdb prints
++ the message "Waiting for connection from remote gdb..." on the console
++ and waits for a connection from gdb. At this point you connect gdb to kgdb.
++
++ Example (serial):
++
++ % gdb ./vmlinux
++ (gdb) set remotebaud 115200
++ (gdb) target remote /dev/ttyS0
++
++ Example (ethernet):
++
++ % gdb ./vmlinux
++ (gdb) target remote udp:192.168.2.2:6443
++
++ Once connected, you can debug a kernel the way you would debug an
++ application program.
++
++ Architecture-specific notes
++
++ SuperH: The NMI switch found on some boards can be used to trigger an
++ initial breakpoint. Subsequent triggers do nothing. If console
++ is enabled on the SCI(F) serial port, and that is the port being used
++ for KGDB, then you must trigger a breakpoint via sysrq, NMI, or
++ some other method prior to connecting, or echo a control-c to the
++ serial port. Also, to use the SCI(F) port for KGDB, the
++ CONFIG_SERIAL_SH_SCI driver must be enabled.
++
++ The common backend (required)
++
++ There are a few flags which must be set on every architecture in
++ their <asm/kgdb.h> file. These are:
++
++ NUMREGBYTES: The size in bytes of all of the registers, so
++ that we can ensure they will all fit into a packet.
++
++ BUFMAX: The size in bytes of the buffer GDB will read into.
++ This must be larger than NUMREGBYTES.
++
++ CACHE_FLUSH_IS_SAFE: Set to one if it is always safe to call
++ flush_cache_range or flush_icache_range. On some architectures,
++ these functions may not be safe to call on SMP since we keep other
++ CPUs in a holding pattern.
++
++ There are also the following functions for the common backend,
++ found in kernel/kgdb.c, that must be supplied by the
++ architecture-specific backend. No weak version of these is provided.
++
++!Iinclude/linux/kgdb.h
++
++ The common backend (optional)
++
++ These functions are part of the common backend, found in kernel/kgdb.c,
++ and are optionally implemented. Some functions (with _hw_ in the name)
++ end up being required on arches which use hardware breakpoints.
++
++!Ikernel/kgdb.c
++
++ Driver-Specific Functions
++
++ Some of the I/O drivers have additional, driver-specific functions
++ that can be called. Calls from other places
++ to these functions must be wrapped in #ifdefs for the driver in
++ question.
++
++!Idrivers/serial/8250_kgdb.c
++
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/MAINTAINERS linux-2.6.18.kgdb/MAINTAINERS +--- linux-2.6.18/MAINTAINERS 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/MAINTAINERS 2008-06-10 16:18:58.000000000 +0400 +@@ -1685,6 +1685,15 @@ L: linux-kernel@vger.kernel.org + L: fastboot@osdl.org + S: Maintained + ++KGDB ++P: Tom Rini ++P: Amit S. Kale ++M: trini@kernel.crashing.org ++M: amitkale@linsyssoft.com ++W: http://sourceforge.net/projects/kgdb ++L: kgdb-bugreport@lists.sourceforge.net ++S: Maintained ++ + KPROBES + P: Prasanna S Panchamukhi + M: prasanna@in.ibm.com +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/Makefile linux-2.6.18.kgdb/Makefile +--- linux-2.6.18/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/Makefile 2008-06-10 16:19:57.000000000 +0400 +@@ -990,6 +990,7 @@ MRPROPER_DIRS += include/config include + MRPROPER_FILES += .config .config.old include/asm .version .old_version \ + include/linux/autoconf.h include/linux/version.h \ + include/linux/utsrelease.h \ ++ include/linux/dwarf2-defs.h \ + Module.symvers tags TAGS cscope* + + # clean - Delete most, but leave enough to build external modules +@@ -1416,7 +1417,11 @@ clean := -f $(if $(KBUILD_SRC),$(srctree + endif # skip-makefile + + PHONY += FORCE +-FORCE: ++include/linux/dwarf2-defs.h: $(srctree)/include/linux/dwarf2.h $(srctree)/scripts/dwarfh.awk ++ mkdir -p include/linux/ ++ awk -f $(srctree)/scripts/dwarfh.awk $(srctree)/include/linux/dwarf2.h > include/linux/dwarf2-defs.h ++ ++FORCE: include/linux/dwarf2-defs.h + + + # Declare the contents of the .PHONY variable as phony. We keep that +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/Makefile linux-2.6.18.kgdb/arch/arm/kernel/Makefile +--- linux-2.6.18/arch/arm/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/kernel/Makefile 2008-06-10 16:19:51.000000000 +0400 +@@ -20,6 +20,7 @@ obj-$(CONFIG_ISA_DMA) += dma-isa.o + obj-$(CONFIG_PCI) += bios32.o isa.o + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_OABI_COMPAT) += sys_oabi-compat.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + + obj-$(CONFIG_CRUNCH) += crunch.o crunch-bits.o + AFLAGS_crunch-bits.o := -Wa,-mcpu=ep9312 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/entry-armv.S linux-2.6.18.kgdb/arch/arm/kernel/entry-armv.S +--- linux-2.6.18/arch/arm/kernel/entry-armv.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/kernel/entry-armv.S 2008-06-10 16:19:58.000000000 +0400 +@@ -15,6 +15,7 @@ + * it to save wrong values... Be aware! + */ + ++#include + #include + #include + #include +@@ -232,6 +233,7 @@ svc_preempt: + beq preempt_return @ go again + b 1b + #endif ++ CFI_END_FRAME(__irq_svc) + + .align 5 + __und_svc: +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/arm/kernel/kgdb-jmp.S +--- linux-2.6.18/arch/arm/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/arm/kernel/kgdb-jmp.S 2008-06-10 16:19:51.000000000 +0400 +@@ -0,0 +1,32 @@ ++/* ++ * arch/arm/kernel/kgdb-jmp.S ++ * ++ * Trivial setjmp and longjmp procedures to support bus error recovery ++ * which may occur during kgdb memory read/write operations. ++ * ++ * Author: MontaVista Software, Inc. ++ * source@mvista.com ++ * ++ * 2002-2005 (c) MontaVista Software, Inc. This file is licensed under the ++ * terms of the GNU General Public License version 2. 
This program as licensed ++ * "as is" without any warranty of any kind, whether express or implied. ++ */ ++#include ++ ++ENTRY (kgdb_fault_setjmp) ++ /* Save registers */ ++ stmia r0, {r0-r14} ++ str lr,[r0, #60] ++ mrs r1,cpsr ++ str r1,[r0,#64] ++ ldr r1,[r0,#4] ++ mov r0, #0 ++ mov pc,lr ++ ++ENTRY (kgdb_fault_longjmp) ++ /* Restore registers */ ++ mov r1,#1 ++ str r1,[r0] ++ ldr r1,[r0, #64] ++ msr spsr,r1 ++ ldmia r0,{r0-pc}^ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/kgdb.c linux-2.6.18.kgdb/arch/arm/kernel/kgdb.c +--- linux-2.6.18/arch/arm/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/arm/kernel/kgdb.c 2008-06-10 16:19:51.000000000 +0400 +@@ -0,0 +1,208 @@ ++/* ++ * arch/arm/kernel/kgdb.c ++ * ++ * ARM KGDB support ++ * ++ * Copyright (c) 2002-2004 MontaVista Software, Inc ++ * ++ * Authors: George Davis ++ * Deepak Saxena ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Make a local copy of the registers passed into the handler (bletch) */ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs) ++{ ++ int regno; ++ ++ /* Initialize all to zero (??) */ ++ for (regno = 0; regno < GDB_MAX_REGS; regno++) ++ gdb_regs[regno] = 0; ++ ++ gdb_regs[_R0] = kernel_regs->ARM_r0; ++ gdb_regs[_R1] = kernel_regs->ARM_r1; ++ gdb_regs[_R2] = kernel_regs->ARM_r2; ++ gdb_regs[_R3] = kernel_regs->ARM_r3; ++ gdb_regs[_R4] = kernel_regs->ARM_r4; ++ gdb_regs[_R5] = kernel_regs->ARM_r5; ++ gdb_regs[_R6] = kernel_regs->ARM_r6; ++ gdb_regs[_R7] = kernel_regs->ARM_r7; ++ gdb_regs[_R8] = kernel_regs->ARM_r8; ++ gdb_regs[_R9] = kernel_regs->ARM_r9; ++ gdb_regs[_R10] = kernel_regs->ARM_r10; ++ gdb_regs[_FP] = kernel_regs->ARM_fp; ++ gdb_regs[_IP] = kernel_regs->ARM_ip; ++ gdb_regs[_SP] = kernel_regs->ARM_sp; ++ gdb_regs[_LR] = kernel_regs->ARM_lr; ++ gdb_regs[_PC] = kernel_regs->ARM_pc; ++ gdb_regs[_CPSR] = kernel_regs->ARM_cpsr; ++} ++ ++/* Copy local gdb registers back to kgdb regs, for later copy to kernel */ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *kernel_regs) ++{ ++ kernel_regs->ARM_r0 = gdb_regs[_R0]; ++ kernel_regs->ARM_r1 = gdb_regs[_R1]; ++ kernel_regs->ARM_r2 = gdb_regs[_R2]; ++ kernel_regs->ARM_r3 = gdb_regs[_R3]; ++ kernel_regs->ARM_r4 = gdb_regs[_R4]; ++ kernel_regs->ARM_r5 = gdb_regs[_R5]; ++ kernel_regs->ARM_r6 = gdb_regs[_R6]; ++ kernel_regs->ARM_r7 = gdb_regs[_R7]; ++ kernel_regs->ARM_r8 = gdb_regs[_R8]; ++ kernel_regs->ARM_r9 = gdb_regs[_R9]; ++ kernel_regs->ARM_r10 = gdb_regs[_R10]; ++ kernel_regs->ARM_fp = gdb_regs[_FP]; ++ kernel_regs->ARM_ip = gdb_regs[_IP]; ++ kernel_regs->ARM_sp = gdb_regs[_SP]; ++ kernel_regs->ARM_lr = gdb_regs[_LR]; ++ kernel_regs->ARM_pc = gdb_regs[_PC]; ++ kernel_regs->ARM_cpsr = gdb_regs[GDB_MAX_REGS - 1]; ++} ++ ++static inline struct pt_regs *kgdb_get_user_regs(struct task_struct *task) ++{ ++ return (struct pt_regs *) ++ ((unsigned long)task->thread_info + THREAD_SIZE - ++ 8 - sizeof(struct pt_regs)); ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, ++ struct task_struct *task) ++{ ++ int regno; ++ struct pt_regs *thread_regs; ++ ++ /* Just making sure... 
*/
++	if (task == NULL)
++		return;
++
++	/* Initialize to zero */
++	for (regno = 0; regno < GDB_MAX_REGS; regno++)
++		gdb_regs[regno] = 0;
++
++	/* Otherwise, we have only some registers from switch_to() */
++	thread_regs = kgdb_get_user_regs(task);
++	gdb_regs[_R0] = thread_regs->ARM_r0;	/* Not really valid? */
++	gdb_regs[_R1] = thread_regs->ARM_r1;	/* "      " */
++	gdb_regs[_R2] = thread_regs->ARM_r2;	/* "      " */
++	gdb_regs[_R3] = thread_regs->ARM_r3;	/* "      " */
++	gdb_regs[_R4] = thread_regs->ARM_r4;
++	gdb_regs[_R5] = thread_regs->ARM_r5;
++	gdb_regs[_R6] = thread_regs->ARM_r6;
++	gdb_regs[_R7] = thread_regs->ARM_r7;
++	gdb_regs[_R8] = thread_regs->ARM_r8;
++	gdb_regs[_R9] = thread_regs->ARM_r9;
++	gdb_regs[_R10] = thread_regs->ARM_r10;
++	gdb_regs[_FP] = thread_regs->ARM_fp;
++	gdb_regs[_IP] = thread_regs->ARM_ip;
++	gdb_regs[_SP] = thread_regs->ARM_sp;
++	gdb_regs[_LR] = thread_regs->ARM_lr;
++	gdb_regs[_PC] = thread_regs->ARM_pc;
++	gdb_regs[_CPSR] = thread_regs->ARM_cpsr;
++}
++
++static int compiled_break;
++
++int kgdb_arch_handle_exception(int exception_vector, int signo,
++			       int err_code, char *remcom_in_buffer,
++			       char *remcom_out_buffer,
++			       struct pt_regs *linux_regs)
++{
++	long addr;
++	char *ptr;
++
++	switch (remcom_in_buffer[0]) {
++	case 'c':
++		kgdb_contthread = NULL;
++
++		/*
++		 * Try to read optional parameter, pc unchanged if no parm.
++		 * If this was a compiled breakpoint, we need to move
++		 * to the next instruction or we will just breakpoint
++		 * over and over again.
++		 */
++		ptr = &remcom_in_buffer[1];
++		if (kgdb_hex2long(&ptr, &addr)) {
++			linux_regs->ARM_pc = addr;
++		} else if (compiled_break == 1) {
++			linux_regs->ARM_pc += 4;
++		}
++
++		compiled_break = 0;
++
++		return 0;
++	}
++
++	return -1;
++}
++
++static int kgdb_brk_fn(struct pt_regs *regs, unsigned int instr)
++{
++	kgdb_handle_exception(1, SIGTRAP, 0, regs);
++
++	return 0;
++}
++
++static int kgdb_compiled_brk_fn(struct pt_regs *regs, unsigned int instr)
++{
++	compiled_break = 1;
++	kgdb_handle_exception(1, SIGTRAP, 0, regs);
++
++	return 0;
++}
++
++static struct undef_hook kgdb_brkpt_hook = {
++	.instr_mask = 0xffffffff,
++	.instr_val = KGDB_BREAKINST,
++	.fn = kgdb_brk_fn
++};
++
++static struct undef_hook kgdb_compiled_brkpt_hook = {
++	.instr_mask = 0xffffffff,
++	.instr_val = KGDB_COMPILED_BREAK,
++	.fn = kgdb_compiled_brk_fn
++};
++
++/*
++ * Register our undef instruction hooks with the ARM undef core.
++ * We register a hook specifically looking for the KGDB break inst
++ * and we handle the normal undef case within the do_undefinstr
++ * handler.
++ */ ++int kgdb_arch_init(void) ++{ ++ register_undef_hook(&kgdb_brkpt_hook); ++ register_undef_hook(&kgdb_compiled_brkpt_hook); ++ ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++#ifndef __ARMEB__ ++ .gdb_bpt_instr = {0xfe, 0xde, 0xff, 0xe7} ++#else ++ .gdb_bpt_instr = {0xe7, 0xff, 0xde, 0xfe} ++#endif ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/setup.c linux-2.6.18.kgdb/arch/arm/kernel/setup.c +--- linux-2.6.18/arch/arm/kernel/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/kernel/setup.c 2008-06-10 16:19:51.000000000 +0400 +@@ -829,6 +829,11 @@ void __init setup_arch(char **cmdline_p) + conswitchp = &dummy_con; + #endif + #endif ++ ++#if defined(CONFIG_KGDB) ++ extern void __init early_trap_init(void); ++ early_trap_init(); ++#endif + } + + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/kernel/traps.c linux-2.6.18.kgdb/arch/arm/kernel/traps.c +--- linux-2.6.18/arch/arm/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/kernel/traps.c 2008-06-10 16:19:51.000000000 +0400 +@@ -278,6 +278,7 @@ asmlinkage void do_undefinstr(struct pt_ + unsigned int instr; + struct undef_hook *hook; + siginfo_t info; ++ mm_segment_t fs; + void __user *pc; + + /* +@@ -287,12 +288,15 @@ asmlinkage void do_undefinstr(struct pt_ + */ + regs->ARM_pc -= correction; + ++ fs = get_fs(); ++ set_fs(KERNEL_DS); + pc = (void __user *)instruction_pointer(regs); + if (thumb_mode(regs)) { + get_user(instr, (u16 __user *)pc); + } else { + get_user(instr, (u32 __user *)pc); + } ++ set_fs(fs); + + spin_lock_irq(&undef_lock); + list_for_each_entry(hook, &undef_hook, node) { +@@ -684,6 +688,13 @@ EXPORT_SYMBOL(abort); + + void __init trap_init(void) + { ++#if defined(CONFIG_KGDB) ++ return; ++} ++ ++void __init early_trap_init(void) ++{ ++#endif + unsigned long vectors = CONFIG_VECTORS_BASE; + extern char __stubs_start[], __stubs_end[]; + extern char __vectors_start[], __vectors_end[]; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp2000/core.c linux-2.6.18.kgdb/arch/arm/mach-ixp2000/core.c +--- linux-2.6.18/arch/arm/mach-ixp2000/core.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mach-ixp2000/core.c 2008-06-10 16:19:51.000000000 +0400 +@@ -34,6 +34,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -184,6 +185,9 @@ static struct platform_device ixp2000_se + void __init ixp2000_uart_init(void) + { + platform_device_register(&ixp2000_serial_device); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &ixp2000_serial_port); ++#endif + } + + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp2000/ixdp2x01.c linux-2.6.18.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c +--- linux-2.6.18/arch/arm/mach-ixp2000/ixdp2x01.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mach-ixp2000/ixdp2x01.c 2008-06-10 16:19:51.000000000 +0400 +@@ -38,6 +38,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -413,6 +414,11 @@ static void __init ixdp2x01_init_machine + platform_add_devices(ixdp2x01_devices, ARRAY_SIZE(ixdp2x01_devices)); + ixp2000_uart_init(); + ixdp2x01_uart_init(); ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &ixdp425_serial_ports[0]); ++ kgdb8250_add_port(1, &ixdp425_serial_ports[1]); ++#endif + } + + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp4xx/coyote-setup.c linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c +--- 
linux-2.6.18/arch/arm/mach-ixp4xx/coyote-setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/coyote-setup.c 2008-06-10 16:19:51.000000000 +0400 +@@ -96,6 +96,10 @@ static void __init coyote_init(void) + } + + platform_add_devices(coyote_devices, ARRAY_SIZE(coyote_devices)); ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &coyote_serial_port); ++#endif + } + + #ifdef CONFIG_ARCH_ADI_COYOTE +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-ixp4xx/ixdp425-setup.c linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c +--- linux-2.6.18/arch/arm/mach-ixp4xx/ixdp425-setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mach-ixp4xx/ixdp425-setup.c 2008-06-10 16:19:51.000000000 +0400 +@@ -24,6 +24,7 @@ + #include + #include + #include ++#include + + static struct flash_platform_data ixdp425_flash_data = { + .map_name = "cfi_probe", +@@ -76,7 +77,8 @@ static struct plat_serial8250_port ixdp4 + .mapbase = IXP4XX_UART1_BASE_PHYS, + .membase = (char *)IXP4XX_UART1_BASE_VIRT + REG_OFFSET, + .irq = IRQ_IXP4XX_UART1, +- .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST, ++ .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | ++ UPF_SHARE_IRQ, + .iotype = UPIO_MEM, + .regshift = 2, + .uartclk = IXP4XX_UART_XTAL, +@@ -85,7 +87,8 @@ static struct plat_serial8250_port ixdp4 + .mapbase = IXP4XX_UART2_BASE_PHYS, + .membase = (char *)IXP4XX_UART2_BASE_VIRT + REG_OFFSET, + .irq = IRQ_IXP4XX_UART2, +- .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST, ++ .flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST | ++ UPF_SHARE_IRQ, + .iotype = UPIO_MEM, + .regshift = 2, + .uartclk = IXP4XX_UART_XTAL, +@@ -116,6 +119,11 @@ static void __init ixdp425_init(void) + IXP4XX_EXP_BUS_BASE(0) + ixp4xx_exp_bus_size - 1; + + platform_add_devices(ixdp425_devices, ARRAY_SIZE(ixdp425_devices)); ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &ixdp425_serial_ports[0]); ++ kgdb8250_add_port(1, &ixdp425_serial_ports[1]); ++#endif + } + + #ifdef CONFIG_ARCH_IXDP425 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-omap1/serial.c linux-2.6.18.kgdb/arch/arm/mach-omap1/serial.c +--- linux-2.6.18/arch/arm/mach-omap1/serial.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mach-omap1/serial.c 2008-06-10 16:19:51.000000000 +0400 +@@ -15,6 +15,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -199,6 +200,9 @@ void __init omap_serial_init(void) + break; + } + omap_serial_reset(&serial_platform_data[i]); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_platform_port(i, &serial_platform_data[i]); ++#endif + } + } + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-pxa/Makefile linux-2.6.18.kgdb/arch/arm/mach-pxa/Makefile +--- linux-2.6.18/arch/arm/mach-pxa/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mach-pxa/Makefile 2008-06-10 16:19:51.000000000 +0400 +@@ -31,6 +31,7 @@ obj-$(CONFIG_LEDS) += $(led-y) + # Misc features + obj-$(CONFIG_PM) += pm.o sleep.o + obj-$(CONFIG_PXA_SSP) += ssp.o ++obj-$(CONFIG_KGDB_PXA_SERIAL) += kgdb-serial.o + + ifeq ($(CONFIG_PXA27x),y) + obj-$(CONFIG_PM) += standby.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-pxa/kgdb-serial.c linux-2.6.18.kgdb/arch/arm/mach-pxa/kgdb-serial.c +--- linux-2.6.18/arch/arm/mach-pxa/kgdb-serial.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/arm/mach-pxa/kgdb-serial.c 2008-06-10 16:19:51.000000000 +0400 +@@ -0,0 +1,98 @@ ++/* ++ * linux/arch/arm/mach-pxa/kgdb-serial.c ++ * ++ 
* Provides low level kgdb serial support hooks for PXA2xx boards ++ * ++ * Author: Nicolas Pitre ++ * Copyright: (C) 2002-2005 MontaVista Software Inc. ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License version 2 as ++ * published by the Free Software Foundation. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#if defined(CONFIG_KGDB_PXA_FFUART) ++ ++#define UART FFUART ++#define CKEN_UART CKEN6_FFUART ++#define GPIO_RX_MD GPIO34_FFRXD_MD ++#define GPIO_TX_MD GPIO39_FFTXD_MD ++ ++#elif defined(CONFIG_KGDB_PXA_BTUART) ++ ++#define UART BTUART ++#define CKEN_UART CKEN7_BTUART ++#define GPIO_RX_MD GPIO42_BTRXD_MD ++#define GPIO_TX_MD GPIO43_BTTXD_MD ++ ++#elif defined(CONFIG_KGDB_PXA_STUART) ++ ++#define UART STUART ++#define CKEN_UART CKEN5_STUART ++#define GPIO_RX_MD GPIO46_STRXD_MD ++#define GPIO_TX_MD GPIO47_STTXD_MD ++ ++#endif ++ ++#define UART_BAUDRATE (CONFIG_KGDB_BAUDRATE) ++ ++static volatile unsigned long *port = (unsigned long *)&UART; ++ ++static int kgdb_serial_init(void) ++{ ++ pxa_set_cken(CKEN_UART, 1); ++ pxa_gpio_mode(GPIO_RX_MD); ++ pxa_gpio_mode(GPIO_TX_MD); ++ ++ port[UART_IER] = 0; ++ port[UART_LCR] = LCR_DLAB; ++ port[UART_DLL] = ((921600 / UART_BAUDRATE) & 0xff); ++ port[UART_DLM] = ((921600 / UART_BAUDRATE) >> 8); ++ port[UART_LCR] = LCR_WLS1 | LCR_WLS0; ++ port[UART_MCR] = 0; ++ port[UART_IER] = IER_UUE; ++ port[UART_FCR] = FCR_ITL_16; ++ ++ return 0; ++} ++ ++static void kgdb_serial_putchar(int c) ++{ ++ if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE) ++ kgdb_serial_init(); ++ while (!(port[UART_LSR] & LSR_TDRQ)) ++ cpu_relax(); ++ port[UART_TX] = c; ++} ++ ++static void kgdb_serial_flush(void) ++{ ++ if ((CKEN & CKEN_UART) && (port[UART_IER] & IER_UUE)) ++ while (!(port[UART_LSR] & LSR_TEMT)) ++ cpu_relax(); ++} ++ ++static int kgdb_serial_getchar(void) ++{ ++ unsigned char c; ++ if (!(CKEN & CKEN_UART) || port[UART_IER] != IER_UUE) ++ kgdb_serial_init(); ++ while (!(port[UART_LSR] & UART_LSR_DR)) ++ cpu_relax(); ++ c = port[UART_RX]; ++ return c; ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .init = kgdb_serial_init, ++ .write_char = kgdb_serial_putchar, ++ .flush = kgdb_serial_flush, ++ .read_char = kgdb_serial_getchar, ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mach-versatile/kgdb_serial.c linux-2.6.18.kgdb/arch/arm/mach-versatile/kgdb_serial.c +--- linux-2.6.18/arch/arm/mach-versatile/kgdb_serial.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/arm/mach-versatile/kgdb_serial.c 2008-06-10 16:19:51.000000000 +0400 +@@ -0,0 +1,121 @@ ++/* ++ * arch/arm/mach-versatile/kgdb_serial.c ++ * ++ * Author: Manish Lachwani, mlachwani@mvista.com ++ * ++ * 2005 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ * ++ * Support for KGDB on ARM Versatile. 
++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define ARM_BAUD_38400 23 ++/* ++ * Functions that will be used later ++ */ ++#define UART_GET_INT_STATUS(p) readb((p) + UART010_IIR) ++#define UART_GET_MIS(p) readw((p) + UART011_MIS) ++#define UART_PUT_ICR(p, c) writel((c), (p) + UART010_ICR) ++#define UART_GET_FR(p) readb((p) + UART01x_FR) ++#define UART_GET_CHAR(p) readb((p) + UART01x_DR) ++#define UART_PUT_CHAR(p, c) writel((c), (p) + UART01x_DR) ++#define UART_GET_RSR(p) readb((p) + UART01x_RSR) ++#define UART_GET_CR(p) readb((p) + UART010_CR) ++#define UART_PUT_CR(p,c) writel((c), (p) + UART010_CR) ++#define UART_GET_LCRL(p) readb((p) + UART010_LCRL) ++#define UART_PUT_LCRL(p,c) writel((c), (p) + UART010_LCRL) ++#define UART_GET_LCRM(p) readb((p) + UART010_LCRM) ++#define UART_PUT_LCRM(p,c) writel((c), (p) + UART010_LCRM) ++#define UART_GET_LCRH(p) readb((p) + UART010_LCRH) ++#define UART_PUT_LCRH(p,c) writel((c), (p) + UART010_LCRH) ++#define UART_RX_DATA(s) (((s) & UART01x_FR_RXFE) == 0) ++#define UART_TX_READY(s) (((s) & UART01x_FR_TXFF) == 0) ++#define UART_TX_EMPTY(p) ((UART_GET_FR(p) & UART01x_FR_TMSK) == 0) ++ ++/* ++ * KGDB IRQ ++ */ ++static int kgdb_irq = 12; ++static volatile unsigned char *port = NULL; ++ ++static int kgdb_serial_init(void) ++{ ++ int rate = ARM_BAUD_38400; ++ ++ port = IO_ADDRESS(0x101F1000); ++ UART_PUT_CR(port, 0); ++ ++ /* Set baud rate */ ++ UART_PUT_LCRM(port, ((rate & 0xf00) >> 8)); ++ UART_PUT_LCRL(port, (rate & 0xff)); ++ UART_PUT_LCRH(port, UART01x_LCRH_WLEN_8 | UART01x_LCRH_FEN); ++ UART_PUT_CR(port, UART01x_CR_UARTEN); ++ ++ return 0; ++} ++ ++static void kgdb_serial_putchar(int ch) ++{ ++ unsigned int status; ++ ++ do { ++ status = UART_GET_FR(port); ++ } while (!UART_TX_READY(status)); ++ ++ UART_PUT_CHAR(port, ch); ++} ++ ++static int kgdb_serial_getchar(void) ++{ ++ unsigned int status; ++ int ch; ++ ++ do { ++ status = UART_GET_FR(port); ++ } while (!UART_RX_DATA(status)); ++ ch = UART_GET_CHAR(port); ++ return ch; ++} ++ ++static struct uart_port kgdb_amba_port = { ++ .irq = 12, ++ .iobase = 0, ++ .iotype = UPIO_MEM, ++ .membase = (unsigned char *)IO_ADDRESS(0x101F1000), ++}; ++ ++static irqreturn_t kgdb_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ int status = UART_GET_MIS(port); ++ ++ if (irq != kgdb_irq) ++ return IRQ_NONE; ++ ++ if (status & 0x40) ++ breakpoint(); ++ ++ return IRQ_HANDLED; ++} ++ ++static void __init kgdb_hookup_irq(void) ++{ ++ request_irq(kgdb_irq, kgdb_interrupt, SA_SHIRQ, "GDB-stub", ++ &kgdb_amba_port); ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .init = kgdb_serial_init, ++ .write_char = kgdb_serial_putchar, ++ .read_char = kgdb_serial_getchar, ++ .late_init = kgdb_hookup_irq, ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/arm/mm/extable.c linux-2.6.18.kgdb/arch/arm/mm/extable.c +--- linux-2.6.18/arch/arm/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/arm/mm/extable.c 2008-06-10 16:19:51.000000000 +0400 +@@ -2,6 +2,7 @@ + * linux/arch/arm/mm/extable.c + */ + #include ++#include + #include + + int fixup_exception(struct pt_regs *regs) +@@ -11,6 +12,12 @@ int fixup_exception(struct pt_regs *regs + fixup = search_exception_tables(instruction_pointer(regs)); + if (fixup) + regs->ARM_pc = fixup->fixup; ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. 
*/ ++#endif + + return fixup != NULL; + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/Makefile linux-2.6.18.kgdb/arch/i386/kernel/Makefile +--- linux-2.6.18/arch/i386/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/Makefile 2008-06-10 16:19:17.000000000 +0400 +@@ -39,6 +39,7 @@ obj-$(CONFIG_VM86) += vm86.o + obj-$(CONFIG_EARLY_PRINTK) += early_printk.o + obj-$(CONFIG_HPET_TIMER) += hpet.o + obj-$(CONFIG_K8_NB) += k8.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + + EXTRA_AFLAGS := -traditional + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/entry.S linux-2.6.18.kgdb/arch/i386/kernel/entry.S +--- linux-2.6.18/arch/i386/kernel/entry.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/entry.S 2008-06-10 16:19:58.000000000 +0400 +@@ -201,7 +201,7 @@ VM_MASK = 0x00020000 + CFI_OFFSET ecx, ECX-OLDESP;\ + CFI_OFFSET ebx, EBX-OLDESP + +-ENTRY(ret_from_fork) ++KPROBE_ENTRY(ret_from_fork) + CFI_STARTPROC + pushl %eax + CFI_ADJUST_CFA_OFFSET 4 +@@ -664,7 +664,7 @@ ENTRY(simd_coprocessor_error) + jmp error_code + CFI_ENDPROC + +-ENTRY(device_not_available) ++KPROBE_ENTRY(device_not_available) + RING0_INT_FRAME + pushl $-1 # mark this as an int + CFI_ADJUST_CFA_OFFSET 4 +@@ -909,7 +909,7 @@ ENTRY(machine_check) + CFI_ENDPROC + #endif + +-ENTRY(spurious_interrupt_bug) ++KPROBE_ENTRY(spurious_interrupt_bug) + RING0_INT_FRAME + pushl $0 + CFI_ADJUST_CFA_OFFSET 4 +@@ -953,3 +953,108 @@ ENDPROC(arch_unwind_init_running) + #include "syscall_table.S" + + syscall_table_size=(.-sys_call_table) ++ ++# Here we do call frames. We cheat a bit as we only really need ++# correct frames at locations we can actually look at from a ++# debugger. Since the break instruction trap actually goes thru ++# some of this code, we don't really need info on those areas, but ++# only after the fact. I.e. if we can not step or break in a ++# location or end up with a return address pointing at the ++# location, we don't need a correct call frame for it. ++ ++#ifdef CONFIG_KGDB ++ ++#include ++/* ++ * The register numbers as known by gdb ++ */ ++ ++#define _EAX 0 ++#define _ECX 1 ++#define _EDX 2 ++#define _EBX 3 ++#define _ESP 4 ++#define _EBP 5 ++#define _ESI 6 ++#define _EDI 7 ++#define _PC 8 ++#define _EIP 8 ++#define _PS 9 ++#define _EFLAGS 9 ++#define _CS 10 ++#define _SS 11 ++#define _DS 12 ++#define _ES 13 ++#define _FS 14 ++#define _GS 15 ++ /* ++ * This code uses macros defined in linux/dwarf2-lang.h ++ * They attempt to follow the dwarf2 naming conventions... sort of.. 
++ */ ++ENTRY(end_of_stack_stop_unwind_function) ++ .long end_of_stack_stop_unwind_function+1 ++ ++ .text ++ ++ CFI_preamble(c1,_PC,1,1) ++ CFA_define_reference(_ESP,OLDESP) /* Stack pointer */ ++ CFA_expression(_EIP) ++ CFA_exp_OP_dup /* copy old esp */ ++ CFA_exp_OP_consts(CS-OLDESP) /* offset to CS address */ ++ CFA_exp_OP_plus /* should be CS address */ ++ CFA_exp_OP_deref /* get the CS */ ++ CFA_exp_OP_const4s(VM_MASK|3) /* prepare to mask it */ ++ CFA_exp_OP_and /* mask it, zero means kernel */ ++ CFA_exp_OP_bra(eip_user_rtn) /* branch if user */ ++ CFA_exp_OP_const4s(EIP-OLDESP) /* offset to return address */ ++ CFA_exp_OP_plus /* add that in */ ++ CFA_exp_OP_skip(eip_end) /* done if kernel, skip out */ ++eip_user_rtn: ++ CFA_exp_OP_addr(end_of_stack_stop_unwind_function)/*dummy function */ ++eip_end: ++ CFA_expression_end ++ CFA_define_offset(_EBX,EBX-OLDESP) ++ CFA_define_offset(_ECX,ECX-OLDESP) ++ CFA_define_offset(_EDX,EDX-OLDESP) ++ CFA_define_offset(_ESI,ESI-OLDESP) ++ CFA_define_offset(_EDI,EDI-OLDESP) ++ CFA_define_offset(_EBP,EBP-OLDESP) ++ CFA_define_offset(_EAX,EAX-OLDESP) ++ CFA_define_offset(_EFLAGS,EFLAGS-OLDESP) ++ CFI_postamble() ++ ++/* ++ * This provides an uwind for our dummy end of unwind function. ++ * Current convention is to provied an undefined return address. ++ */ ++ CFI_preamble(c2,_PC,1,1) ++ CFA_define_reference(_ESP,0) /* Stack pointer */ ++ CFA_undefine_reg(_EIP) ++ CFI_postamble() ++ ++ FDE_preamble(c2,end_of_stack_stop_unwind_function, \ ++ end_of_stack_stop_unwind_function+5) ++ FDE_postamble() ++ /* ++ * This is VERY sloppy. At this point all we want to do is get ++ * the frame right for back tracing. It will not be good if ++ * you try to single step. We use already defined labels. ++ * We want to cover all call outs. ++ * We could also recode this as just one FDE, but this works and ++ * I want to get it out. ++ */ ++ FDE_preamble(c1,ret_from_fork,ret_from_exception) ++ CFA_define_cfa_offset(4) /* one extra word on stack */ ++ FDE_postamble() ++ ++ FDE_preamble(c1,ret_from_exception,device_not_available_emulate) ++ FDE_postamble() ++ ++ FDE_preamble(c1,device_not_available_emulate,debug) ++ CFA_define_cfa_offset(4) /* one extra word on stack */ ++ FDE_postamble() ++ ++ FDE_preamble(c1, debug,spurious_interrupt_bug) ++ FDE_postamble() ++ ++#endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/head.S linux-2.6.18.kgdb/arch/i386/kernel/head.S +--- linux-2.6.18/arch/i386/kernel/head.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/head.S 2008-06-10 16:19:58.000000000 +0400 +@@ -10,6 +10,7 @@ + .text + #include + #include ++#include + #include + #include + #include +@@ -326,6 +327,10 @@ is386: movl $2,%ecx # set MP + #endif /* CONFIG_SMP */ + jmp start_kernel + ++ /* This dwarf code tells gdb that this is the end of the unwind */ ++ /* This uses the CFA set up for pc=1 located in entry.S */ ++ CFI_END_FRAME(is386) ++ + /* + * We depend on ET to be correct. This checks for 287/387. + */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/i386/kernel/kgdb-jmp.S +--- linux-2.6.18/arch/i386/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/i386/kernel/kgdb-jmp.S 2008-06-10 16:19:17.000000000 +0400 +@@ -0,0 +1,74 @@ ++/* ++ * arch/i386/kernel/kgdb-jmp.S ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. 
++ * ++ * Author: George Anzinger ++ * ++ * Cribbed from glibc, which carries the following: ++ * Copyright (C) 1996, 1996, 1997, 2000, 2001 Free Software Foundation, Inc. ++ * Copyright (C) 2005 by MontaVista Software. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. ++ */ ++ ++#include ++ ++#define PCOFF 0 ++#define LINKAGE 4 /* just the return address */ ++#define PTR_SIZE 4 ++#define PARMS LINKAGE /* no space for saved regs */ ++#define JMPBUF PARMS ++#define VAL JMPBUF+PTR_SIZE ++ ++#define JB_BX 0 ++#define JB_SI 1 ++#define JB_DI 2 ++#define JB_BP 3 ++#define JB_SP 4 ++#define JB_PC 5 ++ ++/* This must be called prior to kgdb_fault_longjmp and ++ * kgdb_fault_longjmp must not be called outside of the context of the ++ * last call to kgdb_fault_setjmp. ++ * kgdb_fault_setjmp(int *jmp_buf[6]) ++ */ ++ENTRY(kgdb_fault_setjmp) ++ movl JMPBUF(%esp), %eax ++ ++ /* Save registers. */ ++ movl %ebx, (JB_BX*4)(%eax) ++ movl %esi, (JB_SI*4)(%eax) ++ movl %edi, (JB_DI*4)(%eax) ++ /* Save SP as it will be after we return. */ ++ leal JMPBUF(%esp), %ecx ++ movl %ecx, (JB_SP*4)(%eax) ++ movl PCOFF(%esp), %ecx /* Save PC we are returning to now. */ ++ movl %ecx, (JB_PC*4)(%eax) ++ movl %ebp, (JB_BP*4)(%eax) /* Save caller's frame pointer. */ ++ ++ /* Restore state so we can now try the access. */ ++ movl JMPBUF(%esp), %ecx /* User's jmp_buf in %ecx. */ ++ /* Save the return address now. */ ++ movl (JB_PC*4)(%ecx), %edx ++ /* Restore registers. */ ++ movl $0, %eax ++ movl (JB_SP*4)(%ecx), %esp ++ jmp *%edx /* Jump to saved PC. */ ++ ++/* kgdb_fault_longjmp(int *jmp_buf[6]) */ ++ENTRY(kgdb_fault_longjmp) ++ movl JMPBUF(%esp), %ecx /* User's jmp_buf in %ecx. */ ++ /* Save the return address now. */ ++ movl (JB_PC*4)(%ecx), %edx ++ /* Restore registers. */ ++ movl (JB_BX*4)(%ecx), %ebx ++ movl (JB_SI*4)(%ecx), %esi ++ movl (JB_DI*4)(%ecx), %edi ++ movl (JB_BP*4)(%ecx), %ebp ++ movl $1, %eax ++ movl (JB_SP*4)(%ecx), %esp ++ jmp *%edx /* Jump to saved PC. */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/kgdb.c linux-2.6.18.kgdb/arch/i386/kernel/kgdb.c +--- linux-2.6.18/arch/i386/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/i386/kernel/kgdb.c 2008-06-10 16:20:15.000000000 +0400 +@@ -0,0 +1,363 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ */ ++/* ++ * Contributor: Lake Stevens Instrument Division$ ++ * Written by: Glenn Engel $ ++ * Updated by: Amit Kale ++ * Updated by: Tom Rini ++ * Modified for 386 by Jim Kingdon, Cygnus Support. 
++ * Original kgdb, compatibility with 2.1.xx kernel by
++ * David Grothe
++ * Additional support from Tigran Aivazian
++ */
++
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include
++#include /* for linux pt_regs struct */
++#include
++#include
++#include
++#include
++#include
++
++#include "mach_ipi.h"
++
++/* Put the error code here just in case the user cares. */
++int gdb_i386errcode;
++/* Likewise, the vector number here (since GDB only gets the signal
++   number through the usual means, and that's not very specific). */
++int gdb_i386vector = -1;
++
++extern atomic_t cpu_doing_single_step;
++
++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++	gdb_regs[_EAX] = regs->eax;
++	gdb_regs[_EBX] = regs->ebx;
++	gdb_regs[_ECX] = regs->ecx;
++	gdb_regs[_EDX] = regs->edx;
++	gdb_regs[_ESI] = regs->esi;
++	gdb_regs[_EDI] = regs->edi;
++	gdb_regs[_EBP] = regs->ebp;
++	gdb_regs[_DS] = regs->xds;
++	gdb_regs[_ES] = regs->xes;
++	gdb_regs[_PS] = regs->eflags;
++	gdb_regs[_CS] = regs->xcs;
++	gdb_regs[_PC] = regs->eip;
++	gdb_regs[_ESP] = (int)(&regs->esp);
++	gdb_regs[_SS] = __KERNEL_DS;
++	gdb_regs[_FS] = 0xFFFF;
++	gdb_regs[_GS] = 0xFFFF;
++}
++
++/*
++ * Extracts ebp, esp and eip values understandable by gdb from the values
++ * saved by switch_to.
++ * thread.esp points to ebp. flags and ebp are pushed in switch_to hence esp
++ * prior to entering switch_to is 8 greater than the value that is saved.
++ * If switch_to changes, change following code appropriately.
++ */
++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p)
++{
++	gdb_regs[_EAX] = 0;
++	gdb_regs[_EBX] = 0;
++	gdb_regs[_ECX] = 0;
++	gdb_regs[_EDX] = 0;
++	gdb_regs[_ESI] = 0;
++	gdb_regs[_EDI] = 0;
++	gdb_regs[_EBP] = *(unsigned long *)p->thread.esp;
++	gdb_regs[_DS] = __KERNEL_DS;
++	gdb_regs[_ES] = __KERNEL_DS;
++	gdb_regs[_PS] = 0;
++	gdb_regs[_CS] = __KERNEL_CS;
++	gdb_regs[_PC] = p->thread.eip;
++	gdb_regs[_ESP] = p->thread.esp;
++	gdb_regs[_SS] = __KERNEL_DS;
++	gdb_regs[_FS] = 0xFFFF;
++	gdb_regs[_GS] = 0xFFFF;
++}
++
++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs)
++{
++	regs->eax = gdb_regs[_EAX];
++	regs->ebx = gdb_regs[_EBX];
++	regs->ecx = gdb_regs[_ECX];
++	regs->edx = gdb_regs[_EDX];
++	regs->esi = gdb_regs[_ESI];
++	regs->edi = gdb_regs[_EDI];
++	regs->ebp = gdb_regs[_EBP];
++	regs->xds = gdb_regs[_DS];
++	regs->xes = gdb_regs[_ES];
++	regs->eflags = gdb_regs[_PS];
++	regs->xcs = gdb_regs[_CS];
++	regs->eip = gdb_regs[_PC];
++}
++
++static struct hw_breakpoint {
++	unsigned enabled;
++	unsigned type;
++	unsigned len;
++	unsigned addr;
++} breakinfo[4] = {
++	{ .enabled = 0 },
++	{ .enabled = 0 },
++	{ .enabled = 0 },
++	{ .enabled = 0 },
++};
++
++void kgdb_correct_hw_break(void)
++{
++	int breakno;
++	int correctit;
++	int breakbit;
++	unsigned dr7;
++
++	asm volatile ("movl %%db7, %0\n":"=r" (dr7)
++		      :);
++	do {
++		unsigned addr0, addr1, addr2, addr3;
++		asm volatile ("movl %%db0, %0\n"
++			      "movl %%db1, %1\n"
++			      "movl %%db2, %2\n"
++			      "movl %%db3, %3\n":"=r" (addr0), "=r"(addr1),
++			      "=r"(addr2), "=r"(addr3):);
++	} while (0);
++	correctit = 0;
++	for (breakno = 0; breakno < 3; breakno++) {
++		breakbit = 2 << (breakno << 1);
++		if (!(dr7 & breakbit) && breakinfo[breakno].enabled) {
++			correctit = 1;
++			dr7 |= breakbit;
++			dr7 &= ~(0xf0000 << (breakno << 2));
++			dr7 |= (((breakinfo[breakno].len << 2) |
++				 breakinfo[breakno].type) << 16) <<
++			       (breakno << 2);
++			switch (breakno) {
++			case 0:
++				asm volatile ("movl
%0, %%dr0\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 1: ++ asm volatile ("movl %0, %%dr1\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 2: ++ asm volatile ("movl %0, %%dr2\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 3: ++ asm volatile ("movl %0, %%dr3\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ } ++ } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { ++ correctit = 1; ++ dr7 &= ~breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ } ++ } ++ if (correctit) ++ asm volatile ("movl %0, %%db7\n"::"r" (dr7)); ++} ++ ++int kgdb_remove_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (breakinfo[i].addr == addr && breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 0; ++ return 0; ++} ++ ++void kgdb_remove_all_hw_break(void) ++{ ++ int i; ++ ++ for (i = 0; i < 4; i++) { ++ if (breakinfo[i].enabled) { ++ /* Do what? */ ++ ; ++ } ++ memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); ++ } ++} ++ ++int kgdb_set_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (!breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 1; ++ breakinfo[idx].type = 1; ++ breakinfo[idx].len = 1; ++ breakinfo[idx].addr = addr; ++ return 0; ++} ++ ++void kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++ /* Disable hardware debugging while we are in kgdb */ ++ asm volatile ("movl %0,%%db7": /* no output */ :"r" (0)); ++} ++ ++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code) ++{ ++ /* Master processor is completely in the debugger */ ++ gdb_i386vector = e_vector; ++ gdb_i386errcode = err_code; ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++ ++int kgdb_arch_handle_exception(int e_vector, int signo, ++ int err_code, char *remcom_in_buffer, ++ char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ long addr; ++ char *ptr; ++ int newPC, dr6; ++ ++ switch (remcom_in_buffer[0]) { ++ case 'c': ++ case 's': ++ /* try to read optional parameter, pc unchanged if no parm */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) ++ linux_regs->eip = addr; ++ newPC = linux_regs->eip; ++ ++ /* clear the trace bit */ ++ linux_regs->eflags &= ~TF_MASK; ++ atomic_set(&cpu_doing_single_step, -1); ++ ++ /* set the trace bit if we're stepping */ ++ if (remcom_in_buffer[0] == 's') { ++ linux_regs->eflags |= TF_MASK; ++ debugger_step = 1; ++ atomic_set(&cpu_doing_single_step,smp_processor_id()); ++ } ++ ++ asm volatile ("movl %%db6, %0\n":"=r" (dr6)); ++ if (!(dr6 & 0x4000)) { ++ long breakno; ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno) && ++ breakinfo[breakno].type == 0) { ++ /* Set restore flag */ ++ linux_regs->eflags |= X86_EFLAGS_RF; ++ break; ++ } ++ } ++ } ++ kgdb_correct_hw_break(); ++ asm volatile ("movl %0, %%db6\n"::"r" (0)); ++ ++ return (0); ++ } /* switch */ ++ /* this means that we do not want to exit from the handler */ ++ return -1; ++} ++ ++/* Register KGDB with the i386die_chain so that we hook into all of the right ++ * spots. */ ++static int kgdb_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = ptr; ++ struct pt_regs *regs = args->regs; ++ ++ /* Bad memory access? 
*/ ++ if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active) ++ && kgdb_may_fault) { ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ return NOTIFY_STOP; ++ } else if (cmd == DIE_PAGE_FAULT) ++ /* A normal page fault, ignore. */ ++ return NOTIFY_DONE; ++ else if ((cmd == DIE_NMI || cmd == DIE_NMI_IPI || ++ cmd == DIE_NMIWATCHDOG) && atomic_read(&debugger_active)) { ++ /* CPU roundup */ ++ kgdb_nmihook(smp_processor_id(), regs); ++ return NOTIFY_STOP; ++ } else if (cmd == DIE_NMI_IPI || cmd == DIE_NMI || user_mode(regs) || ++ (cmd == DIE_DEBUG && atomic_read(&debugger_active))) ++ /* Normal watchdog event or userspace debugging, or spurious ++ * debug exception, ignore. */ ++ return NOTIFY_DONE; ++ ++ kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); ++ ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_notify, ++}; ++ ++int kgdb_arch_init(void) ++{ ++ atomic_notifier_chain_register(&i386die_chain, &kgdb_notifier); ++ return 0; ++} ++ ++/* ++ * Skip an int3 exception when it occurs after a breakpoint has been ++ * removed. Backtrack eip by 1 since the int3 would have caused it to ++ * increment by 1. ++ */ ++ ++int kgdb_skipexception(int exception, struct pt_regs *regs) ++{ ++ if (exception == 3 && kgdb_isremovedbreak(regs->eip - 1)) { ++ regs->eip -= 1; ++ return 1; ++ } ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0xcc}, ++ .flags = KGDB_HW_BREAKPOINT, ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/process.c linux-2.6.18.kgdb/arch/i386/kernel/process.c +--- linux-2.6.18/arch/i386/kernel/process.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/process.c 2008-06-10 16:19:58.000000000 +0400 +@@ -328,7 +328,27 @@ __asm__(".section .text\n" + "call *%ebx\n\t" + "pushl %eax\n\t" + "call do_exit\n" ++ "kernel_thread_helper_end:\n\t" + ".previous"); ++#ifdef CONFIG_KGDB ++#include ++ ++ /* This dwarf code tells gdb that this is the end of the unwind */ ++ /* This uses the CFA set up for pc=1 located in entry.S */ ++#define _ESP 4 ++#define _PC 8 ++#define _EIP 8 ++__asm__( ++ QUOTE_THIS( ++ CFI_preamble(dwarf_4,_PC,1,1) ++ CFA_define_reference(_ESP,0) /* Stack pointer */ ++ CFA_undefine_reg(_EIP) ++ CFI_postamble() ++ ++ FDE_preamble(dwarf_4,kernel_thread_helper,kernel_thread_helper_end) ++ FDE_postamble() ++ )); ++#endif + + /* + * Create a kernel thread +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/setup.c linux-2.6.18.kgdb/arch/i386/kernel/setup.c +--- linux-2.6.18/arch/i386/kernel/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/setup.c 2008-06-10 16:19:17.000000000 +0400 +@@ -148,6 +148,7 @@ EXPORT_SYMBOL(ist_info); + struct e820map e820; + + extern void early_cpu_init(void); ++extern void early_trap_init(void); + extern void generic_apic_probe(char *); + extern int root_mountflags; + +@@ -1444,6 +1445,7 @@ void __init setup_arch(char **cmdline_p) + memcpy(&boot_cpu_data, &new_cpu_data, sizeof(new_cpu_data)); + pre_setup_arch_hook(); + early_cpu_init(); ++ early_trap_init(); + + /* + * FIXME: This isn't an official loader_type right +@@ -1500,6 +1502,7 @@ void __init setup_arch(char **cmdline_p) + data_resource.end = virt_to_phys(_edata)-1; + + parse_cmdline_early(cmdline_p); ++ parse_early_param(); + + #ifdef CONFIG_EARLY_PRINTK + { +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/smpboot.c linux-2.6.18.kgdb/arch/i386/kernel/smpboot.c 
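The i386 stub above advertises a one-byte breakpoint instruction (gdb_bpt_instr = {0xcc}, i.e. int3) and, in kgdb_skipexception(), rewinds eip by one when a trap arrives for a breakpoint that was already removed. A minimal sketch of the plant/remove cycle behind that convention, reusing the kgdb_get_mem()/kgdb_set_mem() accessors this patch uses elsewhere; the sw_break struct and both helpers are illustrative only, not patch code:

/* Sketch: software breakpoints as the arch hooks handle them.  int3 is
 * a single byte (0xcc); the trap pushes the address of the byte *after*
 * it, hence the eip -= 1 in kgdb_skipexception() above. */
struct sw_break {
	unsigned long addr;	/* where the 0xcc byte was written */
	char saved;		/* displaced original byte */
};

static int plant_break(struct sw_break *bp, unsigned long addr)
{
	char int3 = 0xcc;

	if (kgdb_get_mem((char *)addr, &bp->saved, 1) < 0)
		return -1;	/* address not readable, refuse */
	bp->addr = addr;
	return kgdb_set_mem((char *)addr, &int3, 1);
}

static int remove_break(struct sw_break *bp)
{
	/* put the displaced opcode byte back before resuming */
	return kgdb_set_mem((char *)bp->addr, &bp->saved, 1);
}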
+--- linux-2.6.18/arch/i386/kernel/smpboot.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/smpboot.c 2008-06-10 16:19:58.000000000 +0400 +@@ -592,6 +592,9 @@ void __devinit initialize_secondary(void + + asm volatile( + "movl %0,%%esp\n\t" ++#ifdef CONFIG_KGDB ++ "pushl end_of_stack_stop_unwind_function\n\t" ++#endif + "jmp *%1" + : + :"r" (current->thread.esp),"r" (current->thread.eip)); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/kernel/traps.c linux-2.6.18.kgdb/arch/i386/kernel/traps.c +--- linux-2.6.18/arch/i386/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/kernel/traps.c 2008-06-10 16:19:17.000000000 +0400 +@@ -863,6 +863,7 @@ fastcall void __kprobes do_debug(struct + */ + clear_dr7: + set_debugreg(0, 7); ++ notify_die(DIE_DEBUG, "debug2", regs, condition, error_code, SIGTRAP); + return; + + debug_vm86: +@@ -1167,6 +1168,12 @@ static void __init set_task_gate(unsigne + _set_gate(idt_table+n,5,0,0,(gdt_entry<<3)); + } + ++/* Some traps need to be set early. */ ++void __init early_trap_init(void) { ++ set_intr_gate(1,&debug); ++ set_system_intr_gate(3, &int3); /* int3 can be called from all */ ++ set_intr_gate(14,&page_fault); ++} + + void __init trap_init(void) + { +@@ -1183,10 +1190,8 @@ void __init trap_init(void) + #endif + + set_trap_gate(0,&divide_error); +- set_intr_gate(1,&debug); + set_intr_gate(2,&nmi); +- set_system_intr_gate(3, &int3); /* int3/4 can be called from all */ +- set_system_gate(4,&overflow); ++ set_system_gate(4,&overflow); /* int4/5 can be called from all */ + set_trap_gate(5,&bounds); + set_trap_gate(6,&invalid_op); + set_trap_gate(7,&device_not_available); +@@ -1196,7 +1201,6 @@ void __init trap_init(void) + set_trap_gate(11,&segment_not_present); + set_trap_gate(12,&stack_segment); + set_trap_gate(13,&general_protection); +- set_intr_gate(14,&page_fault); + set_trap_gate(15,&spurious_interrupt_bug); + set_trap_gate(16,&coprocessor_error); + set_trap_gate(17,&alignment_check); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/i386/mm/fault.c linux-2.6.18.kgdb/arch/i386/mm/fault.c +--- linux-2.6.18/arch/i386/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/i386/mm/fault.c 2008-06-10 16:19:17.000000000 +0400 +@@ -539,6 +539,10 @@ no_context: + if (is_prefetch(regs, address, error_code)) + return; + ++ if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs, ++ error_code, 14, SIGSEGV) == NOTIFY_STOP) ++ return; ++ + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/Makefile linux-2.6.18.kgdb/arch/ia64/kernel/Makefile +--- linux-2.6.18/arch/ia64/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/Makefile 2008-06-10 16:19:32.000000000 +0400 +@@ -31,6 +31,7 @@ obj-$(CONFIG_KPROBES) += kprobes.o jpro + obj-$(CONFIG_IA64_UNCACHED_ALLOCATOR) += uncached.o + obj-$(CONFIG_AUDIT) += audit.o + mca_recovery-y += mca_drv.o mca_drv_asm.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + + # The gate DSO image is built using a special linker script.
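Both fault.c hunks in this patch (the i386 one above, and the ia64 one later) report in-kernel faults that have no exception fixup as DIE_PAGE_FAULT_NO_CONTEXT, which kgdb_notify() converts into kgdb_fault_longjmp() whenever kgdb_may_fault is set. A rough sketch of the guarded-access pattern this enables, assuming kgdb_fault_setjmp() follows setjmp() semantics (0 when armed, non-zero when re-entered via the longjmp); probe_mem() itself is hypothetical, not patch code:

#include <linux/string.h>	/* memcpy */

extern int kgdb_may_fault;
extern unsigned long kgdb_fault_jmp_regs[];
extern int kgdb_fault_setjmp(unsigned long *regs);

/* Copy len bytes from a debugger-supplied address that may be bogus;
 * a fault lands back at the setjmp with a non-zero return instead of
 * oopsing the kernel. */
static int probe_mem(char *dst, const char *src, int len)
{
	int ret = len;

	kgdb_may_fault = 1;
	if (kgdb_fault_setjmp(kgdb_fault_jmp_regs) == 0)
		memcpy(dst, src, len);	/* may fault */
	else
		ret = -1;		/* fault path longjmp'ed here */
	kgdb_may_fault = 0;
	return ret;
}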
+ targets += gate.so gate-syms.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/entry.S linux-2.6.18.kgdb/arch/ia64/kernel/entry.S +--- linux-2.6.18/arch/ia64/kernel/entry.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/entry.S 2008-06-10 16:20:23.000000000 +0400 +@@ -953,9 +953,9 @@ GLOBAL_ENTRY(ia64_leave_kernel) + shr.u r18=r19,16 // get byte size of existing "dirty" partition + ;; + mov r16=ar.bsp // get existing backing store pointer +- addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 ++(pUStk) addl r17=THIS_CPU(ia64_phys_stacked_size_p8),r0 + ;; +- ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 ++(pUStk) ld4 r17=[r17] // r17 = cpu_data->phys_stacked_size_p8 + (pKStk) br.cond.dpnt skip_rbs_switch + + /* +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/ivt.S linux-2.6.18.kgdb/arch/ia64/kernel/ivt.S +--- linux-2.6.18/arch/ia64/kernel/ivt.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/ivt.S 2008-06-10 16:20:23.000000000 +0400 +@@ -52,6 +52,14 @@ + #include + #include + ++#ifdef CONFIG_KGDB ++#define KGDB_ENABLE_PSR_DB mov r31=psr;; movl r30=IA64_PSR_DB;; \ ++ or r31=r31,r30;; \ ++ mov psr.l=r31;; srlz.i;; ++#else ++#define KGDB_ENABLE_PSR_DB ++#endif ++ + #if 1 + # define PSR_DEFAULT_BITS psr.ac + #else +@@ -519,6 +527,7 @@ ENTRY(page_fault) + movl r14=ia64_leave_kernel + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + mov rp=r14 + ;; + adds out2=16,r12 // out2 = pointer to pt_regs +@@ -863,6 +872,7 @@ ENTRY(interrupt) + srlz.i // ensure everybody knows psr.ic is back on + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + ;; + MCA_RECOVER_RANGE(interrupt) + alloc r14=ar.pfs,0,0,2,0 // must be first in an insn group +@@ -1110,6 +1120,7 @@ ENTRY(non_syscall) + movl r15=ia64_leave_kernel + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + mov rp=r15 + ;; + br.call.sptk.many b6=ia64_bad_break // avoid WAW on CFM and ignore return addr +@@ -1143,6 +1154,7 @@ ENTRY(dispatch_unaligned_handler) + adds r3=8,r2 // set up second base pointer + ;; + SAVE_REST ++ KGDB_ENABLE_PSR_DB + movl r14=ia64_leave_kernel + ;; + mov rp=r14 +@@ -1185,6 +1197,10 @@ ENTRY(dispatch_to_fault_handler) + adds r3=8,r2 // set up second base pointer for SAVE_REST + ;; + SAVE_REST ++ cmp.eq p6,p0=29,out0 ++(p6) br.cond.spnt 1f;; // debug_vector ++ KGDB_ENABLE_PSR_DB ++1: + movl r14=ia64_leave_kernel + ;; + mov rp=r14 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/ia64/kernel/kgdb-jmp.S +--- linux-2.6.18/arch/ia64/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/kgdb-jmp.S 2008-06-10 16:19:32.000000000 +0400 +@@ -0,0 +1,238 @@ ++/* setjmp() and longjmp() assembler support for kdb on ia64. ++ ++ This code was copied from glibc CVS as of 2001-06-27 and modified where ++ necessary to fit the kernel. ++ Keith Owens 2001-06-27 ++ */ ++ ++/* Copyright (C) 1999, 2000, 2001 Free Software Foundation, Inc. ++ Contributed by David Mosberger-Tang . ++ ++ The GNU C Library is free software; you can redistribute it and/or ++ modify it under the terms of the GNU Library General Public License as ++ published by the Free Software Foundation; either version 2 of the ++ License, or (at your option) any later version. ++ ++ The GNU C Library is distributed in the hope that it will be useful, ++ but WITHOUT ANY WARRANTY; without even the implied warranty of ++ MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU ++ Library General Public License for more details. ++ ++ You should have received a copy of the GNU Library General Public ++ License along with the GNU C Library; see the file COPYING.LIB. If ++ not, write to the Free Software Foundation, Inc., ++ 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA. ++*/ ++ ++#include ++GLOBAL_ENTRY(kgdb_fault_setjmp) ++ .prologue ASM_UNW_PRLG_RP|ASM_UNW_PRLG_PFS, ASM_UNW_PRLG_GRSAVE(2) ++ alloc loc1=ar.pfs,2,2,2,0 ++ mov r16=ar.unat ++ ;; ++ mov r17=ar.fpsr ++ mov r2=in0 ++ add r3=8,in0 ++ ;; ++.mem.offset 0,0; ++ st8.spill.nta [r2]=sp,16 // r12 (sp) ++.mem.offset 8,0; ++ st8.spill.nta [r3]=gp,16 // r1 (gp) ++ ;; ++ st8.nta [r2]=r16,16 // save caller's unat ++ st8.nta [r3]=r17,16 // save fpsr ++ add r8=0xa0,in0 ++ ;; ++.mem.offset 160,0; ++ st8.spill.nta [r2]=r4,16 // r4 ++.mem.offset 168,0; ++ st8.spill.nta [r3]=r5,16 // r5 ++ add r9=0xb0,in0 ++ ;; ++ stf.spill.nta [r8]=f2,32 ++ stf.spill.nta [r9]=f3,32 ++ mov loc0=rp ++ .body ++ ;; ++ stf.spill.nta [r8]=f4,32 ++ stf.spill.nta [r9]=f5,32 ++ mov r17=b1 ++ ;; ++ stf.spill.nta [r8]=f16,32 ++ stf.spill.nta [r9]=f17,32 ++ mov r18=b2 ++ ;; ++ stf.spill.nta [r8]=f18,32 ++ stf.spill.nta [r9]=f19,32 ++ mov r19=b3 ++ ;; ++ stf.spill.nta [r8]=f20,32 ++ stf.spill.nta [r9]=f21,32 ++ mov r20=b4 ++ ;; ++ stf.spill.nta [r8]=f22,32 ++ stf.spill.nta [r9]=f23,32 ++ mov r21=b5 ++ ;; ++ stf.spill.nta [r8]=f24,32 ++ stf.spill.nta [r9]=f25,32 ++ mov r22=ar.lc ++ ;; ++ stf.spill.nta [r8]=f26,32 ++ stf.spill.nta [r9]=f27,32 ++ mov r24=pr ++ ;; ++ stf.spill.nta [r8]=f28,32 ++ stf.spill.nta [r9]=f29,32 ++ ;; ++ stf.spill.nta [r8]=f30 ++ stf.spill.nta [r9]=f31 ++ ++.mem.offset 0,0; ++ st8.spill.nta [r2]=r6,16 // r6 ++.mem.offset 8,0; ++ st8.spill.nta [r3]=r7,16 // r7 ++ ;; ++ mov r23=ar.bsp ++ mov r25=ar.unat ++ st8.nta [r2]=loc0,16 // b0 ++ st8.nta [r3]=r17,16 // b1 ++ ;; ++ st8.nta [r2]=r18,16 // b2 ++ st8.nta [r3]=r19,16 // b3 ++ ;; ++ st8.nta [r2]=r20,16 // b4 ++ st8.nta [r3]=r21,16 // b5 ++ ;; ++ st8.nta [r2]=loc1,16 // ar.pfs ++ st8.nta [r3]=r22,16 // ar.lc ++ ;; ++ st8.nta [r2]=r24,16 // pr ++ st8.nta [r3]=r23,16 // ar.bsp ++ ;; ++ st8.nta [r2]=r25 // ar.unat ++ st8.nta [r3]=in0 // &__jmp_buf ++ mov r8=0 ++ mov rp=loc0 ++ mov ar.pfs=loc1 ++ br.ret.sptk.few rp ++END(kdba_setjmp) ++#define pPos p6 /* is rotate count positive? */ ++#define pNeg p7 /* is rotate count negative? 
*/ ++GLOBAL_ENTRY(kgdb_fault_longjmp) ++ alloc r8=ar.pfs,2,1,0,0 ++ mov r27=ar.rsc ++ add r2=0x98,in0 // r2 <- &jmpbuf.orig_jmp_buf_addr ++ ;; ++ ld8 r8=[r2],-16 // r8 <- orig_jmp_buf_addr ++ mov r10=ar.bsp ++ and r11=~0x3,r27 // clear ar.rsc.mode ++ ;; ++ flushrs // flush dirty regs to backing store (must be first in insn grp) ++ ld8 r23=[r2],8 // r23 <- jmpbuf.ar_bsp ++ sub r8=r8,in0 // r8 <- &orig_jmpbuf - &jmpbuf ++ ;; ++ ld8 r25=[r2] // r25 <- jmpbuf.ar_unat ++ extr.u r8=r8,3,6 // r8 <- (&orig_jmpbuf - &jmpbuf)/8 & 0x3f ++ ;; ++ cmp.lt pNeg,pPos=r8,r0 ++ mov r2=in0 ++ ;; ++(pPos) mov r16=r8 ++(pNeg) add r16=64,r8 ++(pPos) sub r17=64,r8 ++(pNeg) sub r17=r0,r8 ++ ;; ++ mov ar.rsc=r11 // put RSE in enforced lazy mode ++ shr.u r8=r25,r16 ++ add r3=8,in0 // r3 <- &jmpbuf.r1 ++ shl r9=r25,r17 ++ ;; ++ or r25=r8,r9 ++ ;; ++ mov r26=ar.rnat ++ mov ar.unat=r25 // setup ar.unat (NaT bits for r1, r4-r7, and r12) ++ ;; ++ ld8.fill.nta sp=[r2],16 // r12 (sp) ++ ld8.fill.nta gp=[r3],16 // r1 (gp) ++ dep r11=-1,r23,3,6 // r11 <- ia64_rse_rnat_addr(jmpbuf.ar_bsp) ++ ;; ++ ld8.nta r16=[r2],16 // caller's unat ++ ld8.nta r17=[r3],16 // fpsr ++ ;; ++ ld8.fill.nta r4=[r2],16 // r4 ++ ld8.fill.nta r5=[r3],16 // r5 (gp) ++ cmp.geu p8,p0=r10,r11 // p8 <- (ar.bsp >= jmpbuf.ar_bsp) ++ ;; ++ ld8.fill.nta r6=[r2],16 // r6 ++ ld8.fill.nta r7=[r3],16 // r7 ++ ;; ++ mov ar.unat=r16 // restore caller's unat ++ mov ar.fpsr=r17 // restore fpsr ++ ;; ++ ld8.nta r16=[r2],16 // b0 ++ ld8.nta r17=[r3],16 // b1 ++ ;; ++(p8) ld8 r26=[r11] // r26 <- *ia64_rse_rnat_addr(jmpbuf.ar_bsp) ++ mov ar.bspstore=r23 // restore ar.bspstore ++ ;; ++ ld8.nta r18=[r2],16 // b2 ++ ld8.nta r19=[r3],16 // b3 ++ ;; ++ ld8.nta r20=[r2],16 // b4 ++ ld8.nta r21=[r3],16 // b5 ++ ;; ++ ld8.nta r11=[r2],16 // ar.pfs ++ ld8.nta r22=[r3],56 // ar.lc ++ ;; ++ ld8.nta r24=[r2],32 // pr ++ mov b0=r16 ++ ;; ++ ldf.fill.nta f2=[r2],32 ++ ldf.fill.nta f3=[r3],32 ++ mov b1=r17 ++ ;; ++ ldf.fill.nta f4=[r2],32 ++ ldf.fill.nta f5=[r3],32 ++ mov b2=r18 ++ ;; ++ ldf.fill.nta f16=[r2],32 ++ ldf.fill.nta f17=[r3],32 ++ mov b3=r19 ++ ;; ++ ldf.fill.nta f18=[r2],32 ++ ldf.fill.nta f19=[r3],32 ++ mov b4=r20 ++ ;; ++ ldf.fill.nta f20=[r2],32 ++ ldf.fill.nta f21=[r3],32 ++ mov b5=r21 ++ ;; ++ ldf.fill.nta f22=[r2],32 ++ ldf.fill.nta f23=[r3],32 ++ mov ar.lc=r22 ++ ;; ++ ldf.fill.nta f24=[r2],32 ++ ldf.fill.nta f25=[r3],32 ++ cmp.eq p8,p9=0,in1 ++ ;; ++ ldf.fill.nta f26=[r2],32 ++ ldf.fill.nta f27=[r3],32 ++ mov ar.pfs=r11 ++ ;; ++ ldf.fill.nta f28=[r2],32 ++ ldf.fill.nta f29=[r3],32 ++ ;; ++ ldf.fill.nta f30=[r2] ++ ldf.fill.nta f31=[r3] ++(p8) mov r8=1 ++ ++ mov ar.rnat=r26 // restore ar.rnat ++ ;; ++ mov ar.rsc=r27 // restore ar.rsc ++(p9) mov r8=in1 ++ ++ invala // virt. -> phys. regnum mapping may change ++ mov pr=r24,-1 ++ br.ret.sptk.few rp ++END(kgdb_fault_longjmp) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/kgdb.c linux-2.6.18.kgdb/arch/ia64/kernel/kgdb.c +--- linux-2.6.18/arch/ia64/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/kgdb.c 2008-06-10 16:19:32.000000000 +0400 +@@ -0,0 +1,1131 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. 
++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ * (c) Copyright 2005 Hewlett-Packard Development Company, L.P. ++ * Bob Picco ++ */ ++/* ++ * Contributor: Lake Stevens Instrument Division$ ++ * Written by: Glenn Engel $ ++ * Updated by: Amit Kale ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Original kgdb, compatibility with 2.1.xx kernel by David Grothe ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define NUM_REGS 590 ++#define REGISTER_BYTES (NUM_REGS*8+128*8) ++#define REGISTER_BYTE(N) (((N) * 8) \ ++ + ((N) <= IA64_FR0_REGNUM ? \ ++ 0 : 8 * (((N) > IA64_FR127_REGNUM) ? 128 : (N) - IA64_FR0_REGNUM))) ++#define REGISTER_SIZE(N) \ ++ (((N) >= IA64_FR0_REGNUM && (N) <= IA64_FR127_REGNUM) ? 16 : 8) ++#define IA64_GR0_REGNUM 0 ++#define IA64_FR0_REGNUM 128 ++#define IA64_FR127_REGNUM (IA64_FR0_REGNUM+127) ++#define IA64_PR0_REGNUM 256 ++#define IA64_BR0_REGNUM 320 ++#define IA64_VFP_REGNUM 328 ++#define IA64_PR_REGNUM 330 ++#define IA64_IP_REGNUM 331 ++#define IA64_PSR_REGNUM 332 ++#define IA64_CFM_REGNUM 333 ++#define IA64_AR0_REGNUM 334 ++#define IA64_NAT0_REGNUM 462 ++#define IA64_NAT31_REGNUM (IA64_NAT0_REGNUM+31) ++#define IA64_NAT32_REGNUM (IA64_NAT0_REGNUM+32) ++#define IA64_RSC_REGNUM (IA64_AR0_REGNUM+16) ++#define IA64_BSP_REGNUM (IA64_AR0_REGNUM+17) ++#define IA64_BSPSTORE_REGNUM (IA64_AR0_REGNUM+18) ++#define IA64_RNAT_REGNUM (IA64_AR0_REGNUM+19) ++#define IA64_FCR_REGNUM (IA64_AR0_REGNUM+21) ++#define IA64_EFLAG_REGNUM (IA64_AR0_REGNUM+24) ++#define IA64_CSD_REGNUM (IA64_AR0_REGNUM+25) ++#define IA64_SSD_REGNUM (IA64_AR0_REGNUM+26) ++#define IA64_CFLG_REGNUM (IA64_AR0_REGNUM+27) ++#define IA64_FSR_REGNUM (IA64_AR0_REGNUM+28) ++#define IA64_FIR_REGNUM (IA64_AR0_REGNUM+29) ++#define IA64_FDR_REGNUM (IA64_AR0_REGNUM+30) ++#define IA64_CCV_REGNUM (IA64_AR0_REGNUM+32) ++#define IA64_UNAT_REGNUM (IA64_AR0_REGNUM+36) ++#define IA64_FPSR_REGNUM (IA64_AR0_REGNUM+40) ++#define IA64_ITC_REGNUM (IA64_AR0_REGNUM+44) ++#define IA64_PFS_REGNUM (IA64_AR0_REGNUM+64) ++#define IA64_LC_REGNUM (IA64_AR0_REGNUM+65) ++#define IA64_EC_REGNUM (IA64_AR0_REGNUM+66) ++ ++#define REGISTER_INDEX(N) (REGISTER_BYTE(N) / sizeof (unsigned long)) ++#define BREAK_INSTR_ALIGN (~0xfULL) ++ ++#define ptoff(V) ((unsigned int) &((struct pt_regs *)0x0)->V) ++struct reg_to_ptreg_index { ++ unsigned int reg; ++ unsigned int ptregoff; ++}; ++ ++static struct reg_to_ptreg_index gr_reg_to_ptreg_index[] = { ++ {IA64_GR0_REGNUM + 1, ptoff(r1)}, ++ {IA64_GR0_REGNUM + 2, ptoff(r2)}, ++ {IA64_GR0_REGNUM + 3, ptoff(r3)}, ++ {IA64_GR0_REGNUM + 8, ptoff(r8)}, ++ {IA64_GR0_REGNUM + 9, ptoff(r9)}, ++ {IA64_GR0_REGNUM + 10, ptoff(r10)}, ++ {IA64_GR0_REGNUM + 11, ptoff(r11)}, ++ {IA64_GR0_REGNUM + 12, ptoff(r12)}, ++ {IA64_GR0_REGNUM + 13, ptoff(r13)}, ++ {IA64_GR0_REGNUM + 14, ptoff(r14)}, ++ {IA64_GR0_REGNUM + 15, ptoff(r15)}, ++ {IA64_GR0_REGNUM + 16, ptoff(r16)}, ++ {IA64_GR0_REGNUM + 17, ptoff(r17)}, ++ {IA64_GR0_REGNUM + 18, ptoff(r18)}, ++ {IA64_GR0_REGNUM + 19, ptoff(r19)}, ++ {IA64_GR0_REGNUM + 20, ptoff(r20)}, ++ {IA64_GR0_REGNUM + 21, ptoff(r21)}, ++ {IA64_GR0_REGNUM + 22,
ptoff(r22)}, ++ {IA64_GR0_REGNUM + 23, ptoff(r23)}, ++ {IA64_GR0_REGNUM + 24, ptoff(r24)}, ++ {IA64_GR0_REGNUM + 25, ptoff(r25)}, ++ {IA64_GR0_REGNUM + 26, ptoff(r26)}, ++ {IA64_GR0_REGNUM + 27, ptoff(r27)}, ++ {IA64_GR0_REGNUM + 28, ptoff(r28)}, ++ {IA64_GR0_REGNUM + 29, ptoff(r29)}, ++ {IA64_GR0_REGNUM + 30, ptoff(r30)}, ++ {IA64_GR0_REGNUM + 31, ptoff(r31)}, ++}; ++ ++static struct reg_to_ptreg_index br_reg_to_ptreg_index[] = { ++ {IA64_BR0_REGNUM, ptoff(b0)}, ++ {IA64_BR0_REGNUM + 6, ptoff(b6)}, ++ {IA64_BR0_REGNUM + 7, ptoff(b7)}, ++}; ++ ++static struct reg_to_ptreg_index ar_reg_to_ptreg_index[] = { ++ {IA64_PFS_REGNUM, ptoff(ar_pfs)}, ++ {IA64_UNAT_REGNUM, ptoff(ar_unat)}, ++ {IA64_RNAT_REGNUM, ptoff(ar_rnat)}, ++ {IA64_BSPSTORE_REGNUM, ptoff(ar_bspstore)}, ++ {IA64_RSC_REGNUM, ptoff(ar_rsc)}, ++ {IA64_CSD_REGNUM, ptoff(ar_csd)}, ++ {IA64_SSD_REGNUM, ptoff(ar_ssd)}, ++ {IA64_FPSR_REGNUM, ptoff(ar_fpsr)}, ++ {IA64_CCV_REGNUM, ptoff(ar_ccv)}, ++}; ++ ++extern atomic_t cpu_doing_single_step; ++ ++static int kgdb_gr_reg(int regnum, struct unw_frame_info *info, ++ unsigned long *reg, int rw) ++{ ++ char nat; ++ ++ if ((regnum >= IA64_GR0_REGNUM && regnum <= (IA64_GR0_REGNUM + 1)) || ++ (regnum >= (IA64_GR0_REGNUM + 4) && ++ regnum <= (IA64_GR0_REGNUM + 7))) ++ return !unw_access_gr(info, regnum - IA64_GR0_REGNUM, ++ reg, &nat, rw); ++ else ++ return 0; ++} ++static int kgdb_gr_ptreg(int regnum, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, int rw) ++{ ++ int i, result = 1; ++ char nat; ++ ++ if (!((regnum >= (IA64_GR0_REGNUM + 2) && ++ regnum <= (IA64_GR0_REGNUM + 3)) || ++ (regnum >= (IA64_GR0_REGNUM + 8) && ++ regnum <= (IA64_GR0_REGNUM + 15)) || ++ (regnum >= (IA64_GR0_REGNUM + 16) && ++ regnum <= (IA64_GR0_REGNUM + 31)))) ++ return 0; ++ else if (rw && ptregs) { ++ for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++) ++ if (gr_reg_to_ptreg_index[i].reg == regnum) { ++ *((unsigned long *)(((void *)ptregs) + ++ gr_reg_to_ptreg_index[i].ptregoff)) = *reg; ++ break; ++ } ++ } else if (!rw && ptregs) { ++ for (i = 0; i < ARRAY_SIZE(gr_reg_to_ptreg_index); i++) ++ if (gr_reg_to_ptreg_index[i].reg == regnum) { ++ *reg = *((unsigned long *) ++ (((void *)ptregs) + ++ gr_reg_to_ptreg_index[i].ptregoff)); ++ break; ++ } ++ } else ++ result = !unw_access_gr(info, regnum - IA64_GR0_REGNUM, ++ reg, &nat, rw); ++ return result; ++} ++ ++static int kgdb_br_reg(int regnum, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, int rw) ++{ ++ int i, result = 1; ++ ++ if (!(regnum >= IA64_BR0_REGNUM && regnum <= (IA64_BR0_REGNUM + 7))) ++ return 0; ++ ++ switch (regnum) { ++ case IA64_BR0_REGNUM: ++ case IA64_BR0_REGNUM + 6: ++ case IA64_BR0_REGNUM + 7: ++ if (rw) { ++ for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++) ++ if (br_reg_to_ptreg_index[i].reg == regnum) { ++ *((unsigned long *) ++ (((void *)ptregs) + ++ br_reg_to_ptreg_index[i].ptregoff)) = ++ *reg; ++ break; ++ } ++ } else ++ for (i = 0; i < ARRAY_SIZE(br_reg_to_ptreg_index); i++) ++ if (br_reg_to_ptreg_index[i].reg == regnum) { ++ *reg = *((unsigned long *) ++ (((void *)ptregs) + ++ br_reg_to_ptreg_index[i]. 
++ ptregoff)); ++ break; ++ } ++ break; ++ case IA64_BR0_REGNUM + 1: ++ case IA64_BR0_REGNUM + 2: ++ case IA64_BR0_REGNUM + 3: ++ case IA64_BR0_REGNUM + 4: ++ case IA64_BR0_REGNUM + 5: ++ result = !unw_access_br(info, regnum - IA64_BR0_REGNUM, ++ reg, rw); ++ break; ++ } ++ ++ return result; ++} ++ ++static int kgdb_fr_reg(int regnum, char *inbuffer, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, ++ struct ia64_fpreg *freg, int rw) ++{ ++ int result = 1; ++ ++ if (!(regnum >= IA64_FR0_REGNUM && regnum <= (IA64_FR0_REGNUM + 127))) ++ return 0; ++ ++ switch (regnum) { ++ case IA64_FR0_REGNUM + 6: ++ case IA64_FR0_REGNUM + 7: ++ case IA64_FR0_REGNUM + 8: ++ case IA64_FR0_REGNUM + 9: ++ case IA64_FR0_REGNUM + 10: ++ case IA64_FR0_REGNUM + 11: ++ case IA64_FR0_REGNUM + 12: ++ if (rw) { ++ char *ptr = inbuffer; ++ ++ freg->u.bits[0] = *reg; ++ kgdb_hex2long(&ptr, &freg->u.bits[1]); ++ *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6))) = ++ *freg; ++ break; ++ } else if (!ptregs) ++ result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM, ++ freg, rw); ++ else ++ *freg = ++ *(&ptregs->f6 + (regnum - (IA64_FR0_REGNUM + 6))); ++ break; ++ default: ++ if (!rw) ++ result = !unw_access_fr(info, regnum - IA64_FR0_REGNUM, ++ freg, rw); ++ else ++ result = 0; ++ break; ++ } ++ ++ return result; ++} ++ ++static int kgdb_ar_reg(int regnum, struct pt_regs * ptregs, ++ struct unw_frame_info *info, unsigned long *reg, int rw) ++{ ++ int result = 0, i; ++ ++ if (!(regnum >= IA64_AR0_REGNUM && regnum <= IA64_EC_REGNUM)) ++ return 0; ++ ++ if (rw && ptregs) { ++ for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++) ++ if (ar_reg_to_ptreg_index[i].reg == regnum) { ++ *((unsigned long *) (((void *)ptregs) + ++ ar_reg_to_ptreg_index[i].ptregoff)) = ++ *reg; ++ result = 1; ++ break; ++ } ++ } else if (ptregs) { ++ for (i = 0; i < ARRAY_SIZE(ar_reg_to_ptreg_index); i++) ++ if (ar_reg_to_ptreg_index[i].reg == regnum) { ++ *reg = *((unsigned long *) (((void *)ptregs) + ++ ar_reg_to_ptreg_index[i].ptregoff)); ++ result = 1; ++ break; ++ } ++ } ++ ++ if (result) ++ return result; ++ ++ result = 1; ++ ++ switch (regnum) { ++ case IA64_CSD_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_CSD, reg, rw); ++ break; ++ case IA64_SSD_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_SSD, reg, rw); ++ break; ++ case IA64_UNAT_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_RNAT_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_BSPSTORE_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_PFS_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RNAT, reg, rw); ++ break; ++ case IA64_LC_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_LC, reg, rw); ++ break; ++ case IA64_EC_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_EC, reg, rw); ++ break; ++ case IA64_FPSR_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_FPSR, reg, rw); ++ break; ++ case IA64_RSC_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_RSC, reg, rw); ++ break; ++ case IA64_CCV_REGNUM: ++ result = !unw_access_ar(info, UNW_AR_CCV, reg, rw); ++ break; ++ default: ++ result = 0; ++ } ++ ++ return result; ++} ++ ++void kgdb_get_reg(char *outbuffer, int regnum, struct unw_frame_info *info, ++ struct pt_regs *ptregs) ++{ ++ unsigned long reg, size = 0, *mem = &reg; ++ struct ia64_fpreg freg; ++ ++ if (kgdb_gr_reg(regnum, info, &reg, 0) || ++ kgdb_gr_ptreg(regnum, ptregs, info, &reg, 0) || ++ kgdb_br_reg(regnum, ptregs, info, &reg, 0) || ++ kgdb_ar_reg(regnum,
ptregs, info, &reg, 0)) ++ size = sizeof(reg); ++ else if (kgdb_fr_reg(regnum, NULL, ptregs, info, &reg, &freg, 0)) { ++ size = sizeof(freg); ++ mem = (unsigned long *)&freg; ++ } else if (regnum == IA64_IP_REGNUM) { ++ if (!ptregs) { ++ unw_get_ip(info, &reg); ++ size = sizeof(reg); ++ } else { ++ reg = ptregs->cr_iip; ++ size = sizeof(reg); ++ } ++ } else if (regnum == IA64_CFM_REGNUM) { ++ if (!ptregs) ++ unw_get_cfm(info, &reg); ++ else ++ reg = ptregs->cr_ifs; ++ size = sizeof(reg); ++ } else if (regnum == IA64_PSR_REGNUM) { ++ if (!ptregs && kgdb_usethread) ++ ptregs = (struct pt_regs *) ++ ((unsigned long)kgdb_usethread + ++ IA64_STK_OFFSET) - 1; ++ if (ptregs) ++ reg = ptregs->cr_ipsr; ++ size = sizeof(reg); ++ } else if (regnum == IA64_PR_REGNUM) { ++ if (ptregs) ++ reg = ptregs->pr; ++ else ++ unw_access_pr(info, &reg, 0); ++ size = sizeof(reg); ++ } else if (regnum == IA64_BSP_REGNUM) { ++ unw_get_bsp(info, &reg); ++ size = sizeof(reg); ++ } ++ ++ if (size) { ++ kgdb_mem2hex((char *) mem, outbuffer, size); ++ outbuffer[size*2] = 0; ++ } ++ else ++ strcpy(outbuffer, "E0"); ++ ++ return; ++} ++ ++void kgdb_put_reg(char *inbuffer, char *outbuffer, int regnum, ++ struct unw_frame_info *info, struct pt_regs *ptregs) ++{ ++ unsigned long reg; ++ struct ia64_fpreg freg; ++ char *ptr = inbuffer; ++ ++ kgdb_hex2long(&ptr, &reg); ++ strcpy(outbuffer, "OK"); ++ ++ if (kgdb_gr_reg(regnum, info, &reg, 1) || ++ kgdb_gr_ptreg(regnum, ptregs, info, &reg, 1) || ++ kgdb_br_reg(regnum, ptregs, info, &reg, 1) || ++ kgdb_fr_reg(regnum, inbuffer, ptregs, info, &reg, &freg, 1) || ++ kgdb_ar_reg(regnum, ptregs, info, &reg, 1)) ; ++ else if (regnum == IA64_IP_REGNUM) ++ ptregs->cr_iip = reg; ++ else if (regnum == IA64_CFM_REGNUM) ++ ptregs->cr_ifs = reg; ++ else if (regnum == IA64_PSR_REGNUM) ++ ptregs->cr_ipsr = reg; ++ else if (regnum == IA64_PR_REGNUM) ++ ptregs->pr = reg; ++ else ++ strcpy(outbuffer, "E01"); ++ return; ++} ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ ++} ++ ++#define MAX_HW_BREAKPOINT (20) ++long hw_break_total_dbr, hw_break_total_ibr; ++#define HW_BREAKPOINT (hw_break_total_dbr + hw_break_total_ibr) ++#define WATCH_INSTRUCTION 0x0 ++#define WATCH_WRITE 0x1 ++#define WATCH_READ 0x2 ++#define WATCH_ACCESS 0x3 ++ ++#define HWCAP_DBR ((1 << WATCH_WRITE) | (1 << WATCH_READ)) ++#define HWCAP_IBR (1 << WATCH_INSTRUCTION) ++struct hw_breakpoint { ++ unsigned enabled; ++ unsigned long capable; ++ unsigned long type; ++ unsigned long mask; ++ unsigned long addr; ++} *breakinfo; ++ ++static struct hw_breakpoint hwbreaks[MAX_HW_BREAKPOINT]; ++ ++enum instruction_type { A, I, M, F, B, L, X, u }; ++ ++static enum instruction_type bundle_encoding[32][3] = { ++ {M, I, I}, /* 00 */ ++ {M, I, I}, /* 01 */ ++ {M, I, I}, /* 02 */ ++ {M, I, I}, /* 03 */ ++ {M, L, X}, /* 04 */ ++ {M, L, X}, /* 05 */ ++ {u, u, u}, /* 06 */ ++ {u, u, u}, /* 07 */ ++ {M, M, I}, /* 08 */ ++ {M, M, I}, /* 09 */ ++ {M, M, I}, /* 0A */ ++ {M, M, I}, /* 0B */ ++ {M, F, I}, /* 0C */ ++ {M, F, I}, /* 0D */ ++ {M, M, F}, /* 0E */ ++ {M, M, F}, /* 0F */ ++ {M, I, B}, /* 10 */ ++ {M, I, B}, /* 11 */ ++ {M, B, B}, /* 12 */ ++ {M, B, B}, /* 13 */ ++ {u, u, u}, /* 14 */ ++ {u, u, u}, /* 15 */ ++ {B, B, B}, /* 16 */ ++ {B, B, B}, /* 17 */ ++ {M, M, B}, /* 18 */ ++ {M, M, B}, /* 19 */ ++ {u, u, u}, /* 1A */ ++ {u, u, u}, /* 1B */ ++ {M, F, B}, /* 1C */ ++ {M, F,
B}, /* 1D */ ++ {u, u, u}, /* 1E */ ++ {u, u, u}, /* 1F */ ++}; ++ ++int kgdb_validate_break_address(unsigned long addr) ++{ ++ int error; ++ char tmp_variable[BREAK_INSTR_SIZE]; ++ error = kgdb_get_mem((char *)(addr & BREAK_INSTR_ALIGN), tmp_variable, ++ BREAK_INSTR_SIZE); ++ return error; ++} ++ ++int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) ++{ ++ extern unsigned long _start[]; ++ unsigned long slot = addr & BREAK_INSTR_ALIGN, bundle_addr; ++ unsigned long template; ++ struct bundle { ++ struct { ++ unsigned long long template:5; ++ unsigned long long slot0:41; ++ unsigned long long slot1_p0:64 - 46; ++ } quad0; ++ struct { ++ unsigned long long slot1_p1:41 - (64 - 46); ++ unsigned long long slot2:41; ++ } quad1; ++ } bundle; ++ int ret; ++ ++ bundle_addr = addr & ~0xFULL; ++ ++ if (bundle_addr == (unsigned long)_start) ++ return 0; ++ ++ ret = kgdb_get_mem((char *)bundle_addr, (char *)&bundle, ++ BREAK_INSTR_SIZE); ++ if (ret < 0) ++ return ret; ++ ++ if (slot > 2) ++ slot = 0; ++ ++ memcpy(saved_instr, &bundle, BREAK_INSTR_SIZE); ++ template = bundle.quad0.template; ++ ++ if (slot == 1 && bundle_encoding[template][1] == L) ++ slot = 2; ++ ++ switch (slot) { ++ case 0: ++ bundle.quad0.slot0 = BREAKNUM; ++ break; ++ case 1: ++ bundle.quad0.slot1_p0 = BREAKNUM; ++ bundle.quad1.slot1_p1 = (BREAKNUM >> (64 - 46)); ++ break; ++ case 2: ++ bundle.quad1.slot2 = BREAKNUM; ++ break; ++ } ++ ++ return kgdb_set_mem((char *)bundle_addr, (char *)&bundle, ++ BREAK_INSTR_SIZE); ++} ++ ++int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) ++{ ++ extern unsigned long _start[]; ++ ++ addr = addr & BREAK_INSTR_ALIGN; ++ if (addr == (unsigned long)_start) ++ return 0; ++ return kgdb_set_mem((char *)addr, (char *)bundle, BREAK_INSTR_SIZE); ++} ++ ++static int hw_breakpoint_init; ++ ++void do_init_hw_break(void) ++{ ++ s64 status; ++ int i; ++ ++ hw_breakpoint_init = 1; ++ ++#ifdef CONFIG_IA64_HP_SIM ++ hw_break_total_ibr = 8; ++ hw_break_total_dbr = 8; ++ status = 0; ++#else ++ status = ia64_pal_debug_info(&hw_break_total_ibr, &hw_break_total_dbr); ++#endif ++ ++ if (status) { ++ printk(KERN_INFO "do_init_hw_break: pal call failed %d\n", ++ (int)status); ++ return; ++ } ++ ++ if (HW_BREAKPOINT > MAX_HW_BREAKPOINT) { ++ printk(KERN_INFO "do_init_hw_break: %d exceeds max %d\n", ++ (int)HW_BREAKPOINT, (int)MAX_HW_BREAKPOINT); ++ ++ while ((HW_BREAKPOINT > MAX_HW_BREAKPOINT) ++ && hw_break_total_ibr != 1) ++ hw_break_total_ibr--; ++ while (HW_BREAKPOINT > MAX_HW_BREAKPOINT) ++ hw_break_total_dbr--; ++ } ++ ++ breakinfo = hwbreaks; ++ ++ memset(breakinfo, 0, HW_BREAKPOINT * sizeof(struct hw_breakpoint)); ++ ++ for (i = 0; i < hw_break_total_dbr; i++) ++ breakinfo[i].capable = HWCAP_DBR; ++ ++ for (; i < HW_BREAKPOINT; i++) ++ breakinfo[i].capable = HWCAP_IBR; ++ ++ return; ++} ++ ++void kgdb_correct_hw_break(void) ++{ ++ int breakno; ++ ++ if (!breakinfo) ++ return; ++ ++ for (breakno = 0; breakno < HW_BREAKPOINT; breakno++) { ++ if (breakinfo[breakno].enabled) { ++ if (breakinfo[breakno].capable & HWCAP_IBR) { ++ int ibreakno = breakno - hw_break_total_dbr; ++ ia64_set_ibr(ibreakno << 1, ++ breakinfo[breakno].addr); ++ ia64_set_ibr((ibreakno << 1) + 1, ++ (~breakinfo[breakno].mask & ++ ((1UL << 56UL) - 1)) | ++ (1UL << 56UL) | (1UL << 63UL)); ++ } else { ++ ia64_set_dbr(breakno << 1, ++ breakinfo[breakno].addr); ++ ia64_set_dbr((breakno << 1) + 1, ++ (~breakinfo[breakno]. 
++ mask & ((1UL << 56UL) - 1)) | ++ (1UL << 56UL) | ++ (breakinfo[breakno].type << 62UL)); ++ } ++ } else { ++ if (breakinfo[breakno].capable & HWCAP_IBR) ++ ia64_set_ibr(((breakno - ++ hw_break_total_dbr) << 1) + 1, ++ 0); ++ else ++ ia64_set_dbr((breakno << 1) + 1, 0); ++ } ++ } ++ ++ return; ++} ++ ++int hardware_breakpoint(unsigned long addr, int length, int type, int action) ++{ ++ int breakno, found, watch; ++ unsigned long mask; ++ extern unsigned long _start[]; ++ ++ if (!hw_breakpoint_init) ++ do_init_hw_break(); ++ ++ if (!breakinfo) ++ return 0; ++ else if (addr == (unsigned long)_start) ++ return 1; ++ ++ if (type == WATCH_ACCESS) ++ mask = HWCAP_DBR; ++ else ++ mask = 1UL << type; ++ ++ for (watch = 0, found = 0, breakno = 0; breakno < HW_BREAKPOINT; ++ breakno++) { ++ if (action) { ++ if (breakinfo[breakno].enabled ++ || !(breakinfo[breakno].capable & mask)) ++ continue; ++ breakinfo[breakno].enabled = 1; ++ breakinfo[breakno].type = type; ++ breakinfo[breakno].mask = length - 1; ++ breakinfo[breakno].addr = addr; ++ watch = breakno; ++ } else if (breakinfo[breakno].enabled && ++ ((length < 0 && breakinfo[breakno].addr == addr) || ++ ((breakinfo[breakno].capable & mask) && ++ (breakinfo[breakno].mask == (length - 1)) && ++ (breakinfo[breakno].addr == addr)))) { ++ breakinfo[breakno].enabled = 0; ++ breakinfo[breakno].type = 0UL; ++ } else ++ continue; ++ found++; ++ if (type != WATCH_ACCESS) ++ break; ++ else if (found == 2) ++ break; ++ else ++ mask = HWCAP_IBR; ++ } ++ ++ if (type == WATCH_ACCESS && found == 1) { ++ breakinfo[watch].enabled = 0; ++ found = 0; ++ } ++ ++ mb(); ++ return found; ++} ++ ++int kgdb_arch_set_hw_breakpoint(unsigned long addr, int len, ++ enum kgdb_bptype type) ++{ ++ return hardware_breakpoint(addr, len, type - '1', 1); ++} ++ ++int kgdb_arch_remove_hw_breakpoint(unsigned long addr, int len, ++ enum kgdb_bptype type) ++{ ++ return hardware_breakpoint(addr, len, type - '1', 0); ++} ++ ++int kgdb_remove_hw_break(unsigned long addr) ++{ ++ return hardware_breakpoint(addr, 8, WATCH_INSTRUCTION, 0); ++ ++} ++ ++void kgdb_remove_all_hw_break(void) ++{ ++ int i; ++ ++ for (i = 0; i < HW_BREAKPOINT; i++) ++ memset(&breakinfo[i], 0, sizeof(struct hw_breakpoint)); ++} ++ ++int kgdb_set_hw_break(unsigned long addr) ++{ ++ return hardware_breakpoint(addr, 8, WATCH_INSTRUCTION, 1); ++} ++ ++void kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++ unsigned long hw_breakpoint_status; ++ ++ hw_breakpoint_status = ia64_getreg(_IA64_REG_PSR); ++ if (hw_breakpoint_status & IA64_PSR_DB) ++ ia64_setreg(_IA64_REG_PSR_L, ++ hw_breakpoint_status ^ IA64_PSR_DB); ++} ++ ++volatile static struct smp_unw { ++ struct unw_frame_info *unw; ++ struct task_struct *task; ++} smp_unw[NR_CPUS]; ++ ++static int inline kgdb_get_blocked_state(struct task_struct *p, ++ struct unw_frame_info *unw) ++{ ++ unsigned long ip; ++ int count = 0; ++ ++ unw_init_from_blocked_task(unw, p); ++ ip = 0UL; ++ do { ++ if (unw_unwind(unw) < 0) ++ return -1; ++ unw_get_ip(unw, &ip); ++ if (!in_sched_functions(ip)) ++ break; ++ } while (count++ < 16); ++ ++ if (!ip) ++ return -1; ++ else ++ return 0; ++} ++ ++static void inline kgdb_wait(struct pt_regs *regs) ++{ ++ unsigned long hw_breakpoint_status = ia64_getreg(_IA64_REG_PSR); ++ if (hw_breakpoint_status & IA64_PSR_DB) ++ ia64_setreg(_IA64_REG_PSR_L, ++ hw_breakpoint_status ^ IA64_PSR_DB); ++ kgdb_nmihook(smp_processor_id(), regs); ++ if (hw_breakpoint_status & IA64_PSR_DB) ++ ia64_setreg(_IA64_REG_PSR_L, hw_breakpoint_status); ++ ++ return; ++} ++ 
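kgdb_correct_hw_break() above programs ia64 breakpoint registers in pairs: the even register takes the address, the odd one a control word built from the complemented length mask, a privilege-mask bit, and the enable bits. A small sketch of the DBR encoding, derived from the expressions in the patch; the helper name and the layout comments are mine, not patch code:

/* Sketch: compose the odd half of an ia64 DBR pair from a power-of-two
 * length and a WATCH_WRITE/WATCH_READ/WATCH_ACCESS type, mirroring the
 * arithmetic in kgdb_correct_hw_break(). */
static unsigned long dbr_ctrl_word(unsigned long len, unsigned long type)
{
	unsigned long mask = len - 1;	/* e.g. an 8-byte watch -> 0x7 */

	return (~mask & ((1UL << 56) - 1)) |	/* address mask, bits 0..55 */
	       (1UL << 56) |			/* match privilege level 0 */
	       (type << 62);			/* w bit is 62, r bit is 63 */
}

The IBR arm of the same function sets bit 63 as the execute enable instead, again with bit 56 so that only privilege level 0 matches.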
++static void inline normalize(struct unw_frame_info *running, ++ struct pt_regs *regs) ++{ ++ unsigned long sp; ++ ++ do { ++ unw_get_sp(running, &sp); ++ if ((sp + 0x10) >= (unsigned long)regs) ++ break; ++ } while (unw_unwind(running) >= 0); ++ ++ return; ++} ++ ++static void kgdb_init_running(struct unw_frame_info *unw, void *data) ++{ ++ struct pt_regs *regs; ++ ++ regs = data; ++ normalize(unw, regs); ++ smp_unw[smp_processor_id()].unw = unw; ++ kgdb_wait(regs); ++} ++ ++void kgdb_wait_ipi(struct pt_regs *regs) ++{ ++ struct unw_frame_info unw; ++ ++ smp_unw[smp_processor_id()].task = current; ++ ++ if (user_mode(regs)) { ++ smp_unw[smp_processor_id()].unw = (struct unw_frame_info *)1; ++ kgdb_wait(regs); ++ } else { ++ if (current->state == TASK_RUNNING) ++ unw_init_running(kgdb_init_running, regs); ++ else { ++ if (kgdb_get_blocked_state(current, &unw)) ++ smp_unw[smp_processor_id()].unw = ++ (struct unw_frame_info *)1; ++ else ++ smp_unw[smp_processor_id()].unw = &unw; ++ kgdb_wait(regs); ++ } ++ } ++ ++ smp_unw[smp_processor_id()].unw = NULL; ++ return; ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ if (num_online_cpus() > 1) ++ smp_send_nmi_allbutself(); ++} ++ ++static volatile int kgdb_hwbreak_sstep[NR_CPUS]; ++ ++static int kgdb_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = ptr; ++ struct pt_regs *regs = args->regs; ++ unsigned long err = args->err; ++ ++ switch (cmd) { ++ default: ++ return NOTIFY_DONE; ++ case DIE_PAGE_FAULT_NO_CONTEXT: ++ if (atomic_read(&debugger_active) && kgdb_may_fault) { ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ return NOTIFY_STOP; ++ } ++ break; ++ case DIE_BREAK: ++ if (user_mode(regs) || err == 0x80001) ++ return NOTIFY_DONE; ++ break; ++ case DIE_FAULT: ++ if (user_mode(regs)) ++ return NOTIFY_DONE; ++ else if (err == 36 && kgdb_hwbreak_sstep[smp_processor_id()]) { ++ kgdb_hwbreak_sstep[smp_processor_id()] = 0; ++ regs->cr_ipsr &= ~IA64_PSR_SS; ++ return NOTIFY_STOP; ++ } ++ case DIE_MCA_MONARCH_PROCESS: ++ case DIE_INIT_MONARCH_PROCESS: ++ break; ++ } ++ ++ kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_notify, ++}; ++ ++int kgdb_arch_init(void) ++{ ++ atomic_notifier_chain_register(&ia64die_chain, &kgdb_notifier); ++ return 0; ++} ++ ++static void do_kgdb_handle_exception(struct unw_frame_info *, void *data); ++ ++struct kgdb_state { ++ int e_vector; ++ int signo; ++ unsigned long err_code; ++ struct pt_regs *regs; ++ struct unw_frame_info *unw; ++ char *inbuf; ++ char *outbuf; ++ int unwind; ++ int ret; ++}; ++ ++static void inline kgdb_pc(struct pt_regs *regs, unsigned long pc) ++{ ++ regs->cr_iip = pc & ~0xf; ++ ia64_psr(regs)->ri = pc & 0x3; ++ return; ++} ++ ++int kgdb_arch_handle_exception(int e_vector, int signo, ++ int err_code, char *remcom_in_buffer, ++ char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ struct kgdb_state info; ++ ++ info.e_vector = e_vector; ++ info.signo = signo; ++ info.err_code = err_code; ++ info.unw = (void *)0; ++ info.inbuf = remcom_in_buffer; ++ info.outbuf = remcom_out_buffer; ++ info.unwind = 0; ++ info.ret = -1; ++ ++ if (remcom_in_buffer[0] == 'c' || remcom_in_buffer[0] == 's') { ++ info.regs = linux_regs; ++ do_kgdb_handle_exception(NULL, &info); ++ } else if (kgdb_usethread == current) { ++ info.regs = linux_regs; ++ info.unwind = 1; ++ unw_init_running(do_kgdb_handle_exception, &info); ++ } else if 
(kgdb_usethread->state != TASK_RUNNING) { ++ struct unw_frame_info unw_info; ++ ++ if (kgdb_get_blocked_state(kgdb_usethread, &unw_info)) { ++ info.ret = 1; ++ goto bad; ++ } ++ info.regs = NULL; ++ do_kgdb_handle_exception(&unw_info, &info); ++ } else { ++ int i; ++ ++ for (i = 0; i < NR_CPUS; i++) ++ if (smp_unw[i].task == kgdb_usethread && smp_unw[i].unw ++ && smp_unw[i].unw != (struct unw_frame_info *)1) { ++ info.regs = NULL; ++ do_kgdb_handle_exception(smp_unw[i].unw, &info); ++ break; ++ } else { ++ info.ret = 1; ++ goto bad; ++ } ++ } ++ ++ bad: ++ if (info.ret != -1 && remcom_in_buffer[0] == 'p') { ++ unsigned long bad = 0xbad4badbadbadbadUL; ++ ++ printk("kgdb_arch_handle_exception: p packet bad (%s)\n", ++ remcom_in_buffer); ++ kgdb_mem2hex((char *)&bad, remcom_out_buffer, sizeof(bad)); ++ remcom_out_buffer[sizeof(bad) * 2] = 0; ++ info.ret = -1; ++ } ++ return info.ret; ++} ++ ++/* ++ * This is done because I evidently made an incorrect 'p' encoding ++ * when my patch for gdb was committed. It was later corrected. This ++ * check supports both my wrong encoding of the register number and ++ * the correct encoding. Eventually this should be eliminated and ++ * kgdb_hex2long should be demarshalling the regnum. ++ */ ++static inline int check_packet(unsigned int regnum, char *packet) ++{ ++ static int check_done, swap; ++ unsigned long reglong; ++ ++ if (likely(check_done)) { ++ if (swap) { ++ kgdb_hex2long(&packet, &reglong); ++ regnum = (int) reglong; ++ } ++ ++ } else { ++ if (regnum > NUM_REGS) { ++ kgdb_hex2long(&packet, &reglong); ++ regnum = (int) reglong; ++ swap = 1; ++ } ++ check_done = 1; ++ } ++ return regnum; ++} ++ ++static void do_kgdb_handle_exception(struct unw_frame_info *unw_info, ++ void *data) ++{ ++ long addr; ++ char *ptr; ++ unsigned long newPC; ++ int e_vector, signo; ++ unsigned long err_code; ++ struct pt_regs *linux_regs; ++ struct kgdb_state *info; ++ char *remcom_in_buffer, *remcom_out_buffer; ++ ++ info = data; ++ info->unw = unw_info; ++ e_vector = info->e_vector; ++ signo = info->signo; ++ err_code = info->err_code; ++ remcom_in_buffer = info->inbuf; ++ remcom_out_buffer = info->outbuf; ++ linux_regs = info->regs; ++ ++ if (info->unwind) ++ normalize(unw_info, linux_regs); ++ ++ switch (remcom_in_buffer[0]) { ++ case 'p': ++ { ++ unsigned int regnum; ++ ++ kgdb_hex2mem(&remcom_in_buffer[1], (char *)&regnum, ++ sizeof(regnum)); ++ regnum = check_packet(regnum, &remcom_in_buffer[1]); ++ if (regnum >= NUM_REGS) { ++ remcom_out_buffer[0] = 'E'; ++ remcom_out_buffer[1] = 0; ++ } else ++ kgdb_get_reg(remcom_out_buffer, regnum, ++ unw_info, linux_regs); ++ break; ++ } ++ case 'P': ++ { ++ unsigned int regno; ++ long v; ++ char *ptr; ++ ++ ptr = &remcom_in_buffer[1]; ++ if ((!kgdb_usethread || kgdb_usethread == current) && ++ kgdb_hex2long(&ptr, &v) && ++ *ptr++ == '=' && (v >= 0)) { ++ regno = (unsigned int)v; ++ regno = (regno >= NUM_REGS ?
0 : regno); ++ kgdb_put_reg(ptr, remcom_out_buffer, regno, ++ unw_info, linux_regs); ++ } else ++ strcpy(remcom_out_buffer, "E01"); ++ break; ++ } ++ case 'c': ++ case 's': ++ if (e_vector == TRAP_BRKPT && err_code == KGDBBREAKNUM) { ++ if (ia64_psr(linux_regs)->ri < 2) ++ kgdb_pc(linux_regs, linux_regs->cr_iip + ++ ia64_psr(linux_regs)->ri + 1); ++ else ++ kgdb_pc(linux_regs, linux_regs->cr_iip + 16); ++ } ++ ++ /* try to read optional parameter, pc unchanged if no parm */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) { ++ linux_regs->cr_iip = addr; ++ } ++ newPC = linux_regs->cr_iip; ++ ++ /* clear the trace bit */ ++ linux_regs->cr_ipsr &= ~IA64_PSR_SS; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ ++ /* set the trace bit if we're stepping or took a hardware break */ ++ if (remcom_in_buffer[0] == 's' || e_vector == TRAP_HWBKPT) { ++ linux_regs->cr_ipsr |= IA64_PSR_SS; ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ } ++ ++ kgdb_correct_hw_break(); ++ ++ /* if not hardware breakpoint, then reenable them */ ++ if (e_vector != TRAP_HWBKPT) ++ linux_regs->cr_ipsr |= IA64_PSR_DB; ++ else { ++ kgdb_hwbreak_sstep[smp_processor_id()] = 1; ++ linux_regs->cr_ipsr &= ~IA64_PSR_DB; ++ } ++ ++ info->ret = 0; ++ break; ++ default: ++ break; ++ } ++ ++ return; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++ .set_hw_breakpoint = kgdb_arch_set_hw_breakpoint, ++ .remove_hw_breakpoint = kgdb_arch_remove_hw_breakpoint, ++ .gdb_bpt_instr = {0xcc}, ++ .flags = KGDB_HW_BREAKPOINT, ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/process.c linux-2.6.18.kgdb/arch/ia64/kernel/process.c +--- linux-2.6.18/arch/ia64/kernel/process.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/process.c 2008-06-10 16:20:23.000000000 +0400 +@@ -458,6 +458,9 @@ copy_thread (int nr, unsigned long clone + */ + child_ptregs->cr_ipsr = ((child_ptregs->cr_ipsr | IA64_PSR_BITS_TO_SET) + & ~(IA64_PSR_BITS_TO_CLEAR | IA64_PSR_PP | IA64_PSR_UP)); ++#ifdef CONFIG_KGDB ++ child_ptregs->cr_ipsr |= IA64_PSR_DB; ++#endif + + /* + * NOTE: The calling convention considers all floating point +@@ -686,6 +689,9 @@ kernel_thread (int (*fn)(void *), void * + regs.pt.r11 = (unsigned long) arg; /* 2nd argument */ + /* Preserve PSR bits, except for bits 32-34 and 37-45, which we can't read. */ + regs.pt.cr_ipsr = ia64_getreg(_IA64_REG_PSR) | IA64_PSR_BN; ++#ifdef CONFIG_KGDB ++ regs.pt.cr_ipsr |= IA64_PSR_DB; ++#endif + regs.pt.cr_ifs = 1UL << 63; /* mark as valid, empty frame */ + regs.sw.ar_fpsr = regs.pt.ar_fpsr = ia64_getreg(_IA64_REG_AR_FPSR); + regs.sw.ar_bspstore = (unsigned long) current + IA64_RBS_OFFSET; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/smp.c linux-2.6.18.kgdb/arch/ia64/kernel/smp.c +--- linux-2.6.18/arch/ia64/kernel/smp.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/smp.c 2008-06-10 16:19:32.000000000 +0400 +@@ -47,6 +47,7 @@ + #include + #include + #include ++#include + + /* + * Structure and data for smp_call_function(). This is designed to minimise static memory +@@ -66,6 +67,9 @@ static volatile struct call_data_struct + + #define IPI_CALL_FUNC 0 + #define IPI_CPU_STOP 1 ++#ifdef CONFIG_KGDB ++#define IPI_KGDB_INTERRUPT 2 ++#endif + + /* This needs to be cacheline aligned because it is written to by *other* CPUs. 
*/ + static DEFINE_PER_CPU(u64, ipi_operation) ____cacheline_aligned; +@@ -155,6 +159,11 @@ handle_IPI (int irq, void *dev_id, struc + case IPI_CPU_STOP: + stop_this_cpu(); + break; ++#ifdef CONFIG_KGDB ++ case IPI_KGDB_INTERRUPT: ++ kgdb_wait_ipi(regs); ++ break; ++#endif + + default: + printk(KERN_CRIT "Unknown IPI on CPU %d: %lu\n", this_cpu, which); +@@ -305,6 +314,14 @@ smp_call_function_single (int cpuid, voi + } + EXPORT_SYMBOL(smp_call_function_single); + ++#ifdef CONFIG_KGDB ++void ++smp_send_nmi_allbutself(void) ++{ ++ send_IPI_allbutself(IPI_KGDB_INTERRUPT); ++} ++#endif ++ + /* + * this function sends a 'generic call function' IPI to all other CPUs + * in the system. +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/traps.c linux-2.6.18.kgdb/arch/ia64/kernel/traps.c +--- linux-2.6.18/arch/ia64/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/traps.c 2008-06-10 16:19:32.000000000 +0400 +@@ -200,8 +200,12 @@ __kprobes ia64_bad_break (unsigned long + break; + + default: +- if (break_num < 0x40000 || break_num > 0x100000) ++ if (break_num < 0x40000 || break_num > 0x100000) { ++ if (notify_die(DIE_BREAK, "bad break", regs, ++ break_num, TRAP_BRKPT, SIGTRAP) == NOTIFY_STOP) ++ return; + die_if_kernel("Bad break", regs, break_num); ++ } + + if (break_num < 0x80000) { + sig = SIGILL; code = __ILL_BREAK; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/kernel/unwind.c linux-2.6.18.kgdb/arch/ia64/kernel/unwind.c +--- linux-2.6.18/arch/ia64/kernel/unwind.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/kernel/unwind.c 2008-06-10 16:20:23.000000000 +0400 +@@ -72,10 +72,68 @@ + # define STAT(x...) + #endif + ++#ifdef CONFIG_KGDB ++#define KGDB_EARLY_SIZE 100 ++static struct unw_reg_state __initdata kgdb_reg_state[KGDB_EARLY_SIZE]; ++static struct unw_labeled_state __initdata kgdb_labeled_state[KGDB_EARLY_SIZE]; ++void __initdata *kgdb_reg_state_free, __initdata *kgdb_labeled_state_free; ++ ++static void __init ++kgdb_malloc_init(void) ++{ ++ int i; ++ ++ kgdb_reg_state_free = kgdb_reg_state; ++ for (i = 1; i < KGDB_EARLY_SIZE; i++) { ++ *((unsigned long *) &kgdb_reg_state[i]) = (unsigned long) kgdb_reg_state_free; ++ kgdb_reg_state_free = &kgdb_reg_state[i]; ++ } ++ ++ kgdb_labeled_state_free = kgdb_labeled_state; ++ for (i = 1; i < KGDB_EARLY_SIZE; i++) { ++ *((unsigned long *) &kgdb_labeled_state[i]) = ++ (unsigned long) kgdb_labeled_state_free; ++ kgdb_labeled_state_free = &kgdb_labeled_state[i]; ++ } ++ ++} ++ ++static void * __init ++kgdb_malloc(void **mem) ++{ ++ void *p; ++ ++ p = *mem; ++ *mem = *((void **) p); ++ return p; ++} ++ ++static void __init ++kgdb_free(void **mem, void *p) ++{ ++ *((void **)p) = *mem; ++ *mem = p; ++} ++ ++#define alloc_reg_state() (!malloc_sizes[0].cs_cachep ? \ ++ kgdb_malloc(&kgdb_reg_state_free) : \ ++ kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC)) ++#define free_reg_state(usr) (!malloc_sizes[0].cs_cachep ? \ ++ kgdb_free(&kgdb_reg_state_free, usr) : \ ++ kfree(usr)) ++#define alloc_labeled_state() (!malloc_sizes[0].cs_cachep ? \ ++ kgdb_malloc(&kgdb_labeled_state_free) : \ ++ kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC)) ++#define free_labeled_state(usr) (!malloc_sizes[0].cs_cachep ? 
\ ++ kgdb_free(&kgdb_labeled_state_free, usr) : \ ++ kfree(usr)) ++ ++#else + #define alloc_reg_state() kmalloc(sizeof(struct unw_reg_state), GFP_ATOMIC) + #define free_reg_state(usr) kfree(usr) + #define alloc_labeled_state() kmalloc(sizeof(struct unw_labeled_state), GFP_ATOMIC) + #define free_labeled_state(usr) kfree(usr) ++#endif + + typedef unsigned long unw_word; + typedef unsigned char unw_hash_index_t; +@@ -238,6 +296,24 @@ static struct { + #endif + }; + ++#ifdef CONFIG_KGDB ++/* ++ * This makes it safe to call breakpoint() very early ++ * in setup_arch providing: ++ * 1) breakpoint isn't called between lines in cpu_init ++ * where init_mm.mm_count is incremented and ia64_mmu_init ++ * is called. Otherwise the test below is invalid. ++ * 2) the memory examined doesn't result in tlbmiss. ++ */ ++static unsigned long inline kgdb_unimpl_va_mask(void) ++{ ++ if (atomic_read(&init_mm.mm_count) > 1) ++ return local_cpu_data->unimpl_va_mask; ++ else ++ return 0UL; ++} ++#endif ++ + static inline int + read_only (void *addr) + { +@@ -1786,7 +1862,11 @@ run_script (struct unw_script *script, s + + case UNW_INSN_LOAD: + #ifdef UNW_DEBUG ++#ifdef CONFIG_KGDB ++ if ((s[val] & (kgdb_unimpl_va_mask() | 0x7)) != 0 ++#else + if ((s[val] & (local_cpu_data->unimpl_va_mask | 0x7)) != 0 ++#endif + || s[val] < TASK_SIZE) + { + UNW_DPRINT(0, "unwind.%s: rejecting bad psp=0x%lx\n", +@@ -1821,7 +1901,11 @@ find_save_locs (struct unw_frame_info *i + struct unw_script *scr; + unsigned long flags = 0; + ++#ifdef CONFIG_KGDB ++ if ((info->ip & (kgdb_unimpl_va_mask() | 0xf)) || info->ip < TASK_SIZE) { ++#else + if ((info->ip & (local_cpu_data->unimpl_va_mask | 0xf)) || info->ip < TASK_SIZE) { ++#endif + /* don't let obviously bad addresses pollute the cache */ + /* FIXME: should really be level 0 but it occurs too often. KAO */ + UNW_DPRINT(1, "unwind.%s: rejecting bad ip=0x%lx\n", __FUNCTION__, info->ip); +@@ -2249,6 +2333,9 @@ unw_init (void) + + init_unwind_table(&unw.kernel_table, "kernel", KERNEL_START, (unsigned long) __gp, + __start_unwind, __end_unwind); ++#ifdef CONFIG_KGDB ++ kgdb_malloc_init(); ++#endif + } + + /* +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/mm/extable.c linux-2.6.18.kgdb/arch/ia64/mm/extable.c +--- linux-2.6.18/arch/ia64/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/mm/extable.c 2008-06-10 16:19:32.000000000 +0400 +@@ -6,6 +6,7 @@ + */ + + #include ++#include + + #include + #include +@@ -73,6 +74,11 @@ search_extable (const struct exception_t + else + last = mid - 1; + } ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. 
*/ ++#endif + return NULL; + } + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ia64/mm/fault.c linux-2.6.18.kgdb/arch/ia64/mm/fault.c +--- linux-2.6.18/arch/ia64/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ia64/mm/fault.c 2008-06-10 16:19:32.000000000 +0400 +@@ -266,6 +266,10 @@ ia64_do_page_fault (unsigned long addres + */ + bust_spinlocks(1); + ++ if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs, ++ isr, 14, SIGSEGV) == NOTIFY_STOP) ++ return; ++ + if (address < PAGE_SIZE) + printk(KERN_ALERT "Unable to handle kernel NULL pointer dereference (address %016lx)\n", address); + else +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/Kconfig.debug linux-2.6.18.kgdb/arch/mips/Kconfig.debug +--- linux-2.6.18/arch/mips/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/Kconfig.debug 2008-06-10 16:19:28.000000000 +0400 +@@ -37,25 +37,6 @@ config DEBUG_STACK_USAGE + + This option will slow down process creation somewhat. + +-config KGDB +- bool "Remote GDB kernel debugging" +- depends on DEBUG_KERNEL +- select DEBUG_INFO +- help +- If you say Y here, it will be possible to remotely debug the MIPS +- kernel using gdb. This enlarges your kernel image disk size by +- several megabytes and requires a machine with more than 16 MB, +- better 32 MB RAM to avoid excessive linking time. This is only +- useful for kernel hackers. If unsure, say N. +- +-config GDB_CONSOLE +- bool "Console output to GDB" +- depends on KGDB +- help +- If you are using GDB for remote debugging over a serial port and +- would like kernel messages to be formatted into GDB $O packets so +- that GDB prints them as program output, say 'Y'. +- + config SB1XXX_CORELIS + bool "Corelis Debugger" + depends on SIBYTE_SB1xxx_SOC +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/Makefile linux-2.6.18.kgdb/arch/mips/kernel/Makefile +--- linux-2.6.18/arch/mips/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/kernel/Makefile 2008-06-10 16:19:28.000000000 +0400 +@@ -59,7 +59,8 @@ obj-$(CONFIG_MIPS32_COMPAT) += linux32.o + obj-$(CONFIG_MIPS32_N32) += binfmt_elfn32.o scall64-n32.o signal_n32.o + obj-$(CONFIG_MIPS32_O32) += binfmt_elfo32.o scall64-o32.o ptrace32.o + +-obj-$(CONFIG_KGDB) += gdb-low.o gdb-stub.o ++obj-$(CONFIG_KGDB) += kgdb_handler.o kgdb.o kgdb-jmp.o \ ++ kgdb-setjmp.o + obj-$(CONFIG_PROC_FS) += proc.o + + obj-$(CONFIG_64BIT) += cpu-bugs64.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/gdb-low.S linux-2.6.18.kgdb/arch/mips/kernel/gdb-low.S +--- linux-2.6.18/arch/mips/kernel/gdb-low.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/kernel/gdb-low.S 1970-01-01 03:00:00.000000000 +0300 +@@ -1,394 +0,0 @@ +-/* +- * gdb-low.S contains the low-level trap handler for the GDB stub. +- * +- * Copyright (C) 1995 Andreas Busse +- */ +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_32BIT +-#define DMFC0 mfc0 +-#define DMTC0 mtc0 +-#define LDC1 lwc1 +-#define SDC1 lwc1 +-#endif +-#ifdef CONFIG_64BIT +-#define DMFC0 dmfc0 +-#define DMTC0 dmtc0 +-#define LDC1 ldc1 +-#define SDC1 ldc1 +-#endif +- +-/* +- * [jsun] We reserves about 2x GDB_FR_SIZE in stack. The lower (addressed) +- * part is used to store registers and passed to exception handler. +- * The upper part is reserved for "call func" feature where gdb client +- * saves some of the regs, setups call frame and passes args. 
+- * +- * A trace shows about 200 bytes are used to store about half of all regs. +- * The rest should be big enough for frame setup and passing args. +- */ +- +-/* +- * The low level trap handler +- */ +- .align 5 +- NESTED(trap_low, GDB_FR_SIZE, sp) +- .set noat +- .set noreorder +- +- mfc0 k0, CP0_STATUS +- sll k0, 3 /* extract cu0 bit */ +- bltz k0, 1f +- move k1, sp +- +- /* +- * Called from user mode, go somewhere else. +- */ +- mfc0 k0, CP0_CAUSE +- andi k0, k0, 0x7c +-#ifdef CONFIG_64BIT +- dsll k0, k0, 1 +-#endif +- PTR_L k1, saved_vectors(k0) +- jr k1 +- nop +-1: +- move k0, sp +- PTR_SUBU sp, k1, GDB_FR_SIZE*2 # see comment above +- LONG_S k0, GDB_FR_REG29(sp) +- LONG_S $2, GDB_FR_REG2(sp) +- +-/* +- * First save the CP0 and special registers +- */ +- +- mfc0 v0, CP0_STATUS +- LONG_S v0, GDB_FR_STATUS(sp) +- mfc0 v0, CP0_CAUSE +- LONG_S v0, GDB_FR_CAUSE(sp) +- DMFC0 v0, CP0_EPC +- LONG_S v0, GDB_FR_EPC(sp) +- DMFC0 v0, CP0_BADVADDR +- LONG_S v0, GDB_FR_BADVADDR(sp) +- mfhi v0 +- LONG_S v0, GDB_FR_HI(sp) +- mflo v0 +- LONG_S v0, GDB_FR_LO(sp) +- +-/* +- * Now the integer registers +- */ +- +- LONG_S zero, GDB_FR_REG0(sp) /* I know... */ +- LONG_S $1, GDB_FR_REG1(sp) +- /* v0 already saved */ +- LONG_S $3, GDB_FR_REG3(sp) +- LONG_S $4, GDB_FR_REG4(sp) +- LONG_S $5, GDB_FR_REG5(sp) +- LONG_S $6, GDB_FR_REG6(sp) +- LONG_S $7, GDB_FR_REG7(sp) +- LONG_S $8, GDB_FR_REG8(sp) +- LONG_S $9, GDB_FR_REG9(sp) +- LONG_S $10, GDB_FR_REG10(sp) +- LONG_S $11, GDB_FR_REG11(sp) +- LONG_S $12, GDB_FR_REG12(sp) +- LONG_S $13, GDB_FR_REG13(sp) +- LONG_S $14, GDB_FR_REG14(sp) +- LONG_S $15, GDB_FR_REG15(sp) +- LONG_S $16, GDB_FR_REG16(sp) +- LONG_S $17, GDB_FR_REG17(sp) +- LONG_S $18, GDB_FR_REG18(sp) +- LONG_S $19, GDB_FR_REG19(sp) +- LONG_S $20, GDB_FR_REG20(sp) +- LONG_S $21, GDB_FR_REG21(sp) +- LONG_S $22, GDB_FR_REG22(sp) +- LONG_S $23, GDB_FR_REG23(sp) +- LONG_S $24, GDB_FR_REG24(sp) +- LONG_S $25, GDB_FR_REG25(sp) +- LONG_S $26, GDB_FR_REG26(sp) +- LONG_S $27, GDB_FR_REG27(sp) +- LONG_S $28, GDB_FR_REG28(sp) +- /* sp already saved */ +- LONG_S $30, GDB_FR_REG30(sp) +- LONG_S $31, GDB_FR_REG31(sp) +- +- CLI /* disable interrupts */ +- TRACE_IRQS_OFF +- +-/* +- * Followed by the floating point registers +- */ +- mfc0 v0, CP0_STATUS /* FPU enabled? 
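
The cu0/cu1 tests in trap_low reduce to simple mask checks on the CP0 Status value. A minimal C restatement of the FPU test that the srl/andi pair below performs, assuming the usual MIPS ST0_CU1 definition (Status bit 29); this is a readability sketch, not code from the patch:

#include <stdio.h>

#define ST0_CU1 0x20000000      /* Status bit 29: coprocessor 1 (FPU) usable */

/* Equivalent of the "srl v0, v0, 16; andi v0, (ST0_CU1 >> 16)" sequence. */
static int fpu_enabled(unsigned int cp0_status)
{
        return ((cp0_status >> 16) & (ST0_CU1 >> 16)) != 0;
}

int main(void)
{
        printf("%d\n", fpu_enabled(0x20000000));  /* 1: save/restore FP regs */
        printf("%d\n", fpu_enabled(0x00000000));  /* 0: skip the FP block */
        return 0;
}
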
*/ +- srl v0, v0, 16 +- andi v0, v0, (ST0_CU1 >> 16) +- +- beqz v0,2f /* disabled, skip */ +- nop +- +- SDC1 $0, GDB_FR_FPR0(sp) +- SDC1 $1, GDB_FR_FPR1(sp) +- SDC1 $2, GDB_FR_FPR2(sp) +- SDC1 $3, GDB_FR_FPR3(sp) +- SDC1 $4, GDB_FR_FPR4(sp) +- SDC1 $5, GDB_FR_FPR5(sp) +- SDC1 $6, GDB_FR_FPR6(sp) +- SDC1 $7, GDB_FR_FPR7(sp) +- SDC1 $8, GDB_FR_FPR8(sp) +- SDC1 $9, GDB_FR_FPR9(sp) +- SDC1 $10, GDB_FR_FPR10(sp) +- SDC1 $11, GDB_FR_FPR11(sp) +- SDC1 $12, GDB_FR_FPR12(sp) +- SDC1 $13, GDB_FR_FPR13(sp) +- SDC1 $14, GDB_FR_FPR14(sp) +- SDC1 $15, GDB_FR_FPR15(sp) +- SDC1 $16, GDB_FR_FPR16(sp) +- SDC1 $17, GDB_FR_FPR17(sp) +- SDC1 $18, GDB_FR_FPR18(sp) +- SDC1 $19, GDB_FR_FPR19(sp) +- SDC1 $20, GDB_FR_FPR20(sp) +- SDC1 $21, GDB_FR_FPR21(sp) +- SDC1 $22, GDB_FR_FPR22(sp) +- SDC1 $23, GDB_FR_FPR23(sp) +- SDC1 $24, GDB_FR_FPR24(sp) +- SDC1 $25, GDB_FR_FPR25(sp) +- SDC1 $26, GDB_FR_FPR26(sp) +- SDC1 $27, GDB_FR_FPR27(sp) +- SDC1 $28, GDB_FR_FPR28(sp) +- SDC1 $29, GDB_FR_FPR29(sp) +- SDC1 $30, GDB_FR_FPR30(sp) +- SDC1 $31, GDB_FR_FPR31(sp) +- +-/* +- * FPU control registers +- */ +- +- cfc1 v0, CP1_STATUS +- LONG_S v0, GDB_FR_FSR(sp) +- cfc1 v0, CP1_REVISION +- LONG_S v0, GDB_FR_FIR(sp) +- +-/* +- * Current stack frame ptr +- */ +- +-2: +- LONG_S sp, GDB_FR_FRP(sp) +- +-/* +- * CP0 registers (R4000/R4400 unused registers skipped) +- */ +- +- mfc0 v0, CP0_INDEX +- LONG_S v0, GDB_FR_CP0_INDEX(sp) +- mfc0 v0, CP0_RANDOM +- LONG_S v0, GDB_FR_CP0_RANDOM(sp) +- DMFC0 v0, CP0_ENTRYLO0 +- LONG_S v0, GDB_FR_CP0_ENTRYLO0(sp) +- DMFC0 v0, CP0_ENTRYLO1 +- LONG_S v0, GDB_FR_CP0_ENTRYLO1(sp) +- DMFC0 v0, CP0_CONTEXT +- LONG_S v0, GDB_FR_CP0_CONTEXT(sp) +- mfc0 v0, CP0_PAGEMASK +- LONG_S v0, GDB_FR_CP0_PAGEMASK(sp) +- mfc0 v0, CP0_WIRED +- LONG_S v0, GDB_FR_CP0_WIRED(sp) +- DMFC0 v0, CP0_ENTRYHI +- LONG_S v0, GDB_FR_CP0_ENTRYHI(sp) +- mfc0 v0, CP0_PRID +- LONG_S v0, GDB_FR_CP0_PRID(sp) +- +- .set at +- +-/* +- * Continue with the higher level handler +- */ +- +- move a0,sp +- +- jal handle_exception +- nop +- +-/* +- * Restore all writable registers, in reverse order +- */ +- +- .set noat +- +- LONG_L v0, GDB_FR_CP0_ENTRYHI(sp) +- LONG_L v1, GDB_FR_CP0_WIRED(sp) +- DMTC0 v0, CP0_ENTRYHI +- mtc0 v1, CP0_WIRED +- LONG_L v0, GDB_FR_CP0_PAGEMASK(sp) +- LONG_L v1, GDB_FR_CP0_ENTRYLO1(sp) +- mtc0 v0, CP0_PAGEMASK +- DMTC0 v1, CP0_ENTRYLO1 +- LONG_L v0, GDB_FR_CP0_ENTRYLO0(sp) +- LONG_L v1, GDB_FR_CP0_INDEX(sp) +- DMTC0 v0, CP0_ENTRYLO0 +- LONG_L v0, GDB_FR_CP0_CONTEXT(sp) +- mtc0 v1, CP0_INDEX +- DMTC0 v0, CP0_CONTEXT +- +- +-/* +- * Next, the floating point registers +- */ +- mfc0 v0, CP0_STATUS /* check if the FPU is enabled */ +- srl v0, v0, 16 +- andi v0, v0, (ST0_CU1 >> 16) +- +- beqz v0, 3f /* disabled, skip */ +- nop +- +- LDC1 $31, GDB_FR_FPR31(sp) +- LDC1 $30, GDB_FR_FPR30(sp) +- LDC1 $29, GDB_FR_FPR29(sp) +- LDC1 $28, GDB_FR_FPR28(sp) +- LDC1 $27, GDB_FR_FPR27(sp) +- LDC1 $26, GDB_FR_FPR26(sp) +- LDC1 $25, GDB_FR_FPR25(sp) +- LDC1 $24, GDB_FR_FPR24(sp) +- LDC1 $23, GDB_FR_FPR23(sp) +- LDC1 $22, GDB_FR_FPR22(sp) +- LDC1 $21, GDB_FR_FPR21(sp) +- LDC1 $20, GDB_FR_FPR20(sp) +- LDC1 $19, GDB_FR_FPR19(sp) +- LDC1 $18, GDB_FR_FPR18(sp) +- LDC1 $17, GDB_FR_FPR17(sp) +- LDC1 $16, GDB_FR_FPR16(sp) +- LDC1 $15, GDB_FR_FPR15(sp) +- LDC1 $14, GDB_FR_FPR14(sp) +- LDC1 $13, GDB_FR_FPR13(sp) +- LDC1 $12, GDB_FR_FPR12(sp) +- LDC1 $11, GDB_FR_FPR11(sp) +- LDC1 $10, GDB_FR_FPR10(sp) +- LDC1 $9, GDB_FR_FPR9(sp) +- LDC1 $8, GDB_FR_FPR8(sp) +- LDC1 $7, GDB_FR_FPR7(sp) +- LDC1 $6, GDB_FR_FPR6(sp) +- LDC1 $5, GDB_FR_FPR5(sp) +- LDC1 $4, 
GDB_FR_FPR4(sp) +- LDC1 $3, GDB_FR_FPR3(sp) +- LDC1 $2, GDB_FR_FPR2(sp) +- LDC1 $1, GDB_FR_FPR1(sp) +- LDC1 $0, GDB_FR_FPR0(sp) +- +-/* +- * Now the CP0 and integer registers +- */ +- +-3: +-#ifdef CONFIG_MIPS_MT_SMTC +- /* Read-modify write of Status must be atomic */ +- mfc0 t2, CP0_TCSTATUS +- ori t1, t2, TCSTATUS_IXMT +- mtc0 t1, CP0_TCSTATUS +- andi t2, t2, TCSTATUS_IXMT +- _ehb +- DMT 9 # dmt t1 +- jal mips_ihb +- nop +-#endif /* CONFIG_MIPS_MT_SMTC */ +- mfc0 t0, CP0_STATUS +- ori t0, 0x1f +- xori t0, 0x1f +- mtc0 t0, CP0_STATUS +-#ifdef CONFIG_MIPS_MT_SMTC +- andi t1, t1, VPECONTROL_TE +- beqz t1, 9f +- nop +- EMT # emt +-9: +- mfc0 t1, CP0_TCSTATUS +- xori t1, t1, TCSTATUS_IXMT +- or t1, t1, t2 +- mtc0 t1, CP0_TCSTATUS +- _ehb +-#endif /* CONFIG_MIPS_MT_SMTC */ +- LONG_L v0, GDB_FR_STATUS(sp) +- LONG_L v1, GDB_FR_EPC(sp) +- mtc0 v0, CP0_STATUS +- DMTC0 v1, CP0_EPC +- LONG_L v0, GDB_FR_HI(sp) +- LONG_L v1, GDB_FR_LO(sp) +- mthi v0 +- mtlo v1 +- LONG_L $31, GDB_FR_REG31(sp) +- LONG_L $30, GDB_FR_REG30(sp) +- LONG_L $28, GDB_FR_REG28(sp) +- LONG_L $27, GDB_FR_REG27(sp) +- LONG_L $26, GDB_FR_REG26(sp) +- LONG_L $25, GDB_FR_REG25(sp) +- LONG_L $24, GDB_FR_REG24(sp) +- LONG_L $23, GDB_FR_REG23(sp) +- LONG_L $22, GDB_FR_REG22(sp) +- LONG_L $21, GDB_FR_REG21(sp) +- LONG_L $20, GDB_FR_REG20(sp) +- LONG_L $19, GDB_FR_REG19(sp) +- LONG_L $18, GDB_FR_REG18(sp) +- LONG_L $17, GDB_FR_REG17(sp) +- LONG_L $16, GDB_FR_REG16(sp) +- LONG_L $15, GDB_FR_REG15(sp) +- LONG_L $14, GDB_FR_REG14(sp) +- LONG_L $13, GDB_FR_REG13(sp) +- LONG_L $12, GDB_FR_REG12(sp) +- LONG_L $11, GDB_FR_REG11(sp) +- LONG_L $10, GDB_FR_REG10(sp) +- LONG_L $9, GDB_FR_REG9(sp) +- LONG_L $8, GDB_FR_REG8(sp) +- LONG_L $7, GDB_FR_REG7(sp) +- LONG_L $6, GDB_FR_REG6(sp) +- LONG_L $5, GDB_FR_REG5(sp) +- LONG_L $4, GDB_FR_REG4(sp) +- LONG_L $3, GDB_FR_REG3(sp) +- LONG_L $2, GDB_FR_REG2(sp) +- LONG_L $1, GDB_FR_REG1(sp) +-#if defined(CONFIG_CPU_R3000) || defined(CONFIG_CPU_TX39XX) +- LONG_L k0, GDB_FR_EPC(sp) +- LONG_L $29, GDB_FR_REG29(sp) /* Deallocate stack */ +- jr k0 +- rfe +-#else +- LONG_L sp, GDB_FR_REG29(sp) /* Deallocate stack */ +- +- .set mips3 +- eret +- .set mips0 +-#endif +- .set at +- .set reorder +- END(trap_low) +- +-LEAF(kgdb_read_byte) +-4: lb t0, (a0) +- sb t0, (a1) +- li v0, 0 +- jr ra +- .section __ex_table,"a" +- PTR 4b, kgdbfault +- .previous +- END(kgdb_read_byte) +- +-LEAF(kgdb_write_byte) +-5: sb a0, (a1) +- li v0, 0 +- jr ra +- .section __ex_table,"a" +- PTR 5b, kgdbfault +- .previous +- END(kgdb_write_byte) +- +- .type kgdbfault@function +- .ent kgdbfault +- +-kgdbfault: li v0, -EFAULT +- jr ra +- .end kgdbfault +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/gdb-stub.c linux-2.6.18.kgdb/arch/mips/kernel/gdb-stub.c +--- linux-2.6.18/arch/mips/kernel/gdb-stub.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/kernel/gdb-stub.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,1154 +0,0 @@ +-/* +- * arch/mips/kernel/gdb-stub.c +- * +- * Originally written by Glenn Engel, Lake Stevens Instrument Division +- * +- * Contributed by HP Systems +- * +- * Modified for SPARC by Stu Grossman, Cygnus Support. +- * +- * Modified for Linux/MIPS (and MIPS in general) by Andreas Busse +- * Send complaints, suggestions etc. to +- * +- * Copyright (C) 1995 Andreas Busse +- * +- * Copyright (C) 2003 MontaVista Software Inc. +- * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net +- */ +- +-/* +- * To enable debugger support, two things need to happen. 
One, a +- * call to set_debug_traps() is necessary in order to allow any breakpoints +- * or error conditions to be properly intercepted and reported to gdb. +- * Two, a breakpoint needs to be generated to begin communication. This +- * is most easily accomplished by a call to breakpoint(). Breakpoint() +- * simulates a breakpoint by executing a BREAK instruction. +- * +- * +- * The following gdb commands are supported: +- * +- * command function Return value +- * +- * g return the value of the CPU registers hex data or ENN +- * G set the value of the CPU registers OK or ENN +- * +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN +- * +- * c Resume at current address SNN ( signal NN) +- * cAA..AA Continue at address AA..AA SNN +- * +- * s Step one instruction SNN +- * sAA..AA Step one instruction from AA..AA SNN +- * +- * k kill +- * +- * ? What was the last sigval ? SNN (signal NN) +- * +- * bBB..BB Set baud rate to BB..BB OK or BNN, then sets +- * baud rate +- * +- * All commands and responses are sent with a packet which includes a +- * checksum. A packet consists of +- * +- * $#. +- * +- * where +- * :: +- * :: < two hex digits computed as modulo 256 sum of > +- * +- * When a packet is received, it is first acknowledged with either '+' or '-'. +- * '+' indicates a successful transfer. '-' indicates a failed transfer. +- * +- * Example: +- * +- * Host: Reply: +- * $m0,10#2a +$00010203040506070809101112131415#42 +- * +- * +- * ============== +- * MORE EXAMPLES: +- * ============== +- * +- * For reference -- the following are the steps that one +- * company took (RidgeRun Inc) to get remote gdb debugging +- * going. In this scenario the host machine was a PC and the +- * target platform was a Galileo EVB64120A MIPS evaluation +- * board. +- * +- * Step 1: +- * First download gdb-5.0.tar.gz from the internet. +- * and then build/install the package. +- * +- * Example: +- * $ tar zxf gdb-5.0.tar.gz +- * $ cd gdb-5.0 +- * $ ./configure --target=mips-linux-elf +- * $ make +- * $ install +- * $ which mips-linux-elf-gdb +- * /usr/local/bin/mips-linux-elf-gdb +- * +- * Step 2: +- * Configure linux for remote debugging and build it. +- * +- * Example: +- * $ cd ~/linux +- * $ make menuconfig +- * $ make +- * +- * Step 3: +- * Download the kernel to the remote target and start +- * the kernel running. It will promptly halt and wait +- * for the host gdb session to connect. It does this +- * since the "Kernel Hacking" option has defined +- * CONFIG_KGDB which in turn enables your calls +- * to: +- * set_debug_traps(); +- * breakpoint(); +- * +- * Step 4: +- * Start the gdb session on the host. +- * +- * Example: +- * $ mips-linux-elf-gdb vmlinux +- * (gdb) set remotebaud 115200 +- * (gdb) target remote /dev/ttyS1 +- * ...at this point you are connected to +- * the remote target and can use gdb +- * in the normal fasion. Setting +- * breakpoints, single stepping, +- * printing variables, etc. 
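
The packet framing described above is easy to verify by hand. Here is a small, self-contained sketch (not part of the patch) that computes the modulo-256 checksum and reproduces the "$m0,10#2a" example from the comment:

#include <stdio.h>

/* Checksum rule from the comment above: the modulo-256 sum of the
 * payload bytes between '$' and '#', sent as two hex digits. */
static unsigned char gdb_checksum(const char *payload)
{
        unsigned char sum = 0;

        while (*payload)
                sum += (unsigned char)*payload++;
        return sum;
}

int main(void)
{
        const char *payload = "m0,10";  /* read 0x10 bytes at address 0 */
        printf("$%s#%02x\n", payload, gdb_checksum(payload));  /* $m0,10#2a */
        return 0;
}
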
+- */ +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * external low-level support routines +- */ +- +-extern int putDebugChar(char c); /* write a single character */ +-extern char getDebugChar(void); /* read and return a single char */ +-extern void trap_low(void); +- +-/* +- * breakpoint and test functions +- */ +-extern void breakpoint(void); +-extern void breakinst(void); +-extern void async_breakpoint(void); +-extern void async_breakinst(void); +-extern void adel(void); +- +-/* +- * local prototypes +- */ +- +-static void getpacket(char *buffer); +-static void putpacket(char *buffer); +-static int computeSignal(int tt); +-static int hex(unsigned char ch); +-static int hexToInt(char **ptr, int *intValue); +-static int hexToLong(char **ptr, long *longValue); +-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault); +-void handle_exception(struct gdb_regs *regs); +- +-int kgdb_enabled; +- +-/* +- * spin locks for smp case +- */ +-static DEFINE_SPINLOCK(kgdb_lock); +-static raw_spinlock_t kgdb_cpulock[NR_CPUS] = { +- [0 ... NR_CPUS-1] = __RAW_SPIN_LOCK_UNLOCKED, +-}; +- +-/* +- * BUFMAX defines the maximum number of characters in inbound/outbound buffers +- * at least NUMREGBYTES*2 are needed for register packets +- */ +-#define BUFMAX 2048 +- +-static char input_buffer[BUFMAX]; +-static char output_buffer[BUFMAX]; +-static int initialized; /* !0 means we've been initialized */ +-static int kgdb_started; +-static const char hexchars[]="0123456789abcdef"; +- +-/* Used to prevent crashes in memory access. Note that they'll crash anyway if +- we haven't set up fault handlers yet... */ +-int kgdb_read_byte(unsigned char *address, unsigned char *dest); +-int kgdb_write_byte(unsigned char val, unsigned char *dest); +- +-/* +- * Convert ch from a hex digit to an int +- */ +-static int hex(unsigned char ch) +-{ +- if (ch >= 'a' && ch <= 'f') +- return ch-'a'+10; +- if (ch >= '0' && ch <= '9') +- return ch-'0'; +- if (ch >= 'A' && ch <= 'F') +- return ch-'A'+10; +- return -1; +-} +- +-/* +- * scan for the sequence $# +- */ +-static void getpacket(char *buffer) +-{ +- unsigned char checksum; +- unsigned char xmitcsum; +- int i; +- int count; +- unsigned char ch; +- +- do { +- /* +- * wait around for the start character, +- * ignore all other characters +- */ +- while ((ch = (getDebugChar() & 0x7f)) != '$') ; +- +- checksum = 0; +- xmitcsum = -1; +- count = 0; +- +- /* +- * now, read until a # or end of buffer is found +- */ +- while (count < BUFMAX) { +- ch = getDebugChar(); +- if (ch == '#') +- break; +- checksum = checksum + ch; +- buffer[count] = ch; +- count = count + 1; +- } +- +- if (count >= BUFMAX) +- continue; +- +- buffer[count] = 0; +- +- if (ch == '#') { +- xmitcsum = hex(getDebugChar() & 0x7f) << 4; +- xmitcsum |= hex(getDebugChar() & 0x7f); +- +- if (checksum != xmitcsum) +- putDebugChar('-'); /* failed checksum */ +- else { +- putDebugChar('+'); /* successful transfer */ +- +- /* +- * if a sequence char is present, +- * reply the sequence ID +- */ +- if (buffer[2] == ':') { +- putDebugChar(buffer[0]); +- putDebugChar(buffer[1]); +- +- /* +- * remove sequence chars from buffer +- */ +- count = strlen(buffer); +- for (i=3; i <= count; i++) +- buffer[i-3] = buffer[i]; +- } +- } +- } +- } +- while (checksum != xmitcsum); +-} +- +-/* +- * send the packet in buffer. 
+- */ +-static void putpacket(char *buffer) +-{ +- unsigned char checksum; +- int count; +- unsigned char ch; +- +- /* +- * $#. +- */ +- +- do { +- putDebugChar('$'); +- checksum = 0; +- count = 0; +- +- while ((ch = buffer[count]) != 0) { +- if (!(putDebugChar(ch))) +- return; +- checksum += ch; +- count += 1; +- } +- +- putDebugChar('#'); +- putDebugChar(hexchars[checksum >> 4]); +- putDebugChar(hexchars[checksum & 0xf]); +- +- } +- while ((getDebugChar() & 0x7f) != '+'); +-} +- +- +-/* +- * Convert the memory pointed to by mem into hex, placing result in buf. +- * Return a pointer to the last char put in buf (null), in case of mem fault, +- * return 0. +- * may_fault is non-zero if we are reading from arbitrary memory, but is currently +- * not used. +- */ +-static unsigned char *mem2hex(char *mem, char *buf, int count, int may_fault) +-{ +- unsigned char ch; +- +- while (count-- > 0) { +- if (kgdb_read_byte(mem++, &ch) != 0) +- return 0; +- *buf++ = hexchars[ch >> 4]; +- *buf++ = hexchars[ch & 0xf]; +- } +- +- *buf = 0; +- +- return buf; +-} +- +-/* +- * convert the hex array pointed to by buf into binary to be placed in mem +- * return a pointer to the character AFTER the last byte written +- * may_fault is non-zero if we are reading from arbitrary memory, but is currently +- * not used. +- */ +-static char *hex2mem(char *buf, char *mem, int count, int binary, int may_fault) +-{ +- int i; +- unsigned char ch; +- +- for (i=0; itt && ht->signo; ht++) +- saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low); +- +- putDebugChar('+'); /* 'hello world' */ +- /* +- * In case GDB is started before us, ack any packets +- * (presumably "$?#xx") sitting there. +- */ +- while((c = getDebugChar()) != '$'); +- while((c = getDebugChar()) != '#'); +- c = getDebugChar(); /* eat first csum byte */ +- c = getDebugChar(); /* eat second csum byte */ +- putDebugChar('+'); /* ack it */ +- +- initialized = 1; +- local_irq_restore(flags); +-} +- +-void restore_debug_traps(void) +-{ +- struct hard_trap_info *ht; +- unsigned long flags; +- +- local_irq_save(flags); +- for (ht = hard_trap_info; ht->tt && ht->signo; ht++) +- set_except_vector(ht->tt, saved_vectors[ht->tt]); +- local_irq_restore(flags); +-} +- +-/* +- * Convert the MIPS hardware trap type code to a Unix signal number. +- */ +-static int computeSignal(int tt) +-{ +- struct hard_trap_info *ht; +- +- for (ht = hard_trap_info; ht->tt && ht->signo; ht++) +- if (ht->tt == tt) +- return ht->signo; +- +- return SIGHUP; /* default for things we don't know about */ +-} +- +-/* +- * While we find nice hex chars, build an int. +- * Return number of chars processed. +- */ +-static int hexToInt(char **ptr, int *intValue) +-{ +- int numChars = 0; +- int hexValue; +- +- *intValue = 0; +- +- while (**ptr) { +- hexValue = hex(**ptr); +- if (hexValue < 0) +- break; +- +- *intValue = (*intValue << 4) | hexValue; +- numChars ++; +- +- (*ptr)++; +- } +- +- return (numChars); +-} +- +-static int hexToLong(char **ptr, long *longValue) +-{ +- int numChars = 0; +- int hexValue; +- +- *longValue = 0; +- +- while (**ptr) { +- hexValue = hex(**ptr); +- if (hexValue < 0) +- break; +- +- *longValue = (*longValue << 4) | hexValue; +- numChars ++; +- +- (*ptr)++; +- } +- +- return numChars; +-} +- +- +-#if 0 +-/* +- * Print registers (on target console) +- * Used only to debug the stub... 
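
For reference, the mem2hex() helper above is the workhorse for every register and memory reply: each source byte becomes two hex digits. A stripped-down user-space version of the same logic, minus the fault handling (illustrative only):

#include <stdio.h>

static const char hexchars[] = "0123456789abcdef";

/* Same conversion as the stub's mem2hex(): emit two hex digits per
 * byte and NUL-terminate the output buffer. */
static char *mem2hex(const unsigned char *mem, char *buf, int count)
{
        while (count-- > 0) {
                unsigned char ch = *mem++;

                *buf++ = hexchars[ch >> 4];
                *buf++ = hexchars[ch & 0xf];
        }
        *buf = 0;
        return buf;
}

int main(void)
{
        unsigned char data[] = { 0xde, 0xad, 0xbe, 0xef };
        char buf[2 * sizeof(data) + 1];

        mem2hex(data, buf, sizeof(data));
        printf("%s\n", buf);    /* deadbeef */
        return 0;
}
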
+- */ +-void show_gdbregs(struct gdb_regs * regs) +-{ +- /* +- * Saved main processor registers +- */ +- printk("$0 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg0, regs->reg1, regs->reg2, regs->reg3, +- regs->reg4, regs->reg5, regs->reg6, regs->reg7); +- printk("$8 : %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg8, regs->reg9, regs->reg10, regs->reg11, +- regs->reg12, regs->reg13, regs->reg14, regs->reg15); +- printk("$16: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg16, regs->reg17, regs->reg18, regs->reg19, +- regs->reg20, regs->reg21, regs->reg22, regs->reg23); +- printk("$24: %08lx %08lx %08lx %08lx %08lx %08lx %08lx %08lx\n", +- regs->reg24, regs->reg25, regs->reg26, regs->reg27, +- regs->reg28, regs->reg29, regs->reg30, regs->reg31); +- +- /* +- * Saved cp0 registers +- */ +- printk("epc : %08lx\nStatus: %08lx\nCause : %08lx\n", +- regs->cp0_epc, regs->cp0_status, regs->cp0_cause); +-} +-#endif /* dead code */ +- +-/* +- * We single-step by setting breakpoints. When an exception +- * is handled, we need to restore the instructions hoisted +- * when the breakpoints were set. +- * +- * This is where we save the original instructions. +- */ +-static struct gdb_bp_save { +- unsigned long addr; +- unsigned int val; +-} step_bp[2]; +- +-#define BP 0x0000000d /* break opcode */ +- +-/* +- * Set breakpoint instructions for single stepping. +- */ +-static void single_step(struct gdb_regs *regs) +-{ +- union mips_instruction insn; +- unsigned long targ; +- int is_branch, is_cond, i; +- +- targ = regs->cp0_epc; +- insn.word = *(unsigned int *)targ; +- is_branch = is_cond = 0; +- +- switch (insn.i_format.opcode) { +- /* +- * jr and jalr are in r_format format. +- */ +- case spec_op: +- switch (insn.r_format.func) { +- case jalr_op: +- case jr_op: +- targ = *(®s->reg0 + insn.r_format.rs); +- is_branch = 1; +- break; +- } +- break; +- +- /* +- * This group contains: +- * bltz_op, bgez_op, bltzl_op, bgezl_op, +- * bltzal_op, bgezal_op, bltzall_op, bgezall_op. +- */ +- case bcond_op: +- is_branch = is_cond = 1; +- targ += 4 + (insn.i_format.simmediate << 2); +- break; +- +- /* +- * These are unconditional and in j_format. +- */ +- case jal_op: +- case j_op: +- is_branch = 1; +- targ += 4; +- targ >>= 28; +- targ <<= 28; +- targ |= (insn.j_format.target << 2); +- break; +- +- /* +- * These are conditional. +- */ +- case beq_op: +- case beql_op: +- case bne_op: +- case bnel_op: +- case blez_op: +- case blezl_op: +- case bgtz_op: +- case bgtzl_op: +- case cop0_op: +- case cop1_op: +- case cop2_op: +- case cop1x_op: +- is_branch = is_cond = 1; +- targ += 4 + (insn.i_format.simmediate << 2); +- break; +- } +- +- if (is_branch) { +- i = 0; +- if (is_cond && targ != (regs->cp0_epc + 8)) { +- step_bp[i].addr = regs->cp0_epc + 8; +- step_bp[i++].val = *(unsigned *)(regs->cp0_epc + 8); +- *(unsigned *)(regs->cp0_epc + 8) = BP; +- } +- step_bp[i].addr = targ; +- step_bp[i].val = *(unsigned *)targ; +- *(unsigned *)targ = BP; +- } else { +- step_bp[0].addr = regs->cp0_epc + 4; +- step_bp[0].val = *(unsigned *)(regs->cp0_epc + 4); +- *(unsigned *)(regs->cp0_epc + 4) = BP; +- } +-} +- +-/* +- * If asynchronously interrupted by gdb, then we need to set a breakpoint +- * at the interrupted instruction so that we wind up stopped with a +- * reasonable stack frame. +- */ +-static struct gdb_bp_save async_bp; +- +-/* +- * Swap the interrupted EPC with our asynchronous breakpoint routine. 
+- * This is safer than stuffing the breakpoint in-place, since no cache +- * flushes (or resulting smp_call_functions) are required. The +- * assumption is that only one CPU will be handling asynchronous bp's, +- * and only one can be active at a time. +- */ +-extern spinlock_t smp_call_lock; +- +-void set_async_breakpoint(unsigned long *epc) +-{ +- /* skip breaking into userland */ +- if ((*epc & 0x80000000) == 0) +- return; +- +-#ifdef CONFIG_SMP +- /* avoid deadlock if someone is make IPC */ +- if (spin_is_locked(&smp_call_lock)) +- return; +-#endif +- +- async_bp.addr = *epc; +- *epc = (unsigned long)async_breakpoint; +-} +- +-static void kgdb_wait(void *arg) +-{ +- unsigned flags; +- int cpu = smp_processor_id(); +- +- local_irq_save(flags); +- +- __raw_spin_lock(&kgdb_cpulock[cpu]); +- __raw_spin_unlock(&kgdb_cpulock[cpu]); +- +- local_irq_restore(flags); +-} +- +-/* +- * GDB stub needs to call kgdb_wait on all processor with interrupts +- * disabled, so it uses it's own special variant. +- */ +-static int kgdb_smp_call_kgdb_wait(void) +-{ +-#ifdef CONFIG_SMP +- struct call_data_struct data; +- int i, cpus = num_online_cpus() - 1; +- int cpu = smp_processor_id(); +- +- /* +- * Can die spectacularly if this CPU isn't yet marked online +- */ +- BUG_ON(!cpu_online(cpu)); +- +- if (!cpus) +- return 0; +- +- if (spin_is_locked(&smp_call_lock)) { +- /* +- * Some other processor is trying to make us do something +- * but we're not going to respond... give up +- */ +- return -1; +- } +- +- /* +- * We will continue here, accepting the fact that +- * the kernel may deadlock if another CPU attempts +- * to call smp_call_function now... +- */ +- +- data.func = kgdb_wait; +- data.info = NULL; +- atomic_set(&data.started, 0); +- data.wait = 0; +- +- spin_lock(&smp_call_lock); +- call_data = &data; +- mb(); +- +- /* Send a message to all other CPUs and wait for them to respond */ +- for (i = 0; i < NR_CPUS; i++) +- if (cpu_online(i) && i != cpu) +- core_send_ipi(i, SMP_CALL_FUNCTION); +- +- /* Wait for response */ +- /* FIXME: lock-up detection, backtrace on lock-up */ +- while (atomic_read(&data.started) != cpus) +- barrier(); +- +- call_data = NULL; +- spin_unlock(&smp_call_lock); +-#endif +- +- return 0; +-} +- +-/* +- * This function does all command processing for interfacing to gdb. It +- * returns 1 if you should skip the instruction at the trap address, 0 +- * otherwise. +- */ +-void handle_exception (struct gdb_regs *regs) +-{ +- int trap; /* Trap type */ +- int sigval; +- long addr; +- int length; +- char *ptr; +- unsigned long *stack; +- int i; +- int bflag = 0; +- +- kgdb_started = 1; +- +- /* +- * acquire the big kgdb spinlock +- */ +- if (!spin_trylock(&kgdb_lock)) { +- /* +- * some other CPU has the lock, we should go back to +- * receive the gdb_wait IPC +- */ +- return; +- } +- +- /* +- * If we're in async_breakpoint(), restore the real EPC from +- * the breakpoint. 
+- */ +- if (regs->cp0_epc == (unsigned long)async_breakinst) { +- regs->cp0_epc = async_bp.addr; +- async_bp.addr = 0; +- } +- +- /* +- * acquire the CPU spinlocks +- */ +- for (i = num_online_cpus()-1; i >= 0; i--) +- if (__raw_spin_trylock(&kgdb_cpulock[i]) == 0) +- panic("kgdb: couldn't get cpulock %d\n", i); +- +- /* +- * force other cpus to enter kgdb +- */ +- kgdb_smp_call_kgdb_wait(); +- +- /* +- * If we're in breakpoint() increment the PC +- */ +- trap = (regs->cp0_cause & 0x7c) >> 2; +- if (trap == 9 && regs->cp0_epc == (unsigned long)breakinst) +- regs->cp0_epc += 4; +- +- /* +- * If we were single_stepping, restore the opcodes hoisted +- * for the breakpoint[s]. +- */ +- if (step_bp[0].addr) { +- *(unsigned *)step_bp[0].addr = step_bp[0].val; +- step_bp[0].addr = 0; +- +- if (step_bp[1].addr) { +- *(unsigned *)step_bp[1].addr = step_bp[1].val; +- step_bp[1].addr = 0; +- } +- } +- +- stack = (long *)regs->reg29; /* stack ptr */ +- sigval = computeSignal(trap); +- +- /* +- * reply to host that an exception has occurred +- */ +- ptr = output_buffer; +- +- /* +- * Send trap type (converted to signal) +- */ +- *ptr++ = 'T'; +- *ptr++ = hexchars[sigval >> 4]; +- *ptr++ = hexchars[sigval & 0xf]; +- +- /* +- * Send Error PC +- */ +- *ptr++ = hexchars[REG_EPC >> 4]; +- *ptr++ = hexchars[REG_EPC & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->cp0_epc, ptr, sizeof(long), 0); +- *ptr++ = ';'; +- +- /* +- * Send frame pointer +- */ +- *ptr++ = hexchars[REG_FP >> 4]; +- *ptr++ = hexchars[REG_FP & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->reg30, ptr, sizeof(long), 0); +- *ptr++ = ';'; +- +- /* +- * Send stack pointer +- */ +- *ptr++ = hexchars[REG_SP >> 4]; +- *ptr++ = hexchars[REG_SP & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->reg29, ptr, sizeof(long), 0); +- *ptr++ = ';'; +- +- *ptr++ = 0; +- putpacket(output_buffer); /* send it off... 
*/ +- +- /* +- * Wait for input from remote GDB +- */ +- while (1) { +- output_buffer[0] = 0; +- getpacket(input_buffer); +- +- switch (input_buffer[0]) +- { +- case '?': +- output_buffer[0] = 'S'; +- output_buffer[1] = hexchars[sigval >> 4]; +- output_buffer[2] = hexchars[sigval & 0xf]; +- output_buffer[3] = 0; +- break; +- +- /* +- * Detach debugger; let CPU run +- */ +- case 'D': +- putpacket(output_buffer); +- goto finish_kgdb; +- break; +- +- case 'd': +- /* toggle debug flag */ +- break; +- +- /* +- * Return the value of the CPU registers +- */ +- case 'g': +- ptr = output_buffer; +- ptr = mem2hex((char *)®s->reg0, ptr, 32*sizeof(long), 0); /* r0...r31 */ +- ptr = mem2hex((char *)®s->cp0_status, ptr, 6*sizeof(long), 0); /* cp0 */ +- ptr = mem2hex((char *)®s->fpr0, ptr, 32*sizeof(long), 0); /* f0...31 */ +- ptr = mem2hex((char *)®s->cp1_fsr, ptr, 2*sizeof(long), 0); /* cp1 */ +- ptr = mem2hex((char *)®s->frame_ptr, ptr, 2*sizeof(long), 0); /* frp */ +- ptr = mem2hex((char *)®s->cp0_index, ptr, 16*sizeof(long), 0); /* cp0 */ +- break; +- +- /* +- * set the value of the CPU registers - return OK +- */ +- case 'G': +- { +- ptr = &input_buffer[1]; +- hex2mem(ptr, (char *)®s->reg0, 32*sizeof(long), 0, 0); +- ptr += 32*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->cp0_status, 6*sizeof(long), 0, 0); +- ptr += 6*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->fpr0, 32*sizeof(long), 0, 0); +- ptr += 32*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->cp1_fsr, 2*sizeof(long), 0, 0); +- ptr += 2*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->frame_ptr, 2*sizeof(long), 0, 0); +- ptr += 2*(2*sizeof(long)); +- hex2mem(ptr, (char *)®s->cp0_index, 16*sizeof(long), 0, 0); +- strcpy(output_buffer,"OK"); +- } +- break; +- +- /* +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA +- */ +- case 'm': +- ptr = &input_buffer[1]; +- +- if (hexToLong(&ptr, &addr) +- && *ptr++ == ',' +- && hexToInt(&ptr, &length)) { +- if (mem2hex((char *)addr, output_buffer, length, 1)) +- break; +- strcpy (output_buffer, "E03"); +- } else +- strcpy(output_buffer,"E01"); +- break; +- +- /* +- * XAA..AA,LLLL: Write LLLL escaped binary bytes at address AA.AA +- */ +- case 'X': +- bflag = 1; +- /* fall through */ +- +- /* +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK +- */ +- case 'M': +- ptr = &input_buffer[1]; +- +- if (hexToLong(&ptr, &addr) +- && *ptr++ == ',' +- && hexToInt(&ptr, &length) +- && *ptr++ == ':') { +- if (hex2mem(ptr, (char *)addr, length, bflag, 1)) +- strcpy(output_buffer, "OK"); +- else +- strcpy(output_buffer, "E03"); +- } +- else +- strcpy(output_buffer, "E02"); +- break; +- +- /* +- * cAA..AA Continue at address AA..AA(optional) +- */ +- case 'c': +- /* try to read optional parameter, pc unchanged if no parm */ +- +- ptr = &input_buffer[1]; +- if (hexToLong(&ptr, &addr)) +- regs->cp0_epc = addr; +- +- goto exit_kgdb_exception; +- break; +- +- /* +- * kill the program; let us try to restart the machine +- * Reset the whole machine. +- */ +- case 'k': +- case 'r': +- machine_restart("kgdb restarts machine"); +- break; +- +- /* +- * Step to next instruction +- */ +- case 's': +- /* +- * There is no single step insn in the MIPS ISA, so we +- * use breakpoints and continue, instead. 
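
The mechanics behind this are the step_bp[] save/patch/restore cycle shown earlier. Reduced to its essentials (a sketch only; patching real kernel text on MIPS also requires a cache flush):

#include <stdio.h>
#include <stddef.h>

#define BP 0x0000000d   /* MIPS "break" opcode, as in the stub above */

struct bp_save {
        unsigned int *addr;
        unsigned int val;
};

/* Remember the original word, then overwrite it with a breakpoint. */
static void plant_bp(struct bp_save *bp, unsigned int *insn)
{
        bp->addr = insn;
        bp->val = *insn;
        *insn = BP;
}

/* Put the saved word back, as handle_exception() does on re-entry. */
static void restore_bp(struct bp_save *bp)
{
        *bp->addr = bp->val;
        bp->addr = NULL;
}

int main(void)
{
        unsigned int text[1] = { 0x24020001 };  /* addiu v0, zero, 1 */
        struct bp_save bp;

        plant_bp(&bp, &text[0]);
        printf("patched:  %08x\n", text[0]);
        restore_bp(&bp);
        printf("restored: %08x\n", text[0]);
        return 0;
}
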
+- */ +- single_step(regs); +- goto exit_kgdb_exception; +- /* NOTREACHED */ +- break; +- +- /* +- * Set baud rate (bBB) +- * FIXME: Needs to be written +- */ +- case 'b': +- { +-#if 0 +- int baudrate; +- extern void set_timer_3(); +- +- ptr = &input_buffer[1]; +- if (!hexToInt(&ptr, &baudrate)) +- { +- strcpy(output_buffer,"B01"); +- break; +- } +- +- /* Convert baud rate to uart clock divider */ +- +- switch (baudrate) +- { +- case 38400: +- baudrate = 16; +- break; +- case 19200: +- baudrate = 33; +- break; +- case 9600: +- baudrate = 65; +- break; +- default: +- baudrate = 0; +- strcpy(output_buffer,"B02"); +- goto x1; +- } +- +- if (baudrate) { +- putpacket("OK"); /* Ack before changing speed */ +- set_timer_3(baudrate); /* Set it */ +- } +-#endif +- } +- break; +- +- } /* switch */ +- +- /* +- * reply to the request +- */ +- +- putpacket(output_buffer); +- +- } /* while */ +- +- return; +- +-finish_kgdb: +- restore_debug_traps(); +- +-exit_kgdb_exception: +- /* release locks so other CPUs can go */ +- for (i = num_online_cpus()-1; i >= 0; i--) +- __raw_spin_unlock(&kgdb_cpulock[i]); +- spin_unlock(&kgdb_lock); +- +- __flush_cache_all(); +- return; +-} +- +-/* +- * This function will generate a breakpoint exception. It is used at the +- * beginning of a program to sync up with a debugger and can be used +- * otherwise as a quick means to stop program execution and "break" into +- * the debugger. +- */ +-void breakpoint(void) +-{ +- if (!initialized) +- return; +- +- __asm__ __volatile__( +- ".globl breakinst\n\t" +- ".set\tnoreorder\n\t" +- "nop\n" +- "breakinst:\tbreak\n\t" +- "nop\n\t" +- ".set\treorder" +- ); +-} +- +-/* Nothing but the break; don't pollute any registers */ +-void async_breakpoint(void) +-{ +- __asm__ __volatile__( +- ".globl async_breakinst\n\t" +- ".set\tnoreorder\n\t" +- "nop\n" +- "async_breakinst:\tbreak\n\t" +- "nop\n\t" +- ".set\treorder" +- ); +-} +- +-void adel(void) +-{ +- __asm__ __volatile__( +- ".globl\tadel\n\t" +- "lui\t$8,0x8000\n\t" +- "lw\t$9,1($8)\n\t" +- ); +-} +- +-/* +- * malloc is needed by gdb client in "call func()", even a private one +- * will make gdb happy +- */ +-static void * __attribute_used__ malloc(size_t size) +-{ +- return kmalloc(size, GFP_ATOMIC); +-} +- +-static void __attribute_used__ free (void *where) +-{ +- kfree(where); +-} +- +-#ifdef CONFIG_GDB_CONSOLE +- +-void gdb_putsn(const char *str, int l) +-{ +- char outbuf[18]; +- +- if (!kgdb_started) +- return; +- +- outbuf[0]='O'; +- +- while(l) { +- int i = (l>8)?8:l; +- mem2hex((char *)str, &outbuf[1], i, 0); +- outbuf[(i*2)+1]=0; +- putpacket(outbuf); +- str += i; +- l -= i; +- } +-} +- +-static void gdb_console_write(struct console *con, const char *s, unsigned n) +-{ +- gdb_putsn(s, n); +-} +- +-static struct console gdb_console = { +- .name = "gdb", +- .write = gdb_console_write, +- .flags = CON_PRINTBUFFER, +- .index = -1 +-}; +- +-static int __init register_gdb_console(void) +-{ +- register_console(&gdb_console); +- +- return 0; +-} +- +-console_initcall(register_gdb_console); +- +-#endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/irq.c linux-2.6.18.kgdb/arch/mips/kernel/irq.c +--- linux-2.6.18/arch/mips/kernel/irq.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/kernel/irq.c 2008-06-10 16:19:28.000000000 +0400 +@@ -25,6 +25,10 @@ + #include + #include + #include ++#include ++ ++/* Keep track of if we've done certain initialization already or not. 
*/ ++int kgdb_early_setup; + + /* + * 'what should we do if we get a hw irq event on an illegal vector'. +@@ -115,23 +119,13 @@ asmlinkage void spurious_interrupt(struc + atomic_inc(&irq_err_count); + } + +-#ifdef CONFIG_KGDB +-extern void breakpoint(void); +-extern void set_debug_traps(void); +- +-static int kgdb_flag = 1; +-static int __init nokgdb(char *str) +-{ +- kgdb_flag = 0; +- return 1; +-} +-__setup("nokgdb", nokgdb); +-#endif +- + void __init init_IRQ(void) + { + int i; + ++ if (kgdb_early_setup) ++ return; ++ + for (i = 0; i < NR_IRQS; i++) { + irq_desc[i].status = IRQ_DISABLED; + irq_desc[i].action = NULL; +@@ -144,12 +138,12 @@ void __init init_IRQ(void) + } + + arch_init_irq(); +- + #ifdef CONFIG_KGDB +- if (kgdb_flag) { +- printk("Wait for gdb client connection ...\n"); +- set_debug_traps(); +- breakpoint(); +- } ++ /* ++ * We have been called before kgdb_arch_init(). Hence, ++ * we dont want the traps to be reinitialized ++ */ ++ if (kgdb_early_setup == 0) ++ kgdb_early_setup = 1; + #endif + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb-jmp.c linux-2.6.18.kgdb/arch/mips/kernel/kgdb-jmp.c +--- linux-2.6.18/arch/mips/kernel/kgdb-jmp.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb-jmp.c 2008-06-10 16:19:28.000000000 +0400 +@@ -0,0 +1,116 @@ ++/* ++ * arch/mips/kernel/kgdb-jmp.c ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Author: Tom Rini ++ * Author: Manish Lachwani ++ * ++ * Cribbed from glibc, which carries the following: ++ * Copyright (C) 1996, 1997, 2000, 2002, 2003 Free Software Foundation, Inc. ++ * Copyright (C) 2005 by MontaVista Software. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. 
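
Functionally, the pair implemented below is a hand-rolled setjmp/longjmp keyed off kgdb_may_fault: the exception-table fixup paths patched earlier in this series call kgdb_fault_longjmp() to unwind a faulting memory probe. A user-space analogue of that control flow (probe_byte and the variable names are illustrative, not from the patch):

#include <setjmp.h>
#include <stdio.h>

static jmp_buf fault_jmp;       /* plays the role of kgdb_fault_jmp_regs */
static volatile int may_fault;  /* plays the role of kgdb_may_fault */

/* What the fixup path does when a probe was armed: unwind back to
 * the probe site instead of letting the fault take down the kernel. */
static void fault_handler(void)
{
        if (may_fault)
                longjmp(fault_jmp, 1);
}

/* Probe one byte; a fault lands us back here with a -1 result. */
static int probe_byte(const unsigned char *addr, unsigned char *dest)
{
        if (setjmp(fault_jmp)) {        /* mirrors kgdb_fault_setjmp() */
                may_fault = 0;
                return -1;
        }
        may_fault = 1;
        *dest = *addr;                  /* the access that may fault */
        may_fault = 0;
        return 0;
}

int main(void)
{
        unsigned char ok = 42, out;

        (void)fault_handler;    /* would be wired into the fault path */
        if (probe_byte(&ok, &out) == 0)
                printf("read %u\n", out);
        return 0;
}
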
++ */ ++ ++#include ++#include ++ ++#ifdef CONFIG_MIPS64 ++/* ++ * MIPS 64-bit ++ */ ++ ++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp) ++{ ++ __asm__ __volatile__ ("sd $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__ ("sd $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__ ("sd $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__ ("sd $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__ ("sd $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__ ("sd $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__ ("sd $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__ ("sd $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__ ("sd $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__ ("sd $31, %0" : : "m" (curr_context[9])); ++ curr_context[10] = (long *)sp; ++ curr_context[11] = (long *)fp; ++ ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++ unsigned long sp_val, fp_val; ++ ++ __asm__ __volatile__ ("ld $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__ ("ld $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__ ("ld $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__ ("ld $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__ ("ld $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__ ("ld $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__ ("ld $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__ ("ld $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__ ("ld $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__ ("ld $25, %0" : : "m" (curr_context[9])); ++ sp_val = curr_context[10]; ++ fp_val = curr_context[11]; ++ __asm__ __volatile__ ("ld $29, %0\n\t" ++ "ld $30, %1\n\t" : : "m" (sp_val), "m" (fp_val)); ++ ++ __asm__ __volatile__ ("dli $2, 1"); ++ __asm__ __volatile__ ("j $25"); ++ ++ for (;;); ++} ++#else ++/* ++ * MIPS 32-bit ++ */ ++ ++int kgdb_fault_setjmp_aux(unsigned long *curr_context, int sp, int fp) ++{ ++ __asm__ __volatile__("sw $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__("sw $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__("sw $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__("sw $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__("sw $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__("sw $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__("sw $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__("sw $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__("sw $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__("sw $31, %0" : : "m" (curr_context[9])); ++ curr_context[10] = (long *)sp; ++ curr_context[11] = (long *)fp; ++ ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++ unsigned long sp_val, fp_val; ++ ++ __asm__ __volatile__("lw $gp, %0" : : "m" (curr_context[0])); ++ __asm__ __volatile__("lw $16, %0" : : "m" (curr_context[1])); ++ __asm__ __volatile__("lw $17, %0" : : "m" (curr_context[2])); ++ __asm__ __volatile__("lw $18, %0" : : "m" (curr_context[3])); ++ __asm__ __volatile__("lw $19, %0" : : "m" (curr_context[4])); ++ __asm__ __volatile__("lw $20, %0" : : "m" (curr_context[5])); ++ __asm__ __volatile__("lw $21, %0" : : "m" (curr_context[6])); ++ __asm__ __volatile__("lw $22, %0" : : "m" (curr_context[7])); ++ __asm__ __volatile__("lw $23, %0" : : "m" (curr_context[8])); ++ __asm__ __volatile__("lw $25, %0" : : "m" (curr_context[9])); ++ sp_val = curr_context[10]; 
++ fp_val = curr_context[11]; ++ __asm__ __volatile__("lw $29, %0\n\t" ++ "lw $30, %1\n\t" : : "m" (sp_val), "m" (fp_val)); ++ ++ __asm__ __volatile__("li $2, 1"); ++ __asm__ __volatile__("jr $25"); ++ ++ for (;;); ++} ++#endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb-setjmp.S linux-2.6.18.kgdb/arch/mips/kernel/kgdb-setjmp.S +--- linux-2.6.18/arch/mips/kernel/kgdb-setjmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb-setjmp.S 2008-06-10 16:19:28.000000000 +0400 +@@ -0,0 +1,28 @@ ++/* ++ * arch/mips/kernel/kgdb-jmp.c ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Copyright (C) 2005 by MontaVista Software. ++ * Author: Manish Lachwani (mlachwani@mvista.com) ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++ .ent kgdb_fault_setjmp,0 ++ENTRY (kgdb_fault_setjmp) ++ move a1, sp ++ move a2, fp ++#ifdef CONFIG_MIPS64 ++ nop ++#endif ++ j kgdb_fault_setjmp_aux ++ .end kgdb_fault_setjmp +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb.c linux-2.6.18.kgdb/arch/mips/kernel/kgdb.c +--- linux-2.6.18/arch/mips/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb.c 2008-06-10 16:19:28.000000000 +0400 +@@ -0,0 +1,297 @@ ++/* ++ * arch/mips/kernel/kgdb.c ++ * ++ * Originally written by Glenn Engel, Lake Stevens Instrument Division ++ * ++ * Contributed by HP Systems ++ * ++ * Modified for SPARC by Stu Grossman, Cygnus Support. ++ * ++ * Modified for Linux/MIPS (and MIPS in general) by Andreas Busse ++ * Send complaints, suggestions etc. to ++ * ++ * Copyright (C) 1995 Andreas Busse ++ * ++ * Copyright (C) 2003 MontaVista Software Inc. ++ * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net ++ * ++ * Copyright (C) 2004-2005 MontaVista Software Inc. ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++static struct hard_trap_info { ++ unsigned char tt; /* Trap type code for MIPS R3xxx and R4xxx */ ++ unsigned char signo; /* Signal that we map this trap into */ ++} hard_trap_info[] = { ++ { 6, SIGBUS }, /* instruction bus error */ ++ { 7, SIGBUS }, /* data bus error */ ++ { 9, SIGTRAP }, /* break */ ++/* { 11, SIGILL }, */ /* CPU unusable */ ++ { 12, SIGFPE }, /* overflow */ ++ { 13, SIGTRAP }, /* trap */ ++ { 14, SIGSEGV }, /* virtual instruction cache coherency */ ++ { 15, SIGFPE }, /* floating point exception */ ++ { 23, SIGSEGV }, /* watch */ ++ { 31, SIGSEGV }, /* virtual data cache coherency */ ++ { 0, 0} /* Must be last */ ++}; ++ ++/* Save the normal trap handlers for user-mode traps. 
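
The "(cp0_cause & 0x7c) >> 2" expression used throughout these handlers extracts the ExcCode field (Cause register bits 6..2), which is then looked up in the hard_trap_info table above. A one-line demonstration:

#include <stdio.h>

/* ExcCode lives in bits 6..2 of the CP0 Cause register. */
static int exc_code(unsigned int cp0_cause)
{
        return (cp0_cause & 0x7c) >> 2;
}

int main(void)
{
        /* ExcCode 9 is "break"; hard_trap_info maps it to SIGTRAP. */
        printf("trap=%d\n", exc_code(9 << 2));
        return 0;
}
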
*/ ++void *saved_vectors[32]; ++ ++extern void trap_low(void); ++extern void breakinst(void); ++extern void init_IRQ(void); ++ ++void kgdb_call_nmi_hook(void *ignored) ++{ ++ kgdb_nmihook(smp_processor_id(), (void *)0); ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ local_irq_restore(flags); ++ smp_call_function(kgdb_call_nmi_hook, 0, 0, 0); ++ local_irq_save(flags); ++} ++ ++static int compute_signal(int tt) ++{ ++ struct hard_trap_info *ht; ++ ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ if (ht->tt == tt) ++ return ht->signo; ++ ++ return SIGHUP; /* default for things we don't know about */ ++} ++ ++/* ++ * Set up exception handlers for tracing and breakpoints ++ */ ++void handle_exception(struct pt_regs *regs) ++{ ++ int trap = (regs->cp0_cause & 0x7c) >> 2; ++ ++ if (fixup_exception(regs)) { ++ return; ++ } ++ ++ if (atomic_read(&debugger_active)) ++ kgdb_nmihook(smp_processor_id(), regs); ++ ++ if (atomic_read(&kgdb_setting_breakpoint)) ++ if ((trap == 9) && (regs->cp0_epc == (unsigned long)breakinst)) ++ regs->cp0_epc += 4; ++ ++ kgdb_handle_exception(0, compute_signal(trap), 0, regs); ++ ++ /* In SMP mode, __flush_cache_all does IPI */ ++ __flush_cache_all(); ++} ++ ++void set_debug_traps(void) ++{ ++ struct hard_trap_info *ht; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ saved_vectors[ht->tt] = set_except_vector(ht->tt, trap_low); ++ ++ local_irq_restore(flags); ++} ++ ++#if 0 ++/* This should be called before we exit kgdb_handle_exception() I believe. ++ * -- Tom ++ */ ++void restore_debug_traps(void) ++{ ++ struct hard_trap_info *ht; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ set_except_vector(ht->tt, saved_vectors[ht->tt]); ++ local_irq_restore(flags); ++} ++#endif ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ int reg; ++ gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs; ++ ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ *(ptr++) = regs->cp0_status; ++ *(ptr++) = regs->lo; ++ *(ptr++) = regs->hi; ++ *(ptr++) = regs->cp0_badvaddr; ++ *(ptr++) = regs->cp0_cause; ++ *(ptr++) = regs->cp0_epc; ++ ++ return; ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ ++ int reg; ++ const gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs; ++ ++ for (reg = 0; reg < 32; reg++) ++ regs->regs[reg] = *(ptr++); ++ ++ regs->cp0_status = *(ptr++); ++ regs->lo = *(ptr++); ++ regs->hi = *(ptr++); ++ regs->cp0_badvaddr = *(ptr++); ++ regs->cp0_cause = *(ptr++); ++ regs->cp0_epc = *(ptr++); ++ ++ return; ++} ++ ++/* ++ * Similar to regs_to_gdb_regs() except that process is sleeping and so ++ * we may not be able to get all the info. 
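
regs_to_gdb_regs() above fixes an implied layout for the flat gdb_regs buffer: 32 general-purpose registers followed by six CP0/special values in the order written. Spelled out as an enum (the names here are illustrative, not identifiers from the patch):

/* Offsets into the gdb_regs buffer packed by regs_to_gdb_regs(). */
enum mips_gdb_regmap {
        GDB_FIRST_GPR = 0,      /* $0 .. $31 */
        GDB_LAST_GPR  = 31,
        GDB_STATUS    = 32,     /* cp0_status */
        GDB_LO        = 33,
        GDB_HI        = 34,
        GDB_BADVADDR  = 35,     /* cp0_badvaddr */
        GDB_CAUSE     = 36,     /* cp0_cause */
        GDB_EPC       = 37,     /* cp0_epc */
        GDB_NUM_REGS  = 38,
};
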
++ */ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ int reg; ++ struct thread_info *ti = p->thread_info; ++ unsigned long ksp = (unsigned long)ti + THREAD_SIZE - 32; ++ struct pt_regs *regs = (struct pt_regs *)ksp - 1; ++ gdb_reg_t *ptr = (gdb_reg_t*)gdb_regs; ++ ++ for (reg = 0; reg < 16; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ /* S0 - S7 */ ++ for (reg = 16; reg < 24; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ for (reg = 24; reg < 28; reg++) ++ *(ptr++) = 0; ++ ++ /* GP, SP, FP, RA */ ++ for (reg = 28; reg < 32; reg++) ++ *(ptr++) = regs->regs[reg]; ++ ++ *(ptr++) = regs->cp0_status; ++ *(ptr++) = regs->lo; ++ *(ptr++) = regs->hi; ++ *(ptr++) = regs->cp0_badvaddr; ++ *(ptr++) = regs->cp0_cause; ++ *(ptr++) = regs->cp0_epc; ++ ++ return; ++} ++ ++/* ++ * Calls linux_debug_hook before the kernel dies. If KGDB is enabled, ++ * then try to fall into the debugger ++ */ ++static int kgdb_mips_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = (struct die_args *)ptr; ++ struct pt_regs *regs = args->regs; ++ int trap = (regs->cp0_cause & 0x7c) >> 2; ++ ++ /* See if KGDB is interested. */ ++ if (user_mode(regs)) ++ /* Userpace events, ignore. */ ++ return NOTIFY_DONE; ++ ++ kgdb_handle_exception(trap, compute_signal(trap), 0, regs); ++ return NOTIFY_OK; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_mips_notify, ++}; ++ ++/* ++ * Handle the 's' and 'c' commands ++ */ ++int kgdb_arch_handle_exception(int vector, int signo, int err_code, ++ char *remcom_in_buffer, char *remcom_out_buffer, ++ struct pt_regs *regs) ++{ ++ char *ptr; ++ unsigned long address; ++ int cpu = smp_processor_id(); ++ ++ switch (remcom_in_buffer[0]) { ++ case 's': ++ case 'c': ++ /* handle the optional parameter */ ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &address)) ++ regs->cp0_epc = address; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ if (remcom_in_buffer[0] == 's') ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, cpu); ++ ++ return 0; ++ } ++ ++ return -1; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++#ifdef CONFIG_CPU_LITTLE_ENDIAN ++ .gdb_bpt_instr = {0xd}, ++#else ++ .gdb_bpt_instr = {0x00, 0x00, 0x00, 0x0d}, ++#endif ++}; ++ ++/* ++ * We use kgdb_early_setup so that functions we need to call now don't ++ * cause trouble when called again later. ++ */ ++int kgdb_arch_init(void) ++{ ++ /* Board-specifics. */ ++ /* Force some calls to happen earlier. */ ++ if (kgdb_early_setup == 0) { ++ trap_init(); ++ init_IRQ(); ++ kgdb_early_setup = 1; ++ } ++ ++ /* Set our traps. */ ++ /* This needs to be done more finely grained again, paired in ++ * a before/after in kgdb_handle_exception(...) -- Tom */ ++ set_debug_traps(); ++ notifier_chain_register(&mips_die_chain, &kgdb_notifier); ++ ++ return 0; ++} +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/kgdb_handler.S linux-2.6.18.kgdb/arch/mips/kernel/kgdb_handler.S +--- linux-2.6.18/arch/mips/kernel/kgdb_handler.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/mips/kernel/kgdb_handler.S 2008-06-10 16:19:28.000000000 +0400 +@@ -0,0 +1,57 @@ ++/* ++ * arch/mips/kernel/kgdb_handler.S ++ * ++ * Copyright (C) 2004-2005 MontaVista Software Inc. ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * This file is licensed under the terms of the GNU General Public ++ * version 2. 
This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++/* ++ * Trap Handler for the new KGDB framework. The main KGDB handler is ++ * handle_exception that will be called from here ++ * ++ */ ++ ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++ .align 5 ++ NESTED(trap_low, PT_SIZE, sp) ++ .set noat ++ .set noreorder ++ ++ /* ++ * Check for privileged instructions in user mode. For ++ * this, check the cu0 bit in the CPU status register. ++ */ ++ mfc0 k0, CP0_STATUS ++ sll k0, 3 ++ bltz k0, 1f ++ move k1, sp ++ ++ /* ++ * GDB userland from within KGDB. If a user mode address ++ * then jump to the saved exception handler ++ */ ++ mfc0 k1, CP0_CAUSE ++ andi k1, k1, 0x7c ++ PTR_L k0, saved_vectors(k1) ++ jr k0 ++ nop ++1: ++ SAVE_ALL ++ .set at ++ .set reorder ++ move a0, sp ++ jal handle_exception ++ j ret_from_exception ++ END(trap_low) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/kernel/traps.c linux-2.6.18.kgdb/arch/mips/kernel/traps.c +--- linux-2.6.18/arch/mips/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/kernel/traps.c 2008-06-10 16:19:28.000000000 +0400 +@@ -10,6 +10,8 @@ + * Kevin D. Kissell, kevink@mips.com and Carsten Langgaard, carstenl@mips.com + * Copyright (C) 2000, 01 MIPS Technologies, Inc. + * Copyright (C) 2002, 2003, 2004, 2005 Maciej W. Rozycki ++ * ++ * KGDB specific changes - Manish Lachwani (mlachwani@mvista.com) + */ + #include + #include +@@ -20,6 +22,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -40,6 +43,7 @@ + #include + #include + #include ++#include + + extern asmlinkage void handle_int(void); + extern asmlinkage void handle_tlbm(void); +@@ -78,6 +82,21 @@ void (*board_bind_eic_interrupt)(int irq + */ + #define MODULE_RANGE (8*1024*1024) + ++struct notifier_block *mips_die_chain; ++static spinlock_t die_notifier_lock = SPIN_LOCK_UNLOCKED; ++ ++int register_die_notifier(struct notifier_block *nb) ++{ ++ int err = 0; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&die_notifier_lock, flags); ++ err = notifier_chain_register(&mips_die_chain, nb); ++ spin_unlock_irqrestore(&die_notifier_lock, flags); ++ ++ return err; ++} ++ + /* + * This routine abuses get_user()/put_user() to reference pointers + * with at least a bit of error checking ... 
+@@ -1387,6 +1406,11 @@ void __init trap_init(void) + extern char except_vec4; + unsigned long i; + ++#if defined(CONFIG_KGDB) ++ if (kgdb_early_setup) ++ return; /* Already done */ ++#endif ++ + if (cpu_has_veic || cpu_has_vint) + ebase = (unsigned long) alloc_bootmem_low_pages (0x200 + VECTORSPACING*64); + else +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mips-boards/generic/Makefile linux-2.6.18.kgdb/arch/mips/mips-boards/generic/Makefile +--- linux-2.6.18/arch/mips/mips-boards/generic/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/mips-boards/generic/Makefile 2008-06-10 16:19:28.000000000 +0400 +@@ -21,6 +21,5 @@ + obj-y := reset.o display.o init.o memory.o printf.o \ + cmdline.o time.o + obj-$(CONFIG_PCI) += pci.o +-obj-$(CONFIG_KGDB) += gdb_hook.o + + EXTRA_AFLAGS := $(CFLAGS) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mips-boards/generic/init.c linux-2.6.18.kgdb/arch/mips/mips-boards/generic/init.c +--- linux-2.6.18/arch/mips/mips-boards/generic/init.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/mips-boards/generic/init.c 2008-06-10 16:19:28.000000000 +0400 +@@ -37,15 +37,6 @@ + + #include + +-#ifdef CONFIG_KGDB +-extern int rs_kgdb_hook(int, int); +-extern int rs_putDebugChar(char); +-extern char rs_getDebugChar(void); +-extern int saa9730_kgdb_hook(int); +-extern int saa9730_putDebugChar(char); +-extern char saa9730_getDebugChar(void); +-#endif +- + int prom_argc; + int *_prom_argv, *_prom_envp; + +@@ -172,58 +163,6 @@ static void __init console_config(void) + } + #endif + +-#ifdef CONFIG_KGDB +-void __init kgdb_config (void) +-{ +- extern int (*generic_putDebugChar)(char); +- extern char (*generic_getDebugChar)(void); +- char *argptr; +- int line, speed; +- +- argptr = prom_getcmdline(); +- if ((argptr = strstr(argptr, "kgdb=ttyS")) != NULL) { +- argptr += strlen("kgdb=ttyS"); +- if (*argptr != '0' && *argptr != '1') +- printk("KGDB: Unknown serial line /dev/ttyS%c, " +- "falling back to /dev/ttyS1\n", *argptr); +- line = *argptr == '0' ? 0 : 1; +- printk("KGDB: Using serial line /dev/ttyS%d for session\n", line); +- +- speed = 0; +- if (*++argptr == ',') +- { +- int c; +- while ((c = *++argptr) && ('0' <= c && c <= '9')) +- speed = speed * 10 + c - '0'; +- } +-#ifdef CONFIG_MIPS_ATLAS +- if (line == 1) { +- speed = saa9730_kgdb_hook(speed); +- generic_putDebugChar = saa9730_putDebugChar; +- generic_getDebugChar = saa9730_getDebugChar; +- } +- else +-#endif +- { +- speed = rs_kgdb_hook(line, speed); +- generic_putDebugChar = rs_putDebugChar; +- generic_getDebugChar = rs_getDebugChar; +- } +- +- prom_printf("KGDB: Using serial line /dev/ttyS%d at %d for session, " +- "please connect your debugger\n", line ? 
1 : 0, speed); +- +- { +- char *s; +- for (s = "Please connect GDB to this port\r\n"; *s; ) +- generic_putDebugChar (*s++); +- } +- +- /* Breakpoint is invoked after interrupts are initialised */ +- } +-} +-#endif +- + void __init mips_nmi_setup (void) + { + void *base; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mips-boards/malta/malta_setup.c linux-2.6.18.kgdb/arch/mips/mips-boards/malta/malta_setup.c +--- linux-2.6.18/arch/mips/mips-boards/malta/malta_setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/mips-boards/malta/malta_setup.c 2008-06-10 16:19:28.000000000 +0400 +@@ -46,10 +46,6 @@ extern void mips_reboot_setup(void); + extern void mips_time_init(void); + extern unsigned long mips_rtc_get_time(void); + +-#ifdef CONFIG_KGDB +-extern void kgdb_config(void); +-#endif +- + struct resource standard_io_resources[] = { + { .name = "dma1", .start = 0x00, .end = 0x1f, .flags = IORESOURCE_BUSY }, + { .name = "timer", .start = 0x40, .end = 0x5f, .flags = IORESOURCE_BUSY }, +@@ -124,10 +120,6 @@ void __init plat_mem_setup(void) + */ + enable_dma(4); + +-#ifdef CONFIG_KGDB +- kgdb_config (); +-#endif +- + if ((mips_revision_corid == MIPS_REVISION_CORID_BONITO64) || + (mips_revision_corid == MIPS_REVISION_CORID_CORE_20K) || + (mips_revision_corid == MIPS_REVISION_CORID_CORE_EMUL_BON)) { +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/mm/extable.c linux-2.6.18.kgdb/arch/mips/mm/extable.c +--- linux-2.6.18/arch/mips/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/mm/extable.c 2008-06-10 16:19:28.000000000 +0400 +@@ -3,6 +3,7 @@ + */ + #include + #include ++#include + #include + #include + +@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs + + return 1; + } ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. */ ++#endif + + return 0; + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/cfe/setup.c linux-2.6.18.kgdb/arch/mips/sibyte/cfe/setup.c +--- linux-2.6.18/arch/mips/sibyte/cfe/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/sibyte/cfe/setup.c 2008-06-10 16:19:28.000000000 +0400 +@@ -58,10 +58,6 @@ int cfe_cons_handle; + extern unsigned long initrd_start, initrd_end; + #endif + +-#ifdef CONFIG_KGDB +-extern int kgdb_port; +-#endif +- + static void ATTRIB_NORET cfe_linux_exit(void *arg) + { + int warm = *(int *)arg; +@@ -242,9 +238,6 @@ void __init prom_init(void) + int argc = fw_arg0; + char **envp = (char **) fw_arg2; + int *prom_vec = (int *) fw_arg3; +-#ifdef CONFIG_KGDB +- char *arg; +-#endif + + _machine_restart = cfe_linux_restart; + _machine_halt = cfe_linux_halt; +@@ -308,13 +301,6 @@ void __init prom_init(void) + } + } + +-#ifdef CONFIG_KGDB +- if ((arg = strstr(arcs_cmdline,"kgdb=duart")) != NULL) +- kgdb_port = (arg[10] == '0') ? 
0 : 1; +- else +- kgdb_port = 1; +-#endif +- + #ifdef CONFIG_BLK_DEV_INITRD + { + char *ptr; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/sb1250/Makefile linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/Makefile +--- linux-2.6.18/arch/mips/sibyte/sb1250/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/Makefile 2008-06-10 16:19:28.000000000 +0400 +@@ -4,5 +4,6 @@ obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_SIBYTE_TBPROF) += bcm1250_tbprof.o + obj-$(CONFIG_SIBYTE_STANDALONE) += prom.o + obj-$(CONFIG_SIBYTE_BUS_WATCHER) += bus_watcher.o ++obj-$(CONFIG_KGDB_SIBYTE) += kgdb_sibyte.o + + EXTRA_AFLAGS := $(CFLAGS) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/sb1250/irq.c linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/irq.c +--- linux-2.6.18/arch/mips/sibyte/sb1250/irq.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/irq.c 2008-06-10 16:19:28.000000000 +0400 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -59,16 +60,6 @@ static void sb1250_set_affinity(unsigned + extern unsigned long ldt_eoi_space; + #endif + +-#ifdef CONFIG_KGDB +-static int kgdb_irq; +- +-/* Default to UART1 */ +-int kgdb_port = 1; +-#ifdef CONFIG_SIBYTE_SB1250_DUART +-extern char sb1250_duart_present[]; +-#endif +-#endif +- + static struct irq_chip sb1250_irq_type = { + .typename = "SB1250-IMR", + .startup = startup_sb1250_irq, +@@ -324,6 +315,11 @@ void __init arch_init_irq(void) + unsigned int imask = STATUSF_IP4 | STATUSF_IP3 | STATUSF_IP2 | + STATUSF_IP1 | STATUSF_IP0; + ++#ifdef CONFIG_KGDB ++ if (kgdb_early_setup) ++ return; ++#endif ++ + /* Default everything to IP2 */ + for (i = 0; i < SB1250_NR_IRQS; i++) { /* was I0 */ + __raw_writeq(IMR_IP2_VAL, +@@ -375,50 +371,6 @@ void __init arch_init_irq(void) + /* Enable necessary IPs, disable the rest */ + change_c0_status(ST0_IM, imask); + +-#ifdef CONFIG_KGDB +- if (kgdb_flag) { +- kgdb_irq = K_INT_UART_0 + kgdb_port; +- +-#ifdef CONFIG_SIBYTE_SB1250_DUART +- sb1250_duart_present[kgdb_port] = 0; +-#endif +- /* Setup uart 1 settings, mapper */ +- __raw_writeq(M_DUART_IMR_BRK, +- IOADDR(A_DUART_IMRREG(kgdb_port))); +- +- sb1250_steal_irq(kgdb_irq); +- __raw_writeq(IMR_IP6_VAL, +- IOADDR(A_IMR_REGISTER(0, +- R_IMR_INTERRUPT_MAP_BASE) + +- (kgdb_irq << 3))); +- sb1250_unmask_irq(0, kgdb_irq); +- } +-#endif +-} +- +-#ifdef CONFIG_KGDB +- +-#include +- +-#define duart_out(reg, val) csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +-#define duart_in(reg) csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +- +-static void sb1250_kgdb_interrupt(struct pt_regs *regs) +-{ +- /* +- * Clear break-change status (allow some time for the remote +- * host to stop the break, since we would see another +- * interrupt on the end-of-break too) +- */ +- kstat_this_cpu.irqs[kgdb_irq]++; +- mdelay(500); +- duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT | +- M_DUART_RX_EN | M_DUART_TX_EN); +- set_async_breakpoint(®s->cp0_epc); +-} +- +-#endif /* CONFIG_KGDB */ +- + static inline int dclz(unsigned long long x) + { + int lz; +@@ -473,7 +425,7 @@ asmlinkage void plat_irq_dispatch(struct + sb1250_mailbox_interrupt(regs); + #endif + +-#ifdef CONFIG_KGDB ++#ifdef CONFIG_KGDB_SIBYTE + else if (pending & CAUSEF_IP6) /* KGDB (uart 1) */ + sb1250_kgdb_interrupt(regs); + #endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/sb1250/kgdb_sibyte.c linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c +--- 
linux-2.6.18/arch/mips/sibyte/sb1250/kgdb_sibyte.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/mips/sibyte/sb1250/kgdb_sibyte.c 2008-06-10 16:19:28.000000000 +0400 +@@ -0,0 +1,164 @@ ++/* ++ * arch/mips/sibyte/sb1250/kgdb_sibyte.c ++ * ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * 2004 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++ ++/* ++ * Support for KGDB on the Broadcom Sibyte. The SWARM board ++ * for example does not have a 8250/16550 compatible serial ++ * port. Hence, we need to have a driver for the serial ++ * ports to handle KGDB. This board needs nothing in addition ++ * to what is normally provided by the gdb portion of the stub. ++ */ ++ ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++int kgdb_port = 1; ++static int kgdb_irq; ++ ++extern char sb1250_duart_present[]; ++extern int sb1250_steal_irq(int irq); ++ ++/* Forward declarations. */ ++static void kgdbsibyte_init_duart(void); ++static int kgdb_init_io(void); ++ ++#define IMR_IP6_VAL K_INT_MAP_I4 ++#define duart_out(reg, val) csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg))) ++#define duart_in(reg) csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg))) ++ ++static void kgdb_swarm_write_char(int c) ++{ ++ while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0) ; ++ duart_out(R_DUART_TX_HOLD, c); ++} ++ ++static int kgdb_swarm_read_char(void) ++{ ++ int ret_char; ++ unsigned int status; ++ ++ status = duart_in(R_DUART_STATUS); ++ while ((status & M_DUART_RX_RDY) == 0) { ++ status = duart_in(R_DUART_STATUS); ++ } ++ ++ /* ++ * Check for framing error ++ */ ++ if (status & M_DUART_FRM_ERR) { ++ kgdbsibyte_init_duart(); ++ kgdb_swarm_write_char('-'); ++ return '-'; ++ } ++ ++ ret_char = duart_in(R_DUART_RX_HOLD); ++ ++ return ret_char; ++} ++ ++void sb1250_kgdb_interrupt(struct pt_regs *regs) ++{ ++ int kgdb_irq = K_INT_UART_0 + kgdb_port; ++ /* ++ * Clear break-change status (allow some time for the remote ++ * host to stop the break, since we would see another ++ * interrupt on the end-of-break too) ++ */ ++ kstat_this_cpu.irqs[kgdb_irq]++; ++ mdelay(500); ++ duart_out(R_DUART_CMD, V_DUART_MISC_CMD_RESET_BREAK_INT | ++ M_DUART_RX_EN | M_DUART_TX_EN); ++ if (kgdb_io_ops.init != kgdb_init_io) { ++ /* Throw away the data if another I/O routine is ++ * active. ++ */ ++ unsigned int status; ++ ++ status = duart_in(R_DUART_STATUS); ++ while ((status & M_DUART_RX_RDY) == 0) { ++ status = duart_in(R_DUART_STATUS); ++ } ++ /* ++ * Check for framing error ++ */ ++ if (status & M_DUART_FRM_ERR) { ++ kgdbsibyte_init_duart(); ++ } ++ duart_in(R_DUART_RX_HOLD); ++ } else ++ breakpoint(); ++ ++} ++ ++/* ++ * We use port #1 and we set it for 115200 BAUD, 8n1. ++ */ ++static void kgdbsibyte_init_duart(void) ++{ ++ /* Set 8n1. */ ++ duart_out(R_DUART_MODE_REG_1, ++ V_DUART_BITS_PER_CHAR_8 | V_DUART_PARITY_MODE_NONE); ++ duart_out(R_DUART_MODE_REG_2, M_DUART_STOP_BIT_LEN_1); ++ /* Set baud rate of 115200. 
*/ ++ duart_out(R_DUART_CLK_SEL, V_DUART_BAUD_RATE(115200)); ++ /* Enable rx and tx */ ++ duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN); ++} ++ ++static int kgdb_init_io(void) ++{ ++#ifdef CONFIG_SIBYTE_SB1250_DUART ++ sb1250_duart_present[kgdb_port] = 0; ++#endif ++ ++ kgdbsibyte_init_duart(); ++ ++ return 0; ++} ++ ++/* ++ * Hookup our IRQ line. We will already have been initialized a ++ * this point. ++ */ ++static void __init kgdbsibyte_hookup_irq(void) ++{ ++ /* Steal the IRQ. */ ++ kgdb_irq = K_INT_UART_0 + kgdb_port; ++ ++ /* Setup uart 1 settings, mapper */ ++ __raw_writeq(M_DUART_IMR_BRK, IOADDR(A_DUART_IMRREG(kgdb_port))); ++ ++ sb1250_steal_irq(kgdb_irq); ++ ++ __raw_writeq(IMR_IP6_VAL, ++ IOADDR(A_IMR_REGISTER(0, R_IMR_INTERRUPT_MAP_BASE) + ++ (kgdb_irq << 3))); ++ ++ sb1250_unmask_irq(0, kgdb_irq); ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .read_char = kgdb_swarm_read_char, ++ .write_char = kgdb_swarm_write_char, ++ .init = kgdb_init_io, ++ .late_init = kgdbsibyte_hookup_irq, ++ .pre_exception = NULL, ++ .post_exception = NULL ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/swarm/Makefile linux-2.6.18.kgdb/arch/mips/sibyte/swarm/Makefile +--- linux-2.6.18/arch/mips/sibyte/swarm/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/sibyte/swarm/Makefile 2008-06-10 16:19:28.000000000 +0400 +@@ -1,3 +1 @@ + lib-y = setup.o rtc_xicor1241.o rtc_m41t81.o +- +-lib-$(CONFIG_KGDB) += dbg_io.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/sibyte/swarm/dbg_io.c linux-2.6.18.kgdb/arch/mips/sibyte/swarm/dbg_io.c +--- linux-2.6.18/arch/mips/sibyte/swarm/dbg_io.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/sibyte/swarm/dbg_io.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,76 +0,0 @@ +-/* +- * kgdb debug routines for SiByte boards. +- * +- * Copyright (C) 2001 MontaVista Software Inc. +- * Author: Jun Sun, jsun@mvista.com or jsun@junsun.net +- * +- * This program is free software; you can redistribute it and/or modify it +- * under the terms of the GNU General Public License as published by the +- * Free Software Foundation; either version 2 of the License, or (at your +- * option) any later version. +- * +- */ +- +-/* -------------------- BEGINNING OF CONFIG --------------------- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-/* +- * We use the second serial port for kgdb traffic. +- * 115200, 8, N, 1. 
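The dbg_io.c file deleted below exposed bare putDebugChar()/getDebugChar() entry points that the old stub called directly; kgdb_sibyte.c above replaces them with the table-driven struct kgdb_io interface used throughout this patch series. For orientation, a minimal polled backend in that style looks roughly like the sketch here; it is illustrative only, the uart_poll_* helpers are hypothetical stand-ins for real memory-mapped register access, and only the kgdb_io fields actually exercised above are filled in:

	#include <linux/kgdb.h>	/* struct kgdb_io, as introduced by this patch */

	/* Hypothetical polled UART accessors -- stand-ins for real
	 * memory-mapped register reads/writes. */
	extern int  uart_poll_rx_ready(void);
	extern int  uart_poll_rx_byte(void);
	extern void uart_poll_tx_byte(int c);

	static int my_read_char(void)
	{
		while (!uart_poll_rx_ready())
			;	/* busy-wait: kgdb runs with interrupts off */
		return uart_poll_rx_byte();
	}

	static void my_write_char(int c)
	{
		uart_poll_tx_byte(c);
	}

	static int my_init(void)
	{
		/* program the UART here: baud rate, 8n1, enable rx/tx */
		return 0;
	}

	/* The generic stub finds the backend through this well-known
	 * symbol, exactly as kgdb_sibyte.c does above. */
	struct kgdb_io kgdb_io_ops = {
		.read_char  = my_read_char,
		.write_char = my_write_char,
		.init       = my_init,
	};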
+- */ +- +-#define BAUD_RATE 115200 +-#define CLK_DIVISOR V_DUART_BAUD_RATE(BAUD_RATE) +-#define DATA_BITS V_DUART_BITS_PER_CHAR_8 /* or 7 */ +-#define PARITY V_DUART_PARITY_MODE_NONE /* or even */ +-#define STOP_BITS M_DUART_STOP_BIT_LEN_1 /* or 2 */ +- +-static int duart_initialized = 0; /* 0: need to be init'ed by kgdb */ +- +-/* -------------------- END OF CONFIG --------------------- */ +-extern int kgdb_port; +- +-#define duart_out(reg, val) csr_out32(val, IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +-#define duart_in(reg) csr_in32(IOADDR(A_DUART_CHANREG(kgdb_port,reg))) +- +-void putDebugChar(unsigned char c); +-unsigned char getDebugChar(void); +-static void +-duart_init(int clk_divisor, int data, int parity, int stop) +-{ +- duart_out(R_DUART_MODE_REG_1, data | parity); +- duart_out(R_DUART_MODE_REG_2, stop); +- duart_out(R_DUART_CLK_SEL, clk_divisor); +- +- duart_out(R_DUART_CMD, M_DUART_RX_EN | M_DUART_TX_EN); /* enable rx and tx */ +-} +- +-void +-putDebugChar(unsigned char c) +-{ +- if (!duart_initialized) { +- duart_initialized = 1; +- duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS); +- } +- while ((duart_in(R_DUART_STATUS) & M_DUART_TX_RDY) == 0); +- duart_out(R_DUART_TX_HOLD, c); +-} +- +-unsigned char +-getDebugChar(void) +-{ +- if (!duart_initialized) { +- duart_initialized = 1; +- duart_init(CLK_DIVISOR, DATA_BITS, PARITY, STOP_BITS); +- } +- while ((duart_in(R_DUART_STATUS) & M_DUART_RX_RDY) == 0) ; +- return duart_in(R_DUART_RX_HOLD); +-} +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/mips/tx4938/common/Makefile linux-2.6.18.kgdb/arch/mips/tx4938/common/Makefile +--- linux-2.6.18/arch/mips/tx4938/common/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/mips/tx4938/common/Makefile 2008-06-10 16:19:28.000000000 +0400 +@@ -7,5 +7,5 @@ + # + + obj-y += prom.o setup.o irq.o rtc_rx5c348.o +-obj-$(CONFIG_KGDB) += dbgio.o ++obj-$(CONFIG_KGDB_8250) += dbgio.o + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/Kconfig.debug linux-2.6.18.kgdb/arch/powerpc/Kconfig.debug +--- linux-2.6.18/arch/powerpc/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/powerpc/Kconfig.debug 2008-06-10 16:19:22.000000000 +0400 +@@ -18,52 +18,9 @@ config DEBUG_STACK_USAGE + + This option will slow down process creation somewhat. + +-config DEBUGGER +- bool "Enable debugger hooks" +- depends on DEBUG_KERNEL +- help +- Include in-kernel hooks for kernel debuggers. Unless you are +- intending to debug the kernel, say N here. +- +-config KGDB +- bool "Include kgdb kernel debugger" +- depends on DEBUGGER && (BROKEN || PPC_GEN550 || 4xx) +- select DEBUG_INFO +- help +- Include in-kernel hooks for kgdb, the Linux kernel source level +- debugger. See for more information. +- Unless you are intending to debug the kernel, say N here. +- +-choice +- prompt "Serial Port" +- depends on KGDB +- default KGDB_TTYS1 +- +-config KGDB_TTYS0 +- bool "ttyS0" +- +-config KGDB_TTYS1 +- bool "ttyS1" +- +-config KGDB_TTYS2 +- bool "ttyS2" +- +-config KGDB_TTYS3 +- bool "ttyS3" +- +-endchoice +- +-config KGDB_CONSOLE +- bool "Enable serial console thru kgdb port" +- depends on KGDB && 8xx || CPM2 +- help +- If you enable this, all serial console messages will be sent +- over the gdb stub. +- If unsure, say N. +- + config XMON + bool "Include xmon kernel debugger" +- depends on DEBUGGER && !PPC_ISERIES ++ depends on DEBUG_KERNEL && !PPC_ISERIES + help + Include in-kernel hooks for the xmon kernel monitor/debugger. 
+ Unless you are intending to debug the kernel, say N here. +@@ -82,6 +39,11 @@ config XMON_DEFAULT + xmon is normally disabled unless booted with 'xmon=on'. + Use 'xmon=off' to disable xmon init during runtime. + ++config DEBUGGER ++ bool ++ depends on KGDB || XMON ++ default y ++ + config IRQSTACKS + bool "Use separate kernel stacks when processing interrupts" + depends on PPC64 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/Makefile linux-2.6.18.kgdb/arch/powerpc/kernel/Makefile +--- linux-2.6.18/arch/powerpc/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/powerpc/kernel/Makefile 2008-06-10 16:19:22.000000000 +0400 +@@ -60,6 +60,7 @@ obj-$(CONFIG_BOOTX_TEXT) += btext.o + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_KPROBES) += kprobes.o + obj-$(CONFIG_PPC_UDBG_16550) += legacy_serial.o udbg_16550.o ++obj-$(CONFIG_KGDB) += kgdb.o + module-$(CONFIG_PPC64) += module_64.o + obj-$(CONFIG_MODULES) += $(module-y) + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/kgdb.c linux-2.6.18.kgdb/arch/powerpc/kernel/kgdb.c +--- linux-2.6.18/arch/powerpc/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/powerpc/kernel/kgdb.c 2008-06-10 16:19:22.000000000 +0400 +@@ -0,0 +1,568 @@ ++/* ++ * arch/powerpc/kernel/kgdb.c ++ * ++ * PowerPC backend to the KGDB stub. ++ * ++ * Maintainer: Tom Rini ++ * ++ * Copied from arch/ppc/kernel/kgdb.c, updated for ppc64 ++ * ++ * Copyright (C) 1996 Paul Mackerras (setjmp/longjmp) ++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu) ++ * Copyright (C) 2003 Timesys Corporation. ++ * Copyright (C) 2004-2006 MontaVista Software, Inc. ++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com) ++ * PPC32 support restored by Vitaly Wool and ++ * Sergei Shtylyov ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * This table contains the mapping between PowerPC hardware trap types, and ++ * signals, which are primarily what GDB understands. GDB and the kernel ++ * don't always agree on values, so we use constants taken from gdb-6.2. 
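One point worth keeping in mind when reading the trap table that follows: the signal numbers are spelled as hex literals (0x05 rather than SIGTRAP) precisely because gdb's signal numbering does not match the kernel's for every signal. The lookup itself is a linear scan of the table; the net effect on exception entry, as kgdb_debugger() further down shows, is simply this fragment:

	/* Fragment: a program check (trap type 0x0700) maps to 0x05,
	 * gdb's SIGTRAP, so gdb presents the stop as a breakpoint. */
	kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs);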
++ */ ++static struct hard_trap_info ++{ ++ unsigned int tt; /* Trap type code for powerpc */ ++ unsigned char signo; /* Signal that we map this trap into */ ++} hard_trap_info[] = { ++ { 0x0100, 0x02 /* SIGINT */ }, /* system reset */ ++ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */ ++ { 0x0300, 0x0b /* SIGSEGV */ }, /* data access */ ++ { 0x0400, 0x0b /* SIGSEGV */ }, /* instruction access */ ++ { 0x0500, 0x02 /* SIGINT */ }, /* external interrupt */ ++ { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */ ++ { 0x0700, 0x05 /* SIGTRAP */ }, /* program check */ ++ { 0x0800, 0x08 /* SIGFPE */ }, /* fp unavailable */ ++ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */ ++ { 0x0c00, 0x14 /* SIGCHLD */ }, /* system call */ ++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) ++ { 0x2002, 0x05 /* SIGTRAP */ }, /* debug */ ++#if defined(CONFIG_FSL_BOOKE) ++ { 0x2010, 0x08 /* SIGFPE */ }, /* spe unavailable */ ++ { 0x2020, 0x08 /* SIGFPE */ }, /* spe unavailable */ ++ { 0x2030, 0x08 /* SIGFPE */ }, /* spe fp data */ ++ { 0x2040, 0x08 /* SIGFPE */ }, /* spe fp data */ ++ { 0x2050, 0x08 /* SIGFPE */ }, /* spe fp round */ ++ { 0x2060, 0x0e /* SIGILL */ }, /* performace monitor */ ++ { 0x2900, 0x08 /* SIGFPE */ }, /* apu unavailable */ ++ { 0x3100, 0x0e /* SIGALRM */ }, /* fixed interval timer */ ++ { 0x3200, 0x02 /* SIGINT */ }, /* watchdog */ ++#else ++ { 0x1000, 0x0e /* SIGALRM */ }, /* programmable interval timer */ ++ { 0x1010, 0x0e /* SIGALRM */ }, /* fixed interval timer */ ++ { 0x1020, 0x02 /* SIGINT */ }, /* watchdog */ ++ { 0x2010, 0x08 /* SIGFPE */ }, /* fp unavailable */ ++ { 0x2020, 0x08 /* SIGFPE */ }, /* ap unavailable */ ++#endif ++#else ++ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step */ ++#if defined(CONFIG_8xx) ++ { 0x1000, 0x04 /* SIGILL */ }, /* software emulation */ ++#else ++ { 0x0f00, 0x04 /* SIGILL */ }, /* performance monitor */ ++ { 0x0f20, 0x08 /* SIGFPE */ }, /* altivec unavailable */ ++ { 0x1300, 0x05 /* SIGTRAP */ }, /* instruction address break */ ++#if defined(CONFIG_PPC64) ++ { 0x1200, 0x05 /* SIGILL */ }, /* system error */ ++ { 0x1500, 0x04 /* SIGILL */ }, /* soft patch */ ++ { 0x1600, 0x04 /* SIGILL */ }, /* maintenance */ ++ { 0x1700, 0x08 /* SIGFPE */ }, /* altivec assist */ ++ { 0x1800, 0x04 /* SIGILL */ }, /* thermal */ ++#else ++ { 0x1400, 0x02 /* SIGINT */ }, /* SMI */ ++ { 0x1600, 0x08 /* SIGFPE */ }, /* altivec assist */ ++ { 0x1700, 0x04 /* SIGILL */ }, /* TAU */ ++ { 0x2000, 0x05 /* SIGTRAP */ }, /* run mode */ ++#endif ++#endif ++#endif ++ { 0x0000, 0x00 } /* Must be last */ ++}; ++ ++extern atomic_t cpu_doing_single_step; ++ ++static int computeSignal(unsigned int tt) ++{ ++ struct hard_trap_info *ht; ++ ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ if (ht->tt == tt) ++ return ht->signo; ++ ++ return SIGHUP; /* default for things we don't know about */ ++} ++ ++static int kgdb_call_nmi_hook(struct pt_regs *regs) ++{ ++ kgdb_nmihook(smp_processor_id(), regs); ++ return 0; ++} ++ ++#ifdef CONFIG_SMP ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ smp_send_debugger_break(MSG_ALL_BUT_SELF); ++} ++#endif ++ ++/* KGDB functions to use existing PowerPC64 hooks. 
*/ ++static int kgdb_debugger(struct pt_regs *regs) ++{ ++ return kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++} ++ ++static int kgdb_breakpoint(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, SIGTRAP, 0, regs); ++ ++ if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) ++ regs->nip += 4; ++ ++ return 1; ++} ++ ++static int kgdb_singlestep(struct pt_regs *regs) ++{ ++ struct thread_info *thread_info, *exception_thread_info; ++ if (user_mode(regs)) ++ return 0; ++ /* ++ * On Book E and perhaps other processsors, singlestep is handled on ++ * the critical exception stack. This causes current_thread_info() ++ * to fail, since it it locates the thread_info by masking off ++ * the low bits of the current stack pointer. We work around ++ * this issue by copying the thread_info from the kernel stack ++ * before calling kgdb_handle_exception, and copying it back ++ * afterwards. On most processors the copy is avoided since ++ * exception_thread_info == thread_info. ++ */ ++ thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); ++ exception_thread_info = current_thread_info(); ++ ++ if (thread_info != exception_thread_info) ++ memcpy(exception_thread_info, thread_info, sizeof *thread_info); ++ ++ kgdb_handle_exception(0, SIGTRAP, 0, regs); ++ ++ if (thread_info != exception_thread_info) ++ memcpy(thread_info, exception_thread_info, sizeof *thread_info); ++ ++ return 1; ++} ++ ++int kgdb_iabr_match(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++ return 1; ++} ++ ++int kgdb_dabr_match(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++ return 1; ++} ++ ++#define PACK64(ptr,src) do { *(ptr++) = (src); } while(0) ++ ++#define PACK32(ptr,src) do { \ ++ u32 *ptr32; \ ++ ptr32 = (u32 *)ptr; \ ++ *(ptr32++) = (src); \ ++ ptr = (unsigned long *)ptr32; \ ++ } while(0) ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ unsigned long *ptr = gdb_regs; ++ int reg; ++ ++ memset(gdb_regs, 0, NUMREGBYTES); ++ ++ for (reg = 0; reg < 32; reg++) ++ PACK64(ptr, regs->gpr[reg]); ++ ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ for (reg = 0; reg < 32; reg++) ++ PACK64(ptr, current->thread.evr[reg]); ++#else ++ ptr += 32; ++#endif ++#else ++ /* fp registers not used by kernel, leave zero */ ++ ptr += 32 * 8 / sizeof(long); ++#endif ++ ++ PACK64(ptr, regs->nip); ++ PACK64(ptr, regs->msr); ++ PACK32(ptr, regs->ccr); ++ PACK64(ptr, regs->link); ++ PACK64(ptr, regs->ctr); ++ PACK32(ptr, regs->xer); ++ ++#if 0 ++ Following are in struct thread_struct, not struct pt_regs, ++ ignoring for now since kernel does not use them. Would it ++ make sense to get them from the thread that kgdb is set to? ++ ++ If this code is enabled, update the definition of NUMREGBYTES to ++ include the vector registers and vector state registers. 
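The packing above is easy to misread: gdb's ppc64 register file is one flat byte buffer in which 64-bit registers (gpr0-31, nip, msr, link, ctr) sit next to 32-bit ones (ccr, xer), so PACK32 deliberately advances the cursor by only four bytes while PACK64 consumes a full unsigned long slot. A freestanding demonstration of the same cursor arithmetic, using the macros as defined in the patch (illustrative only, and assuming a 64-bit unsigned long as on ppc64):

	#include <stdio.h>
	#include <string.h>

	#define PACK64(ptr, src) do { *(ptr++) = (src); } while (0)
	#define PACK32(ptr, src) do {				\
			unsigned int *p32 = (unsigned int *)ptr;	\
			*(p32++) = (src);			\
			ptr = (unsigned long *)p32;		\
		} while (0)

	int main(void)
	{
		unsigned long buf[4];
		unsigned long *ptr = buf;

		memset(buf, 0, sizeof(buf));
		PACK64(ptr, 0x1111111122222222UL);  /* one full 8-byte slot */
		PACK32(ptr, 0x33333333U);           /* half a slot ...      */
		PACK32(ptr, 0x44444444U);           /* ... completed here   */

		/* 8 + 4 + 4 = 16 bytes consumed */
		printf("%lu bytes packed\n",
		       (unsigned long)((char *)ptr - (char *)buf));
		return 0;
	}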
++ ++ PACK32(ptr, current->thread->fpscr); ++ ++ /* vr registers not used by kernel, leave zero */ ++ ptr += 32 * 16 / sizeof(long); ++ ++#ifdef CONFIG_ALTIVEC ++ PACK32(ptr, current->thread->vscr); ++ PACK32(ptr, current->thread->vrsave); ++#else ++ ptr += 2 * 4 / sizeof(long); ++#endif ++#else ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ PACK32(ptr, current->thread.acc >> 32); ++ PACK32(ptr, current->thread.acc & 0xffffffff); ++ PACK64(ptr, current->thread.spefscr); ++#else ++ ptr += 2 + 1; ++#endif ++#else ++ /* fpscr not used by kernel, leave zero */ ++ PACK32(ptr, 0); ++#endif ++#endif ++ ++ BUG_ON((unsigned long)ptr > ++ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES)); ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + ++ STACK_FRAME_OVERHEAD); ++ unsigned long *ptr = gdb_regs; ++ int reg; ++ ++ memset(gdb_regs, 0, NUMREGBYTES); ++ ++ /* Regs GPR0-2 */ ++ for (reg = 0; reg < 3; reg++) ++ PACK64(ptr, regs->gpr[reg]); ++ ++ /* Regs GPR3-13 are caller saved, not in regs->gpr[] */ ++ ptr += 11; ++ ++ /* Regs GPR14-31 */ ++ for (reg = 14; reg < 32; reg++) ++ PACK64(ptr, regs->gpr[reg]); ++ ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ for (reg = 0; reg < 32; reg++) ++ PACK64(ptr, p->thread.evr[reg]); ++#else ++ ptr += 32; ++#endif ++#else ++ /* fp registers not used by kernel, leave zero */ ++ ptr += 32 * 8 / sizeof(long); ++#endif ++ PACK64(ptr, regs->nip); ++ PACK64(ptr, regs->msr); ++ PACK32(ptr, regs->ccr); ++ PACK64(ptr, regs->link); ++ PACK64(ptr, regs->ctr); ++ PACK32(ptr, regs->xer); ++ ++#if 0 ++ Following are in struct thread_struct, not struct pt_regs, ++ ignoring for now since kernel does not use them. Would it ++ make sense to get them from the thread that kgdb is set to? ++ ++ If this code is enabled, update the definition of NUMREGBYTES to ++ include the vector registers and vector state registers. 
++ ++ PACK32(ptr, p->thread->fpscr); ++ ++ /* vr registers not used by kernel, leave zero */ ++ ptr += 32 * 16 / sizeof(long); ++ ++#ifdef CONFIG_ALTIVEC ++ PACK32(ptr, p->thread->vscr); ++ PACK32(ptr, p->thread->vrsave); ++#else ++ ptr += 2 * 4 / sizeof(long); ++#endif ++#else ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ PACK32(ptr, p->thread.acc >> 32); ++ PACK32(ptr, p->thread.acc & 0xffffffff); ++ PACK64(ptr, p->thread.spefscr); ++#else ++ ptr += 2 + 1; ++#endif ++#else ++ /* fpscr not used by kernel, leave zero */ ++ PACK32(ptr, 0); ++#endif ++#endif ++ ++ BUG_ON((unsigned long)ptr > ++ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES)); ++} ++ ++#define UNPACK64(dest,ptr) do { dest = *(ptr++); } while(0) ++ ++#define UNPACK32(dest,ptr) do { \ ++ u32 *ptr32; \ ++ ptr32 = (u32 *)ptr; \ ++ dest = *(ptr32++); \ ++ ptr = (unsigned long *)ptr32; \ ++ } while(0) ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ unsigned long *ptr = gdb_regs; ++ int reg; ++ ++#ifdef CONFIG_SPE ++ union { ++ u32 v32[2]; ++ u64 v64; ++ } acc; ++#endif ++ for (reg = 0; reg < 32; reg++) ++ UNPACK64(regs->gpr[reg], ptr); ++ ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ for (reg = 0; reg < 32; reg++) ++ UNPACK64(current->thread.evr[reg], ptr); ++#else ++ ptr += 32; ++#endif ++#else ++ /* fp registers not used by kernel, leave zero */ ++ ptr += 32 * 8 / sizeof(int); ++#endif ++ UNPACK64(regs->nip, ptr); ++ UNPACK64(regs->msr, ptr); ++ UNPACK32(regs->ccr, ptr); ++ UNPACK64(regs->link, ptr); ++ UNPACK64(regs->ctr, ptr); ++ UNPACK32(regs->xer, ptr); ++ ++#if 0 ++ Following are in struct thread_struct, not struct pt_regs, ++ ignoring for now since kernel does not use them. Would it ++ make sense to get them from the thread that kgdb is set to? ++ ++ If this code is enabled, update the definition of NUMREGBYTES to ++ include the vector registers and vector state registers. ++ ++ /* fpscr, vscr, vrsave not used by kernel, leave unchanged */ ++ ++ UNPACK32(current->thread->fpscr, ptr); ++ ++ /* vr registers not used by kernel, leave zero */ ++ ptr += 32 * 16 / sizeof(long); ++ ++ #ifdef CONFIG_ALTIVEC ++ UNPACK32(current->thread->vscr, ptr); ++ UNPACK32(current->thread->vrsave, ptr); ++#else ++ ptr += 2 * 4 / sizeof(long); ++#endif ++#else ++#ifdef CONFIG_FSL_BOOKE ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ UNPACK32(acc.v32[0], ptr); ++ UNPACK32(acc.v32[1], ptr); ++ current->thread.acc = acc.v64; ++ UNPACK64(current->thread.spefscr, ptr); ++#else ++ ptr += 2 + 1; ++#endif ++#endif ++#endif ++ ++ BUG_ON((unsigned long)ptr > ++ (unsigned long)(((void *)gdb_regs) + NUMREGBYTES)); ++} ++ ++/* ++ * This function does PowerPC specific procesing for interfacing to gdb. ++ */ ++int kgdb_arch_handle_exception(int vector, int signo, int err_code, ++ char *remcom_in_buffer, char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ char *ptr = &remcom_in_buffer[1]; ++ unsigned long addr; ++ ++ switch (remcom_in_buffer[0]) { ++ /* ++ * sAA..AA Step one instruction from AA..AA ++ * This will return an error to gdb .. 
++ */ ++ case 's': ++ case 'c': ++ /* handle the optional parameter */ ++ if (kgdb_hex2long(&ptr, &addr)) ++ linux_regs->nip = addr; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ /* set the trace bit if we're stepping */ ++ if (remcom_in_buffer[0] == 's') { ++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) ++ mtspr(SPRN_DBCR0, ++ mfspr(SPRN_DBCR0) | DBCR0_IC | DBCR0_IDM); ++ linux_regs->msr |= MSR_DE; ++#else ++ linux_regs->msr |= MSR_SE; ++#endif ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ } ++ return 0; ++ } ++ ++ return -1; ++} ++ ++int kgdb_fault_setjmp(unsigned long *curr_context) ++{ ++#ifdef CONFIG_PPC32 ++ __asm__ __volatile__("mflr 0; stw 0,0(%0);\n\ ++ stw 1,4(%0); stw 2,8(%0);\n\ ++ mfcr 0; stw 0,12(%0);\n\ ++ stmw 13,16(%0)\n" : : "r" (curr_context)); ++#else ++ __asm__ __volatile__("mflr 0; std 0,0(%0)\n\ ++ std 1,8(%0)\n\ ++ std 2,16(%0)\n\ ++ mfcr 0; std 0,24(%0)\n\ ++ std 13,32(%0)\n\ ++ std 14,40(%0)\n\ ++ std 15,48(%0)\n\ ++ std 16,56(%0)\n\ ++ std 17,64(%0)\n\ ++ std 18,72(%0)\n\ ++ std 19,80(%0)\n\ ++ std 20,88(%0)\n\ ++ std 21,96(%0)\n\ ++ std 22,104(%0)\n\ ++ std 23,112(%0)\n\ ++ std 24,120(%0)\n\ ++ std 25,128(%0)\n\ ++ std 26,136(%0)\n\ ++ std 27,144(%0)\n\ ++ std 28,152(%0)\n\ ++ std 29,160(%0)\n\ ++ std 30,168(%0)\n\ ++ std 31,176(%0)\n" : : "r" (curr_context)); ++#endif ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++#ifdef CONFIG_PPC32 ++ __asm__ __volatile__("lmw 13,16(%0);\n\ ++ lwz 0,12(%0); mtcrf 0x38,0;\n\ ++ lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);\n\ ++ mtlr 0; mr 3,1\n" : : "r" (curr_context)); ++#else ++ __asm__ __volatile__("ld 13,32(%0)\n\ ++ ld 14,40(%0)\n\ ++ ld 15,48(%0)\n\ ++ ld 16,56(%0)\n\ ++ ld 17,64(%0)\n\ ++ ld 18,72(%0)\n\ ++ ld 19,80(%0)\n\ ++ ld 20,88(%0)\n\ ++ ld 21,96(%0)\n\ ++ ld 22,104(%0)\n\ ++ ld 23,112(%0)\n\ ++ ld 24,120(%0)\n\ ++ ld 25,128(%0)\n\ ++ ld 26,136(%0)\n\ ++ ld 27,144(%0)\n\ ++ ld 28,152(%0)\n\ ++ ld 29,160(%0)\n\ ++ ld 30,168(%0)\n\ ++ ld 31,176(%0)\n\ ++ ld 0,24(%0)\n\ ++ mtcrf 0x38,0\n\ ++ ld 0,0(%0)\n\ ++ ld 1,8(%0)\n\ ++ ld 2,16(%0)\n\ ++ mtlr 0\n\ ++ mr 3,1\n" : : "r" (curr_context)); ++#endif ++} ++ ++/* ++ * Global data ++ */ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, ++}; ++ ++int kgdb_not_implemented(struct pt_regs *regs) ++{ ++ return 0; ++} ++ ++int kgdb_arch_init(void) ++{ ++#ifdef CONFIG_XMON ++#error Both XMON and KGDB selected in .config. Unselect one of them. 
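The kgdb_fault_setjmp()/kgdb_fault_longjmp() pair above exists so the stub can survive gdb asking it to read unmapped memory: the hunks added to bad_page_fault() and fixup_exception() elsewhere in this patch call kgdb_fault_longjmp() whenever kgdb_may_fault is set while the debugger is active. A sketch of the guarded-access pattern on the generic-stub side; kgdb_may_fault and kgdb_fault_jmp_regs are the stub-owned globals those hunks reference, while the probe_mem helper name is made up for illustration:

	/* Sketch of the guarded-access pattern (generic-stub side). */
	extern int kgdb_may_fault;
	extern unsigned long kgdb_fault_jmp_regs[];

	static int probe_mem(char *dst, const char *src, int len)
	{
		if (kgdb_fault_setjmp(kgdb_fault_jmp_regs) != 0) {
			/* a fault occurred; the longjmp landed us here */
			kgdb_may_fault = 0;
			return -1;
		}

		kgdb_may_fault = 1;	/* arm the fault-handler hook */
		while (len--)
			*dst++ = *src++;	/* may fault */
		kgdb_may_fault = 0;

		return 0;
	}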
++#endif ++ ++ __debugger_ipi = kgdb_call_nmi_hook; ++ __debugger = kgdb_debugger; ++ __debugger_bpt = kgdb_breakpoint; ++ __debugger_sstep = kgdb_singlestep; ++ __debugger_iabr_match = kgdb_iabr_match; ++ __debugger_dabr_match = kgdb_dabr_match; ++ __debugger_fault_handler = kgdb_not_implemented; ++ ++ return 0; ++} ++ ++arch_initcall(kgdb_arch_init); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/legacy_serial.c linux-2.6.18.kgdb/arch/powerpc/kernel/legacy_serial.c +--- linux-2.6.18/arch/powerpc/kernel/legacy_serial.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/powerpc/kernel/legacy_serial.c 2008-06-10 16:19:22.000000000 +0400 +@@ -11,6 +11,9 @@ + #include + #include + #include ++#ifdef CONFIG_KGDB_8250 ++#include ++#endif + + #undef DEBUG + +@@ -470,6 +473,9 @@ static int __init serial_dev_init(void) + fixup_port_pio(i, np, port); + if ((port->iotype == UPIO_MEM) || (port->iotype == UPIO_TSI)) + fixup_port_mmio(i, np, port); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_platform_port(i, port); ++#endif + } + + DBG("Registering platform serial ports\n"); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/kernel/setup_32.c linux-2.6.18.kgdb/arch/powerpc/kernel/setup_32.c +--- linux-2.6.18/arch/powerpc/kernel/setup_32.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/powerpc/kernel/setup_32.c 2008-06-10 16:19:22.000000000 +0400 +@@ -45,10 +45,6 @@ + + #define DBG(fmt...) + +-#if defined CONFIG_KGDB +-#include +-#endif +- + extern void bootx_init(unsigned long r4, unsigned long phys); + + struct ide_machdep_calls ppc_ide_md; +@@ -248,18 +244,6 @@ void __init setup_arch(char **cmdline_p) + /* Register early console */ + register_early_udbg_console(); + +-#if defined(CONFIG_KGDB) +- if (ppc_md.kgdb_map_scc) +- ppc_md.kgdb_map_scc(); +- set_debug_traps(); +- if (strstr(cmd_line, "gdb")) { +- if (ppc_md.progress) +- ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000); +- printk("kgdb breakpoint activated\n"); +- breakpoint(); +- } +-#endif +- + /* + * Set cache line size based on type of cpu as a default. + * Systems with OF can look in the properties on the cpu node(s) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/mm/fault.c linux-2.6.18.kgdb/arch/powerpc/mm/fault.c +--- linux-2.6.18/arch/powerpc/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/powerpc/mm/fault.c 2008-06-10 16:19:22.000000000 +0400 +@@ -28,6 +28,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -424,6 +425,13 @@ void bad_page_fault(struct pt_regs *regs + return; + } + ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. 
*/ ++#endif ++ + /* kernel has accessed a bad area */ + + printk(KERN_ALERT "Unable to handle kernel paging request for "); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/powerpc/platforms/powermac/setup.c linux-2.6.18.kgdb/arch/powerpc/platforms/powermac/setup.c +--- linux-2.6.18/arch/powerpc/platforms/powermac/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/powerpc/platforms/powermac/setup.c 2008-06-10 16:19:22.000000000 +0400 +@@ -98,8 +98,6 @@ extern struct machdep_calls pmac_md; + int sccdbg; + #endif + +-extern void zs_kgdb_hook(int tty_num); +- + sys_ctrler_t sys_ctrler = SYS_CTRLER_UNKNOWN; + EXPORT_SYMBOL(sys_ctrler); + +@@ -319,10 +317,6 @@ static void __init pmac_setup_arch(void) + l2cr_init(); + #endif /* CONFIG_PPC32 */ + +-#ifdef CONFIG_KGDB +- zs_kgdb_hook(0); +-#endif +- + find_via_cuda(); + find_via_pmu(); + smu_init(); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/Kconfig.debug linux-2.6.18.kgdb/arch/ppc/Kconfig.debug +--- linux-2.6.18/arch/ppc/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/Kconfig.debug 2008-06-10 16:19:22.000000000 +0400 +@@ -2,42 +2,6 @@ menu "Kernel hacking" + + source "lib/Kconfig.debug" + +-config KGDB +- bool "Include kgdb kernel debugger" +- depends on DEBUG_KERNEL && (BROKEN || PPC_GEN550 || 4xx) +- select DEBUG_INFO +- help +- Include in-kernel hooks for kgdb, the Linux kernel source level +- debugger. See for more information. +- Unless you are intending to debug the kernel, say N here. +- +-choice +- prompt "Serial Port" +- depends on KGDB +- default KGDB_TTYS1 +- +-config KGDB_TTYS0 +- bool "ttyS0" +- +-config KGDB_TTYS1 +- bool "ttyS1" +- +-config KGDB_TTYS2 +- bool "ttyS2" +- +-config KGDB_TTYS3 +- bool "ttyS3" +- +-endchoice +- +-config KGDB_CONSOLE +- bool "Enable serial console thru kgdb port" +- depends on KGDB && 8xx || CPM2 +- help +- If you enable this, all serial console messages will be sent +- over the gdb stub. +- If unsure, say N. +- + config XMON + bool "Include xmon kernel debugger" + depends on DEBUG_KERNEL +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/kernel/kgdb.c linux-2.6.18.kgdb/arch/ppc/kernel/kgdb.c +--- linux-2.6.18/arch/ppc/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/ppc/kernel/kgdb.c 2008-06-10 16:20:19.000000000 +0400 +@@ -0,0 +1,350 @@ ++/* ++ * arch/ppc/kernel/kgdb.c ++ * ++ * PowerPC backend to the KGDB stub. ++ * ++ * Maintainer: Tom Rini ++ * ++ * 1998 (c) Michael AK Tesch (tesch@cs.wisc.edu) ++ * Copyright (C) 2003 Timesys Corporation. ++ * 2004 (c) MontaVista Software, Inc. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* ++ * This table contains the mapping between PowerPC hardware trap types, and ++ * signals, which are primarily what GDB understands. GDB and the kernel ++ * don't always agree on values, so we use constants taken from gdb-6.2. 
++ */ ++static struct hard_trap_info ++{ ++ unsigned int tt; /* Trap type code for powerpc */ ++ unsigned char signo; /* Signal that we map this trap into */ ++} hard_trap_info[] = { ++#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) ++ { 0x0100, 0x02 /* SIGINT */ }, /* critical input interrupt */ ++ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */ ++ { 0x0300, 0x0b /* SIGSEGV */ }, /* data storage */ ++ { 0x0400, 0x0a /* SIGBUS */ }, /* instruction storage */ ++ { 0x0500, 0x02 /* SIGINT */ }, /* interrupt */ ++ { 0x0600, 0x0a /* SIGBUS */ }, /* alignment */ ++ { 0x0700, 0x04 /* SIGILL */ }, /* program */ ++ { 0x0800, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0900, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0a00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0b00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0c00, 0x14 /* SIGCHLD */ }, /* syscall */ ++ { 0x0d00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0e00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0f00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x2002, 0x05 /* SIGTRAP */}, /* debug */ ++#else ++ { 0x0200, 0x0b /* SIGSEGV */ }, /* machine check */ ++ { 0x0300, 0x0b /* SIGSEGV */ }, /* address error (store) */ ++ { 0x0400, 0x0a /* SIGBUS */ }, /* instruction bus error */ ++ { 0x0500, 0x02 /* SIGINT */ }, /* interrupt */ ++ { 0x0600, 0x0a /* SIGBUS */ }, /* alingment */ ++ { 0x0700, 0x05 /* SIGTRAP */ }, /* breakpoint trap */ ++ { 0x0800, 0x08 /* SIGFPE */}, /* fpu unavail */ ++ { 0x0900, 0x0e /* SIGALRM */ }, /* decrementer */ ++ { 0x0a00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0b00, 0x04 /* SIGILL */ }, /* reserved */ ++ { 0x0c00, 0x14 /* SIGCHLD */ }, /* syscall */ ++ { 0x0d00, 0x05 /* SIGTRAP */ }, /* single-step/watch */ ++ { 0x0e00, 0x08 /* SIGFPE */ }, /* fp assist */ ++#endif ++ { 0x0000, 0x000 } /* Must be last */ ++}; ++ ++extern atomic_t cpu_doing_single_step; ++ ++static int computeSignal(unsigned int tt) ++{ ++ struct hard_trap_info *ht; ++ ++ for (ht = hard_trap_info; ht->tt && ht->signo; ht++) ++ if (ht->tt == tt) ++ return ht->signo; ++ ++ return SIGHUP; /* default for things we don't know about */ ++} ++ ++/* KGDB functions to use existing PowerPC hooks. */ ++static void kgdb_debugger(struct pt_regs *regs) ++{ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++} ++ ++static int kgdb_breakpoint(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, SIGTRAP, 0, regs); ++ ++ if (*(u32 *) (regs->nip) == *(u32 *) (&arch_kgdb_ops.gdb_bpt_instr)) ++ regs->nip += 4; ++ ++ return 1; ++} ++ ++static int kgdb_singlestep(struct pt_regs *regs) ++{ ++ struct thread_info *thread_info, *exception_thread_info; ++ ++ if (user_mode(regs)) ++ return 0; ++ /* ++ * On Book E and perhaps other processsors, singlestep is handled on ++ * the critical exception stack. This causes current_thread_info() ++ * to fail, since it it locates the thread_info by masking off ++ * the low bits of the current stack pointer. We work around ++ * this issue by copying the thread_info from the kernel stack ++ * before calling kgdb_handle_exception, and copying it back ++ * afterwards. On most processors the copy is avoided since ++ * exception_thread_info == thread_info. 
++ */ ++ thread_info = (struct thread_info *)(regs->gpr[1] & ~(THREAD_SIZE-1)); ++ exception_thread_info = current_thread_info(); ++ ++ if (thread_info != exception_thread_info) ++ memcpy(exception_thread_info, thread_info, sizeof *thread_info); ++ ++ kgdb_handle_exception(0, SIGTRAP, 0, regs); ++ ++ if (thread_info != exception_thread_info) ++ memcpy(thread_info, exception_thread_info, sizeof *thread_info); ++ ++ return 1; ++} ++ ++int kgdb_iabr_match(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++ return 1; ++} ++ ++int kgdb_dabr_match(struct pt_regs *regs) ++{ ++ if (user_mode(regs)) ++ return 0; ++ ++ kgdb_handle_exception(0, computeSignal(TRAP(regs)), 0, regs); ++ return 1; ++} ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ int reg; ++ unsigned long *ptr = gdb_regs; ++ ++ memset(gdb_regs, 0, MAXREG * 4); ++ ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = regs->gpr[reg]; ++ ++#ifndef CONFIG_E500 ++ for (reg = 0; reg < 64; reg++) ++ *(ptr++) = 0; ++#else ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = current->thread.evr[reg]; ++#endif ++ ++ *(ptr++) = regs->nip; ++ *(ptr++) = regs->msr; ++ *(ptr++) = regs->ccr; ++ *(ptr++) = regs->link; ++ *(ptr++) = regs->ctr; ++ *(ptr++) = regs->xer; ++ ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ *(ptr++) = (current->thread.acc >> 32); ++ *(ptr++) = (current->thread.acc & 0xffffffff); ++ *(ptr++) = current->thread.spefscr; ++#endif ++} ++ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ struct pt_regs *regs = (struct pt_regs *)(p->thread.ksp + ++ STACK_FRAME_OVERHEAD); ++ int reg; ++ unsigned long *ptr = gdb_regs; ++ ++ memset(gdb_regs, 0, MAXREG * 4); ++ ++ /* Regs GPR0-2 */ ++ for (reg = 0; reg < 3; reg++) ++ *(ptr++) = regs->gpr[reg]; ++ ++ /* Regs GPR3-13 are not saved */ ++ for (reg = 3; reg < 14; reg++) ++ *(ptr++) = 0; ++ ++ /* Regs GPR14-31 */ ++ for (reg = 14; reg < 32; reg++) ++ *(ptr++) = regs->gpr[reg]; ++ ++#ifndef CONFIG_E500 ++ for (reg = 0; reg < 64; reg++) ++ *(ptr++) = 0; ++#else ++ for (reg = 0; reg < 32; reg++) ++ *(ptr++) = current->thread.evr[reg]; ++#endif ++ ++ *(ptr++) = regs->nip; ++ *(ptr++) = regs->msr; ++ *(ptr++) = regs->ccr; ++ *(ptr++) = regs->link; ++ *(ptr++) = regs->ctr; ++ *(ptr++) = regs->xer; ++ ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ *(ptr++) = (current->thread.acc >> 32); ++ *(ptr++) = (current->thread.acc & 0xffffffff); ++ *(ptr++) = current->thread.spefscr; ++#endif ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ int reg; ++ unsigned long *ptr = gdb_regs; ++#ifdef CONFIG_SPE ++ union { ++ u32 v32[2]; ++ u64 v64; ++ } u; ++#endif ++ ++ for (reg = 0; reg < 32; reg++) ++ regs->gpr[reg] = *(ptr++); ++ ++#ifndef CONFIG_E500 ++ for (reg = 0; reg < 64; reg++) ++ ptr++; ++#else ++ for (reg = 0; reg < 32; reg++) ++ current->thread.evr[reg] = *(ptr++); ++#endif ++ ++ regs->nip = *(ptr++); ++ regs->msr = *(ptr++); ++ regs->ccr = *(ptr++); ++ regs->link = *(ptr++); ++ regs->ctr = *(ptr++); ++ regs->xer = *(ptr++); ++ ++#ifdef CONFIG_SPE ++ /* u64 acc */ ++ u.v32[0] = *(ptr++); ++ u.v32[1] = *(ptr++); ++ current->thread.acc = u.v64; ++ current->thread.spefscr = *(ptr++); ++#endif ++} ++ ++/* ++ * Save/restore state in case a memory access causes a fault. 
++ */ ++int kgdb_fault_setjmp(unsigned long *curr_context) ++{ ++ __asm__ __volatile__("mflr 0; stw 0,0(%0);" ++ "stw 1,4(%0); stw 2,8(%0);" ++ "mfcr 0; stw 0,12(%0);" ++ "stmw 13,16(%0)"::"r"(curr_context)); ++ return 0; ++} ++ ++void kgdb_fault_longjmp(unsigned long *curr_context) ++{ ++ __asm__ __volatile__("lmw 13,16(%0);" ++ "lwz 0,12(%0); mtcrf 0x38,0;" ++ "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);" ++ "mtlr 0; mr 3,1"::"r"(curr_context)); ++} ++ ++/* ++ * This function does PoerPC specific procesing for interfacing to gdb. ++ */ ++int kgdb_arch_handle_exception(int vector, int signo, int err_code, ++ char *remcom_in_buffer, char *remcom_out_buffer, ++ struct pt_regs *linux_regs) ++{ ++ char *ptr = &remcom_in_buffer[1]; ++ unsigned long addr; ++ ++ switch (remcom_in_buffer[0]) ++ { ++ /* ++ * sAA..AA Step one instruction from AA..AA ++ * This will return an error to gdb .. ++ */ ++ case 's': ++ case 'c': ++ /* handle the optional parameter */ ++ if (kgdb_hex2long (&ptr, &addr)) ++ linux_regs->nip = addr; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ /* set the trace bit if we're stepping */ ++ if (remcom_in_buffer[0] == 's') { ++#if defined (CONFIG_40x) || defined(CONFIG_BOOKE) ++ mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | ++ DBCR0_IC | DBCR0_IDM); ++ linux_regs->msr |= MSR_DE; ++#else ++ linux_regs->msr |= MSR_SE; ++#endif ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ } ++ return 0; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Global data ++ */ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0x7d, 0x82, 0x10, 0x08}, ++}; ++ ++int kgdb_arch_init(void) ++{ ++ debugger = kgdb_debugger; ++ debugger_bpt = kgdb_breakpoint; ++ debugger_sstep = kgdb_singlestep; ++ debugger_iabr_match = kgdb_iabr_match; ++ debugger_dabr_match = kgdb_dabr_match; ++ ++ return 0; ++} ++ ++arch_initcall(kgdb_arch_init); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/kernel/ppc-stub.c linux-2.6.18.kgdb/arch/ppc/kernel/ppc-stub.c +--- linux-2.6.18/arch/ppc/kernel/ppc-stub.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/kernel/ppc-stub.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,866 +0,0 @@ +-/* +- * ppc-stub.c: KGDB support for the Linux kernel. +- * +- * adapted from arch/sparc/kernel/sparc-stub.c for the PowerPC +- * some stuff borrowed from Paul Mackerras' xmon +- * Copyright (C) 1998 Michael AK Tesch (tesch@cs.wisc.edu) +- * +- * Modifications to run under Linux +- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) +- * +- * This file originally came from the gdb sources, and the +- * copyright notices have been retained below. +- */ +- +-/**************************************************************************** +- +- THIS SOFTWARE IS NOT COPYRIGHTED +- +- HP offers the following for use in the public domain. HP makes no +- warranty with regard to the software or its performance and the +- user accepts the software "AS IS" with all faults. +- +- HP DISCLAIMS ANY WARRANTIES, EXPRESS OR IMPLIED, WITH REGARD +- TO THIS SOFTWARE INCLUDING BUT NOT LIMITED TO THE WARRANTIES +- OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. 
+- +-****************************************************************************/ +- +-/**************************************************************************** +- * Header: remcom.c,v 1.34 91/03/09 12:29:49 glenne Exp $ +- * +- * Module name: remcom.c $ +- * Revision: 1.34 $ +- * Date: 91/03/09 12:29:49 $ +- * Contributor: Lake Stevens Instrument Division$ +- * +- * Description: low level support for gdb debugger. $ +- * +- * Considerations: only works on target hardware $ +- * +- * Written by: Glenn Engel $ +- * ModuleState: Experimental $ +- * +- * NOTES: See Below $ +- * +- * Modified for SPARC by Stu Grossman, Cygnus Support. +- * +- * This code has been extensively tested on the Fujitsu SPARClite demo board. +- * +- * To enable debugger support, two things need to happen. One, a +- * call to set_debug_traps() is necessary in order to allow any breakpoints +- * or error conditions to be properly intercepted and reported to gdb. +- * Two, a breakpoint needs to be generated to begin communication. This +- * is most easily accomplished by a call to breakpoint(). Breakpoint() +- * simulates a breakpoint by executing a trap #1. +- * +- ************* +- * +- * The following gdb commands are supported: +- * +- * command function Return value +- * +- * g return the value of the CPU registers hex data or ENN +- * G set the value of the CPU registers OK or ENN +- * qOffsets Get section offsets. Reply is Text=xxx;Data=yyy;Bss=zzz +- * +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN +- * +- * c Resume at current address SNN ( signal NN) +- * cAA..AA Continue at address AA..AA SNN +- * +- * s Step one instruction SNN +- * sAA..AA Step one instruction from AA..AA SNN +- * +- * k kill +- * +- * ? What was the last sigval ? SNN (signal NN) +- * +- * bBB..BB Set baud rate to BB..BB OK or BNN, then sets +- * baud rate +- * +- * All commands and responses are sent with a packet which includes a +- * checksum. A packet consists of +- * +- * $#. +- * +- * where +- * :: +- * :: > +- * +- * When a packet is received, it is first acknowledged with either '+' or '-'. +- * '+' indicates a successful transfer. '-' indicates a failed transfer. 
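The framing this removed comment describes is the standard gdb remote serial protocol: a packet is '$', the payload, '#', then two hex digits giving the low eight bits of the sum of the payload bytes. A freestanding sketch of the sending side (putpacket() further below implements the same arithmetic on the target):

	#include <stdio.h>

	static const char hexchars[] = "0123456789abcdef";

	/* Frame a payload as '$' <payload> '#' <checksum>, where the
	 * checksum is the low 8 bits of the sum of the payload bytes,
	 * sent as two hex digits. */
	static void frame_packet(const char *payload)
	{
		unsigned char checksum = 0;
		const char *p;

		for (p = payload; *p; p++)
			checksum += *p;

		printf("$%s#%c%c\n", payload,
		       hexchars[checksum >> 4], hexchars[checksum & 0xf]);
	}

	/* frame_packet("m0,10") prints "$m0,10#2a" -- the same host
	 * packet shown in the example just below. */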
+- * +- * Example: +- * +- * Host: Reply: +- * $m0,10#2a +$00010203040506070809101112131415#42 +- * +- ****************************************************************************/ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-void breakinst(void); +- +-/* +- * BUFMAX defines the maximum number of characters in inbound/outbound buffers +- * at least NUMREGBYTES*2 are needed for register packets +- */ +-#define BUFMAX 2048 +-static char remcomInBuffer[BUFMAX]; +-static char remcomOutBuffer[BUFMAX]; +- +-static int initialized; +-static int kgdb_active; +-static int kgdb_started; +-static u_int fault_jmp_buf[100]; +-static int kdebug; +- +- +-static const char hexchars[]="0123456789abcdef"; +- +-/* Place where we save old trap entries for restoration - sparc*/ +-/* struct tt_entry kgdb_savettable[256]; */ +-/* typedef void (*trapfunc_t)(void); */ +- +-static void kgdb_fault_handler(struct pt_regs *regs); +-static int handle_exception (struct pt_regs *regs); +- +-#if 0 +-/* Install an exception handler for kgdb */ +-static void exceptionHandler(int tnum, unsigned int *tfunc) +-{ +- /* We are dorking with a live trap table, all irqs off */ +-} +-#endif +- +-int +-kgdb_setjmp(long *buf) +-{ +- asm ("mflr 0; stw 0,0(%0);" +- "stw 1,4(%0); stw 2,8(%0);" +- "mfcr 0; stw 0,12(%0);" +- "stmw 13,16(%0)" +- : : "r" (buf)); +- /* XXX should save fp regs as well */ +- return 0; +-} +-void +-kgdb_longjmp(long *buf, int val) +-{ +- if (val == 0) +- val = 1; +- asm ("lmw 13,16(%0);" +- "lwz 0,12(%0); mtcrf 0x38,0;" +- "lwz 0,0(%0); lwz 1,4(%0); lwz 2,8(%0);" +- "mtlr 0; mr 3,%1" +- : : "r" (buf), "r" (val)); +-} +-/* Convert ch from a hex digit to an int */ +-static int +-hex(unsigned char ch) +-{ +- if (ch >= 'a' && ch <= 'f') +- return ch-'a'+10; +- if (ch >= '0' && ch <= '9') +- return ch-'0'; +- if (ch >= 'A' && ch <= 'F') +- return ch-'A'+10; +- return -1; +-} +- +-/* Convert the memory pointed to by mem into hex, placing result in buf. +- * Return a pointer to the last char put in buf (null), in case of mem fault, +- * return 0. +- */ +-static unsigned char * +-mem2hex(const char *mem, char *buf, int count) +-{ +- unsigned char ch; +- unsigned short tmp_s; +- unsigned long tmp_l; +- +- if (kgdb_setjmp((long*)fault_jmp_buf) == 0) { +- debugger_fault_handler = kgdb_fault_handler; +- +- /* Accessing 16 bit and 32 bit objects in a single +- ** load instruction is required to avoid bad side +- ** effects for some IO registers. 
+- */ +- +- if ((count == 2) && (((long)mem & 1) == 0)) { +- tmp_s = *(unsigned short *)mem; +- mem += 2; +- *buf++ = hexchars[(tmp_s >> 12) & 0xf]; +- *buf++ = hexchars[(tmp_s >> 8) & 0xf]; +- *buf++ = hexchars[(tmp_s >> 4) & 0xf]; +- *buf++ = hexchars[tmp_s & 0xf]; +- +- } else if ((count == 4) && (((long)mem & 3) == 0)) { +- tmp_l = *(unsigned int *)mem; +- mem += 4; +- *buf++ = hexchars[(tmp_l >> 28) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 24) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 20) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 16) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 12) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 8) & 0xf]; +- *buf++ = hexchars[(tmp_l >> 4) & 0xf]; +- *buf++ = hexchars[tmp_l & 0xf]; +- +- } else { +- while (count-- > 0) { +- ch = *mem++; +- *buf++ = hexchars[ch >> 4]; +- *buf++ = hexchars[ch & 0xf]; +- } +- } +- +- } else { +- /* error condition */ +- } +- debugger_fault_handler = NULL; +- *buf = 0; +- return buf; +-} +- +-/* convert the hex array pointed to by buf into binary to be placed in mem +- * return a pointer to the character AFTER the last byte written. +-*/ +-static char * +-hex2mem(char *buf, char *mem, int count) +-{ +- unsigned char ch; +- int i; +- char *orig_mem; +- unsigned short tmp_s; +- unsigned long tmp_l; +- +- orig_mem = mem; +- +- if (kgdb_setjmp((long*)fault_jmp_buf) == 0) { +- debugger_fault_handler = kgdb_fault_handler; +- +- /* Accessing 16 bit and 32 bit objects in a single +- ** store instruction is required to avoid bad side +- ** effects for some IO registers. +- */ +- +- if ((count == 2) && (((long)mem & 1) == 0)) { +- tmp_s = hex(*buf++) << 12; +- tmp_s |= hex(*buf++) << 8; +- tmp_s |= hex(*buf++) << 4; +- tmp_s |= hex(*buf++); +- +- *(unsigned short *)mem = tmp_s; +- mem += 2; +- +- } else if ((count == 4) && (((long)mem & 3) == 0)) { +- tmp_l = hex(*buf++) << 28; +- tmp_l |= hex(*buf++) << 24; +- tmp_l |= hex(*buf++) << 20; +- tmp_l |= hex(*buf++) << 16; +- tmp_l |= hex(*buf++) << 12; +- tmp_l |= hex(*buf++) << 8; +- tmp_l |= hex(*buf++) << 4; +- tmp_l |= hex(*buf++); +- +- *(unsigned long *)mem = tmp_l; +- mem += 4; +- +- } else { +- for (i=0; i# */ +-static void +-getpacket(char *buffer) +-{ +- unsigned char checksum; +- unsigned char xmitcsum; +- int i; +- int count; +- unsigned char ch; +- +- do { +- /* wait around for the start character, ignore all other +- * characters */ +- while ((ch = (getDebugChar() & 0x7f)) != '$') ; +- +- checksum = 0; +- xmitcsum = -1; +- +- count = 0; +- +- /* now, read until a # or end of buffer is found */ +- while (count < BUFMAX) { +- ch = getDebugChar() & 0x7f; +- if (ch == '#') +- break; +- checksum = checksum + ch; +- buffer[count] = ch; +- count = count + 1; +- } +- +- if (count >= BUFMAX) +- continue; +- +- buffer[count] = 0; +- +- if (ch == '#') { +- xmitcsum = hex(getDebugChar() & 0x7f) << 4; +- xmitcsum |= hex(getDebugChar() & 0x7f); +- if (checksum != xmitcsum) +- putDebugChar('-'); /* failed checksum */ +- else { +- putDebugChar('+'); /* successful transfer */ +- /* if a sequence char is present, reply the ID */ +- if (buffer[2] == ':') { +- putDebugChar(buffer[0]); +- putDebugChar(buffer[1]); +- /* remove sequence chars from buffer */ +- count = strlen(buffer); +- for (i=3; i <= count; i++) +- buffer[i-3] = buffer[i]; +- } +- } +- } +- } while (checksum != xmitcsum); +-} +- +-/* send the packet in buffer. */ +-static void putpacket(unsigned char *buffer) +-{ +- unsigned char checksum; +- int count; +- unsigned char ch, recv; +- +- /* $#. 
*/ +- do { +- putDebugChar('$'); +- checksum = 0; +- count = 0; +- +- while ((ch = buffer[count])) { +- putDebugChar(ch); +- checksum += ch; +- count += 1; +- } +- +- putDebugChar('#'); +- putDebugChar(hexchars[checksum >> 4]); +- putDebugChar(hexchars[checksum & 0xf]); +- recv = getDebugChar(); +- } while ((recv & 0x7f) != '+'); +-} +- +-static void kgdb_flush_cache_all(void) +-{ +- flush_instruction_cache(); +-} +- +-/* Set up exception handlers for tracing and breakpoints +- * [could be called kgdb_init()] +- */ +-void set_debug_traps(void) +-{ +-#if 0 +- unsigned char c; +- +- save_and_cli(flags); +- +- /* In case GDB is started before us, ack any packets (presumably +- * "$?#xx") sitting there. +- * +- * I've found this code causes more problems than it solves, +- * so that's why it's commented out. GDB seems to work fine +- * now starting either before or after the kernel -bwb +- */ +- +- while((c = getDebugChar()) != '$'); +- while((c = getDebugChar()) != '#'); +- c = getDebugChar(); /* eat first csum byte */ +- c = getDebugChar(); /* eat second csum byte */ +- putDebugChar('+'); /* ack it */ +-#endif +- debugger = kgdb; +- debugger_bpt = kgdb_bpt; +- debugger_sstep = kgdb_sstep; +- debugger_iabr_match = kgdb_iabr_match; +- debugger_dabr_match = kgdb_dabr_match; +- +- initialized = 1; +-} +- +-static void kgdb_fault_handler(struct pt_regs *regs) +-{ +- kgdb_longjmp((long*)fault_jmp_buf, 1); +-} +- +-int kgdb_bpt(struct pt_regs *regs) +-{ +- return handle_exception(regs); +-} +- +-int kgdb_sstep(struct pt_regs *regs) +-{ +- return handle_exception(regs); +-} +- +-void kgdb(struct pt_regs *regs) +-{ +- handle_exception(regs); +-} +- +-int kgdb_iabr_match(struct pt_regs *regs) +-{ +- printk(KERN_ERR "kgdb doesn't support iabr, what?!?\n"); +- return handle_exception(regs); +-} +- +-int kgdb_dabr_match(struct pt_regs *regs) +-{ +- printk(KERN_ERR "kgdb doesn't support dabr, what?!?\n"); +- return handle_exception(regs); +-} +- +-/* Convert the hardware trap type code to a unix signal number. */ +-/* +- * This table contains the mapping between PowerPC hardware trap types, and +- * signals, which are primarily what GDB understands. 
+- */ +-static struct hard_trap_info +-{ +- unsigned int tt; /* Trap type code for powerpc */ +- unsigned char signo; /* Signal that we map this trap into */ +-} hard_trap_info[] = { +-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +- { 0x100, SIGINT }, /* critical input interrupt */ +- { 0x200, SIGSEGV }, /* machine check */ +- { 0x300, SIGSEGV }, /* data storage */ +- { 0x400, SIGBUS }, /* instruction storage */ +- { 0x500, SIGINT }, /* interrupt */ +- { 0x600, SIGBUS }, /* alignment */ +- { 0x700, SIGILL }, /* program */ +- { 0x800, SIGILL }, /* reserved */ +- { 0x900, SIGILL }, /* reserved */ +- { 0xa00, SIGILL }, /* reserved */ +- { 0xb00, SIGILL }, /* reserved */ +- { 0xc00, SIGCHLD }, /* syscall */ +- { 0xd00, SIGILL }, /* reserved */ +- { 0xe00, SIGILL }, /* reserved */ +- { 0xf00, SIGILL }, /* reserved */ +- /* +- ** 0x1000 PIT +- ** 0x1010 FIT +- ** 0x1020 watchdog +- ** 0x1100 data TLB miss +- ** 0x1200 instruction TLB miss +- */ +- { 0x2002, SIGTRAP}, /* debug */ +-#else +- { 0x200, SIGSEGV }, /* machine check */ +- { 0x300, SIGSEGV }, /* address error (store) */ +- { 0x400, SIGBUS }, /* instruction bus error */ +- { 0x500, SIGINT }, /* interrupt */ +- { 0x600, SIGBUS }, /* alingment */ +- { 0x700, SIGTRAP }, /* breakpoint trap */ +- { 0x800, SIGFPE }, /* fpu unavail */ +- { 0x900, SIGALRM }, /* decrementer */ +- { 0xa00, SIGILL }, /* reserved */ +- { 0xb00, SIGILL }, /* reserved */ +- { 0xc00, SIGCHLD }, /* syscall */ +- { 0xd00, SIGTRAP }, /* single-step/watch */ +- { 0xe00, SIGFPE }, /* fp assist */ +-#endif +- { 0, 0} /* Must be last */ +- +-}; +- +-static int computeSignal(unsigned int tt) +-{ +- struct hard_trap_info *ht; +- +- for (ht = hard_trap_info; ht->tt && ht->signo; ht++) +- if (ht->tt == tt) +- return ht->signo; +- +- return SIGHUP; /* default for things we don't know about */ +-} +- +-#define PC_REGNUM 64 +-#define SP_REGNUM 1 +- +-/* +- * This function does all command processing for interfacing to gdb. +- */ +-static int +-handle_exception (struct pt_regs *regs) +-{ +- int sigval; +- int addr; +- int length; +- char *ptr; +- unsigned int msr; +- +- /* We don't handle user-mode breakpoints. */ +- if (user_mode(regs)) +- return 0; +- +- if (debugger_fault_handler) { +- debugger_fault_handler(regs); +- panic("kgdb longjump failed!\n"); +- } +- if (kgdb_active) { +- printk(KERN_ERR "interrupt while in kgdb, returning\n"); +- return 0; +- } +- +- kgdb_active = 1; +- kgdb_started = 1; +- +-#ifdef KGDB_DEBUG +- printk("kgdb: entering handle_exception; trap [0x%x]\n", +- (unsigned int)regs->trap); +-#endif +- +- kgdb_interruptible(0); +- lock_kernel(); +- msr = mfmsr(); +- mtmsr(msr & ~MSR_EE); /* disable interrupts */ +- +- if (regs->nip == (unsigned long)breakinst) { +- /* Skip over breakpoint trap insn */ +- regs->nip += 4; +- } +- +- /* reply to host that an exception has occurred */ +- sigval = computeSignal(regs->trap); +- ptr = remcomOutBuffer; +- +- *ptr++ = 'T'; +- *ptr++ = hexchars[sigval >> 4]; +- *ptr++ = hexchars[sigval & 0xf]; +- *ptr++ = hexchars[PC_REGNUM >> 4]; +- *ptr++ = hexchars[PC_REGNUM & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex((char *)®s->nip, ptr, 4); +- *ptr++ = ';'; +- *ptr++ = hexchars[SP_REGNUM >> 4]; +- *ptr++ = hexchars[SP_REGNUM & 0xf]; +- *ptr++ = ':'; +- ptr = mem2hex(((char *)regs) + SP_REGNUM*4, ptr, 4); +- *ptr++ = ';'; +- *ptr++ = 0; +- +- putpacket(remcomOutBuffer); +- if (kdebug) +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- +- /* XXX We may want to add some features dealing with poking the +- * XXX page tables, ... 
(look at sparc-stub.c for more info) +- * XXX also required hacking to the gdb sources directly... +- */ +- +- while (1) { +- remcomOutBuffer[0] = 0; +- +- getpacket(remcomInBuffer); +- switch (remcomInBuffer[0]) { +- case '?': /* report most recent signal */ +- remcomOutBuffer[0] = 'S'; +- remcomOutBuffer[1] = hexchars[sigval >> 4]; +- remcomOutBuffer[2] = hexchars[sigval & 0xf]; +- remcomOutBuffer[3] = 0; +- break; +-#if 0 +- case 'q': /* this screws up gdb for some reason...*/ +- { +- extern long _start, sdata, __bss_start; +- +- ptr = &remcomInBuffer[1]; +- if (strncmp(ptr, "Offsets", 7) != 0) +- break; +- +- ptr = remcomOutBuffer; +- sprintf(ptr, "Text=%8.8x;Data=%8.8x;Bss=%8.8x", +- &_start, &sdata, &__bss_start); +- break; +- } +-#endif +- case 'd': +- /* toggle debug flag */ +- kdebug ^= 1; +- break; +- +- case 'g': /* return the value of the CPU registers. +- * some of them are non-PowerPC names :( +- * they are stored in gdb like: +- * struct { +- * u32 gpr[32]; +- * f64 fpr[32]; +- * u32 pc, ps, cnd, lr; (ps=msr) +- * u32 cnt, xer, mq; +- * } +- */ +- { +- int i; +- ptr = remcomOutBuffer; +- /* General Purpose Regs */ +- ptr = mem2hex((char *)regs, ptr, 32 * 4); +- /* Floating Point Regs - FIXME */ +- /*ptr = mem2hex((char *), ptr, 32 * 8);*/ +- for(i=0; i<(32*8*2); i++) { /* 2chars/byte */ +- ptr[i] = '0'; +- } +- ptr += 32*8*2; +- /* pc, msr, cr, lr, ctr, xer, (mq is unused) */ +- ptr = mem2hex((char *)®s->nip, ptr, 4); +- ptr = mem2hex((char *)®s->msr, ptr, 4); +- ptr = mem2hex((char *)®s->ccr, ptr, 4); +- ptr = mem2hex((char *)®s->link, ptr, 4); +- ptr = mem2hex((char *)®s->ctr, ptr, 4); +- ptr = mem2hex((char *)®s->xer, ptr, 4); +- } +- break; +- +- case 'G': /* set the value of the CPU registers */ +- { +- ptr = &remcomInBuffer[1]; +- +- /* +- * If the stack pointer has moved, you should pray. +- * (cause only god can help you). +- */ +- +- /* General Purpose Regs */ +- hex2mem(ptr, (char *)regs, 32 * 4); +- +- /* Floating Point Regs - FIXME?? */ +- /*ptr = hex2mem(ptr, ??, 32 * 8);*/ +- ptr += 32*8*2; +- +- /* pc, msr, cr, lr, ctr, xer, (mq is unused) */ +- ptr = hex2mem(ptr, (char *)®s->nip, 4); +- ptr = hex2mem(ptr, (char *)®s->msr, 4); +- ptr = hex2mem(ptr, (char *)®s->ccr, 4); +- ptr = hex2mem(ptr, (char *)®s->link, 4); +- ptr = hex2mem(ptr, (char *)®s->ctr, 4); +- ptr = hex2mem(ptr, (char *)®s->xer, 4); +- +- strcpy(remcomOutBuffer,"OK"); +- } +- break; +- case 'H': +- /* don't do anything, yet, just acknowledge */ +- hexToInt(&ptr, &addr); +- strcpy(remcomOutBuffer,"OK"); +- break; +- +- case 'm': /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ +- /* Try to read %x,%x. */ +- +- ptr = &remcomInBuffer[1]; +- +- if (hexToInt(&ptr, &addr) && *ptr++ == ',' +- && hexToInt(&ptr, &length)) { +- if (mem2hex((char *)addr, remcomOutBuffer, +- length)) +- break; +- strcpy(remcomOutBuffer, "E03"); +- } else +- strcpy(remcomOutBuffer, "E01"); +- break; +- +- case 'M': /* MAA..AA,LLLL: Write LLLL bytes at address AA.AA return OK */ +- /* Try to read '%x,%x:'. 
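
Both memory packets are plain ASCII: 'm<addr>,<len>' reads, 'M<addr>,<len>:<hex data>' writes, with "E01"/"E02" returned for malformed packets and "E03" for a faulting access. The parse the hexToInt() calls perform is equivalent to this sketch built on the kernel's simple_strtoul(); the function name and return convention here are ours, not the stub's:

	static int example_parse_mem_packet(const char *pkt, unsigned long *addr,
					    unsigned long *len)
	{
		char *end;

		*addr = simple_strtoul(&pkt[1], &end, 16);	/* skip 'm'/'M' */
		if (*end != ',')
			return -1;		/* malformed -> "E01"/"E02" */
		*len = simple_strtoul(end + 1, &end, 16);
		return 0;			/* for 'M', *end should now be ':' */
	}
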
*/ +- +- ptr = &remcomInBuffer[1]; +- +- if (hexToInt(&ptr, &addr) && *ptr++ == ',' +- && hexToInt(&ptr, &length) +- && *ptr++ == ':') { +- if (hex2mem(ptr, (char *)addr, length)) +- strcpy(remcomOutBuffer, "OK"); +- else +- strcpy(remcomOutBuffer, "E03"); +- flush_icache_range(addr, addr+length); +- } else +- strcpy(remcomOutBuffer, "E02"); +- break; +- +- +- case 'k': /* kill the program, actually just continue */ +- case 'c': /* cAA..AA Continue; address AA..AA optional */ +- /* try to read optional parameter, pc unchanged if no parm */ +- +- ptr = &remcomInBuffer[1]; +- if (hexToInt(&ptr, &addr)) +- regs->nip = addr; +- +-/* Need to flush the instruction cache here, as we may have deposited a +- * breakpoint, and the icache probably has no way of knowing that a data ref to +- * some location may have changed something that is in the instruction cache. +- */ +- kgdb_flush_cache_all(); +- mtmsr(msr); +- +- kgdb_interruptible(1); +- unlock_kernel(); +- kgdb_active = 0; +- if (kdebug) { +- printk("remcomInBuffer: %s\n", remcomInBuffer); +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- } +- return 1; +- +- case 's': +- kgdb_flush_cache_all(); +-#if defined(CONFIG_40x) || defined(CONFIG_BOOKE) +- mtspr(SPRN_DBCR0, mfspr(SPRN_DBCR0) | DBCR0_IC); +- regs->msr |= MSR_DE; +-#else +- regs->msr |= MSR_SE; +-#endif +- unlock_kernel(); +- kgdb_active = 0; +- if (kdebug) { +- printk("remcomInBuffer: %s\n", remcomInBuffer); +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- } +- return 1; +- +- case 'r': /* Reset (if user process..exit ???)*/ +- panic("kgdb reset."); +- break; +- } /* switch */ +- if (remcomOutBuffer[0] && kdebug) { +- printk("remcomInBuffer: %s\n", remcomInBuffer); +- printk("remcomOutBuffer: %s\n", remcomOutBuffer); +- } +- /* reply to the request */ +- putpacket(remcomOutBuffer); +- } /* while(1) */ +-} +- +-/* This function will generate a breakpoint exception. It is used at the +- beginning of a program to sync up with a debugger and can be used +- otherwise as a quick means to stop program execution and "break" into +- the debugger. */ +- +-void +-breakpoint(void) +-{ +- if (!initialized) { +- printk("breakpoint() called b4 kgdb init\n"); +- return; +- } +- +- asm(" .globl breakinst \n\ +- breakinst: .long 0x7d821008"); +-} +- +-#ifdef CONFIG_KGDB_CONSOLE +-/* Output string in GDB O-packet format if GDB has connected. If nothing +- output, returns 0 (caller must then handle output). */ +-int +-kgdb_output_string (const char* s, unsigned int count) +-{ +- char buffer[512]; +- +- if (!kgdb_started) +- return 0; +- +- count = (count <= (sizeof(buffer) / 2 - 2)) +- ? 
count : (sizeof(buffer) / 2 - 2); +- +- buffer[0] = 'O'; +- mem2hex (s, &buffer[1], count); +- putpacket(buffer); +- +- return 1; +-} +-#endif +- +-static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, +- struct tty_struct *tty) +-{ +- printk("Entering GDB stub\n"); +- breakpoint(); +-} +-static struct sysrq_key_op sysrq_gdb_op = { +- .handler = sysrq_handle_gdb, +- .help_msg = "Gdb", +- .action_msg = "GDB", +-}; +- +-static int gdb_register_sysrq(void) +-{ +- printk("Registering GDB sysrq handler\n"); +- register_sysrq_key('g', &sysrq_gdb_op); +- return 0; +-} +-module_init(gdb_register_sysrq); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/kernel/setup.c linux-2.6.18.kgdb/arch/ppc/kernel/setup.c +--- linux-2.6.18/arch/ppc/kernel/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/kernel/setup.c 2008-06-10 16:19:22.000000000 +0400 +@@ -47,10 +47,6 @@ + #include + #endif + +-#if defined CONFIG_KGDB +-#include +-#endif +- + extern void platform_init(unsigned long r3, unsigned long r4, + unsigned long r5, unsigned long r6, unsigned long r7); + extern void identify_cpu(unsigned long offset, unsigned long cpu); +@@ -504,18 +500,6 @@ void __init setup_arch(char **cmdline_p) + #endif /* CONFIG_XMON */ + if ( ppc_md.progress ) ppc_md.progress("setup_arch: enter", 0x3eab); + +-#if defined(CONFIG_KGDB) +- if (ppc_md.kgdb_map_scc) +- ppc_md.kgdb_map_scc(); +- set_debug_traps(); +- if (strstr(cmd_line, "gdb")) { +- if (ppc_md.progress) +- ppc_md.progress("setup_arch: kgdb breakpoint", 0x4000); +- printk("kgdb breakpoint activated\n"); +- breakpoint(); +- } +-#endif +- + /* + * Set cache line size based on type of cpu as a default. + * Systems with OF can look in the properties on the cpu node(s) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/mm/fault.c linux-2.6.18.kgdb/arch/ppc/mm/fault.c +--- linux-2.6.18/arch/ppc/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/mm/fault.c 2008-06-10 16:19:22.000000000 +0400 +@@ -25,6 +25,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -329,6 +330,14 @@ bad_page_fault(struct pt_regs *regs, uns + return; + } + ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) { ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Not reached. */ ++ } ++#endif ++ + /* kernel has accessed a bad area */ + #if defined(CONFIG_XMON) || defined(CONFIG_KGDB) + if (debugger_kernel_faults) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/bubinga.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/bubinga.c +--- linux-2.6.18/arch/ppc/platforms/4xx/bubinga.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/bubinga.c 2008-06-10 16:19:22.000000000 +0400 +@@ -4,7 +4,7 @@ + * Author: SAW (IBM), derived from walnut.c. + * Maintained by MontaVista Software + * +- * 2003 (c) MontaVista Softare Inc. This file is licensed under the ++ * 2003-2004 (c) MontaVista Softare Inc. This file is licensed under the + * terms of the GNU General Public License version 2. This program is + * licensed "as is" without any warranty of any kind, whether express + * or implied. 
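
All of the board-file hunks below apply one and the same conversion: the old CONFIG_KGDB path (ppc_md.kgdb_map_scc / ppc_md.early_serial_map and the gen550 kgdb hooks) is removed, and the struct uart_port that already feeds early_serial_setup() is additionally handed to the kgdb8250 driver. A condensed sketch of the resulting shape, with the board-specific EXAMPLE_* constants left hypothetical:

	static void __init example_early_serial_map(void)
	{
		struct uart_port port;

		memset(&port, 0, sizeof(port));
		port.membase = (void *)EXAMPLE_UART0_IO_BASE;	/* hypothetical */
		port.irq     = EXAMPLE_UART0_INT;		/* hypothetical */
		port.uartclk = EXAMPLE_UART_CLOCK;		/* hypothetical */
		port.iotype  = UPIO_MEM;
		port.flags   = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST;
		port.line    = 0;

	#ifdef CONFIG_SERIAL_8250
		/* early_serial_setup() returns 0 on success */
		if (early_serial_setup(&port) != 0)
			printk("Early serial init of port 0 failed\n");
	#endif
	#ifdef CONFIG_KGDB_8250
		/* register the same port with the kgdb 8250 driver */
		kgdb8250_add_port(0, &port);
	#endif
	}

Note in passing that two of the converted hunks below (ebony and ocotea, port 1) test early_serial_setup(&port) != 1 rather than != 0, so the failure printk in those hunks fires on success.
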
+@@ -100,17 +100,26 @@ bubinga_early_serial_map(void) + port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST; + port.line = 0; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 0 failed\n"); +- } ++#endif ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + + port.membase = (void*)ACTING_UART1_IO_BASE; + port.irq = ACTING_UART1_INT; + port.line = 1; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 1 failed\n"); +- } ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &port); ++#endif + } + + void __init +@@ -255,8 +264,4 @@ platform_init(unsigned long r3, unsigned + ppc_md.nvram_read_val = todc_direct_read_val; + ppc_md.nvram_write_val = todc_direct_write_val; + #endif +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = bubinga_early_serial_map; +-#endif + } +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/ebony.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ebony.c +--- linux-2.6.18/arch/ppc/platforms/4xx/ebony.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ebony.c 2008-06-10 16:19:22.000000000 +0400 +@@ -32,6 +32,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -226,14 +227,20 @@ ebony_early_serial_map(void) + port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST; + port.line = 0; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 0 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(0, &port); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + /* Purge TLB entry added in head_44x.S for early serial access */ + _tlbie(UART0_IO_BASE); + #endif +@@ -243,14 +250,18 @@ ebony_early_serial_map(void) + port.uartclk = clocks.uart1; + port.line = 1; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 1) + printk("Early serial init of port 1 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(1, &port); + #endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &port); ++#endif + } + + static void __init +@@ -327,8 +338,4 @@ void __init platform_init(unsigned long + + ppc_md.nvram_read_val = todc_direct_read_val; + ppc_md.nvram_write_val = todc_direct_write_val; +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = ebony_early_serial_map; +-#endif + } +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/ocotea.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ocotea.c +--- linux-2.6.18/arch/ppc/platforms/4xx/ocotea.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/ocotea.c 2008-06-10 16:19:22.000000000 +0400 +@@ -30,6 +30,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -249,14 +250,20 @@ ocotea_early_serial_map(void) + port.flags = UPF_BOOT_AUTOCONF | UPF_SKIP_TEST; + port.line = 0; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) + printk("Early serial init of port 0 
failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(0, &port); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + /* Purge TLB entry added in head_44x.S for early serial access */ + _tlbie(UART0_IO_BASE); + #endif +@@ -266,14 +273,18 @@ ocotea_early_serial_map(void) + port.uartclk = clocks.uart1; + port.line = 1; + +- if (early_serial_setup(&port) != 0) { ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 1) + printk("Early serial init of port 1 failed\n"); +- } ++#endif + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Configure debug serial access */ + gen550_init(1, &port); + #endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &port); ++#endif + } + + static void __init +@@ -343,8 +354,5 @@ void __init platform_init(unsigned long + + ppc_md.nvram_read_val = todc_direct_read_val; + ppc_md.nvram_write_val = todc_direct_write_val; +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = ocotea_early_serial_map; +-#endif + ppc_md.init = ocotea_init; + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/4xx/xilinx_ml300.c linux-2.6.18.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c +--- linux-2.6.18/arch/ppc/platforms/4xx/xilinx_ml300.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/4xx/xilinx_ml300.c 2008-06-10 16:19:22.000000000 +0400 +@@ -41,9 +41,6 @@ + * ppc4xx_map_io arch/ppc/syslib/ppc4xx_setup.c + * start_kernel init/main.c + * setup_arch arch/ppc/kernel/setup.c +- * #if defined(CONFIG_KGDB) +- * *ppc_md.kgdb_map_scc() == gen550_kgdb_map_scc +- * #endif + * *ppc_md.setup_arch == ml300_setup_arch this file + * ppc4xx_setup_arch arch/ppc/syslib/ppc4xx_setup.c + * ppc4xx_find_bridges arch/ppc/syslib/ppc405_pci.c +@@ -117,7 +114,6 @@ ml300_early_serial_init(int num, struct + void __init + ml300_early_serial_map(void) + { +-#ifdef CONFIG_SERIAL_8250 + struct plat_serial8250_port *pdata; + int i = 0; + +@@ -129,7 +125,14 @@ ml300_early_serial_map(void) + pdata++; + i++; + } +-#endif /* CONFIG_SERIAL_8250 */ ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&port) != 0) ++ printk("Early serial init of port %d failed\n", i); ++#endif ++ ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(i, &port) ++#endif + } + + void __init +@@ -165,9 +168,4 @@ platform_init(unsigned long r3, unsigned + #if defined(XPAR_POWER_0_POWERDOWN_BASEADDR) + ppc_md.power_off = xilinx_power_off; + #endif +- +-#ifdef CONFIG_KGDB +- ppc_md.early_serial_map = ml300_early_serial_map; +-#endif + } +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/85xx/sbc8560.c linux-2.6.18.kgdb/arch/ppc/platforms/85xx/sbc8560.c +--- linux-2.6.18/arch/ppc/platforms/85xx/sbc8560.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/85xx/sbc8560.c 2008-06-10 16:19:22.000000000 +0400 +@@ -50,7 +50,6 @@ + #include + #include + +-#ifdef CONFIG_SERIAL_8250 + static void __init + sbc8560_early_serial_map(void) + { +@@ -66,12 +65,16 @@ sbc8560_early_serial_map(void) + uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART0_SIZE); + uart_req.type = PORT_16650; + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) +- gen550_init(0, &uart_req); ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&uart_req) != 0) ++ printk("Early serial init of 
port 0 failed\n"); ++#endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(0, &uart_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &uart_req); + #endif +- +- if (early_serial_setup(&uart_req) != 0) +- printk("Early serial init of port 0 failed\n"); + + /* Assume early_serial_setup() doesn't modify uart_req */ + uart_req.line = 1; +@@ -79,14 +82,17 @@ sbc8560_early_serial_map(void) + uart_req.membase = ioremap(uart_req.mapbase, MPC85xx_UART1_SIZE); + uart_req.irq = MPC85xx_IRQ_EXT10; + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) +- gen550_init(1, &uart_req); ++#ifdef CONFIG_SERIAL_8250 ++ if (early_serial_setup(&uart_req) != 0) ++ printk("Early serial init of port 0 failed\n"); + #endif +- +- if (early_serial_setup(&uart_req) != 0) +- printk("Early serial init of port 1 failed\n"); +-} ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(0, &uart_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &uart_req); + #endif ++} + + /* ************************************************************************ + * +@@ -115,9 +121,7 @@ sbc8560_setup_arch(void) + /* setup PCI host bridges */ + mpc85xx_setup_hose(); + #endif +-#ifdef CONFIG_SERIAL_8250 + sbc8560_early_serial_map(); +-#endif + #ifdef CONFIG_SERIAL_TEXT_DEBUG + /* Invalidate the entry we stole earlier the serial ports + * should be properly mapped */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/chestnut.c linux-2.6.18.kgdb/arch/ppc/platforms/chestnut.c +--- linux-2.6.18/arch/ppc/platforms/chestnut.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/chestnut.c 2008-06-10 16:19:22.000000000 +0400 +@@ -492,7 +492,7 @@ chestnut_power_off(void) + static void __init + chestnut_map_io(void) + { +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + io_block_mapping(CHESTNUT_UART_BASE, CHESTNUT_UART_BASE, 0x100000, + _PAGE_IO); + #endif +@@ -566,9 +566,6 @@ platform_init(unsigned long r3, unsigned + #if defined(CONFIG_SERIAL_TEXT_DEBUG) + ppc_md.progress = gen550_progress; + #endif +-#if defined(CONFIG_KGDB) +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + + if (ppc_md.progress) + ppc_md.progress("chestnut_init(): exit", 0); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/pplus.c linux-2.6.18.kgdb/arch/ppc/platforms/pplus.c +--- linux-2.6.18/arch/ppc/platforms/pplus.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/pplus.c 2008-06-10 16:19:22.000000000 +0400 +@@ -893,9 +893,6 @@ platform_init(unsigned long r3, unsigned + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + #ifdef CONFIG_SMP + smp_ops = &pplus_smp_ops; + #endif /* CONFIG_SMP */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/sandpoint.c linux-2.6.18.kgdb/arch/ppc/platforms/sandpoint.c +--- linux-2.6.18/arch/ppc/platforms/sandpoint.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/sandpoint.c 2008-06-10 16:19:22.000000000 +0400 +@@ -730,9 +730,6 @@ platform_init(unsigned long r3, unsigned + ppc_md.nvram_read_val = todc_mc146818_read_val; + ppc_md.nvram_write_val = todc_mc146818_write_val; + +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif +diff 
-rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/platforms/spruce.c linux-2.6.18.kgdb/arch/ppc/platforms/spruce.c +--- linux-2.6.18/arch/ppc/platforms/spruce.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/platforms/spruce.c 2008-06-10 16:19:22.000000000 +0400 +@@ -178,26 +178,32 @@ spruce_early_serial_map(void) + serial_req.membase = (u_char *)UART0_IO_BASE; + serial_req.regshift = 0; + +-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG) +- gen550_init(0, &serial_req); +-#endif + #ifdef CONFIG_SERIAL_8250 + if (early_serial_setup(&serial_req) != 0) + printk("Early serial init of port 0 failed\n"); + #endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(0, &serial_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &port); ++#endif + + /* Assume early_serial_setup() doesn't modify serial_req */ + serial_req.line = 1; + serial_req.irq = UART1_INT; + serial_req.membase = (u_char *)UART1_IO_BASE; + +-#if defined(CONFIG_KGDB) || defined(CONFIG_SERIAL_TEXT_DEBUG) +- gen550_init(1, &serial_req); +-#endif + #ifdef CONFIG_SERIAL_8250 + if (early_serial_setup(&serial_req) != 0) + printk("Early serial init of port 1 failed\n"); + #endif ++#ifdef CONFIG_SERIAL_TEXT_DEBUG ++ gen550_init(1, &serial_req); ++#endif ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &serial_req); ++#endif + } + + TODC_ALLOC(); +@@ -316,7 +322,4 @@ platform_init(unsigned long r3, unsigned + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/Makefile linux-2.6.18.kgdb/arch/ppc/syslib/Makefile +--- linux-2.6.18/arch/ppc/syslib/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/syslib/Makefile 2008-06-10 16:19:22.000000000 +0400 +@@ -76,7 +76,6 @@ obj-$(CONFIG_PCI_8260) += m82xx_pci.o p + obj-$(CONFIG_8260_PCI9) += m8260_pci_erratum9.o + obj-$(CONFIG_CPM2) += cpm2_common.o cpm2_pic.o + ifeq ($(CONFIG_PPC_GEN550),y) +-obj-$(CONFIG_KGDB) += gen550_kgdb.o gen550_dbg.o + obj-$(CONFIG_SERIAL_TEXT_DEBUG) += gen550_dbg.o + endif + ifeq ($(CONFIG_SERIAL_MPSC_CONSOLE),y) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/gen550.h linux-2.6.18.kgdb/arch/ppc/syslib/gen550.h +--- linux-2.6.18/arch/ppc/syslib/gen550.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/syslib/gen550.h 2008-06-10 16:19:22.000000000 +0400 +@@ -11,4 +11,3 @@ + + extern void gen550_progress(char *, unsigned short); + extern void gen550_init(int, struct uart_port *); +-extern void gen550_kgdb_map_scc(void); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/ibm44x_common.c linux-2.6.18.kgdb/arch/ppc/syslib/ibm44x_common.c +--- linux-2.6.18/arch/ppc/syslib/ibm44x_common.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/syslib/ibm44x_common.c 2008-06-10 16:19:22.000000000 +0400 +@@ -192,9 +192,6 @@ void __init ibm44x_platform_init(unsigne + #ifdef CONFIG_SERIAL_TEXT_DEBUG + ppc_md.progress = gen550_progress; + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +-#ifdef CONFIG_KGDB +- ppc_md.kgdb_map_scc = gen550_kgdb_map_scc; +-#endif + + /* + * The Abatron BDI JTAG debugger does not tolerate others +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/mv64x60.c linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60.c +--- linux-2.6.18/arch/ppc/syslib/mv64x60.c 2006-09-20 07:42:06.000000000 +0400 ++++ 
linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60.c 2008-06-10 16:19:22.000000000 +0400 +@@ -241,6 +241,12 @@ static struct resource mv64x60_mpsc0_res + .end = MV64x60_IRQ_SDMA_0, + .flags = IORESOURCE_IRQ, + }, ++ [4] = { ++ .name = "mpsc 0 irq", ++ .start = MV64x60_IRQ_MPSC_0, ++ .end = MV64x60_IRQ_MPSC_0, ++ .flags = IORESOURCE_IRQ, ++ }, + }; + + static struct platform_device mpsc0_device = { +@@ -298,6 +304,12 @@ static struct resource mv64x60_mpsc1_res + .end = MV64360_IRQ_SDMA_1, + .flags = IORESOURCE_IRQ, + }, ++ [4] = { ++ .name = "mpsc 1 irq", ++ .start = MV64360_IRQ_MPSC_1, ++ .end = MV64360_IRQ_MPSC_1, ++ .flags = IORESOURCE_IRQ, ++ }, + }; + + static struct platform_device mpsc1_device = { +@@ -1426,12 +1438,46 @@ mv64x60_pd_fixup(struct mv64x60_handle * + static int __init + mv64x60_add_pds(void) + { +- return platform_add_devices(mv64x60_pd_devs, +- ARRAY_SIZE(mv64x60_pd_devs)); ++ int i, ret = 0; ++ ++ for (i = 0; i < ARRAY_SIZE(mv64x60_pd_devs); i++) { ++ if (mv64x60_pd_devs[i]) { ++ ret = platform_device_register(mv64x60_pd_devs[i]); ++ } ++ if (ret) { ++ while (--i >= 0) ++ platform_device_unregister(mv64x60_pd_devs[i]); ++ break; ++ } ++ } ++ return ret; + } + arch_initcall(mv64x60_add_pds); + + /* ++ * mv64x60_early_get_pdev_data() ++ * ++ * Get the data associated with a platform device by name and number. ++ */ ++struct platform_device * __init ++mv64x60_early_get_pdev_data(const char *name, int id, int remove) ++{ ++ int i; ++ struct platform_device *pdev; ++ ++ for (i = 0; i id == id && ++ !strcmp(pdev->name, name)) { ++ if (remove) ++ mv64x60_pd_devs[i] = NULL; ++ return pdev; ++ } ++ } ++ return NULL; ++} ++ ++/* + ***************************************************************************** + * + * GT64260-Specific Routines +@@ -1764,6 +1810,11 @@ gt64260a_chip_specific_init(struct mv64x + r->start = MV64x60_IRQ_SDMA_0; + r->end = MV64x60_IRQ_SDMA_0; + } ++ if ((r = platform_get_resource(&mpsc1_device, IORESOURCE_IRQ, 1)) ++ != NULL) { ++ r->start = GT64260_IRQ_MPSC_1; ++ r->end = GT64260_IRQ_MPSC_1; ++ } + #endif + } + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/mv64x60_dbg.c linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60_dbg.c +--- linux-2.6.18/arch/ppc/syslib/mv64x60_dbg.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/syslib/mv64x60_dbg.c 2008-06-10 16:19:22.000000000 +0400 +@@ -34,7 +34,7 @@ static struct mv64x60_handle mv64x60_dbg + void + mv64x60_progress_init(u32 base) + { +- mv64x60_dbg_bh.v_base = base; ++ mv64x60_dbg_bh.v_base = (void*)base; + return; + } + +@@ -69,53 +69,3 @@ mv64x60_mpsc_progress(char *s, unsigned + return; + } + #endif /* CONFIG_SERIAL_TEXT_DEBUG */ +- +- +-#if defined(CONFIG_KGDB) +- +-#if defined(CONFIG_KGDB_TTYS0) +-#define KGDB_PORT 0 +-#elif defined(CONFIG_KGDB_TTYS1) +-#define KGDB_PORT 1 +-#else +-#error "Invalid kgdb_tty port" +-#endif +- +-void +-putDebugChar(unsigned char c) +-{ +- mv64x60_polled_putc(KGDB_PORT, (char)c); +-} +- +-int +-getDebugChar(void) +-{ +- unsigned char c; +- +- while (!mv64x60_polled_getc(KGDB_PORT, &c)); +- return (int)c; +-} +- +-void +-putDebugString(char* str) +-{ +- while (*str != '\0') { +- putDebugChar(*str); +- str++; +- } +- putDebugChar('\r'); +- return; +-} +- +-void +-kgdb_interruptible(int enable) +-{ +-} +- +-void +-kgdb_map_scc(void) +-{ +- if (ppc_md.early_serial_map) +- ppc_md.early_serial_map(); +-} +-#endif /* CONFIG_KGDB */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/ppc/syslib/ppc85xx_setup.c 
linux-2.6.18.kgdb/arch/ppc/syslib/ppc85xx_setup.c +--- linux-2.6.18/arch/ppc/syslib/ppc85xx_setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/ppc/syslib/ppc85xx_setup.c 2008-06-10 16:19:22.000000000 +0400 +@@ -69,7 +69,6 @@ mpc85xx_calibrate_decr(void) + mtspr(SPRN_TCR, TCR_DIE); + } + +-#ifdef CONFIG_SERIAL_8250 + void __init + mpc85xx_early_serial_map(void) + { +@@ -85,7 +84,7 @@ mpc85xx_early_serial_map(void) + pdata[0].mapbase += binfo->bi_immr_base; + pdata[0].membase = ioremap(pdata[0].mapbase, MPC85xx_UART0_SIZE); + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + memset(&serial_req, 0, sizeof (serial_req)); + serial_req.iotype = UPIO_MEM; + serial_req.mapbase = pdata[0].mapbase; +@@ -93,18 +92,24 @@ mpc85xx_early_serial_map(void) + serial_req.regshift = 0; + + gen550_init(0, &serial_req); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(0, &serial_req); ++#endif + #endif + + pdata[1].uartclk = binfo->bi_busfreq; + pdata[1].mapbase += binfo->bi_immr_base; + pdata[1].membase = ioremap(pdata[1].mapbase, MPC85xx_UART0_SIZE); + +-#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB) ++#if defined(CONFIG_SERIAL_TEXT_DEBUG) || defined(CONFIG_KGDB_8250) + /* Assume gen550_init() doesn't modify serial_req */ + serial_req.mapbase = pdata[1].mapbase; + serial_req.membase = pdata[1].membase; + + gen550_init(1, &serial_req); ++#ifdef CONFIG_KGDB_8250 ++ kgdb8250_add_port(1, &serial_req); ++#endif + #endif + } + #endif +@@ -363,5 +368,3 @@ mpc85xx_setup_hose(void) + return; + } + #endif /* CONFIG_PCI */ +- +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/Kconfig.debug linux-2.6.18.kgdb/arch/sh/Kconfig.debug +--- linux-2.6.18/arch/sh/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/Kconfig.debug 2008-06-10 16:19:47.000000000 +0400 +@@ -29,96 +29,4 @@ config EARLY_PRINTK + This option is only useful porting the kernel to a new machine, + when the kernel may crash or hang before the serial console is + initialised. If unsure, say N. +- +-config KGDB +- bool "Include KGDB kernel debugger" +- help +- Include in-kernel hooks for kgdb, the Linux kernel source level +- debugger. See for more information. +- Unless you are intending to debug the kernel, say N here. +- +-menu "KGDB configuration options" +- depends on KGDB +- +-config MORE_COMPILE_OPTIONS +- bool "Add any additional compile options" +- help +- If you want to add additional CFLAGS to the kernel build, enable this +- option and then enter what you would like to add in the next question. +- Note however that -g is already appended with the selection of KGDB. 
+- +-config COMPILE_OPTIONS +- string "Additional compile arguments" +- depends on MORE_COMPILE_OPTIONS +- +-config KGDB_NMI +- bool "Enter KGDB on NMI" +- default n +- +-config KGDB_THREAD +- bool "Include KGDB thread support" +- default y +- +-config SH_KGDB_CONSOLE +- bool "Console messages through GDB" +- default n +- +-config KGDB_SYSRQ +- bool "Allow SysRq 'G' to enter KGDB" +- default y +- +-config KGDB_KERNEL_ASSERTS +- bool "Include KGDB kernel assertions" +- default n +- +-comment "Serial port setup" +- +-config KGDB_DEFPORT +- int "Port number (ttySCn)" +- default "1" +- +-config KGDB_DEFBAUD +- int "Baud rate" +- default "115200" +- +-choice +- prompt "Parity" +- depends on KGDB +- default KGDB_DEFPARITY_N +- +-config KGDB_DEFPARITY_N +- bool "None" +- +-config KGDB_DEFPARITY_E +- bool "Even" +- +-config KGDB_DEFPARITY_O +- bool "Odd" +- +-endchoice +- +-choice +- prompt "Data bits" +- depends on KGDB +- default KGDB_DEFBITS_8 +- +-config KGDB_DEFBITS_8 +- bool "8" +- +-config KGDB_DEFBITS_7 +- bool "7" +- +-endchoice +- +-endmenu +- +-config FRAME_POINTER +- bool "Compile the kernel with frame pointers" +- default y if KGDB +- help +- If you say Y here the resulting kernel image will be slightly larger +- and slower, but it will give very useful debugging information. +- If you don't debug the kernel, you can say N, but we may not be able +- to solve problems without frame pointers. +- + endmenu +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/Makefile linux-2.6.18.kgdb/arch/sh/Makefile +--- linux-2.6.18/arch/sh/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/Makefile 2008-06-10 16:19:47.000000000 +0400 +@@ -43,7 +43,6 @@ cflags-$(CONFIG_CPU_SH4) += -m4 \ + cflags-$(CONFIG_CPU_SH4A) += $(call cc-option,-m4a-nofpu,) + + cflags-$(CONFIG_SH_DSP) += -Wa,-dsp +-cflags-$(CONFIG_SH_KGDB) += -g + + cflags-$(CONFIG_MORE_COMPILE_OPTIONS) += \ + $(shell echo $(CONFIG_COMPILE_OPTIONS) | sed -e 's/"//g') +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/boards/se/7751/setup.c linux-2.6.18.kgdb/arch/sh/boards/se/7751/setup.c +--- linux-2.6.18/arch/sh/boards/se/7751/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/boards/se/7751/setup.c 2008-06-10 16:19:47.000000000 +0400 +@@ -17,10 +17,6 @@ + #include + #include + +-#ifdef CONFIG_SH_KGDB +-#include +-#endif +- + /* + * Configure the Super I/O chip + */ +@@ -82,12 +78,6 @@ const char *get_system_type(void) + return "7751 SolutionEngine"; + } + +-#ifdef CONFIG_SH_KGDB +-static int kgdb_uart_setup(void); +-static struct kgdb_sermap kgdb_uart_sermap = +-{ "ttyS", 0, kgdb_uart_setup, NULL }; +-#endif +- + /* + * Initialize the board + */ +@@ -95,133 +85,4 @@ void __init platform_setup(void) + { + /* Call init_smsc() replacement to set up SuperIO. */ + /* XXX: RTC setting comes here */ +-#ifdef CONFIG_SH_KGDB +- kgdb_register_sermap(&kgdb_uart_sermap); +-#endif +-} +- +-/********************************************************************* +- * Currently a hack (e.g. does not interact well w/serial.c, lots of * +- * hardcoded stuff) but may be useful if SCI/F needs debugging. * +- * Mostly copied from x86 code (see files asm-i386/kgdb_local.h and * +- * arch/i386/lib/kgdb_serial.c). 
* +- *********************************************************************/ +- +-#ifdef CONFIG_SH_KGDB +-#include +-#include +-#include +-#include +- +-#define COM1_PORT 0x3f8 /* Base I/O address */ +-#define COM1_IRQ 4 /* IRQ not used yet */ +-#define COM2_PORT 0x2f8 /* Base I/O address */ +-#define COM2_IRQ 3 /* IRQ not used yet */ +- +-#define SB_CLOCK 1843200 /* Serial baud clock */ +-#define SB_BASE (SB_CLOCK/16) +-#define SB_MCR UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS +- +-struct uart_port { +- int base; +-}; +-#define UART_NPORTS 2 +-struct uart_port uart_ports[] = { +- { COM1_PORT }, +- { COM2_PORT }, +-}; +-struct uart_port *kgdb_uart_port; +- +-#define UART_IN(reg) inb_p(kgdb_uart_port->base + reg) +-#define UART_OUT(reg,v) outb_p((v), kgdb_uart_port->base + reg) +- +-/* Basic read/write functions for the UART */ +-#define UART_LSR_RXCERR (UART_LSR_BI | UART_LSR_FE | UART_LSR_PE) +-static int kgdb_uart_getchar(void) +-{ +- int lsr; +- int c = -1; +- +- while (c == -1) { +- lsr = UART_IN(UART_LSR); +- if (lsr & UART_LSR_DR) +- c = UART_IN(UART_RX); +- if ((lsr & UART_LSR_RXCERR)) +- c = -1; +- } +- return c; +-} +- +-static void kgdb_uart_putchar(int c) +-{ +- while ((UART_IN(UART_LSR) & UART_LSR_THRE) == 0) +- ; +- UART_OUT(UART_TX, c); +-} +- +-/* +- * Initialize UART to configured/requested values. +- * (But we don't interrupts yet, or interact w/serial.c) +- */ +-static int kgdb_uart_setup(void) +-{ +- int port; +- int lcr = 0; +- int bdiv = 0; +- +- if (kgdb_portnum >= UART_NPORTS) { +- KGDB_PRINTK("uart port %d invalid.\n", kgdb_portnum); +- return -1; +- } +- +- kgdb_uart_port = &uart_ports[kgdb_portnum]; +- +- /* Init sequence from gdb_hook_interrupt */ +- UART_IN(UART_RX); +- UART_OUT(UART_IER, 0); +- +- UART_IN(UART_RX); /* Serial driver comments say */ +- UART_IN(UART_IIR); /* this clears interrupt regs */ +- UART_IN(UART_MSR); +- +- /* Figure basic LCR values */ +- switch (kgdb_bits) { +- case '7': +- lcr |= UART_LCR_WLEN7; +- break; +- default: case '8': +- lcr |= UART_LCR_WLEN8; +- break; +- } +- switch (kgdb_parity) { +- case 'O': +- lcr |= UART_LCR_PARITY; +- break; +- case 'E': +- lcr |= (UART_LCR_PARITY | UART_LCR_EPAR); +- break; +- default: break; +- } +- +- /* Figure the baud rate divisor */ +- bdiv = (SB_BASE/kgdb_baud); +- +- /* Set the baud rate and LCR values */ +- UART_OUT(UART_LCR, (lcr | UART_LCR_DLAB)); +- UART_OUT(UART_DLL, (bdiv & 0xff)); +- UART_OUT(UART_DLM, ((bdiv >> 8) & 0xff)); +- UART_OUT(UART_LCR, lcr); +- +- /* Set the MCR */ +- UART_OUT(UART_MCR, SB_MCR); +- +- /* Turn off FIFOs for now */ +- UART_OUT(UART_FCR, 0); +- +- /* Setup complete: initialize function pointers */ +- kgdb_getchar = kgdb_uart_getchar; +- kgdb_putchar = kgdb_uart_putchar; +- +- return 0; + } +-#endif /* CONFIG_SH_KGDB */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/Makefile linux-2.6.18.kgdb/arch/sh/kernel/Makefile +--- linux-2.6.18/arch/sh/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/Makefile 2008-06-10 16:19:47.000000000 +0400 +@@ -13,7 +13,7 @@ obj-y += cpu/ timers/ + obj-$(CONFIG_SMP) += smp.o + obj-$(CONFIG_CF_ENABLER) += cf-enabler.o + obj-$(CONFIG_SH_STANDARD_BIOS) += sh_bios.o +-obj-$(CONFIG_SH_KGDB) += kgdb_stub.o kgdb_jmp.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + obj-$(CONFIG_SH_CPU_FREQ) += cpufreq.o + obj-$(CONFIG_MODULES) += module.o + obj-$(CONFIG_EARLY_PRINTK) += early_printk.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/cpu/sh3/ex.S 
linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh3/ex.S +--- linux-2.6.18/arch/sh/kernel/cpu/sh3/ex.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh3/ex.S 2008-06-10 16:19:47.000000000 +0400 +@@ -42,7 +42,7 @@ ENTRY(exception_handling_table) + .long exception_error ! reserved_instruction (filled by trap_init) /* 180 */ + .long exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/ + ENTRY(nmi_slot) +-#if defined (CONFIG_KGDB_NMI) ++#if defined (CONFIG_KGDB) + .long debug_enter /* 1C0 */ ! Allow trap to debugger + #else + .long exception_none /* 1C0 */ ! Not implemented yet +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/cpu/sh4/ex.S linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh4/ex.S +--- linux-2.6.18/arch/sh/kernel/cpu/sh4/ex.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/cpu/sh4/ex.S 2008-06-10 16:19:47.000000000 +0400 +@@ -46,7 +46,7 @@ ENTRY(exception_handling_table) + .long exception_error ! reserved_instruction (filled by trap_init) /* 180 */ + .long exception_error ! illegal_slot_instruction (filled by trap_init) /*1A0*/ + ENTRY(nmi_slot) +-#if defined (CONFIG_KGDB_NMI) ++#if defined (CONFIG_KGDB) + .long debug_enter /* 1C0 */ ! Allow trap to debugger + #else + .long exception_none /* 1C0 */ ! Not implemented yet +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/entry.S linux-2.6.18.kgdb/arch/sh/kernel/entry.S +--- linux-2.6.18/arch/sh/kernel/entry.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/entry.S 2008-06-10 16:19:47.000000000 +0400 +@@ -75,7 +75,7 @@ + ENOSYS = 38 + EINVAL = 22 + +-#if defined(CONFIG_KGDB_NMI) ++#if defined(CONFIG_KGDB) + NMI_VEC = 0x1c0 ! Must catch early for debounce + #endif + +@@ -227,31 +227,33 @@ call_dae: + 2: .long do_address_error + #endif /* CONFIG_MMU */ + +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) ++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB) + ! Handle kernel debug if either kgdb (SW) or gdb-stub (FW) is present. + ! If both are configured, handle the debug traps (breakpoints) in SW, + ! but still allow BIOS traps to FW. + + .align 2 + debug_kernel: +-#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_SH_KGDB) ++#if defined(CONFIG_SH_STANDARD_BIOS) && defined(CONFIG_KGDB) + /* Force BIOS call to FW (debug_trap put TRA in r8) */ + mov r8,r0 + shlr2 r0 + cmp/eq #0x3f,r0 + bt debug_kernel_fw +-#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS && CONFIG_KGDB */ + +-debug_enter: +-#if defined(CONFIG_SH_KGDB) ++ .align 2 ++ .globl debug_enter ++debug_enter: ++#if defined(CONFIG_KGDB) + /* Jump to kgdb, pass stacked regs as arg */ + debug_kernel_sw: + mov.l 3f, r0 + jmp @r0 + mov r15, r4 + .align 2 +-3: .long kgdb_handle_exception +-#endif /* CONFIG_SH_KGDB */ ++3: .long kgdb_exception_handler ++#endif /* CONFIG_KGDB */ + + #if defined(CONFIG_SH_STANDARD_BIOS) + /* Unwind the stack and jmp to the debug entry */ +@@ -293,12 +295,12 @@ debug_kernel_fw: + 2: .long gdb_vbr_vector + #endif /* CONFIG_SH_STANDARD_BIOS */ + +-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB */ + + + .align 2 +-debug_trap: +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) ++debug_trap: ++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB) + mov #OFF_SR, r0 + mov.l @(r0,r15), r0 ! get status register + shll r0 +@@ -642,7 +644,7 @@ skip_restore: + 6: or k0, k2 ! 
Set the IMASK-bits + ldc k2, ssr + ! +-#if defined(CONFIG_KGDB_NMI) ++#if defined(CONFIG_KGDB) + ! Clear in_nmi + mov.l 4f, k0 + mov #0, k1 +@@ -694,7 +696,7 @@ tlb_miss: + interrupt: + mov.l 2f, k2 + mov.l 3f, k3 +-#if defined(CONFIG_KGDB_NMI) ++#if defined(CONFIG_KGDB) + ! Debounce (filter nested NMI) + mov.l @k2, k0 + mov.l 5f, k1 +@@ -709,7 +711,7 @@ interrupt: + 5: .long NMI_VEC + 6: .long in_nmi + 0: +-#endif /* defined(CONFIG_KGDB_NMI) */ ++#endif /* defined(CONFIG_KGDB) */ + bra handle_exception + mov.l @k2, k2 + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/sh/kernel/kgdb-jmp.S +--- linux-2.6.18/arch/sh/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb-jmp.S 2008-06-10 16:19:47.000000000 +0400 +@@ -0,0 +1,32 @@ ++#include ++ ++ENTRY(kgdb_fault_setjmp) ++ add #(9*4), r4 ++ sts.l pr, @-r4 ++ mov.l r15, @-r4 ++ mov.l r14, @-r4 ++ mov.l r13, @-r4 ++ mov.l r12, @-r4 ++ mov.l r11, @-r4 ++ mov.l r10, @-r4 ++ mov.l r9, @-r4 ++ mov.l r8, @-r4 ++ rts ++ mov #0, r0 ++ ++ENTRY(kgdb_fault_longjmp) ++ mov.l @r4+, r8 ++ mov.l @r4+, r9 ++ mov.l @r4+, r10 ++ mov.l @r4+, r11 ++ mov.l @r4+, r12 ++ mov.l @r4+, r13 ++ mov.l @r4+, r14 ++ mov.l @r4+, r15 ++ lds.l @r4+, pr ++ mov r5, r0 ++ tst r0, r0 ++ bf 1f ++ mov #1, r0 ++1: rts ++ nop +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb.c linux-2.6.18.kgdb/arch/sh/kernel/kgdb.c +--- linux-2.6.18/arch/sh/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb.c 2008-06-10 16:19:47.000000000 +0400 +@@ -0,0 +1,363 @@ ++/* ++ * arch/sh/kernel/kgdb.c ++ * ++ * Contains SH-specific low-level support for KGDB. ++ * ++ * Containes extracts from code by Glenn Engel, Jim Kingdon, ++ * David Grothe , Tigran Aivazian , ++ * Amit S. Kale , William Gatliff , ++ * Ben Lee, Steve Chamberlain and Benoit Miller , ++ * Henry Bell and Jeremy Siegel ++ * ++ * Maintainer: Tom Rini ++ * ++ * 2004 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include ++#include ++#include ++#include ++ ++extern void per_cpu_trap_init(void); ++extern atomic_t cpu_doing_single_step; ++ ++/* Function pointers for linkage */ ++static struct kgdb_regs trap_registers; ++ ++/* Globals. */ ++char in_nmi; /* Set during NMI to prevent reentry */ ++ ++/* TRA differs sh3/4 */ ++#if defined(CONFIG_CPU_SH3) ++#define TRA 0xffffffd0 ++#elif defined(CONFIG_CPU_SH4) ++#define TRA 0xff000020 ++#endif ++ ++/* Macros for single step instruction identification */ ++#define OPCODE_BT(op) (((op) & 0xff00) == 0x8900) ++#define OPCODE_BF(op) (((op) & 0xff00) == 0x8b00) ++#define OPCODE_BTF_DISP(op) (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \ ++ (((op) & 0x7f ) << 1)) ++#define OPCODE_BFS(op) (((op) & 0xff00) == 0x8f00) ++#define OPCODE_BTS(op) (((op) & 0xff00) == 0x8d00) ++#define OPCODE_BRA(op) (((op) & 0xf000) == 0xa000) ++#define OPCODE_BRA_DISP(op) (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \ ++ (((op) & 0x7ff) << 1)) ++#define OPCODE_BRAF(op) (((op) & 0xf0ff) == 0x0023) ++#define OPCODE_BRAF_REG(op) (((op) & 0x0f00) >> 8) ++#define OPCODE_BSR(op) (((op) & 0xf000) == 0xb000) ++#define OPCODE_BSR_DISP(op) (((op) & 0x800) ? 
(((op) | 0xfffff800) << 1) : \ ++ (((op) & 0x7ff) << 1)) ++#define OPCODE_BSRF(op) (((op) & 0xf0ff) == 0x0003) ++#define OPCODE_BSRF_REG(op) (((op) >> 8) & 0xf) ++#define OPCODE_JMP(op) (((op) & 0xf0ff) == 0x402b) ++#define OPCODE_JMP_REG(op) (((op) >> 8) & 0xf) ++#define OPCODE_JSR(op) (((op) & 0xf0ff) == 0x400b) ++#define OPCODE_JSR_REG(op) (((op) >> 8) & 0xf) ++#define OPCODE_RTS(op) ((op) == 0xb) ++#define OPCODE_RTE(op) ((op) == 0x2b) ++ ++#define SR_T_BIT_MASK 0x1 ++#define STEP_OPCODE 0xc320 ++#define BIOS_CALL_TRAP 0x3f ++ ++/* Exception codes as per SH-4 core manual */ ++#define ADDRESS_ERROR_LOAD_VEC 7 ++#define ADDRESS_ERROR_STORE_VEC 8 ++#define TRAP_VEC 11 ++#define INVALID_INSN_VEC 12 ++#define INVALID_SLOT_VEC 13 ++#define NMI_VEC 14 ++#define SERIAL_BREAK_VEC 58 ++ ++/* Misc static */ ++static int stepped_address; ++static short stepped_opcode; ++ ++/* Translate SH-3/4 exception numbers to unix-like signal values */ ++static int compute_signal(const int excep_code) ++{ ++ switch (excep_code) { ++ case INVALID_INSN_VEC: ++ case INVALID_SLOT_VEC: ++ return SIGILL; ++ case ADDRESS_ERROR_LOAD_VEC: ++ case ADDRESS_ERROR_STORE_VEC: ++ return SIGSEGV; ++ case SERIAL_BREAK_VEC: ++ case NMI_VEC: ++ return SIGINT; ++ default: ++ /* Act like it was a break/trap. */ ++ return SIGTRAP; ++ } ++} ++ ++/* ++ * Translate the registers of the system into the format that GDB wants. Since ++ * we use a local structure to store things, instead of getting them out ++ * of pt_regs, we can just do a memcpy. ++ */ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *ign) ++{ ++ memcpy(gdb_regs, &trap_registers, sizeof(trap_registers)); ++} ++ ++/* ++ * On SH we save: r1 (prev->thread.sp) r2 (prev->thread.pc) r4 (prev) r5 (next) ++ * r6 (next->thread.sp) r7 (next->thread.pc) ++ */ ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ int count; ++ ++ for (count = 0; count < 16; count++) ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = p->thread.pc; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++ *(gdb_regs++) = 0; ++} ++ ++/* ++ * Translate the registers values that GDB has given us back into the ++ * format of the system. See the comment above about memcpy. 
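
The memcpy shortcut described here works only if the stub's private struct kgdb_regs is laid out exactly the way GDB expects SH registers on the wire. That layout is not visible in this patch (it lives in asm/kgdb.h), so the following is an illustrative guess reconstructed from the copy-in/copy-out code in kgdb_exception_handler() below:

	struct example_sh_gdb_regs {	/* hypothetical mirror of struct kgdb_regs */
		unsigned long regs[16];	/* r0..r15 */
		unsigned long pc;
		unsigned long pr;
		unsigned long sr;
		unsigned long gbr;
		unsigned long mach;
		unsigned long macl;
		unsigned long vbr;
	};
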
++ */ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *ign) ++{ ++ memcpy(&trap_registers, gdb_regs, sizeof(trap_registers)); ++} ++ ++/* Calculate the new address for after a step */ ++static short *get_step_address(void) ++{ ++ short op = *(short *)trap_registers.pc; ++ long addr; ++ ++ /* BT */ ++ if (OPCODE_BT(op)) { ++ if (trap_registers.sr & SR_T_BIT_MASK) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 2; ++ } ++ ++ /* BTS */ ++ else if (OPCODE_BTS(op)) { ++ if (trap_registers.sr & SR_T_BIT_MASK) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 4; /* Not in delay slot */ ++ } ++ ++ /* BF */ ++ else if (OPCODE_BF(op)) { ++ if (!(trap_registers.sr & SR_T_BIT_MASK)) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 2; ++ } ++ ++ /* BFS */ ++ else if (OPCODE_BFS(op)) { ++ if (!(trap_registers.sr & SR_T_BIT_MASK)) ++ addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); ++ else ++ addr = trap_registers.pc + 4; /* Not in delay slot */ ++ } ++ ++ /* BRA */ ++ else if (OPCODE_BRA(op)) ++ addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op); ++ ++ /* BRAF */ ++ else if (OPCODE_BRAF(op)) ++ addr = trap_registers.pc + 4 ++ + trap_registers.regs[OPCODE_BRAF_REG(op)]; ++ ++ /* BSR */ ++ else if (OPCODE_BSR(op)) ++ addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op); ++ ++ /* BSRF */ ++ else if (OPCODE_BSRF(op)) ++ addr = trap_registers.pc + 4 ++ + trap_registers.regs[OPCODE_BSRF_REG(op)]; ++ ++ /* JMP */ ++ else if (OPCODE_JMP(op)) ++ addr = trap_registers.regs[OPCODE_JMP_REG(op)]; ++ ++ /* JSR */ ++ else if (OPCODE_JSR(op)) ++ addr = trap_registers.regs[OPCODE_JSR_REG(op)]; ++ ++ /* RTS */ ++ else if (OPCODE_RTS(op)) ++ addr = trap_registers.pr; ++ ++ /* RTE */ ++ else if (OPCODE_RTE(op)) ++ addr = trap_registers.regs[15]; ++ ++ /* Other */ ++ else ++ addr = trap_registers.pc + 2; ++ ++ kgdb_flush_icache_range(addr, addr + 2); ++ return (short *)addr; ++} ++ ++/* The command loop, read and act on requests */ ++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ++ char *remcom_in_buffer, char *remcom_out_buffer, ++ struct pt_regs *ign) ++{ ++ unsigned long addr; ++ char *ptr = &remcom_in_buffer[1]; ++ ++ /* Examine first char of buffer to see what we need to do */ ++ switch (remcom_in_buffer[0]) { ++ case 'c': /* Continue at address AA..AA (optional) */ ++ case 's': /* Step one instruction from AA..AA */ ++ /* Try to read optional parameter, PC unchanged if none */ ++ if (kgdb_hex2long(&ptr, &addr)) ++ trap_registers.pc = addr; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ if (remcom_in_buffer[0] == 's') { ++ /* Replace the instruction immediately after the ++ * current instruction (i.e. next in the expected ++ * flow of control) with a trap instruction, so that ++ * returning will cause only a single instruction to ++ * be executed. Note that this model is slightly ++ * broken for instructions with delay slots ++ * (e.g. B[TF]S, BSR, BRA etc), where both the branch ++ * and the instruction in the delay slot will be ++ * executed. 
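
Everything here hinges on get_step_address() predicting the next PC correctly from the OPCODE_* macros above. As a worked example, the conditional BT case reduces to the sketch below; the helper name is ours, and the displacement arithmetic deliberately mirrors OPCODE_BTF_DISP():

	static unsigned long example_bt_target(unsigned long pc, unsigned short op,
					       unsigned long sr)
	{
		/* 8-bit signed displacement, scaled by 2 (OPCODE_BTF_DISP) */
		int disp = (op & 0x80) ? ((op | 0xffffff80) << 1)
				       : ((op & 0x7f) << 1);

		/* taken when the T bit of SR is set: target is PC+4+disp;
		 * otherwise fall through to the next 16-bit instruction */
		return (sr & SR_T_BIT_MASK) ? pc + 4 + disp : pc + 2;
	}
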
++ */
++ /* Determine where the target instruction will send
++ * us to */
++ unsigned short *next_addr = get_step_address();
++ stepped_address = (int)next_addr;
++
++ /* Replace it */
++ stepped_opcode = *(short *)next_addr;
++ *next_addr = STEP_OPCODE;
++
++ /* Flush and return */
++ kgdb_flush_icache_range((long)next_addr,
++ (long)next_addr + 2);
++ if (kgdb_contthread)
++ atomic_set(&cpu_doing_single_step,
++ smp_processor_id());
++ }
++ return 0;
++ }
++ return -1;
++}
++
++/*
++ * When an exception has occurred, we are called. We need to set things
++ * up so that we can call kgdb_handle_exception to handle requests from
++ * the remote GDB.
++ */
++void kgdb_exception_handler(struct pt_regs *regs)
++{
++ int excep_code, vbr_val;
++ int count;
++
++ /* Copy kernel regs (from stack) */
++ for (count = 0; count < 16; count++)
++ trap_registers.regs[count] = regs->regs[count];
++ trap_registers.pc = regs->pc;
++ trap_registers.pr = regs->pr;
++ trap_registers.sr = regs->sr;
++ trap_registers.gbr = regs->gbr;
++ trap_registers.mach = regs->mach;
++ trap_registers.macl = regs->macl;
++
++ __asm__ __volatile__("stc vbr, %0":"=r"(vbr_val));
++ trap_registers.vbr = vbr_val;
++
++ /* Get the exception code. */
++ __asm__ __volatile__("stc r2_bank, %0":"=r"(excep_code));
++
++ excep_code >>= 5;
++
++ /* If we got an NMI and KGDB is not yet initialized, call
++ * breakpoint() to try and initialize everything for us. */
++ if (excep_code == NMI_VEC && !kgdb_initialized) {
++ breakpoint();
++ return;
++ }
++
++ /* A TRAP_VEC exception indicates a software trap inserted in place of
++ * code by GDB, so back the PC up by one instruction, as this instruction
++ * will later be replaced by its original one. Do NOT do this for
++ * trap 0xff, since that indicates a compiled-in breakpoint which
++ * will not be replaced (and we would retake the trap forever). */
++ if (excep_code == TRAP_VEC &&
++ (*(volatile unsigned long *)TRA != (0xff << 2)))
++ trap_registers.pc -= 2;
++
++ /* If we have been single-stepping, put back the old instruction.
++ * We use stepped_address in case we have stopped more than one
++ * instruction away. */
++ if (stepped_opcode != 0) {
++ *(short *)stepped_address = stepped_opcode;
++ kgdb_flush_icache_range(stepped_address, stepped_address + 2);
++ }
++ stepped_opcode = 0;
++
++ /* Call the stub to do the processing. Note that not everything we
++ * need to send back and forth lives in pt_regs.
*/ ++ kgdb_handle_exception(excep_code, compute_signal(excep_code), 0, regs); ++ ++ /* Copy back the (maybe modified) registers */ ++ for (count = 0; count < 16; count++) ++ regs->regs[count] = trap_registers.regs[count]; ++ regs->pc = trap_registers.pc; ++ regs->pr = trap_registers.pr; ++ regs->sr = trap_registers.sr; ++ regs->gbr = trap_registers.gbr; ++ regs->mach = trap_registers.mach; ++ regs->macl = trap_registers.macl; ++ ++ vbr_val = trap_registers.vbr; ++ __asm__ __volatile__("ldc %0, vbr": :"r"(vbr_val)); ++} ++ ++int __init kgdb_arch_init(void) ++{ ++ per_cpu_trap_init(); ++ ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++#ifdef CONFIG_CPU_LITTLE_ENDIAN ++ .gdb_bpt_instr = {0xff, 0xc3}, ++#else ++ .gdb_bpt_instr = {0xc3, 0xff}, ++#endif ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb_jmp.S linux-2.6.18.kgdb/arch/sh/kernel/kgdb_jmp.S +--- linux-2.6.18/arch/sh/kernel/kgdb_jmp.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb_jmp.S 1970-01-01 03:00:00.000000000 +0300 +@@ -1,33 +0,0 @@ +-#include +- +-ENTRY(setjmp) +- add #(9*4), r4 +- sts.l pr, @-r4 +- mov.l r15, @-r4 +- mov.l r14, @-r4 +- mov.l r13, @-r4 +- mov.l r12, @-r4 +- mov.l r11, @-r4 +- mov.l r10, @-r4 +- mov.l r9, @-r4 +- mov.l r8, @-r4 +- rts +- mov #0, r0 +- +-ENTRY(longjmp) +- mov.l @r4+, r8 +- mov.l @r4+, r9 +- mov.l @r4+, r10 +- mov.l @r4+, r11 +- mov.l @r4+, r12 +- mov.l @r4+, r13 +- mov.l @r4+, r14 +- mov.l @r4+, r15 +- lds.l @r4+, pr +- mov r5, r0 +- tst r0, r0 +- bf 1f +- mov #1, r0 ! in case val==0 +-1: rts +- nop +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/kgdb_stub.c linux-2.6.18.kgdb/arch/sh/kernel/kgdb_stub.c +--- linux-2.6.18/arch/sh/kernel/kgdb_stub.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/kgdb_stub.c 1970-01-01 03:00:00.000000000 +0300 +@@ -1,1491 +0,0 @@ +-/* +- * May be copied or modified under the terms of the GNU General Public +- * License. See linux/COPYING for more information. +- * +- * Containes extracts from code by Glenn Engel, Jim Kingdon, +- * David Grothe , Tigran Aivazian , +- * Amit S. Kale , William Gatliff , +- * Ben Lee, Steve Chamberlain and Benoit Miller . +- * +- * This version by Henry Bell +- * Minor modifications by Jeremy Siegel +- * +- * Contains low-level support for remote debug using GDB. +- * +- * To enable debugger support, two things need to happen. A call to +- * set_debug_traps() is necessary in order to allow any breakpoints +- * or error conditions to be properly intercepted and reported to gdb. +- * A breakpoint also needs to be generated to begin communication. This +- * is most easily accomplished by a call to breakpoint() which does +- * a trapa if the initialisation phase has been successfully completed. +- * +- * In this case, set_debug_traps() is not used to "take over" exceptions; +- * other kernel code is modified instead to enter the kgdb functions here +- * when appropriate (see entry.S for breakpoint traps and NMI interrupts, +- * see traps.c for kernel error exceptions). 
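
The "breakpoint also needs to be generated" step mentioned above boils down to executing the trap instruction the stub recognizes. On SH that is trapa #0xff -- the same 0xc3/0xff byte pair that arch_kgdb_ops.gdb_bpt_instr carries earlier in this patch -- so a minimal sketch looks like this; the helper name and inline-asm phrasing are ours:

	static inline void example_breakpoint(void)
	{
		/* trapa #0xff: the compiled-in breakpoint that the
		 * exception handler deliberately does not rewind the
		 * PC over */
		__asm__ __volatile__("trapa #0xff");
	}
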
+- * +- * The following gdb commands are supported: +- * +- * Command Function Return value +- * +- * g return the value of the CPU registers hex data or ENN +- * G set the value of the CPU registers OK or ENN +- * +- * mAA..AA,LLLL Read LLLL bytes at address AA..AA hex data or ENN +- * MAA..AA,LLLL: Write LLLL bytes at address AA.AA OK or ENN +- * XAA..AA,LLLL: Same, but data is binary (not hex) OK or ENN +- * +- * c Resume at current address SNN ( signal NN) +- * cAA..AA Continue at address AA..AA SNN +- * CNN; Resume at current address with signal SNN +- * CNN;AA..AA Resume at address AA..AA with signal SNN +- * +- * s Step one instruction SNN +- * sAA..AA Step one instruction from AA..AA SNN +- * SNN; Step one instruction with signal SNN +- * SNNAA..AA Step one instruction from AA..AA w/NN SNN +- * +- * k kill (Detach GDB) +- * +- * d Toggle debug flag +- * D Detach GDB +- * +- * Hct Set thread t for operations, OK or ENN +- * c = 'c' (step, cont), c = 'g' (other +- * operations) +- * +- * qC Query current thread ID QCpid +- * qfThreadInfo Get list of current threads (first) m +- * qsThreadInfo " " " " " (subsequent) +- * qOffsets Get section offsets Text=x;Data=y;Bss=z +- * +- * TXX Find if thread XX is alive OK or ENN +- * ? What was the last sigval ? SNN (signal NN) +- * O Output to GDB console +- * +- * Remote communication protocol. +- * +- * A debug packet whose contents are is encapsulated for +- * transmission in the form: +- * +- * $ # CSUM1 CSUM2 +- * +- * must be ASCII alphanumeric and cannot include characters +- * '$' or '#'. If starts with two characters followed by +- * ':', then the existing stubs interpret this as a sequence number. +- * +- * CSUM1 and CSUM2 are ascii hex representation of an 8-bit +- * checksum of , the most significant nibble is sent first. +- * the hex digits 0-9,a-f are used. +- * +- * Receiver responds with: +- * +- * + - if CSUM is correct and ready for next packet +- * - - if CSUM is incorrect +- * +- * Responses can be run-length encoded to save space. A '*' means that +- * the next character is an ASCII encoding giving a repeat count which +- * stands for that many repititions of the character preceding the '*'. +- * The encoding is n+29, yielding a printable character where n >=3 +- * (which is where RLE starts to win). Don't use an n > 126. +- * +- * So "0* " means the same as "0000". +- */ +- +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include +-#include +-#include +-#include +-#include +-#include +- +-#ifdef CONFIG_SH_KGDB_CONSOLE +-#include +-#endif +- +-/* Function pointers for linkage */ +-kgdb_debug_hook_t *kgdb_debug_hook; +-kgdb_bus_error_hook_t *kgdb_bus_err_hook; +- +-int (*kgdb_getchar)(void); +-void (*kgdb_putchar)(int); +- +-static void put_debug_char(int c) +-{ +- if (!kgdb_putchar) +- return; +- (*kgdb_putchar)(c); +-} +-static int get_debug_char(void) +-{ +- if (!kgdb_getchar) +- return -1; +- return (*kgdb_getchar)(); +-} +- +-/* Num chars in in/out bound buffers, register packets need NUMREGBYTES * 2 */ +-#define BUFMAX 1024 +-#define NUMREGBYTES (MAXREG*4) +-#define OUTBUFMAX (NUMREGBYTES*2+512) +- +-enum regs { +- R0 = 0, R1, R2, R3, R4, R5, R6, R7, +- R8, R9, R10, R11, R12, R13, R14, R15, +- PC, PR, GBR, VBR, MACH, MACL, SR, +- /* */ +- MAXREG +-}; +- +-static unsigned int registers[MAXREG]; +-struct kgdb_regs trap_registers; +- +-char kgdb_in_gdb_mode; +-char in_nmi; /* Set during NMI to prevent reentry */ +-int kgdb_nofault; /* Boolean to ignore bus errs (i.e. 
in GDB) */ +-int kgdb_enabled = 1; /* Default to enabled, cmdline can disable */ +-int kgdb_halt; +- +-/* Exposed for user access */ +-struct task_struct *kgdb_current; +-unsigned int kgdb_g_imask; +-int kgdb_trapa_val; +-int kgdb_excode; +- +-/* Default values for SCI (can override via kernel args in setup.c) */ +-#ifndef CONFIG_KGDB_DEFPORT +-#define CONFIG_KGDB_DEFPORT 1 +-#endif +- +-#ifndef CONFIG_KGDB_DEFBAUD +-#define CONFIG_KGDB_DEFBAUD 115200 +-#endif +- +-#if defined(CONFIG_KGDB_DEFPARITY_E) +-#define CONFIG_KGDB_DEFPARITY 'E' +-#elif defined(CONFIG_KGDB_DEFPARITY_O) +-#define CONFIG_KGDB_DEFPARITY 'O' +-#else /* CONFIG_KGDB_DEFPARITY_N */ +-#define CONFIG_KGDB_DEFPARITY 'N' +-#endif +- +-#ifdef CONFIG_KGDB_DEFBITS_7 +-#define CONFIG_KGDB_DEFBITS '7' +-#else /* CONFIG_KGDB_DEFBITS_8 */ +-#define CONFIG_KGDB_DEFBITS '8' +-#endif +- +-/* SCI/UART settings, used in kgdb_console_setup() */ +-int kgdb_portnum = CONFIG_KGDB_DEFPORT; +-int kgdb_baud = CONFIG_KGDB_DEFBAUD; +-char kgdb_parity = CONFIG_KGDB_DEFPARITY; +-char kgdb_bits = CONFIG_KGDB_DEFBITS; +- +-/* Jump buffer for setjmp/longjmp */ +-static jmp_buf rem_com_env; +- +-/* TRA differs sh3/4 */ +-#if defined(CONFIG_CPU_SH3) +-#define TRA 0xffffffd0 +-#elif defined(CONFIG_CPU_SH4) +-#define TRA 0xff000020 +-#endif +- +-/* Macros for single step instruction identification */ +-#define OPCODE_BT(op) (((op) & 0xff00) == 0x8900) +-#define OPCODE_BF(op) (((op) & 0xff00) == 0x8b00) +-#define OPCODE_BTF_DISP(op) (((op) & 0x80) ? (((op) | 0xffffff80) << 1) : \ +- (((op) & 0x7f ) << 1)) +-#define OPCODE_BFS(op) (((op) & 0xff00) == 0x8f00) +-#define OPCODE_BTS(op) (((op) & 0xff00) == 0x8d00) +-#define OPCODE_BRA(op) (((op) & 0xf000) == 0xa000) +-#define OPCODE_BRA_DISP(op) (((op) & 0x800) ? (((op) | 0xfffff800) << 1) : \ +- (((op) & 0x7ff) << 1)) +-#define OPCODE_BRAF(op) (((op) & 0xf0ff) == 0x0023) +-#define OPCODE_BRAF_REG(op) (((op) & 0x0f00) >> 8) +-#define OPCODE_BSR(op) (((op) & 0xf000) == 0xb000) +-#define OPCODE_BSR_DISP(op) (((op) & 0x800) ? 
(((op) | 0xfffff800) << 1) : \ +- (((op) & 0x7ff) << 1)) +-#define OPCODE_BSRF(op) (((op) & 0xf0ff) == 0x0003) +-#define OPCODE_BSRF_REG(op) (((op) >> 8) & 0xf) +-#define OPCODE_JMP(op) (((op) & 0xf0ff) == 0x402b) +-#define OPCODE_JMP_REG(op) (((op) >> 8) & 0xf) +-#define OPCODE_JSR(op) (((op) & 0xf0ff) == 0x400b) +-#define OPCODE_JSR_REG(op) (((op) >> 8) & 0xf) +-#define OPCODE_RTS(op) ((op) == 0xb) +-#define OPCODE_RTE(op) ((op) == 0x2b) +- +-#define SR_T_BIT_MASK 0x1 +-#define STEP_OPCODE 0xc320 +-#define BIOS_CALL_TRAP 0x3f +- +-/* Exception codes as per SH-4 core manual */ +-#define ADDRESS_ERROR_LOAD_VEC 7 +-#define ADDRESS_ERROR_STORE_VEC 8 +-#define TRAP_VEC 11 +-#define INVALID_INSN_VEC 12 +-#define INVALID_SLOT_VEC 13 +-#define NMI_VEC 14 +-#define USER_BREAK_VEC 15 +-#define SERIAL_BREAK_VEC 58 +- +-/* Misc static */ +-static int stepped_address; +-static short stepped_opcode; +-static const char hexchars[] = "0123456789abcdef"; +-static char in_buffer[BUFMAX]; +-static char out_buffer[OUTBUFMAX]; +- +-static void kgdb_to_gdb(const char *s); +- +-#ifdef CONFIG_KGDB_THREAD +-static struct task_struct *trapped_thread; +-static struct task_struct *current_thread; +-typedef unsigned char threadref[8]; +-#define BUF_THREAD_ID_SIZE 16 +-#endif +- +-/* Return addr as a real volatile address */ +-static inline unsigned int ctrl_inl(const unsigned long addr) +-{ +- return *(volatile unsigned long *) addr; +-} +- +-/* Correctly set *addr using volatile */ +-static inline void ctrl_outl(const unsigned int b, unsigned long addr) +-{ +- *(volatile unsigned long *) addr = b; +-} +- +-/* Get high hex bits */ +-static char highhex(const int x) +-{ +- return hexchars[(x >> 4) & 0xf]; +-} +- +-/* Get low hex bits */ +-static char lowhex(const int x) +-{ +- return hexchars[x & 0xf]; +-} +- +-/* Convert ch to hex */ +-static int hex(const char ch) +-{ +- if ((ch >= 'a') && (ch <= 'f')) +- return (ch - 'a' + 10); +- if ((ch >= '0') && (ch <= '9')) +- return (ch - '0'); +- if ((ch >= 'A') && (ch <= 'F')) +- return (ch - 'A' + 10); +- return (-1); +-} +- +-/* Convert the memory pointed to by mem into hex, placing result in buf. +- Returns a pointer to the last char put in buf (null) */ +-static char *mem_to_hex(const char *mem, char *buf, const int count) +-{ +- int i; +- int ch; +- unsigned short s_val; +- unsigned long l_val; +- +- /* Check for 16 or 32 */ +- if (count == 2 && ((long) mem & 1) == 0) { +- s_val = *(unsigned short *) mem; +- mem = (char *) &s_val; +- } else if (count == 4 && ((long) mem & 3) == 0) { +- l_val = *(unsigned long *) mem; +- mem = (char *) &l_val; +- } +- for (i = 0; i < count; i++) { +- ch = *mem++; +- *buf++ = highhex(ch); +- *buf++ = lowhex(ch); +- } +- *buf = 0; +- return (buf); +-} +- +-/* Convert the hex array pointed to by buf into binary, to be placed in mem. 
+- Return a pointer to the character after the last byte written */ +-static char *hex_to_mem(const char *buf, char *mem, const int count) +-{ +- int i; +- unsigned char ch; +- +- for (i = 0; i < count; i++) { +- ch = hex(*buf++) << 4; +- ch = ch + hex(*buf++); +- *mem++ = ch; +- } +- return (mem); +-} +- +-/* While finding valid hex chars, convert to an integer, then return it */ +-static int hex_to_int(char **ptr, int *int_value) +-{ +- int num_chars = 0; +- int hex_value; +- +- *int_value = 0; +- +- while (**ptr) { +- hex_value = hex(**ptr); +- if (hex_value >= 0) { +- *int_value = (*int_value << 4) | hex_value; +- num_chars++; +- } else +- break; +- (*ptr)++; +- } +- return num_chars; +-} +- +-/* Copy the binary array pointed to by buf into mem. Fix $, #, +- and 0x7d escaped with 0x7d. Return a pointer to the character +- after the last byte written. */ +-static char *ebin_to_mem(const char *buf, char *mem, int count) +-{ +- for (; count > 0; count--, buf++) { +- if (*buf == 0x7d) +- *mem++ = *(++buf) ^ 0x20; +- else +- *mem++ = *buf; +- } +- return mem; +-} +- +-/* Pack a hex byte */ +-static char *pack_hex_byte(char *pkt, int byte) +-{ +- *pkt++ = hexchars[(byte >> 4) & 0xf]; +- *pkt++ = hexchars[(byte & 0xf)]; +- return pkt; +-} +- +-#ifdef CONFIG_KGDB_THREAD +- +-/* Pack a thread ID */ +-static char *pack_threadid(char *pkt, threadref * id) +-{ +- char *limit; +- unsigned char *altid; +- +- altid = (unsigned char *) id; +- +- limit = pkt + BUF_THREAD_ID_SIZE; +- while (pkt < limit) +- pkt = pack_hex_byte(pkt, *altid++); +- return pkt; +-} +- +-/* Convert an integer into our threadref */ +-static void int_to_threadref(threadref * id, const int value) +-{ +- unsigned char *scan = (unsigned char *) id; +- int i = 4; +- +- while (i--) +- *scan++ = 0; +- +- *scan++ = (value >> 24) & 0xff; +- *scan++ = (value >> 16) & 0xff; +- *scan++ = (value >> 8) & 0xff; +- *scan++ = (value & 0xff); +-} +- +-/* Return a task structure ptr for a particular pid */ +-static struct task_struct *get_thread(int pid) +-{ +- struct task_struct *thread; +- +- /* Use PID_MAX w/gdb for pid 0 */ +- if (pid == PID_MAX) pid = 0; +- +- /* First check via PID */ +- thread = find_task_by_pid(pid); +- +- if (thread) +- return thread; +- +- /* Start at the start */ +- thread = init_tasks[0]; +- +- /* Walk along the linked list of tasks */ +- do { +- if (thread->pid == pid) +- return thread; +- thread = thread->next_task; +- } while (thread != init_tasks[0]); +- +- return NULL; +-} +- +-#endif /* CONFIG_KGDB_THREAD */ +- +-/* Scan for the start char '$', read the packet and check the checksum */ +-static void get_packet(char *buffer, int buflen) +-{ +- unsigned char checksum; +- unsigned char xmitcsum; +- int i; +- int count; +- char ch; +- +- do { +- /* Ignore everything until the start character */ +- while ((ch = get_debug_char()) != '$'); +- +- checksum = 0; +- xmitcsum = -1; +- count = 0; +- +- /* Now, read until a # or end of buffer is found */ +- while (count < (buflen - 1)) { +- ch = get_debug_char(); +- +- if (ch == '#') +- break; +- +- checksum = checksum + ch; +- buffer[count] = ch; +- count = count + 1; +- } +- +- buffer[count] = 0; +- +- /* Continue to read checksum following # */ +- if (ch == '#') { +- xmitcsum = hex(get_debug_char()) << 4; +- xmitcsum += hex(get_debug_char()); +- +- /* Checksum */ +- if (checksum != xmitcsum) +- put_debug_char('-'); /* Failed checksum */ +- else { +- /* Ack successful transfer */ +- put_debug_char('+'); +- +- /* If a sequence char is present, reply +- the sequence ID */ +- 
if (buffer[2] == ':') { +- put_debug_char(buffer[0]); +- put_debug_char(buffer[1]); +- +- /* Remove sequence chars from buffer */ +- count = strlen(buffer); +- for (i = 3; i <= count; i++) +- buffer[i - 3] = buffer[i]; +- } +- } +- } +- } +- while (checksum != xmitcsum); /* Keep trying while we fail */ +-} +- +-/* Send the packet in the buffer with run-length encoding */ +-static void put_packet(char *buffer) +-{ +- int checksum; +- char *src; +- int runlen; +- int encode; +- +- do { +- src = buffer; +- put_debug_char('$'); +- checksum = 0; +- +- /* Continue while we still have chars left */ +- while (*src) { +- /* Check for runs up to 99 chars long */ +- for (runlen = 1; runlen < 99; runlen++) { +- if (src[0] != src[runlen]) +- break; +- } +- +- if (runlen > 3) { +- /* Got a useful amount, send encoding */ +- encode = runlen + ' ' - 4; +- put_debug_char(*src); checksum += *src; +- put_debug_char('*'); checksum += '*'; +- put_debug_char(encode); checksum += encode; +- src += runlen; +- } else { +- /* Otherwise just send the current char */ +- put_debug_char(*src); checksum += *src; +- src += 1; +- } +- } +- +- /* '#' Separator, put high and low components of checksum */ +- put_debug_char('#'); +- put_debug_char(highhex(checksum)); +- put_debug_char(lowhex(checksum)); +- } +- while ((get_debug_char()) != '+'); /* While no ack */ +-} +- +-/* A bus error has occurred - perform a longjmp to return execution and +- allow handling of the error */ +-static void kgdb_handle_bus_error(void) +-{ +- longjmp(rem_com_env, 1); +-} +- +-/* Translate SH-3/4 exception numbers to unix-like signal values */ +-static int compute_signal(const int excep_code) +-{ +- int sigval; +- +- switch (excep_code) { +- +- case INVALID_INSN_VEC: +- case INVALID_SLOT_VEC: +- sigval = SIGILL; +- break; +- case ADDRESS_ERROR_LOAD_VEC: +- case ADDRESS_ERROR_STORE_VEC: +- sigval = SIGSEGV; +- break; +- +- case SERIAL_BREAK_VEC: +- case NMI_VEC: +- sigval = SIGINT; +- break; +- +- case USER_BREAK_VEC: +- case TRAP_VEC: +- sigval = SIGTRAP; +- break; +- +- default: +- sigval = SIGBUS; /* "software generated" */ +- break; +- } +- +- return (sigval); +-} +- +-/* Make a local copy of the registers passed into the handler (bletch) */ +-static void kgdb_regs_to_gdb_regs(const struct kgdb_regs *regs, +- int *gdb_regs) +-{ +- gdb_regs[R0] = regs->regs[R0]; +- gdb_regs[R1] = regs->regs[R1]; +- gdb_regs[R2] = regs->regs[R2]; +- gdb_regs[R3] = regs->regs[R3]; +- gdb_regs[R4] = regs->regs[R4]; +- gdb_regs[R5] = regs->regs[R5]; +- gdb_regs[R6] = regs->regs[R6]; +- gdb_regs[R7] = regs->regs[R7]; +- gdb_regs[R8] = regs->regs[R8]; +- gdb_regs[R9] = regs->regs[R9]; +- gdb_regs[R10] = regs->regs[R10]; +- gdb_regs[R11] = regs->regs[R11]; +- gdb_regs[R12] = regs->regs[R12]; +- gdb_regs[R13] = regs->regs[R13]; +- gdb_regs[R14] = regs->regs[R14]; +- gdb_regs[R15] = regs->regs[R15]; +- gdb_regs[PC] = regs->pc; +- gdb_regs[PR] = regs->pr; +- gdb_regs[GBR] = regs->gbr; +- gdb_regs[MACH] = regs->mach; +- gdb_regs[MACL] = regs->macl; +- gdb_regs[SR] = regs->sr; +- gdb_regs[VBR] = regs->vbr; +-} +- +-/* Copy local gdb registers back to kgdb regs, for later copy to kernel */ +-static void gdb_regs_to_kgdb_regs(const int *gdb_regs, +- struct kgdb_regs *regs) +-{ +- regs->regs[R0] = gdb_regs[R0]; +- regs->regs[R1] = gdb_regs[R1]; +- regs->regs[R2] = gdb_regs[R2]; +- regs->regs[R3] = gdb_regs[R3]; +- regs->regs[R4] = gdb_regs[R4]; +- regs->regs[R5] = gdb_regs[R5]; +- regs->regs[R6] = gdb_regs[R6]; +- regs->regs[R7] = gdb_regs[R7]; +- regs->regs[R8] = 
gdb_regs[R8]; +- regs->regs[R9] = gdb_regs[R9]; +- regs->regs[R10] = gdb_regs[R10]; +- regs->regs[R11] = gdb_regs[R11]; +- regs->regs[R12] = gdb_regs[R12]; +- regs->regs[R13] = gdb_regs[R13]; +- regs->regs[R14] = gdb_regs[R14]; +- regs->regs[R15] = gdb_regs[R15]; +- regs->pc = gdb_regs[PC]; +- regs->pr = gdb_regs[PR]; +- regs->gbr = gdb_regs[GBR]; +- regs->mach = gdb_regs[MACH]; +- regs->macl = gdb_regs[MACL]; +- regs->sr = gdb_regs[SR]; +- regs->vbr = gdb_regs[VBR]; +-} +- +-#ifdef CONFIG_KGDB_THREAD +-/* Make a local copy of registers from the specified thread */ +-asmlinkage void ret_from_fork(void); +-static void thread_regs_to_gdb_regs(const struct task_struct *thread, +- int *gdb_regs) +-{ +- int regno; +- int *tregs; +- +- /* Initialize to zero */ +- for (regno = 0; regno < MAXREG; regno++) +- gdb_regs[regno] = 0; +- +- /* Just making sure... */ +- if (thread == NULL) +- return; +- +- /* A new fork has pt_regs on the stack from a fork() call */ +- if (thread->thread.pc == (unsigned long)ret_from_fork) { +- +- int vbr_val; +- struct pt_regs *kregs; +- kregs = (struct pt_regs*)thread->thread.sp; +- +- gdb_regs[R0] = kregs->regs[R0]; +- gdb_regs[R1] = kregs->regs[R1]; +- gdb_regs[R2] = kregs->regs[R2]; +- gdb_regs[R3] = kregs->regs[R3]; +- gdb_regs[R4] = kregs->regs[R4]; +- gdb_regs[R5] = kregs->regs[R5]; +- gdb_regs[R6] = kregs->regs[R6]; +- gdb_regs[R7] = kregs->regs[R7]; +- gdb_regs[R8] = kregs->regs[R8]; +- gdb_regs[R9] = kregs->regs[R9]; +- gdb_regs[R10] = kregs->regs[R10]; +- gdb_regs[R11] = kregs->regs[R11]; +- gdb_regs[R12] = kregs->regs[R12]; +- gdb_regs[R13] = kregs->regs[R13]; +- gdb_regs[R14] = kregs->regs[R14]; +- gdb_regs[R15] = kregs->regs[R15]; +- gdb_regs[PC] = kregs->pc; +- gdb_regs[PR] = kregs->pr; +- gdb_regs[GBR] = kregs->gbr; +- gdb_regs[MACH] = kregs->mach; +- gdb_regs[MACL] = kregs->macl; +- gdb_regs[SR] = kregs->sr; +- +- asm("stc vbr, %0":"=r"(vbr_val)); +- gdb_regs[VBR] = vbr_val; +- return; +- } +- +- /* Otherwise, we have only some registers from switch_to() */ +- tregs = (int *)thread->thread.sp; +- gdb_regs[R15] = (int)tregs; +- gdb_regs[R14] = *tregs++; +- gdb_regs[R13] = *tregs++; +- gdb_regs[R12] = *tregs++; +- gdb_regs[R11] = *tregs++; +- gdb_regs[R10] = *tregs++; +- gdb_regs[R9] = *tregs++; +- gdb_regs[R8] = *tregs++; +- gdb_regs[PR] = *tregs++; +- gdb_regs[GBR] = *tregs++; +- gdb_regs[PC] = thread->thread.pc; +-} +-#endif /* CONFIG_KGDB_THREAD */ +- +-/* Calculate the new address for after a step */ +-static short *get_step_address(void) +-{ +- short op = *(short *) trap_registers.pc; +- long addr; +- +- /* BT */ +- if (OPCODE_BT(op)) { +- if (trap_registers.sr & SR_T_BIT_MASK) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 2; +- } +- +- /* BTS */ +- else if (OPCODE_BTS(op)) { +- if (trap_registers.sr & SR_T_BIT_MASK) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 4; /* Not in delay slot */ +- } +- +- /* BF */ +- else if (OPCODE_BF(op)) { +- if (!(trap_registers.sr & SR_T_BIT_MASK)) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 2; +- } +- +- /* BFS */ +- else if (OPCODE_BFS(op)) { +- if (!(trap_registers.sr & SR_T_BIT_MASK)) +- addr = trap_registers.pc + 4 + OPCODE_BTF_DISP(op); +- else +- addr = trap_registers.pc + 4; /* Not in delay slot */ +- } +- +- /* BRA */ +- else if (OPCODE_BRA(op)) +- addr = trap_registers.pc + 4 + OPCODE_BRA_DISP(op); +- +- /* BRAF */ +- else if (OPCODE_BRAF(op)) +- addr = 
trap_registers.pc + 4 +- + trap_registers.regs[OPCODE_BRAF_REG(op)]; +- +- /* BSR */ +- else if (OPCODE_BSR(op)) +- addr = trap_registers.pc + 4 + OPCODE_BSR_DISP(op); +- +- /* BSRF */ +- else if (OPCODE_BSRF(op)) +- addr = trap_registers.pc + 4 +- + trap_registers.regs[OPCODE_BSRF_REG(op)]; +- +- /* JMP */ +- else if (OPCODE_JMP(op)) +- addr = trap_registers.regs[OPCODE_JMP_REG(op)]; +- +- /* JSR */ +- else if (OPCODE_JSR(op)) +- addr = trap_registers.regs[OPCODE_JSR_REG(op)]; +- +- /* RTS */ +- else if (OPCODE_RTS(op)) +- addr = trap_registers.pr; +- +- /* RTE */ +- else if (OPCODE_RTE(op)) +- addr = trap_registers.regs[15]; +- +- /* Other */ +- else +- addr = trap_registers.pc + 2; +- +- kgdb_flush_icache_range(addr, addr + 2); +- return (short *) addr; +-} +- +-/* Set up a single-step. Replace the instruction immediately after the +- current instruction (i.e. next in the expected flow of control) with a +- trap instruction, so that returning will cause only a single instruction +- to be executed. Note that this model is slightly broken for instructions +- with delay slots (e.g. B[TF]S, BSR, BRA etc), where both the branch +- and the instruction in the delay slot will be executed. */ +-static void do_single_step(void) +-{ +- unsigned short *addr = 0; +- +- /* Determine where the target instruction will send us to */ +- addr = get_step_address(); +- stepped_address = (int)addr; +- +- /* Replace it */ +- stepped_opcode = *(short *)addr; +- *addr = STEP_OPCODE; +- +- /* Flush and return */ +- kgdb_flush_icache_range((long) addr, (long) addr + 2); +- return; +-} +- +-/* Undo a single step */ +-static void undo_single_step(void) +-{ +- /* If we have stepped, put back the old instruction */ +- /* Use stepped_address in case we stopped elsewhere */ +- if (stepped_opcode != 0) { +- *(short*)stepped_address = stepped_opcode; +- kgdb_flush_icache_range(stepped_address, stepped_address + 2); +- } +- stepped_opcode = 0; +-} +- +-/* Send a signal message */ +-static void send_signal_msg(const int signum) +-{ +-#ifndef CONFIG_KGDB_THREAD +- out_buffer[0] = 'S'; +- out_buffer[1] = highhex(signum); +- out_buffer[2] = lowhex(signum); +- out_buffer[3] = 0; +- put_packet(out_buffer); +-#else /* CONFIG_KGDB_THREAD */ +- int threadid; +- threadref thref; +- char *out = out_buffer; +- const char *tstring = "thread"; +- +- *out++ = 'T'; +- *out++ = highhex(signum); +- *out++ = lowhex(signum); +- +- while (*tstring) { +- *out++ = *tstring++; +- } +- *out++ = ':'; +- +- threadid = trapped_thread->pid; +- if (threadid == 0) threadid = PID_MAX; +- int_to_threadref(&thref, threadid); +- pack_threadid(out, &thref); +- out += BUF_THREAD_ID_SIZE; +- *out++ = ';'; +- +- *out = 0; +- put_packet(out_buffer); +-#endif /* CONFIG_KGDB_THREAD */ +-} +- +-/* Reply that all was well */ +-static void send_ok_msg(void) +-{ +- strcpy(out_buffer, "OK"); +- put_packet(out_buffer); +-} +- +-/* Reply that an error occurred */ +-static void send_err_msg(void) +-{ +- strcpy(out_buffer, "E01"); +- put_packet(out_buffer); +-} +- +-/* Empty message indicates unrecognised command */ +-static void send_empty_msg(void) +-{ +- put_packet(""); +-} +- +-/* Read memory due to 'm' message */ +-static void read_mem_msg(void) +-{ +- char *ptr; +- int addr; +- int length; +- +- /* Jmp, disable bus error handler */ +- if (setjmp(rem_com_env) == 0) { +- +- kgdb_nofault = 1; +- +- /* Walk through, have m, */ +- ptr = &in_buffer[1]; +- if (hex_to_int(&ptr, &addr) && (*ptr++ == ',')) +- if (hex_to_int(&ptr, &length)) { +- ptr = 0; +- if (length * 2 > 
OUTBUFMAX) +- length = OUTBUFMAX / 2; +- mem_to_hex((char *) addr, out_buffer, length); +- } +- if (ptr) +- send_err_msg(); +- else +- put_packet(out_buffer); +- } else +- send_err_msg(); +- +- /* Restore bus error handler */ +- kgdb_nofault = 0; +-} +- +-/* Write memory due to 'M' or 'X' message */ +-static void write_mem_msg(int binary) +-{ +- char *ptr; +- int addr; +- int length; +- +- if (setjmp(rem_com_env) == 0) { +- +- kgdb_nofault = 1; +- +- /* Walk through, have M,: */ +- ptr = &in_buffer[1]; +- if (hex_to_int(&ptr, &addr) && (*ptr++ == ',')) +- if (hex_to_int(&ptr, &length) && (*ptr++ == ':')) { +- if (binary) +- ebin_to_mem(ptr, (char*)addr, length); +- else +- hex_to_mem(ptr, (char*)addr, length); +- kgdb_flush_icache_range(addr, addr + length); +- ptr = 0; +- send_ok_msg(); +- } +- if (ptr) +- send_err_msg(); +- } else +- send_err_msg(); +- +- /* Restore bus error handler */ +- kgdb_nofault = 0; +-} +- +-/* Continue message */ +-static void continue_msg(void) +-{ +- /* Try to read optional parameter, PC unchanged if none */ +- char *ptr = &in_buffer[1]; +- int addr; +- +- if (hex_to_int(&ptr, &addr)) +- trap_registers.pc = addr; +-} +- +-/* Continue message with signal */ +-static void continue_with_sig_msg(void) +-{ +- int signal; +- char *ptr = &in_buffer[1]; +- int addr; +- +- /* Report limitation */ +- kgdb_to_gdb("Cannot force signal in kgdb, continuing anyway.\n"); +- +- /* Signal */ +- hex_to_int(&ptr, &signal); +- if (*ptr == ';') +- ptr++; +- +- /* Optional address */ +- if (hex_to_int(&ptr, &addr)) +- trap_registers.pc = addr; +-} +- +-/* Step message */ +-static void step_msg(void) +-{ +- continue_msg(); +- do_single_step(); +-} +- +-/* Step message with signal */ +-static void step_with_sig_msg(void) +-{ +- continue_with_sig_msg(); +- do_single_step(); +-} +- +-/* Send register contents */ +-static void send_regs_msg(void) +-{ +-#ifdef CONFIG_KGDB_THREAD +- if (!current_thread) +- kgdb_regs_to_gdb_regs(&trap_registers, registers); +- else +- thread_regs_to_gdb_regs(current_thread, registers); +-#else +- kgdb_regs_to_gdb_regs(&trap_registers, registers); +-#endif +- +- mem_to_hex((char *) registers, out_buffer, NUMREGBYTES); +- put_packet(out_buffer); +-} +- +-/* Set register contents - currently can't set other thread's registers */ +-static void set_regs_msg(void) +-{ +-#ifdef CONFIG_KGDB_THREAD +- if (!current_thread) { +-#endif +- kgdb_regs_to_gdb_regs(&trap_registers, registers); +- hex_to_mem(&in_buffer[1], (char *) registers, NUMREGBYTES); +- gdb_regs_to_kgdb_regs(registers, &trap_registers); +- send_ok_msg(); +-#ifdef CONFIG_KGDB_THREAD +- } else +- send_err_msg(); +-#endif +-} +- +- +-#ifdef CONFIG_KGDB_THREAD +- +-/* Set the status for a thread */ +-void set_thread_msg(void) +-{ +- int threadid; +- struct task_struct *thread = NULL; +- char *ptr; +- +- switch (in_buffer[1]) { +- +- /* To select which thread for gG etc messages, i.e. supported */ +- case 'g': +- +- ptr = &in_buffer[2]; +- hex_to_int(&ptr, &threadid); +- thread = get_thread(threadid); +- +- /* If we haven't found it */ +- if (!thread) { +- send_err_msg(); +- break; +- } +- +- /* Set current_thread (or not) */ +- if (thread == trapped_thread) +- current_thread = NULL; +- else +- current_thread = thread; +- send_ok_msg(); +- break; +- +- /* To select which thread for cCsS messages, i.e. unsupported */ +- case 'c': +- send_ok_msg(); +- break; +- +- default: +- send_empty_msg(); +- break; +- } +-} +- +-/* Is a thread alive? 
*/ +-static void thread_status_msg(void) +-{ +- char *ptr; +- int threadid; +- struct task_struct *thread = NULL; +- +- ptr = &in_buffer[1]; +- hex_to_int(&ptr, &threadid); +- thread = get_thread(threadid); +- if (thread) +- send_ok_msg(); +- else +- send_err_msg(); +-} +-/* Send the current thread ID */ +-static void thread_id_msg(void) +-{ +- int threadid; +- threadref thref; +- +- out_buffer[0] = 'Q'; +- out_buffer[1] = 'C'; +- +- if (current_thread) +- threadid = current_thread->pid; +- else if (trapped_thread) +- threadid = trapped_thread->pid; +- else /* Impossible, but just in case! */ +- { +- send_err_msg(); +- return; +- } +- +- /* Translate pid 0 to PID_MAX for gdb */ +- if (threadid == 0) threadid = PID_MAX; +- +- int_to_threadref(&thref, threadid); +- pack_threadid(out_buffer + 2, &thref); +- out_buffer[2 + BUF_THREAD_ID_SIZE] = '\0'; +- put_packet(out_buffer); +-} +- +-/* Send thread info */ +-static void thread_info_msg(void) +-{ +- struct task_struct *thread = NULL; +- int threadid; +- char *pos; +- threadref thref; +- +- /* Start with 'm' */ +- out_buffer[0] = 'm'; +- pos = &out_buffer[1]; +- +- /* For all possible thread IDs - this will overrun if > 44 threads! */ +- /* Start at 1 and include PID_MAX (since GDB won't use pid 0...) */ +- for (threadid = 1; threadid <= PID_MAX; threadid++) { +- +- read_lock(&tasklist_lock); +- thread = get_thread(threadid); +- read_unlock(&tasklist_lock); +- +- /* If it's a valid thread */ +- if (thread) { +- int_to_threadref(&thref, threadid); +- pack_threadid(pos, &thref); +- pos += BUF_THREAD_ID_SIZE; +- *pos++ = ','; +- } +- } +- *--pos = 0; /* Lose final comma */ +- put_packet(out_buffer); +- +-} +- +-/* Return printable info for gdb's 'info threads' command */ +-static void thread_extra_info_msg(void) +-{ +- int threadid; +- struct task_struct *thread = NULL; +- char buffer[20], *ptr; +- int i; +- +- /* Extract thread ID */ +- ptr = &in_buffer[17]; +- hex_to_int(&ptr, &threadid); +- thread = get_thread(threadid); +- +- /* If we don't recognise it, say so */ +- if (thread == NULL) +- strcpy(buffer, "(unknown)"); +- else +- strcpy(buffer, thread->comm); +- +- /* Construct packet */ +- for (i = 0, ptr = out_buffer; buffer[i]; i++) +- ptr = pack_hex_byte(ptr, buffer[i]); +- +- if (thread->thread.pc == (unsigned long)ret_from_fork) { +- strcpy(buffer, ""); +- for (i = 0; buffer[i]; i++) +- ptr = pack_hex_byte(ptr, buffer[i]); +- } +- +- *ptr = '\0'; +- put_packet(out_buffer); +-} +- +-/* Handle all qFooBarBaz messages - have to use an if statement as +- opposed to a switch because q messages can have > 1 char id. */ +-static void query_msg(void) +-{ +- const char *q_start = &in_buffer[1]; +- +- /* qC = return current thread ID */ +- if (strncmp(q_start, "C", 1) == 0) +- thread_id_msg(); +- +- /* qfThreadInfo = query all threads (first) */ +- else if (strncmp(q_start, "fThreadInfo", 11) == 0) +- thread_info_msg(); +- +- /* qsThreadInfo = query all threads (subsequent). We know we have sent +- them all after the qfThreadInfo message, so there are no to send */ +- else if (strncmp(q_start, "sThreadInfo", 11) == 0) +- put_packet("l"); /* el = last */ +- +- /* qThreadExtraInfo = supply printable information per thread */ +- else if (strncmp(q_start, "ThreadExtraInfo", 15) == 0) +- thread_extra_info_msg(); +- +- /* Unsupported - empty message as per spec */ +- else +- send_empty_msg(); +-} +-#endif /* CONFIG_KGDB_THREAD */ +- +-/* +- * Bring up the ports.. 
+- */ +-static int kgdb_serial_setup(void) +-{ +- extern int kgdb_console_setup(struct console *co, char *options); +- struct console dummy; +- +- kgdb_console_setup(&dummy, 0); +- +- return 0; +-} +- +-/* The command loop, read and act on requests */ +-static void kgdb_command_loop(const int excep_code, const int trapa_value) +-{ +- int sigval; +- +- if (excep_code == NMI_VEC) { +-#ifndef CONFIG_KGDB_NMI +- KGDB_PRINTK("Ignoring unexpected NMI?\n"); +- return; +-#else /* CONFIG_KGDB_NMI */ +- if (!kgdb_enabled) { +- kgdb_enabled = 1; +- kgdb_init(); +- } +-#endif /* CONFIG_KGDB_NMI */ +- } +- +- /* Ignore if we're disabled */ +- if (!kgdb_enabled) +- return; +- +-#ifdef CONFIG_KGDB_THREAD +- /* Until GDB specifies a thread */ +- current_thread = NULL; +- trapped_thread = current; +-#endif +- +- /* Enter GDB mode (e.g. after detach) */ +- if (!kgdb_in_gdb_mode) { +- /* Do serial setup, notify user, issue preemptive ack */ +- kgdb_serial_setup(); +- KGDB_PRINTK("Waiting for GDB (on %s%d at %d baud)\n", +- (kgdb_porttype ? kgdb_porttype->name : ""), +- kgdb_portnum, kgdb_baud); +- kgdb_in_gdb_mode = 1; +- put_debug_char('+'); +- } +- +- /* Reply to host that an exception has occurred */ +- sigval = compute_signal(excep_code); +- send_signal_msg(sigval); +- +- /* TRAP_VEC exception indicates a software trap inserted in place of +- code by GDB so back up PC by one instruction, as this instruction +- will later be replaced by its original one. Do NOT do this for +- trap 0xff, since that indicates a compiled-in breakpoint which +- will not be replaced (and we would retake the trap forever) */ +- if ((excep_code == TRAP_VEC) && (trapa_value != (0xff << 2))) { +- trap_registers.pc -= 2; +- } +- +- /* Undo any stepping we may have done */ +- undo_single_step(); +- +- while (1) { +- +- out_buffer[0] = 0; +- get_packet(in_buffer, BUFMAX); +- +- /* Examine first char of buffer to see what we need to do */ +- switch (in_buffer[0]) { +- +- case '?': /* Send which signal we've received */ +- send_signal_msg(sigval); +- break; +- +- case 'g': /* Return the values of the CPU registers */ +- send_regs_msg(); +- break; +- +- case 'G': /* Set the value of the CPU registers */ +- set_regs_msg(); +- break; +- +- case 'm': /* Read LLLL bytes address AA..AA */ +- read_mem_msg(); +- break; +- +- case 'M': /* Write LLLL bytes address AA..AA, ret OK */ +- write_mem_msg(0); /* 0 = data in hex */ +- break; +- +- case 'X': /* Write LLLL bytes esc bin address AA..AA */ +- if (kgdb_bits == '8') +- write_mem_msg(1); /* 1 = data in binary */ +- else +- send_empty_msg(); +- break; +- +- case 'C': /* Continue, signum included, we ignore it */ +- continue_with_sig_msg(); +- return; +- +- case 'c': /* Continue at address AA..AA (optional) */ +- continue_msg(); +- return; +- +- case 'S': /* Step, signum included, we ignore it */ +- step_with_sig_msg(); +- return; +- +- case 's': /* Step one instruction from AA..AA */ +- step_msg(); +- return; +- +-#ifdef CONFIG_KGDB_THREAD +- +- case 'H': /* Task related */ +- set_thread_msg(); +- break; +- +- case 'T': /* Query thread status */ +- thread_status_msg(); +- break; +- +- case 'q': /* Handle query - currently thread-related */ +- query_msg(); +- break; +-#endif +- +- case 'k': /* 'Kill the program' with a kernel ? */ +- break; +- +- case 'D': /* Detach from program, send reply OK */ +- kgdb_in_gdb_mode = 0; +- send_ok_msg(); +- get_debug_char(); +- return; +- +- default: +- send_empty_msg(); +- break; +- } +- } +-} +- +-/* There has been an exception, most likely a breakpoint. 
*/ +-void kgdb_handle_exception(struct pt_regs *regs) +-{ +- int excep_code, vbr_val; +- int count; +- int trapa_value = ctrl_inl(TRA); +- +- /* Copy kernel regs (from stack) */ +- for (count = 0; count < 16; count++) +- trap_registers.regs[count] = regs->regs[count]; +- trap_registers.pc = regs->pc; +- trap_registers.pr = regs->pr; +- trap_registers.sr = regs->sr; +- trap_registers.gbr = regs->gbr; +- trap_registers.mach = regs->mach; +- trap_registers.macl = regs->macl; +- +- asm("stc vbr, %0":"=r"(vbr_val)); +- trap_registers.vbr = vbr_val; +- +- /* Get excode for command loop call, user access */ +- asm("stc r2_bank, %0":"=r"(excep_code)); +- kgdb_excode = excep_code; +- +- /* Other interesting environment items for reference */ +- asm("stc r6_bank, %0":"=r"(kgdb_g_imask)); +- kgdb_current = current; +- kgdb_trapa_val = trapa_value; +- +- /* Act on the exception */ +- kgdb_command_loop(excep_code >> 5, trapa_value); +- +- kgdb_current = NULL; +- +- /* Copy back the (maybe modified) registers */ +- for (count = 0; count < 16; count++) +- regs->regs[count] = trap_registers.regs[count]; +- regs->pc = trap_registers.pc; +- regs->pr = trap_registers.pr; +- regs->sr = trap_registers.sr; +- regs->gbr = trap_registers.gbr; +- regs->mach = trap_registers.mach; +- regs->macl = trap_registers.macl; +- +- vbr_val = trap_registers.vbr; +- asm("ldc %0, vbr": :"r"(vbr_val)); +- +- return; +-} +- +-/* Trigger a breakpoint by function */ +-void breakpoint(void) +-{ +- if (!kgdb_enabled) { +- kgdb_enabled = 1; +- kgdb_init(); +- } +- BREAKPOINT(); +-} +- +-/* Initialise the KGDB data structures and serial configuration */ +-int kgdb_init(void) +-{ +- if (!kgdb_enabled) +- return 1; +- +- in_nmi = 0; +- kgdb_nofault = 0; +- stepped_opcode = 0; +- kgdb_in_gdb_mode = 0; +- +- if (kgdb_serial_setup() != 0) { +- KGDB_PRINTK("serial setup error\n"); +- return -1; +- } +- +- /* Init ptr to exception handler */ +- kgdb_debug_hook = kgdb_handle_exception; +- kgdb_bus_err_hook = kgdb_handle_bus_error; +- +- /* Enter kgdb now if requested, or just report init done */ +- if (kgdb_halt) { +- kgdb_in_gdb_mode = 1; +- put_debug_char('+'); +- breakpoint(); +- } +- else +- { +- KGDB_PRINTK("stub is initialized.\n"); +- } +- +- return 0; +-} +- +-/* Make function available for "user messages"; console will use it too. */ +- +-char gdbmsgbuf[BUFMAX]; +-#define MAXOUT ((BUFMAX-2)/2) +- +-static void kgdb_msg_write(const char *s, unsigned count) +-{ +- int i; +- int wcount; +- char *bufptr; +- +- /* 'O'utput */ +- gdbmsgbuf[0] = 'O'; +- +- /* Fill and send buffers... */ +- while (count > 0) { +- bufptr = gdbmsgbuf + 1; +- +- /* Calculate how many this time */ +- wcount = (count > MAXOUT) ? 
MAXOUT : count; +- +- /* Pack in hex chars */ +- for (i = 0; i < wcount; i++) +- bufptr = pack_hex_byte(bufptr, s[i]); +- *bufptr = '\0'; +- +- /* Move up */ +- s += wcount; +- count -= wcount; +- +- /* Write packet */ +- put_packet(gdbmsgbuf); +- } +-} +- +-static void kgdb_to_gdb(const char *s) +-{ +- kgdb_msg_write(s, strlen(s)); +-} +- +-#ifdef CONFIG_SH_KGDB_CONSOLE +-void kgdb_console_write(struct console *co, const char *s, unsigned count) +-{ +- /* Bail if we're not talking to GDB */ +- if (!kgdb_in_gdb_mode) +- return; +- +- kgdb_msg_write(s, count); +-} +-#endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/setup.c linux-2.6.18.kgdb/arch/sh/kernel/setup.c +--- linux-2.6.18/arch/sh/kernel/setup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/setup.c 2008-06-10 16:19:47.000000000 +0400 +@@ -28,10 +28,6 @@ + #include + #include + +-#ifdef CONFIG_SH_KGDB +-#include +-static int kgdb_parse_options(char *options); +-#endif + extern void * __rd_start, * __rd_end; + /* + * Machine setup.. +@@ -528,93 +524,3 @@ struct seq_operations cpuinfo_op = { + .show = show_cpuinfo, + }; + #endif /* CONFIG_PROC_FS */ +- +-#ifdef CONFIG_SH_KGDB +-/* +- * Parse command-line kgdb options. By default KGDB is enabled, +- * entered on error (or other action) using default serial info. +- * The command-line option can include a serial port specification +- * and an action to override default or configured behavior. +- */ +-struct kgdb_sermap kgdb_sci_sermap = +-{ "ttySC", 5, kgdb_sci_setup, NULL }; +- +-struct kgdb_sermap *kgdb_serlist = &kgdb_sci_sermap; +-struct kgdb_sermap *kgdb_porttype = &kgdb_sci_sermap; +- +-void kgdb_register_sermap(struct kgdb_sermap *map) +-{ +- struct kgdb_sermap *last; +- +- for (last = kgdb_serlist; last->next; last = last->next) +- ; +- last->next = map; +- if (!map->namelen) { +- map->namelen = strlen(map->name); +- } +-} +- +-static int __init kgdb_parse_options(char *options) +-{ +- char c; +- int baud; +- +- /* Check for port spec (or use default) */ +- +- /* Determine port type and instance */ +- if (!memcmp(options, "tty", 3)) { +- struct kgdb_sermap *map = kgdb_serlist; +- +- while (map && memcmp(options, map->name, map->namelen)) +- map = map->next; +- +- if (!map) { +- KGDB_PRINTK("unknown port spec in %s\n", options); +- return -1; +- } +- +- kgdb_porttype = map; +- kgdb_serial_setup = map->setup_fn; +- kgdb_portnum = options[map->namelen] - '0'; +- options += map->namelen + 1; +- +- options = (*options == ',') ? options+1 : options; +- +- /* Read optional parameters (baud/parity/bits) */ +- baud = simple_strtoul(options, &options, 10); +- if (baud != 0) { +- kgdb_baud = baud; +- +- c = toupper(*options); +- if (c == 'E' || c == 'O' || c == 'N') { +- kgdb_parity = c; +- options++; +- } +- +- c = *options; +- if (c == '7' || c == '8') { +- kgdb_bits = c; +- options++; +- } +- options = (*options == ',') ? 
options+1 : options; +- } +- } +- +- /* Check for action specification */ +- if (!memcmp(options, "halt", 4)) { +- kgdb_halt = 1; +- options += 4; +- } else if (!memcmp(options, "disabled", 8)) { +- kgdb_enabled = 0; +- options += 8; +- } +- +- if (*options) { +- KGDB_PRINTK("ignored unknown options: %s\n", options); +- return 0; +- } +- return 1; +-} +-__setup("kgdb=", kgdb_parse_options); +-#endif /* CONFIG_SH_KGDB */ +- +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/time.c linux-2.6.18.kgdb/arch/sh/kernel/time.c +--- linux-2.6.18/arch/sh/kernel/time.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/time.c 2008-06-10 16:19:47.000000000 +0400 +@@ -184,12 +184,4 @@ void __init time_init(void) + */ + sys_timer = get_sys_timer(); + printk(KERN_INFO "Using %s for system timer\n", sys_timer->name); +- +-#if defined(CONFIG_SH_KGDB) +- /* +- * Set up kgdb as requested. We do it here because the serial +- * init uses the timer vars we just set up for figuring baud. +- */ +- kgdb_init(); +-#endif + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/kernel/traps.c linux-2.6.18.kgdb/arch/sh/kernel/traps.c +--- linux-2.6.18/arch/sh/kernel/traps.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/kernel/traps.c 2008-06-10 16:19:47.000000000 +0400 +@@ -26,6 +26,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -34,17 +35,8 @@ + #include + #include + +-#ifdef CONFIG_SH_KGDB +-#include +-#define CHK_REMOTE_DEBUG(regs) \ +-{ \ +- if ((kgdb_debug_hook != (kgdb_debug_hook_t *) NULL) && (!user_mode(regs))) \ +- { \ +- (*kgdb_debug_hook)(regs); \ +- } \ +-} +-#else +-#define CHK_REMOTE_DEBUG(regs) ++#ifndef CONFIG_KGDB ++#define kgdb_handle_exception(t, s, e, r) + #endif + + #define DO_ERROR(trapnr, signr, str, name, tsk) \ +@@ -65,7 +57,7 @@ asmlinkage void do_##name(unsigned long + local_irq_enable(); \ + tsk->thread.error_code = error_code; \ + tsk->thread.trap_no = trapnr; \ +- CHK_REMOTE_DEBUG(®s); \ ++ kgdb_handle_exception(trapnr, signr, error_code, ®s); \ + force_sig(signr, tsk); \ + die_if_no_fixup(str,®s,error_code); \ + } +@@ -92,10 +84,12 @@ void die(const char * str, struct pt_reg + { + static int die_counter; + ++#ifdef CONFIG_KGDB ++ kgdb_handle_exception(1, SIGTRAP, err, regs); ++#endif + console_verbose(); + spin_lock_irq(&die_lock); + printk("%s: %04lx [#%d]\n", str, err & 0xffff, ++die_counter); +- CHK_REMOTE_DEBUG(regs); + show_regs(regs); + spin_unlock_irq(&die_lock); + do_exit(SIGSEGV); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/mm/extable.c linux-2.6.18.kgdb/arch/sh/mm/extable.c +--- linux-2.6.18/arch/sh/mm/extable.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/mm/extable.c 2008-06-10 16:19:47.000000000 +0400 +@@ -5,6 +5,7 @@ + */ + + #include ++#include + #include + + int fixup_exception(struct pt_regs *regs) +@@ -16,6 +17,12 @@ int fixup_exception(struct pt_regs *regs + regs->pc = fixup->fixup; + return 1; + } ++#ifdef CONFIG_KGDB ++ if (atomic_read(&debugger_active) && kgdb_may_fault) ++ /* Restore our previous state. */ ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ /* Never reached. 
*/ ++#endif + + return 0; + } +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/mm/fault-nommu.c linux-2.6.18.kgdb/arch/sh/mm/fault-nommu.c +--- linux-2.6.18/arch/sh/mm/fault-nommu.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/mm/fault-nommu.c 2008-06-10 16:19:47.000000000 +0400 +@@ -29,10 +29,6 @@ + #include + #include + +-#if defined(CONFIG_SH_KGDB) +-#include +-#endif +- + extern void die(const char *,struct pt_regs *,long); + + /* +@@ -43,11 +39,6 @@ extern void die(const char *,struct pt_r + asmlinkage void do_page_fault(struct pt_regs *regs, unsigned long writeaccess, + unsigned long address) + { +-#if defined(CONFIG_SH_KGDB) +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. +@@ -69,11 +60,6 @@ asmlinkage void do_page_fault(struct pt_ + asmlinkage int __do_page_fault(struct pt_regs *regs, unsigned long writeaccess, + unsigned long address) + { +-#if defined(CONFIG_SH_KGDB) +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + if (address >= TASK_SIZE) + return 1; + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/sh/mm/fault.c linux-2.6.18.kgdb/arch/sh/mm/fault.c +--- linux-2.6.18/arch/sh/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/sh/mm/fault.c 2008-06-10 16:19:47.000000000 +0400 +@@ -28,7 +28,6 @@ + #include + #include + #include +-#include + + extern void die(const char *,struct pt_regs *,long); + +@@ -45,11 +44,6 @@ asmlinkage void do_page_fault(struct pt_ + struct vm_area_struct * vma; + unsigned long page; + +-#ifdef CONFIG_SH_KGDB +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + tsk = current; + mm = tsk->mm; + +@@ -153,6 +147,7 @@ no_context: + } + die("Oops", regs, writeaccess); + do_exit(SIGKILL); ++ dump_stack(); + + /* + * We ran out of memory, or some other thing happened to us that made +@@ -202,11 +197,6 @@ asmlinkage int __do_page_fault(struct pt + spinlock_t *ptl; + int ret = 1; + +-#ifdef CONFIG_SH_KGDB +- if (kgdb_nofault && kgdb_bus_err_hook) +- kgdb_bus_err_hook(); +-#endif +- + #ifdef CONFIG_SH_STORE_QUEUES + addrmax = P4SEG_STORE_QUE + 0x04000000; + #endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/Kconfig.debug linux-2.6.18.kgdb/arch/x86_64/Kconfig.debug +--- linux-2.6.18/arch/x86_64/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/x86_64/Kconfig.debug 2008-06-10 16:19:41.000000000 +0400 +@@ -55,7 +55,4 @@ config DEBUG_STACK_USAGE + + This option will slow down process creation somewhat. 
+ +-#config X86_REMOTE_DEBUG +-# bool "kgdb debugging stub" +- + endmenu +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/Makefile linux-2.6.18.kgdb/arch/x86_64/kernel/Makefile +--- linux-2.6.18/arch/x86_64/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/x86_64/kernel/Makefile 2008-06-10 16:19:41.000000000 +0400 +@@ -33,6 +33,7 @@ obj-$(CONFIG_IOMMU) += pci-gart.o apert + obj-$(CONFIG_CALGARY_IOMMU) += pci-calgary.o tce.o + obj-$(CONFIG_SWIOTLB) += pci-swiotlb.o + obj-$(CONFIG_KPROBES) += kprobes.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdb-jmp.o + obj-$(CONFIG_X86_PM_TIMER) += pmtimer.o + obj-$(CONFIG_X86_VSMP) += vsmp.o + obj-$(CONFIG_K8_NB) += k8.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/entry.S linux-2.6.18.kgdb/arch/x86_64/kernel/entry.S +--- linux-2.6.18/arch/x86_64/kernel/entry.S 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/x86_64/kernel/entry.S 2008-06-10 16:19:58.000000000 +0400 +@@ -42,6 +42,7 @@ + #include + #include + #include ++#include + + .code64 + +@@ -881,6 +882,7 @@ error_exit: + RESTORE_ARGS 0,8,0 + jmp iret_label + CFI_ENDPROC ++ CFI_END_FRAME(kernel_thread) + + error_kernelspace: + incl %ebx +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/kgdb-jmp.S linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb-jmp.S +--- linux-2.6.18/arch/x86_64/kernel/kgdb-jmp.S 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb-jmp.S 2008-06-10 16:19:41.000000000 +0400 +@@ -0,0 +1,65 @@ ++/* ++ * arch/x86_64/kernel/kgdb-jmp.S ++ * ++ * Save and restore system registers so that within a limited frame we ++ * may have a fault and "jump back" to a known safe location. ++ * ++ * Author: Tom Rini ++ * ++ * Cribbed from glibc, which carries the following: ++ * Copyright (C) 2001, 2003, 2004 Free Software Foundation, Inc. ++ * Copyright (C) 2005 by MontaVista Software. ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program as licensed "as is" without any warranty of ++ * any kind, whether express or implied. ++ */ ++ ++#include ++ ++#define JB_RBX 0 ++#define JB_RBP 1 ++#define JB_R12 2 ++#define JB_R13 3 ++#define JB_R14 4 ++#define JB_R15 5 ++#define JB_RSP 6 ++#define JB_PC 7 ++ ++ .code64 ++ ++/* This must be called prior to kgdb_fault_longjmp and ++ * kgdb_fault_longjmp must not be called outside of the context of the ++ * last call to kgdb_fault_setjmp. ++ */ ++ENTRY(kgdb_fault_setjmp) ++ /* Save registers. */ ++ movq %rbx, (JB_RBX*8)(%rdi) ++ movq %rbp, (JB_RBP*8)(%rdi) ++ movq %r12, (JB_R12*8)(%rdi) ++ movq %r13, (JB_R13*8)(%rdi) ++ movq %r14, (JB_R14*8)(%rdi) ++ movq %r15, (JB_R15*8)(%rdi) ++ leaq 8(%rsp), %rdx /* Save SP as it will be after we return. */ ++ movq %rdx, (JB_RSP*8)(%rdi) ++ movq (%rsp), %rax /* Save PC we are returning to now. */ ++ movq %rax, (JB_PC*8)(%rdi) ++ /* Set return value for setjmp. */ ++ mov $0,%eax ++ movq (JB_PC*8)(%rdi),%rdx ++ movq (JB_RSP*8)(%rdi),%rsp ++ jmpq *%rdx ++ ++ENTRY(kgdb_fault_longjmp) ++ /* Restore registers. */ ++ movq (JB_RBX*8)(%rdi),%rbx ++ movq (JB_RBP*8)(%rdi),%rbp ++ movq (JB_R12*8)(%rdi),%r12 ++ movq (JB_R13*8)(%rdi),%r13 ++ movq (JB_R14*8)(%rdi),%r14 ++ movq (JB_R15*8)(%rdi),%r15 ++ /* Set return value for setjmp. 
*/ ++ movq (JB_RSP*8)(%rdi),%rsp ++ mov $1,%eax ++ jmpq *%rdx +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/kernel/kgdb.c linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb.c +--- linux-2.6.18/arch/x86_64/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/arch/x86_64/kernel/kgdb.c 2008-06-10 16:19:41.000000000 +0400 +@@ -0,0 +1,474 @@ ++/* ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2, or (at your option) any ++ * later version. ++ * ++ * This program is distributed in the hope that it will be useful, but ++ * WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU ++ * General Public License for more details. ++ * ++ */ ++ ++/* ++ * Copyright (C) 2004 Amit S. Kale ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ * Copyright (C) 2002 Andi Kleen, SuSE Labs ++ * Copyright (C) 2004 LinSysSoft Technologies Pvt. Ltd. ++ */ ++/**************************************************************************** ++ * Contributor: Lake Stevens Instrument Division$ ++ * Written by: Glenn Engel $ ++ * Updated by: Amit Kale ++ * Modified for 386 by Jim Kingdon, Cygnus Support. ++ * Original kgdb, compatibility with 2.1.xx kernel by ++ * David Grothe ++ * Integrated into 2.2.5 kernel by Tigran Aivazian ++ * X86_64 changes from Andi Kleen's patch merged by Jim Houston ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include /* for linux pt_regs struct */ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++/* Put the error code here just in case the user cares. */ ++int gdb_x86_64errcode; ++/* Likewise, the vector number here (since GDB only gets the signal ++ number through the usual means, and that's not very specific).
*/ ++int gdb_x86_64vector = -1; ++ ++extern atomic_t cpu_doing_single_step; ++ ++void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ gdb_regs[_RAX] = regs->rax; ++ gdb_regs[_RBX] = regs->rbx; ++ gdb_regs[_RCX] = regs->rcx; ++ gdb_regs[_RDX] = regs->rdx; ++ gdb_regs[_RSI] = regs->rsi; ++ gdb_regs[_RDI] = regs->rdi; ++ gdb_regs[_RBP] = regs->rbp; ++ gdb_regs[_PS] = regs->eflags; ++ gdb_regs[_PC] = regs->rip; ++ gdb_regs[_R8] = regs->r8; ++ gdb_regs[_R9] = regs->r9; ++ gdb_regs[_R10] = regs->r10; ++ gdb_regs[_R11] = regs->r11; ++ gdb_regs[_R12] = regs->r12; ++ gdb_regs[_R13] = regs->r13; ++ gdb_regs[_R14] = regs->r14; ++ gdb_regs[_R15] = regs->r15; ++ gdb_regs[_RSP] = regs->rsp; ++} ++ ++extern void thread_return(void); ++void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs, struct task_struct *p) ++{ ++ gdb_regs[_RAX] = 0; ++ gdb_regs[_RBX] = 0; ++ gdb_regs[_RCX] = 0; ++ gdb_regs[_RDX] = 0; ++ gdb_regs[_RSI] = 0; ++ gdb_regs[_RDI] = 0; ++ gdb_regs[_RBP] = *(unsigned long *)p->thread.rsp; ++ gdb_regs[_PS] = *(unsigned long *)(p->thread.rsp + 8); ++ gdb_regs[_PC] = (unsigned long)&thread_return; ++ gdb_regs[_R8] = 0; ++ gdb_regs[_R9] = 0; ++ gdb_regs[_R10] = 0; ++ gdb_regs[_R11] = 0; ++ gdb_regs[_R12] = 0; ++ gdb_regs[_R13] = 0; ++ gdb_regs[_R14] = 0; ++ gdb_regs[_R15] = 0; ++ gdb_regs[_RSP] = p->thread.rsp; ++} ++ ++void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs) ++{ ++ regs->rax = gdb_regs[_RAX]; ++ regs->rbx = gdb_regs[_RBX]; ++ regs->rcx = gdb_regs[_RCX]; ++ regs->rdx = gdb_regs[_RDX]; ++ regs->rsi = gdb_regs[_RSI]; ++ regs->rdi = gdb_regs[_RDI]; ++ regs->rbp = gdb_regs[_RBP]; ++ regs->eflags = gdb_regs[_PS]; ++ regs->rip = gdb_regs[_PC]; ++ regs->r8 = gdb_regs[_R8]; ++ regs->r9 = gdb_regs[_R9]; ++ regs->r10 = gdb_regs[_R10]; ++ regs->r11 = gdb_regs[_R11]; ++ regs->r12 = gdb_regs[_R12]; ++ regs->r13 = gdb_regs[_R13]; ++ regs->r14 = gdb_regs[_R14]; ++ regs->r15 = gdb_regs[_R15]; ++#if 0 /* can't change these */ ++ regs->rsp = gdb_regs[_RSP]; ++ regs->ss = gdb_regs[_SS]; ++ regs->fs = gdb_regs[_FS]; ++ regs->gs = gdb_regs[_GS]; ++#endif ++ ++} /* gdb_regs_to_regs */ ++ ++struct hw_breakpoint { ++ unsigned enabled; ++ unsigned type; ++ unsigned len; ++ unsigned long addr; ++} breakinfo[4] = { { ++enabled:0}, { ++enabled:0}, { ++enabled:0}, { ++enabled:0}}; ++ ++void kgdb_correct_hw_break(void) ++{ ++ int breakno; ++ int correctit; ++ int breakbit; ++ unsigned long dr7; ++ ++ asm volatile ("movq %%db7, %0\n":"=r" (dr7):); ++ do { ++ unsigned long addr0, addr1, addr2, addr3; ++ asm volatile ("movq %%db0, %0\n" ++ "movq %%db1, %1\n" ++ "movq %%db2, %2\n" ++ "movq %%db3, %3\n":"=r" (addr0), "=r"(addr1), ++ "=r"(addr2), "=r"(addr3):); ++ } while (0); ++ correctit = 0; ++ for (breakno = 0; breakno < 3; breakno++) { ++ breakbit = 2 << (breakno << 1); ++ if (!(dr7 & breakbit) && breakinfo[breakno].enabled) { ++ correctit = 1; ++ dr7 |= breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ dr7 |= (((breakinfo[breakno].len << 2) | ++ breakinfo[breakno].type) << 16) << ++ (breakno << 2); ++ switch (breakno) { ++ case 0: ++ asm volatile ("movq %0, %%dr0\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 1: ++ asm volatile ("movq %0, %%dr1\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 2: ++ asm volatile ("movq %0, %%dr2\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ ++ case 3: ++ asm volatile ("movq %0, %%dr3\n"::"r" ++ (breakinfo[breakno].addr)); ++ break; ++ } ++ } else if ((dr7 & breakbit) && !breakinfo[breakno].enabled) { ++ 
correctit = 1; ++ dr7 &= ~breakbit; ++ dr7 &= ~(0xf0000 << (breakno << 2)); ++ } ++ } ++ if (correctit) { ++ asm volatile ("movq %0, %%db7\n"::"r" (dr7)); ++ } ++} ++ ++int kgdb_remove_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (breakinfo[i].addr == addr && breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 0; ++ return 0; ++} ++ ++int kgdb_set_hw_break(unsigned long addr) ++{ ++ int i, idx = -1; ++ for (i = 0; i < 4; i++) { ++ if (!breakinfo[i].enabled) { ++ idx = i; ++ break; ++ } ++ } ++ if (idx == -1) ++ return -1; ++ ++ breakinfo[idx].enabled = 1; ++ breakinfo[idx].type = 1; ++ breakinfo[idx].len = 1; ++ breakinfo[idx].addr = addr; ++ return 0; ++} ++ ++int remove_hw_break(unsigned breakno) ++{ ++ if (!breakinfo[breakno].enabled) { ++ return -1; ++ } ++ breakinfo[breakno].enabled = 0; ++ return 0; ++} ++ ++int set_hw_break(unsigned breakno, unsigned type, unsigned len, unsigned addr) ++{ ++ if (breakinfo[breakno].enabled) { ++ return -1; ++ } ++ breakinfo[breakno].enabled = 1; ++ breakinfo[breakno].type = type; ++ breakinfo[breakno].len = len; ++ breakinfo[breakno].addr = addr; ++ return 0; ++} ++ ++void kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++ /* Disable hardware debugging while we are in kgdb */ ++ asm volatile ("movq %0,%%db7": /* no output */ :"r" (0UL)); ++} ++ ++void kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code) ++{ ++ /* Master processor is completely in the debugger */ ++ gdb_x86_64vector = e_vector; ++ gdb_x86_64errcode = err_code; ++} ++ ++void kgdb_roundup_cpus(unsigned long flags) ++{ ++ send_IPI_allbutself(APIC_DM_NMI); ++} ++ ++int kgdb_arch_handle_exception(int e_vector, int signo, int err_code, ++ char *remcomInBuffer, char *remcomOutBuffer, ++ struct pt_regs *linux_regs) ++{ ++ unsigned long addr, length; ++ unsigned long breakno, breaktype; ++ char *ptr; ++ int newPC; ++ unsigned long dr6; ++ ++ switch (remcomInBuffer[0]) { ++ case 'c': ++ case 's': ++ /* try to read optional parameter, pc unchanged if no parm */ ++ ptr = &remcomInBuffer[1]; ++ if (kgdb_hex2long(&ptr, &addr)) ++ linux_regs->rip = addr; ++ newPC = linux_regs->rip; ++ ++ /* clear the trace bit */ ++ linux_regs->eflags &= ~TF_MASK; ++ ++ atomic_set(&cpu_doing_single_step, -1); ++ /* set the trace bit if we're stepping */ ++ if (remcomInBuffer[0] == 's') { ++ linux_regs->eflags |= TF_MASK; ++ debugger_step = 1; ++ if (kgdb_contthread) ++ atomic_set(&cpu_doing_single_step, ++ smp_processor_id()); ++ ++ } ++ ++ asm volatile ("movq %%db6, %0\n":"=r" (dr6)); ++ if (!(dr6 & 0x4000)) { ++ for (breakno = 0; breakno < 4; ++breakno) { ++ if (dr6 & (1 << breakno)) { ++ if (breakinfo[breakno].type == 0) { ++ /* Set restore flag */ ++ linux_regs->eflags |= ++ X86_EFLAGS_RF; ++ break; ++ } ++ } ++ } ++ } ++ kgdb_correct_hw_break(); ++ asm volatile ("movq %0, %%db6\n"::"r" (0UL)); ++ ++ return (0); ++ ++ case 'Y': ++ ptr = &remcomInBuffer[1]; ++ kgdb_hex2long(&ptr, &breakno); ++ ptr++; ++ kgdb_hex2long(&ptr, &breaktype); ++ ptr++; ++ kgdb_hex2long(&ptr, &length); ++ ptr++; ++ kgdb_hex2long(&ptr, &addr); ++ if (set_hw_break(breakno & 0x3, breaktype & 0x3, ++ length & 0x3, addr) == 0) ++ strcpy(remcomOutBuffer, "OK"); ++ else ++ strcpy(remcomOutBuffer, "ERROR"); ++ break; ++ ++ /* Remove hardware breakpoint */ ++ case 'y': ++ ptr = &remcomInBuffer[1]; ++ kgdb_hex2long(&ptr, &breakno); ++ if (remove_hw_break(breakno & 0x3) == 0) ++ strcpy(remcomOutBuffer, "OK"); ++ else ++ 
strcpy(remcomOutBuffer, "ERROR"); ++ break; ++ ++ } /* switch */ ++ return -1; ++} ++ ++static struct pt_regs *in_interrupt_stack(unsigned long rsp, int cpu) ++{ ++ struct pt_regs *regs; ++ unsigned long end = (unsigned long)cpu_pda(cpu)->irqstackptr; ++ if (rsp <= end && rsp >= end - IRQSTACKSIZE + 8) { ++ regs = *(((struct pt_regs **)end) - 1); ++ return regs; ++ } ++ return NULL; ++} ++ ++static struct pt_regs *in_exception_stack(unsigned long rsp, int cpu) ++{ ++ int i; ++ struct tss_struct *init_tss = &__get_cpu_var(init_tss); ++ for (i = 0; i < N_EXCEPTION_STACKS; i++) ++ if (rsp >= init_tss[cpu].ist[i] && ++ rsp <= init_tss[cpu].ist[i] + EXCEPTION_STKSZ) { ++ struct pt_regs *r = ++ (void *)init_tss[cpu].ist[i] + EXCEPTION_STKSZ; ++ return r - 1; ++ } ++ return NULL; ++} ++ ++void kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid) ++{ ++ static char intr_desc[] = "Stack at interrupt entrypoint"; ++ static char exc_desc[] = "Stack at exception entrypoint"; ++ struct pt_regs *stregs; ++ int cpu = hard_smp_processor_id(); ++ ++ if ((stregs = in_interrupt_stack(regs->rsp, cpu))) ++ kgdb_mem2hex(intr_desc, buffer, strlen(intr_desc)); ++ else if ((stregs = in_exception_stack(regs->rsp, cpu))) ++ kgdb_mem2hex(exc_desc, buffer, strlen(exc_desc)); ++} ++ ++struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs, int threadid) ++{ ++ struct pt_regs *stregs; ++ int cpu = hard_smp_processor_id(); ++ ++ if ((stregs = in_interrupt_stack(regs->rsp, cpu))) ++ return current; ++ else if ((stregs = in_exception_stack(regs->rsp, cpu))) ++ return current; ++ ++ return NULL; ++} ++ ++struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid) ++{ ++ struct pt_regs *stregs; ++ int cpu = hard_smp_processor_id(); ++ ++ if ((stregs = in_interrupt_stack(regs->rsp, cpu))) ++ return stregs; ++ else if ((stregs = in_exception_stack(regs->rsp, cpu))) ++ return stregs; ++ ++ return NULL; ++} ++ ++/* Register KGDB with the die_chain so that we hook into all of the right ++ * spots. */ ++static int kgdb_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ struct die_args *args = ptr; ++ struct pt_regs *regs = args->regs; ++ ++ if (cmd == DIE_PAGE_FAULT_NO_CONTEXT && atomic_read(&debugger_active) ++ && kgdb_may_fault) { ++ kgdb_fault_longjmp(kgdb_fault_jmp_regs); ++ return NOTIFY_STOP; ++ /* CPU roundup? */ ++ } else if (atomic_read(&debugger_active) && cmd == DIE_NMI_IPI) { ++ kgdb_nmihook(smp_processor_id(), regs); ++ return NOTIFY_STOP; ++ /* See if KGDB is interested. */ ++ } else if (cmd == DIE_PAGE_FAULT || user_mode(regs) || ++ cmd == DIE_NMI_IPI || (cmd == DIE_DEBUG && ++ atomic_read(&debugger_active))) ++ /* Userspace events, normal watchdog event, or spurious ++ * debug exception. Ignore. */ ++ return NOTIFY_DONE; ++ ++ kgdb_handle_exception(args->trapnr, args->signr, args->err, regs); ++ ++ return NOTIFY_STOP; ++} ++ ++static struct notifier_block kgdb_notifier = { ++ .notifier_call = kgdb_notify, ++ .priority = 0x7fffffff, /* we need to be notified first */ ++}; ++ ++int kgdb_arch_init(void) ++{ ++ atomic_notifier_chain_register(&die_chain, &kgdb_notifier); ++ return 0; ++} ++/* ++ * Skip an int3 exception when it occurs after a breakpoint has been ++ * removed. Backtrack eip by 1 since the int3 would have caused it to ++ * increment by 1.
++ */ ++ ++int kgdb_skipexception(int exception, struct pt_regs *regs) ++{ ++ if (exception == 3 && kgdb_isremovedbreak(regs->rip - 1)) { ++ regs->rip -= 1; ++ return 1; ++ } ++ return 0; ++} ++ ++struct kgdb_arch arch_kgdb_ops = { ++ .gdb_bpt_instr = {0xcc}, ++ .flags = KGDB_HW_BREAKPOINT, ++ .shadowth = 1, ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/arch/x86_64/mm/fault.c linux-2.6.18.kgdb/arch/x86_64/mm/fault.c +--- linux-2.6.18/arch/x86_64/mm/fault.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/arch/x86_64/mm/fault.c 2008-06-10 16:19:36.000000000 +0400 +@@ -557,6 +557,10 @@ no_context: + if (is_errata93(regs, address)) + return; + ++ if (notify_die(DIE_PAGE_FAULT_NO_CONTEXT, "no context", regs, ++ error_code, 14, SIGSEGV) == NOTIFY_STOP) ++ return; ++ + /* + * Oops. The kernel tried to access some bad page. We'll have to + * terminate things with extreme prejudice. +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/char/keyboard.c linux-2.6.18.kgdb/drivers/char/keyboard.c +--- linux-2.6.18/drivers/char/keyboard.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/char/keyboard.c 2008-06-10 16:20:02.000000000 +0400 +@@ -1174,6 +1174,7 @@ static void kbd_keycode(unsigned int key + sysrq_down = 0; + if (sysrq_down && down && !rep) { + handle_sysrq(kbd_sysrq_xlate[keycode], regs, tty); ++ sysrq_down = 0; /* In case we miss the 'up' event. */ + return; + } + #endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/net/Makefile linux-2.6.18.kgdb/drivers/net/Makefile +--- linux-2.6.18/drivers/net/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/net/Makefile 2008-06-10 16:19:13.000000000 +0400 +@@ -216,6 +216,7 @@ obj-$(CONFIG_ETRAX_ETHERNET) += cris/ + obj-$(CONFIG_ENP2611_MSF_NET) += ixp2000/ + + obj-$(CONFIG_NETCONSOLE) += netconsole.o ++obj-$(CONFIG_KGDBOE) += kgdboe.o + + obj-$(CONFIG_FS_ENET) += fs_enet/ + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/net/kgdboe.c linux-2.6.18.kgdb/drivers/net/kgdboe.c +--- linux-2.6.18/drivers/net/kgdboe.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/drivers/net/kgdboe.c 2008-06-10 16:19:13.000000000 +0400 +@@ -0,0 +1,294 @@ ++/* ++ * drivers/net/kgdboe.c ++ * ++ * A network interface for GDB. ++ * Based upon 'gdbserial' by David Grothe ++ * and Scott Foehner ++ * ++ * Maintainers: Amit S. Kale and ++ * Tom Rini ++ * ++ * 2004 (c) Amit S. Kale ++ * 2004-2005 (c) MontaVista Software, Inc. ++ * 2005 (c) Wind River Systems, Inc. ++ * ++ * Contributors at various stages not listed above: ++ * San Mehat , Robert Walsh , ++ * wangdi , Matt Mackall , ++ * Pavel Machek , Jason Wessel ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#define IN_BUF_SIZE 512 /* power of 2, please */ ++#define NOT_CONFIGURED_STRING "not_configured" ++#define OUT_BUF_SIZE 30 /* We don't want to send too big of a packet. */ ++#define MAX_KGDBOE_CONFIG_STR 256 ++ ++static char in_buf[IN_BUF_SIZE], out_buf[OUT_BUF_SIZE]; ++static int in_head, in_tail, out_count; ++static atomic_t in_count; ++/* 0 = unconfigured, 1 = netpoll options parsed, 2 = fully configured. 
*/ ++static int configured; ++static struct kgdb_io local_kgdb_io_ops; ++static int use_dynamic_mac; ++ ++MODULE_DESCRIPTION("KGDB driver for network interfaces"); ++MODULE_LICENSE("GPL"); ++static char config[MAX_KGDBOE_CONFIG_STR] = NOT_CONFIGURED_STRING; ++static struct kparam_string kps = { ++ .string = config, ++ .maxlen = MAX_KGDBOE_CONFIG_STR, ++}; ++ ++static void rx_hook(struct netpoll *np, int port, char *msg, int len, ++ struct sk_buff *skb) ++{ ++ int i; ++ ++ np->remote_port = port; ++ ++ /* Copy the MAC address if we need to. */ ++ if (use_dynamic_mac) { ++ memcpy(np->remote_mac, eth_hdr(skb)->h_source, ++ sizeof(np->remote_mac)); ++ use_dynamic_mac = 0; ++ } ++ ++ /* ++ * This could be GDB trying to attach. But it could also be GDB ++ * finishing up a session, with kgdb_connected=0 but GDB sending ++ * an ACK for the final packet. To make sure we don't try and ++ * make a breakpoint when GDB is leaving, make sure that if ++ * !kgdb_connected the only len == 1 packet we allow is ^C. ++ */ ++ if (!kgdb_connected && (len != 1 || msg[0] == 3) && ++ !atomic_read(&kgdb_setting_breakpoint)) { ++ tasklet_schedule(&kgdb_tasklet_breakpoint); ++ } ++ ++ for (i = 0; i < len; i++) { ++ if (msg[i] == 3) ++ tasklet_schedule(&kgdb_tasklet_breakpoint); ++ ++ if (atomic_read(&in_count) >= IN_BUF_SIZE) { ++ /* buffer overflow, clear it */ ++ in_head = in_tail = 0; ++ atomic_set(&in_count, 0); ++ break; ++ } ++ in_buf[in_head++] = msg[i]; ++ in_head &= (IN_BUF_SIZE - 1); ++ atomic_inc(&in_count); ++ } ++} ++ ++static struct netpoll np = { ++ .dev_name = "eth0", ++ .name = "kgdboe", ++ .rx_hook = rx_hook, ++ .local_port = 6443, ++ .remote_port = 6442, ++ .remote_mac = { 0xff, 0xff, 0xff, 0xff, 0xff, 0xff }, ++}; ++ ++static void eth_pre_exception_handler(void) ++{ ++ /* Increment the module count when the debugger is active */ ++ if (!kgdb_connected) ++ try_module_get(THIS_MODULE); ++ netpoll_set_trap(1); ++} ++ ++static void eth_post_exception_handler(void) ++{ ++ /* decrement the module count when the debugger detaches */ ++ if (!kgdb_connected) ++ module_put(THIS_MODULE); ++ netpoll_set_trap(0); ++} ++ ++static int eth_get_char(void) ++{ ++ int chr; ++ ++ while (atomic_read(&in_count) == 0) ++ netpoll_poll(&np); ++ ++ chr = in_buf[in_tail++]; ++ in_tail &= (IN_BUF_SIZE - 1); ++ atomic_dec(&in_count); ++ return chr; ++} ++ ++static void eth_flush_buf(void) ++{ ++ if (out_count && np.dev) { ++ netpoll_send_udp(&np, out_buf, out_count); ++ memset(out_buf, 0, sizeof(out_buf)); ++ out_count = 0; ++ } ++} ++ ++static void eth_put_char(u8 chr) ++{ ++ out_buf[out_count++] = chr; ++ if (out_count == OUT_BUF_SIZE) ++ eth_flush_buf(); ++} ++ ++static int option_setup(char *opt) ++{ ++ char opt_scratch[MAX_KGDBOE_CONFIG_STR]; ++ ++ /* If we're being given a new configuration, copy it in. */ ++ if (opt != config) ++ strcpy(config, opt); ++ /* But work on a copy as netpoll_parse_options will eat it. */ ++ strcpy(opt_scratch, opt); ++ configured = !netpoll_parse_options(&np, opt_scratch); ++ ++ use_dynamic_mac = 1; ++ ++ return 0; ++} ++__setup("kgdboe=", option_setup); ++ ++/* With our config string set by some means, configure kgdboe. */ ++static int configure_kgdboe(void) ++{ ++ /* Try out the string. */ ++ option_setup(config); ++ ++ if (!configured) { ++ printk(KERN_ERR "kgdboe: configuration incorrect - kgdboe not " ++ "loaded.\n"); ++ printk(KERN_ERR " Usage: kgdboe=[src-port]@[src-ip]/[dev]," ++ "[tgt-port]@/\n"); ++ return -EINVAL; ++ } ++ ++ /* Bring it up. 
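++ * For example (illustrative values only, not part of this patch):
++ * kgdboe=@192.168.0.1/eth0,@192.168.0.2/00:11:22:33:44:55
++ * would debug over eth0 toward that host; the UDP ports default to
++ * 6443 (local) and 6442 (remote) from the np definition above.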
*/
++ if (netpoll_setup(&np)) {
++ printk(KERN_ERR "kgdboe: netpoll_setup failed, kgdboe not "
++ "loaded\n");
++ return -EINVAL;
++ }
++
++ if (kgdb_register_io_module(&local_kgdb_io_ops)) {
++ netpoll_cleanup(&np);
++ return -EINVAL;
++ }
++
++ configured = 2;
++
++ return 0;
++}
++
++static int init_kgdboe(void)
++{
++ int ret;
++
++ /* Already done? */
++ if (configured == 2)
++ return 0;
++
++ /* OK, go ahead and do it. */
++ ret = configure_kgdboe();
++
++ if (configured == 2)
++ printk(KERN_INFO "kgdboe: debugging over ethernet enabled\n");
++
++ return ret;
++}
++
++static void cleanup_kgdboe(void)
++{
++ netpoll_cleanup(&np);
++ configured = 0;
++ kgdb_unregister_io_module(&local_kgdb_io_ops);
++}
++
++static int param_set_kgdboe_var(const char *kmessage, struct kernel_param *kp)
++{
++ char kmessage_save[MAX_KGDBOE_CONFIG_STR];
++ int msg_len = strlen(kmessage);
++
++ if (msg_len + 1 > MAX_KGDBOE_CONFIG_STR) {
++ printk(KERN_ERR "%s: string doesn't fit in %u chars.\n",
++ kp->name, MAX_KGDBOE_CONFIG_STR - 1);
++ return -ENOSPC;
++ }
++
++ if (kgdb_connected) {
++ printk(KERN_ERR "kgdboe: Cannot reconfigure while KGDB is "
++ "connected.\n");
++ return 0;
++ }
++
++ /* Start the reconfiguration process by saving the old string */
++ strncpy(kmessage_save, config, sizeof(kmessage_save));
++
++ /* Copy in the new param and strip out invalid characters so we
++ * can optionally specify the MAC.
++ */
++ strncpy(config, kmessage, sizeof(config));
++ msg_len--;
++ while (msg_len > 0 &&
++ (config[msg_len] < ',' || config[msg_len] > 'f')) {
++ config[msg_len] = '\0';
++ msg_len--;
++ }
++
++ /* Check to see if we are unconfiguring the io module and that it
++ * was in a fully configured state, as this is the only time that
++ * netpoll_cleanup should get called.
++ */
++ if (configured == 2 && strcmp(config, NOT_CONFIGURED_STRING) == 0) {
++ printk(KERN_INFO "kgdboe: reverting to unconfigured state\n");
++ cleanup_kgdboe();
++ return 0;
++ } else
++ /* Go and configure with the new params. */
++ configure_kgdboe();
++
++ if (configured == 2)
++ return 0;
++
++ /* If the new string was invalid, revert to the previous state, which
++ * is at a minimum not_configured. */
++ strncpy(config, kmessage_save, sizeof(config));
++ if (strcmp(kmessage_save, NOT_CONFIGURED_STRING) != 0) {
++ printk(KERN_INFO "kgdboe: reverting to prior configuration\n");
++ configure_kgdboe();
++ }
++ return 0;
++}
++
++static struct kgdb_io local_kgdb_io_ops = {
++ .read_char = eth_get_char,
++ .write_char = eth_put_char,
++ .init = init_kgdboe,
++ .flush = eth_flush_buf,
++ .pre_exception = eth_pre_exception_handler,
++ .post_exception = eth_post_exception_handler
++};
++
++module_init(init_kgdboe);
++module_exit(cleanup_kgdboe);
++module_param_call(kgdboe, param_set_kgdboe_var, param_get_string, &kps, 0644);
++MODULE_PARM_DESC(kgdboe, " kgdboe=[src-port]@[src-ip]/[dev],"
++ "[tgt-port]@<tgt-ip>/<tgt-macaddr>\n");
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/8250.c linux-2.6.18.kgdb/drivers/serial/8250.c
+--- linux-2.6.18/drivers/serial/8250.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/8250.c 2008-06-10 16:19:03.000000000 +0400
+@@ -2628,6 +2628,25 @@ void serial8250_unregister_port(int line
+ }
+ EXPORT_SYMBOL(serial8250_unregister_port);
+
++/**
++ * serial8250_unregister_by_port - remove a 16x50 serial port
++ * at runtime.
++ * @port: A &struct uart_port that describes the port to remove.
++ * ++ * Remove one serial port. This may not be called from interrupt ++ * context. We hand the port back to the our control. ++ */ ++void serial8250_unregister_by_port(struct uart_port *port) ++{ ++ struct uart_8250_port *uart; ++ ++ uart = serial8250_find_match_or_unused(port); ++ ++ if (uart) ++ serial8250_unregister_port(uart->port.line); ++} ++EXPORT_SYMBOL(serial8250_unregister_by_port); ++ + static int __init serial8250_init(void) + { + int ret, i; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/8250_kgdb.c linux-2.6.18.kgdb/drivers/serial/8250_kgdb.c +--- linux-2.6.18/drivers/serial/8250_kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/drivers/serial/8250_kgdb.c 2008-06-10 16:19:03.000000000 +0400 +@@ -0,0 +1,516 @@ ++/* ++ * 8250 interface for kgdb. ++ * ++ * This is a merging of many different drivers, and all of the people have ++ * had an impact in some form or another: ++ * ++ * 2004-2005 (c) MontaVista Software, Inc. ++ * 2005-2006 (c) Wind River Systems, Inc. ++ * ++ * Amit Kale , David Grothe , ++ * Scott Foehner , George Anzinger , ++ * Robert Walsh , wangdi , ++ * San Mehat, Tom Rini , ++ * Jason Wessel ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include /* For BASE_BAUD and SERIAL_PORT_DFNS */ ++ ++#include "8250.h" ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++MODULE_DESCRIPTION("KGDB driver for the 8250"); ++MODULE_LICENSE("GPL"); ++/* These will conflict with early_param otherwise. */ ++#ifdef CONFIG_KGDB_8250_MODULE ++static char config[256]; ++module_param_string(kgdb8250, config, 256, 0); ++MODULE_PARM_DESC(kgdb8250, ++ " kgdb8250=,
,,\n"); ++static struct kgdb_io local_kgdb_io_ops; ++#endif /* CONFIG_KGDB_8250_MODULE */ ++ ++/* Speed of the UART. */ ++static int kgdb8250_baud; ++ ++/* Flag for if we need to call request_mem_region */ ++static int kgdb8250_needs_request_mem_region; ++ ++static char kgdb8250_buf[GDB_BUF_SIZE]; ++static atomic_t kgdb8250_buf_in_cnt; ++static int kgdb8250_buf_out_inx; ++ ++/* Old-style serial definitions, if existant, and a counter. */ ++#ifdef CONFIG_KGDB_SIMPLE_SERIAL ++static int __initdata should_copy_rs_table = 1; ++static struct serial_state old_rs_table[] __initdata = { ++#ifdef SERIAL_PORT_DFNS ++ SERIAL_PORT_DFNS ++#endif ++}; ++#endif ++ ++/* Our internal table of UARTS. */ ++#define UART_NR CONFIG_SERIAL_8250_NR_UARTS ++static struct uart_port kgdb8250_ports[UART_NR]; ++ ++static struct uart_port *current_port; ++ ++/* Base of the UART. */ ++static void *kgdb8250_addr; ++ ++/* Forward declarations. */ ++static int kgdb8250_uart_init(void); ++static int __init kgdb_init_io(void); ++static int __init kgdb8250_opt(char *str); ++ ++/* These are much shorter calls to ioread8/iowrite8 that take into ++ * account our shifts, etc. */ ++static inline unsigned int kgdb_ioread(u8 mask) ++{ ++ return ioread8(kgdb8250_addr + (mask << current_port->regshift)); ++} ++ ++static inline void kgdb_iowrite(u8 val, u8 mask) ++{ ++ iowrite8(val, kgdb8250_addr + (mask << current_port->regshift)); ++} ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void kgdb_put_debug_char(u8 chr) ++{ ++ while (!(kgdb_ioread(UART_LSR) & UART_LSR_THRE)) ; ++ ++ kgdb_iowrite(chr, UART_TX); ++} ++ ++/* ++ * Get a byte from the hardware data buffer and return it ++ */ ++static int read_data_bfr(void) ++{ ++ char it = kgdb_ioread(UART_LSR); ++ ++ if (it & UART_LSR_DR) ++ return kgdb_ioread(UART_RX); ++ ++ /* ++ * If we have a framing error assume somebody messed with ++ * our uart. Reprogram it and send '-' both ways... ++ */ ++ if (it & 0xc) { ++ kgdb8250_uart_init(); ++ kgdb_put_debug_char('-'); ++ return '-'; ++ } ++ ++ return -1; ++} ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ */ ++static int kgdb_get_debug_char(void) ++{ ++ int retchr; ++ ++ /* intr routine has q'd chars */ ++ if (atomic_read(&kgdb8250_buf_in_cnt) != 0) { ++ retchr = kgdb8250_buf[kgdb8250_buf_out_inx++]; ++ kgdb8250_buf_out_inx &= (GDB_BUF_SIZE - 1); ++ atomic_dec(&kgdb8250_buf_in_cnt); ++ return retchr; ++ } ++ ++ do { ++ retchr = read_data_bfr(); ++ } while (retchr < 0); ++ ++ return retchr; ++} ++ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * All that we need to do is verify that the interrupt happened on the ++ * line we're in charge of. If this is true, schedule a breakpoint and ++ * return. ++ */ ++static irqreturn_t ++kgdb8250_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ if (kgdb_ioread(UART_IIR) & UART_IIR_RDI) { ++ /* Throw away the data if another I/O routine is active. */ ++ if (kgdb_io_ops.read_char != kgdb_get_debug_char && ++ (kgdb_ioread(UART_LSR) & UART_LSR_DR)) ++ kgdb_ioread(UART_RX); ++ else ++ breakpoint(); ++ } ++ ++ return IRQ_HANDLED; ++} ++ ++/* ++ * Initializes the UART. ++ * Returns: ++ * 0 on success, 1 on failure. ++ */ ++static int ++kgdb8250_uart_init (void) ++{ ++ unsigned int ier, base_baud = current_port->uartclk ? 
++ current_port->uartclk / 16 : BASE_BAUD;
++
++ /* test UART existence */
++ if(kgdb_ioread(UART_LSR) == 0xff)
++ return -1;
++
++ /* disable interrupts */
++ kgdb_iowrite(0, UART_IER);
++
++#if defined(CONFIG_ARCH_OMAP1510)
++ /* Workaround to enable 115200 baud on OMAP1510 internal ports */
++ if (cpu_is_omap1510() && is_omap_port((void *)kgdb8250_addr)) {
++ if (kgdb8250_baud == 115200) {
++ base_baud = 1;
++ kgdb8250_baud = 1;
++ kgdb_iowrite(1, UART_OMAP_OSC_12M_SEL);
++ } else
++ kgdb_iowrite(0, UART_OMAP_OSC_12M_SEL);
++ }
++#endif
++ /* set DLAB */
++ kgdb_iowrite(UART_LCR_DLAB, UART_LCR);
++
++ /* set baud */
++ kgdb_iowrite((base_baud / kgdb8250_baud) & 0xff, UART_DLL);
++ kgdb_iowrite((base_baud / kgdb8250_baud) >> 8, UART_DLM);
++
++ /* reset DLAB, set LCR */
++ kgdb_iowrite(UART_LCR_WLEN8, UART_LCR);
++
++ /* set DTR and RTS */
++ kgdb_iowrite(UART_MCR_OUT2 | UART_MCR_DTR | UART_MCR_RTS, UART_MCR);
++
++ /* setup fifo */
++ kgdb_iowrite(UART_FCR_ENABLE_FIFO | UART_FCR_CLEAR_RCVR
++ | UART_FCR_CLEAR_XMIT | UART_FCR_TRIGGER_8,
++ UART_FCR);
++
++ /* clear pending interrupts */
++ kgdb_ioread(UART_IIR);
++ kgdb_ioread(UART_RX);
++ kgdb_ioread(UART_LSR);
++ kgdb_ioread(UART_MSR);
++
++ /* turn on RX interrupt only */
++ kgdb_iowrite(UART_IER_RDI, UART_IER);
++
++ /*
++ * Borrowed from the main 8250 driver.
++ * Try writing and reading the UART_IER_UUE bit (b6).
++ * If it works, this is probably one of the Xscale platform's
++ * internal UARTs.
++ * We're going to explicitly set the UUE bit to 0 before
++ * trying to write and read a 1 just to make sure it's not
++ * already a 1 and maybe locked there before we even start.
++ */
++ ier = kgdb_ioread(UART_IER);
++ kgdb_iowrite(ier & ~UART_IER_UUE, UART_IER);
++ if (!(kgdb_ioread(UART_IER) & UART_IER_UUE)) {
++ /*
++ * OK it's in a known zero state, try writing and reading
++ * without disturbing the current state of the other bits.
++ */
++ kgdb_iowrite(ier | UART_IER_UUE, UART_IER);
++ if (kgdb_ioread(UART_IER) & UART_IER_UUE)
++ /*
++ * It's an Xscale.
++ */
++ ier |= UART_IER_UUE | UART_IER_RTOIE;
++ }
++ kgdb_iowrite(ier, UART_IER);
++ return 0;
++}
++
++/*
++ * Copy the old serial_state table to our uart_port table if we haven't
++ * had values specifically configured in. We need to make sure this only
++ * happens once.
++ */
++static void __init kgdb8250_copy_rs_table(void)
++{
++#ifdef CONFIG_KGDB_SIMPLE_SERIAL
++ int i;
++
++ if (!should_copy_rs_table)
++ return;
++
++ for (i = 0; i < ARRAY_SIZE(old_rs_table); i++) {
++ kgdb8250_ports[i].iobase = old_rs_table[i].port;
++ kgdb8250_ports[i].irq = irq_canonicalize(old_rs_table[i].irq);
++ kgdb8250_ports[i].uartclk = old_rs_table[i].baud_base * 16;
++ kgdb8250_ports[i].membase = old_rs_table[i].iomem_base;
++ kgdb8250_ports[i].iotype = old_rs_table[i].io_type;
++ kgdb8250_ports[i].regshift = old_rs_table[i].iomem_reg_shift;
++ kgdb8250_ports[i].line = i;
++ }
++
++ should_copy_rs_table = 0;
++#endif
++}
++
++/*
++ * Hookup our IRQ line now that it is safe to do so, after we grab any
++ * memory regions we might need to. If we haven't been initialized yet,
++ * go ahead and copy the old_rs_table in.
++ */
++static void __init kgdb8250_late_init(void)
++{
++ /* Try and copy the old_rs_table. */
++ kgdb8250_copy_rs_table();
++
++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE)
++ /* Take the port away from the main driver. */
++ serial8250_unregister_by_port(current_port);
++
++ /* Now reinit the port as the above has disabled things.
*/ ++ kgdb8250_uart_init(); ++#endif ++ /* We may need to call request_mem_region() first. */ ++ if (kgdb8250_needs_request_mem_region) ++ request_mem_region(current_port->mapbase, ++ 8 << current_port->regshift, "kgdb"); ++ if (request_irq(current_port->irq, kgdb8250_interrupt, SA_SHIRQ, ++ "GDB-stub", current_port) < 0) ++ printk(KERN_ERR "KGDB failed to request the serial IRQ (%d)\n", ++ current_port->irq); ++} ++ ++static __init int kgdb_init_io(void) ++{ ++ /* Give us the basic table of uarts. */ ++ kgdb8250_copy_rs_table(); ++ ++ /* We're either a module and parse a config string, or we have a ++ * semi-static config. */ ++#ifdef CONFIG_KGDB_8250_MODULE ++ if (strlen(config)) { ++ if (kgdb8250_opt(config)) ++ return -EINVAL; ++ } else { ++ printk(KERN_ERR "kgdb8250: argument error, usage: " ++ "kgdb8250=,
,,\n"); ++ return -EINVAL; ++ } ++#elif defined(CONFIG_KGDB_SIMPLE_SERIAL) ++ kgdb8250_baud = CONFIG_KGDB_BAUDRATE; ++ ++ /* Setup our pointer to the serial port now. */ ++ current_port = &kgdb8250_ports[CONFIG_KGDB_PORT_NUM]; ++#else ++ if (kgdb8250_opt(CONFIG_KGDB_8250_CONF_STRING)) ++ return -EINVAL; ++#endif ++ ++ ++ /* Internal driver setup. */ ++ switch (current_port->iotype) { ++ case UPIO_MEM: ++ if (current_port->mapbase) ++ kgdb8250_needs_request_mem_region = 1; ++ if (current_port->flags & UPF_IOREMAP) { ++ current_port->membase = ioremap(current_port->mapbase, ++ 8 << current_port->regshift); ++ if (!current_port->membase) ++ return -EIO; /* Failed. */ ++ } ++ kgdb8250_addr = current_port->membase; ++ break; ++ case UPIO_PORT: ++ default: ++ kgdb8250_addr = ioport_map(current_port->iobase, ++ 8 << current_port->regshift); ++ if (!kgdb8250_addr) ++ return -EIO; /* Failed. */ ++ } ++ ++ if (kgdb8250_uart_init() == -1) { ++ printk(KERN_ERR "kgdb8250: init failed\n"); ++ return -EIO; ++ } ++#ifdef CONFIG_KGDB_8250_MODULE ++ /* Attach the kgdb irq. When this is built into the kernel, it ++ * is called as a part of late_init sequence. ++ */ ++ kgdb8250_late_init(); ++ if (kgdb_register_io_module(&local_kgdb_io_ops)) ++ return -EINVAL; ++ ++ printk(KERN_INFO "kgdb8250: debugging enabled\n"); ++#endif /* CONFIG_KGD_8250_MODULE */ ++ ++ return 0; ++} ++ ++#ifdef CONFIG_KGDB_8250_MODULE ++/* If it is a module the kgdb_io_ops should be a static which ++ * is passed to the KGDB I/O initialization ++ */ ++static struct kgdb_io local_kgdb_io_ops = { ++#else /* ! CONFIG_KGDB_8250_MODULE */ ++struct kgdb_io kgdb_io_ops = { ++#endif /* ! CONFIG_KGD_8250_MODULE */ ++ .read_char = kgdb_get_debug_char, ++ .write_char = kgdb_put_debug_char, ++ .init = kgdb_init_io, ++ .late_init = kgdb8250_late_init, ++}; ++ ++/** ++ * kgdb8250_add_port - Define a serial port for use with KGDB ++ * @i: The index of the port being added ++ * @serial_req: The &struct uart_port describing the port ++ * ++ * On platforms where we must register the serial device ++ * dynamically, this is the best option if a platform also normally ++ * calls early_serial_setup(). ++ */ ++void __init kgdb8250_add_port(int i, struct uart_port *serial_req) ++{ ++ /* Make sure we've got the built-in data before we override. */ ++ kgdb8250_copy_rs_table(); ++ ++ /* Copy the whole thing over. */ ++ if (current_port != &kgdb8250_ports[i]) ++ memcpy(&kgdb8250_ports[i], serial_req, sizeof(struct uart_port)); ++} ++ ++/** ++ * kgdb8250_add_platform_port - Define a serial port for use with KGDB ++ * @i: The index of the port being added ++ * @p: The &struct plat_serial8250_port describing the port ++ * ++ * On platforms where we must register the serial device ++ * dynamically, this is the best option if a platform normally ++ * handles uart setup with an array of &struct plat_serial8250_port. ++ */ ++void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *p) ++{ ++ /* Make sure we've got the built-in data before we override. */ ++ kgdb8250_copy_rs_table(); ++ ++ kgdb8250_ports[i].iobase = p->iobase; ++ kgdb8250_ports[i].membase = p->membase; ++ kgdb8250_ports[i].irq = p->irq; ++ kgdb8250_ports[i].uartclk = p->uartclk; ++ kgdb8250_ports[i].regshift = p->regshift; ++ kgdb8250_ports[i].iotype = p->iotype; ++ kgdb8250_ports[i].flags = p->flags; ++ kgdb8250_ports[i].mapbase = p->mapbase; ++} ++ ++/* ++ * Syntax for this cmdline option is: ++ * kgdb8250=,
,," ++ */ ++static int __init kgdb8250_opt(char *str) ++{ ++ /* We'll fill out and use the first slot. */ ++ current_port = &kgdb8250_ports[0]; ++ ++ if (!strncmp(str, "io", 2)) { ++ current_port->iotype = UPIO_PORT; ++ str += 2; ++ } else if (!strncmp(str, "mmap", 4)) { ++ current_port->iotype = UPIO_MEM; ++ current_port->flags |= UPF_IOREMAP; ++ str += 4; ++ } else if (!strncmp(str, "mmio", 4)) { ++ current_port->iotype = UPIO_MEM; ++ current_port->flags &= ~UPF_IOREMAP; ++ str += 4; ++ } else ++ goto errout; ++ ++ if (*str != ',') ++ goto errout; ++ str++; ++ ++ if (current_port->iotype == UPIO_PORT) ++ current_port->iobase = simple_strtoul(str, &str, 16); ++ else { ++ if (current_port->flags & UPF_IOREMAP) ++ current_port->mapbase = ++ (unsigned long) simple_strtoul(str, &str, 16); ++ else ++ current_port->membase = ++ (void *) simple_strtoul(str, &str, 16); ++ } ++ ++ if (*str != ',') ++ goto errout; ++ str++; ++ ++ kgdb8250_baud = simple_strtoul(str, &str, 10); ++ if (!kgdb8250_baud) ++ goto errout; ++ ++ if (*str != ',') ++ goto errout; ++ str++; ++ ++ current_port->irq = simple_strtoul(str, &str, 10); ++ ++#ifdef CONFIG_KGDB_SIMPLE_SERIAL ++ should_copy_rs_table = 0; ++#endif ++ ++ return 0; ++ ++ errout: ++ printk(KERN_ERR "Invalid syntax for option kgdb8250=\n"); ++ return 1; ++} ++ ++#ifdef CONFIG_KGDB_8250_MODULE ++static void cleanup_kgdb8250(void) ++{ ++ kgdb_unregister_io_module(&local_kgdb_io_ops); ++ ++ /* Clean up the irq and memory */ ++ free_irq(current_port->irq, current_port); ++ ++ if (kgdb8250_needs_request_mem_region) ++ release_mem_region(current_port->mapbase, ++ 8 << current_port->regshift); ++ /* Hook up the serial port back to what it was previously ++ * hooked up to. ++ */ ++#if defined(CONFIG_SERIAL_8250) || defined(CONFIG_SERIAL_8250_MODULE) ++ /* Give the port back to the 8250 driver. */ ++ serial8250_register_port(current_port); ++#endif ++} ++ ++module_init(kgdb_init_io); ++module_exit(cleanup_kgdb8250); ++#else /* ! CONFIG_KGDB_8250_MODULE */ ++early_param("kgdb8250", kgdb8250_opt); ++#endif /* ! 
CONFIG_KGDB_8250_MODULE */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/Kconfig linux-2.6.18.kgdb/drivers/serial/Kconfig +--- linux-2.6.18/drivers/serial/Kconfig 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/Kconfig 2008-06-10 16:19:03.000000000 +0400 +@@ -106,7 +106,7 @@ config SERIAL_8250_CS + + config SERIAL_8250_NR_UARTS + int "Maximum number of 8250/16550 serial ports" +- depends on SERIAL_8250 ++ depends on SERIAL_8250 || KGDB_8250 + default "4" + help + Set this to the number of serial ports you want the driver +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/Makefile linux-2.6.18.kgdb/drivers/serial/Makefile +--- linux-2.6.18/drivers/serial/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/Makefile 2008-06-10 16:19:22.000000000 +0400 +@@ -47,6 +47,7 @@ obj-$(CONFIG_SERIAL_IMX) += imx.o + obj-$(CONFIG_SERIAL_MPC52xx) += mpc52xx_uart.o + obj-$(CONFIG_SERIAL_ICOM) += icom.o + obj-$(CONFIG_SERIAL_M32R_SIO) += m32r_sio.o ++obj-$(CONFIG_KGDB_MPSC) += mpsc_kgdb.o + obj-$(CONFIG_SERIAL_MPSC) += mpsc.o + obj-$(CONFIG_ETRAX_SERIAL) += crisv10.o + obj-$(CONFIG_SERIAL_JSM) += jsm/ +@@ -56,3 +57,4 @@ obj-$(CONFIG_SERIAL_SGI_IOC4) += ioc4_se + obj-$(CONFIG_SERIAL_SGI_IOC3) += ioc3_serial.o + obj-$(CONFIG_SERIAL_AT91) += at91_serial.o + obj-$(CONFIG_SERIAL_NETX) += netx-serial.o ++obj-$(CONFIG_KGDB_8250) += 8250_kgdb.o +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/amba-pl011.c linux-2.6.18.kgdb/drivers/serial/amba-pl011.c +--- linux-2.6.18/drivers/serial/amba-pl011.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/amba-pl011.c 2008-06-10 16:19:51.000000000 +0400 +@@ -340,7 +340,7 @@ static int pl011_startup(struct uart_por + /* + * Allocate the IRQ + */ +- retval = request_irq(uap->port.irq, pl011_int, 0, "uart-pl011", uap); ++ retval = request_irq(uap->port.irq, pl011_int, SA_SHIRQ, "uart-pl011", uap); + if (retval) + goto clk_dis; + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/Makefile linux-2.6.18.kgdb/drivers/serial/cpm_uart/Makefile +--- linux-2.6.18/drivers/serial/cpm_uart/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/Makefile 2008-06-10 16:19:22.000000000 +0400 +@@ -7,5 +7,6 @@ obj-$(CONFIG_SERIAL_CPM) += cpm_uart.o + # Select the correct platform objects. + cpm_uart-objs-$(CONFIG_CPM2) += cpm_uart_cpm2.o + cpm_uart-objs-$(CONFIG_8xx) += cpm_uart_cpm1.o ++cpm_uart-objs-$(CONFIG_KGDB_CPM_UART) += cpm_uart_kgdb.o + + cpm_uart-objs := cpm_uart_core.o $(cpm_uart-objs-y) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart.h linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart.h +--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart.h 2008-06-10 16:19:22.000000000 +0400 +@@ -51,6 +51,39 @@ + + #define SCC_WAIT_CLOSING 100 + ++#ifdef CONFIG_KGDB_CPM_UART ++ ++/* Speed of the debug UART. 
*/ ++#if defined(CONFIG_KGDB_9600BAUD) ++#define KGDB_BAUD B9600 ++#elif defined(CONFIG_KGDB_19200BAUD) ++#define KGDB_BAUD B19200 ++#elif defined(CONFIG_KGDB_38400BAUD) ++#define KGDB_BAUD B38400 ++#elif defined(CONFIG_KGDB_57600BAUD) ++#define KGDB_BAUD B57600 ++#else ++#define KGDB_BAUD B115200 /* Start with this if not given */ ++#endif ++ ++#ifdef CONFIG_KGDB_CPM_UART_SCC1 ++#define KGDB_PINFO_INDEX UART_SCC1 ++#elif CONFIG_KGDB_CPM_UART_SCC2 ++#define KGDB_PINFO_INDEX UART_SCC2 ++#elif CONFIG_KGDB_CPM_UART_SCC3 ++#define KGDB_PINFO_INDEX UART_SCC3 ++#elif CONFIG_KGDB_CPM_UART_SCC4 ++#define KGDB_PINFO_INDEX UART_SCC4 ++#elif CONFIG_KGDB_CPM_UART_SMC1 ++#define KGDB_PINFO_INDEX UART_SMC1 ++#elif CONFIG_KGDB_CPM_UART_SMC2 ++#define KGDB_PINFO_INDEX UART_SMC2 ++#else ++#error The S(M)CC for kgdb console is undefined ++#endif ++ ++#endif /* CONFIG_KGDB_CPM_UART */ ++ + struct uart_cpm_port { + struct uart_port port; + u16 rx_nrfifos; +@@ -87,6 +120,9 @@ extern int cpm_uart_port_map[UART_NR]; + extern int cpm_uart_nr; + extern struct uart_cpm_port cpm_uart_ports[UART_NR]; + ++void cpm_uart_early_write(int index, const char *s, u_int count); ++int cpm_uart_early_setup(int index,int early); ++ + /* these are located in their respective files */ + void cpm_line_cr_cmd(int line, int cmd); + int cpm_uart_init_portdesc(void); +@@ -133,5 +169,4 @@ static inline void *cpm2cpu_addr(unsigne + return 0; + } + +- + #endif /* CPM_UART_H */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_core.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c +--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_core.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_core.c 2008-06-10 16:19:22.000000000 +0400 +@@ -1070,22 +1070,17 @@ int cpm_uart_drv_get_platform_data(struc + return 0; + } + +-#ifdef CONFIG_SERIAL_CPM_CONSOLE +-/* +- * Print a string to the serial port trying not to disturb +- * any possible real use of the port... +- * +- * Note that this is called with interrupts already disabled +- */ +-static void cpm_uart_console_write(struct console *co, const char *s, ++void cpm_uart_early_write(int index, const char *s, + u_int count) + { +- struct uart_cpm_port *pinfo = +- &cpm_uart_ports[cpm_uart_port_map[co->index]]; ++ struct uart_cpm_port *pinfo; + unsigned int i; + volatile cbd_t *bdp, *bdbase; + volatile unsigned char *cp; + ++ BUG_ON(index>UART_NR); ++ pinfo = &cpm_uart_ports[index]; ++ + /* Get the address of the host memory buffer. 
+ */ + bdp = pinfo->tx_cur; +@@ -1149,16 +1144,11 @@ static void cpm_uart_console_write(struc + pinfo->tx_cur = (volatile cbd_t *) bdp; + } + +- +-static int __init cpm_uart_console_setup(struct console *co, char *options) ++int cpm_uart_early_setup(int index, int early) + { ++ int ret; + struct uart_port *port; + struct uart_cpm_port *pinfo; +- int baud = 38400; +- int bits = 8; +- int parity = 'n'; +- int flow = 'n'; +- int ret; + + struct fs_uart_platform_info *pdata; + struct platform_device* pdev = early_uart_get_pdev(co->index); +@@ -1169,8 +1159,9 @@ static int __init cpm_uart_console_setup + cpm_uart_init_portdesc(); + } + ++ BUG_ON(index>UART_NR); + port = +- (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]]; ++ (struct uart_port *)&cpm_uart_ports[index]; + pinfo = (struct uart_cpm_port *)port; + if (!pdev) { + if (pinfo->set_lineif) +@@ -1184,19 +1175,6 @@ static int __init cpm_uart_console_setup + cpm_uart_drv_get_platform_data(pdev, 1); + } + +- pinfo->flags |= FLAG_CONSOLE; +- +- if (options) { +- uart_parse_options(options, &baud, &parity, &bits, &flow); +- } else { +- bd_t *bd = (bd_t *) __res; +- +- if (bd->bi_baudrate) +- baud = bd->bi_baudrate; +- else +- baud = 9600; +- } +- + if (IS_SMC(pinfo)) { + pinfo->smcp->smc_smcm &= ~(SMCM_RX | SMCM_TX); + pinfo->smcp->smc_smcmr &= ~(SMCMR_REN | SMCMR_TEN); +@@ -1204,8 +1182,7 @@ static int __init cpm_uart_console_setup + pinfo->sccp->scc_sccm &= ~(UART_SCCM_TX | UART_SCCM_RX); + pinfo->sccp->scc_gsmrl &= ~(SCC_GSMRL_ENR | SCC_GSMRL_ENT); + } +- +- ret = cpm_uart_allocbuf(pinfo, 1); ++ ret = cpm_uart_allocbuf(pinfo, early); + + if (ret) + return ret; +@@ -1217,6 +1194,56 @@ static int __init cpm_uart_console_setup + else + cpm_uart_init_scc(pinfo); + ++ return 0; ++} ++ ++#ifdef CONFIG_SERIAL_CPM_CONSOLE ++/* ++ * Print a string to the serial port trying not to disturb ++ * any possible real use of the port... ++ * ++ * Note that this is called with interrupts already disabled ++ */ ++ ++static void cpm_uart_console_write(struct console *co, const char *s, ++ u_int count) ++{ ++ cpm_uart_early_write(cpm_uart_port_map[co->index],s,count); ++} ++ ++/* ++ * Setup console. Be careful is called early ! 
++ */ ++static int __init cpm_uart_console_setup(struct console *co, char *options) ++{ ++ struct uart_port *port; ++ struct uart_cpm_port *pinfo; ++ int baud = 115200; ++ int bits = 8; ++ int parity = 'n'; ++ int flow = 'n'; ++ int ret; ++ ++ port = ++ (struct uart_port *)&cpm_uart_ports[cpm_uart_port_map[co->index]]; ++ pinfo = (struct uart_cpm_port *)port; ++ ++ pinfo->flags |= FLAG_CONSOLE; ++ ++ if (options) { ++ uart_parse_options(options, &baud, &parity, &bits, &flow); ++ } else { ++ bd_t *bd = (bd_t *) __res; ++ ++ if (bd->bi_baudrate) ++ baud = bd->bi_baudrate; ++ else ++ baud = 9600; ++ } ++ ++ ret = cpm_uart_early_setup(cpm_uart_port_map[co->index], 1); ++ if(ret) ++ return ret; + uart_set_options(port, co, baud, parity, bits, flow); + + return 0; +@@ -1364,6 +1391,12 @@ static int cpm_uart_init(void) { + + for (i = 0; i < cpm_uart_nr; i++) { + int con = cpm_uart_port_map[i]; ++ ++#ifdef CONFIG_KGDB_CPM_UART ++ /* We are not interested in ports yet utilized by kgdb */ ++ if(con == KGDB_PINFO_INDEX) ++ continue; ++#endif + cpm_uart_ports[con].port.line = i; + cpm_uart_ports[con].port.flags = UPF_BOOT_AUTOCONF; + uart_add_one_port(&cpm_reg, &cpm_uart_ports[con].port); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm1.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c +--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm1.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm1.c 2008-06-10 16:19:22.000000000 +0400 +@@ -52,6 +52,7 @@ void cpm_line_cr_cmd(int line, int cmd) + { + ushort val; + volatile cpm8xx_t *cp = cpmp; ++ unsigned *bcsr_io; + + switch (line) { + case UART_SMC1: +@@ -94,12 +95,35 @@ void scc1_lineif(struct uart_cpm_port *p + { + /* XXX SCC1: insert port configuration here */ + pinfo->brg = 1; ++ ++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS) ++ bcsr_io = ioremap(BCSR1, sizeof(unsigned long)); ++ ++ if (bcsr_io == NULL) { ++ printk(KERN_CRIT "Could not remap BCSR\n"); ++ return; ++ } ++ out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_1); ++ iounmap(bcsr_io); ++#endif + } + + void scc2_lineif(struct uart_cpm_port *pinfo) + { + /* XXX SCC2: insert port configuration here */ + pinfo->brg = 2; ++ unsigned *bcsr_io; ++ ++#if defined (CONFIG_MPC885ADS) || defined (CONFIG_MPC86XADS) ++ bcsr_io = ioremap(BCSR1, sizeof(unsigned long)); ++ ++ if (bcsr_io == NULL) { ++ printk(KERN_CRIT "Could not remap BCSR\n"); ++ return; ++ } ++ out_be32(bcsr_io, in_be32(bcsr_io) & ~BCSR1_RS232EN_2); ++ iounmap(bcsr_io); ++#endif + } + + void scc3_lineif(struct uart_cpm_port *pinfo) +@@ -188,6 +212,10 @@ int cpm_uart_init_portdesc(void) + { + pr_debug("CPM uart[-]:init portdesc\n"); + ++ /* Check if we have called this yet. This may happen if early kgdb ++ breakpoint is on */ ++ if(cpm_uart_nr) ++ return 0; + cpm_uart_nr = 0; + #ifdef CONFIG_SERIAL_CPM_SMC1 + cpm_uart_ports[UART_SMC1].smcp = &cpmp->cp_smc[0]; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm2.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c +--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_cpm2.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_cpm2.c 2008-06-10 16:19:22.000000000 +0400 +@@ -256,6 +256,10 @@ int cpm_uart_init_portdesc(void) + { + pr_debug("CPM uart[-]:init portdesc\n"); + ++ /* Check if we have called this yet. 
This may happen if early kgdb ++ breakpoint is on */ ++ if(cpm_uart_nr) ++ return 0; + cpm_uart_nr = 0; + #ifdef CONFIG_SERIAL_CPM_SMC1 + cpm_uart_ports[UART_SMC1].smcp = (smc_t *) & cpm2_immr->im_smc[0]; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_kgdb.c linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c +--- linux-2.6.18/drivers/serial/cpm_uart/cpm_uart_kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/drivers/serial/cpm_uart/cpm_uart_kgdb.c 2008-06-10 16:19:22.000000000 +0400 +@@ -0,0 +1,195 @@ ++/* ++ * drivers/serial/cpm_uart/cpm_uart_kgdb.c ++ * ++ * CPM UART interface for kgdb. ++ * ++ * Author: Vitaly Bordug ++ * ++ * Used some bits from drivers/serial/kgdb_8250.c as a template ++ * ++ * 2005 (c) MontaVista Software, Inc. This file is licensed under ++ * the terms of the GNU General Public License version 2. This program ++ * is licensed "as is" without any warranty of any kind, whether express ++ * or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include /* For BASE_BAUD and SERIAL_PORT_DFNS */ ++ ++#include "cpm_uart.h" ++ ++#define GDB_BUF_SIZE 512 /* power of 2, please */ ++ ++ ++static char kgdb_buf[GDB_BUF_SIZE], *kgdbp; ++static int kgdb_chars; ++ ++/* Forward declarations. */ ++ ++/* ++ * Receive character from the serial port. This only works well ++ * before the port is initialize for real use. ++ */ ++static int kgdb_wait_key(char *obuf) ++{ ++ struct uart_cpm_port *pinfo; ++ ++ u_char c, *cp; ++ volatile cbd_t *bdp; ++ int i; ++ ++ pinfo = &cpm_uart_ports[KGDB_PINFO_INDEX]; ++ ++ /* Get the address of the host memory buffer. ++ */ ++ bdp = pinfo->rx_cur; ++ while (bdp->cbd_sc & BD_SC_EMPTY); ++ ++ /* If the buffer address is in the CPM DPRAM, don't ++ * convert it. ++ */ ++ cp = cpm2cpu_addr(bdp->cbd_bufaddr); ++ ++ if (obuf) { ++ i = c = bdp->cbd_datlen; ++ while (i-- > 0) ++ { ++ *obuf++ = *cp++; ++ } ++ } else { ++ c = *cp; ++ } ++ bdp->cbd_sc |= BD_SC_EMPTY; ++ ++ if (bdp->cbd_sc & BD_SC_WRAP) { ++ bdp = pinfo->rx_bd_base; ++ } else { ++ bdp++; ++ } ++ pinfo->rx_cur = (cbd_t *)bdp; ++ ++ return((int)c); ++} ++ ++ ++/* ++ * Wait until the interface can accept a char, then write it. ++ */ ++static void ++kgdb_put_debug_char(int chr) ++{ ++ static char ch[2]; ++ ch[0]=(char)chr; ++ cpm_uart_early_write(KGDB_PINFO_INDEX, ch, 1); ++} ++ ++ ++/* ++ * Get a char if available, return -1 if nothing available. ++ * Empty the receive buffer first, then look at the interface hardware. ++ */ ++static int ++kgdb_get_debug_char(void) ++{ ++ if (kgdb_chars<=0) { ++ kgdb_chars = kgdb_wait_key(kgdb_buf); ++ kgdbp = kgdb_buf; ++ } ++ kgdb_chars--; ++ ++ return (*kgdbp++); ++} ++ ++static void termios_set_options(int index, ++ int baud, int parity, int bits, int flow) ++{ ++ struct termios termios; ++ struct uart_port *port; ++ struct uart_cpm_port *pinfo; ++ ++ BUG_ON(index>UART_NR); ++ ++ port = ++ (struct uart_port *)&cpm_uart_ports[index]; ++ pinfo = (struct uart_cpm_port *)port; ++ ++ /* ++ * Ensure that the serial console lock is initialised ++ * early. 
++ */ ++ spin_lock_init(&port->lock); ++ ++ memset(&termios, 0, sizeof(struct termios)); ++ ++ termios.c_cflag = CREAD | HUPCL | CLOCAL; ++ ++ termios.c_cflag |= baud; ++ ++ if (bits == 7) ++ termios.c_cflag |= CS7; ++ else ++ termios.c_cflag |= CS8; ++ ++ switch (parity) { ++ case 'o': case 'O': ++ termios.c_cflag |= PARODD; ++ /*fall through*/ ++ case 'e': case 'E': ++ termios.c_cflag |= PARENB; ++ break; ++ } ++ ++ if (flow == 'r') ++ termios.c_cflag |= CRTSCTS; ++ ++ port->ops->set_termios(port, &termios, NULL); ++} ++ ++/* ++ * Returns: ++ * 0 on success, 1 on failure. ++ */ ++static int kgdb_init(void) ++{ ++ struct uart_port *port; ++ struct uart_cpm_port *pinfo; ++ ++ int use_bootmem = 0; /* use dma by default */ ++ ++ if(!cpm_uart_nr) ++ { ++ use_bootmem = 1; ++ cpm_uart_init_portdesc(); ++ } ++ port = (struct uart_port *)&cpm_uart_ports[KGDB_PINFO_INDEX]; ++ pinfo = (struct uart_cpm_port *)port; ++ ++ if (cpm_uart_early_setup(KGDB_PINFO_INDEX, use_bootmem)) ++ return 1; ++ ++ termios_set_options(KGDB_PINFO_INDEX, KGDB_BAUD,'n',8,'n'); ++ if (IS_SMC(pinfo)) ++ pinfo->smcp->smc_smcm |= SMCM_TX; ++ else ++ pinfo->sccp->scc_sccm |= UART_SCCM_TX; ++ ++ return 0; ++} ++ ++ ++struct kgdb_io kgdb_io_ops = { ++ .read_char = kgdb_get_debug_char, ++ .write_char = kgdb_put_debug_char, ++ .init = kgdb_init, ++}; ++ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/mpsc.c linux-2.6.18.kgdb/drivers/serial/mpsc.c +--- linux-2.6.18/drivers/serial/mpsc.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/mpsc.c 2008-06-10 16:19:22.000000000 +0400 +@@ -242,6 +242,11 @@ struct mpsc_port_info *mpsc_device_remov + #define MPSC_RCRR 0x0004 + #define MPSC_TCRR 0x0008 + ++/* MPSC Interrupt registers (offset from MV64x60_SDMA_INTR_OFFSET) */ ++#define MPSC_INTR_CAUSE 0x0004 ++#define MPSC_INTR_MASK 0x0084 ++#define MPSC_INTR_CAUSE_RCC (1<<6) ++ + /* Serial DMA Controller Interface Registers */ + #define SDMA_SDC 0x0000 + #define SDMA_SDCM 0x0008 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/mpsc_kgdb.c linux-2.6.18.kgdb/drivers/serial/mpsc_kgdb.c +--- linux-2.6.18/drivers/serial/mpsc_kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/drivers/serial/mpsc_kgdb.c 2008-06-10 16:19:22.000000000 +0400 +@@ -0,0 +1,299 @@ ++/* ++ * drivers/serial/mpsc_kgdb.c ++ * ++ * KGDB driver for the Marvell MultiProtocol Serial Controller (MPCS) ++ * ++ * Based on the polled boot loader driver by Ajit Prem (ajit.prem@motorola.com) ++ * ++ * Author: Randy Vinson ++ * ++ * 2005 (c) MontaVista Software, Inc. ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "mpsc.h" ++ ++/* Speed of the UART. */ ++static int kgdbmpsc_baud = CONFIG_KGDB_BAUDRATE; ++ ++/* Index of the UART, matches ttyMX naming. 
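++ * For example, kgdbmpsc_ttyMM = 1 (an illustrative value; the default
++ * comes from CONFIG_KGDB_PORT_NUM) debugs over the port exposed as
++ * ttyMM1, and MPSC_INTR_REG_SELECT() below then offsets the shared
++ * interrupt cause/mask registers by 8 bytes per port.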
*/ ++static int kgdbmpsc_ttyMM = CONFIG_KGDB_PORT_NUM; ++ ++#define MPSC_INTR_REG_SELECT(x) ((x) + (8 * kgdbmpsc_ttyMM)) ++ ++static int kgdbmpsc_init(void); ++ ++static struct platform_device mpsc_dev, shared_dev; ++ ++static void __iomem *mpsc_base; ++static void __iomem *brg_base; ++static void __iomem *routing_base; ++static void __iomem *sdma_base; ++ ++static unsigned int mpsc_irq; ++ ++static void kgdb_write_debug_char(int c) ++{ ++ u32 data; ++ ++ data = readl(mpsc_base + MPSC_MPCR); ++ writeb(c, mpsc_base + MPSC_CHR_1); ++ mb(); ++ data = readl(mpsc_base + MPSC_CHR_2); ++ data |= MPSC_CHR_2_TTCS; ++ writel(data, mpsc_base + MPSC_CHR_2); ++ mb(); ++ ++ while (readl(mpsc_base + MPSC_CHR_2) & MPSC_CHR_2_TTCS) ; ++} ++ ++static int kgdb_get_debug_char(void) ++{ ++ unsigned char c; ++ ++ while (!(readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) & ++ MPSC_INTR_CAUSE_RCC)) ; ++ ++ c = readb(mpsc_base + MPSC_CHR_10 + (1 << 1)); ++ mb(); ++ writeb(c, mpsc_base + MPSC_CHR_10 + (1 << 1)); ++ mb(); ++ writel(~MPSC_INTR_CAUSE_RCC, sdma_base + ++ MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)); ++ return (c); ++} ++ ++/* ++ * This is the receiver interrupt routine for the GDB stub. ++ * All that we need to do is verify that the interrupt happened on the ++ * line we're in charge of. If this is true, schedule a breakpoint and ++ * return. ++ */ ++static irqreturn_t ++kgdbmpsc_interrupt(int irq, void *dev_id, struct pt_regs *regs) ++{ ++ if (irq != mpsc_irq) ++ return IRQ_NONE; ++ /* ++ * If there is some other CPU in KGDB then this is a ++ * spurious interrupt. so return without even checking a byte ++ */ ++ if (atomic_read(&debugger_active)) ++ return IRQ_NONE; ++ ++ if (readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)) & ++ MPSC_INTR_CAUSE_RCC) ++ breakpoint(); ++ ++ return IRQ_HANDLED; ++} ++ ++static int __init kgdbmpsc_init(void) ++{ ++ struct mpsc_pdata *pdata; ++ u32 cdv; ++ ++ if (!brg_base || !mpsc_base || !routing_base || !sdma_base) ++ return -1; ++ ++ /* Set MPSC Routing to enable both ports */ ++ writel(0x0, routing_base + MPSC_MRR); ++ ++ /* MPSC 0/1 Rx & Tx get clocks BRG0/1 */ ++ writel(0x00000100, routing_base + MPSC_RCRR); ++ writel(0x00000100, routing_base + MPSC_TCRR); ++ ++ /* Disable all MPSC interrupts and clear any pending interrupts */ ++ writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK)); ++ writel(0, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_CAUSE)); ++ ++ pdata = (struct mpsc_pdata *)mpsc_dev.dev.platform_data; ++ ++ /* cdv = (clock/(2*16*baud rate)) for 16X mode. 
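++ * Worked example (the clock value is an assumption, not from this
++ * patch): with brg_clk_freq = 133333333 and kgdbmpsc_baud = 115200,
++ * cdv = 133333333 / (32 * 115200) - 1 = 35.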
*/ ++ cdv = ((pdata->brg_clk_freq / (32 * kgdbmpsc_baud)) - 1); ++ writel((pdata->brg_clk_src << 18) | (1 << 16) | cdv, ++ brg_base + BRG_BCR); ++ ++ /* Put MPSC into UART mode, no null modem, 16x clock mode */ ++ writel(0x000004c4, mpsc_base + MPSC_MMCRL); ++ writel(0x04400400, mpsc_base + MPSC_MMCRH); ++ ++ writel(0, mpsc_base + MPSC_CHR_1); ++ writel(0, mpsc_base + MPSC_CHR_9); ++ writel(0, mpsc_base + MPSC_CHR_10); ++ writel(4, mpsc_base + MPSC_CHR_3); ++ writel(0x20000000, mpsc_base + MPSC_CHR_4); ++ writel(0x9000, mpsc_base + MPSC_CHR_5); ++ writel(0, mpsc_base + MPSC_CHR_6); ++ writel(0, mpsc_base + MPSC_CHR_7); ++ writel(0, mpsc_base + MPSC_CHR_8); ++ ++ /* 8 data bits, 1 stop bit */ ++ writel((3 << 12), mpsc_base + MPSC_MPCR); ++ ++ /* Enter "hunt" mode */ ++ writel((1 << 31), mpsc_base + MPSC_CHR_2); ++ ++ udelay(100); ++ return 0; ++} ++ ++static void __iomem *__init ++kgdbmpsc_map_resource(struct platform_device *pd, int type, int num) ++{ ++ void __iomem *base = NULL; ++ struct resource *r; ++ ++ if ((r = platform_get_resource(pd, IORESOURCE_MEM, num))) ++ base = ioremap(r->start, r->end - r->start + 1); ++ return base; ++} ++ ++static void __iomem *__init ++kgdbmpsc_unmap_resource(struct platform_device *pd, int type, int num, ++ void __iomem * base) ++{ ++ if (base) ++ iounmap(base); ++ return NULL; ++} ++ ++static void __init ++kgdbmpsc_reserve_resource(struct platform_device *pd, int type, int num) ++{ ++ struct resource *r; ++ ++ if ((r = platform_get_resource(pd, IORESOURCE_MEM, num))) ++ request_mem_region(r->start, r->end - r->start + 1, "kgdb"); ++} ++ ++static int __init kgdbmpsc_local_init(void) ++{ ++ if (!mpsc_dev.num_resources || !shared_dev.num_resources) ++ return 1; /* failure */ ++ ++ mpsc_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BASE_ORDER); ++ brg_base = kgdbmpsc_map_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BRG_BASE_ORDER); ++ ++ /* get the platform data for the shared registers and get them mapped */ ++ routing_base = kgdbmpsc_map_resource(&shared_dev, ++ IORESOURCE_MEM, ++ MPSC_ROUTING_BASE_ORDER); ++ sdma_base = ++ kgdbmpsc_map_resource(&shared_dev, IORESOURCE_MEM, ++ MPSC_SDMA_INTR_BASE_ORDER); ++ ++ mpsc_irq = platform_get_irq(&mpsc_dev, 1); ++ ++ if (mpsc_base && brg_base && routing_base && sdma_base) ++ return 0; /* success */ ++ ++ return 1; /* failure */ ++} ++ ++static void __init kgdbmpsc_local_exit(void) ++{ ++ if (sdma_base) ++ sdma_base = kgdbmpsc_unmap_resource(&shared_dev, IORESOURCE_MEM, ++ MPSC_SDMA_INTR_BASE_ORDER, ++ sdma_base); ++ if (routing_base) ++ routing_base = kgdbmpsc_unmap_resource(&shared_dev, ++ IORESOURCE_MEM, ++ MPSC_ROUTING_BASE_ORDER, ++ routing_base); ++ if (brg_base) ++ brg_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BRG_BASE_ORDER, ++ brg_base); ++ if (mpsc_base) ++ mpsc_base = kgdbmpsc_unmap_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BASE_ORDER, mpsc_base); ++} ++ ++static void __init kgdbmpsc_update_pdata(struct platform_device *pdev) ++{ ++ ++ snprintf(pdev->dev.bus_id, BUS_ID_SIZE, "%s%u", pdev->name, pdev->id); ++} ++ ++static int __init kgdbmpsc_pdev_init(void) ++{ ++ struct platform_device *pdev; ++ ++ /* get the platform data for the specified port. */ ++ pdev = mv64x60_early_get_pdev_data(MPSC_CTLR_NAME, kgdbmpsc_ttyMM, 1); ++ if (pdev) { ++ memcpy(&mpsc_dev, pdev, sizeof(struct platform_device)); ++ if (platform_notify) { ++ kgdbmpsc_update_pdata(&mpsc_dev); ++ platform_notify(&mpsc_dev.dev); ++ } ++ ++ /* get the platform data for the shared registers. 
*/ ++ pdev = mv64x60_early_get_pdev_data(MPSC_SHARED_NAME, 0, 0); ++ if (pdev) { ++ memcpy(&shared_dev, pdev, ++ sizeof(struct platform_device)); ++ if (platform_notify) { ++ kgdbmpsc_update_pdata(&shared_dev); ++ platform_notify(&shared_dev.dev); ++ } ++ } ++ } ++ return 0; ++} ++ ++postcore_initcall(kgdbmpsc_pdev_init); ++ ++static int __init kgdbmpsc_init_io(void) ++{ ++ ++ kgdbmpsc_pdev_init(); ++ ++ if (kgdbmpsc_local_init()) { ++ kgdbmpsc_local_exit(); ++ return -1; ++ } ++ ++ if (kgdbmpsc_init() == -1) ++ return -1; ++ return 0; ++} ++ ++static void __init kgdbmpsc_hookup_irq(void) ++{ ++ unsigned int msk; ++ if (!request_irq(mpsc_irq, kgdbmpsc_interrupt, 0, "kgdb mpsc", NULL)) { ++ /* Enable interrupt */ ++ msk = readl(sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK)); ++ msk |= MPSC_INTR_CAUSE_RCC; ++ writel(msk, sdma_base + MPSC_INTR_REG_SELECT(MPSC_INTR_MASK)); ++ ++ kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BASE_ORDER); ++ kgdbmpsc_reserve_resource(&mpsc_dev, IORESOURCE_MEM, ++ MPSC_BRG_BASE_ORDER); ++ } ++} ++ ++struct kgdb_io kgdb_io_ops = { ++ .read_char = kgdb_get_debug_char, ++ .write_char = kgdb_write_debug_char, ++ .init = kgdbmpsc_init_io, ++ .late_init = kgdbmpsc_hookup_irq, ++}; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/pxa.c linux-2.6.18.kgdb/drivers/serial/pxa.c +--- linux-2.6.18/drivers/serial/pxa.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/pxa.c 2008-06-10 16:19:51.000000000 +0400 +@@ -42,6 +42,9 @@ + #include + #include + #include ++#ifdef CONFIG_KGDB_CONSOLE ++#include ++#endif + + #include + #include +@@ -692,6 +695,8 @@ serial_pxa_console_init(void) + console_initcall(serial_pxa_console_init); + + #define PXA_CONSOLE &serial_pxa_console ++#elif defined(CONFIG_KGDB_CONSOLE) ++#define PXA_CONSOLE &kgdbcons + #else + #define PXA_CONSOLE NULL + #endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/serial_core.c linux-2.6.18.kgdb/drivers/serial/serial_core.c +--- linux-2.6.18/drivers/serial/serial_core.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/serial_core.c 2008-06-10 16:19:03.000000000 +0400 +@@ -33,6 +33,7 @@ + #include /* for serial_state and serial_icounter_struct */ + #include + #include ++#include + + #include + #include +@@ -65,6 +66,12 @@ static struct lock_class_key port_lock_k + #define uart_console(port) (0) + #endif + ++#ifdef CONFIG_KGDB_CONSOLE ++#define uart_kgdb(port) (port->cons && !strcmp(port->cons->name, "kgdb")) ++#else ++#define uart_kgdb(port) (0) ++#endif ++ + static void uart_change_speed(struct uart_state *state, struct termios *old_termios); + static void uart_wait_until_sent(struct tty_struct *tty, int timeout); + static void uart_change_pm(struct uart_state *state, int pm_state); +@@ -1673,6 +1680,9 @@ static int uart_line_info(char *buf, str + port->iotype == UPIO_MEM ? 
port->mapbase : + (unsigned long) port->iobase, + port->irq); ++ if (port->iotype == UPIO_MEM) ++ ret += sprintf(buf+ret, " membase 0x%08lX", ++ (unsigned long) port->membase); + + if (port->type == PORT_UNKNOWN) { + strcat(buf, "\n"); +@@ -2038,7 +2048,8 @@ uart_report_port(struct uart_driver *drv + case UPIO_AU: + case UPIO_TSI: + snprintf(address, sizeof(address), +- "MMIO 0x%lx", port->mapbase); ++ "MMIO map 0x%lx mem 0x%lx", port->mapbase, ++ (unsigned long) port->membase); + break; + default: + strlcpy(address, "*unknown*", sizeof(address)); +@@ -2090,9 +2101,9 @@ uart_configure_port(struct uart_driver * + + /* + * Power down all ports by default, except the +- * console if we have one. ++ * console (real or kgdb) if we have one. + */ +- if (!uart_console(port)) ++ if (!uart_console(port) && !uart_kgdb(port)) + uart_change_pm(state, 3); + } + } +@@ -2284,6 +2295,12 @@ int uart_add_one_port(struct uart_driver + */ + port->flags &= ~UPF_DEAD; + ++#if defined(CONFIG_KGDB_8250) ++ /* Add any 8250-like ports we find later. */ ++ if (port->type <= PORT_MAX_8250) ++ kgdb8250_add_port(port->line, port); ++#endif ++ + out: + mutex_unlock(&state->mutex); + mutex_unlock(&port_mutex); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/serial_txx9.c linux-2.6.18.kgdb/drivers/serial/serial_txx9.c +--- linux-2.6.18/drivers/serial/serial_txx9.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/drivers/serial/serial_txx9.c 2008-06-10 16:19:28.000000000 +0400 +@@ -1164,6 +1164,96 @@ static struct pci_driver serial_txx9_pci + MODULE_DEVICE_TABLE(pci, serial_txx9_pci_tbl); + #endif /* ENABLE_SERIAL_TXX9_PCI */ + ++/******************************************************************************/ ++/* BEG: KDBG Routines */ ++/******************************************************************************/ ++ ++#ifdef CONFIG_KGDB ++int kgdb_init_count = 0; ++ ++void txx9_sio_kgdb_hook(unsigned int port, unsigned int baud_rate) ++{ ++ static struct resource kgdb_resource; ++ int ret; ++ struct uart_txx9_port *up = &serial_txx9_ports[port]; ++ ++ /* prevent initialization by driver */ ++ kgdb_resource.name = "serial_txx9(debug)"; ++ kgdb_resource.start = (unsigned long)up->port.membase; ++ kgdb_resource.end = (unsigned long)(up->port.membase + 36 - 1); ++ kgdb_resource.flags = IORESOURCE_MEM | IORESOURCE_BUSY; ++ ++ ret = request_resource(&iomem_resource, &kgdb_resource); ++ if(ret == -EBUSY) ++ printk(" serial_txx9(debug): request_resource failed\n"); ++ ++ return; ++} ++void ++txx9_sio_kdbg_init( unsigned int port_number ) ++{ ++ if (port_number == 1) { ++ txx9_sio_kgdb_hook(port_number, 38400); ++ } else { ++ printk("Bad Port Number [%u] != [1]\n",port_number); ++ } ++ return; ++} ++ ++u8 ++txx9_sio_kdbg_rd( void ) ++{ ++ unsigned int status,ch; ++ struct uart_txx9_port *up = &serial_txx9_ports[1]; ++ ++ if (kgdb_init_count == 0) { ++ txx9_sio_kdbg_init(1); ++ kgdb_init_count = 1; ++ } ++ ++ while (1) { ++ status = sio_in(up, TXX9_SIDISR); ++ if ( status & 0x1f ) { ++ ch = sio_in(up, TXX9_SIRFIFO ); ++ break; ++ } ++ } ++ ++ return (ch); ++} ++ ++int ++txx9_sio_kdbg_wr( u8 ch ) ++{ ++ unsigned int status; ++ struct uart_txx9_port *up = &serial_txx9_ports[1]; ++ ++ if (kgdb_init_count == 0) { ++ txx9_sio_kdbg_init(1); ++ kgdb_init_count = 1; ++ } ++ ++ while (1) { ++ status = sio_in(up, TXX9_SICISR); ++ if (status & TXX9_SICISR_TRDY) { ++ if ( ch == '\n' ) { ++ txx9_sio_kdbg_wr( '\r' ); ++ } ++ sio_out(up, TXX9_SITFIFO, (u32)ch ); ++ ++ break; ++ } ++ } ++ ++ return (1); ++} 
++#endif /* CONFIG_KGDB */
++
++
++/******************************************************************************/
++/* END: KDBG Routines */
++/******************************************************************************/
++
+ static int __init serial_txx9_init(void)
+ {
+ int ret;
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/drivers/serial/sh-sci.c linux-2.6.18.kgdb/drivers/serial/sh-sci.c
+--- linux-2.6.18/drivers/serial/sh-sci.c 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/drivers/serial/sh-sci.c 2008-06-10 16:19:47.000000000 +0400
+@@ -42,6 +42,7 @@
+ #include
+ #include
+ #include
++#include
+
+ #ifdef CONFIG_CPU_FREQ
+ #include
+ #endif
+
+@@ -67,14 +68,16 @@
+
+ #include "sh-sci.h"
+
+-#ifdef CONFIG_SH_KGDB
+-#include
+-
+-static int kgdb_get_char(struct sci_port *port);
+-static void kgdb_put_char(struct sci_port *port, char c);
+-static void kgdb_handle_error(struct sci_port *port);
+-static struct sci_port *kgdb_sci_port;
+-#endif /* CONFIG_SH_KGDB */
++#ifdef CONFIG_KGDB_SH_SCI
++/* Speed of the UART. */
++static int kgdbsci_baud = CONFIG_KGDB_BAUDRATE;
++
++/* Index of the UART, matches ttySCX naming. */
++static int kgdbsci_ttySC = CONFIG_KGDB_PORT_NUM;
++
++/* Make life easier on us. */
++#define KGDBPORT sci_ports[kgdbsci_ttySC]
++#endif /* CONFIG_KGDB_SH_SCI */
+
+ #ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+ static struct sci_port *serial_console_port = 0;
+@@ -87,20 +90,17 @@ static void sci_start_rx(struct uart_por
+ static void sci_stop_rx(struct uart_port *port);
+ static int sci_request_irq(struct sci_port *port);
+ static void sci_free_irq(struct sci_port *port);
++static void sci_set_termios(struct uart_port *port, struct termios *termios,
++ struct termios *old);
++static int kgdbsci_init(void);
+
+ static struct sci_port sci_ports[];
+ static struct uart_driver sci_uart_driver;
+
+ #define SCI_NPORTS sci_uart_driver.nr
+
+-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB)
+-
+-static void handle_error(struct uart_port *port)
+-{ /* Clear error flags */
+- sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));
+-}
+-
+-static int get_char(struct uart_port *port)
++#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_KGDB_SH_SCI)
++static int get_char_for_gdb(struct uart_port *port)
+ {
+ unsigned long flags;
+ unsigned short status;
+@@ -110,7 +110,8 @@ static int get_char(struct uart_port *po
+ do {
+ status = sci_in(port, SCxSR);
+ if (status & SCxSR_ERRORS(port)) {
+- handle_error(port);
++ /* Clear error flags. */
++ sci_out(port, SCxSR, SCxSR_ERROR_CLEAR(port));
+ continue;
+ }
+ } while (!(status & SCxSR_RDxF(port)));
+@@ -121,21 +122,7 @@ static int get_char(struct uart_port *po
+
+ return c;
+ }
+-
+-/* Taken from sh-stub.c of GDB 4.18 */
+-static const char hexchars[] = "0123456789abcdef";
+-
+-static __inline__ char highhex(int x)
+-{
+- return hexchars[(x >> 4) & 0xf];
+-}
+-
+-static __inline__ char lowhex(int x)
+-{
+- return hexchars[x & 0xf];
+-}
+-
+-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */
++#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_KGDB_SH_SCI */
+
+ /*
+ * Send the packet in buffer. The host gets one chance to read it.
+@@ -167,21 +154,14 @@ static void put_string(struct sci_port * + const unsigned char *p = buffer; + int i; + +-#if defined(CONFIG_SH_STANDARD_BIOS) || defined(CONFIG_SH_KGDB) +- int checksum; +- int usegdb=0; +- + #ifdef CONFIG_SH_STANDARD_BIOS ++ int checksum; ++ const char hexchars[] = "0123456789abcdef"; ++ + /* This call only does a trap the first time it is + * called, and so is safe to do here unconditionally + */ +- usegdb |= sh_bios_in_gdb_mode(); +-#endif +-#ifdef CONFIG_SH_KGDB +- usegdb |= (kgdb_in_gdb_mode && (port == kgdb_sci_port)); +-#endif +- +- if (usegdb) { ++ if (sh_bios_in_gdb_mode()) { + /* $#. */ + do { + unsigned char c; +@@ -193,18 +173,18 @@ static void put_string(struct sci_port * + int h, l; + + c = *p++; +- h = highhex(c); +- l = lowhex(c); ++ h = hexchars[c >> 4]; ++ l = hexchars[c % 16]; + put_char(port, h); + put_char(port, l); + checksum += h + l; + } + put_char(port, '#'); +- put_char(port, highhex(checksum)); +- put_char(port, lowhex(checksum)); ++ put_char(port, hexchars[checksum >> 4]); ++ put_char(port, hexchars[checksum % 16]); + } while (get_char(port) != '+'); + } else +-#endif /* CONFIG_SH_STANDARD_BIOS || CONFIG_SH_KGDB */ ++#endif /* CONFIG_SH_STANDARD_BIOS */ + for (i=0; i SCI_NPORTS + '0') ++ goto errout; ++ kgdbsci_ttySC = *str - '0'; ++ str++; ++ if (*str != ',') ++ goto errout; ++ str++; ++ kgdbsci_baud = simple_strtoul(str, &str, 10); ++ if (kgdbsci_baud != 9600 && kgdbsci_baud != 19200 && ++ kgdbsci_baud != 38400 && kgdbsci_baud != 57600 && ++ kgdbsci_baud != 115200) ++ goto errout; ++ ++ return 0; ++ ++errout: ++ printk(KERN_ERR "Invalid syntax for option kgdbsci=\n"); ++ return 1; ++} ++__setup("kgdbsci", kgdbsci_opt); ++#endif /* CONFIG_KGDB_SH_SCI */ + + #if defined(__H8300S__) + enum { sci_disable, sci_enable }; +@@ -555,6 +608,16 @@ static inline void sci_receive_chars(str + continue; + } + ++#ifdef CONFIG_KGDB_SH_SCI ++ /* We assume that a ^C on the port KGDB ++ * is using means that KGDB wants to ++ * interrupt the running system. ++ */ ++ if (port->line == KGDBPORT.port.line && ++ c == 3) ++ breakpoint(); ++#endif ++ + /* Store data and status */ + if (status&SCxSR_FER(port)) { + flag = TTY_FRAME; +@@ -1618,6 +1681,7 @@ static int __init sci_console_init(void) + console_initcall(sci_console_init); + #endif /* CONFIG_SERIAL_SH_SCI_CONSOLE */ + ++#if 0 + #ifdef CONFIG_SH_KGDB + /* + * FIXME: Most of this can go away.. at the moment, we rely on +@@ -1663,30 +1727,9 @@ int __init kgdb_console_setup(struct con + return uart_set_options(port, co, baud, parity, bits, flow); + } + #endif /* CONFIG_SH_KGDB */ ++#endif /* 0 */ + +-#ifdef CONFIG_SH_KGDB_CONSOLE +-static struct console kgdb_console = { +- .name = "ttySC", +- .write = kgdb_console_write, +- .setup = kgdb_console_setup, +- .flags = CON_PRINTBUFFER | CON_ENABLED, +- .index = -1, +- .data = &sci_uart_driver, +-}; +- +-/* Register the KGDB console so we get messages (d'oh!) 
*/
+-static int __init kgdb_console_init(void)
+-{
+- register_console(&kgdb_console);
+- return 0;
+-}
+-
+-console_initcall(kgdb_console_init);
+-#endif /* CONFIG_SH_KGDB_CONSOLE */
+-
+-#if defined(CONFIG_SH_KGDB_CONSOLE)
+-#define SCI_CONSOLE &kgdb_console
+-#elif defined(CONFIG_SERIAL_SH_SCI_CONSOLE)
++#ifdef CONFIG_SERIAL_SH_SCI_CONSOLE
+ #define SCI_CONSOLE &serial_console
+ #else
+ #define SCI_CONSOLE 0
+@@ -1757,4 +1800,3 @@ static void __exit sci_exit(void)
+
+ module_init(sci_init);
+ module_exit(sci_exit);
+-
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-arm/kgdb.h linux-2.6.18.kgdb/include/asm-arm/kgdb.h
+--- linux-2.6.18/include/asm-arm/kgdb.h 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-arm/kgdb.h 2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,92 @@
++/*
++ * include/asm-arm/kgdb.h
++ *
++ * ARM KGDB support
++ *
++ * Author: Deepak Saxena
++ *
++ * Copyright (C) 2002 MontaVista Software Inc.
++ *
++ */
++
++#ifndef __ASM_KGDB_H__
++#define __ASM_KGDB_H__
++
++#include
++#include
++#include
++
++
++/*
++ * GDB assumes that we're a user process being debugged, so
++ * it will send us an SWI command to write into memory as the
++ * debug trap. When an SWI occurs, the next instruction addr is
++ * placed into R14_svc before jumping to the vector trap.
++ * This doesn't work for kernel debugging as we are already in SVC
++ * mode; we would lose the kernel's LR, which is a bad thing. This
++ * is why we use an undefined instruction trap instead.
++ *
++ * By doing this as an undefined instruction trap, we force a mode
++ * switch from SVC to UND mode, allowing us to save full kernel state.
++ *
++ * We also define a KGDB_COMPILED_BREAK which can be used to compile
++ * in breakpoints. This is important for things like sysrq-G and for
++ * the initial breakpoint from trap_init().
++ *
++ * Note to ARM HW designers: Add real trap support like SH && PPC to
++ * make our lives much much simpler. :)
++ */
++#define BREAK_INSTR_SIZE 4
++#define GDB_BREAKINST 0xef9f0001
++#define KGDB_BREAKINST 0xe7ffdefe
++#define KGDB_COMPILED_BREAK 0xe7ffdeff
++#define CACHE_FLUSH_IS_SAFE 1
++
++#ifndef __ASSEMBLY__
++
++#define BREAKPOINT() asm(".word 0xe7ffdeff")
++
++
++extern void kgdb_handle_bus_error(void);
++extern int kgdb_fault_expected;
++#endif /* !__ASSEMBLY__ */
++
++/*
++ * From Amit S. Kale:
++ *
++ * In the register packet, words 0-15 are R0 to R10, FP, IP, SP, LR, PC. But
++ * register 16 isn't cpsr. GDB passes CPSR in word 25. There are 9 words in
++ * between which are unused. Passing only 26 words to gdb is sufficient.
++ * GDB can figure out that floating point registers are not passed.
++ * GDB_MAX_REGS should be 26.
++ */
++#define GDB_MAX_REGS (26)
++
++#define KGDB_MAX_NO_CPUS 1
++#define BUFMAX 400
++#define NUMREGBYTES (GDB_MAX_REGS << 2)
++#define NUMCRITREGBYTES (32 << 2)
++
++#define _R0 0
++#define _R1 1
++#define _R2 2
++#define _R3 3
++#define _R4 4
++#define _R5 5
++#define _R6 6
++#define _R7 7
++#define _R8 8
++#define _R9 9
++#define _R10 10
++#define _FP 11
++#define _IP 12
++#define _SP 13
++#define _LR 14
++#define _PC 15
++#define _CPSR (GDB_MAX_REGS - 1)
++
++/* So that we can denote the end of a frame for tracing, in the simple
++ * case.
*/ ++#define CFI_END_FRAME(func) __CFI_END_FRAME(_PC,_SP,func) ++ ++#endif /* __ASM_KGDB_H__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-arm/system.h linux-2.6.18.kgdb/include/asm-arm/system.h +--- linux-2.6.18/include/asm-arm/system.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-arm/system.h 2008-06-10 16:19:51.000000000 +0400 +@@ -444,6 +444,47 @@ static inline unsigned long __xchg(unsig + extern void disable_hlt(void); + extern void enable_hlt(void); + ++#define __HAVE_ARCH_CMPXCHG 1 ++ ++#include ++ ++static inline unsigned long __cmpxchg_u32(volatile int *m, unsigned long old, ++ unsigned long new) ++{ ++ u32 retval; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ retval = *m; ++ if (retval == old) ++ *m = new; ++ local_irq_restore(flags); /* implies memory barrier */ ++ ++ return retval; ++} ++ ++/* This function doesn't exist, so you'll get a linker error ++ if something tries to do an invalid cmpxchg(). */ ++extern void __cmpxchg_called_with_bad_pointer(void); ++ ++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, ++ unsigned long new, int size) ++{ ++ switch (size) { ++ case 4: ++ return __cmpxchg_u32(ptr, old, new); ++ } ++ __cmpxchg_called_with_bad_pointer(); ++ return old; ++} ++ ++#define cmpxchg(ptr,o,n) \ ++ ({ \ ++ __typeof__(*(ptr)) _o_ = (o); \ ++ __typeof__(*(ptr)) _n_ = (n); \ ++ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ ++ (unsigned long)_n_, sizeof(*(ptr))); \ ++ }) + #endif /* __ASSEMBLY__ */ + + #define arch_align_stack(x) (x) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-generic/kgdb.h linux-2.6.18.kgdb/include/asm-generic/kgdb.h +--- linux-2.6.18/include/asm-generic/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/asm-generic/kgdb.h 2008-06-10 16:19:58.000000000 +0400 +@@ -0,0 +1,34 @@ ++/* ++ * include/asm-generic/kgdb.h ++ * ++ * This provides the assembly level information so that KGDB can provide ++ * a GDB that has been patched with enough information to know to stop ++ * trying to unwind the function. ++ * ++ * Author: Tom Rini ++ * ++ * 2005 (c) MontaVista Software, Inc. This file is licensed under the terms ++ * of the GNU General Public License version 2. This program is licensed ++ * "as is" without any warranty of any kind, whether express or implied. 
++ */ ++ ++#ifndef __ASM_GENERIC_KGDB_H__ ++#define __ASM_GENERIC_KGDB_H__ ++ ++#include ++#ifdef __ASSEMBLY__ ++#ifdef CONFIG_KGDB ++/* This MUST be put at the end of a given assembly function */ ++#define __CFI_END_FRAME(pc,sp,func) \ ++CAT3(.Lend_,func,:) \ ++ CFI_preamble(func,pc,0x1,-DATA_ALIGN_FACTOR) \ ++ CFA_define_reference(sp, 0) \ ++ CFA_undefine_reg(pc) \ ++ CFI_postamble() \ ++ FDE_preamble(func,func,CAT3(.Lend,_,func)) \ ++ FDE_postamble() ++#else ++#define __CFI_END_FRAME(pc,sp,fn) ++#endif /* CONFIG_KGDB */ ++#endif /* __ASSEMBLY__ */ ++#endif /* __ASM_GENERIC_KGDB_H__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-i386/kdebug.h linux-2.6.18.kgdb/include/asm-i386/kdebug.h +--- linux-2.6.18/include/asm-i386/kdebug.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-i386/kdebug.h 2008-06-10 16:19:17.000000000 +0400 +@@ -39,6 +39,7 @@ enum die_val { + DIE_CALL, + DIE_NMI_IPI, + DIE_PAGE_FAULT, ++ DIE_PAGE_FAULT_NO_CONTEXT, + }; + + static inline int notify_die(enum die_val val, const char *str, +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-i386/kgdb.h linux-2.6.18.kgdb/include/asm-i386/kgdb.h +--- linux-2.6.18/include/asm-i386/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/asm-i386/kgdb.h 2008-06-10 16:19:58.000000000 +0400 +@@ -0,0 +1,58 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++/* ++ * Copyright (C) 2001-2004 Amit S. Kale ++ */ ++ ++#include ++ ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. ++ */ ++#define _EAX 0 ++#define _ECX 1 ++#define _EDX 2 ++#define _EBX 3 ++#define _ESP 4 ++#define _EBP 5 ++#define _ESI 6 ++#define _EDI 7 ++#define _PC 8 ++#define _EIP 8 ++#define _PS 9 ++#define _EFLAGS 9 ++#define _CS 10 ++#define _SS 11 ++#define _DS 12 ++#define _ES 13 ++#define _FS 14 ++#define _GS 15 ++ ++/* So that we can denote the end of a frame for tracing, in the simple ++ * case. */ ++#define CFI_END_FRAME(func) __CFI_END_FRAME(_EIP,_ESP,func) ++ ++#ifndef __ASSEMBLY__ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 1024 ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES 64 ++/* Number of bytes of registers we need to save for a setjmp/longjmp. 
*/ ++#define NUMCRITREGBYTES 24 ++ ++#define BREAKPOINT() asm(" int $3"); ++#define BREAK_INSTR_SIZE 1 ++#define CACHE_FLUSH_IS_SAFE 1 ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ia64/kdebug.h linux-2.6.18.kgdb/include/asm-ia64/kdebug.h +--- linux-2.6.18/include/asm-ia64/kdebug.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-ia64/kdebug.h 2008-06-10 16:19:32.000000000 +0400 +@@ -72,6 +72,7 @@ enum die_val { + DIE_KDEBUG_LEAVE, + DIE_KDUMP_ENTER, + DIE_KDUMP_LEAVE, ++ DIE_PAGE_FAULT_NO_CONTEXT, + }; + + static inline int notify_die(enum die_val val, char *str, struct pt_regs *regs, +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ia64/kgdb.h linux-2.6.18.kgdb/include/asm-ia64/kgdb.h +--- linux-2.6.18/include/asm-ia64/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/asm-ia64/kgdb.h 2008-06-10 16:19:32.000000000 +0400 +@@ -0,0 +1,36 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++/* ++ * Copyright (C) 2001-2004 Amit S. Kale ++ */ ++ ++#include ++ ++/************************************************************************/ ++/* BUFMAX defines the maximum number of characters in inbound/outbound buffers*/ ++/* at least NUMREGBYTES*2 are needed for register packets */ ++/* Longer buffer is needed to list all threads */ ++#define BUFMAX 1024 ++ ++/* Number of bytes of registers. We set this to 0 so that certain GDB ++ * packets will fail, forcing the use of others, which are more friendly ++ * on ia64. */ ++#define NUMREGBYTES 0 ++ ++#define NUMCRITREGBYTES (70*8) ++#define JMP_REGS_ALIGNMENT __attribute__ ((aligned (16))) ++ ++#define BREAKNUM 0x00003333300LL ++#define KGDBBREAKNUM 0x6665UL ++#define BREAKPOINT() asm volatile ("break.m 0x6665") ++#define BREAK_INSTR_SIZE 16 ++#define CACHE_FLUSH_IS_SAFE 1 ++ ++struct pt_regs; ++extern volatile int kgdb_hwbreak_sstep[NR_CPUS]; ++extern void smp_send_nmi_allbutself(void); ++extern void kgdb_wait_ipi(struct pt_regs *); ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-mips/kdebug.h linux-2.6.18.kgdb/include/asm-mips/kdebug.h +--- linux-2.6.18/include/asm-mips/kdebug.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/asm-mips/kdebug.h 2008-06-10 16:19:28.000000000 +0400 +@@ -0,0 +1,47 @@ ++/* ++ * ++ * Copyright (C) 2004 MontaVista Software Inc. ++ * Author: Manish Lachwani, mlachwani@mvista.com or manish@koffee-break.com ++ * ++ * This program is free software; you can redistribute it and/or modify it ++ * under the terms of the GNU General Public License as published by the ++ * Free Software Foundation; either version 2 of the License, or (at your ++ * option) any later version. 
++ *
++ */
++#ifndef _MIPS_KDEBUG_H
++#define _MIPS_KDEBUG_H
++
++#include
++
++struct pt_regs;
++
++struct die_args {
++ struct pt_regs *regs;
++ const char *str;
++ long err;
++};
++
++int register_die_notifier(struct notifier_block *nb);
++extern struct notifier_block *mips_die_chain;
++
++enum die_val {
++ DIE_OOPS = 1,
++ DIE_PANIC,
++ DIE_DIE,
++ DIE_KERNELDEBUG,
++ DIE_TRAP,
++ DIE_PAGE_FAULT,
++};
++
++/*
++ * trap number can be computed from regs and signr can be computed using
++ * compute_signal()
++ */
++static inline int notify_die(enum die_val val,char *str,struct pt_regs *regs,long err)
++{
++ struct die_args args = { .regs=regs, .str=str, .err=err };
++ return notifier_call_chain(&mips_die_chain, val, &args);
++}
++
++#endif /* _MIPS_KDEBUG_H */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-mips/kgdb.h linux-2.6.18.kgdb/include/asm-mips/kgdb.h
+--- linux-2.6.18/include/asm-mips/kgdb.h 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-mips/kgdb.h 2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,34 @@
++#ifdef __KERNEL__
++#ifndef _ASM_KGDB_H_
++#define _ASM_KGDB_H_
++
++#ifndef __ASSEMBLY__
++#if (_MIPS_ISA == _MIPS_ISA_MIPS1) || (_MIPS_ISA == _MIPS_ISA_MIPS2)
++typedef u32 gdb_reg_t;
++#elif (_MIPS_ISA == _MIPS_ISA_MIPS3) || (_MIPS_ISA == _MIPS_ISA_MIPS4)
++typedef u64 gdb_reg_t;
++#else
++#error need to do
++#endif /* _MIPS_ISA */
++
++#include
++
++#ifndef __ASSEMBLY__
++#define BUFMAX 2048
++#define NUMREGBYTES (90*sizeof(gdb_reg_t))
++#define NUMCRITREGBYTES (12*sizeof(gdb_reg_t))
++#define BREAK_INSTR_SIZE 4
++#define BREAKPOINT() __asm__ __volatile__( \
++ ".globl breakinst\n\t" \
++ ".set\tnoreorder\n\t" \
++ "nop\n" \
++ "breakinst:\tbreak\n\t" \
++ "nop\n\t" \
++ ".set\treorder")
++#define CACHE_FLUSH_IS_SAFE 0
++
++extern int kgdb_early_setup;
++
++#endif /* !__ASSEMBLY__ */
++#endif /* _ASM_KGDB_H_ */
++#endif /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-powerpc/kgdb.h linux-2.6.18.kgdb/include/asm-powerpc/kgdb.h
+--- linux-2.6.18/include/asm-powerpc/kgdb.h 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/asm-powerpc/kgdb.h 2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,74 @@
++/*
++ * include/asm-powerpc/kgdb.h
++ *
++ * The PowerPC (32/64) specific defines / externs for KGDB. Based on
++ * the previous 32bit and 64bit specific files, which had the following
++ * copyrights:
++ *
++ * PPC64 Mods (C) 2005 Frank Rowand (frowand@mvista.com)
++ * PPC Mods (C) 2004 Tom Rini (trini@mvista.com)
++ * PPC Mods (C) 2003 John Whitney (john.whitney@timesys.com)
++ * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu)
++ *
++ *
++ * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu)
++ * Author: Tom Rini
++ *
++ * 2006 (c) MontaVista Software, Inc. This file is licensed under
++ * the terms of the GNU General Public License version 2. This program
++ * is licensed "as is" without any warranty of any kind, whether express
++ * or implied.
++ */
++#ifdef __KERNEL__
++#ifndef __POWERPC_KGDB_H__
++#define __POWERPC_KGDB_H__
++
++#include
++#ifndef __ASSEMBLY__
++
++#define BREAK_INSTR_SIZE 4
++#define BUFMAX ((NUMREGBYTES * 2) + 512)
++#define OUTBUFMAX ((NUMREGBYTES * 2) + 512)
++#define BREAKPOINT() asm(".long 0x7d821008"); /* twge r2, r2 */
++#define CACHE_FLUSH_IS_SAFE 1
++
++/* The number of bytes of registers we have to save depends on a few
++ * things. For 64bit we default to not including vector registers and
++ * vector state registers.
*/ ++#ifdef CONFIG_PPC64 ++/* ++ * 64 bit (8 byte) registers: ++ * 32 gpr, 32 fpr, nip, msr, link, ctr ++ * 32 bit (4 byte) registers: ++ * ccr, xer, fpscr ++ */ ++#define NUMREGBYTES ((68 * 8) + (3 * 4)) ++#if 0 ++/* The following adds in vector registers and vector state registers. */ ++/* 128 bit (16 byte) registers: ++ * 32 vr ++ * 64 bit (8 byte) registers: ++ * 32 gpr, 32 fpr, nip, msr, link, ctr ++ * 32 bit (4 byte) registers: ++ * ccr, xer, fpscr, vscr, vrsave ++ */ ++#define NUMREGBYTES ((128 * 16) + (68 * 8) + (5 * 4)) ++#endif ++#define NUMCRITREGBYTES 184 ++#else /* CONFIG_PPC32 */ ++/* On non-E500 family PPC32 we determine the size by picking the last ++ * register we need, but on E500 we skip sections so we list what we ++ * need to store, and add it up. */ ++#ifndef CONFIG_E500 ++#define MAXREG (PT_FPSCR+1) ++#else ++/* 32 GPRs (8 bytes), nip, msr, ccr, link, ctr, xer, acc (8 bytes), spefscr*/ ++#define MAXREG ((32*2)+6+2+1) ++#endif ++#define NUMREGBYTES (MAXREG * sizeof(int)) ++/* CR/LR, R1, R2, R13-R31 inclusive. */ ++#define NUMCRITREGBYTES (23 * sizeof(int)) ++#endif /* 32/64 */ ++#endif /* !(__ASSEMBLY__) */ ++#endif /* !__POWERPC_KGDB_H__ */ ++#endif /* __KERNEL__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/kgdb.h linux-2.6.18.kgdb/include/asm-ppc/kgdb.h +--- linux-2.6.18/include/asm-ppc/kgdb.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-ppc/kgdb.h 2008-06-10 16:19:22.000000000 +0400 +@@ -1,57 +1,18 @@ +-/* +- * kgdb.h: Defines and declarations for serial line source level +- * remote debugging of the Linux kernel using gdb. +- * +- * PPC Mods (C) 1998 Michael Tesch (tesch@cs.wisc.edu) +- * +- * Copyright (C) 1995 David S. Miller (davem@caip.rutgers.edu) +- */ + #ifdef __KERNEL__ +-#ifndef _PPC_KGDB_H +-#define _PPC_KGDB_H +- ++#ifndef __PPC_KGDB_H__ ++#define __PPC_KGDB_H__ ++#include + #ifndef __ASSEMBLY__ +- +-/* Things specific to the gen550 backend. */ +-struct uart_port; +- +-extern void gen550_progress(char *, unsigned short); +-extern void gen550_kgdb_map_scc(void); +-extern void gen550_init(int, struct uart_port *); +- +-/* Things specific to the pmac backend. */ +-extern void zs_kgdb_hook(int tty_num); +- +-/* To init the kgdb engine. (called by serial hook)*/ +-extern void set_debug_traps(void); +- +-/* To enter the debugger explicitly. 
*/ +-extern void breakpoint(void); +- +-/* For taking exceptions +- * these are defined in traps.c +- */ +-extern int (*debugger)(struct pt_regs *regs); ++ /* For taking exceptions ++ * these are defined in traps.c ++ */ ++struct pt_regs; ++extern void (*debugger)(struct pt_regs *regs); + extern int (*debugger_bpt)(struct pt_regs *regs); + extern int (*debugger_sstep)(struct pt_regs *regs); + extern int (*debugger_iabr_match)(struct pt_regs *regs); + extern int (*debugger_dabr_match)(struct pt_regs *regs); + extern void (*debugger_fault_handler)(struct pt_regs *regs); +- +-/* What we bring to the party */ +-int kgdb_bpt(struct pt_regs *regs); +-int kgdb_sstep(struct pt_regs *regs); +-void kgdb(struct pt_regs *regs); +-int kgdb_iabr_match(struct pt_regs *regs); +-int kgdb_dabr_match(struct pt_regs *regs); +- +-/* +- * external low-level support routines (ie macserial.c) +- */ +-extern void kgdb_interruptible(int); /* control interrupts from serial */ +-extern void putDebugChar(char); /* write a single character */ +-extern char getDebugChar(void); /* read and return a single char */ +- +-#endif /* !(__ASSEMBLY__) */ +-#endif /* !(_PPC_KGDB_H) */ ++#endif /* !__ASSEMBLY__ */ ++#endif /* __PPC_KGDB_H__ */ + #endif /* __KERNEL__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/machdep.h linux-2.6.18.kgdb/include/asm-ppc/machdep.h +--- linux-2.6.18/include/asm-ppc/machdep.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-ppc/machdep.h 2008-06-10 16:19:22.000000000 +0400 +@@ -72,9 +72,7 @@ struct machdep_calls { + unsigned long (*find_end_of_memory)(void); + void (*setup_io_mappings)(void); + +- void (*early_serial_map)(void); + void (*progress)(char *, unsigned short); +- void (*kgdb_map_scc)(void); + + unsigned char (*nvram_read_val)(int addr); + void (*nvram_write_val)(int addr, unsigned char val); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/mv64x60.h linux-2.6.18.kgdb/include/asm-ppc/mv64x60.h +--- linux-2.6.18/include/asm-ppc/mv64x60.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-ppc/mv64x60.h 2008-06-10 16:19:22.000000000 +0400 +@@ -348,6 +348,8 @@ u32 mv64x60_calc_mem_size(struct mv64x60 + + void mv64x60_progress_init(u32 base); + void mv64x60_mpsc_progress(char *s, unsigned short hex); ++struct platform_device * mv64x60_early_get_pdev_data(const char *name, ++ int id, int remove); + + extern struct mv64x60_32bit_window + gt64260_32bit_windows[MV64x60_32BIT_WIN_COUNT]; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-ppc/mv64x60_defs.h linux-2.6.18.kgdb/include/asm-ppc/mv64x60_defs.h +--- linux-2.6.18/include/asm-ppc/mv64x60_defs.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-ppc/mv64x60_defs.h 2008-06-10 16:19:22.000000000 +0400 +@@ -57,7 +57,8 @@ + #define MV64x60_IRQ_I2C 37 + #define MV64x60_IRQ_BRG 39 + #define MV64x60_IRQ_MPSC_0 40 +-#define MV64x60_IRQ_MPSC_1 42 ++#define MV64360_IRQ_MPSC_1 41 ++#define GT64260_IRQ_MPSC_1 42 + #define MV64x60_IRQ_COMM 43 + #define MV64x60_IRQ_P0_GPP_0_7 56 + #define MV64x60_IRQ_P0_GPP_8_15 57 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-sh/kgdb.h linux-2.6.18.kgdb/include/asm-sh/kgdb.h +--- linux-2.6.18/include/asm-sh/kgdb.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-sh/kgdb.h 2008-06-10 16:19:58.000000000 +0400 +@@ -2,94 +2,40 @@ + * May be copied or modified under the terms of the GNU General Public + * License. See linux/COPYING for more information. 
+ * +- * Based on original code by Glenn Engel, Jim Kingdon, +- * David Grothe , Tigran Aivazian, and +- * Amit S. Kale +- * +- * Super-H port based on sh-stub.c (Ben Lee and Steve Chamberlain) by +- * Henry Bell +- * +- * Header file for low-level support for remote debug using GDB. ++ * Based on a file that was modified or based on files by: Glenn Engel, ++ * Jim Kingdon, David Grothe , Tigran Aivazian , ++ * Amit S. Kale , sh-stub.c from Ben Lee and ++ * Steve Chamberlain, Henry Bell ++ * ++ * Maintainer: Tom Rini + * + */ + + #ifndef __KGDB_H + #define __KGDB_H + +-#include +- +-struct console; ++#include ++/* Based on sh-gdb.c from gdb-6.1, Glenn ++ Engel at HP Ben Lee and Steve Chamberlain */ ++#define NUMREGBYTES 112 /* 92 */ ++#define NUMCRITREGBYTES (9 << 2) ++#define BUFMAX 400 + +-/* Same as pt_regs but has vbr in place of syscall_nr */ ++#ifndef __ASSEMBLY__ + struct kgdb_regs { + unsigned long regs[16]; + unsigned long pc; + unsigned long pr; +- unsigned long sr; + unsigned long gbr; ++ unsigned long vbr; + unsigned long mach; + unsigned long macl; +- unsigned long vbr; +-}; +- +-/* State info */ +-extern char kgdb_in_gdb_mode; +-extern int kgdb_done_init; +-extern int kgdb_enabled; +-extern int kgdb_nofault; /* Ignore bus errors (in gdb mem access) */ +-extern int kgdb_halt; /* Execute initial breakpoint at startup */ +-extern char in_nmi; /* Debounce flag to prevent NMI reentry*/ +- +-/* SCI */ +-extern int kgdb_portnum; +-extern int kgdb_baud; +-extern char kgdb_parity; +-extern char kgdb_bits; +-extern int kgdb_console_setup(struct console *, char *); +- +-/* Init and interface stuff */ +-extern int kgdb_init(void); +-extern int (*kgdb_serial_setup)(void); +-extern int (*kgdb_getchar)(void); +-extern void (*kgdb_putchar)(int); +- +-struct kgdb_sermap { +- char *name; +- int namelen; +- int (*setup_fn)(struct console *, char *); +- struct kgdb_sermap *next; ++ unsigned long sr; + }; +-extern void kgdb_register_sermap(struct kgdb_sermap *map); +-extern struct kgdb_sermap *kgdb_porttype; + +-/* Trap functions */ +-typedef void (kgdb_debug_hook_t)(struct pt_regs *regs); +-typedef void (kgdb_bus_error_hook_t)(void); +-extern kgdb_debug_hook_t *kgdb_debug_hook; +-extern kgdb_bus_error_hook_t *kgdb_bus_err_hook; +- +-extern void breakpoint(void); +- +-/* Console */ +-struct console; +-void kgdb_console_write(struct console *co, const char *s, unsigned count); +-void kgdb_console_init(void); +- +-/* Prototypes for jmp fns */ +-#define _JBLEN 9 +-typedef int jmp_buf[_JBLEN]; +-extern void longjmp(jmp_buf __jmpb, int __retval); +-extern int setjmp(jmp_buf __jmpb); +- +-/* Variadic macro to print our own message to the console */ +-#define KGDB_PRINTK(...) 
printk("KGDB: " __VA_ARGS__) +- +-/* Forced breakpoint */ +-#define BREAKPOINT() do { \ +- if (kgdb_enabled) { \ +- asm volatile("trapa #0xff"); \ +- } \ +-} while (0) ++#define BREAKPOINT() asm("trapa #0xff"); ++#define BREAK_INSTR_SIZE 2 ++#define CACHE_FLUSH_IS_SAFE 1 + + /* KGDB should be able to flush all kernel text space */ + #if defined(CONFIG_CPU_SH4) +@@ -102,30 +48,5 @@ extern int setjmp(jmp_buf __jmpb); + #else + #define kgdb_flush_icache_range(start, end) do { } while (0) + #endif +- +-/* Kernel assert macros */ +-#ifdef CONFIG_KGDB_KERNEL_ASSERTS +- +-/* Predefined conditions */ +-#define KA_VALID_ERRNO(errno) ((errno) > 0 && (errno) <= EMEDIUMTYPE) +-#define KA_VALID_PTR_ERR(ptr) KA_VALID_ERRNO(-PTR_ERR(ptr)) +-#define KA_VALID_KPTR(ptr) (!(ptr) || \ +- ((void *)(ptr) >= (void *)PAGE_OFFSET && \ +- (void *)(ptr) < ERR_PTR(-EMEDIUMTYPE))) +-#define KA_VALID_PTRORERR(errptr) \ +- (KA_VALID_KPTR(errptr) || KA_VALID_PTR_ERR(errptr)) +-#define KA_HELD_GKL() (current->lock_depth >= 0) +- +-/* The actual assert */ +-#define KGDB_ASSERT(condition, message) do { \ +- if (!(condition) && (kgdb_enabled)) { \ +- KGDB_PRINTK("Assertion failed at %s:%d: %s\n", \ +- __FILE__, __LINE__, message);\ +- BREAKPOINT(); \ +- } \ +-} while (0) +-#else +-#define KGDB_ASSERT(condition, message) +-#endif +- ++#endif /* !__ASSEMBLY__ */ + #endif +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-sh/system.h linux-2.6.18.kgdb/include/asm-sh/system.h +--- linux-2.6.18/include/asm-sh/system.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-sh/system.h 2008-06-10 16:19:47.000000000 +0400 +@@ -6,6 +6,7 @@ + * Copyright (C) 2002 Paul Mundt + */ + ++#include + + /* + * switch_to() should switch tasks to task nr n, first +@@ -260,6 +261,45 @@ static __inline__ unsigned long __xchg(u + return x; + } + ++static inline unsigned long __cmpxchg_u32(volatile int * m, unsigned long old, ++ unsigned long new) ++{ ++ __u32 retval; ++ unsigned long flags; ++ ++ local_irq_save(flags); ++ retval = *m; ++ if (retval == old) ++ *m = new; ++ local_irq_restore(flags); /* implies memory barrier */ ++ return retval; ++} ++ ++/* This function doesn't exist, so you'll get a linker error ++ * if something tries to do an invalid cmpxchg(). 
*/ ++extern void __cmpxchg_called_with_bad_pointer(void); ++ ++#define __HAVE_ARCH_CMPXCHG 1 ++ ++static inline unsigned long __cmpxchg(volatile void * ptr, unsigned long old, ++ unsigned long new, int size) ++{ ++ switch (size) { ++ case 4: ++ return __cmpxchg_u32(ptr, old, new); ++ } ++ __cmpxchg_called_with_bad_pointer(); ++ return old; ++} ++ ++#define cmpxchg(ptr,o,n) \ ++ ({ \ ++ __typeof__(*(ptr)) _o_ = (o); \ ++ __typeof__(*(ptr)) _n_ = (n); \ ++ (__typeof__(*(ptr))) __cmpxchg((ptr), (unsigned long)_o_, \ ++ (unsigned long)_n_, sizeof(*(ptr))); \ ++ }) ++ + /* XXX + * disable hlt during certain critical i/o operations + */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-x86_64/kdebug.h linux-2.6.18.kgdb/include/asm-x86_64/kdebug.h +--- linux-2.6.18/include/asm-x86_64/kdebug.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-x86_64/kdebug.h 2008-06-10 16:19:36.000000000 +0400 +@@ -34,6 +34,7 @@ enum die_val { + DIE_CALL, + DIE_NMI_IPI, + DIE_PAGE_FAULT, ++ DIE_PAGE_FAULT_NO_CONTEXT, + }; + + static inline int notify_die(enum die_val val, const char *str, +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-x86_64/kgdb.h linux-2.6.18.kgdb/include/asm-x86_64/kgdb.h +--- linux-2.6.18/include/asm-x86_64/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/asm-x86_64/kgdb.h 2008-06-10 16:19:58.000000000 +0400 +@@ -0,0 +1,54 @@ ++#ifdef __KERNEL__ ++#ifndef _ASM_KGDB_H_ ++#define _ASM_KGDB_H_ ++ ++/* ++ * Copyright (C) 2001-2004 Amit S. Kale ++ */ ++ ++#include ++ ++/* ++ * Note that this register image is in a different order than ++ * the register image that Linux produces at interrupt time. ++ * ++ * Linux's register image is defined by struct pt_regs in ptrace.h. ++ * Just why GDB uses a different order is a historical mystery. ++ */ ++#define _RAX 0 ++#define _RDX 1 ++#define _RCX 2 ++#define _RBX 3 ++#define _RSI 4 ++#define _RDI 5 ++#define _RBP 6 ++#define _RSP 7 ++#define _R8 8 ++#define _R9 9 ++#define _R10 10 ++#define _R11 11 ++#define _R12 12 ++#define _R13 13 ++#define _R14 14 ++#define _R15 15 ++#define _PC 16 ++#define _PS 17 ++ ++/* Number of bytes of registers. */ ++#define NUMREGBYTES ((_PS+1)*8) ++#define NUMCRITREGBYTES (8 * 8) /* 8 registers. */ ++ ++/* Help GDB to know when to stop backtracing. */ ++#define CFI_END_FRAME(func) __CFI_END_FRAME(_PC,_RSP,func) ++#ifndef __ASSEMBLY__ ++/* BUFMAX defines the maximum number of characters in inbound/outbound ++ * buffers at least NUMREGBYTES*2 are needed for register packets, and ++ * a longer buffer is needed to list all threads. 
*/ ++#define BUFMAX 1024 ++#define BREAKPOINT() asm(" int $3"); ++#define CHECK_EXCEPTION_STACK() ((&__get_cpu_var(init_tss))[0].ist[0]) ++#define BREAK_INSTR_SIZE 1 ++#define CACHE_FLUSH_IS_SAFE 1 ++#endif /* !__ASSEMBLY__ */ ++#endif /* _ASM_KGDB_H_ */ ++#endif /* __KERNEL__ */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/asm-x86_64/system.h linux-2.6.18.kgdb/include/asm-x86_64/system.h +--- linux-2.6.18/include/asm-x86_64/system.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/asm-x86_64/system.h 2008-06-10 16:19:42.000000000 +0400 +@@ -21,7 +21,9 @@ + ,"rcx","rbx","rdx","r8","r9","r10","r11","r12","r13","r14","r15" + + #define switch_to(prev,next,last) \ +- asm volatile(SAVE_CONTEXT \ ++ asm volatile(".globl __switch_to_begin\n\t" \ ++ "__switch_to_begin:\n\t" \ ++ SAVE_CONTEXT \ + "movq %%rsp,%P[threadrsp](%[prev])\n\t" /* save RSP */ \ + "movq %P[threadrsp](%[next]),%%rsp\n\t" /* restore RSP */ \ + "call __switch_to\n\t" \ +@@ -33,6 +35,8 @@ + "movq %%rax,%%rdi\n\t" \ + "jc ret_from_fork\n\t" \ + RESTORE_CONTEXT \ ++ ".globl __switch_to_end\n\t" \ ++ "__switch_to_end:\n\t" \ + : "=a" (last) \ + : [next] "S" (next), [prev] "D" (prev), \ + [threadrsp] "i" (offsetof(struct task_struct, thread.rsp)), \ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/dwarf2-defs.h linux-2.6.18.kgdb/include/linux/dwarf2-defs.h +--- linux-2.6.18/include/linux/dwarf2-defs.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/linux/dwarf2-defs.h 2008-06-10 16:22:59.000000000 +0400 +@@ -0,0 +1,515 @@ ++#ifndef _ELF_DWARF_H ++/* Machine generated from dwarf2.h by scripts/dwarfh.awk */ ++#define _ELF_DWARF2_H ++#define DW_TAG_padding 0x00 ++#define DW_TAG_array_type 0x01 ++#define DW_TAG_class_type 0x02 ++#define DW_TAG_entry_point 0x03 ++#define DW_TAG_enumeration_type 0x04 ++#define DW_TAG_formal_parameter 0x05 ++#define DW_TAG_imported_declaration 0x08 ++#define DW_TAG_label 0x0a ++#define DW_TAG_lexical_block 0x0b ++#define DW_TAG_member 0x0d ++#define DW_TAG_pointer_type 0x0f ++#define DW_TAG_reference_type 0x10 ++#define DW_TAG_compile_unit 0x11 ++#define DW_TAG_string_type 0x12 ++#define DW_TAG_structure_type 0x13 ++#define DW_TAG_subroutine_type 0x15 ++#define DW_TAG_typedef 0x16 ++#define DW_TAG_union_type 0x17 ++#define DW_TAG_unspecified_parameters 0x18 ++#define DW_TAG_variant 0x19 ++#define DW_TAG_common_block 0x1a ++#define DW_TAG_common_inclusion 0x1b ++#define DW_TAG_inheritance 0x1c ++#define DW_TAG_inlined_subroutine 0x1d ++#define DW_TAG_module 0x1e ++#define DW_TAG_ptr_to_member_type 0x1f ++#define DW_TAG_set_type 0x20 ++#define DW_TAG_subrange_type 0x21 ++#define DW_TAG_with_stmt 0x22 ++#define DW_TAG_access_declaration 0x23 ++#define DW_TAG_base_type 0x24 ++#define DW_TAG_catch_block 0x25 ++#define DW_TAG_const_type 0x26 ++#define DW_TAG_constant 0x27 ++#define DW_TAG_enumerator 0x28 ++#define DW_TAG_file_type 0x29 ++#define DW_TAG_friend 0x2a ++#define DW_TAG_namelist 0x2b ++#define DW_TAG_namelist_item 0x2c ++#define DW_TAG_packed_type 0x2d ++#define DW_TAG_subprogram 0x2e ++#define DW_TAG_template_type_param 0x2f ++#define DW_TAG_template_value_param 0x30 ++#define DW_TAG_thrown_type 0x31 ++#define DW_TAG_try_block 0x32 ++#define DW_TAG_variant_part 0x33 ++#define DW_TAG_variable 0x34 ++#define DW_TAG_volatile_type 0x35 ++#define DW_TAG_dwarf_procedure 0x36 ++#define DW_TAG_restrict_type 0x37 ++#define DW_TAG_interface_type 0x38 ++#define DW_TAG_namespace 0x39 ++#define DW_TAG_imported_module 0x3a ++#define 
DW_TAG_unspecified_type 0x3b ++#define DW_TAG_partial_unit 0x3c ++#define DW_TAG_imported_unit 0x3d ++#define DW_TAG_MIPS_loop 0x4081 ++#define DW_TAG_HP_array_descriptor 0x4090 ++#define DW_TAG_format_label 0x4101 ++#define DW_TAG_function_template 0x4102 ++#define DW_TAG_class_template 0x4103 ++#define DW_TAG_GNU_BINCL 0x4104 ++#define DW_TAG_GNU_EINCL 0x4105 ++#define DW_TAG_upc_shared_type 0x8765 ++#define DW_TAG_upc_strict_type 0x8766 ++#define DW_TAG_upc_relaxed_type 0x8767 ++#define DW_TAG_PGI_kanji_type 0xA000 ++#define DW_TAG_PGI_interface_block 0xA020 ++#define DW_TAG_lo_user 0x4080 ++#define DW_TAG_hi_user 0xffff ++#define DW_children_no 0 ++#define DW_children_yes 1 ++#define DW_FORM_addr 0x01 ++#define DW_FORM_block2 0x03 ++#define DW_FORM_block4 0x04 ++#define DW_FORM_data2 0x05 ++#define DW_FORM_data4 0x06 ++#define DW_FORM_data8 0x07 ++#define DW_FORM_string 0x08 ++#define DW_FORM_block 0x09 ++#define DW_FORM_block1 0x0a ++#define DW_FORM_data1 0x0b ++#define DW_FORM_flag 0x0c ++#define DW_FORM_sdata 0x0d ++#define DW_FORM_strp 0x0e ++#define DW_FORM_udata 0x0f ++#define DW_FORM_ref_addr 0x10 ++#define DW_FORM_ref1 0x11 ++#define DW_FORM_ref2 0x12 ++#define DW_FORM_ref4 0x13 ++#define DW_FORM_ref8 0x14 ++#define DW_FORM_ref_udata 0x15 ++#define DW_FORM_indirect 0x16 ++#define DW_AT_sibling 0x01 ++#define DW_AT_location 0x02 ++#define DW_AT_name 0x03 ++#define DW_AT_ordering 0x09 ++#define DW_AT_subscr_data 0x0a ++#define DW_AT_byte_size 0x0b ++#define DW_AT_bit_offset 0x0c ++#define DW_AT_bit_size 0x0d ++#define DW_AT_element_list 0x0f ++#define DW_AT_stmt_list 0x10 ++#define DW_AT_low_pc 0x11 ++#define DW_AT_high_pc 0x12 ++#define DW_AT_language 0x13 ++#define DW_AT_member 0x14 ++#define DW_AT_discr 0x15 ++#define DW_AT_discr_value 0x16 ++#define DW_AT_visibility 0x17 ++#define DW_AT_import 0x18 ++#define DW_AT_string_length 0x19 ++#define DW_AT_common_reference 0x1a ++#define DW_AT_comp_dir 0x1b ++#define DW_AT_const_value 0x1c ++#define DW_AT_containing_type 0x1d ++#define DW_AT_default_value 0x1e ++#define DW_AT_inline 0x20 ++#define DW_AT_is_optional 0x21 ++#define DW_AT_lower_bound 0x22 ++#define DW_AT_producer 0x25 ++#define DW_AT_prototyped 0x27 ++#define DW_AT_return_addr 0x2a ++#define DW_AT_start_scope 0x2c ++#define DW_AT_stride_size 0x2e ++#define DW_AT_upper_bound 0x2f ++#define DW_AT_abstract_origin 0x31 ++#define DW_AT_accessibility 0x32 ++#define DW_AT_address_class 0x33 ++#define DW_AT_artificial 0x34 ++#define DW_AT_base_types 0x35 ++#define DW_AT_calling_convention 0x36 ++#define DW_AT_count 0x37 ++#define DW_AT_data_member_location 0x38 ++#define DW_AT_decl_column 0x39 ++#define DW_AT_decl_file 0x3a ++#define DW_AT_decl_line 0x3b ++#define DW_AT_declaration 0x3c ++#define DW_AT_discr_list 0x3d ++#define DW_AT_encoding 0x3e ++#define DW_AT_external 0x3f ++#define DW_AT_frame_base 0x40 ++#define DW_AT_friend 0x41 ++#define DW_AT_identifier_case 0x42 ++#define DW_AT_macro_info 0x43 ++#define DW_AT_namelist_items 0x44 ++#define DW_AT_priority 0x45 ++#define DW_AT_segment 0x46 ++#define DW_AT_specification 0x47 ++#define DW_AT_static_link 0x48 ++#define DW_AT_type 0x49 ++#define DW_AT_use_location 0x4a ++#define DW_AT_variable_parameter 0x4b ++#define DW_AT_virtuality 0x4c ++#define DW_AT_vtable_elem_location 0x4d ++#define DW_AT_allocated 0x4e ++#define DW_AT_associated 0x4f ++#define DW_AT_data_location 0x50 ++#define DW_AT_stride 0x51 ++#define DW_AT_entry_pc 0x52 ++#define DW_AT_use_UTF8 0x53 ++#define DW_AT_extension 0x54 ++#define DW_AT_ranges 0x55 
++#define DW_AT_trampoline 0x56 ++#define DW_AT_call_column 0x57 ++#define DW_AT_call_file 0x58 ++#define DW_AT_call_line 0x59 ++#define DW_AT_MIPS_fde 0x2001 ++#define DW_AT_MIPS_loop_begin 0x2002 ++#define DW_AT_MIPS_tail_loop_begin 0x2003 ++#define DW_AT_MIPS_epilog_begin 0x2004 ++#define DW_AT_MIPS_loop_unroll_factor 0x2005 ++#define DW_AT_MIPS_software_pipeline_depth 0x2006 ++#define DW_AT_MIPS_linkage_name 0x2007 ++#define DW_AT_MIPS_stride 0x2008 ++#define DW_AT_MIPS_abstract_name 0x2009 ++#define DW_AT_MIPS_clone_origin 0x200a ++#define DW_AT_MIPS_has_inlines 0x200b ++#define DW_AT_HP_block_index 0x2000 ++#define DW_AT_HP_unmodifiable 0x2001 ++#define DW_AT_HP_actuals_stmt_list 0x2010 ++#define DW_AT_HP_proc_per_section 0x2011 ++#define DW_AT_HP_raw_data_ptr 0x2012 ++#define DW_AT_HP_pass_by_reference 0x2013 ++#define DW_AT_HP_opt_level 0x2014 ++#define DW_AT_HP_prof_version_id 0x2015 ++#define DW_AT_HP_opt_flags 0x2016 ++#define DW_AT_HP_cold_region_low_pc 0x2017 ++#define DW_AT_HP_cold_region_high_pc 0x2018 ++#define DW_AT_HP_all_variables_modifiable 0x2019 ++#define DW_AT_HP_linkage_name 0x201a ++#define DW_AT_HP_prof_flags 0x201b ++#define DW_AT_sf_names 0x2101 ++#define DW_AT_src_info 0x2102 ++#define DW_AT_mac_info 0x2103 ++#define DW_AT_src_coords 0x2104 ++#define DW_AT_body_begin 0x2105 ++#define DW_AT_body_end 0x2106 ++#define DW_AT_GNU_vector 0x2107 ++#define DW_AT_VMS_rtnbeg_pd_address 0x2201 ++#define DW_AT_upc_threads_scaled 0x3210 ++#define DW_AT_PGI_lbase 0x3a00 ++#define DW_AT_PGI_soffset 0x3a01 ++#define DW_AT_PGI_lstride 0x3a02 ++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ ++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ ++#define DW_OP_addr 0x03 ++#define DW_OP_deref 0x06 ++#define DW_OP_const1u 0x08 ++#define DW_OP_const1s 0x09 ++#define DW_OP_const2u 0x0a ++#define DW_OP_const2s 0x0b ++#define DW_OP_const4u 0x0c ++#define DW_OP_const4s 0x0d ++#define DW_OP_const8u 0x0e ++#define DW_OP_const8s 0x0f ++#define DW_OP_constu 0x10 ++#define DW_OP_consts 0x11 ++#define DW_OP_dup 0x12 ++#define DW_OP_drop 0x13 ++#define DW_OP_over 0x14 ++#define DW_OP_pick 0x15 ++#define DW_OP_swap 0x16 ++#define DW_OP_rot 0x17 ++#define DW_OP_xderef 0x18 ++#define DW_OP_abs 0x19 ++#define DW_OP_and 0x1a ++#define DW_OP_div 0x1b ++#define DW_OP_minus 0x1c ++#define DW_OP_mod 0x1d ++#define DW_OP_mul 0x1e ++#define DW_OP_neg 0x1f ++#define DW_OP_not 0x20 ++#define DW_OP_or 0x21 ++#define DW_OP_plus 0x22 ++#define DW_OP_plus_uconst 0x23 ++#define DW_OP_shl 0x24 ++#define DW_OP_shr 0x25 ++#define DW_OP_shra 0x26 ++#define DW_OP_xor 0x27 ++#define DW_OP_bra 0x28 ++#define DW_OP_eq 0x29 ++#define DW_OP_ge 0x2a ++#define DW_OP_gt 0x2b ++#define DW_OP_le 0x2c ++#define DW_OP_lt 0x2d ++#define DW_OP_ne 0x2e ++#define DW_OP_skip 0x2f ++#define DW_OP_lit0 0x30 ++#define DW_OP_lit1 0x31 ++#define DW_OP_lit2 0x32 ++#define DW_OP_lit3 0x33 ++#define DW_OP_lit4 0x34 ++#define DW_OP_lit5 0x35 ++#define DW_OP_lit6 0x36 ++#define DW_OP_lit7 0x37 ++#define DW_OP_lit8 0x38 ++#define DW_OP_lit9 0x39 ++#define DW_OP_lit10 0x3a ++#define DW_OP_lit11 0x3b ++#define DW_OP_lit12 0x3c ++#define DW_OP_lit13 0x3d ++#define DW_OP_lit14 0x3e ++#define DW_OP_lit15 0x3f ++#define DW_OP_lit16 0x40 ++#define DW_OP_lit17 0x41 ++#define DW_OP_lit18 0x42 ++#define DW_OP_lit19 0x43 ++#define DW_OP_lit20 0x44 ++#define DW_OP_lit21 0x45 ++#define DW_OP_lit22 0x46 ++#define DW_OP_lit23 0x47 ++#define DW_OP_lit24 0x48 ++#define DW_OP_lit25 0x49 ++#define DW_OP_lit26 0x4a 
++#define DW_OP_lit27 0x4b ++#define DW_OP_lit28 0x4c ++#define DW_OP_lit29 0x4d ++#define DW_OP_lit30 0x4e ++#define DW_OP_lit31 0x4f ++#define DW_OP_reg0 0x50 ++#define DW_OP_reg1 0x51 ++#define DW_OP_reg2 0x52 ++#define DW_OP_reg3 0x53 ++#define DW_OP_reg4 0x54 ++#define DW_OP_reg5 0x55 ++#define DW_OP_reg6 0x56 ++#define DW_OP_reg7 0x57 ++#define DW_OP_reg8 0x58 ++#define DW_OP_reg9 0x59 ++#define DW_OP_reg10 0x5a ++#define DW_OP_reg11 0x5b ++#define DW_OP_reg12 0x5c ++#define DW_OP_reg13 0x5d ++#define DW_OP_reg14 0x5e ++#define DW_OP_reg15 0x5f ++#define DW_OP_reg16 0x60 ++#define DW_OP_reg17 0x61 ++#define DW_OP_reg18 0x62 ++#define DW_OP_reg19 0x63 ++#define DW_OP_reg20 0x64 ++#define DW_OP_reg21 0x65 ++#define DW_OP_reg22 0x66 ++#define DW_OP_reg23 0x67 ++#define DW_OP_reg24 0x68 ++#define DW_OP_reg25 0x69 ++#define DW_OP_reg26 0x6a ++#define DW_OP_reg27 0x6b ++#define DW_OP_reg28 0x6c ++#define DW_OP_reg29 0x6d ++#define DW_OP_reg30 0x6e ++#define DW_OP_reg31 0x6f ++#define DW_OP_breg0 0x70 ++#define DW_OP_breg1 0x71 ++#define DW_OP_breg2 0x72 ++#define DW_OP_breg3 0x73 ++#define DW_OP_breg4 0x74 ++#define DW_OP_breg5 0x75 ++#define DW_OP_breg6 0x76 ++#define DW_OP_breg7 0x77 ++#define DW_OP_breg8 0x78 ++#define DW_OP_breg9 0x79 ++#define DW_OP_breg10 0x7a ++#define DW_OP_breg11 0x7b ++#define DW_OP_breg12 0x7c ++#define DW_OP_breg13 0x7d ++#define DW_OP_breg14 0x7e ++#define DW_OP_breg15 0x7f ++#define DW_OP_breg16 0x80 ++#define DW_OP_breg17 0x81 ++#define DW_OP_breg18 0x82 ++#define DW_OP_breg19 0x83 ++#define DW_OP_breg20 0x84 ++#define DW_OP_breg21 0x85 ++#define DW_OP_breg22 0x86 ++#define DW_OP_breg23 0x87 ++#define DW_OP_breg24 0x88 ++#define DW_OP_breg25 0x89 ++#define DW_OP_breg26 0x8a ++#define DW_OP_breg27 0x8b ++#define DW_OP_breg28 0x8c ++#define DW_OP_breg29 0x8d ++#define DW_OP_breg30 0x8e ++#define DW_OP_breg31 0x8f ++#define DW_OP_regx 0x90 ++#define DW_OP_fbreg 0x91 ++#define DW_OP_bregx 0x92 ++#define DW_OP_piece 0x93 ++#define DW_OP_deref_size 0x94 ++#define DW_OP_xderef_size 0x95 ++#define DW_OP_nop 0x96 ++#define DW_OP_push_object_address 0x97 ++#define DW_OP_call2 0x98 ++#define DW_OP_call4 0x99 ++#define DW_OP_call_ref 0x9a ++#define DW_OP_GNU_push_tls_address 0xe0 ++#define DW_OP_HP_unknown 0xe0 ++#define DW_OP_HP_is_value 0xe1 ++#define DW_OP_HP_fltconst4 0xe2 ++#define DW_OP_HP_fltconst8 0xe3 ++#define DW_OP_HP_mod_range 0xe4 ++#define DW_OP_HP_unmod_range 0xe5 ++#define DW_OP_HP_tls 0xe6 ++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ ++#define DW_OP_hi_user 0xff /* Implementation-defined range end. 
*/ ++#define DW_ATE_void 0x0 ++#define DW_ATE_address 0x1 ++#define DW_ATE_boolean 0x2 ++#define DW_ATE_complex_float 0x3 ++#define DW_ATE_float 0x4 ++#define DW_ATE_signed 0x5 ++#define DW_ATE_signed_char 0x6 ++#define DW_ATE_unsigned 0x7 ++#define DW_ATE_unsigned_char 0x8 ++#define DW_ATE_imaginary_float 0x9 ++#define DW_ATE_HP_float80 0x80 ++#define DW_ATE_HP_complex_float80 0x81 ++#define DW_ATE_HP_float128 0x82 ++#define DW_ATE_HP_complex_float128 0x83 ++#define DW_ATE_HP_floathpintel 0x84 ++#define DW_ATE_HP_imaginary_float80 0x85 ++#define DW_ATE_HP_imaginary_float128 0x86 ++#define DW_ATE_lo_user 0x80 ++#define DW_ATE_hi_user 0xff ++#define DW_ORD_row_major 0 ++#define DW_ORD_col_major 1 ++#define DW_ACCESS_public 1 ++#define DW_ACCESS_protected 2 ++#define DW_ACCESS_private 3 ++#define DW_VIS_local 1 ++#define DW_VIS_exported 2 ++#define DW_VIS_qualified 3 ++#define DW_VIRTUALITY_none 0 ++#define DW_VIRTUALITY_virtual 1 ++#define DW_VIRTUALITY_pure_virtual 2 ++#define DW_ID_case_sensitive 0 ++#define DW_ID_up_case 1 ++#define DW_ID_down_case 2 ++#define DW_ID_case_insensitive 3 ++#define DW_CC_normal 0x1 ++#define DW_CC_program 0x2 ++#define DW_CC_nocall 0x3 ++#define DW_CC_lo_user 0x40 ++#define DW_CC_hi_user 0xff ++#define DW_INL_not_inlined 0 ++#define DW_INL_inlined 1 ++#define DW_INL_declared_not_inlined 2 ++#define DW_INL_declared_inlined 3 ++#define DW_DSC_label 0 ++#define DW_DSC_range 1 ++#define DW_LNS_extended_op 0 ++#define DW_LNS_copy 1 ++#define DW_LNS_advance_pc 2 ++#define DW_LNS_advance_line 3 ++#define DW_LNS_set_file 4 ++#define DW_LNS_set_column 5 ++#define DW_LNS_negate_stmt 6 ++#define DW_LNS_set_basic_block 7 ++#define DW_LNS_const_add_pc 8 ++#define DW_LNS_fixed_advance_pc 9 ++#define DW_LNS_set_prologue_end 10 ++#define DW_LNS_set_epilogue_begin 11 ++#define DW_LNS_set_isa 12 ++#define DW_LNE_end_sequence 1 ++#define DW_LNE_set_address 2 ++#define DW_LNE_define_file 3 ++#define DW_LNE_HP_negate_is_UV_update 0x11 ++#define DW_LNE_HP_push_context 0x12 ++#define DW_LNE_HP_pop_context 0x13 ++#define DW_LNE_HP_set_file_line_column 0x14 ++#define DW_LNE_HP_set_routine_name 0x15 ++#define DW_LNE_HP_set_sequence 0x16 ++#define DW_LNE_HP_negate_post_semantics 0x17 ++#define DW_LNE_HP_negate_function_exit 0x18 ++#define DW_LNE_HP_negate_front_end_logical 0x19 ++#define DW_LNE_HP_define_proc 0x20 ++#define DW_CFA_advance_loc 0x40 ++#define DW_CFA_offset 0x80 ++#define DW_CFA_restore 0xc0 ++#define DW_CFA_nop 0x00 ++#define DW_CFA_set_loc 0x01 ++#define DW_CFA_advance_loc1 0x02 ++#define DW_CFA_advance_loc2 0x03 ++#define DW_CFA_advance_loc4 0x04 ++#define DW_CFA_offset_extended 0x05 ++#define DW_CFA_restore_extended 0x06 ++#define DW_CFA_undefined 0x07 ++#define DW_CFA_same_value 0x08 ++#define DW_CFA_register 0x09 ++#define DW_CFA_remember_state 0x0a ++#define DW_CFA_restore_state 0x0b ++#define DW_CFA_def_cfa 0x0c ++#define DW_CFA_def_cfa_register 0x0d ++#define DW_CFA_def_cfa_offset 0x0e ++#define DW_CFA_def_cfa_expression 0x0f ++#define DW_CFA_expression 0x10 ++#define DW_CFA_offset_extended_sf 0x11 ++#define DW_CFA_def_cfa_sf 0x12 ++#define DW_CFA_def_cfa_offset_sf 0x13 ++#define DW_CFA_MIPS_advance_loc8 0x1d ++#define DW_CFA_GNU_window_save 0x2d ++#define DW_CFA_GNU_args_size 0x2e ++#define DW_CFA_GNU_negative_offset_extended 0x2f ++#define DW_CIE_ID 0xffffffff ++#define DW_CIE_VERSION 1 ++#define DW_CFA_extended 0 ++#define DW_CFA_lo_user 0x1c ++#define DW_CFA_hi_user 0x3f ++#define DW_CHILDREN_no 0x00 ++#define DW_CHILDREN_yes 0x01 ++#define DW_ADDR_none 0 
++#define DW_LANG_C89 0x0001
++#define DW_LANG_C 0x0002
++#define DW_LANG_Ada83 0x0003
++#define DW_LANG_C_plus_plus 0x0004
++#define DW_LANG_Cobol74 0x0005
++#define DW_LANG_Cobol85 0x0006
++#define DW_LANG_Fortran77 0x0007
++#define DW_LANG_Fortran90 0x0008
++#define DW_LANG_Pascal83 0x0009
++#define DW_LANG_Modula2 0x000a
++#define DW_LANG_Java 0x000b
++#define DW_LANG_C99 0x000c
++#define DW_LANG_Ada95 0x000d
++#define DW_LANG_Fortran95 0x000e
++#define DW_LANG_Mips_Assembler 0x8001
++#define DW_LANG_Upc 0x8765
++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */
++#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */
++#define DW_MACINFO_define 1
++#define DW_MACINFO_undef 2
++#define DW_MACINFO_start_file 3
++#define DW_MACINFO_end_file 4
++#define DW_MACINFO_vendor_ext 255
++#define DW_EH_PE_absptr 0x00
++#define DW_EH_PE_omit 0xff
++#define DW_EH_PE_uleb128 0x01
++#define DW_EH_PE_udata2 0x02
++#define DW_EH_PE_udata4 0x03
++#define DW_EH_PE_udata8 0x04
++#define DW_EH_PE_sleb128 0x09
++#define DW_EH_PE_sdata2 0x0A
++#define DW_EH_PE_sdata4 0x0B
++#define DW_EH_PE_sdata8 0x0C
++#define DW_EH_PE_signed 0x08
++#define DW_EH_PE_pcrel 0x10
++#define DW_EH_PE_textrel 0x20
++#define DW_EH_PE_datarel 0x30
++#define DW_EH_PE_funcrel 0x40
++#define DW_EH_PE_aligned 0x50
++#define DW_EH_PE_indirect 0x80
++#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/dwarf2-lang.h linux-2.6.18.kgdb/include/linux/dwarf2-lang.h
+--- linux-2.6.18/include/linux/dwarf2-lang.h 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/linux/dwarf2-lang.h 2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,300 @@
++#ifndef DWARF2_LANG
++#define DWARF2_LANG
++
++/*
++ * This is free software; you can redistribute it and/or modify it under
++ * the terms of the GNU General Public License as published by the Free
++ * Software Foundation; either version 2, or (at your option) any later
++ * version.
++ */
++/*
++ * This file defines macros that allow generation of DWARF debug records
++ * for asm files. This file is platform independent. Register numbers
++ * (which are about the only thing that is platform dependent) are to be
++ * supplied by a platform defined file.
++ */
++/*
++ * We need this to work for both asm and C. In asm we are using the
++ * old comment trick to concatenate while C uses the new ANSI thing.
++ * Here we have a concat macro... The multi-level thing is to allow any
++ * macros used in the names to be resolved prior to the cat (at which
++ * time they are no longer the same string).
++ */
++#define CAT3(a,b,c) _CAT3(a,b,c)
++#define _CAT3(a,b,c) __CAT3(a,b,c)
++#ifndef __STDC__
++#define __CAT3(a,b,c) a/**/b/**/c
++#else
++#define __CAT3(a,b,c) a##b##c
++#endif
++#ifdef __ASSEMBLY__
++#define IFC(a)
++#define IFN_C(a) a
++#define NL ;
++#define QUOTE_THIS(a) a
++#define DWARF_preamble .section .debug_frame,"",%progbits;
++#else
++#define IFC(a) a
++#define IFN_C(a)
++#define NL \n\t
++#define QUOTE_THIS(a) _QUOTE_THIS(a)
++#define _QUOTE_THIS(a) #a
++/* Don't let CPP see the " and , \042=" \054=, */
++#define DWARF_preamble .section .debug_frame \054\042\042\054%progbits
++#endif
++
++#ifdef CONFIG_64BIT
++#define DATA_ALIGN_FACTOR 8
++#define ADDR_LOC .quad
++#else
++#define DATA_ALIGN_FACTOR 4
++#define ADDR_LOC .long
++#endif
++
++#include
++/*
++ * This macro starts a debug frame section. The debug_frame describes
++ * where to find the registers that the enclosing function saved on
++ * entry.
++ *
++ * ORD is used by the label generator and should be the same as what is
++ * passed to CFI_postamble.
++ *
++ * pc, pc register gdb ordinal.
++ *
++ * code_align this is the factor used to define locations or regions
++ * where the given definitions apply. If you use labels to define these,
++ * this should be 1.
++ *
++ * data_align this is the factor used to define register offsets. If
++ * you use struct offset, this should be the size of the register in
++ * bytes or the negative of that. This is how it is used: you will
++ * define a register as the reference register, say the stack pointer,
++ * then you will say where a register is located relative to this
++ * reference register's value, say 40 for register 3 (the gdb register
++ * number). The <40> will be multiplied by <data_align> to define the
++ * byte offset of the given register (3, in this example). So if your
++ * <40> is the byte offset and the reference register points at the
++ * beginning, you would want 1 for data_align. If <40> was the 40th
++ * 4-byte element in that structure you would want 4. And if your
++ * reference register points at the end of the structure you would want
++ * a negative data_align value (and you would have to do other math as
++ * well).
++ */
++
++#define CFI_preamble(ORD, pc, code_align, data_align) \
++ DWARF_preamble NL \
++ .align DATA_ALIGN_FACTOR NL \
++ .globl CAT3(frame,_,ORD) NL \
++CAT3(frame,_,ORD): NL \
++ .long 7f-6f NL \
++6: \
++ .long DW_CIE_ID NL \
++ .byte DW_CIE_VERSION NL \
++ .byte 0 NL \
++ .uleb128 code_align NL \
++ .sleb128 data_align NL \
++ .byte pc NL
++
++/*
++ * After the above macro and prior to the CFI_postamble, you need to
++ * define the initial state. This starts with defining the reference
++ * register and, usually, the pc. Here are some helper macros:
++ */
++
++#define CFA_define_reference(reg, offset) \
++ .byte DW_CFA_def_cfa NL \
++ .uleb128 reg NL \
++ .uleb128 (offset) NL
++
++#define CFA_define_offset(reg, offset) \
++ .byte (DW_CFA_offset + reg) NL \
++ .uleb128 (offset) NL
++
++#define CFA_restore(reg) \
++ .byte (DW_CFA_restore + reg) NL
++
++#define CFI_postamble() \
++ .align DATA_ALIGN_FACTOR NL \
++7: NL \
++.previous NL
++
++/*
++ * So now your code pushes stuff on the stack, you need a new location
++ * and the rules for what to do. This starts a running description of
++ * the call frame. You need to describe what changes with respect to
++ * the call registers as the location of the pc moves through the code.
++ * The following builds an FDE (frame description entry). Like the
++ * above, it has a preamble and a postamble. It also is tied to the CFI
++ * above.
++ * The preamble macro is tied to the CFI through the first parameter. The
++ * second is the code start address and then the code end address+1.
++ */
++#define FDE_preamble(ORD, initial_address, end_address) \
++ DWARF_preamble NL \
++ .align DATA_ALIGN_FACTOR NL \
++ .long 9f-8f NL \
++8: \
++ .long CAT3(frame,_,ORD) NL \
++ ADDR_LOC initial_address NL \
++ ADDR_LOC (end_address - initial_address) NL
++
++#define FDE_postamble() \
++ .align DATA_ALIGN_FACTOR NL \
++9: NL \
++.previous NL
++
++/*
++ * That done, you can now add registers, subtract registers, move the
++ * reference and even change the reference. You can also define a new
++ * area of code the info applies to. For discontinuous bits you should
++ * start a new FDE. You may have as many as you like.
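++ *
++ * For a concrete (if rough) picture of how these pieces compose, the
++ * __CFI_END_FRAME helper added in asm-generic/kgdb.h by this patch
++ * chains them approximately as follows (label spelling simplified
++ * here for illustration):
++ *
++ *	.Lend_func:
++ *	CFI_preamble(func, pc, 0x1, -DATA_ALIGN_FACTOR)
++ *	CFA_define_reference(sp, 0)
++ *	CFA_undefine_reg(pc)
++ *	CFI_postamble()
++ *	FDE_preamble(func, func, .Lend_func)
++ *	FDE_postamble()
++ *
++ * which pins down the end of a frame so GDB stops unwinding there.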
++ */
++
++/*
++ * To advance the stack address by (0x3f max)
++ */
++
++#define CFA_advance_loc(bytes) \
++ .byte DW_CFA_advance_loc+bytes NL
++
++/*
++ * This one is good for 0xff or 255
++ */
++#define CFA_advance_loc1(bytes) \
++ .byte DW_CFA_advance_loc1 NL \
++ .byte bytes NL
++
++#define CFA_undefine_reg(reg) \
++ .byte DW_CFA_undefined NL \
++ .uleb128 reg NL
++/*
++ * With the above you can define all the register locations. But
++ * suppose the reference register moves... This takes the new offset,
++ * NOT an increment. This is how esp is tracked if it is not saved.
++ */
++
++#define CFA_define_cfa_offset(offset) \
++ .byte DW_CFA_def_cfa_offset NL \
++ .uleb128 (offset) NL
++/*
++ * Or suppose you want to use a different reference register...
++ */
++#define CFA_define_cfa_register(reg) \
++ .byte DW_CFA_def_cfa_register NL \
++ .uleb128 reg NL
++
++/*
++ * If you want to mess with the stack pointer, here is the expression.
++ * The stack starts empty.
++ */
++#define CFA_def_cfa_expression \
++ .byte DW_CFA_def_cfa_expression NL \
++ .uleb128 20f-10f NL \
++10: NL
++/*
++ * This expression is to be used for other regs. The stack starts with the
++ * stack address.
++ */
++
++#define CFA_expression(reg) \
++ .byte DW_CFA_expression NL \
++ .uleb128 reg NL \
++ .uleb128 20f-10f NL \
++10: NL
++/*
++ * Here we do the expression stuff. You should code the above followed
++ * by expression OPs followed by CFA_expression_end.
++ */
++
++
++#define CFA_expression_end \
++20: NL
++
++#define CFA_exp_OP_const4s(a) \
++ .byte DW_OP_const4s NL \
++ .long a NL
++
++#define CFA_exp_OP_swap .byte DW_OP_swap NL
++#define CFA_exp_OP_dup .byte DW_OP_dup NL
++#define CFA_exp_OP_drop .byte DW_OP_drop NL
++/*
++ * All these work on the top two elements on the stack, replacing them
++ * with the result. Top comes first where it matters. True is 1, false 0.
++ */
++#define CFA_exp_OP_deref .byte DW_OP_deref NL
++#define CFA_exp_OP_and .byte DW_OP_and NL
++#define CFA_exp_OP_div .byte DW_OP_div NL
++#define CFA_exp_OP_minus .byte DW_OP_minus NL
++#define CFA_exp_OP_mod .byte DW_OP_mod NL
++#define CFA_exp_OP_neg .byte DW_OP_neg NL
++#define CFA_exp_OP_plus .byte DW_OP_plus NL
++#define CFA_exp_OP_not .byte DW_OP_not NL
++#define CFA_exp_OP_or .byte DW_OP_or NL
++#define CFA_exp_OP_xor .byte DW_OP_xor NL
++#define CFA_exp_OP_le .byte DW_OP_le NL
++#define CFA_exp_OP_ge .byte DW_OP_ge NL
++#define CFA_exp_OP_eq .byte DW_OP_eq NL
++#define CFA_exp_OP_lt .byte DW_OP_lt NL
++#define CFA_exp_OP_gt .byte DW_OP_gt NL
++#define CFA_exp_OP_ne .byte DW_OP_ne NL
++/*
++ * These take a parameter as noted
++ */
++/*
++ * Unconditional skip to loc. loc is a label (loc:)
++ */
++#define CFA_exp_OP_skip(loc) \
++ .byte DW_OP_skip NL \
++ .hword loc-.-2 NL
++/*
++ * Conditional skip to loc (TOS != 0, TOS--) (loc is a label)
++ */
++#define CFA_exp_OP_bra(loc) \
++ .byte DW_OP_bra NL \
++ .hword loc-.-2 NL
++
++/*
++ * TOS += no (an unsigned number)
++ */
++#define CFA_exp_OP_plus_uconst(no) \
++ .byte DW_OP_plus_uconst NL \
++ .uleb128 no NL
++
++/*
++ * ++TOS = no (an unsigned number)
++ */
++#define CFA_exp_OP_constu(no) \
++ .byte DW_OP_constu NL \
++ .uleb128 no NL
++/*
++ * ++TOS = no (a signed number)
++ */
++#define CFA_exp_OP_consts(no) \
++ .byte DW_OP_consts NL \
++ .sleb128 no NL
++/*
++ * ++TOS = no (an unsigned byte)
++ */
++#define CFA_exp_OP_const1u(no) \
++ .byte DW_OP_const1u NL \
++ .byte no NL
++
++
++/*
++ * ++TOS = no (an address)
++ */
++#define CFA_exp_OP_addr(no) \
++ .byte DW_OP_addr NL \
++ .long no NL
++
++/*
++ * Push current frame's value for "reg" + offset
++ * We take advantage of the opcode assignments to make this a literal reg
++ * rather than use the DW_OP_bregx opcode.
++ */
++
++#define CFA_exp_OP_breg(reg,offset) \
++ .byte DW_OP_breg0+reg NL \
++ .sleb128 offset NL
++#endif
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/dwarf2.h linux-2.6.18.kgdb/include/linux/dwarf2.h
+--- linux-2.6.18/include/linux/dwarf2.h 1970-01-01 03:00:00.000000000 +0300
++++ linux-2.6.18.kgdb/include/linux/dwarf2.h 2008-06-10 16:19:58.000000000 +0400
+@@ -0,0 +1,775 @@
++/* Declarations and definitions of codes relating to the DWARF2 symbolic
++ debugging information format.
++ Copyright (C) 1992, 1993, 1995, 1996, 1997, 1999, 2000, 2001, 2002,
++ 2003 Free Software Foundation, Inc.
++
++ Written by Gary Funck (gary@intrepid.com). The Ada Joint Program
++ Office (AJPO), Florida State University and Silicon Graphics Inc.
++ provided support for this effort -- June 21, 1995.
++
++ Derived from the DWARF 1 implementation written by Ron Guilmette
++ (rfg@netcom.com), November 1990.
++
++ This file is part of GCC.
++
++ GCC is free software; you can redistribute it and/or modify it under
++ the terms of the GNU General Public License as published by the Free
++ Software Foundation; either version 2, or (at your option) any later
++ version.
++
++ GCC is distributed in the hope that it will be useful, but WITHOUT
++ ANY WARRANTY; without even the implied warranty of MERCHANTABILITY
++ or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public
++ License for more details.
++
++ You should have received a copy of the GNU General Public License
++ along with GCC; see the file COPYING. If not, write to the Free
++ Software Foundation, 59 Temple Place - Suite 330, Boston, MA
++ 02111-1307, USA. */
++
++/* This file is derived from the DWARF specification (a public document)
++ Revision 2.0.0 (July 27, 1993) developed by the UNIX International
++ Programming Languages Special Interest Group (UI/PLSIG) and distributed
++ by UNIX International. Copies of this specification are available from
++ UNIX International, 20 Waterview Boulevard, Parsippany, NJ, 07054.
++
++ This file also now contains definitions from the DWARF 3 specification. */
++
++/* This file is shared between GCC and GDB, and should not contain
++ prototypes. */
++
++#ifndef _ELF_DWARF2_H
++#define _ELF_DWARF2_H
++
++/* Structure found in the .debug_line section.
*/ ++typedef struct ++{ ++ unsigned char li_length [4]; ++ unsigned char li_version [2]; ++ unsigned char li_prologue_length [4]; ++ unsigned char li_min_insn_length [1]; ++ unsigned char li_default_is_stmt [1]; ++ unsigned char li_line_base [1]; ++ unsigned char li_line_range [1]; ++ unsigned char li_opcode_base [1]; ++} ++DWARF2_External_LineInfo; ++ ++typedef struct ++{ ++ unsigned long li_length; ++ unsigned short li_version; ++ unsigned int li_prologue_length; ++ unsigned char li_min_insn_length; ++ unsigned char li_default_is_stmt; ++ int li_line_base; ++ unsigned char li_line_range; ++ unsigned char li_opcode_base; ++} ++DWARF2_Internal_LineInfo; ++ ++/* Structure found in .debug_pubnames section. */ ++typedef struct ++{ ++ unsigned char pn_length [4]; ++ unsigned char pn_version [2]; ++ unsigned char pn_offset [4]; ++ unsigned char pn_size [4]; ++} ++DWARF2_External_PubNames; ++ ++typedef struct ++{ ++ unsigned long pn_length; ++ unsigned short pn_version; ++ unsigned long pn_offset; ++ unsigned long pn_size; ++} ++DWARF2_Internal_PubNames; ++ ++/* Structure found in .debug_info section. */ ++typedef struct ++{ ++ unsigned char cu_length [4]; ++ unsigned char cu_version [2]; ++ unsigned char cu_abbrev_offset [4]; ++ unsigned char cu_pointer_size [1]; ++} ++DWARF2_External_CompUnit; ++ ++typedef struct ++{ ++ unsigned long cu_length; ++ unsigned short cu_version; ++ unsigned long cu_abbrev_offset; ++ unsigned char cu_pointer_size; ++} ++DWARF2_Internal_CompUnit; ++ ++typedef struct ++{ ++ unsigned char ar_length [4]; ++ unsigned char ar_version [2]; ++ unsigned char ar_info_offset [4]; ++ unsigned char ar_pointer_size [1]; ++ unsigned char ar_segment_size [1]; ++} ++DWARF2_External_ARange; ++ ++typedef struct ++{ ++ unsigned long ar_length; ++ unsigned short ar_version; ++ unsigned long ar_info_offset; ++ unsigned char ar_pointer_size; ++ unsigned char ar_segment_size; ++} ++DWARF2_Internal_ARange; ++ ++ ++/* Tag names and codes. */ ++enum dwarf_tag ++ { ++ DW_TAG_padding = 0x00, ++ DW_TAG_array_type = 0x01, ++ DW_TAG_class_type = 0x02, ++ DW_TAG_entry_point = 0x03, ++ DW_TAG_enumeration_type = 0x04, ++ DW_TAG_formal_parameter = 0x05, ++ DW_TAG_imported_declaration = 0x08, ++ DW_TAG_label = 0x0a, ++ DW_TAG_lexical_block = 0x0b, ++ DW_TAG_member = 0x0d, ++ DW_TAG_pointer_type = 0x0f, ++ DW_TAG_reference_type = 0x10, ++ DW_TAG_compile_unit = 0x11, ++ DW_TAG_string_type = 0x12, ++ DW_TAG_structure_type = 0x13, ++ DW_TAG_subroutine_type = 0x15, ++ DW_TAG_typedef = 0x16, ++ DW_TAG_union_type = 0x17, ++ DW_TAG_unspecified_parameters = 0x18, ++ DW_TAG_variant = 0x19, ++ DW_TAG_common_block = 0x1a, ++ DW_TAG_common_inclusion = 0x1b, ++ DW_TAG_inheritance = 0x1c, ++ DW_TAG_inlined_subroutine = 0x1d, ++ DW_TAG_module = 0x1e, ++ DW_TAG_ptr_to_member_type = 0x1f, ++ DW_TAG_set_type = 0x20, ++ DW_TAG_subrange_type = 0x21, ++ DW_TAG_with_stmt = 0x22, ++ DW_TAG_access_declaration = 0x23, ++ DW_TAG_base_type = 0x24, ++ DW_TAG_catch_block = 0x25, ++ DW_TAG_const_type = 0x26, ++ DW_TAG_constant = 0x27, ++ DW_TAG_enumerator = 0x28, ++ DW_TAG_file_type = 0x29, ++ DW_TAG_friend = 0x2a, ++ DW_TAG_namelist = 0x2b, ++ DW_TAG_namelist_item = 0x2c, ++ DW_TAG_packed_type = 0x2d, ++ DW_TAG_subprogram = 0x2e, ++ DW_TAG_template_type_param = 0x2f, ++ DW_TAG_template_value_param = 0x30, ++ DW_TAG_thrown_type = 0x31, ++ DW_TAG_try_block = 0x32, ++ DW_TAG_variant_part = 0x33, ++ DW_TAG_variable = 0x34, ++ DW_TAG_volatile_type = 0x35, ++ /* DWARF 3. 
*/ ++ DW_TAG_dwarf_procedure = 0x36, ++ DW_TAG_restrict_type = 0x37, ++ DW_TAG_interface_type = 0x38, ++ DW_TAG_namespace = 0x39, ++ DW_TAG_imported_module = 0x3a, ++ DW_TAG_unspecified_type = 0x3b, ++ DW_TAG_partial_unit = 0x3c, ++ DW_TAG_imported_unit = 0x3d, ++ /* SGI/MIPS Extensions. */ ++ DW_TAG_MIPS_loop = 0x4081, ++ /* HP extensions. See: ftp://ftp.hp.com/pub/lang/tools/WDB/wdb-4.0.tar.gz . */ ++ DW_TAG_HP_array_descriptor = 0x4090, ++ /* GNU extensions. */ ++ DW_TAG_format_label = 0x4101, /* For FORTRAN 77 and Fortran 90. */ ++ DW_TAG_function_template = 0x4102, /* For C++. */ ++ DW_TAG_class_template = 0x4103, /* For C++. */ ++ DW_TAG_GNU_BINCL = 0x4104, ++ DW_TAG_GNU_EINCL = 0x4105, ++ /* Extensions for UPC. See: http://upc.gwu.edu/~upc. */ ++ DW_TAG_upc_shared_type = 0x8765, ++ DW_TAG_upc_strict_type = 0x8766, ++ DW_TAG_upc_relaxed_type = 0x8767, ++ /* PGI (STMicroelectronics) extensions. No documentation available. */ ++ DW_TAG_PGI_kanji_type = 0xA000, ++ DW_TAG_PGI_interface_block = 0xA020 ++ }; ++ ++#define DW_TAG_lo_user 0x4080 ++#define DW_TAG_hi_user 0xffff ++ ++/* Flag that tells whether entry has a child or not. */ ++#define DW_children_no 0 ++#define DW_children_yes 1 ++ ++/* Form names and codes. */ ++enum dwarf_form ++ { ++ DW_FORM_addr = 0x01, ++ DW_FORM_block2 = 0x03, ++ DW_FORM_block4 = 0x04, ++ DW_FORM_data2 = 0x05, ++ DW_FORM_data4 = 0x06, ++ DW_FORM_data8 = 0x07, ++ DW_FORM_string = 0x08, ++ DW_FORM_block = 0x09, ++ DW_FORM_block1 = 0x0a, ++ DW_FORM_data1 = 0x0b, ++ DW_FORM_flag = 0x0c, ++ DW_FORM_sdata = 0x0d, ++ DW_FORM_strp = 0x0e, ++ DW_FORM_udata = 0x0f, ++ DW_FORM_ref_addr = 0x10, ++ DW_FORM_ref1 = 0x11, ++ DW_FORM_ref2 = 0x12, ++ DW_FORM_ref4 = 0x13, ++ DW_FORM_ref8 = 0x14, ++ DW_FORM_ref_udata = 0x15, ++ DW_FORM_indirect = 0x16 ++ }; ++ ++/* Attribute names and codes. 
*/ ++enum dwarf_attribute ++ { ++ DW_AT_sibling = 0x01, ++ DW_AT_location = 0x02, ++ DW_AT_name = 0x03, ++ DW_AT_ordering = 0x09, ++ DW_AT_subscr_data = 0x0a, ++ DW_AT_byte_size = 0x0b, ++ DW_AT_bit_offset = 0x0c, ++ DW_AT_bit_size = 0x0d, ++ DW_AT_element_list = 0x0f, ++ DW_AT_stmt_list = 0x10, ++ DW_AT_low_pc = 0x11, ++ DW_AT_high_pc = 0x12, ++ DW_AT_language = 0x13, ++ DW_AT_member = 0x14, ++ DW_AT_discr = 0x15, ++ DW_AT_discr_value = 0x16, ++ DW_AT_visibility = 0x17, ++ DW_AT_import = 0x18, ++ DW_AT_string_length = 0x19, ++ DW_AT_common_reference = 0x1a, ++ DW_AT_comp_dir = 0x1b, ++ DW_AT_const_value = 0x1c, ++ DW_AT_containing_type = 0x1d, ++ DW_AT_default_value = 0x1e, ++ DW_AT_inline = 0x20, ++ DW_AT_is_optional = 0x21, ++ DW_AT_lower_bound = 0x22, ++ DW_AT_producer = 0x25, ++ DW_AT_prototyped = 0x27, ++ DW_AT_return_addr = 0x2a, ++ DW_AT_start_scope = 0x2c, ++ DW_AT_stride_size = 0x2e, ++ DW_AT_upper_bound = 0x2f, ++ DW_AT_abstract_origin = 0x31, ++ DW_AT_accessibility = 0x32, ++ DW_AT_address_class = 0x33, ++ DW_AT_artificial = 0x34, ++ DW_AT_base_types = 0x35, ++ DW_AT_calling_convention = 0x36, ++ DW_AT_count = 0x37, ++ DW_AT_data_member_location = 0x38, ++ DW_AT_decl_column = 0x39, ++ DW_AT_decl_file = 0x3a, ++ DW_AT_decl_line = 0x3b, ++ DW_AT_declaration = 0x3c, ++ DW_AT_discr_list = 0x3d, ++ DW_AT_encoding = 0x3e, ++ DW_AT_external = 0x3f, ++ DW_AT_frame_base = 0x40, ++ DW_AT_friend = 0x41, ++ DW_AT_identifier_case = 0x42, ++ DW_AT_macro_info = 0x43, ++ DW_AT_namelist_items = 0x44, ++ DW_AT_priority = 0x45, ++ DW_AT_segment = 0x46, ++ DW_AT_specification = 0x47, ++ DW_AT_static_link = 0x48, ++ DW_AT_type = 0x49, ++ DW_AT_use_location = 0x4a, ++ DW_AT_variable_parameter = 0x4b, ++ DW_AT_virtuality = 0x4c, ++ DW_AT_vtable_elem_location = 0x4d, ++ /* DWARF 3 values. */ ++ DW_AT_allocated = 0x4e, ++ DW_AT_associated = 0x4f, ++ DW_AT_data_location = 0x50, ++ DW_AT_stride = 0x51, ++ DW_AT_entry_pc = 0x52, ++ DW_AT_use_UTF8 = 0x53, ++ DW_AT_extension = 0x54, ++ DW_AT_ranges = 0x55, ++ DW_AT_trampoline = 0x56, ++ DW_AT_call_column = 0x57, ++ DW_AT_call_file = 0x58, ++ DW_AT_call_line = 0x59, ++ /* SGI/MIPS extensions. */ ++ DW_AT_MIPS_fde = 0x2001, ++ DW_AT_MIPS_loop_begin = 0x2002, ++ DW_AT_MIPS_tail_loop_begin = 0x2003, ++ DW_AT_MIPS_epilog_begin = 0x2004, ++ DW_AT_MIPS_loop_unroll_factor = 0x2005, ++ DW_AT_MIPS_software_pipeline_depth = 0x2006, ++ DW_AT_MIPS_linkage_name = 0x2007, ++ DW_AT_MIPS_stride = 0x2008, ++ DW_AT_MIPS_abstract_name = 0x2009, ++ DW_AT_MIPS_clone_origin = 0x200a, ++ DW_AT_MIPS_has_inlines = 0x200b, ++ /* HP extensions. */ ++ DW_AT_HP_block_index = 0x2000, ++ DW_AT_HP_unmodifiable = 0x2001, /* Same as DW_AT_MIPS_fde. */ ++ DW_AT_HP_actuals_stmt_list = 0x2010, ++ DW_AT_HP_proc_per_section = 0x2011, ++ DW_AT_HP_raw_data_ptr = 0x2012, ++ DW_AT_HP_pass_by_reference = 0x2013, ++ DW_AT_HP_opt_level = 0x2014, ++ DW_AT_HP_prof_version_id = 0x2015, ++ DW_AT_HP_opt_flags = 0x2016, ++ DW_AT_HP_cold_region_low_pc = 0x2017, ++ DW_AT_HP_cold_region_high_pc = 0x2018, ++ DW_AT_HP_all_variables_modifiable = 0x2019, ++ DW_AT_HP_linkage_name = 0x201a, ++ DW_AT_HP_prof_flags = 0x201b, /* In comp unit of procs_info for -g. */ ++ /* GNU extensions. */ ++ DW_AT_sf_names = 0x2101, ++ DW_AT_src_info = 0x2102, ++ DW_AT_mac_info = 0x2103, ++ DW_AT_src_coords = 0x2104, ++ DW_AT_body_begin = 0x2105, ++ DW_AT_body_end = 0x2106, ++ DW_AT_GNU_vector = 0x2107, ++ /* VMS extensions. */ ++ DW_AT_VMS_rtnbeg_pd_address = 0x2201, ++ /* UPC extension. 
*/ ++ DW_AT_upc_threads_scaled = 0x3210, ++ /* PGI (STMicroelectronics) extensions. */ ++ DW_AT_PGI_lbase = 0x3a00, ++ DW_AT_PGI_soffset = 0x3a01, ++ DW_AT_PGI_lstride = 0x3a02 ++ }; ++ ++#define DW_AT_lo_user 0x2000 /* Implementation-defined range start. */ ++#define DW_AT_hi_user 0x3ff0 /* Implementation-defined range end. */ ++ ++/* Location atom names and codes. */ ++enum dwarf_location_atom ++ { ++ DW_OP_addr = 0x03, ++ DW_OP_deref = 0x06, ++ DW_OP_const1u = 0x08, ++ DW_OP_const1s = 0x09, ++ DW_OP_const2u = 0x0a, ++ DW_OP_const2s = 0x0b, ++ DW_OP_const4u = 0x0c, ++ DW_OP_const4s = 0x0d, ++ DW_OP_const8u = 0x0e, ++ DW_OP_const8s = 0x0f, ++ DW_OP_constu = 0x10, ++ DW_OP_consts = 0x11, ++ DW_OP_dup = 0x12, ++ DW_OP_drop = 0x13, ++ DW_OP_over = 0x14, ++ DW_OP_pick = 0x15, ++ DW_OP_swap = 0x16, ++ DW_OP_rot = 0x17, ++ DW_OP_xderef = 0x18, ++ DW_OP_abs = 0x19, ++ DW_OP_and = 0x1a, ++ DW_OP_div = 0x1b, ++ DW_OP_minus = 0x1c, ++ DW_OP_mod = 0x1d, ++ DW_OP_mul = 0x1e, ++ DW_OP_neg = 0x1f, ++ DW_OP_not = 0x20, ++ DW_OP_or = 0x21, ++ DW_OP_plus = 0x22, ++ DW_OP_plus_uconst = 0x23, ++ DW_OP_shl = 0x24, ++ DW_OP_shr = 0x25, ++ DW_OP_shra = 0x26, ++ DW_OP_xor = 0x27, ++ DW_OP_bra = 0x28, ++ DW_OP_eq = 0x29, ++ DW_OP_ge = 0x2a, ++ DW_OP_gt = 0x2b, ++ DW_OP_le = 0x2c, ++ DW_OP_lt = 0x2d, ++ DW_OP_ne = 0x2e, ++ DW_OP_skip = 0x2f, ++ DW_OP_lit0 = 0x30, ++ DW_OP_lit1 = 0x31, ++ DW_OP_lit2 = 0x32, ++ DW_OP_lit3 = 0x33, ++ DW_OP_lit4 = 0x34, ++ DW_OP_lit5 = 0x35, ++ DW_OP_lit6 = 0x36, ++ DW_OP_lit7 = 0x37, ++ DW_OP_lit8 = 0x38, ++ DW_OP_lit9 = 0x39, ++ DW_OP_lit10 = 0x3a, ++ DW_OP_lit11 = 0x3b, ++ DW_OP_lit12 = 0x3c, ++ DW_OP_lit13 = 0x3d, ++ DW_OP_lit14 = 0x3e, ++ DW_OP_lit15 = 0x3f, ++ DW_OP_lit16 = 0x40, ++ DW_OP_lit17 = 0x41, ++ DW_OP_lit18 = 0x42, ++ DW_OP_lit19 = 0x43, ++ DW_OP_lit20 = 0x44, ++ DW_OP_lit21 = 0x45, ++ DW_OP_lit22 = 0x46, ++ DW_OP_lit23 = 0x47, ++ DW_OP_lit24 = 0x48, ++ DW_OP_lit25 = 0x49, ++ DW_OP_lit26 = 0x4a, ++ DW_OP_lit27 = 0x4b, ++ DW_OP_lit28 = 0x4c, ++ DW_OP_lit29 = 0x4d, ++ DW_OP_lit30 = 0x4e, ++ DW_OP_lit31 = 0x4f, ++ DW_OP_reg0 = 0x50, ++ DW_OP_reg1 = 0x51, ++ DW_OP_reg2 = 0x52, ++ DW_OP_reg3 = 0x53, ++ DW_OP_reg4 = 0x54, ++ DW_OP_reg5 = 0x55, ++ DW_OP_reg6 = 0x56, ++ DW_OP_reg7 = 0x57, ++ DW_OP_reg8 = 0x58, ++ DW_OP_reg9 = 0x59, ++ DW_OP_reg10 = 0x5a, ++ DW_OP_reg11 = 0x5b, ++ DW_OP_reg12 = 0x5c, ++ DW_OP_reg13 = 0x5d, ++ DW_OP_reg14 = 0x5e, ++ DW_OP_reg15 = 0x5f, ++ DW_OP_reg16 = 0x60, ++ DW_OP_reg17 = 0x61, ++ DW_OP_reg18 = 0x62, ++ DW_OP_reg19 = 0x63, ++ DW_OP_reg20 = 0x64, ++ DW_OP_reg21 = 0x65, ++ DW_OP_reg22 = 0x66, ++ DW_OP_reg23 = 0x67, ++ DW_OP_reg24 = 0x68, ++ DW_OP_reg25 = 0x69, ++ DW_OP_reg26 = 0x6a, ++ DW_OP_reg27 = 0x6b, ++ DW_OP_reg28 = 0x6c, ++ DW_OP_reg29 = 0x6d, ++ DW_OP_reg30 = 0x6e, ++ DW_OP_reg31 = 0x6f, ++ DW_OP_breg0 = 0x70, ++ DW_OP_breg1 = 0x71, ++ DW_OP_breg2 = 0x72, ++ DW_OP_breg3 = 0x73, ++ DW_OP_breg4 = 0x74, ++ DW_OP_breg5 = 0x75, ++ DW_OP_breg6 = 0x76, ++ DW_OP_breg7 = 0x77, ++ DW_OP_breg8 = 0x78, ++ DW_OP_breg9 = 0x79, ++ DW_OP_breg10 = 0x7a, ++ DW_OP_breg11 = 0x7b, ++ DW_OP_breg12 = 0x7c, ++ DW_OP_breg13 = 0x7d, ++ DW_OP_breg14 = 0x7e, ++ DW_OP_breg15 = 0x7f, ++ DW_OP_breg16 = 0x80, ++ DW_OP_breg17 = 0x81, ++ DW_OP_breg18 = 0x82, ++ DW_OP_breg19 = 0x83, ++ DW_OP_breg20 = 0x84, ++ DW_OP_breg21 = 0x85, ++ DW_OP_breg22 = 0x86, ++ DW_OP_breg23 = 0x87, ++ DW_OP_breg24 = 0x88, ++ DW_OP_breg25 = 0x89, ++ DW_OP_breg26 = 0x8a, ++ DW_OP_breg27 = 0x8b, ++ DW_OP_breg28 = 0x8c, ++ DW_OP_breg29 = 0x8d, ++ DW_OP_breg30 = 0x8e, ++ DW_OP_breg31 = 0x8f, ++ 
DW_OP_regx = 0x90, ++ DW_OP_fbreg = 0x91, ++ DW_OP_bregx = 0x92, ++ DW_OP_piece = 0x93, ++ DW_OP_deref_size = 0x94, ++ DW_OP_xderef_size = 0x95, ++ DW_OP_nop = 0x96, ++ /* DWARF 3 extensions. */ ++ DW_OP_push_object_address = 0x97, ++ DW_OP_call2 = 0x98, ++ DW_OP_call4 = 0x99, ++ DW_OP_call_ref = 0x9a, ++ /* GNU extensions. */ ++ DW_OP_GNU_push_tls_address = 0xe0, ++ /* HP extensions. */ ++ DW_OP_HP_unknown = 0xe0, /* Ouch, the same as GNU_push_tls_address. */ ++ DW_OP_HP_is_value = 0xe1, ++ DW_OP_HP_fltconst4 = 0xe2, ++ DW_OP_HP_fltconst8 = 0xe3, ++ DW_OP_HP_mod_range = 0xe4, ++ DW_OP_HP_unmod_range = 0xe5, ++ DW_OP_HP_tls = 0xe6 ++ }; ++ ++#define DW_OP_lo_user 0xe0 /* Implementation-defined range start. */ ++#define DW_OP_hi_user 0xff /* Implementation-defined range end. */ ++ ++/* Type encodings. */ ++enum dwarf_type ++ { ++ DW_ATE_void = 0x0, ++ DW_ATE_address = 0x1, ++ DW_ATE_boolean = 0x2, ++ DW_ATE_complex_float = 0x3, ++ DW_ATE_float = 0x4, ++ DW_ATE_signed = 0x5, ++ DW_ATE_signed_char = 0x6, ++ DW_ATE_unsigned = 0x7, ++ DW_ATE_unsigned_char = 0x8, ++ /* DWARF 3. */ ++ DW_ATE_imaginary_float = 0x9, ++ /* HP extensions. */ ++ DW_ATE_HP_float80 = 0x80, /* Floating-point (80 bit). */ ++ DW_ATE_HP_complex_float80 = 0x81, /* Complex floating-point (80 bit). */ ++ DW_ATE_HP_float128 = 0x82, /* Floating-point (128 bit). */ ++ DW_ATE_HP_complex_float128 = 0x83, /* Complex floating-point (128 bit). */ ++ DW_ATE_HP_floathpintel = 0x84, /* Floating-point (82 bit IA64). */ ++ DW_ATE_HP_imaginary_float80 = 0x85, ++ DW_ATE_HP_imaginary_float128 = 0x86 ++ }; ++ ++#define DW_ATE_lo_user 0x80 ++#define DW_ATE_hi_user 0xff ++ ++/* Array ordering names and codes. */ ++enum dwarf_array_dim_ordering ++ { ++ DW_ORD_row_major = 0, ++ DW_ORD_col_major = 1 ++ }; ++ ++/* Access attribute. */ ++enum dwarf_access_attribute ++ { ++ DW_ACCESS_public = 1, ++ DW_ACCESS_protected = 2, ++ DW_ACCESS_private = 3 ++ }; ++ ++/* Visibility. */ ++enum dwarf_visibility_attribute ++ { ++ DW_VIS_local = 1, ++ DW_VIS_exported = 2, ++ DW_VIS_qualified = 3 ++ }; ++ ++/* Virtuality. */ ++enum dwarf_virtuality_attribute ++ { ++ DW_VIRTUALITY_none = 0, ++ DW_VIRTUALITY_virtual = 1, ++ DW_VIRTUALITY_pure_virtual = 2 ++ }; ++ ++/* Case sensitivity. */ ++enum dwarf_id_case ++ { ++ DW_ID_case_sensitive = 0, ++ DW_ID_up_case = 1, ++ DW_ID_down_case = 2, ++ DW_ID_case_insensitive = 3 ++ }; ++ ++/* Calling convention. */ ++enum dwarf_calling_convention ++ { ++ DW_CC_normal = 0x1, ++ DW_CC_program = 0x2, ++ DW_CC_nocall = 0x3 ++ }; ++ ++#define DW_CC_lo_user 0x40 ++#define DW_CC_hi_user 0xff ++ ++/* Inline attribute. */ ++enum dwarf_inline_attribute ++ { ++ DW_INL_not_inlined = 0, ++ DW_INL_inlined = 1, ++ DW_INL_declared_not_inlined = 2, ++ DW_INL_declared_inlined = 3 ++ }; ++ ++/* Discriminant lists. */ ++enum dwarf_discrim_list ++ { ++ DW_DSC_label = 0, ++ DW_DSC_range = 1 ++ }; ++ ++/* Line number opcodes. */ ++enum dwarf_line_number_ops ++ { ++ DW_LNS_extended_op = 0, ++ DW_LNS_copy = 1, ++ DW_LNS_advance_pc = 2, ++ DW_LNS_advance_line = 3, ++ DW_LNS_set_file = 4, ++ DW_LNS_set_column = 5, ++ DW_LNS_negate_stmt = 6, ++ DW_LNS_set_basic_block = 7, ++ DW_LNS_const_add_pc = 8, ++ DW_LNS_fixed_advance_pc = 9, ++ /* DWARF 3. */ ++ DW_LNS_set_prologue_end = 10, ++ DW_LNS_set_epilogue_begin = 11, ++ DW_LNS_set_isa = 12 ++ }; ++ ++/* Line number extended opcodes. */ ++enum dwarf_line_number_x_ops ++ { ++ DW_LNE_end_sequence = 1, ++ DW_LNE_set_address = 2, ++ DW_LNE_define_file = 3, ++ /* HP extensions. 
*/
++ DW_LNE_HP_negate_is_UV_update = 0x11,
++ DW_LNE_HP_push_context = 0x12,
++ DW_LNE_HP_pop_context = 0x13,
++ DW_LNE_HP_set_file_line_column = 0x14,
++ DW_LNE_HP_set_routine_name = 0x15,
++ DW_LNE_HP_set_sequence = 0x16,
++ DW_LNE_HP_negate_post_semantics = 0x17,
++ DW_LNE_HP_negate_function_exit = 0x18,
++ DW_LNE_HP_negate_front_end_logical = 0x19,
++ DW_LNE_HP_define_proc = 0x20
++ };
++
++/* Call frame information. */
++enum dwarf_call_frame_info
++ {
++ DW_CFA_advance_loc = 0x40,
++ DW_CFA_offset = 0x80,
++ DW_CFA_restore = 0xc0,
++ DW_CFA_nop = 0x00,
++ DW_CFA_set_loc = 0x01,
++ DW_CFA_advance_loc1 = 0x02,
++ DW_CFA_advance_loc2 = 0x03,
++ DW_CFA_advance_loc4 = 0x04,
++ DW_CFA_offset_extended = 0x05,
++ DW_CFA_restore_extended = 0x06,
++ DW_CFA_undefined = 0x07,
++ DW_CFA_same_value = 0x08,
++ DW_CFA_register = 0x09,
++ DW_CFA_remember_state = 0x0a,
++ DW_CFA_restore_state = 0x0b,
++ DW_CFA_def_cfa = 0x0c,
++ DW_CFA_def_cfa_register = 0x0d,
++ DW_CFA_def_cfa_offset = 0x0e,
++ /* DWARF 3. */
++ DW_CFA_def_cfa_expression = 0x0f,
++ DW_CFA_expression = 0x10,
++ DW_CFA_offset_extended_sf = 0x11,
++ DW_CFA_def_cfa_sf = 0x12,
++ DW_CFA_def_cfa_offset_sf = 0x13,
++ /* SGI/MIPS specific. */
++ DW_CFA_MIPS_advance_loc8 = 0x1d,
++ /* GNU extensions. */
++ DW_CFA_GNU_window_save = 0x2d,
++ DW_CFA_GNU_args_size = 0x2e,
++ DW_CFA_GNU_negative_offset_extended = 0x2f
++ };
++
++#define DW_CIE_ID 0xffffffff
++#define DW_CIE_VERSION 1
++
++#define DW_CFA_extended 0
++#define DW_CFA_lo_user 0x1c
++#define DW_CFA_hi_user 0x3f
++
++#define DW_CHILDREN_no 0x00
++#define DW_CHILDREN_yes 0x01
++
++#define DW_ADDR_none 0
++
++/* Source language names and codes. */
++enum dwarf_source_language
++ {
++ DW_LANG_C89 = 0x0001,
++ DW_LANG_C = 0x0002,
++ DW_LANG_Ada83 = 0x0003,
++ DW_LANG_C_plus_plus = 0x0004,
++ DW_LANG_Cobol74 = 0x0005,
++ DW_LANG_Cobol85 = 0x0006,
++ DW_LANG_Fortran77 = 0x0007,
++ DW_LANG_Fortran90 = 0x0008,
++ DW_LANG_Pascal83 = 0x0009,
++ DW_LANG_Modula2 = 0x000a,
++ DW_LANG_Java = 0x000b,
++ /* DWARF 3. */
++ DW_LANG_C99 = 0x000c,
++ DW_LANG_Ada95 = 0x000d,
++ DW_LANG_Fortran95 = 0x000e,
++ /* MIPS. */
++ DW_LANG_Mips_Assembler = 0x8001,
++ /* UPC. */
++ DW_LANG_Upc = 0x8765
++ };
++
++#define DW_LANG_lo_user 0x8000 /* Implementation-defined range start. */
++#define DW_LANG_hi_user 0xffff /* Implementation-defined range end. */
++
++/* Names and codes for macro information. */
++enum dwarf_macinfo_record_type
++ {
++ DW_MACINFO_define = 1,
++ DW_MACINFO_undef = 2,
++ DW_MACINFO_start_file = 3,
++ DW_MACINFO_end_file = 4,
++ DW_MACINFO_vendor_ext = 255
++ };
++
++/* @@@ For use with GNU frame unwind information.
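
Of the dwarf_call_frame_info values just listed, DW_CFA_advance_loc (0x40), DW_CFA_offset (0x80) and DW_CFA_restore (0xc0) are the three "primary" opcodes: the high two bits select the opcode and the low six bits carry the operand, which is why the CFA_advance_loc() macro earlier simply adds its delta to the opcode byte and caps it at 0x3f. A sketch of the packing (hypothetical helpers, not part of this patch):

    /* DW_CFA_advance_loc + delta, delta limited to 6 bits */
    static unsigned char cfa_advance_loc(unsigned int delta)
    {
            return 0x40 | (delta & 0x3f);
    }

    /* DW_CFA_offset + register; a ULEB128 factored offset follows */
    static unsigned char cfa_offset(unsigned int reg)
    {
            return 0x80 | (reg & 0x3f);
    }

Larger deltas fall back to DW_CFA_advance_loc1/2/4, which take an explicit operand.
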
*/ ++ ++#define DW_EH_PE_absptr 0x00 ++#define DW_EH_PE_omit 0xff ++ ++#define DW_EH_PE_uleb128 0x01 ++#define DW_EH_PE_udata2 0x02 ++#define DW_EH_PE_udata4 0x03 ++#define DW_EH_PE_udata8 0x04 ++#define DW_EH_PE_sleb128 0x09 ++#define DW_EH_PE_sdata2 0x0A ++#define DW_EH_PE_sdata4 0x0B ++#define DW_EH_PE_sdata8 0x0C ++#define DW_EH_PE_signed 0x08 ++ ++#define DW_EH_PE_pcrel 0x10 ++#define DW_EH_PE_textrel 0x20 ++#define DW_EH_PE_datarel 0x30 ++#define DW_EH_PE_funcrel 0x40 ++#define DW_EH_PE_aligned 0x50 ++ ++#define DW_EH_PE_indirect 0x80 ++ ++#endif /* _ELF_DWARF2_H */ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/kgdb.h linux-2.6.18.kgdb/include/linux/kgdb.h +--- linux-2.6.18/include/linux/kgdb.h 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/include/linux/kgdb.h 2008-06-10 16:20:11.000000000 +0400 +@@ -0,0 +1,279 @@ ++/* ++ * include/linux/kgdb.h ++ * ++ * This provides the hooks and functions that KGDB needs to share between ++ * the core, I/O and arch-specific portions. ++ * ++ * Author: Amit Kale and ++ * Tom Rini ++ * ++ * 2001-2004 (c) Amit S. Kale and 2003-2005 (c) MontaVista Software, Inc. ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++#ifdef __KERNEL__ ++#ifndef _KGDB_H_ ++#define _KGDB_H_ ++ ++#include ++ ++#ifdef CONFIG_KGDB ++#include ++#include ++#include ++#include ++ ++#ifndef CHECK_EXCEPTION_STACK ++#define CHECK_EXCEPTION_STACK() 1 ++#endif ++ ++struct tasklet_struct; ++struct pt_regs; ++struct task_struct; ++struct uart_port; ++ ++#ifdef CONFIG_KGDB_CONSOLE ++extern struct console kgdbcons; ++#endif ++ ++/* To enter the debugger explicitly. */ ++extern void breakpoint(void); ++extern int kgdb_connected; ++extern int kgdb_may_fault; ++extern struct tasklet_struct kgdb_tasklet_breakpoint; ++ ++extern atomic_t kgdb_setting_breakpoint; ++extern atomic_t cpu_doing_single_step; ++extern atomic_t kgdb_sync_softlockup[NR_CPUS]; ++ ++extern struct task_struct *kgdb_usethread, *kgdb_contthread; ++ ++enum kgdb_bptype { ++ bp_breakpoint = '0', ++ bp_hardware_breakpoint, ++ bp_write_watchpoint, ++ bp_read_watchpoint, ++ bp_access_watchpoint ++}; ++ ++enum kgdb_bpstate { ++ bp_none = 0, ++ bp_removed, ++ bp_set, ++ bp_active ++}; ++ ++struct kgdb_bkpt { ++ unsigned long bpt_addr; ++ unsigned char saved_instr[BREAK_INSTR_SIZE]; ++ enum kgdb_bptype type; ++ enum kgdb_bpstate state; ++}; ++ ++/* The maximum number of KGDB I/O modules that can be loaded */ ++#define MAX_KGDB_IO_HANDLERS 3 ++ ++#ifndef MAX_BREAKPOINTS ++#define MAX_BREAKPOINTS 1000 ++#endif ++ ++#define KGDB_HW_BREAKPOINT 1 ++ ++/* Required functions. */ ++/** ++ * regs_to_gdb_regs - Convert ptrace regs to GDB regs ++ * @gdb_regs: A pointer to hold the registers in the order GDB wants. ++ * @regs: The &struct pt_regs of the current process. ++ * ++ * Convert the pt_regs in @regs into the format for registers that ++ * GDB expects, stored in @gdb_regs. ++ */ ++extern void regs_to_gdb_regs(unsigned long *gdb_regs, struct pt_regs *regs); ++ ++/** ++ * sleeping_regs_to_gdb_regs - Convert ptrace regs to GDB regs ++ * @gdb_regs: A pointer to hold the registers in the order GDB wants. ++ * @p: The &struct task_struct of the desired process. ++ * ++ * Convert the register values of the sleeping process in @p to ++ * the format that GDB expects. 
++ * This function is called when kgdb does not have access to the
++ * &struct pt_regs and therefore it should fill the gdb registers
++ * @gdb_regs with what has been saved in &struct thread_struct
++ * thread field during switch_to.
++ */
++extern void sleeping_thread_to_gdb_regs(unsigned long *gdb_regs,
++ struct task_struct *p);
++
++/**
++ * gdb_regs_to_regs - Convert GDB regs to ptrace regs.
++ * @gdb_regs: A pointer to hold the registers we've received from GDB.
++ * @regs: A pointer to a &struct pt_regs to hold these values in.
++ *
++ * Convert the GDB regs in @gdb_regs into the pt_regs, and store them
++ * in @regs.
++ */
++extern void gdb_regs_to_regs(unsigned long *gdb_regs, struct pt_regs *regs);
++
++/**
++ * kgdb_arch_handle_exception - Handle architecture-specific GDB packets.
++ * @vector: The error vector of the exception that happened.
++ * @signo: The signal number of the exception that happened.
++ * @err_code: The error code of the exception that happened.
++ * @remcom_in_buffer: The buffer of the packet we have read.
++ * @remcom_out_buffer: The buffer, of %BUFMAX size, to write a packet into.
++ * @regs: The &struct pt_regs of the current process.
++ *
++ * This function MUST handle the 'c' and 's' command packets,
++ * as well as packets to set / remove a hardware breakpoint, if used.
++ * If there are additional packets which the hardware needs to handle,
++ * they are handled here. The code should return -1 if it wants to
++ * process more packets, and a %0 or %1 if it wants to exit from the
++ * kgdb hook.
++ */
++extern int kgdb_arch_handle_exception(int vector, int signo, int err_code,
++ char *remcom_in_buffer,
++ char *remcom_out_buffer,
++ struct pt_regs *regs);
++
++#ifndef JMP_REGS_ALIGNMENT
++#define JMP_REGS_ALIGNMENT
++#endif
++
++extern unsigned long kgdb_fault_jmp_regs[];
++
++/**
++ * kgdb_fault_setjmp - Store state in case we fault.
++ * @curr_context: An array to store state into.
++ *
++ * Certain functions may try to access memory, and in doing so may
++ * cause a fault. When this happens, we trap it, restore state to
++ * this call, and let ourselves know that something bad has happened.
++ */
++extern asmlinkage int kgdb_fault_setjmp(unsigned long *curr_context);
++
++/**
++ * kgdb_fault_longjmp - Restore state when we have faulted.
++ * @curr_context: The previously stored state.
++ *
++ * When something bad does happen, this function is called to
++ * restore the known good state, and set the return value to 1, so
++ * we know something bad happened.
++ */
++extern asmlinkage void kgdb_fault_longjmp(unsigned long *curr_context);
++
++/* Optional functions. */
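
The kgdb_fault_setjmp()/kgdb_fault_longjmp() pair documented above is the kernel-mode version of the classic setjmp()/longjmp() fault-recovery pattern. A minimal userspace analogue (probe_address is a hypothetical helper; in the real stub the trap handler performs the longjmp):

    #include <setjmp.h>

    static jmp_buf fault_ctx;

    /* Returns 0 if the read succeeded, -1 if a fault unwound us back. */
    static int probe_address(const volatile char *addr, char *out)
    {
            if (setjmp(fault_ctx) != 0)
                    return -1;      /* thrown back here by the handler */
            *out = *addr;           /* the access that may fault; the
                                     * handler would longjmp(fault_ctx, 1) */
            return 0;
    }
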
++extern int kgdb_arch_init(void);
++extern void kgdb_disable_hw_debug(struct pt_regs *regs);
++extern void kgdb_post_master_code(struct pt_regs *regs, int e_vector,
++ int err_code);
++extern void kgdb_roundup_cpus(unsigned long flags);
++extern int kgdb_set_hw_break(unsigned long addr);
++extern int kgdb_remove_hw_break(unsigned long addr);
++extern void kgdb_remove_all_hw_break(void);
++extern void kgdb_correct_hw_break(void);
++extern void kgdb_shadowinfo(struct pt_regs *regs, char *buffer,
++ unsigned threadid);
++extern struct task_struct *kgdb_get_shadow_thread(struct pt_regs *regs,
++ int threadid);
++extern struct pt_regs *kgdb_shadow_regs(struct pt_regs *regs, int threadid);
++extern int kgdb_validate_break_address(unsigned long addr);
++extern int kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr);
++extern int kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle);
++
++/**
++ * struct kgdb_arch - Describe architecture-specific values.
++ * @gdb_bpt_instr: The instruction to trigger a breakpoint.
++ * @flags: Flags for the breakpoint, currently just %KGDB_HW_BREAKPOINT.
++ * @shadowth: A value of %1 indicates we shadow information on processes.
++ * @set_breakpoint: Allow an architecture to specify how to set a software
++ * breakpoint.
++ * @remove_breakpoint: Allow an architecture to specify how to remove a
++ * software breakpoint.
++ * @set_hw_breakpoint: Allow an architecture to specify how to set a hardware
++ * breakpoint.
++ * @remove_hw_breakpoint: Allow an architecture to specify how to remove a
++ * hardware breakpoint.
++ *
++ * The @shadowth flag is an option to shadow information not retrievable by
++ * gdb otherwise. This is deprecated in favor of a binutils which supports
++ * CFI macros.
++ */
++struct kgdb_arch {
++ unsigned char gdb_bpt_instr[BREAK_INSTR_SIZE];
++ unsigned long flags;
++ unsigned shadowth;
++ int (*set_breakpoint) (unsigned long, char *);
++ int (*remove_breakpoint)(unsigned long, char *);
++ int (*set_hw_breakpoint)(unsigned long, int, enum kgdb_bptype);
++ int (*remove_hw_breakpoint)(unsigned long, int, enum kgdb_bptype);
++};
++
++/* Thread reference */
++typedef unsigned char threadref[8];
++
++/**
++ * struct kgdb_io - Describe the interface for an I/O driver to talk with KGDB.
++ * @read_char: Pointer to a function that will return one char.
++ * @write_char: Pointer to a function that will write one char.
++ * @flush: Pointer to a function that will flush any pending writes.
++ * @init: Pointer to a function that will initialize the device.
++ * @late_init: Pointer to a function that will do any setup that has
++ * other dependencies.
++ * @pre_exception: Pointer to a function that will do any prep work for
++ * the I/O driver.
++ * @post_exception: Pointer to a function that will do any cleanup work
++ * for the I/O driver.
++ *
++ * The @init and @late_init function pointers allow for an I/O driver
++ * such as a serial driver to fully initialize the port with @init and
++ * be called very early, yet safely call request_irq() later in the boot
++ * sequence.
++ *
++ * @init is allowed to return a non-0 return value to indicate failure.
++ * If this is called early on, then KGDB will try again when it would call
++ * @late_init. If it has failed later in boot as well, the user will be
++ * notified.
++ */
++struct kgdb_io {
++ int (*read_char) (void);
++ void (*write_char) (u8);
++ void (*flush) (void);
++ int (*init) (void);
++ void (*late_init) (void);
++ void (*pre_exception) (void);
++ void (*post_exception) (void);
++};
++
++extern struct kgdb_io kgdb_io_ops;
++extern struct kgdb_arch arch_kgdb_ops;
++extern int kgdb_initialized;
++
++extern int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops);
++extern void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops);
++
++extern void __init kgdb8250_add_port(int i, struct uart_port *serial_req);
++extern void __init kgdb8250_add_platform_port(int i, struct plat_serial8250_port *serial_req);
++
++extern int kgdb_hex2long(char **ptr, long *long_val);
++extern char *kgdb_mem2hex(char *mem, char *buf, int count);
++extern char *kgdb_hex2mem(char *buf, char *mem, int count);
++extern int kgdb_get_mem(char *addr, unsigned char *buf, int count);
++extern int kgdb_set_mem(char *addr, unsigned char *buf, int count);
++
++int kgdb_isremovedbreak(unsigned long addr);
++int kgdb_skipexception(int exception, struct pt_regs *regs);
++
++extern int kgdb_handle_exception(int ex_vector, int signo, int err_code,
++ struct pt_regs *regs);
++extern void kgdb_nmihook(int cpu, void *regs);
++extern int debugger_step;
++extern atomic_t debugger_active;
++extern struct kgdb_arch *kgdb_ops;
++#else
++/* Stubs for when KGDB is not set. */
++static const atomic_t debugger_active = ATOMIC_INIT(0);
++#endif /* CONFIG_KGDB */
++#endif /* _KGDB_H_ */
++#endif /* __KERNEL__ */
+diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/module.h linux-2.6.18.kgdb/include/linux/module.h
+--- linux-2.6.18/include/linux/module.h 2006-09-20 07:42:06.000000000 +0400
++++ linux-2.6.18.kgdb/include/linux/module.h 2008-06-10 16:20:07.000000000 +0400
+@@ -224,8 +224,17 @@ enum module_state
+ MODULE_STATE_LIVE,
+ MODULE_STATE_COMING,
+ MODULE_STATE_GOING,
++ MODULE_STATE_GONE,
+ };
+
+#ifdef CONFIG_KGDB
+#define MAX_SECTNAME 31
+struct mod_section {
+ void *address;
+ char name[MAX_SECTNAME + 1];
+};
+#endif
+
+ /* Similar stuff for section attributes. */
+ #define MODULE_SECT_NAME_LEN 32
+ struct module_sect_attr
+@@ -253,6 +262,13 @@ struct module
+ /* Unique handle for this module */
+ char name[MODULE_NAME_LEN];
+
++#ifdef CONFIG_KGDB
++ /* keep kgdb info at the beginning so that gdb doesn't have a chance to
++ * miss out any fields */
++ unsigned long num_sections;
++ struct mod_section *mod_sections;
++#endif
++
+ /* Sysfs stuff.
*/ + struct module_kobject mkobj; + struct module_param_attrs *param_attrs; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/netpoll.h linux-2.6.18.kgdb/include/linux/netpoll.h +--- linux-2.6.18/include/linux/netpoll.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/linux/netpoll.h 2008-06-10 16:19:07.000000000 +0400 +@@ -17,7 +17,7 @@ struct netpoll; + struct netpoll { + struct net_device *dev; + char dev_name[16], *name; +- void (*rx_hook)(struct netpoll *, int, char *, int); ++ void (*rx_hook)(struct netpoll *, int, char *, int, struct sk_buff *); + void (*drop)(struct sk_buff *skb); + u32 local_ip, remote_ip; + u16 local_port, remote_port; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/include/linux/serial_8250.h linux-2.6.18.kgdb/include/linux/serial_8250.h +--- linux-2.6.18/include/linux/serial_8250.h 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/include/linux/serial_8250.h 2008-06-10 16:19:03.000000000 +0400 +@@ -56,6 +56,7 @@ struct uart_port; + + int serial8250_register_port(struct uart_port *); + void serial8250_unregister_port(int line); ++void serial8250_unregister_by_port(struct uart_port *port); + void serial8250_suspend_port(int line); + void serial8250_resume_port(int line); + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/Makefile linux-2.6.18.kgdb/kernel/Makefile +--- linux-2.6.18/kernel/Makefile 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/kernel/Makefile 2008-06-10 16:18:58.000000000 +0400 +@@ -42,6 +42,7 @@ obj-$(CONFIG_STOP_MACHINE) += stop_machi + obj-$(CONFIG_AUDIT) += audit.o auditfilter.o + obj-$(CONFIG_AUDITSYSCALL) += auditsc.o + obj-$(CONFIG_KPROBES) += kprobes.o ++obj-$(CONFIG_KGDB) += kgdb.o kgdbarchlib.o + obj-$(CONFIG_SYSFS) += ksysfs.o + obj-$(CONFIG_DETECT_SOFTLOCKUP) += softlockup.o + obj-$(CONFIG_GENERIC_HARDIRQS) += irq/ +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/kgdb.c linux-2.6.18.kgdb/kernel/kgdb.c +--- linux-2.6.18/kernel/kgdb.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/kernel/kgdb.c 2008-06-10 16:20:11.000000000 +0400 +@@ -0,0 +1,1778 @@ ++/* ++ * kernel/kgdb.c ++ * ++ * Maintainer: Tom Rini ++ * ++ * Copyright (C) 2000-2001 VERITAS Software Corporation. ++ * Copyright (C) 2002-2004 Timesys Corporation ++ * Copyright (C) 2003-2004 Amit S. Kale ++ * Copyright (C) 2004 Pavel Machek ++ * Copyright (C) 2004-2005 Tom Rini ++ * Copyright (C) 2004-2006 LinSysSoft Technologies Pvt. Ltd. ++ * Copyright (C) 2005 Wind River Systems, Inc. ++ * ++ * Contributors at various stages not listed above: ++ * Jason Wessel ( jason.wessel@windriver.com ) ++ * George Anzinger ++ * Anurekh Saxena (anurekh.saxena@timesys.com) ++ * Lake Stevens Instrument Division (Glenn Engel) ++ * Jim Kingdon, Cygnus Support. ++ * ++ * Original KGDB stub: David Grothe , ++ * Tigran Aivazian ++ * ++ * This file is licensed under the terms of the GNU General Public License ++ * version 2. This program is licensed "as is" without any warranty of any ++ * kind, whether express or implied. ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++extern int pid_max; ++/* How many times to count all of the waiting CPUs */ ++#define ROUNDUP_WAIT 640000 /* Arbitrary, increase if needed. 
*/
++#define BUF_THREAD_ID_SIZE 16
++
++/*
++ * kgdb_initialized with a value of 1 indicates that kgdb is set up and is
++ * all ready to serve breakpoints and other kernel exceptions. A value of
++ * -1 indicates that we have tried to initialize early, and need to try
++ * again later.
++ */
++int kgdb_initialized;
++/* Is a host GDB connected to us? */
++int kgdb_connected;
++/* Could we be about to try to access a bad memory location? If so we
++ * also need to flag that this has happened. */
++int kgdb_may_fault;
++/* All the KGDB handlers are installed */
++int kgdb_from_module_registered = 0;
++
++/* We provide a kgdb_io_ops structure that may be overridden. */
++struct kgdb_io __attribute__ ((weak)) kgdb_io_ops;
++
++static struct kgdb_io kgdb_io_ops_prev[MAX_KGDB_IO_HANDLERS];
++static int kgdb_io_handler_cnt = 0;
++
++/* Export the following symbols for use with kernel modules */
++EXPORT_SYMBOL(kgdb_io_ops);
++EXPORT_SYMBOL(kgdb_tasklet_breakpoint);
++EXPORT_SYMBOL(kgdb_connected);
++EXPORT_SYMBOL(kgdb_register_io_module);
++EXPORT_SYMBOL(kgdb_unregister_io_module);
++EXPORT_SYMBOL(debugger_active);
++
++/*
++ * Holds information about breakpoints in a kernel. These breakpoints are
++ * added and removed by gdb.
++ */
++struct kgdb_bkpt kgdb_break[MAX_BREAKPOINTS];
++
++static const char hexchars[] = "0123456789abcdef";
++
++static spinlock_t slavecpulocks[NR_CPUS];
++static atomic_t procindebug[NR_CPUS];
++atomic_t kgdb_setting_breakpoint;
++EXPORT_SYMBOL(kgdb_setting_breakpoint);
++struct task_struct *kgdb_usethread, *kgdb_contthread;
++
++int debugger_step;
++atomic_t debugger_active;
++
++/* Our I/O buffers. */
++static char remcom_in_buffer[BUFMAX];
++static char remcom_out_buffer[BUFMAX];
++/* Storage for the registers, in GDB format. */
++static unsigned long gdb_regs[(NUMREGBYTES + sizeof(unsigned long) - 1) /
++ sizeof(unsigned long)];
++/* Storage of registers for handling a fault.
*/ ++unsigned long kgdb_fault_jmp_regs[NUMCRITREGBYTES / sizeof(unsigned long)] ++ JMP_REGS_ALIGNMENT; ++static int kgdb_notify_reboot(struct notifier_block *this, ++ unsigned long code ,void *x); ++struct debuggerinfo_struct { ++ void *debuggerinfo; ++ struct task_struct *task; ++} kgdb_info[NR_CPUS]; ++ ++/* to keep track of the CPU which is doing the single stepping*/ ++atomic_t cpu_doing_single_step = ATOMIC_INIT(-1); ++ ++atomic_t kgdb_sync_softlockup[NR_CPUS] = {ATOMIC_INIT(0)}; ++ ++/* reboot notifier block */ ++static struct notifier_block kgdb_reboot_notifier = { ++ .notifier_call = kgdb_notify_reboot, ++ .next = NULL, ++ .priority = INT_MAX, ++}; ++ ++static int hex(char ch) ++{ ++ if ((ch >= 'a') && (ch <= 'f')) ++ return (ch - 'a' + 10); ++ if ((ch >= '0') && (ch <= '9')) ++ return (ch - '0'); ++ if ((ch >= 'A') && (ch <= 'F')) ++ return (ch - 'A' + 10); ++ return (-1); ++} ++ ++/* scan for the sequence $# */ ++static void get_packet(char *buffer) ++{ ++ unsigned char checksum; ++ unsigned char xmitcsum; ++ int count; ++ char ch; ++ if (!kgdb_io_ops.read_char) ++ return; ++ do { ++ /* Spin and wait around for the start character, ignore all ++ * other characters */ ++ while ((ch = (kgdb_io_ops.read_char())) != '$') ; ++ kgdb_connected = 1; ++ checksum = 0; ++ xmitcsum = -1; ++ ++ count = 0; ++ ++ /* now, read until a # or end of buffer is found */ ++ while (count < (BUFMAX - 1)) { ++ ch = kgdb_io_ops.read_char(); ++ if (ch == '#') ++ break; ++ checksum = checksum + ch; ++ buffer[count] = ch; ++ count = count + 1; ++ } ++ buffer[count] = 0; ++ ++ if (ch == '#') { ++ xmitcsum = hex(kgdb_io_ops.read_char()) << 4; ++ xmitcsum += hex(kgdb_io_ops.read_char()); ++ ++ if (checksum != xmitcsum) ++ /* failed checksum */ ++ kgdb_io_ops.write_char('-'); ++ else ++ /* successful transfer */ ++ kgdb_io_ops.write_char('+'); ++ if (kgdb_io_ops.flush) ++ kgdb_io_ops.flush(); ++ } ++ } while (checksum != xmitcsum); ++} ++ ++/* ++ * Send the packet in buffer. ++ * Check for gdb connection if asked for. ++ */ ++static void put_packet(char *buffer) ++{ ++ unsigned char checksum; ++ int count; ++ char ch; ++ ++ if (!kgdb_io_ops.write_char) ++ return; ++ /* $#. */ ++ while (1) { ++ kgdb_io_ops.write_char('$'); ++ checksum = 0; ++ count = 0; ++ ++ while ((ch = buffer[count])) { ++ kgdb_io_ops.write_char(ch); ++ checksum += ch; ++ count++; ++ } ++ ++ kgdb_io_ops.write_char('#'); ++ kgdb_io_ops.write_char(hexchars[checksum >> 4]); ++ kgdb_io_ops.write_char(hexchars[checksum % 16]); ++ if (kgdb_io_ops.flush) ++ kgdb_io_ops.flush(); ++ ++ /* Now see what we get in reply. */ ++ ch = kgdb_io_ops.read_char(); ++ ++ if (ch == 3) ++ ch = kgdb_io_ops.read_char(); ++ ++ /* If we get an ACK, we are done. */ ++ if (ch == '+') ++ return; ++ ++ /* If we get the start of another packet, this means ++ * that GDB is attempting to reconnect. We will NAK ++ * the packet being sent, and stop trying to send this ++ * packet. */ ++ if (ch == '$') { ++ kgdb_io_ops.write_char('-'); ++ if (kgdb_io_ops.flush) ++ kgdb_io_ops.flush(); ++ return; ++ } ++ } ++} ++ ++/* ++ * convert the memory pointed to by mem into hex, placing result in buf ++ * return a pointer to the last char put in buf (null). May return an error. 
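
For orientation, the framing implemented by get_packet() and put_packet() above is the standard GDB remote serial protocol: a packet travels as $<payload>#<checksum>, where the checksum is the modulo-256 sum of the payload bytes, sent as two lowercase hex digits, and the receiver answers '+' (ACK) or '-' (NAK). A standalone sketch of the sending side (frame_packet is a hypothetical helper, not part of this patch):

    #include <stdio.h>

    /* Wrap payload in the $<payload>#<checksum> framing. */
    static void frame_packet(const char *payload, char *out)
    {
            unsigned char sum = 0;
            const char *p;

            for (p = payload; *p != '\0'; p++)
                    sum += (unsigned char)*p;
            sprintf(out, "$%s#%02x", payload, sum);
    }

frame_packet("g", buf), for instance, yields "$g#67".
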
++ */ ++char *kgdb_mem2hex(char *mem, char *buf, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return ERR_PTR(-EINVAL); ++ } ++ /* Accessing some registers in a single load instruction is ++ * required to avoid bad side effects for some I/O registers. ++ */ ++ if ((count == 2) && (((long)mem & 1) == 0)) { ++ unsigned short tmp_s = *(unsigned short *)mem; ++ mem += 2; ++#ifdef __BIG_ENDIAN ++ *buf++ = hexchars[(tmp_s >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_s & 0xf]; ++#else ++ *buf++ = hexchars[(tmp_s >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_s & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_s >> 8) & 0xf]; ++#endif ++ } else if ((count == 4) && (((long)mem & 3) == 0)) { ++ unsigned long tmp_l = *(unsigned int *)mem; ++ mem += 4; ++#ifdef __BIG_ENDIAN ++ *buf++ = hexchars[(tmp_l >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 24) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_l & 0xf]; ++#else ++ *buf++ = hexchars[(tmp_l >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_l & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_l >> 24) & 0xf]; ++#endif ++#ifdef CONFIG_64BIT ++ } else if ((count == 8) && (((long)mem & 7) == 0)) { ++ unsigned long long tmp_ll = *(unsigned long long *)mem; ++ mem += 8; ++#ifdef __BIG_ENDIAN ++ *buf++ = hexchars[(tmp_ll >> 60) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 56) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 52) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 48) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 44) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 40) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 36) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 32) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 24) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_ll & 0xf]; ++#else ++ *buf++ = hexchars[(tmp_ll >> 4) & 0xf]; ++ *buf++ = hexchars[tmp_ll & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 12) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 8) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 20) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 16) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 28) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 24) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 36) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 32) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 44) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 40) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 52) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 48) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 60) & 0xf]; ++ *buf++ = hexchars[(tmp_ll >> 56) & 0xf]; ++#endif ++#endif ++ } else { ++ while (count-- > 0) { ++ unsigned char ch = *mem++; ++ *buf++ = hexchars[ch >> 4]; ++ *buf++ = hexchars[ch & 0xf]; ++ } ++ } ++ kgdb_may_fault = 0; ++ *buf = 0; ++ return (buf); ++} ++ ++/* ++ * Copy the binary array pointed to by buf into mem. Fix $, #, and ++ * 0x7d escaped with 0x7d. 
Return a pointer to the character after ++ * the last byte written. ++ */ ++static char *kgdb_ebin2mem(char *buf, char *mem, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return ERR_PTR(-EINVAL); ++ } ++ for (; count > 0; count--, buf++) { ++ if (*buf == 0x7d) ++ *mem++ = *(++buf) ^ 0x20; ++ else ++ *mem++ = *buf; ++ } ++ kgdb_may_fault = 0; ++ return mem; ++} ++ ++/* ++ * convert the hex array pointed to by buf into binary to be placed in mem ++ * return a pointer to the character AFTER the last byte written ++ * May return an error. ++ */ ++char *kgdb_hex2mem(char *buf, char *mem, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return ERR_PTR(-EINVAL); ++ } ++ if ((count == 2) && (((long)mem & 1) == 0)) { ++ unsigned short tmp_s = 0; ++#ifdef __BIG_ENDIAN ++ tmp_s |= hex(*buf++) << 12; ++ tmp_s |= hex(*buf++) << 8; ++ tmp_s |= hex(*buf++) << 4; ++ tmp_s |= hex(*buf++); ++#else ++ tmp_s |= hex(*buf++) << 4; ++ tmp_s |= hex(*buf++); ++ tmp_s |= hex(*buf++) << 12; ++ tmp_s |= hex(*buf++) << 8; ++#endif ++ *(unsigned short *)mem = tmp_s; ++ mem += 2; ++ } else if ((count == 4) && (((long)mem & 3) == 0)) { ++ unsigned long tmp_l = 0; ++#ifdef __BIG_ENDIAN ++ tmp_l |= hex(*buf++) << 28; ++ tmp_l |= hex(*buf++) << 24; ++ tmp_l |= hex(*buf++) << 20; ++ tmp_l |= hex(*buf++) << 16; ++ tmp_l |= hex(*buf++) << 12; ++ tmp_l |= hex(*buf++) << 8; ++ tmp_l |= hex(*buf++) << 4; ++ tmp_l |= hex(*buf++); ++#else ++ tmp_l |= hex(*buf++) << 4; ++ tmp_l |= hex(*buf++); ++ tmp_l |= hex(*buf++) << 12; ++ tmp_l |= hex(*buf++) << 8; ++ tmp_l |= hex(*buf++) << 20; ++ tmp_l |= hex(*buf++) << 16; ++ tmp_l |= hex(*buf++) << 28; ++ tmp_l |= hex(*buf++) << 24; ++#endif ++ *(unsigned long *)mem = tmp_l; ++ mem += 4; ++ } else { ++ int i; ++ for (i = 0; i < count; i++) { ++ unsigned char ch = hex(*buf++) << 4; ++ ch |= hex(*buf++); ++ *mem++ = ch; ++ } ++ } ++ kgdb_may_fault = 0; ++ return (mem); ++} ++ ++/* ++ * While we find nice hex chars, build a long_val. ++ * Return number of chars processed. ++ */ ++int kgdb_hex2long(char **ptr, long *long_val) ++{ ++ int hex_val, num = 0; ++ ++ *long_val = 0; ++ ++ while (**ptr) { ++ hex_val = hex(**ptr); ++ if (hex_val >= 0) { ++ *long_val = (*long_val << 4) | hex_val; ++ num++; ++ } else ++ break; ++ ++ (*ptr)++; ++ } ++ ++ return (num); ++} ++ ++/* Write memory due to an 'M' or 'X' packet. 
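
kgdb_ebin2mem() above is the unescaping half of the binary 'X' packet encoding. The escaping half, done by the host, follows the GDB convention that '#' (0x23), '$' (0x24) and '}' (0x7d) are sent as 0x7d followed by the byte XORed with 0x20. A sketch (ebin_escape is a hypothetical helper, not part of this patch):

    /* Escape raw bytes for an 'X' packet; dst needs up to 2 * len bytes.
     * Returns the escaped length. */
    static int ebin_escape(const unsigned char *src, int len,
                           unsigned char *dst)
    {
            int i, n = 0;

            for (i = 0; i < len; i++) {
                    unsigned char c = src[i];

                    if (c == 0x23 || c == 0x24 || c == 0x7d) {
                            dst[n++] = 0x7d;        /* escape marker */
                            dst[n++] = c ^ 0x20;
                    } else {
                            dst[n++] = c;
                    }
            }
            return n;
    }
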
*/ ++static char *write_mem_msg(int binary) ++{ ++ char *ptr = &remcom_in_buffer[1]; ++ unsigned long addr, length; ++ ++ if (kgdb_hex2long(&ptr, &addr) > 0 && *(ptr++) == ',' && ++ kgdb_hex2long(&ptr, &length) > 0 && *(ptr++) == ':') { ++ if (binary) ++ ptr = kgdb_ebin2mem(ptr, (char *)addr, length); ++ else ++ ptr = kgdb_hex2mem(ptr, (char *)addr, length); ++ if (CACHE_FLUSH_IS_SAFE) ++ flush_icache_range(addr, addr + length + 1); ++ if (IS_ERR(ptr)) ++ return ptr; ++ return NULL; ++ } ++ ++ return ERR_PTR(-EINVAL); ++} ++ ++static inline char *pack_hex_byte(char *pkt, int byte) ++{ ++ *pkt++ = hexchars[(byte >> 4) & 0xf]; ++ *pkt++ = hexchars[(byte & 0xf)]; ++ return pkt; ++} ++ ++static inline void error_packet(char *pkt, int error) ++{ ++ error = -error; ++ pkt[0] = 'E'; ++ pkt[1] = hexchars[(error / 10)]; ++ pkt[2] = hexchars[(error % 10)]; ++ pkt[3] = '\0'; ++} ++ ++static char *pack_threadid(char *pkt, threadref * id) ++{ ++ char *limit; ++ unsigned char *altid; ++ ++ altid = (unsigned char *)id; ++ limit = pkt + BUF_THREAD_ID_SIZE; ++ while (pkt < limit) ++ pkt = pack_hex_byte(pkt, *altid++); ++ ++ return pkt; ++} ++ ++void int_to_threadref(threadref * id, int value) ++{ ++ unsigned char *scan; ++ int i = 4; ++ ++ scan = (unsigned char *)id; ++ while (i--) ++ *scan++ = 0; ++ *scan++ = (value >> 24) & 0xff; ++ *scan++ = (value >> 16) & 0xff; ++ *scan++ = (value >> 8) & 0xff; ++ *scan++ = (value & 0xff); ++} ++ ++static struct task_struct *getthread(struct pt_regs *regs, int tid) ++{ ++ if (last_pid == 0) ++ return current; ++ ++ if (num_online_cpus() && ++ (tid >= pid_max + num_online_cpus() + kgdb_ops->shadowth)) ++ return NULL; ++ ++ if (kgdb_ops->shadowth && (tid >= pid_max + num_online_cpus())) ++ return kgdb_get_shadow_thread(regs, tid - pid_max - ++ num_online_cpus()); ++ ++ if (tid >= pid_max) ++ return idle_task(tid - pid_max); ++ ++ if (!tid) ++ return NULL; ++ ++ return find_task_by_pid(tid); ++} ++ ++#ifdef CONFIG_SMP ++static void kgdb_wait(struct pt_regs *regs) ++{ ++ unsigned long flags; ++ int processor; ++ ++ local_irq_save(flags); ++ processor = smp_processor_id(); ++ kgdb_info[processor].debuggerinfo = regs; ++ kgdb_info[processor].task = current; ++ atomic_set(&procindebug[processor], 1); ++ atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1); ++ ++ /* Wait till master processor goes completely into the debugger. 
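
The thread-ID plumbing above, int_to_threadref() followed by pack_threadid(), amounts to printing the pid as an 8-byte big-endian value in hex. For a 32-bit pid that is simply sixteen zero-padded hex digits, as this hypothetical helper illustrates:

    #include <stdio.h>

    /* Render pid the way pack_threadid() does: 16 big-endian hex digits. */
    static void format_threadid(unsigned int pid, char out[17])
    {
            sprintf(out, "%016x", pid);
    }
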
++ * FIXME: this looks racy */ ++ while (!atomic_read(&procindebug[atomic_read(&debugger_active) - 1])) { ++ int i = 10; /* an arbitrary number */ ++ ++ while (--i) ++ cpu_relax(); ++ } ++ ++ /* Wait till master processor is done with debugging */ ++ spin_lock_nested(&slavecpulocks[processor], processor); ++ ++ /* This has been taken from x86 kgdb implementation and ++ * will be needed by architectures that have SMP support ++ */ ++ kgdb_correct_hw_break(); ++ ++ kgdb_info[processor].debuggerinfo = NULL; ++ kgdb_info[processor].task = NULL; ++ ++ /* Signal the master processor that we are done */ ++ atomic_set(&procindebug[processor], 0); ++ spin_unlock(&slavecpulocks[processor]); ++ local_irq_restore(flags); ++} ++#endif ++ ++int kgdb_get_mem(char *addr, unsigned char *buf, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return -EINVAL; ++ } ++ while (count) { ++ if ((unsigned long)addr < TASK_SIZE) ++ return -EINVAL; ++ *buf++ = *addr++; ++ count--; ++ } ++ kgdb_may_fault = 0; ++ return 0; ++} ++ ++int kgdb_set_mem(char *addr, unsigned char *buf, int count) ++{ ++ kgdb_may_fault = 1; ++ if ((kgdb_fault_setjmp(kgdb_fault_jmp_regs)) != 0) { ++ kgdb_may_fault = 0; ++ return -EINVAL; ++ } ++ while (count) { ++ if ((unsigned long)addr < TASK_SIZE) ++ return -EINVAL; ++ *addr++ = *buf++; ++ count--; ++ } ++ kgdb_may_fault = 0; ++ return 0; ++} ++int kgdb_activate_sw_breakpoints(void) ++{ ++ int i; ++ int error = 0; ++ unsigned long addr; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state != bp_set) ++ continue; ++ addr = kgdb_break[i].bpt_addr; ++ if ((error = kgdb_arch_set_breakpoint(addr, ++ kgdb_break[i].saved_instr))) ++ return error; ++ ++ if (CACHE_FLUSH_IS_SAFE) { ++ if (current->mm && addr < TASK_SIZE) ++ flush_cache_range(current->mm->mmap_cache, ++ addr, addr + BREAK_INSTR_SIZE); ++ else ++ flush_icache_range(addr, addr + ++ BREAK_INSTR_SIZE); ++ } ++ ++ kgdb_break[i].state = bp_active; ++ } ++ return 0; ++} ++ ++static int kgdb_set_sw_break(unsigned long addr) ++{ ++ int i, breakno = -1; ++ int error = 0; ++ if ((error = kgdb_validate_break_address(addr)) < 0) ++ return error; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if ((kgdb_break[i].state == bp_set) && ++ (kgdb_break[i].bpt_addr == addr)) ++ return -EEXIST; ++ } ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state == bp_removed && ++ kgdb_break[i].bpt_addr == addr) { ++ breakno = i; ++ break; ++ } ++ } ++ ++ if (breakno == -1) { ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state == bp_none) { ++ breakno = i; ++ break; ++ } ++ } ++ } ++ if (breakno == -1) ++ return -E2BIG; ++ ++ kgdb_break[breakno].state = bp_set; ++ kgdb_break[breakno].type = bp_breakpoint; ++ kgdb_break[breakno].bpt_addr = addr; ++ ++ return 0; ++} ++ ++int kgdb_deactivate_sw_breakpoints(void) ++{ ++ int i; ++ int error = 0; ++ unsigned long addr; ++ for (i = 0; i < MAX_BREAKPOINTS; i++) { ++ if (kgdb_break[i].state != bp_active) ++ continue; ++ addr = kgdb_break[i].bpt_addr; ++ if ((error = kgdb_arch_remove_breakpoint(addr, ++ kgdb_break[i].saved_instr))) ++ return error; ++ ++ if (CACHE_FLUSH_IS_SAFE && current->mm && ++ addr < TASK_SIZE) ++ flush_cache_range(current->mm->mmap_cache, ++ addr, addr + BREAK_INSTR_SIZE); ++ else if (CACHE_FLUSH_IS_SAFE) ++ flush_icache_range(addr, ++ addr + BREAK_INSTR_SIZE); ++ kgdb_break[i].state = bp_set; ++ } ++ return 0; ++} ++ ++static int kgdb_remove_sw_break(unsigned long addr) ++{ ++ int i; 
++
++ for (i = 0; i < MAX_BREAKPOINTS; i++) {
++ if ((kgdb_break[i].state == bp_set) &&
++ (kgdb_break[i].bpt_addr == addr)) {
++ kgdb_break[i].state = bp_removed;
++ return 0;
++ }
++ }
++ return -ENOENT;
++}
++
++int kgdb_isremovedbreak(unsigned long addr)
++{
++ int i;
++ for (i = 0; i < MAX_BREAKPOINTS; i++) {
++ if ((kgdb_break[i].state == bp_removed) &&
++ (kgdb_break[i].bpt_addr == addr)) {
++ return 1;
++ }
++ }
++ return 0;
++}
++
++int remove_all_break(void)
++{
++ int i;
++ int error;
++ unsigned long addr;
++
++ /* Clear memory breakpoints. */
++ for (i = 0; i < MAX_BREAKPOINTS; i++) {
++ if (kgdb_break[i].state != bp_set)
++ continue;
++ addr = kgdb_break[i].bpt_addr;
++ if ((error = kgdb_arch_remove_breakpoint(addr,
++ kgdb_break[i].saved_instr)))
++ return error;
++ kgdb_break[i].state = bp_removed;
++ }
++
++ /* Clear hardware breakpoints. */
++ kgdb_remove_all_hw_break();
++
++ return 0;
++}
++
++static inline int shadow_pid(int realpid)
++{
++ if (realpid) {
++ return realpid;
++ }
++ return pid_max + smp_processor_id();
++}
++
++static char gdbmsgbuf[BUFMAX + 1];
++static void kgdb_msg_write(const char *s, int len)
++{
++ int i;
++ int wcount;
++ char *bufptr;
++
++ /* 'O'utput */
++ gdbmsgbuf[0] = 'O';
++
++ /* Fill and send buffers... */
++ while (len > 0) {
++ bufptr = gdbmsgbuf + 1;
++
++ /* Calculate how many this time */
++ if ((len << 1) > (BUFMAX - 2))
++ wcount = (BUFMAX - 2) >> 1;
++ else
++ wcount = len;
++
++ /* Pack in hex chars */
++ for (i = 0; i < wcount; i++)
++ bufptr = pack_hex_byte(bufptr, s[i]);
++ *bufptr = '\0';
++
++ /* Move up */
++ s += wcount;
++ len -= wcount;
++
++ /* Write packet */
++ put_packet(gdbmsgbuf);
++ }
++}
++
++/*
++ * This function does all command processing for interfacing to gdb.
++ *
++ * Locking hierarchy:
++ * interface locks, if any (begin_session)
++ * kgdb lock (debugger_active)
++ *
++ * Note that since we can be in here prior to our cpumask being filled
++ * out, we err on the side of caution and loop over NR_CPUS instead
++ * of a for_each_online_cpu.
++ *
++ */
++int kgdb_handle_exception(int ex_vector, int signo, int err_code,
++ struct pt_regs *linux_regs)
++{
++ unsigned long length, addr;
++ char *ptr;
++ unsigned long flags;
++ unsigned i;
++ long threadid;
++ threadref thref;
++ struct task_struct *thread = NULL;
++ unsigned procid;
++ int numshadowth = num_online_cpus() + kgdb_ops->shadowth;
++ long kgdb_usethreadid = 0;
++ int error = 0, all_cpus_synced = 0;
++ struct pt_regs *shadowregs;
++ int processor = smp_processor_id();
++ void *local_debuggerinfo;
++
++ /* Panic on recursive debugger calls. */
++ if (atomic_read(&debugger_active) == smp_processor_id() + 1)
++ return 0;
++
++ acquirelock:
++
++ /* Call the I/O driver's pre_exception routine if the I/O
++ * driver defined one
++ */
++ if (kgdb_io_ops.pre_exception)
++ kgdb_io_ops.pre_exception();
++
++ /*
++ * Interrupts will be restored by the 'trap return' code, except when
++ * single stepping.
++ */
++ local_irq_save(flags);
++
++ /* Hold debugger_active */
++ procid = smp_processor_id();
++
++ while (cmpxchg(&atomic_read(&debugger_active), 0, (procid + 1)) != 0) {
++ int i = 25; /* an arbitrary number */
++
++ while (--i)
++ cpu_relax();
++
++ if (atomic_read(&cpu_doing_single_step) != -1 &&
++ atomic_read(&cpu_doing_single_step) != procid)
++ udelay(1);
++ }
++
++ atomic_set(&kgdb_sync_softlockup[smp_processor_id()], 1);
++
++ /*
++ * Don't enter if the last instance of the exception handler wanted to
++ * come into the debugger again.
++ */
++ if (atomic_read(&cpu_doing_single_step) != -1 &&
++ atomic_read(&cpu_doing_single_step) != procid) {
++ atomic_set(&debugger_active, 0);
++ local_irq_restore(flags);
++ goto acquirelock;
++ }
++
++ /*
++ * Don't enter if we have hit a removed breakpoint.
++ */
++ if (kgdb_skipexception(ex_vector, linux_regs))
++ goto kgdb_restore;
++
++ kgdb_info[processor].debuggerinfo = linux_regs;
++ kgdb_info[processor].task = current;
++
++ kgdb_disable_hw_debug(linux_regs);
++
++ if (!debugger_step || !kgdb_contthread)
++ for (i = 0; i < NR_CPUS; i++)
++ spin_lock_nested(&slavecpulocks[i], i);
++
++ /* Make sure we get the other CPUs */
++ if (!debugger_step || !kgdb_contthread)
++ kgdb_roundup_cpus(flags);
++
++ /* spin_lock code is good enough as a barrier so we don't
++ * need one here */
++ atomic_set(&procindebug[processor], 1);
++
++ /* Wait a reasonable time for the other CPUs to be notified and
++ * be waiting for us. Very early on this could be imperfect
++ * as num_online_cpus() could be 0.*/
++ for (i = 0; i < ROUNDUP_WAIT; i++) {
++ int cpu, num = 0;
++ for (cpu = 0; cpu < NR_CPUS; cpu++) {
++ if (atomic_read(&procindebug[cpu]))
++ num++;
++ }
++ if (num >= num_online_cpus()) {
++ all_cpus_synced = 1;
++ break;
++ }
++ }
++
++ /* Clear the out buffer. */
++ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
++
++ /* Master processor is completely in the debugger */
++ kgdb_post_master_code(linux_regs, ex_vector, err_code);
++ kgdb_deactivate_sw_breakpoints();
++ debugger_step = 0;
++ kgdb_contthread = NULL;
++
++ if (kgdb_connected) {
++ /* If we're still unable to round up all of the CPUs,
++ * send an 'O' packet informing the user again. */
++ if (!all_cpus_synced)
++ kgdb_msg_write("Not all CPUs have been synced for "
++ "KGDB\n", 39);
++ /* Reply to host that an exception has occurred */
++ ptr = remcom_out_buffer;
++ *ptr++ = 'T';
++ *ptr++ = hexchars[(signo >> 4) % 16];
++ *ptr++ = hexchars[signo % 16];
++ ptr += strlen(strcpy(ptr, "thread:"));
++ int_to_threadref(&thref, shadow_pid(current->pid));
++ ptr = pack_threadid(ptr, &thref);
++ *ptr++ = ';';
++
++ put_packet(remcom_out_buffer);
++ }
++
++ kgdb_usethread = kgdb_info[processor].task;
++ kgdb_usethreadid = shadow_pid(kgdb_info[processor].task->pid);
++
++ while (kgdb_io_ops.read_char) {
++ char *bpt_type;
++ error = 0;
++
++ /* Clear the out buffer. */
++ memset(remcom_out_buffer, 0, sizeof(remcom_out_buffer));
++
++ get_packet(remcom_in_buffer);
++
++ switch (remcom_in_buffer[0]) {
++ case '?':
++ /* We know that this packet is only sent
++ * during initial connect. So to be safe,
++ * we clear out our breakpoints now in case
++ * GDB is reconnecting. */
++ remove_all_break();
++ /* Also, if we haven't been able to round up all
++ * CPUs, send an 'O' packet informing the user
++ * as much. Only need to do this once. */
++ if (!all_cpus_synced)
++ kgdb_msg_write("Not all CPUs have been "
++ "synced for KGDB\n", 39);
++ remcom_out_buffer[0] = 'S';
++ remcom_out_buffer[1] = hexchars[signo >> 4];
++ remcom_out_buffer[2] = hexchars[signo % 16];
++ break;
++
++ case 'g': /* return the value of the CPU registers */
++ thread = kgdb_usethread;
++
++ if (!thread) {
++ thread = kgdb_info[processor].task;
++ local_debuggerinfo =
++ kgdb_info[processor].debuggerinfo;
++ } else {
++ local_debuggerinfo = NULL;
++ for (i = 0; i < NR_CPUS; i++) {
++ /* Try to find the task on some other
++ * or possibly this node; if we do not
++ * find the matching task then we try
++ * to approximate the results.
++ */ ++ if (thread == kgdb_info[i].task) ++ local_debuggerinfo = ++ kgdb_info[i].debuggerinfo; ++ } ++ } ++ ++ /* All threads that don't have debuggerinfo should be ++ * in __schedule() sleeping, since all other CPUs ++ * are in kgdb_wait, and thus have debuggerinfo. */ ++ if (kgdb_ops->shadowth && ++ kgdb_usethreadid >= pid_max + num_online_cpus()) { ++ shadowregs = kgdb_shadow_regs(linux_regs, ++ kgdb_usethreadid - ++ pid_max - ++ num_online_cpus ++ ()); ++ if (!shadowregs) { ++ error_packet(remcom_out_buffer, ++ -EINVAL); ++ break; ++ } ++ regs_to_gdb_regs(gdb_regs, shadowregs); ++ } else if (local_debuggerinfo) ++ regs_to_gdb_regs(gdb_regs, local_debuggerinfo); ++ else { ++ /* Pull stuff saved during ++ * switch_to; nothing else is ++ * accessible (or even particularly relevant). ++ * This should be enough for a stack trace. */ ++ sleeping_thread_to_gdb_regs(gdb_regs, thread); ++ } ++ kgdb_mem2hex((char *)gdb_regs, remcom_out_buffer, ++ NUMREGBYTES); ++ break; ++ ++ /* set the value of the CPU registers - return OK */ ++ case 'G': ++ kgdb_hex2mem(&remcom_in_buffer[1], (char *)gdb_regs, ++ NUMREGBYTES); ++ ++ if (kgdb_usethread && kgdb_usethread != current) ++ error_packet(remcom_out_buffer, -EINVAL); ++ else { ++ gdb_regs_to_regs(gdb_regs, linux_regs); ++ strcpy(remcom_out_buffer, "OK"); ++ } ++ break; ++ ++ /* mAA..AA,LLLL Read LLLL bytes at address AA..AA */ ++ case 'm': ++ ptr = &remcom_in_buffer[1]; ++ if (kgdb_hex2long(&ptr, &addr) > 0 && *ptr++ == ',' && ++ kgdb_hex2long(&ptr, &length) > 0) { ++ if (IS_ERR(ptr = kgdb_mem2hex((char *)addr, ++ remcom_out_buffer, ++ length))) ++ error_packet(remcom_out_buffer, ++ PTR_ERR(ptr)); ++ } else ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ ++ /* MAA..AA,LLLL: Write LLLL bytes at address AA..AA */ ++ case 'M': ++ if (IS_ERR(ptr = write_mem_msg(0))) ++ error_packet(remcom_out_buffer, PTR_ERR(ptr)); ++ else ++ strcpy(remcom_out_buffer, "OK"); ++ break; ++ /* XAA..AA,LLLL: Write LLLL bytes at address AA..AA */ ++ case 'X': ++ if (IS_ERR(ptr = write_mem_msg(1))) ++ error_packet(remcom_out_buffer, PTR_ERR(ptr)); ++ else ++ strcpy(remcom_out_buffer, "OK"); ++ break; ++ ++ /* kill or detach. KGDB should treat this like a ++ * continue. ++ */ ++ case 'D': ++ if ((error = remove_all_break()) < 0) { ++ error_packet(remcom_out_buffer, error); ++ } else { ++ strcpy(remcom_out_buffer, "OK"); ++ kgdb_connected = 0; ++ } ++ put_packet(remcom_out_buffer); ++ goto default_handle; ++ ++ case 'k': ++ /* Don't care about error from remove_all_break */ ++ remove_all_break(); ++ kgdb_connected = 0; ++ goto default_handle; ++ ++ /* Reboot */ ++ case 'R': ++ /* For now, only honor R0 */ ++ if (strcmp(remcom_in_buffer, "R0") == 0) { ++ printk(KERN_CRIT "Executing reboot\n"); ++ strcpy(remcom_out_buffer, "OK"); ++ put_packet(remcom_out_buffer); ++ emergency_sync(); ++ /* Execution should not return from ++ * machine_restart() ++ */ ++ machine_restart(NULL); ++ kgdb_connected = 0; ++ goto default_handle; ++ } ++ ++ /* query */ ++ case 'q': ++ switch (remcom_in_buffer[1]) { ++ case 's': ++ case 'f': ++ if (memcmp(remcom_in_buffer + 2, "ThreadInfo", ++ 10)) { ++ error_packet(remcom_out_buffer, ++ -EINVAL); ++ break; ++ } ++ ++ /* ++ * If we have not yet completed in ++ * pidhash_init() there isn't much we ++ * can give back. 
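The 'm' packet parsed above has the form "m<addr>,<length>" with both fields in hex. A user-space equivalent of that parse, using strtoul in place of kgdb_hex2long (function name and error convention are assumptions for illustration):

    #include <stdio.h>
    #include <stdlib.h>

    /* Parse "m<addr>,<length>"; returns 0 on success, -1 on malformed input. */
    static int parse_m_packet(const char *pkt, unsigned long *addr,
                              unsigned long *len)
    {
            char *end;

            if (pkt[0] != 'm')
                    return -1;
            *addr = strtoul(pkt + 1, &end, 16);
            if (end == pkt + 1 || *end != ',')
                    return -1;
            *len = strtoul(end + 1, &end, 16);
            return *len ? 0 : -1;
    }

    int main(void)
    {
            unsigned long addr, len;

            if (parse_m_packet("mc0100000,80", &addr, &len) == 0)
                    printf("read %lu bytes at 0x%lx\n", len, addr);
            return 0;
    }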
++ */
++ if (last_pid == 0) {
++ if (remcom_in_buffer[1] == 'f')
++ strcpy(remcom_out_buffer,
++ "m0000000000000001");
++ break;
++ }
++
++ if (remcom_in_buffer[1] == 'f') {
++ threadid = 1;
++ }
++ remcom_out_buffer[0] = 'm';
++ ptr = remcom_out_buffer + 1;
++ for (i = 0; i < 17 && threadid < pid_max +
++ numshadowth; threadid++) {
++ thread = getthread(linux_regs,
++ threadid);
++ if (thread) {
++ int_to_threadref(&thref,
++ threadid);
++ pack_threadid(ptr, &thref);
++ ptr += 16;
++ *(ptr++) = ',';
++ i++;
++ }
++ }
++ *(--ptr) = '\0';
++ break;
++
++ case 'C':
++ /* Current thread id */
++ strcpy(remcom_out_buffer, "QC");
++
++ threadid = shadow_pid(current->pid);
++
++ int_to_threadref(&thref, threadid);
++ pack_threadid(remcom_out_buffer + 2, &thref);
++ break;
++ case 'T':
++ if (memcmp(remcom_in_buffer + 1,
++ "ThreadExtraInfo,", 16)) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ threadid = 0;
++ ptr = remcom_in_buffer + 17;
++ kgdb_hex2long(&ptr, &threadid);
++ if (!getthread(linux_regs, threadid)) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ if (threadid < pid_max) {
++ kgdb_mem2hex(getthread(linux_regs,
++ threadid)->comm,
++ remcom_out_buffer, 16);
++ } else if (threadid >= pid_max +
++ num_online_cpus()) {
++ kgdb_shadowinfo(linux_regs,
++ remcom_out_buffer,
++ threadid - pid_max -
++ num_online_cpus());
++ } else {
++ static char tmpstr[23 +
++ BUF_THREAD_ID_SIZE];
++ sprintf(tmpstr, "Shadow task %d"
++ " for pid 0",
++ (int)(threadid - pid_max));
++ kgdb_mem2hex(tmpstr, remcom_out_buffer,
++ strlen(tmpstr));
++ }
++ break;
++ }
++ break;
++
++ /* task related */
++ case 'H':
++ switch (remcom_in_buffer[1]) {
++ case 'g':
++ ptr = &remcom_in_buffer[2];
++ kgdb_hex2long(&ptr, &threadid);
++ thread = getthread(linux_regs, threadid);
++ if (!thread && threadid > 0) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ kgdb_usethread = thread;
++ kgdb_usethreadid = threadid;
++ strcpy(remcom_out_buffer, "OK");
++ break;
++
++ case 'c':
++ ptr = &remcom_in_buffer[2];
++ kgdb_hex2long(&ptr, &threadid);
++ if (!threadid) {
++ kgdb_contthread = NULL;
++ } else {
++ thread = getthread(linux_regs,
++ threadid);
++ if (!thread && threadid > 0) {
++ error_packet(remcom_out_buffer,
++ -EINVAL);
++ break;
++ }
++ kgdb_contthread = thread;
++ }
++ strcpy(remcom_out_buffer, "OK");
++ break;
++ }
++ break;
++
++ /* Query thread status */
++ case 'T':
++ ptr = &remcom_in_buffer[1];
++ kgdb_hex2long(&ptr, &threadid);
++ thread = getthread(linux_regs, threadid);
++ if (thread)
++ strcpy(remcom_out_buffer, "OK");
++ else
++ error_packet(remcom_out_buffer, -EINVAL);
++ break;
++ /* Since GDB 5.3 the remote protocol defines '0' as a software
++ * breakpoint and '1' as a hardware breakpoint, so honor
++ * that.
++ */
++ case 'z':
++ case 'Z':
++ bpt_type = &remcom_in_buffer[1];
++ ptr = &remcom_in_buffer[2];
++
++ if (kgdb_ops->set_hw_breakpoint && *bpt_type >= '1') {
++ /* Unsupported */
++ if (*bpt_type > '4')
++ break;
++ } else if (*bpt_type != '0' && *bpt_type != '1')
++ /* Unsupported. */
++ break;
++ /* Test if this is a hardware breakpoint, and
++ * if we support it. */
++ if (*bpt_type == '1' &&
++ !(kgdb_ops->flags & KGDB_HW_BREAKPOINT))
++ /* Unsupported.
*/ ++ break; ++ ++ if (*(ptr++) != ',') { ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ } else if (kgdb_hex2long(&ptr, &addr)) { ++ if (*(ptr++) != ',' || ++ !kgdb_hex2long(&ptr, &length)) { ++ error_packet(remcom_out_buffer, ++ -EINVAL); ++ break; ++ } ++ } else { ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ } ++ ++ if (remcom_in_buffer[0] == 'Z' && *bpt_type == '0') ++ error = kgdb_set_sw_break(addr); ++ else if (remcom_in_buffer[0] == 'Z' && *bpt_type == '1') ++ error = kgdb_set_hw_break(addr); ++ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '0') ++ error = kgdb_remove_sw_break(addr); ++ else if (remcom_in_buffer[0] == 'z' && *bpt_type == '1') ++ error = kgdb_remove_hw_break(addr); ++ else if (remcom_in_buffer[0] == 'Z') ++ error = kgdb_ops->set_hw_breakpoint(addr, ++ (int)length, ++ *bpt_type); ++ else if (remcom_in_buffer[0] == 'z') ++ error = kgdb_ops->remove_hw_breakpoint(addr, ++ (int) ++ length, ++ *bpt_type); ++ ++ if (error == 0) ++ strcpy(remcom_out_buffer, "OK"); ++ else ++ error_packet(remcom_out_buffer, error); ++ ++ break; ++ case 'c': ++ case 's': ++ if (kgdb_contthread && kgdb_contthread != current) { ++ /* Can't switch threads in kgdb */ ++ error_packet(remcom_out_buffer, -EINVAL); ++ break; ++ } ++ kgdb_activate_sw_breakpoints(); ++ /* Followthrough to default processing */ ++ default: ++ default_handle: ++ error = kgdb_arch_handle_exception(ex_vector, signo, ++ err_code, ++ remcom_in_buffer, ++ remcom_out_buffer, ++ linux_regs); ++ ++ if (error >= 0 || remcom_in_buffer[0] == 'D' || ++ remcom_in_buffer[0] == 'k') ++ goto kgdb_exit; ++ ++ } /* switch */ ++ ++ /* reply to the request */ ++ put_packet(remcom_out_buffer); ++ } ++ ++ kgdb_exit: ++ /* Call the I/O driver's post_exception routine if the I/O ++ * driver defined one. ++ */ ++ if (kgdb_io_ops.post_exception) ++ kgdb_io_ops.post_exception(); ++ ++ kgdb_info[processor].debuggerinfo = NULL; ++ kgdb_info[processor].task = NULL; ++ atomic_set(&procindebug[processor], 0); ++ ++ if (!debugger_step || !kgdb_contthread) { ++ for (i = 0; i < NR_CPUS; i++) ++ spin_unlock(&slavecpulocks[i]); ++ /* Wait till all the processors have quit ++ * from the debugger. */ ++ for (i = 0; i < NR_CPUS; i++) { ++ while (atomic_read(&procindebug[i])) { ++ int j = 10; /* an arbitrary number */ ++ ++ while (--j) ++ cpu_relax(); ++ } ++ } ++ } ++ ++#ifdef CONFIG_SMP ++ /* This delay has a real purpose. The problem is that if you ++ * are single-stepping, you are sending an NMI to all the ++ * other processors to stop them. Interrupts come in, but ++ * don't get handled. Then you let them go just long enough ++ * to get into their interrupt routines and use up some stack. ++ * You stop them again, and then do the same thing. After a ++ * while you blow the stack on the other processors. This ++ * delay gives some time for interrupts to be cleared out on ++ * the other processors. ++ */ ++ if (debugger_step) ++ mdelay(2); ++#endif ++kgdb_restore: ++ /* Free debugger_active */ ++ atomic_set(&debugger_active, 0); ++ local_irq_restore(flags); ++ ++ return error; ++} ++ ++/* ++ * GDB places a breakpoint at this function to know dynamically ++ * loaded objects. It's not defined static so that only one instance with this ++ * name exists in the kernel. 
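The exit path above releases the slave-CPU locks and then spins until every procindebug[i] counter drops back to zero. The same drain pattern, reduced to standard C11 atomics (a sketch, not kernel code; NCPUS and the array name are illustrative):

    #include <stdatomic.h>

    #define NCPUS 4

    static atomic_int in_debugger[NCPUS];   /* like procindebug[] */

    /* Master CPU: spin until every slave has left its holding loop. */
    static void wait_for_slaves(void)
    {
            int i;

            for (i = 0; i < NCPUS; i++)
                    while (atomic_load(&in_debugger[i]))
                            ;       /* the kernel loop calls cpu_relax() here */
    }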
++ */ ++ ++int module_event(struct notifier_block *self, unsigned long val, void *data) ++{ ++ return 0; ++} ++ ++static struct notifier_block kgdb_module_load_nb = { ++ .notifier_call = module_event, ++}; ++ ++void kgdb_nmihook(int cpu, void *regs) ++{ ++#ifdef CONFIG_SMP ++ if (!atomic_read(&procindebug[cpu]) && atomic_read(&debugger_active) != (cpu + 1)) ++ kgdb_wait((struct pt_regs *)regs); ++#endif ++} ++ ++/* ++ * This is called when a panic happens. All we need to do is ++ * breakpoint(). ++ */ ++static int kgdb_panic_notify(struct notifier_block *self, unsigned long cmd, ++ void *ptr) ++{ ++ breakpoint(); ++ ++ return 0; ++} ++ ++static struct notifier_block kgdb_panic_notifier = { ++ .notifier_call = kgdb_panic_notify, ++}; ++ ++/* ++ * Initialization that needs to be done in either of our entry points. ++ */ ++static void __init kgdb_internal_init(void) ++{ ++ int i; ++ ++ /* Initialize our spinlocks. */ ++ for (i = 0; i < NR_CPUS; i++) ++ spin_lock_init(&slavecpulocks[i]); ++ ++ for (i = 0; i < MAX_BREAKPOINTS; i++) ++ kgdb_break[i].state = bp_none; ++ ++ /* Initialize the I/O handles */ ++ memset(&kgdb_io_ops_prev, 0, sizeof(kgdb_io_ops_prev)); ++ ++ /* We can't do much if this fails */ ++ register_module_notifier(&kgdb_module_load_nb); ++ ++ kgdb_initialized = 1; ++} ++ ++static void kgdb_register_for_panic(void) ++{ ++ /* Register for panics(). */ ++ /* The registration is done in the kgdb_register_for_panic ++ * routine because KGDB should not try to handle a panic when ++ * there are no kgdb_io_ops setup. It is assumed that the ++ * kgdb_io_ops are setup at the time this method is called. ++ */ ++ if (!kgdb_from_module_registered) { ++ atomic_notifier_chain_register(&panic_notifier_list, ++ &kgdb_panic_notifier); ++ kgdb_from_module_registered = 1; ++ } ++} ++ ++static void kgdb_unregister_for_panic(void) ++{ ++ /* When this routine is called KGDB should unregister from the ++ * panic handler and clean up, making sure it is not handling any ++ * break exceptions at the time. ++ */ ++ if (kgdb_from_module_registered) { ++ kgdb_from_module_registered = 0; ++ atomic_notifier_chain_unregister(&panic_notifier_list, ++ &kgdb_panic_notifier); ++ } ++} ++ ++int kgdb_register_io_module(struct kgdb_io *local_kgdb_io_ops) ++{ ++ ++ if (kgdb_connected) { ++ printk(KERN_ERR "kgdb: Cannot load I/O module while KGDB " ++ "connected.\n"); ++ return -EINVAL; ++ } ++ ++ /* Save the old values so they can be restored */ ++ if (kgdb_io_handler_cnt >= MAX_KGDB_IO_HANDLERS) { ++ printk(KERN_ERR "kgdb: No more I/O handles available.\n"); ++ return -EINVAL; ++ } ++ ++ /* Check to see if there is an existing driver and if so save its ++ * values. Also check to make sure the same driver was not trying ++ * to re-register. ++ */ ++ if (kgdb_io_ops.read_char != NULL && ++ kgdb_io_ops.read_char != local_kgdb_io_ops->read_char) { ++ memcpy(&kgdb_io_ops_prev[kgdb_io_handler_cnt], ++ &kgdb_io_ops, sizeof(struct kgdb_io)); ++ kgdb_io_handler_cnt++; ++ } ++ ++ /* Initialize the io values for this module */ ++ memcpy(&kgdb_io_ops, local_kgdb_io_ops, sizeof(struct kgdb_io)); ++ ++ /* Make the call to register kgdb if is not initialized */ ++ kgdb_register_for_panic(); ++ ++ return 0; ++} ++ ++void kgdb_unregister_io_module(struct kgdb_io *local_kgdb_io_ops) ++{ ++ int i; ++ ++ /* Unregister KGDB if there were no other prior io hooks, else ++ * restore the io hooks. 
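To make the registration contract above concrete, here is a minimal, hypothetical I/O back end. Only the hooks this file is seen to call are filled in; the write_char member and the <linux/kgdb.h> header are assumptions inferred from the read side and from the surrounding patch, not confirmed by it:

    #include <linux/kgdb.h>

    static int my_uart_read_char(void)
    {
            return -1;      /* would poll the RX register for one byte */
    }

    static void my_uart_write_char(u8 c)
    {
            /* would spin on TX-empty, then emit c */
    }

    static int my_uart_init(void)
    {
            return 0;       /* would probe and program the UART */
    }

    static struct kgdb_io my_uart_io_ops = {
            .read_char      = my_uart_read_char,
            .write_char     = my_uart_write_char,
            .init           = my_uart_init,
    };

    static int __init my_uart_kgdb_init(void)
    {
            return kgdb_register_io_module(&my_uart_io_ops);
    }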
++ */ ++ if (kgdb_io_handler_cnt > 0 && kgdb_io_ops_prev[0].read_char != NULL) { ++ /* First check if the hook that is in use is the one being ++ * removed */ ++ if (kgdb_io_ops.read_char == local_kgdb_io_ops->read_char) { ++ /* Set 'i' to the value of where the list should be ++ * shifed */ ++ i = kgdb_io_handler_cnt - 1; ++ memcpy(&kgdb_io_ops, &kgdb_io_ops_prev[i], ++ sizeof(struct kgdb_io)); ++ } else { ++ /* Simple case to remove an entry for an I/O handler ++ * that is not in use */ ++ for (i = 0; i < kgdb_io_handler_cnt; i++) { ++ if (kgdb_io_ops_prev[i].read_char == ++ local_kgdb_io_ops->read_char) ++ break; ++ } ++ } ++ ++ /* Shift all the entries in the handler array so it is ++ * ordered from oldest to newest. ++ */ ++ kgdb_io_handler_cnt--; ++ for (; i < kgdb_io_handler_cnt; i++) { ++ memcpy(&kgdb_io_ops_prev[i], &kgdb_io_ops_prev[i + 1], ++ sizeof(struct kgdb_io)); ++ } ++ /* Handle the case if we are on the last element and set it ++ * to NULL; */ ++ memset(&kgdb_io_ops_prev[kgdb_io_handler_cnt], 0, ++ sizeof(struct kgdb_io)); ++ ++ if (kgdb_connected) ++ printk(KERN_ERR "kgdb: WARNING: I/O method changed " ++ "while kgdb was connected state.\n"); ++ } else { ++ /* KGDB is no longer able to communicate out, so ++ * unregister our hooks and reset state. */ ++ kgdb_unregister_for_panic(); ++ if (kgdb_connected) { ++ printk(KERN_CRIT "kgdb: I/O module was unloaded while " ++ "a debugging session was running. " ++ "KGDB will be reset.\n"); ++ if (remove_all_break() < 0) ++ printk(KERN_CRIT "kgdb: Reset failed.\n"); ++ kgdb_connected = 0; ++ } ++ memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io)); ++ } ++} ++ ++/* ++ * There are times we need to call a tasklet to cause a breakpoint ++ * as calling breakpoint() at that point might be fatal. We have to ++ * check that the exception stack is setup, as tasklets may be scheduled ++ * prior to this. When that happens, it is up to the architecture to ++ * schedule this when it is safe to run. ++ */ ++static void kgdb_tasklet_bpt(unsigned long ing) ++{ ++ if(CHECK_EXCEPTION_STACK()) ++ breakpoint(); ++} ++ ++DECLARE_TASKLET(kgdb_tasklet_breakpoint, kgdb_tasklet_bpt, 0); ++ ++/* ++ * This function can be called very early, either via early_param() or ++ * an explicit breakpoint() early on. ++ */ ++static void __init kgdb_early_entry(void) ++{ ++ /* ++ * Don't try and do anything until the architecture is able to ++ * setup the exception stack. In this case, it is up to the ++ * architecture to hook in and look at us when they are ready. ++ */ ++ if(!CHECK_EXCEPTION_STACK()) { ++ kgdb_initialized = -1; ++ tasklet_schedule(&kgdb_tasklet_breakpoint); ++ return; ++ } ++ ++ /* Let the architecture do any setup that it needs to. */ ++ kgdb_arch_init(); ++ ++ /* Now try the I/O. */ ++ /* For early entry kgdb_io_ops.init must be defined */ ++ if (!kgdb_io_ops.init || kgdb_io_ops.init()) { ++ /* Try again later. */ ++ kgdb_initialized = -1; ++ return; ++ } ++ ++ /* Finish up. */ ++ kgdb_internal_init(); ++ ++ /* KGDB can assume that if kgdb_io_ops.init was defined that the ++ * panic registion should be performed at this time. This means ++ * kgdb_io_ops.init did not come from a kernel module and was ++ * initialized statically by a built in. ++ */ ++ if (kgdb_io_ops.init) ++ kgdb_register_for_panic(); ++} ++ ++/* ++ * This function will always be invoked to make sure that KGDB will grab ++ * what it needs to so that if something happens while the system is ++ * running, KGDB will get involved. 
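The unregister path above treats kgdb_io_ops_prev[] as a list ordered oldest-to-newest: removing an entry shifts everything after it down by one and clears the vacated tail slot. In miniature, with a generic array and a hypothetical element type:

    #include <string.h>

    #define MAX_HANDLERS 4

    struct handler {
            int (*read_char)(void);
    };

    static struct handler prev[MAX_HANDLERS];
    static int count;

    /* Remove prev[idx] and close the gap, keeping the remaining
     * entries ordered oldest-to-newest; clear the vacated tail slot. */
    static void remove_handler(int idx)
    {
            count--;
            memmove(&prev[idx], &prev[idx + 1],
                    (count - idx) * sizeof(prev[0]));
            memset(&prev[count], 0, sizeof(prev[0]));
    }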
If kgdb_early_entry() has already ++ * been invoked, there is little we need to do. ++ */ ++static int __init kgdb_late_entry(void) ++{ ++ int need_break = 0; ++ ++ /* If kgdb_initialized is -1 then we were passed kgdbwait. */ ++ if (kgdb_initialized == -1) ++ need_break = 1; ++ ++ /* ++ * If we haven't tried to initialize KGDB yet, we need to call ++ * kgdb_arch_init before moving onto the I/O. ++ */ ++ if (!kgdb_initialized) ++ kgdb_arch_init(); ++ ++ if (kgdb_initialized != 1) { ++ if (kgdb_io_ops.init && kgdb_io_ops.init()) { ++ /* When KGDB allows I/O via modules and the core ++ * I/O init fails KGDB must default to defering the ++ * I/O setup, and appropriately print an error about ++ * it. ++ */ ++ printk(KERN_ERR "kgdb: Could not setup core I/O " ++ "for KGDB.\n"); ++ printk(KERN_INFO "kgdb: Defering I/O setup to kernel " ++ "module.\n"); ++ memset(&kgdb_io_ops, 0, sizeof(struct kgdb_io)); ++ } ++ ++ kgdb_internal_init(); ++ ++ /* KGDB can assume that if kgdb_io_ops.init was defined that ++ * panic registion should be performed at this time. This means ++ * kgdb_io_ops.init did not come from a kernel module and was ++ * initialized statically by a built in. ++ */ ++ if (kgdb_io_ops.init) ++ kgdb_register_for_panic(); ++ } ++ ++ /* Registering to reboot notifier list*/ ++ register_reboot_notifier(&kgdb_reboot_notifier); ++ ++ /* Now do any late init of the I/O. */ ++ if (kgdb_io_ops.late_init) ++ kgdb_io_ops.late_init(); ++ ++ if (need_break) { ++ printk(KERN_CRIT "kgdb: Waiting for connection from remote" ++ " gdb...\n"); ++ breakpoint(); ++ } ++ ++ return 0; ++} ++ ++late_initcall(kgdb_late_entry); ++ ++/* ++ * This function will generate a breakpoint exception. It is used at the ++ * beginning of a program to sync up with a debugger and can be used ++ * otherwise as a quick means to stop program execution and "break" into ++ * the debugger. ++ */ ++void breakpoint(void) ++{ ++ if (kgdb_initialized != 1) { ++ kgdb_early_entry(); ++ if (kgdb_initialized == 1) ++ printk(KERN_CRIT "Waiting for connection from remote " ++ "gdb...\n"); ++ else { ++ printk(KERN_CRIT "KGDB cannot initialize I/O yet.\n"); ++ return; ++ } ++ } ++ ++ atomic_set(&kgdb_setting_breakpoint, 1); ++ wmb(); ++ BREAKPOINT(); ++ wmb(); ++ atomic_set(&kgdb_setting_breakpoint, 0); ++} ++ ++EXPORT_SYMBOL(breakpoint); ++ ++#ifdef CONFIG_MAGIC_SYSRQ ++static void sysrq_handle_gdb(int key, struct pt_regs *pt_regs, ++ struct tty_struct *tty) ++{ ++ printk("Entering GDB stub\n"); ++ breakpoint(); ++} ++static struct sysrq_key_op sysrq_gdb_op = { ++ .handler = sysrq_handle_gdb, ++ .help_msg = "Gdb", ++ .action_msg = "GDB", ++}; ++ ++static int gdb_register_sysrq(void) ++{ ++ printk("Registering GDB sysrq handler\n"); ++ register_sysrq_key('g', &sysrq_gdb_op); ++ return 0; ++} ++ ++module_init(gdb_register_sysrq); ++#endif ++ ++static int kgdb_notify_reboot(struct notifier_block *this, ++ unsigned long code, void *x) ++{ ++ ++ unsigned long flags; ++ ++ /* If we're debugging, or KGDB has not connected, don't try ++ * and print. */ ++ if (!kgdb_connected || atomic_read(&debugger_active) != 0) ++ return 0; ++ if ((code == SYS_RESTART) || (code == SYS_HALT) || (code == SYS_POWER_OFF)){ ++ local_irq_save(flags); ++ put_packet("X00"); ++ local_irq_restore(flags); ++ } ++ return NOTIFY_DONE; ++} ++ ++#ifdef CONFIG_KGDB_CONSOLE ++void kgdb_console_write(struct console *co, const char *s, unsigned count) ++{ ++ unsigned long flags; ++ ++ /* If we're debugging, or KGDB has not connected, don't try ++ * and print. 
*/ ++ if (!kgdb_connected || atomic_read(&debugger_active) != 0) ++ return; ++ ++ local_irq_save(flags); ++ kgdb_msg_write(s, count); ++ local_irq_restore(flags); ++} ++ ++struct console kgdbcons = { ++ .name = "kgdb", ++ .write = kgdb_console_write, ++ .flags = CON_PRINTBUFFER | CON_ENABLED, ++}; ++static int __init kgdb_console_init(void) ++{ ++ register_console(&kgdbcons); ++ return 0; ++} ++ ++console_initcall(kgdb_console_init); ++#endif ++ ++static int __init opt_kgdb_enter(char *str) ++{ ++ /* We've already done this by an explicit breakpoint() call. */ ++ if (kgdb_initialized) ++ return 0; ++ ++ /* Call breakpoint() which will take care of init. */ ++ breakpoint(); ++ ++ return 0; ++} ++ ++early_param("kgdbwait", opt_kgdb_enter); +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/kgdbarchlib.c linux-2.6.18.kgdb/kernel/kgdbarchlib.c +--- linux-2.6.18/kernel/kgdbarchlib.c 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/kernel/kgdbarchlib.c 2008-06-10 16:18:58.000000000 +0400 +@@ -0,0 +1,198 @@ ++#include ++ ++struct kgdb_arch *kgdb_ops = &arch_kgdb_ops; ++ ++/** ++ * kgdb_arch_init - Perform any architecture specific initalization. ++ * ++ * RETURN: ++ * The return value is ignored. ++ * ++ * This function will handle the initalization of any architecture ++ * specific hooks. ++ */ ++int __attribute__ ((weak)) ++ kgdb_arch_init(void) ++{ ++ return 0; ++} ++ ++/** ++ * kgdb_disable_hw_debug - Disable hardware debugging while we in kgdb. ++ * @regs: Current &struct pt_regs. ++ * ++ * This function will be called if the particular architecture must ++ * disable hardware debugging while it is processing gdb packets or ++ * handling exception. ++ */ ++void __attribute__ ((weak)) ++ kgdb_disable_hw_debug(struct pt_regs *regs) ++{ ++} ++ ++/* ++ * Skip an int3 exception when it occurs after a breakpoint has been ++ * removed. Backtrack eip by 1 since the int3 would have caused it to ++ * increment by 1. ++ */ ++int __attribute__ ((weak)) ++ kgdb_skipexception(int exception, struct pt_regs *regs) ++{ ++ return 0; ++} ++ ++/** ++ * kgdb_set_hw_break - Set a hardware breakpoint at @addr. ++ * @addr: The address to set a hardware breakpoint at. ++ */ ++int __attribute__ ((weak)) ++ kgdb_set_hw_break(unsigned long addr) ++{ ++ return 0; ++} ++ ++/** ++ * kgdb_remove_hw_break - Remove a hardware breakpoint at @addr. ++ * @addr: The address to remove a hardware breakpoint from. ++ */ ++int __attribute__ ((weak)) ++ kgdb_remove_hw_break(unsigned long addr) ++{ ++ return 0; ++} ++ ++/** ++ * kgdb_remove_all_hw_break - Clear all hardware breakpoints. ++ */ ++void __attribute__ ((weak)) ++ kgdb_remove_all_hw_break(void) ++{ ++} ++ ++/** ++ * kgdb_correct_hw_break - Correct hardware breakpoints. ++ * ++ * A hook to allow for changes to the hardware breakpoint, called ++ * after a single step (s) or continue (c) packet, and once we're about ++ * to let the kernel continue running. ++ * ++ * This is used to set the hardware breakpoint registers for all the ++ * slave cpus on an SMP configuration. This must be called after any ++ * changes are made to the hardware breakpoints (such as by a single ++ * step (s) or continue (c) packet. This is only required on ++ * architectures that support SMP and every processor has its own set ++ * of breakpoint registers. ++ */ ++void __attribute__ ((weak)) ++ kgdb_correct_hw_break(void) ++{ ++} ++ ++/** ++ * kgdb_post_master_code - Save error vector/code numbers. ++ * @regs: Original pt_regs. ++ * @e_vector: Original error vector. 
++ * @err_code: Original error code.
++ *
++ * This is needed on architectures which support SMP and KGDB.
++ * This function is called after all the slave cpus have been put
++ * to a known spin state and the master CPU has control over KGDB.
++ */
++
++void __attribute__ ((weak))
++ kgdb_post_master_code(struct pt_regs *regs, int e_vector, int err_code)
++{
++}
++
++/**
++ * kgdb_roundup_cpus - Get other CPUs into a holding pattern
++ * @flags: Current IRQ state
++ *
++ * On SMP systems, we need to get the attention of the other CPUs
++ * and get them into a known state. This should do what is needed
++ * to get the other CPUs to call kgdb_wait(). Note that on some arches,
++ * the NMI approach is not used for rounding up all the CPUs. For example,
++ * in case of MIPS, smp_call_function() is used to round up CPUs. In
++ * this case, we have to make sure that interrupts are enabled before
++ * calling smp_call_function(). The argument to this function is
++ * the flags that will be used when restoring the interrupts. There is
++ * a local_irq_save() call before kgdb_roundup_cpus().
++ */
++void __attribute__ ((weak))
++ kgdb_roundup_cpus(unsigned long flags)
++{
++}
++
++/**
++ * kgdb_shadowinfo - Get shadowed information on @threadid.
++ * @regs: The &struct pt_regs of the current process.
++ * @buffer: A buffer of %BUFMAX size.
++ * @threadid: The thread id of the shadowed process to get information on.
++ */
++void __attribute__ ((weak))
++ kgdb_shadowinfo(struct pt_regs *regs, char *buffer, unsigned threadid)
++{
++}
++
++/**
++ * kgdb_get_shadow_thread - Get the shadowed &task_struct of @threadid.
++ * @regs: The &struct pt_regs of the current thread.
++ * @threadid: The thread id of the shadowed process to get information on.
++ *
++ * RETURN:
++ * This returns a pointer to the &struct task_struct of the shadowed
++ * thread, @threadid.
++ */
++struct task_struct __attribute__ ((weak))
++ * kgdb_get_shadow_thread(struct pt_regs *regs, int threadid)
++{
++ return NULL;
++}
++
++/**
++ * kgdb_shadow_regs - Return the shadowed registers of @threadid.
++ * @regs: The &struct pt_regs of the current thread.
++ * @threadid: The thread id we want the &struct pt_regs for.
++ *
++ * RETURN:
++ * A pointer to the &struct pt_regs of the shadowed thread @threadid.
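All of the kgdbarchlib.c hooks in this file are declared with __attribute__ ((weak)), so an architecture overrides one simply by providing a strong definition of the same name; the linker prefers the strong symbol. A two-file sketch of the mechanism, with a hypothetical hook name:

    /* lib.c: the overridable default */
    int __attribute__ ((weak)) arch_hook(void)
    {
            return 0;       /* generic fallback */
    }

    /* arch.c: linking this translation unit in silently
     * replaces the weak default above. */
    int arch_hook(void)
    {
            return 42;      /* architecture-specific behaviour */
    }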
++ */ ++struct pt_regs __attribute__ ((weak)) ++ * kgdb_shadow_regs(struct pt_regs *regs, int threadid) ++{ ++ return NULL; ++} ++ ++int __attribute__ ((weak)) ++ kgdb_validate_break_address(unsigned long addr) ++{ ++ int error = 0; ++ char tmp_variable[BREAK_INSTR_SIZE]; ++ error = kgdb_get_mem((char *)addr, tmp_variable, BREAK_INSTR_SIZE); ++ return error; ++} ++ ++int __attribute__ ((weak)) ++ kgdb_arch_set_breakpoint(unsigned long addr, char *saved_instr) ++{ ++ int error = 0; ++ if ((error = kgdb_get_mem((char *)addr, ++ saved_instr, BREAK_INSTR_SIZE)) < 0) ++ return error; ++ ++ if ((error = kgdb_set_mem((char *)addr, kgdb_ops->gdb_bpt_instr, ++ BREAK_INSTR_SIZE)) < 0) ++ return error; ++ return 0; ++} ++ ++int __attribute__ ((weak)) ++ kgdb_arch_remove_breakpoint(unsigned long addr, char *bundle) ++{ ++ ++ int error = 0; ++ if ((error =kgdb_set_mem((char *)addr, (char *)bundle, ++ BREAK_INSTR_SIZE)) < 0) ++ return error; ++ return 0; ++} +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/module.c linux-2.6.18.kgdb/kernel/module.c +--- linux-2.6.18/kernel/module.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/kernel/module.c 2008-06-10 16:20:07.000000000 +0400 +@@ -64,6 +64,7 @@ static DEFINE_SPINLOCK(modlist_lock); + /* List of modules, protected by module_mutex AND modlist_lock */ + static DEFINE_MUTEX(module_mutex); + static LIST_HEAD(modules); ++static DECLARE_MUTEX(notify_mutex); + + static BLOCKING_NOTIFIER_HEAD(module_notify_list); + +@@ -700,6 +701,12 @@ sys_delete_module(const char __user *nam + if (ret != 0) + goto out; + ++ down(¬ify_mutex); ++ blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, ++ mod); ++ up(¬ify_mutex); ++ ++ + /* Never wait if forced. */ + if (!forced && module_refcount(mod) != 0) + wait_for_zero_refcount(mod); +@@ -712,6 +719,11 @@ sys_delete_module(const char __user *nam + } + free_module(mod); + ++ down(¬ify_mutex); ++ blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GONE, ++ NULL); ++ up(¬ify_mutex); ++ + out: + mutex_unlock(&module_mutex); + return ret; +@@ -1112,6 +1124,11 @@ static void free_module(struct module *m + /* Arch-specific cleanup. 
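The sys_delete_module() and sys_init_module() changes above fire the module_notify_list chain at MODULE_STATE_GOING and MODULE_STATE_GONE so the KGDB stub can refresh its section tables. A hypothetical consumer of that chain, mirroring the kgdb_module_load_nb hook earlier in this patch (note that MODULE_STATE_GONE is introduced by this patch and is delivered with a NULL data pointer, as shown above):

    #include <linux/module.h>
    #include <linux/notifier.h>

    static int my_module_event(struct notifier_block *self,
                               unsigned long state, void *data)
    {
            struct module *mod = data;

            if (state == MODULE_STATE_GOING)
                    printk(KERN_INFO "module %s is on its way out\n",
                           mod->name);
            return 0;
    }

    static struct notifier_block my_module_nb = {
            .notifier_call = my_module_event,
    };

    static int __init my_watch_init(void)
    {
            return register_module_notifier(&my_module_nb);
    }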
*/ + module_arch_cleanup(mod); + ++#ifdef CONFIG_KGDB ++ /* kgdb info */ ++ vfree(mod->mod_sections); ++#endif ++ + /* Module unload stuff */ + module_unload_free(mod); + +@@ -1371,6 +1388,31 @@ static void setup_modinfo(struct module + } + } + ++#ifdef CONFIG_KGDB ++int add_modsects (struct module *mod, Elf_Ehdr *hdr, Elf_Shdr *sechdrs, const ++ char *secstrings) ++{ ++ int i; ++ ++ mod->num_sections = hdr->e_shnum - 1; ++ mod->mod_sections = vmalloc((hdr->e_shnum - 1)* ++ sizeof (struct mod_section)); ++ ++ if (mod->mod_sections == NULL) { ++ return -ENOMEM; ++ } ++ ++ for (i = 1; i < hdr->e_shnum; i++) { ++ mod->mod_sections[i - 1].address = (void *)sechdrs[i].sh_addr; ++ strncpy(mod->mod_sections[i - 1].name, secstrings + ++ sechdrs[i].sh_name, MAX_SECTNAME); ++ mod->mod_sections[i - 1].name[MAX_SECTNAME] = '\0'; ++ } ++ ++ return 0; ++} ++#endif ++ + #ifdef CONFIG_KALLSYMS + int is_exported(const char *name, const struct module *mod) + { +@@ -1782,6 +1824,12 @@ static struct module *load_module(void _ + + add_kallsyms(mod, sechdrs, symindex, strindex, secstrings); + ++#ifdef CONFIG_KGDB ++ if ((err = add_modsects(mod, hdr, sechdrs, secstrings)) < 0) { ++ goto nomodsectinfo; ++ } ++#endif ++ + err = module_finalize(hdr, sechdrs, mod); + if (err < 0) + goto cleanup; +@@ -1842,6 +1890,11 @@ static struct module *load_module(void _ + arch_cleanup: + module_arch_cleanup(mod); + cleanup: ++ ++#ifdef CONFIG_KGDB ++nomodsectinfo: ++ vfree(mod->mod_sections); ++#endif + module_unload_free(mod); + module_free(mod, mod->module_init); + free_core: +@@ -1913,6 +1966,10 @@ sys_init_module(void __user *umod, + /* Init routine failed: abort. Try to protect us from + buggy refcounters. */ + mod->state = MODULE_STATE_GOING; ++ down(¬ify_mutex); ++ blocking_notifier_call_chain(&module_notify_list, MODULE_STATE_GOING, ++ mod); ++ up(¬ify_mutex); + synchronize_sched(); + if (mod->unsafe) + printk(KERN_ERR "%s: module is now stuck!\n", +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/sched.c linux-2.6.18.kgdb/kernel/sched.c +--- linux-2.6.18/kernel/sched.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/kernel/sched.c 2008-06-10 16:18:58.000000000 +0400 +@@ -52,6 +52,7 @@ + #include + #include + #include ++#include + #include + + #include +@@ -6790,6 +6791,9 @@ void __might_sleep(char *file, int line) + #ifdef in_atomic + static unsigned long prev_jiffy; /* ratelimiting */ + ++ if (atomic_read(&debugger_active)) ++ return; ++ + if ((in_atomic() || irqs_disabled()) && + system_state == SYSTEM_RUNNING && !oops_in_progress) { + if (time_before(jiffies, prev_jiffy + HZ) && prev_jiffy) +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/softlockup.c linux-2.6.18.kgdb/kernel/softlockup.c +--- linux-2.6.18/kernel/softlockup.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/kernel/softlockup.c 2008-06-10 16:20:11.000000000 +0400 +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + + static DEFINE_SPINLOCK(print_lock); + +@@ -37,6 +38,9 @@ static struct notifier_block panic_block + void touch_softlockup_watchdog(void) + { + __raw_get_cpu_var(touch_timestamp) = jiffies; ++#ifdef CONFIG_KGDB ++ atomic_set(&kgdb_sync_softlockup[raw_smp_processor_id()], 0); ++#endif + } + EXPORT_SYMBOL(touch_softlockup_watchdog); + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/kernel/timer.c linux-2.6.18.kgdb/kernel/timer.c +--- linux-2.6.18/kernel/timer.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/kernel/timer.c 2008-06-10 16:20:11.000000000 +0400 +@@ 
-34,6 +34,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -1257,7 +1258,11 @@ static void run_timer_softirq(struct sof + */ + void run_local_timers(void) + { ++ int this_cpu = smp_processor_id(); + raise_softirq(TIMER_SOFTIRQ); ++#ifdef CONFIG_KGDB ++ if(!atomic_read(&kgdb_sync_softlockup[this_cpu])) ++#endif + softlockup_tick(); + } + +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/lib/Kconfig.debug linux-2.6.18.kgdb/lib/Kconfig.debug +--- linux-2.6.18/lib/Kconfig.debug 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/lib/Kconfig.debug 2008-06-10 16:19:51.000000000 +0400 +@@ -315,7 +315,7 @@ config DEBUG_VM + + config FRAME_POINTER + bool "Compile the kernel with frame pointers" +- depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390) ++ depends on DEBUG_KERNEL && (X86 || CRIS || M68K || M68KNOMMU || FRV || UML || S390 || SUPERH) + default y if DEBUG_INFO && UML + help + If you say Y here the resulting kernel image will be slightly larger +@@ -368,3 +368,158 @@ config RCU_TORTURE_TEST + at boot time (you probably don't). + Say M if you want the RCU torture tests to build as a module. + Say N if you are unsure. ++ ++config WANT_EXTRA_DEBUG_INFORMATION ++ bool ++ select DEBUG_INFO ++ select FRAME_POINTER if X86 || SUPERH ++ default n ++ ++config KGDB ++ bool "KGDB: kernel debugging with remote gdb" ++ select WANT_EXTRA_DEBUG_INFORMATION ++ depends on DEBUG_KERNEL && (ARM || X86 || MIPS || (SUPERH && !SUPERH64) || IA64 || X86_64 || PPC) ++ help ++ If you say Y here, it will be possible to remotely debug the ++ kernel using gdb. It is strongly suggested that you enable ++ DEBUG_INFO, and if available on your platform, FRAME_POINTER. ++ Documentation of kernel debugger available at ++ http://kgdb.sourceforge.net as well as in DocBook form ++ in Documentation/DocBook/. If unsure, say N. ++ ++config KGDB_CONSOLE ++ bool "KGDB: Console messages through gdb" ++ depends on KGDB ++ help ++ If you say Y here, console messages will appear through gdb. ++ Other consoles such as tty or ttyS will continue to work as usual. ++ Note, that if you use this in conjunction with KGDB_ETH, if the ++ ethernet driver runs into an error condition during use with KGDB ++ it is possible to hit an infinite recusrion, causing the kernel ++ to crash, and typically reboot. For this reason, it is preferable ++ to use NETCONSOLE in conjunction with KGDB_ETH instead of ++ KGDB_CONSOLE. ++ ++choice ++ prompt "Method for KGDB communication" ++ depends on KGDB ++ default KGDB_8250_NOMODULE ++ default KGDB_MPSC if SERIAL_MPSC ++ default KGDB_CPM_UART if (8xx || 8260) ++ default KGDB_SIBYTE if SIBYTE_SB1xxx_SOC ++ help ++ There are a number of different ways in which you can communicate ++ with KGDB. The most common is via serial, with the 8250 driver ++ (should your hardware have an 8250, or ns1655x style uart). ++ Another option is to use the NETPOLL framework and UDP, should ++ your ethernet card support this. Other options may exist. ++ You can elect to have one core I/O driver that is built into the ++ kernel for debugging as the kernel is booting, or using only ++ kernel modules. ++ ++config KGDB_ONLY_MODULES ++ bool "KGDB: Use only kernel modules for I/O" ++ depends on MODULES ++ help ++ Use only kernel modules to configure KGDB I/O after the ++ kernel is booted. ++ ++config KGDB_8250_NOMODULE ++ bool "KGDB: On generic serial port (8250)" ++ select KGDB_8250 ++ help ++ Uses generic serial port (8250) to communicate with the host ++ GDB. 
This is independent of the normal (SERIAL_8250) driver ++ for this chipset. ++ ++config KGDBOE_NOMODULE ++ bool "KGDB: On ethernet - in kernel" ++ select KGDBOE ++ select NETPOLL ++ select NETPOLL_TRAP ++ select NETPOLL_RX ++ help ++ Uses the NETPOLL API to communicate with the host GDB via UDP. ++ In order for this to work, the ethernet interface specified must ++ support the NETPOLL API, and this must be initialized at boot. ++ See the documentation for syntax. ++ ++config KGDB_MPSC ++ bool "KGDB on MV64x60 MPSC" ++ depends on SERIAL_MPSC ++ help ++ Uses a Marvell GT64260B or MV64x60 Multi-Purpose Serial ++ Controller (MPSC) channel. Note that the GT64260A is not ++ supported. ++ ++config KGDB_CPM_UART ++ bool "KGDB: On CPM UART" ++ depends on PPC && (CPM2 || 8xx) ++ help ++ Uses CPM UART to communicate with the host GDB. ++ ++config KGDB_SIBYTE ++ bool "KGDB: On the Broadcom SWARM serial port" ++ depends on MIPS && SIBYTE_SB1xxx_SOC ++endchoice ++ ++config KGDBOE ++ tristate "KGDB: On ethernet" if !KGDBOE_NOMODULE ++ depends on m && KGDB ++ select NETPOLL ++ select NETPOLL_TRAP ++ select NETPOLL_RX ++ help ++ Uses the NETPOLL API to communicate with the host GDB via UDP. ++ In order for this to work, the ethernet interface specified must ++ support the NETPOLL API, and this must be initialized at boot. ++ See the documentation for syntax. ++ ++config KGDB_8250 ++ tristate "KGDB: On generic serial port (8250)" if !KGDB_8250_NOMODULE ++ depends on m && KGDB_ONLY_MODULES ++ help ++ Uses generic serial port (8250) to communicate with the host ++ GDB. This is independent of the normal (SERIAL_8250) driver ++ for this chipset. ++ ++config KGDB_SIMPLE_SERIAL ++ bool "Simple selection of KGDB serial port" ++ depends on KGDB_8250_NOMODULE ++ default y ++ help ++ If you say Y here, you will only have to pick the baud rate ++ and port number that you wish to use for KGDB. Note that this ++ only works on architectures that register known serial ports ++ early on. If you say N, you will have to provide, either here ++ or on the command line, the type (I/O or MMIO), IRQ and ++ address to use. If in doubt, say Y. ++ ++config KGDB_BAUDRATE ++ int "Debug serial port baud rate" ++ depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL) ++ default "115200" ++ help ++ gdb and the kernel stub need to agree on the baud rate to be ++ used. Standard rates from 9600 to 115200 are allowed, and this ++ may be overridden via the commandline. ++ ++config KGDB_PORT_NUM ++ int "Serial port number for KGDB" ++ range 0 1 if KGDB_MPSC ++ range 0 3 ++ depends on (KGDB_8250 && KGDB_SIMPLE_SERIAL) || KGDB_MPSC ++ default "1" ++ help ++ Pick the port number (0 based) for KGDB to use. ++ ++config KGDB_8250_CONF_STRING ++ string "Configuration string for KGDB" ++ depends on KGDB_8250_NOMODULE && !KGDB_SIMPLE_SERIAL ++ default "io,2f8,115200,3" if X86 ++ help ++ The format of this string should be ,
,,. For example, to use the ++ serial port on an i386 box located at 0x2f8 and 115200 baud ++ on IRQ 3 at use: ++ io,2f8,115200,3 +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/net/core/netpoll.c linux-2.6.18.kgdb/net/core/netpoll.c +--- linux-2.6.18/net/core/netpoll.c 2006-09-20 07:42:06.000000000 +0400 ++++ linux-2.6.18.kgdb/net/core/netpoll.c 2008-06-10 16:19:07.000000000 +0400 +@@ -519,7 +519,8 @@ int __netpoll_rx(struct sk_buff *skb) + + np->rx_hook(np, ntohs(uh->source), + (char *)(uh+1), +- ulen - sizeof(struct udphdr)); ++ ulen - sizeof(struct udphdr), ++ skb); + + kfree_skb(skb); + return 1; +diff -rupN -X ../client-cleanup/dontdiff linux-2.6.18/scripts/dwarfh.awk linux-2.6.18.kgdb/scripts/dwarfh.awk +--- linux-2.6.18/scripts/dwarfh.awk 1970-01-01 03:00:00.000000000 +0300 ++++ linux-2.6.18.kgdb/scripts/dwarfh.awk 2008-06-10 16:19:58.000000000 +0400 +@@ -0,0 +1,19 @@ ++BEGIN { ++ print "#ifndef _ELF_DWARF_H" ++ print "/* Machine generated from dwarf2.h by scripts/dwarfh.awk */" ++} ++$2 == "=" { ++ gsub(/,/, "", $3) ++ print "#define " $1 "\t " $3 ++} ++$1 == "#define" { ++ print $0 ++ while( index($0,"\\") == length($0)){ ++ getline ++ print $0 ++ } ++} ++/.*/ {} ++END { ++ print "#endif" ++} diff --git a/lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch b/lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch new file mode 100644 index 0000000..f0f3894 --- /dev/null +++ b/lustre/kernel_patches/patches/lockdep_chains-2.6.18-vanilla.patch @@ -0,0 +1,269 @@ +commit 443cd507ce7f78c6f8742b72736585c031d5a921 +Author: Huang, Ying +Date: Fri Jun 20 16:39:21 2008 +0800 + + lockdep: add lock_class information to lock_chain and output it + + This patch records array of lock_class into lock_chain, and export + lock_chain information via /proc/lockdep_chains. + + It is based on x86/master branch of git-x86 tree, and has been tested + on x86_64 platform. + + Signed-off-by: Huang Ying + Cc: Peter Zijlstra + Signed-off-by: Ingo Molnar + +diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h +index 4c4d236..b26fbc7 100644 +--- a/include/linux/lockdep.h ++++ b/include/linux/lockdep.h +@@ -182,6 +182,9 @@ struct lock_list { + * We record lock dependency chains, so that we can cache them: + */ + struct lock_chain { ++ u8 irq_context; ++ u8 depth; ++ u16 base; + struct list_head entry; + u64 chain_key; + }; +diff --git a/kernel/lockdep.c b/kernel/lockdep.c +index 81a4e4a..a796f1f 100644 +--- a/kernel/lockdep.c ++++ b/kernel/lockdep.c +@@ -1458,7 +1458,14 @@ out_bug: + } + + unsigned long nr_lock_chains; +-static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; ++struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; ++atomic_t nr_chain_hlocks; ++static u16 chain_hlocks[MAX_LOCKDEP_CHAIN_HLOCKS]; ++ ++struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i) ++{ ++ return lock_classes + chain_hlocks[chain->base + i]; ++} + + /* + * Look up a dependency chain. If the key is not present yet then +@@ -1466,9 +1473,14 @@ static struct lock_chain lock_chains[MAX_LOCKDEP_CHAINS]; + * validated. If the key is already hashed, return 0. 
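The new lock_chain fields make each chain record a slice of one global array: chain_hlocks[base .. base+depth-1] holds the lock-class indices for that chain, and lock_chain_get_class() turns an index back into a lock_class pointer. The packing can be pictured with a plain C analogue (toy names; the kernel code bounds nr_ids with MAX_LOCKDEP_CHAIN_HLOCKS, which this sketch omits):

    #define MAX_IDS 64

    static unsigned short ids[MAX_IDS];     /* like chain_hlocks[] */
    static int nr_ids;                      /* like nr_chain_hlocks */

    struct chain {
            unsigned char depth;            /* entries in this chain */
            unsigned short base;            /* first slot in ids[] */
    };

    /* Record a chain of class indices; returns the populated descriptor. */
    static struct chain record_chain(const unsigned short *cls, int depth)
    {
            struct chain c = { .depth = depth, .base = nr_ids };
            int i;

            for (i = 0; i < depth; i++)
                    ids[nr_ids++] = cls[i];
            return c;
    }

    /* The analogue of lock_chain_get_class(chain, i). */
    static unsigned short chain_class(struct chain c, int i)
    {
            return ids[c.base + i];
    }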
+ */ +-static inline int lookup_chain_cache(u64 chain_key) ++static inline int lookup_chain_cache(struct task_struct *curr, ++ struct held_lock *hlock, ++ u64 chain_key) + { ++ struct lock_class *class = hlock->class; + struct list_head *hash_head = chainhashentry(chain_key); + struct lock_chain *chain; ++ struct held_lock *hlock_curr, *hlock_next; ++ int i, j, n; + + if (DEBUG_LOCKS_WARN_ON(!irqs_disabled())) + return 0; +@@ -1517,6 +1529,26 @@ cache_hit: + } + chain = lock_chains + nr_lock_chains++; + chain->chain_key = chain_key; ++ chain->irq_context = hlock->irq_context; ++ /* Find the first held_lock of current chain */ ++ hlock_next = hlock; ++ for (i = curr->lockdep_depth - 1; i >= 0; i--) { ++ hlock_curr = curr->held_locks + i; ++ if (hlock_curr->irq_context != hlock_next->irq_context) ++ break; ++ hlock_next = hlock; ++ } ++ i++; ++ chain->depth = curr->lockdep_depth + 1 - i; ++ n = atomic_add_return(chain->depth, &nr_chain_hlocks); ++ if (unlikely(n < MAX_LOCKDEP_CHAIN_HLOCKS)) { ++ chain->base = n - chain->depth; ++ for (j = 0; j < chain->depth - 1; j++, i++) { ++ int lock_id = curr->held_locks[i].class - lock_classes; ++ chain_hlocks[chain->base + j] = lock_id; ++ } ++ chain_hlocks[chain->base + j] = class - lock_classes; ++ } + list_add_tail_rcu(&chain->entry, hash_head); + debug_atomic_inc(&chain_lookup_misses); + inc_chains(); +@@ -1538,7 +1570,7 @@ static int validate_chain(struct task_struct *curr, struct lockdep_map *lock, + * (If lookup_chain_cache() returns with 1 it acquires + * hash_lock for us) + */ +- if (!trylock && (check == 2) && lookup_chain_cache(chain_key)) { ++ if (!trylock && (check == 2) && lookup_chain_cache(curr, hlock, chain_key)) { + /* + * Check whether last held lock: + * +diff --git a/kernel/lockdep_internals.h b/kernel/lockdep_internals.h +index 8ce09bc..db09b17 100644 +--- a/kernel/lockdep_internals.h ++++ b/kernel/lockdep_internals.h +@@ -23,6 +23,8 @@ + #define MAX_LOCKDEP_CHAINS_BITS 14 + #define MAX_LOCKDEP_CHAINS (1UL << MAX_LOCKDEP_CHAINS_BITS) + ++#define MAX_LOCKDEP_CHAIN_HLOCKS (MAX_LOCKDEP_CHAINS*5) ++ + /* + * Stack-trace: tightly packed array of stack backtrace + * addresses. Protected by the hash_lock. 
+@@ -30,15 +32,19 @@ + #define MAX_STACK_TRACE_ENTRIES 262144UL + + extern struct list_head all_lock_classes; ++extern struct lock_chain lock_chains[]; + + extern void + get_usage_chars(struct lock_class *class, char *c1, char *c2, char *c3, char *c4); + + extern const char * __get_key_name(struct lockdep_subclass_key *key, char *str); + ++struct lock_class *lock_chain_get_class(struct lock_chain *chain, int i); ++ + extern unsigned long nr_lock_classes; + extern unsigned long nr_list_entries; + extern unsigned long nr_lock_chains; ++extern atomic_t nr_chain_hlocks; + extern unsigned long nr_stack_trace_entries; + + extern unsigned int nr_hardirq_chains; +diff --git a/kernel/lockdep_proc.c b/kernel/lockdep_proc.c +index 688c5f1..14d052c 100644 +--- a/kernel/lockdep_proc.c ++++ b/kernel/lockdep_proc.c +@@ -178,6 +178,110 @@ static const struct file_operations proc_lockdep_operations = { + .release = seq_release, + }; + ++static void print_name(struct seq_file *m, struct lock_class *class) ++{ ++ char str[128]; ++ const char *name = class->name; ++ ++ if (!name) { ++ name = __get_key_name(class->key, str); ++ seq_printf(m, "%s", name); ++ } else{ ++ seq_printf(m, "%s", name); ++ if (class->name_version > 1) ++ seq_printf(m, "#%d", class->name_version); ++ if (class->subclass) ++ seq_printf(m, "/%d", class->subclass); ++ } ++} ++ ++static void *lc_next(struct seq_file *m, void *v, loff_t *pos) ++{ ++ struct lock_chain *chain; ++ ++ (*pos)++; ++ ++ if (v == SEQ_START_TOKEN) ++ chain = m->private; ++ else { ++ chain = v; ++ ++ if (*pos < nr_lock_chains) ++ chain = lock_chains + *pos; ++ else ++ chain = NULL; ++ } ++ ++ return chain; ++} ++ ++static void *lc_start(struct seq_file *m, loff_t *pos) ++{ ++ if (*pos == 0) ++ return SEQ_START_TOKEN; ++ ++ if (*pos < nr_lock_chains) ++ return lock_chains + *pos; ++ ++ return NULL; ++} ++ ++static void lc_stop(struct seq_file *m, void *v) ++{ ++} ++ ++static int lc_show(struct seq_file *m, void *v) ++{ ++ struct lock_chain *chain = v; ++ struct lock_class *class; ++ int i; ++ ++ if (v == SEQ_START_TOKEN) { ++ seq_printf(m, "all lock chains:\n"); ++ return 0; ++ } ++ ++ seq_printf(m, "irq_context: %d\n", chain->irq_context); ++ ++ for (i = 0; i < chain->depth; i++) { ++ class = lock_chain_get_class(chain, i); ++ seq_printf(m, "[%p] ", class->key); ++ print_name(m, class); ++ seq_puts(m, "\n"); ++ } ++ seq_puts(m, "\n"); ++ ++ return 0; ++} ++ ++static const struct seq_operations lockdep_chains_ops = { ++ .start = lc_start, ++ .next = lc_next, ++ .stop = lc_stop, ++ .show = lc_show, ++}; ++ ++static int lockdep_chains_open(struct inode *inode, struct file *file) ++{ ++ int res = seq_open(file, &lockdep_chains_ops); ++ if (!res) { ++ struct seq_file *m = file->private_data; ++ ++ if (nr_lock_chains) ++ m->private = lock_chains; ++ else ++ m->private = NULL; ++ } ++ return res; ++} ++ ++static const struct file_operations proc_lockdep_chains_operations = { ++ .open = lockdep_chains_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = seq_release, ++}; ++ + static void lockdep_stats_debug_show(struct seq_file *m) + { + #ifdef CONFIG_DEBUG_LOCKDEP +@@ -294,5 +381,7 @@ static int lockdep_stats_show(struct seq_file *m, void *v) + + seq_printf(m, " dependency chains: %11lu [max: %lu]\n", + nr_lock_chains, MAX_LOCKDEP_CHAINS); ++ seq_printf(m, " dependency chain hlocks: %11d [max: %lu]\n", ++ atomic_read(&nr_chain_hlocks), MAX_LOCKDEP_CHAIN_HLOCKS); + + #ifdef CONFIG_TRACE_IRQFLAGS +@@ -661,6 +750,9 @@ static const struct file_operations 
proc_lock_stat_operations = { + entry = create_proc_entry("lockdep", S_IRUSR, NULL); + if (entry) + entry->proc_fops = &proc_lockdep_operations; ++ entry = create_proc_entry("lockdep_chains", S_IRUSR, NULL); ++ if (entry) ++ entry->proc_fops = &proc_lockdep_chains_operations; + + entry = create_proc_entry("lockdep_stats", S_IRUSR, NULL); + if (entry) + diff --git a/lustre/kernel_patches/patches/md-rebuild-policy.patch b/lustre/kernel_patches/patches/md-rebuild-policy.patch index e6c9f9c..62bb484 100644 --- a/lustre/kernel_patches/patches/md-rebuild-policy.patch +++ b/lustre/kernel_patches/patches/md-rebuild-policy.patch @@ -33,15 +33,16 @@ diff -pur linux-2.6.18-53.orig/drivers/md/md.c linux-2.6.18-53/drivers/md/md.c { .ctl_name = 0 } }; -@@ -4980,14 +4998,15 @@ static int is_mddev_idle(mddev_t *mddev) +@@ -4980,15 +4998,16 @@ static int is_mddev_idle(mddev_t *mddev) + { mdk_rdev_t * rdev; - struct list_head *tmp; int idle; - unsigned long curr_events; + unsigned long rw, sync; idle = 1; - ITERATE_RDEV(mddev,rdev,tmp) { + rcu_read_lock(); + rdev_for_each_rcu(rdev, mddev) { struct gendisk *disk = rdev->bdev->bd_contains->bd_disk; - curr_events = disk_stat_read(disk, sectors[0]) + - disk_stat_read(disk, sectors[1]) - diff --git a/lustre/kernel_patches/patches/md-soft-lockups.patch b/lustre/kernel_patches/patches/md-soft-lockups.patch new file mode 100644 index 0000000..cde9a34 --- /dev/null +++ b/lustre/kernel_patches/patches/md-soft-lockups.patch @@ -0,0 +1,13 @@ +Index: linux-2.6.18-92.1.10/drivers/md/raid5.c +=================================================================== +--- linux-2.6.18-92.1.10.orig/drivers/md/raid5.c 2008-11-10 11:00:51.000000000 +0900 ++++ linux-2.6.18-92.1.10/drivers/md/raid5.c 2008-11-10 11:02:38.000000000 +0900 +@@ -3251,6 +3251,8 @@ + handle_stripe(sh, conf->spare_page, NULL); + release_stripe(sh); + ++ cond_resched(); ++ + spin_lock_irq(&conf->device_lock); + } + PRINTK("%d stripes handled\n", handled); diff --git a/lustre/kernel_patches/patches/quota-fix-oops-in-invalidate_dquots.patch b/lustre/kernel_patches/patches/quota-fix-oops-in-invalidate_dquots.patch new file mode 100644 index 0000000..b8c6b0d --- /dev/null +++ b/lustre/kernel_patches/patches/quota-fix-oops-in-invalidate_dquots.patch @@ -0,0 +1,127 @@ +From: Jan Kara +Date: Thu, 23 Mar 2006 11:00:17 +0000 (-0800) +Subject: [PATCH] Fix oops in invalidate_dquots() +X-Git-Tag: v2.6.17-rc1~1059 +X-Git-Url: http://git.kernel.org/?p=linux%2Fkernel%2Fgit%2Ftorvalds%2Flinux-2.6.git;a=commitdiff_plain;h=6362e4d4eda61efb04ac1cdae32e48ac6d90b701 + +[PATCH] Fix oops in invalidate_dquots() + +When quota is being turned off we assumed that all the references to dquots +were already dropped. That need not be true as inodes being deleted are +not on superblock's inodes list and hence we need not reach it when +removing quota references from inodes. So invalidate_dquots() has to wait +for all the users of dquots (as quota is already marked as turned off, no +new references can be acquired and so this is bound to happen rather +early). When we do this, we can also remove the iprune_sem locking as it +was protecting us against exactly the same problem when freeing inodes +icache memory. + +Signed-off-by: Jan Kara +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +--- + +diff --git a/fs/dquot.c b/fs/dquot.c +index 1966c89..9376a43 100644 +--- a/fs/dquot.c ++++ b/fs/dquot.c +@@ -118,8 +118,7 @@ + * spinlock to internal buffers before writing. 
+ * + * Lock ordering (including related VFS locks) is the following: +- * i_mutex > dqonoff_sem > iprune_sem > journal_lock > dqptr_sem > +- * > dquot->dq_lock > dqio_sem ++ * i_mutex > dqonoff_sem > journal_lock > dqptr_sem > dquot->dq_lock > dqio_sem + * i_mutex on quota files is special (it's below dqio_sem) + */ + +@@ -407,23 +406,49 @@ out_dqlock: + + /* Invalidate all dquots on the list. Note that this function is called after + * quota is disabled and pointers from inodes removed so there cannot be new +- * quota users. Also because we hold dqonoff_sem there can be no quota users +- * for this sb+type at all. */ ++ * quota users. There can still be some users of quotas due to inodes being ++ * just deleted or pruned by prune_icache() (those are not attached to any ++ * list). We have to wait for such users. ++ */ + static void invalidate_dquots(struct super_block *sb, int type) + { + struct dquot *dquot, *tmp; + ++restart: + spin_lock(&dq_list_lock); + list_for_each_entry_safe(dquot, tmp, &inuse_list, dq_inuse) { + if (dquot->dq_sb != sb) + continue; + if (dquot->dq_type != type) + continue; +-#ifdef __DQUOT_PARANOIA +- if (atomic_read(&dquot->dq_count)) +- BUG(); +-#endif +- /* Quota now has no users and it has been written on last dqput() */ ++ /* Wait for dquot users */ ++ if (atomic_read(&dquot->dq_count)) { ++ DEFINE_WAIT(wait); ++ ++ atomic_inc(&dquot->dq_count); ++ prepare_to_wait(&dquot->dq_wait_unused, &wait, ++ TASK_UNINTERRUPTIBLE); ++ spin_unlock(&dq_list_lock); ++ /* Once dqput() wakes us up, we know it's time to free ++ * the dquot. ++ * IMPORTANT: we rely on the fact that there is always ++ * at most one process waiting for dquot to free. ++ * Otherwise dq_count would be > 1 and we would never ++ * wake up. ++ */ ++ if (atomic_read(&dquot->dq_count) > 1) ++ schedule(); ++ finish_wait(&dquot->dq_wait_unused, &wait); ++ dqput(dquot); ++ /* At this moment dquot() need not exist (it could be ++ * reclaimed by prune_dqcache(). Hence we must ++ * restart. */ ++ goto restart; ++ } ++ /* ++ * Quota now has no users and it has been written on last ++ * dqput() ++ */ + remove_dquot_hash(dquot); + remove_free_dquot(dquot); + remove_inuse(dquot); +@@ -540,6 +565,10 @@ we_slept: + if (atomic_read(&dquot->dq_count) > 1) { + /* We have more than one user... nothing to do */ + atomic_dec(&dquot->dq_count); ++ /* Releasing dquot during quotaoff phase? 
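invalidate_dquots() above uses the classic wait-for-last-reference idiom: take a reference of our own, register on a waitqueue, let the final put wake us, then restart the scan. Stripped of the quota specifics, the shape is as below; struct obj and its fields are hypothetical stand-ins for the dquot:

    #include <linux/sched.h>
    #include <linux/wait.h>
    #include <asm/atomic.h>

    struct obj {
            atomic_t count;
            wait_queue_head_t wait_unused;
    };

    /* Block until we hold the only reference to @obj. Relies on the
     * release side doing a wake_up() when it drops the second-to-last
     * reference, exactly as dqput() does above. */
    static void reap_when_unused(struct obj *obj)
    {
            DEFINE_WAIT(wait);

            atomic_inc(&obj->count);        /* pin it ourselves */
            prepare_to_wait(&obj->wait_unused, &wait, TASK_UNINTERRUPTIBLE);
            if (atomic_read(&obj->count) > 1)
                    schedule();             /* the final put wakes us */
            finish_wait(&obj->wait_unused, &wait);
            /* here count == 1: only our pin is left, safe to tear down */
    }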
*/ ++ if (!sb_has_quota_enabled(dquot->dq_sb, dquot->dq_type) && ++ atomic_read(&dquot->dq_count) == 1) ++ wake_up(&dquot->dq_wait_unused); + spin_unlock(&dq_list_lock); + return; + } +@@ -581,6 +610,7 @@ static struct dquot *get_empty_dquot(struct super_block *sb, int type) + INIT_LIST_HEAD(&dquot->dq_inuse); + INIT_HLIST_NODE(&dquot->dq_hash); + INIT_LIST_HEAD(&dquot->dq_dirty); ++ init_waitqueue_head(&dquot->dq_wait_unused); + dquot->dq_sb = sb; + dquot->dq_type = type; + atomic_set(&dquot->dq_count, 1); +@@ -732,13 +762,9 @@ static void drop_dquot_ref(struct super_block *sb, int type) + { + LIST_HEAD(tofree_head); + +- /* We need to be guarded against prune_icache to reach all the +- * inodes - otherwise some can be on the local list of prune_icache */ +- down(&iprune_sem); + down_write(&sb_dqopt(sb)->dqptr_sem); + remove_dquot_ref(sb, type, &tofree_head); + up_write(&sb_dqopt(sb)->dqptr_sem); +- up(&iprune_sem); + put_dquot_list(&tofree_head); + } + diff --git a/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch b/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch new file mode 100644 index 0000000..4f3a3bc --- /dev/null +++ b/lustre/kernel_patches/patches/quota-large-limits-rhel5.patch @@ -0,0 +1,616 @@ +diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot.c +--- linux-2.6.16.54-0.2.5/fs/dquot.c 2008-03-18 15:48:26.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/fs/dquot.c 2008-03-17 22:43:11.000000000 +0300 +@@ -1588,10 +1588,19 @@ int vfs_get_dqblk(struct super_block *sb + } + + /* Generic routine for setting common part of quota structure */ +-static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) ++static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) + { + struct mem_dqblk *dm = &dquot->dq_dqb; + int check_blim = 0, check_ilim = 0; ++ struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; ++ ++ if ((di->dqb_valid & QIF_BLIMITS && ++ (di->dqb_bhardlimit > dqi->dqi_maxblimit || ++ di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || ++ (di->dqb_valid & QIF_ILIMITS && ++ (di->dqb_ihardlimit > dqi->dqi_maxilimit || ++ di->dqb_isoftlimit > dqi->dqi_maxilimit))) ++ return -ERANGE; + + spin_lock(&dq_data_lock); + if (di->dqb_valid & QIF_SPACE) { +@@ -1623,7 +1632,7 @@ static void do_set_dqblk(struct dquot *d + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */ +- dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; ++ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; + } + if (check_ilim) { + if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) { +@@ -1631,7 +1640,7 @@ static void do_set_dqblk(struct dquot *d + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... 
*/ +- dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; ++ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; + } + if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) + clear_bit(DQ_FAKE_B, &dquot->dq_flags); +@@ -1639,21 +1648,24 @@ static void do_set_dqblk(struct dquot *d + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + mark_dquot_dirty(dquot); ++ ++ return 0; + } + + int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) + { + struct dquot *dquot; ++ int rc; + + mutex_lock(&sb_dqopt(sb)->dqonoff_mutex); + if (!(dquot = dqget(sb, id, type))) { + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); + return -ESRCH; + } +- do_set_dqblk(dquot, di); ++ rc = do_set_dqblk(dquot, di); + dqput(dquot); + mutex_unlock(&sb_dqopt(sb)->dqonoff_mutex); +- return 0; ++ return rc; + } + + /* Generic routine for getting common part of quota file information */ +diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v1.c linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c +--- linux-2.6.16.54-0.2.5/fs/quota_v1.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c 2008-03-17 22:42:47.000000000 +0300 +@@ -139,6 +139,9 @@ static int v1_read_file_info(struct supe + goto out; + } + ret = 0; ++ /* limits are stored as unsigned 32-bit data */ ++ dqopt->info[type].dqi_maxblimit = 0xffffffff; ++ dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; + out: +diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c +--- linux-2.6.16.54-0.2.5/fs/quota_v2.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c 2008-03-18 11:58:02.000000000 +0300 +@@ -23,26 +23,64 @@ MODULE_LICENSE("GPL"); + typedef char *dqbuf_t; + + #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) +-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) ++#define GETENTRIES(buf) ((union v2_disk_dqblk *)(((char *)buf) + \ ++ sizeof(struct v2_disk_dqdbheader))) ++#define REV_ASSERT(r) BUG_ON((rev) != 0 && (rev) != 1) ++ ++static const union v2_disk_dqblk emptydquot; ++static const union v2_disk_dqblk fakedquot[2] = { ++ {.r0 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }, ++ {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} } ++}; + +-/* Check whether given file is really vfsv0 quotafile */ +-static int v2_check_quota_file(struct super_block *sb, int type) ++static inline uint v2_dqblksz(uint rev) ++{ ++ uint sz; ++ ++ REV_ASSERT(rev); ++ ++ if (rev == 0) ++ sz = sizeof(struct v2_disk_dqblk_r0); ++ else ++ sz = sizeof(struct v2_disk_dqblk_r1); ++ ++ return sz; ++} ++ ++/* Number of quota entries in a block */ ++static inline int v2_dqstrinblk(uint rev) ++{ ++ return (V2_DQBLKSIZE-sizeof(struct v2_disk_dqdbheader))/v2_dqblksz(rev); ++} ++ ++/* Get revision of a quota file, -1 if it does not look a quota file */ ++static int v2_quota_file_revision(struct super_block *sb, int type) + { + struct v2_disk_dqheader dqhead; + ssize_t size; + static const uint quota_magics[] = V2_INITQMAGICS; +- static const uint quota_versions[] = V2_INITQVERSIONS; ++ static const uint quota_versions_r0[] = V2_INITQVERSIONS_R0; ++ static const uint quota_versions_r1[] = V2_INITQVERSIONS_R1; + + size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct 
v2_disk_dqheader), 0); + if (size != sizeof(struct v2_disk_dqheader)) { + printk("quota_v2: failed read expected=%zd got=%zd\n", + sizeof(struct v2_disk_dqheader), size); +- return 0; ++ return -1; + } +- if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || +- le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) +- return 0; +- return 1; ++ if (le32_to_cpu(dqhead.dqh_magic) == quota_magics[type]) { ++ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r0[type]) ++ return 0; ++ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r1[type]) ++ return 1; ++ } ++ return -1; ++} ++ ++/* Check whether given file is really vfsv0 quotafile */ ++static inline int v2_check_quota_file(struct super_block *sb, int type) ++{ ++ return v2_quota_file_revision(sb, type) != -1; + } + + /* Read information header from quota file */ +@@ -51,6 +89,13 @@ static int v2_read_file_info(struct supe + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + ssize_t size; ++ int rev; ++ ++ rev = v2_quota_file_revision(sb, type); ++ if (rev < 0) { ++ printk(KERN_WARNING "Second quota file check failed.\n"); ++ return -1; ++ } + + size = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); +@@ -65,6 +110,16 @@ static int v2_read_file_info(struct supe + info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); ++ ++ info->u.v2_i.dqi_revision = rev; ++ if (rev == 0) { ++ info->dqi_maxblimit = 0xffffffffULL; ++ info->dqi_maxilimit = 0xffffffffULL; ++ } else { ++ info->dqi_maxblimit = 0xffffffffffffffffULL; ++ info->dqi_maxilimit = 0xffffffffffffffffULL; ++ } ++ + return 0; + } + +@@ -94,29 +149,61 @@ static int v2_write_file_info(struct sup + return 0; + } + +-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) ++static void disk2memdqb(struct mem_dqblk *m, union v2_disk_dqblk *d, uint rev) + { +- m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); +- m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); +- m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); +- m->dqb_itime = le64_to_cpu(d->dqb_itime); +- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); +- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); +- m->dqb_curspace = le64_to_cpu(d->dqb_curspace); +- m->dqb_btime = le64_to_cpu(d->dqb_btime); +-} +- +-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) +-{ +- d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); +- d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); +- d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); +- d->dqb_itime = cpu_to_le64(m->dqb_itime); +- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); +- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); +- d->dqb_curspace = cpu_to_le64(m->dqb_curspace); +- d->dqb_btime = cpu_to_le64(m->dqb_btime); +- d->dqb_id = cpu_to_le32(id); ++ REV_ASSERT(rev); ++ ++ if (rev == 0) { ++ struct v2_disk_dqblk_r0 *ddqblk = &d->r0; ++ m->dqb_ihardlimit = le32_to_cpu(ddqblk->dqb_ihardlimit); ++ m->dqb_isoftlimit = le32_to_cpu(ddqblk->dqb_isoftlimit); ++ m->dqb_curinodes = le32_to_cpu(ddqblk->dqb_curinodes); ++ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime); ++ m->dqb_bhardlimit = le32_to_cpu(ddqblk->dqb_bhardlimit); ++ m->dqb_bsoftlimit = le32_to_cpu(ddqblk->dqb_bsoftlimit); ++ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace); ++ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime); ++ } else { ++ struct 
v2_disk_dqblk_r1 *ddqblk = &d->r1; ++ m->dqb_ihardlimit = le64_to_cpu(ddqblk->dqb_ihardlimit); ++ m->dqb_isoftlimit = le64_to_cpu(ddqblk->dqb_isoftlimit); ++ m->dqb_curinodes = le64_to_cpu(ddqblk->dqb_curinodes); ++ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime); ++ m->dqb_bhardlimit = le64_to_cpu(ddqblk->dqb_bhardlimit); ++ m->dqb_bsoftlimit = le64_to_cpu(ddqblk->dqb_bsoftlimit); ++ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace); ++ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime); ++ } ++} ++ ++static void mem2diskdqb(union v2_disk_dqblk *d, struct mem_dqblk *m, ++ qid_t id, uint rev) ++{ ++ REV_ASSERT(rev); ++ ++ if (rev == 0) { ++ struct v2_disk_dqblk_r0 *ddqblk = &d->r0; ++ ddqblk->dqb_id = cpu_to_le32(id); ++ ddqblk->dqb_ihardlimit = cpu_to_le32((__u32)m->dqb_ihardlimit); ++ ddqblk->dqb_isoftlimit = cpu_to_le32((__u32)m->dqb_isoftlimit); ++ ddqblk->dqb_curinodes = cpu_to_le32((__u32)m->dqb_curinodes); ++ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime); ++ ddqblk->dqb_bhardlimit = cpu_to_le32((__u32)m->dqb_bhardlimit); ++ ddqblk->dqb_bsoftlimit = cpu_to_le32((__u32)m->dqb_bsoftlimit); ++ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); ++ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime); ++ } else { ++ struct v2_disk_dqblk_r1 *ddqblk = &d->r1; ++ ddqblk->dqb_id = cpu_to_le32(id); ++ ddqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); ++ ddqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); ++ ddqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); ++ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime); ++ ddqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); ++ ddqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); ++ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); ++ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime); ++ } + } + + static dqbuf_t getdqbuf(void) +@@ -268,10 +355,10 @@ static uint find_free_dqentry(struct dqu + { + struct super_block *sb = dquot->dq_sb; + struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; +- uint blk, i; ++ uint blk, i, rev = info->u.v2_i.dqi_revision; ++ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev); + struct v2_disk_dqdbheader *dh; +- struct v2_disk_dqblk *ddquot; +- struct v2_disk_dqblk fakedquot; ++ union v2_disk_dqblk *ddquot; + dqbuf_t buf; + + *err = 0; +@@ -298,17 +385,18 @@ static uint find_free_dqentry(struct dqu + info->u.v2_i.dqi_free_entry = blk; + mark_info_dirty(sb, dquot->dq_type); + } +- if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ ++ /* Block will be full? 
*/ ++ if (le16_to_cpu(dh->dqdh_entries)+1 >= dqstrinblk) + if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); + goto out_buf; + } + dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1); +- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); + /* Find free structure in block */ +- for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); ++ for (i = 0; i < dqstrinblk && memcmp(&emptydquot, ddquot, dqblksz); ++ i++, ddquot = (char *)ddquot + dqblksz); + #ifdef __QUOTA_V2_PARANOIA +- if (i == V2_DQSTRINBLK) { ++ if (i == dqstrinblk) { + printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); + *err = -EIO; + goto out_buf; +@@ -318,7 +406,8 @@ static uint find_free_dqentry(struct dqu + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); + goto out_buf; + } +- dquot->dq_off = (blk<dq_off = (blk<dq_type; + ssize_t ret; +- struct v2_disk_dqblk ddquot, empty; ++ union v2_disk_dqblk ddquot; ++ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision; ++ uint dqblksz = v2_dqblksz(rev); + + /* dq_off is guarded by dqio_sem */ + if (!dquot->dq_off) +@@ -401,18 +492,22 @@ static int v2_write_dquot(struct dquot * + return ret; + } + spin_lock(&dq_data_lock); +- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); ++ mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, rev); + /* Argh... We may need to write structure full of zeroes but that would be + * treated as an empty place by the rest of the code. Format change would + * be definitely cleaner but the problems probably are not worth it */ +- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); +- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) +- ddquot.dqb_itime = cpu_to_le64(1); ++ if (!memcmp(&emptydquot, &ddquot, dqblksz)) { ++ if (rev == 0) ++ ddquot.r0.dqb_itime = cpu_to_le64(1); ++ else ++ ddquot.r1.dqb_itime = cpu_to_le64(1); ++ } + spin_unlock(&dq_data_lock); + ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, +- (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); +- if (ret != sizeof(struct v2_disk_dqblk)) { +- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); ++ (char *)&ddquot, dqblksz, dquot->dq_off); ++ if (ret != dqblksz) { ++ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", ++ dquot->dq_sb->s_id); + if (ret >= 0) + ret = -ENOSPC; + } +@@ -431,6 +526,7 @@ static int free_dqentry(struct dquot *dq + struct v2_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(); + int ret = 0; ++ uint rev = sb_dqopt(sb)->info[type].u.v2_i.dqi_revision; + + if (!buf) + return -ENOMEM; +@@ -456,8 +552,8 @@ static int free_dqentry(struct dquot *dq + } + else { + memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, +- sizeof(struct v2_disk_dqblk)); +- if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { ++ v2_dqblksz(rev)); ++ if (le16_to_cpu(dh->dqdh_entries) == v2_dqstrinblk(rev)-1) { + /* Insert will write block itself */ + if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); +@@ -529,41 +625,56 @@ static int v2_delete_dquot(struct dquot + return remove_tree(dquot, &tmp, 0); + } + ++static inline __u32 dqid(union v2_disk_dqblk *ddquot, uint rev) ++{ ++ __u32 dq_id; ++ ++ REV_ASSERT(rev); ++ ++ if (rev == 0) ++ dq_id = le32_to_cpu(ddquot->r0.dqb_id); 
++ else ++ dq_id = le32_to_cpu(ddquot->r1.dqb_id); ++ ++ return dq_id; ++} ++ + /* Find entry in block */ + static loff_t find_block_dqentry(struct dquot *dquot, uint blk) + { + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + int i; +- struct v2_disk_dqblk *ddquot = GETENTRIES(buf); ++ union v2_disk_dqblk *ddquot = GETENTRIES(buf); ++ int type = dquot->dq_type; ++ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision; ++ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev); + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { ++ ++ ret = read_blk(dquot->dq_sb, type, blk, buf); ++ if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + if (dquot->dq_id) +- for (i = 0; i < V2_DQSTRINBLK && +- le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); ++ for (i = 0; i < dqstrinblk && dqid(ddquot, rev) != dquot->dq_id; ++ i++, ddquot = (char *)ddquot + dqblksz); + else { /* ID 0 as a bit more complicated searching... */ +- struct v2_disk_dqblk fakedquot; +- +- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); +- for (i = 0; i < V2_DQSTRINBLK; i++) +- if (!le32_to_cpu(ddquot[i].dqb_id) && +- memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) ++ for (i = 0; i < dqstrinblk; i++, ddquot = (char *)ddquot+dqblksz) ++ if (!dqid(ddquot, rev) && ++ memcmp(&emptydquot, ddquot, dqblksz)) + break; + } +- if (i == V2_DQSTRINBLK) { ++ if (i == dqstrinblk) { + printk(KERN_ERR "VFS: Quota for id %u referenced " + "but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } + else +- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct +- v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); ++ ret = (blk << V2_DQBLKSIZE_BITS)+((char *)ddquot-(char *)buf); + out_buf: + freedqbuf(buf); + return ret; +@@ -605,7 +716,7 @@ static int v2_read_dquot(struct dquot *d + { + int type = dquot->dq_type; + loff_t offset; +- struct v2_disk_dqblk ddquot, empty; ++ union v2_disk_dqblk ddquot; + int ret = 0; + + #ifdef __QUOTA_V2_PARANOIA +@@ -626,25 +737,30 @@ static int v2_read_dquot(struct dquot *d + ret = offset; + } + else { ++ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i. 
++ dqi_revision; ++ uint dqblksz = v2_dqblksz(rev); + dquot->dq_off = offset; +- if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, +- (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) +- != sizeof(struct v2_disk_dqblk)) { ++ ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, ++ (char *)&ddquot, dqblksz, offset); ++ if (ret != dqblksz) { + if (ret >= 0) + ret = -EIO; + printk(KERN_ERR "VFS: Error while reading quota " + "structure for id %u.\n", dquot->dq_id); +- memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); ++ memset(&ddquot, 0, dqblksz); + } + else { + ret = 0; + /* We need to escape back all-zero structure */ +- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); +- empty.dqb_itime = cpu_to_le64(1); +- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) +- ddquot.dqb_itime = 0; ++ if (!memcmp(&fakedquot[rev], &ddquot, dqblksz)) { ++ if (rev == 0) ++ ddquot.r0.dqb_itime = cpu_to_le64(0); ++ else ++ ddquot.r1.dqb_itime = cpu_to_le64(0); ++ } + } +- disk2memdqb(&dquot->dq_dqb, &ddquot); ++ disk2memdqb(&dquot->dq_dqb, &ddquot, rev); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && +diff -rNpu linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h +--- linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h 2008-03-17 23:39:54.000000000 +0300 +@@ -21,6 +21,7 @@ struct v2_mem_dqinfo { + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; ++ unsigned int dqi_revision; + }; + + #endif /* _LINUX_DQBLK_V2_H */ +diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quota/include/linux/quota.h +--- linux-2.6.16.54-0.2.5/include/linux/quota.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/include/linux/quota.h 2008-03-17 23:39:54.000000000 +0300 +@@ -148,12 +148,12 @@ struct if_dqinfo { + * Data for one user/group kept in memory + */ + struct mem_dqblk { +- __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ +- __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ ++ qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ ++ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ + qsize_t dqb_curspace; /* current used space */ +- __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ +- __u32 dqb_isoftlimit; /* preferred inode limit */ +- __u32 dqb_curinodes; /* current # allocated inodes */ ++ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */ ++ qsize_t dqb_isoftlimit; /* preferred inode limit */ ++ qsize_t dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ + }; +@@ -169,6 +169,8 @@ struct mem_dqinfo { + unsigned long dqi_flags; + unsigned int dqi_bgrace; + unsigned int dqi_igrace; ++ qsize_t dqi_maxblimit; ++ qsize_t dqi_maxilimit; + union { + struct v1_mem_dqinfo v1_i; + struct v2_mem_dqinfo v2_i; +diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h +--- linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h 2008-03-17 23:39:54.000000000 +0300 +@@ -16,28 +16,51 @@ + 0xd9c01927 /* GRPQUOTA */\ + } + +-#define V2_INITQVERSIONS {\ ++#define V2_INITQVERSIONS_R0 {\ + 0, /* USRQUOTA */\ + 0 /* 
GRPQUOTA */\ + } + ++#define V2_INITQVERSIONS_R1 {\ ++ 1, /* USRQUOTA */\ ++ 1 /* GRPQUOTA */\ ++} ++ + /* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +-struct v2_disk_dqblk { ++struct v2_disk_dqblk_r0 { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ +- __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ +- __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ ++ __le32 dqb_bhardlimit; /* absolute limit on disk space */ ++ __le32 dqb_bsoftlimit; /* preferred limit on disk space */ ++ __le64 dqb_curspace; /* current space occupied (in bytes) */ ++ __le64 dqb_btime; /* time limit for excessive disk use */ ++ __le64 dqb_itime; /* time limit for excessive inode use */ ++}; ++ ++struct v2_disk_dqblk_r1 { ++ __le32 dqb_id; /* id this quota applies to */ ++ __le32 dqb_padding; /* padding field */ ++ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ ++ __le64 dqb_isoftlimit; /* preferred inode limit */ ++ __le64 dqb_curinodes; /* current # allocated inodes */ ++ __le64 dqb_bhardlimit; /* absolute limit on disk space */ ++ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ + }; + ++union v2_disk_dqblk { ++ struct v2_disk_dqblk_r0 r0; ++ struct v2_disk_dqblk_r1 r1; ++}; ++ + /* + * Here are header structures as written on disk and their in-memory copies + */ +@@ -59,7 +82,7 @@ struct v2_disk_dqinfo { + + /* + * Structure of header of block with quota structures. 
It is padded to 16 bytes so +- * there will be space for exactly 21 quota-entries in a block ++ * there will be space for exactly 21 (r0) or 14 (r1) quota-entries in a block + */ + struct v2_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ +@@ -74,6 +97,5 @@ struct v2_disk_dqdbheader { + #define V2_DQBLKSIZE (1 << V2_DQBLKSIZE_BITS) /* Size of block with quota structures */ + #define V2_DQTREEOFF 1 /* Offset of tree in file in blocks */ + #define V2_DQTREEDEPTH 4 /* Depth of quota tree */ +-#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk)) /* Number of entries in one blocks */ + + #endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/lustre/kernel_patches/patches/quota-large-limits-sles10.patch b/lustre/kernel_patches/patches/quota-large-limits-sles10.patch new file mode 100644 index 0000000..fcef1c2 --- /dev/null +++ b/lustre/kernel_patches/patches/quota-large-limits-sles10.patch @@ -0,0 +1,616 @@ +diff -rNpu linux-2.6.16.54-0.2.5/fs/dquot.c linux-2.6.16.54-0.2.5-quota/fs/dquot.c +--- linux-2.6.16.54-0.2.5/fs/dquot.c 2008-03-18 15:48:26.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/fs/dquot.c 2008-03-17 22:43:11.000000000 +0300 +@@ -1588,10 +1588,19 @@ int vfs_get_dqblk(struct super_block *sb + } + + /* Generic routine for setting common part of quota structure */ +-static void do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) ++static int do_set_dqblk(struct dquot *dquot, struct if_dqblk *di) + { + struct mem_dqblk *dm = &dquot->dq_dqb; + int check_blim = 0, check_ilim = 0; ++ struct mem_dqinfo *dqi = &sb_dqopt(dquot->dq_sb)->info[dquot->dq_type]; ++ ++ if ((di->dqb_valid & QIF_BLIMITS && ++ (di->dqb_bhardlimit > dqi->dqi_maxblimit || ++ di->dqb_bsoftlimit > dqi->dqi_maxblimit)) || ++ (di->dqb_valid & QIF_ILIMITS && ++ (di->dqb_ihardlimit > dqi->dqi_maxilimit || ++ di->dqb_isoftlimit > dqi->dqi_maxilimit))) ++ return -ERANGE; + + spin_lock(&dq_data_lock); + if (di->dqb_valid & QIF_SPACE) { +@@ -1623,7 +1632,7 @@ static void do_set_dqblk(struct dquot *d + clear_bit(DQ_BLKS_B, &dquot->dq_flags); + } + else if (!(di->dqb_valid & QIF_BTIME)) /* Set grace only if user hasn't provided his own... */ +- dm->dqb_btime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_bgrace; ++ dm->dqb_btime = get_seconds() + dqi->dqi_bgrace; + } + if (check_ilim) { + if (!dm->dqb_isoftlimit || dm->dqb_curinodes < dm->dqb_isoftlimit) { +@@ -1631,7 +1640,7 @@ static void do_set_dqblk(struct dquot *d + clear_bit(DQ_INODES_B, &dquot->dq_flags); + } + else if (!(di->dqb_valid & QIF_ITIME)) /* Set grace only if user hasn't provided his own... 
*/ +- dm->dqb_itime = get_seconds() + sb_dqopt(dquot->dq_sb)->info[dquot->dq_type].dqi_igrace; ++ dm->dqb_itime = get_seconds() + dqi->dqi_igrace; + } + if (dm->dqb_bhardlimit || dm->dqb_bsoftlimit || dm->dqb_ihardlimit || dm->dqb_isoftlimit) + clear_bit(DQ_FAKE_B, &dquot->dq_flags); +@@ -1639,21 +1648,24 @@ static void do_set_dqblk(struct dquot *d + set_bit(DQ_FAKE_B, &dquot->dq_flags); + spin_unlock(&dq_data_lock); + mark_dquot_dirty(dquot); ++ ++ return 0; + } + + int vfs_set_dqblk(struct super_block *sb, int type, qid_t id, struct if_dqblk *di) + { + struct dquot *dquot; ++ int rc; + + down(&sb_dqopt(sb)->dqonoff_sem); + if (!(dquot = dqget(sb, id, type))) { + up(&sb_dqopt(sb)->dqonoff_sem); + return -ESRCH; + } +- do_set_dqblk(dquot, di); ++ rc = do_set_dqblk(dquot, di); + dqput(dquot); + up(&sb_dqopt(sb)->dqonoff_sem); +- return 0; ++ return rc; + } + + /* Generic routine for getting common part of quota file information */ +diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v1.c linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c +--- linux-2.6.16.54-0.2.5/fs/quota_v1.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v1.c 2008-03-17 22:42:47.000000000 +0300 +@@ -139,6 +139,9 @@ static int v1_read_file_info(struct supe + goto out; + } + ret = 0; ++ /* limits are stored as unsigned 32-bit data */ ++ dqopt->info[type].dqi_maxblimit = 0xffffffff; ++ dqopt->info[type].dqi_maxilimit = 0xffffffff; + dqopt->info[type].dqi_igrace = dqblk.dqb_itime ? dqblk.dqb_itime : MAX_IQ_TIME; + dqopt->info[type].dqi_bgrace = dqblk.dqb_btime ? dqblk.dqb_btime : MAX_DQ_TIME; + out: +diff -rNpu linux-2.6.16.54-0.2.5/fs/quota_v2.c linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c +--- linux-2.6.16.54-0.2.5/fs/quota_v2.c 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/fs/quota_v2.c 2008-03-18 11:58:02.000000000 +0300 +@@ -23,26 +23,64 @@ MODULE_LICENSE("GPL"); + typedef char *dqbuf_t; + + #define GETIDINDEX(id, depth) (((id) >> ((V2_DQTREEDEPTH-(depth)-1)*8)) & 0xff) +-#define GETENTRIES(buf) ((struct v2_disk_dqblk *)(((char *)buf)+sizeof(struct v2_disk_dqdbheader))) ++#define GETENTRIES(buf) ((union v2_disk_dqblk *)(((char *)buf) + \ ++ sizeof(struct v2_disk_dqdbheader))) ++#define REV_ASSERT(r) BUG_ON((rev) != 0 && (rev) != 1) ++ ++static const union v2_disk_dqblk emptydquot; ++static const union v2_disk_dqblk fakedquot[2] = { ++ {.r0 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} }, ++ {.r1 = {.dqb_itime = __constant_cpu_to_le64(1LLU)} } ++}; + +-/* Check whether given file is really vfsv0 quotafile */ +-static int v2_check_quota_file(struct super_block *sb, int type) ++static inline uint v2_dqblksz(uint rev) ++{ ++ uint sz; ++ ++ REV_ASSERT(rev); ++ ++ if (rev == 0) ++ sz = sizeof(struct v2_disk_dqblk_r0); ++ else ++ sz = sizeof(struct v2_disk_dqblk_r1); ++ ++ return sz; ++} ++ ++/* Number of quota entries in a block */ ++static inline int v2_dqstrinblk(uint rev) ++{ ++ return (V2_DQBLKSIZE-sizeof(struct v2_disk_dqdbheader))/v2_dqblksz(rev); ++} ++ ++/* Get revision of a quota file, -1 if it does not look a quota file */ ++static int v2_quota_file_revision(struct super_block *sb, int type) + { + struct v2_disk_dqheader dqhead; + ssize_t size; + static const uint quota_magics[] = V2_INITQMAGICS; +- static const uint quota_versions[] = V2_INITQVERSIONS; ++ static const uint quota_versions_r0[] = V2_INITQVERSIONS_R0; ++ static const uint quota_versions_r1[] = V2_INITQVERSIONS_R1; + + size = sb->s_op->quota_read(sb, type, (char *)&dqhead, sizeof(struct v2_disk_dqheader), 0); + if (size 
!= sizeof(struct v2_disk_dqheader)) { + printk("quota_v2: failed read expected=%zd got=%zd\n", + sizeof(struct v2_disk_dqheader), size); +- return 0; ++ return -1; + } +- if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || +- le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) +- return 0; +- return 1; ++ if (le32_to_cpu(dqhead.dqh_magic) == quota_magics[type]) { ++ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r0[type]) ++ return 0; ++ if (le32_to_cpu(dqhead.dqh_version) == quota_versions_r1[type]) ++ return 1; ++ } ++ return -1; ++} ++ ++/* Check whether given file is really vfsv0 quotafile */ ++static inline int v2_check_quota_file(struct super_block *sb, int type) ++{ ++ return v2_quota_file_revision(sb, type) != -1; + } + + /* Read information header from quota file */ +@@ -51,6 +89,13 @@ static int v2_read_file_info(struct supe + struct v2_disk_dqinfo dinfo; + struct mem_dqinfo *info = sb_dqopt(sb)->info+type; + ssize_t size; ++ int rev; ++ ++ rev = v2_quota_file_revision(sb, type); ++ if (rev < 0) { ++ printk(KERN_WARNING "Second quota file check failed.\n"); ++ return -1; ++ } + + size = sb->s_op->quota_read(sb, type, (char *)&dinfo, + sizeof(struct v2_disk_dqinfo), V2_DQINFOOFF); +@@ -65,6 +110,16 @@ static int v2_read_file_info(struct supe + info->u.v2_i.dqi_blocks = le32_to_cpu(dinfo.dqi_blocks); + info->u.v2_i.dqi_free_blk = le32_to_cpu(dinfo.dqi_free_blk); + info->u.v2_i.dqi_free_entry = le32_to_cpu(dinfo.dqi_free_entry); ++ ++ info->u.v2_i.dqi_revision = rev; ++ if (rev == 0) { ++ info->dqi_maxblimit = 0xffffffffULL; ++ info->dqi_maxilimit = 0xffffffffULL; ++ } else { ++ info->dqi_maxblimit = 0xffffffffffffffffULL; ++ info->dqi_maxilimit = 0xffffffffffffffffULL; ++ } ++ + return 0; + } + +@@ -94,29 +149,61 @@ static int v2_write_file_info(struct sup + return 0; + } + +-static void disk2memdqb(struct mem_dqblk *m, struct v2_disk_dqblk *d) ++static void disk2memdqb(struct mem_dqblk *m, union v2_disk_dqblk *d, uint rev) + { +- m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); +- m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); +- m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); +- m->dqb_itime = le64_to_cpu(d->dqb_itime); +- m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); +- m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); +- m->dqb_curspace = le64_to_cpu(d->dqb_curspace); +- m->dqb_btime = le64_to_cpu(d->dqb_btime); +-} +- +-static void mem2diskdqb(struct v2_disk_dqblk *d, struct mem_dqblk *m, qid_t id) +-{ +- d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); +- d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); +- d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); +- d->dqb_itime = cpu_to_le64(m->dqb_itime); +- d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); +- d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); +- d->dqb_curspace = cpu_to_le64(m->dqb_curspace); +- d->dqb_btime = cpu_to_le64(m->dqb_btime); +- d->dqb_id = cpu_to_le32(id); ++ REV_ASSERT(rev); ++ ++ if (rev == 0) { ++ struct v2_disk_dqblk_r0 *ddqblk = &d->r0; ++ m->dqb_ihardlimit = le32_to_cpu(ddqblk->dqb_ihardlimit); ++ m->dqb_isoftlimit = le32_to_cpu(ddqblk->dqb_isoftlimit); ++ m->dqb_curinodes = le32_to_cpu(ddqblk->dqb_curinodes); ++ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime); ++ m->dqb_bhardlimit = le32_to_cpu(ddqblk->dqb_bhardlimit); ++ m->dqb_bsoftlimit = le32_to_cpu(ddqblk->dqb_bsoftlimit); ++ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace); ++ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime); ++ } else { ++ struct v2_disk_dqblk_r1 *ddqblk = &d->r1; ++ 
m->dqb_ihardlimit = le64_to_cpu(ddqblk->dqb_ihardlimit); ++ m->dqb_isoftlimit = le64_to_cpu(ddqblk->dqb_isoftlimit); ++ m->dqb_curinodes = le64_to_cpu(ddqblk->dqb_curinodes); ++ m->dqb_itime = le64_to_cpu(ddqblk->dqb_itime); ++ m->dqb_bhardlimit = le64_to_cpu(ddqblk->dqb_bhardlimit); ++ m->dqb_bsoftlimit = le64_to_cpu(ddqblk->dqb_bsoftlimit); ++ m->dqb_curspace = le64_to_cpu(ddqblk->dqb_curspace); ++ m->dqb_btime = le64_to_cpu(ddqblk->dqb_btime); ++ } ++} ++ ++static void mem2diskdqb(union v2_disk_dqblk *d, struct mem_dqblk *m, ++ qid_t id, uint rev) ++{ ++ REV_ASSERT(rev); ++ ++ if (rev == 0) { ++ struct v2_disk_dqblk_r0 *ddqblk = &d->r0; ++ ddqblk->dqb_id = cpu_to_le32(id); ++ ddqblk->dqb_ihardlimit = cpu_to_le32((__u32)m->dqb_ihardlimit); ++ ddqblk->dqb_isoftlimit = cpu_to_le32((__u32)m->dqb_isoftlimit); ++ ddqblk->dqb_curinodes = cpu_to_le32((__u32)m->dqb_curinodes); ++ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime); ++ ddqblk->dqb_bhardlimit = cpu_to_le32((__u32)m->dqb_bhardlimit); ++ ddqblk->dqb_bsoftlimit = cpu_to_le32((__u32)m->dqb_bsoftlimit); ++ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); ++ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime); ++ } else { ++ struct v2_disk_dqblk_r1 *ddqblk = &d->r1; ++ ddqblk->dqb_id = cpu_to_le32(id); ++ ddqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); ++ ddqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); ++ ddqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); ++ ddqblk->dqb_itime = cpu_to_le64(m->dqb_itime); ++ ddqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); ++ ddqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); ++ ddqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); ++ ddqblk->dqb_btime = cpu_to_le64(ddqblk->dqb_btime); ++ } + } + + static dqbuf_t getdqbuf(void) +@@ -268,10 +355,10 @@ static uint find_free_dqentry(struct dqu + { + struct super_block *sb = dquot->dq_sb; + struct mem_dqinfo *info = sb_dqopt(sb)->info+dquot->dq_type; +- uint blk, i; ++ uint blk, i, rev = info->u.v2_i.dqi_revision; ++ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev); + struct v2_disk_dqdbheader *dh; +- struct v2_disk_dqblk *ddquot; +- struct v2_disk_dqblk fakedquot; ++ union v2_disk_dqblk *ddquot; + dqbuf_t buf; + + *err = 0; +@@ -298,17 +385,18 @@ static uint find_free_dqentry(struct dqu + info->u.v2_i.dqi_free_entry = blk; + mark_info_dirty(sb, dquot->dq_type); + } +- if (le16_to_cpu(dh->dqdh_entries)+1 >= V2_DQSTRINBLK) /* Block will be full? */ ++ /* Block will be full? 
*/ ++ if (le16_to_cpu(dh->dqdh_entries)+1 >= dqstrinblk) + if ((*err = remove_free_dqentry(sb, dquot->dq_type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", blk); + goto out_buf; + } + dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries)+1); +- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); + /* Find free structure in block */ +- for (i = 0; i < V2_DQSTRINBLK && memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk)); i++); ++ for (i = 0; i < dqstrinblk && memcmp(&emptydquot, ddquot, dqblksz); ++ i++, ddquot = (char *)ddquot + dqblksz); + #ifdef __QUOTA_V2_PARANOIA +- if (i == V2_DQSTRINBLK) { ++ if (i == dqstrinblk) { + printk(KERN_ERR "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); + *err = -EIO; + goto out_buf; +@@ -318,7 +406,8 @@ static uint find_free_dqentry(struct dqu + printk(KERN_ERR "VFS: find_free_dqentry(): Can't write quota data block %u.\n", blk); + goto out_buf; + } +- dquot->dq_off = (blk<dq_off = (blk<dq_type; + ssize_t ret; +- struct v2_disk_dqblk ddquot, empty; ++ union v2_disk_dqblk ddquot; ++ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision; ++ uint dqblksz = v2_dqblksz(rev); + + /* dq_off is guarded by dqio_sem */ + if (!dquot->dq_off) +@@ -401,18 +492,22 @@ static int v2_write_dquot(struct dquot * + return ret; + } + spin_lock(&dq_data_lock); +- mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); ++ mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id, rev); + /* Argh... We may need to write structure full of zeroes but that would be + * treated as an empty place by the rest of the code. Format change would + * be definitely cleaner but the problems probably are not worth it */ +- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); +- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) +- ddquot.dqb_itime = cpu_to_le64(1); ++ if (!memcmp(&emptydquot, &ddquot, dqblksz)) { ++ if (rev == 0) ++ ddquot.r0.dqb_itime = cpu_to_le64(1); ++ else ++ ddquot.r1.dqb_itime = cpu_to_le64(1); ++ } + spin_unlock(&dq_data_lock); + ret = dquot->dq_sb->s_op->quota_write(dquot->dq_sb, type, +- (char *)&ddquot, sizeof(struct v2_disk_dqblk), dquot->dq_off); +- if (ret != sizeof(struct v2_disk_dqblk)) { +- printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", dquot->dq_sb->s_id); ++ (char *)&ddquot, dqblksz, dquot->dq_off); ++ if (ret != dqblksz) { ++ printk(KERN_WARNING "VFS: dquota write failed on dev %s\n", ++ dquot->dq_sb->s_id); + if (ret >= 0) + ret = -ENOSPC; + } +@@ -431,6 +526,7 @@ static int free_dqentry(struct dquot *dq + struct v2_disk_dqdbheader *dh; + dqbuf_t buf = getdqbuf(); + int ret = 0; ++ uint rev = sb_dqopt(sb)->info[type].u.v2_i.dqi_revision; + + if (!buf) + return -ENOMEM; +@@ -456,8 +552,8 @@ static int free_dqentry(struct dquot *dq + } + else { + memset(buf+(dquot->dq_off & ((1 << V2_DQBLKSIZE_BITS)-1)), 0, +- sizeof(struct v2_disk_dqblk)); +- if (le16_to_cpu(dh->dqdh_entries) == V2_DQSTRINBLK-1) { ++ v2_dqblksz(rev)); ++ if (le16_to_cpu(dh->dqdh_entries) == v2_dqstrinblk(rev)-1) { + /* Insert will write block itself */ + if ((ret = insert_free_dqentry(sb, type, buf, blk)) < 0) { + printk(KERN_ERR "VFS: Can't insert quota data block (%u) to free entry list.\n", blk); +@@ -529,41 +625,56 @@ static int v2_delete_dquot(struct dquot + return remove_tree(dquot, &tmp, 0); + } + ++static inline __u32 dqid(union v2_disk_dqblk *ddquot, uint rev) ++{ ++ __u32 dq_id; ++ ++ REV_ASSERT(rev); ++ ++ if (rev == 0) ++ dq_id = le32_to_cpu(ddquot->r0.dqb_id); 
++ else ++ dq_id = le32_to_cpu(ddquot->r1.dqb_id); ++ ++ return dq_id; ++} ++ + /* Find entry in block */ + static loff_t find_block_dqentry(struct dquot *dquot, uint blk) + { + dqbuf_t buf = getdqbuf(); + loff_t ret = 0; + int i; +- struct v2_disk_dqblk *ddquot = GETENTRIES(buf); ++ union v2_disk_dqblk *ddquot = GETENTRIES(buf); ++ int type = dquot->dq_type; ++ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i.dqi_revision; ++ uint dqblksz = v2_dqblksz(rev), dqstrinblk = v2_dqstrinblk(rev); + + if (!buf) + return -ENOMEM; +- if ((ret = read_blk(dquot->dq_sb, dquot->dq_type, blk, buf)) < 0) { ++ ++ ret = read_blk(dquot->dq_sb, type, blk, buf); ++ if (ret < 0) { + printk(KERN_ERR "VFS: Can't read quota tree block %u.\n", blk); + goto out_buf; + } + if (dquot->dq_id) +- for (i = 0; i < V2_DQSTRINBLK && +- le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++); ++ for (i = 0; i < dqstrinblk && dqid(ddquot, rev) != dquot->dq_id; ++ i++, ddquot = (char *)ddquot + dqblksz); + else { /* ID 0 as a bit more complicated searching... */ +- struct v2_disk_dqblk fakedquot; +- +- memset(&fakedquot, 0, sizeof(struct v2_disk_dqblk)); +- for (i = 0; i < V2_DQSTRINBLK; i++) +- if (!le32_to_cpu(ddquot[i].dqb_id) && +- memcmp(&fakedquot, ddquot+i, sizeof(struct v2_disk_dqblk))) ++ for (i = 0; i < dqstrinblk; i++, ddquot = (char *)ddquot+dqblksz) ++ if (!dqid(ddquot, rev) && ++ memcmp(&emptydquot, ddquot, dqblksz)) + break; + } +- if (i == V2_DQSTRINBLK) { ++ if (i == dqstrinblk) { + printk(KERN_ERR "VFS: Quota for id %u referenced " + "but not present.\n", dquot->dq_id); + ret = -EIO; + goto out_buf; + } + else +- ret = (blk << V2_DQBLKSIZE_BITS) + sizeof(struct +- v2_disk_dqdbheader) + i * sizeof(struct v2_disk_dqblk); ++ ret = (blk << V2_DQBLKSIZE_BITS)+((char *)ddquot-(char *)buf); + out_buf: + freedqbuf(buf); + return ret; +@@ -605,7 +716,7 @@ static int v2_read_dquot(struct dquot *d + { + int type = dquot->dq_type; + loff_t offset; +- struct v2_disk_dqblk ddquot, empty; ++ union v2_disk_dqblk ddquot; + int ret = 0; + + #ifdef __QUOTA_V2_PARANOIA +@@ -626,25 +737,30 @@ static int v2_read_dquot(struct dquot *d + ret = offset; + } + else { ++ uint rev = sb_dqopt(dquot->dq_sb)->info[type].u.v2_i. 
++ dqi_revision; ++ uint dqblksz = v2_dqblksz(rev); + dquot->dq_off = offset; +- if ((ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, +- (char *)&ddquot, sizeof(struct v2_disk_dqblk), offset)) +- != sizeof(struct v2_disk_dqblk)) { ++ ret = dquot->dq_sb->s_op->quota_read(dquot->dq_sb, type, ++ (char *)&ddquot, dqblksz, offset); ++ if (ret != dqblksz) { + if (ret >= 0) + ret = -EIO; + printk(KERN_ERR "VFS: Error while reading quota " + "structure for id %u.\n", dquot->dq_id); +- memset(&ddquot, 0, sizeof(struct v2_disk_dqblk)); ++ memset(&ddquot, 0, dqblksz); + } + else { + ret = 0; + /* We need to escape back all-zero structure */ +- memset(&empty, 0, sizeof(struct v2_disk_dqblk)); +- empty.dqb_itime = cpu_to_le64(1); +- if (!memcmp(&empty, &ddquot, sizeof(struct v2_disk_dqblk))) +- ddquot.dqb_itime = 0; ++ if (!memcmp(&fakedquot[rev], &ddquot, dqblksz)) { ++ if (rev == 0) ++ ddquot.r0.dqb_itime = cpu_to_le64(0); ++ else ++ ddquot.r1.dqb_itime = cpu_to_le64(0); ++ } + } +- disk2memdqb(&dquot->dq_dqb, &ddquot); ++ disk2memdqb(&dquot->dq_dqb, &ddquot, rev); + if (!dquot->dq_dqb.dqb_bhardlimit && + !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && +diff -rNpu linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h +--- linux-2.6.16.54-0.2.5/include/linux/dqblk_v2.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/include/linux/dqblk_v2.h 2008-03-17 23:39:54.000000000 +0300 +@@ -21,6 +21,7 @@ struct v2_mem_dqinfo { + unsigned int dqi_blocks; + unsigned int dqi_free_blk; + unsigned int dqi_free_entry; ++ unsigned int dqi_revision; + }; + + #endif /* _LINUX_DQBLK_V2_H */ +diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quota.h linux-2.6.16.54-0.2.5-quota/include/linux/quota.h +--- linux-2.6.16.54-0.2.5/include/linux/quota.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/include/linux/quota.h 2008-03-17 23:39:54.000000000 +0300 +@@ -148,12 +148,12 @@ struct if_dqinfo { + * Data for one user/group kept in memory + */ + struct mem_dqblk { +- __u32 dqb_bhardlimit; /* absolute limit on disk blks alloc */ +- __u32 dqb_bsoftlimit; /* preferred limit on disk blks */ ++ qsize_t dqb_bhardlimit; /* absolute limit on disk blks alloc */ ++ qsize_t dqb_bsoftlimit; /* preferred limit on disk blks */ + qsize_t dqb_curspace; /* current used space */ +- __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ +- __u32 dqb_isoftlimit; /* preferred inode limit */ +- __u32 dqb_curinodes; /* current # allocated inodes */ ++ qsize_t dqb_ihardlimit; /* absolute limit on allocated inodes */ ++ qsize_t dqb_isoftlimit; /* preferred inode limit */ ++ qsize_t dqb_curinodes; /* current # allocated inodes */ + time_t dqb_btime; /* time limit for excessive disk use */ + time_t dqb_itime; /* time limit for excessive inode use */ + }; +@@ -169,6 +169,8 @@ struct mem_dqinfo { + unsigned long dqi_flags; + unsigned int dqi_bgrace; + unsigned int dqi_igrace; ++ qsize_t dqi_maxblimit; ++ qsize_t dqi_maxilimit; + union { + struct v1_mem_dqinfo v1_i; + struct v2_mem_dqinfo v2_i; +diff -rNpu linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h +--- linux-2.6.16.54-0.2.5/include/linux/quotaio_v2.h 2006-03-20 08:53:29.000000000 +0300 ++++ linux-2.6.16.54-0.2.5-quota/include/linux/quotaio_v2.h 2008-03-17 23:39:54.000000000 +0300 +@@ -16,28 +16,51 @@ + 0xd9c01927 /* GRPQUOTA */\ + } + +-#define V2_INITQVERSIONS {\ ++#define V2_INITQVERSIONS_R0 {\ + 0, /* USRQUOTA */\ + 0 /* 
GRPQUOTA */\ + } + ++#define V2_INITQVERSIONS_R1 {\ ++ 1, /* USRQUOTA */\ ++ 1 /* GRPQUOTA */\ ++} ++ + /* + * The following structure defines the format of the disk quota file + * (as it appears on disk) - the file is a radix tree whose leaves point + * to blocks of these structures. + */ +-struct v2_disk_dqblk { ++struct v2_disk_dqblk_r0 { + __le32 dqb_id; /* id this quota applies to */ + __le32 dqb_ihardlimit; /* absolute limit on allocated inodes */ + __le32 dqb_isoftlimit; /* preferred inode limit */ + __le32 dqb_curinodes; /* current # allocated inodes */ +- __le32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ +- __le32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ ++ __le32 dqb_bhardlimit; /* absolute limit on disk space */ ++ __le32 dqb_bsoftlimit; /* preferred limit on disk space */ ++ __le64 dqb_curspace; /* current space occupied (in bytes) */ ++ __le64 dqb_btime; /* time limit for excessive disk use */ ++ __le64 dqb_itime; /* time limit for excessive inode use */ ++}; ++ ++struct v2_disk_dqblk_r1 { ++ __le32 dqb_id; /* id this quota applies to */ ++ __le32 dqb_padding; /* padding field */ ++ __le64 dqb_ihardlimit; /* absolute limit on allocated inodes */ ++ __le64 dqb_isoftlimit; /* preferred inode limit */ ++ __le64 dqb_curinodes; /* current # allocated inodes */ ++ __le64 dqb_bhardlimit; /* absolute limit on disk space */ ++ __le64 dqb_bsoftlimit; /* preferred limit on disk space */ + __le64 dqb_curspace; /* current space occupied (in bytes) */ + __le64 dqb_btime; /* time limit for excessive disk use */ + __le64 dqb_itime; /* time limit for excessive inode use */ + }; + ++union v2_disk_dqblk { ++ struct v2_disk_dqblk_r0 r0; ++ struct v2_disk_dqblk_r1 r1; ++}; ++ + /* + * Here are header structures as written on disk and their in-memory copies + */ +@@ -59,7 +82,7 @@ struct v2_disk_dqinfo { + + /* + * Structure of header of block with quota structures. 
It is padded to 16 bytes so +- * there will be space for exactly 21 quota-entries in a block ++ * there will be space for exactly 21 (r0) or 14 (r1) quota-entries in a block + */ + struct v2_disk_dqdbheader { + __le32 dqdh_next_free; /* Number of next block with free entry */ +@@ -74,6 +97,5 @@ struct v2_disk_dqdbheader { + #define V2_DQBLKSIZE (1 << V2_DQBLKSIZE_BITS) /* Size of block with quota structures */ + #define V2_DQTREEOFF 1 /* Offset of tree in file in blocks */ + #define V2_DQTREEDEPTH 4 /* Depth of quota tree */ +-#define V2_DQSTRINBLK ((V2_DQBLKSIZE - sizeof(struct v2_disk_dqdbheader)) / sizeof(struct v2_disk_dqblk)) /* Number of entries in one blocks */ + + #endif /* _LINUX_QUOTAIO_V2_H */ diff --git a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch index 735af2c..decf7a4 100644 --- a/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch +++ b/lustre/kernel_patches/patches/raid5-merge-ios-rhel5.patch @@ -1,6 +1,7 @@ -diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/raid5.c ---- linux-2.6.18-53.orig/drivers/md/raid5.c 2007-12-28 18:55:24.000000000 +0800 -+++ linux-2.6.18-53/drivers/md/raid5.c 2007-12-28 19:08:15.000000000 +0800 +Index: linux-2.6.18-92.1.17/drivers/md/raid5.c +=================================================================== +--- linux-2.6.18-92.1.17.orig/drivers/md/raid5.c ++++ linux-2.6.18-92.1.17/drivers/md/raid5.c @@ -1277,7 +1277,26 @@ static void compute_block_2(struct strip } } @@ -151,7 +152,7 @@ diff -pur linux-2.6.18-53.orig/drivers/md/raid5.c linux-2.6.18-53/drivers/md/rai } if (sh) { - handle_stripe(sh, NULL); -+ handle_stripe(sh, NULL, NULL); ++ handle_stripe(sh, NULL, bios); release_stripe(sh); sh = NULL; } diff --git a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch index fa92977..dd80825 100644 --- a/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch +++ b/lustre/kernel_patches/patches/raid5-zerocopy-rhel5.patch @@ -345,9 +345,9 @@ diff -pur linux-2.6.18-53.orig/include/linux/page-flags.h linux-2.6.18-53/includ #define PG_nosave_free 18 /* Free, should not be written */ #define PG_buddy 19 /* Page is free, on buddy lists */ +#define PG_constant 20 /* To mark if the page is constant */ + #define PG_xpmem 27 /* Testing for xpmem. */ /* PG_owner_priv_1 users should have descriptive aliases */ - #define PG_checked PG_owner_priv_1 /* Used by some filesystems */ @@ -252,6 +253,14 @@ struct page; /* forward declaration */ diff --git a/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch b/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch index 9e822d2..48f392a 100644 --- a/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch +++ b/lustre/kernel_patches/patches/sd_iostats-2.6.22-vanilla.patch @@ -1,3 +1,7 @@ +Export more statistics from the SCSI layer. + +A nice to have patch, but not required for Lustre to work. 
+ Index: linux-2.6.22.19/drivers/scsi/Kconfig =================================================================== --- linux-2.6.22.19.orig/drivers/scsi/Kconfig diff --git a/lustre/kernel_patches/series/2.6-rhel4.series b/lustre/kernel_patches/series/2.6-rhel4.series index d058295..49fe38a 100644 --- a/lustre/kernel_patches/series/2.6-rhel4.series +++ b/lustre/kernel_patches/series/2.6-rhel4.series @@ -29,6 +29,7 @@ quota-deadlock-on-pagelock-core.patch quota-umount-race-fix.patch quota-deadlock-on-pagelock-ext3.patch export-nr_free_buffer_pages.patch +2.6-rhel4-kgdb-ga.patch vfs-keep-inode-hashed-for-clear-inode.patch modpost_external_module_updates_rhel4.patch mpt-fusion-downgrade-to-3_02_73-rhel4.patch diff --git a/lustre/kernel_patches/series/2.6-rhel5.series b/lustre/kernel_patches/series/2.6-rhel5.series index 9e7a8ed..0fc2b97 100644 --- a/lustre/kernel_patches/series/2.6-rhel5.series +++ b/lustre/kernel_patches/series/2.6-rhel5.series @@ -18,4 +18,6 @@ raid5-stripe-by-stripe-handling-rhel5.patch raid5-merge-ios-rhel5.patch raid5-zerocopy-rhel5.patch md-rebuild-policy.patch +md-soft-lockups.patch jbd-journal-chksum-2.6.18-vanilla.patch +quota-large-limits-rhel5.patch diff --git a/lustre/kernel_patches/series/2.6-sles10.series b/lustre/kernel_patches/series/2.6-sles10.series index 72adc21..070f943 100644 --- a/lustre/kernel_patches/series/2.6-sles10.series +++ b/lustre/kernel_patches/series/2.6-sles10.series @@ -10,7 +10,9 @@ export_symbol_numa-2.6-fc5.patch blkdev_tunables-2.6-sles10.patch jbd-stats-2.6-sles10.patch i_filter_data.patch +quota-fix-oops-in-invalidate_dquots.patch jbd-journal-chksum-2.6-sles10.patch proc-sleep-2.6.16-sles10.patch export-nr_free_buffer_pages.patch fmode-exec-2.6-sles10.patch +quota-large-limits-sles10.patch diff --git a/lustre/kernel_patches/series/2.6.18-vanilla.series b/lustre/kernel_patches/series/2.6.18-vanilla.series index 9253a3e..a9b79b9 100644 --- a/lustre/kernel_patches/series/2.6.18-vanilla.series +++ b/lustre/kernel_patches/series/2.6.18-vanilla.series @@ -15,3 +15,4 @@ jbd-16tb-overflow-fixes.patch jbd-check-for-unmapped-buffer.patch jbd-stats-2.6-rhel5.patch export-nr_free_buffer_pages.patch +kgdb-2.6.18-vanilla.patch diff --git a/lustre/kernel_patches/series/2.6.22-vanilla.series b/lustre/kernel_patches/series/2.6.22-vanilla.series index eba2991..fe32803 100644 --- a/lustre/kernel_patches/series/2.6.22-vanilla.series +++ b/lustre/kernel_patches/series/2.6.22-vanilla.series @@ -11,3 +11,4 @@ export-2.6.18-vanilla.patch 8kstack-2.6.12.patch export-show_task-2.6.18-vanilla.patch sd_iostats-2.6.22-vanilla.patch +quota-large-limits-rhel5.patch diff --git a/lustre/kernel_patches/targets/2.6-rhel5.target.in b/lustre/kernel_patches/targets/2.6-rhel5.target.in index 5d208f5..660b7a6 100644 --- a/lustre/kernel_patches/targets/2.6-rhel5.target.in +++ b/lustre/kernel_patches/targets/2.6-rhel5.target.in @@ -1,5 +1,5 @@ lnxmaj="2.6.18" -lnxrel="92.1.10.el5" +lnxrel="92.1.17.el5" KERNEL=linux-${lnxmaj}-${lnxrel}.tar.bz2 SERIES=2.6-rhel5.series diff --git a/lustre/kernel_patches/targets/2.6-sles10.target.in b/lustre/kernel_patches/targets/2.6-sles10.target.in index 42ef94c..bfa8365 100644 --- a/lustre/kernel_patches/targets/2.6-sles10.target.in +++ b/lustre/kernel_patches/targets/2.6-sles10.target.in @@ -1,5 +1,5 @@ lnxmaj="2.6.16" -lnxrel="60-0.27" +lnxrel="60-0.31" # this is the delimeter that goes between $lnxmaj and $lnxrel # defaults to "-" diff --git a/lustre/kernel_patches/which_patch b/lustre/kernel_patches/which_patch index eb8e804..a0104b7 100644 
--- a/lustre/kernel_patches/which_patch +++ b/lustre/kernel_patches/which_patch @@ -2,8 +2,8 @@ SERIES VERSION COMMENT SUPPORTED KERNELS: 2.6-rhel4 RHEL4: 2.6.9-67.0.20.EL -2.6-sles10 SLES10: 2.6.16.60-0.27 -2.6-rhel5 RHEL5: 2.6.18-92.1.10.el5 +2.6-sles10 SLES10: 2.6.16.60-0.31 +2.6-rhel5 RHEL5: 2.6.18-92.1.17.el5 2.6.18-vanilla kernel.org: 2.6.18.8 2.6.22-vanilla kernel.org: 2.6.22.14 diff --git a/lustre/lclient/Makefile.am b/lustre/lclient/Makefile.am new file mode 100644 index 0000000..a6e1548 --- /dev/null +++ b/lustre/lclient/Makefile.am @@ -0,0 +1 @@ +EXTRA_DIST=glimpse.c lcommon_cl.c diff --git a/lustre/lclient/glimpse.c b/lustre/lclient/glimpse.c new file mode 100644 index 0000000..78acee6 --- /dev/null +++ b/lustre/lclient/glimpse.c @@ -0,0 +1,253 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * glimpse code shared between vvp and liblustre (and other Lustre clients in + * the future). + * + * Author: Nikita Danilov + * Author: Oleg Drokin + */ + +#include +#include +#include +#include + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +#else +#include +#include +#include +#include +#include +#include +#include +#include +# include +# ifdef HAVE_XTIO_H +# include +# endif +# include +# include +# include +# ifdef HAVE_FILE_H +# include +# endif +# include +#endif + +#include "cl_object.h" +#include "lclient.h" +#ifdef __KERNEL__ +# include "../llite/llite_internal.h" +#else +# include "../liblustre/llite_lib.h" +#endif + +static const struct cl_lock_descr whole_file = { + .cld_start = 0, + .cld_end = CL_PAGE_EOF, + .cld_mode = CLM_READ +}; + +int cl_glimpse_lock(const struct lu_env *env, struct cl_io *io, + struct inode *inode, struct cl_object *clob) +{ + struct cl_lock_descr *descr = &ccc_env_info(env)->cti_descr; + struct cl_inode_info *lli = cl_i2info(inode); + const struct lu_fid *fid = lu_object_fid(&clob->co_lu); + struct ccc_io *cio = ccc_env_io(env); + struct cl_lock *lock; + int result; + + ENTRY; + result = 0; + if (!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)) { + CDEBUG(D_DLMTRACE, "Glimpsing inode "DFID"\n", PFID(fid)); + if (lli->lli_smd) { + /* NOTE: this looks like DLM lock request, but it may + * not be one. 
Due to CEF_ASYNC flag (translated + * to LDLM_FL_HAS_INTENT by osc), this is + * glimpse request, that won't revoke any + * conflicting DLM locks held. Instead, + * ll_glimpse_callback() will be called on each + * client holding a DLM lock against this file, + * and resulting size will be returned for each + * stripe. DLM lock on [0, EOF] is acquired only + * if there were no conflicting locks. If there + * were conflicting locks, enqueuing or waiting + * fails with -ENAVAIL, but valid inode + * attributes are returned anyway. */ + *descr = whole_file; + descr->cld_obj = clob; + descr->cld_mode = CLM_PHANTOM; + /* The lockreq for glimpse should be mandatory, + * otherwise, osc may decide to use lockless */ + io->ci_lockreq = CILR_MANDATORY; + cio->cui_glimpse = 1; + lock = cl_lock_request(env, io, descr, CEF_ASYNC, + "glimpse", cfs_current()); + cio->cui_glimpse = 0; + if (!IS_ERR(lock)) { + result = cl_wait(env, lock); + if (result == 0) { + cl_merge_lvb(inode); + cl_unuse(env, lock); + } + cl_lock_release(env, lock, + "glimpse", cfs_current()); + } else + result = PTR_ERR(lock); + } else + CDEBUG(D_DLMTRACE, "No objects for inode\n"); + } + + RETURN(result); +} + +static int cl_io_get(struct inode *inode, struct lu_env **envout, + struct cl_io **ioout, int *refcheck) +{ + struct ccc_thread_info *info; + struct lu_env *env; + struct cl_io *io; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int result; + + if (S_ISREG(cl_inode_mode(inode))) { + env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + info = ccc_env_info(env); + io = &info->cti_io; + io->ci_obj = clob; + *envout = env; + *ioout = io; + result = +1; + } else + result = PTR_ERR(env); + } else + result = 0; + return result; +} + +int cl_glimpse_size(struct inode *inode) +{ + /* + * We don't need ast_flags argument to cl_glimpse_size(), because + * osc_lock_enqueue() takes care of the possible deadlock that said + * argument was introduced to avoid. + */ + /* + * XXX but note that ll_file_seek() passes LDLM_FL_BLOCK_NOWAIT to + * cl_glimpse_size(), which doesn't make sense: glimpse locks are not + * blocking anyway. + */ + struct lu_env *env; + struct cl_io *io; + int result; + int refcheck; + + ENTRY; + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result > 0) { + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result > 0) + /* + * nothing to do for this io. This currently happens + * when stripe sub-object's are not yet created. + */ + result = io->ci_result; + else if (result == 0) + result = cl_glimpse_lock(env, io, inode, io->ci_obj); + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + } + RETURN(result); +} + +int cl_local_size(struct inode *inode) +{ + struct lu_env *env; + struct cl_io *io; + struct ccc_thread_info *cti; + struct cl_object *clob; + struct cl_lock_descr *descr; + struct cl_lock *lock; + int result; + int refcheck; + + ENTRY; + + /* + * XXX layering violation. 
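+ * cl_local_size() reaches through cl_i2info(inode)->lli_smd to look at
+ * the cached lov stripe metadata directly, bypassing the cl_object
+ * layering, so that a file with no stripe objects can be answered
+ * without setting up an io.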
+ */ + if (cl_i2info(inode)->lli_smd->lsm_stripe_count == 0) + RETURN(0); + + result = cl_io_get(inode, &env, &io, &refcheck); + if (result <= 0) + RETURN(result); + + clob = io->ci_obj; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result > 0) + result = io->ci_result; + else if (result == 0) { + cti = ccc_env_info(env); + descr = &cti->cti_descr; + + *descr = whole_file; + descr->cld_obj = clob; + lock = cl_lock_peek(env, io, descr, "localsize", cfs_current()); + if (lock != NULL) { + cl_merge_lvb(inode); + cl_unuse(env, lock); + cl_lock_release(env, lock, "localsize", cfs_current()); + result = 0; + } else + result = -ENODATA; + } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(result); +} + diff --git a/lustre/lclient/lcommon_cl.c b/lustre/lclient/lcommon_cl.c new file mode 100644 index 0000000..6b56b4e --- /dev/null +++ b/lustre/lclient/lcommon_cl.c @@ -0,0 +1,1188 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl code shared between vvp and liblustre (and other Lustre clients in the + * future). + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# include +# include +# include +#else /* __KERNEL__ */ +#include +#include +#include +#include +#include +#include +#include +#include +# include +# ifdef HAVE_XTIO_H +# include +# endif +# include +# include +# include +# ifdef HAVE_FILE_H +# include +# endif +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include + +#ifdef __KERNEL__ +#include "../llite/llite_internal.h" +#else +#include "../liblustre/llite_lib.h" +#endif + +const struct cl_req_operations ccc_req_ops; + +/* + * ccc_ prefix stands for "Common Client Code". 
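+ * The types and routines below are shared by the vvp (kernel) and
+ * liblustre clients; each client passes its own lu_device/cl_device
+ * operation vectors into ccc_device_alloc() further down.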
+ */ + +static cfs_mem_cache_t *ccc_lock_kmem; +static cfs_mem_cache_t *ccc_object_kmem; +static cfs_mem_cache_t *ccc_thread_kmem; +static cfs_mem_cache_t *ccc_session_kmem; +static cfs_mem_cache_t *ccc_req_kmem; + +static struct lu_kmem_descr ccc_caches[] = { + { + .ckd_cache = &ccc_lock_kmem, + .ckd_name = "ccc_lock_kmem", + .ckd_size = sizeof (struct ccc_lock) + }, + { + .ckd_cache = &ccc_object_kmem, + .ckd_name = "ccc_object_kmem", + .ckd_size = sizeof (struct ccc_object) + }, + { + .ckd_cache = &ccc_thread_kmem, + .ckd_name = "ccc_thread_kmem", + .ckd_size = sizeof (struct ccc_thread_info), + }, + { + .ckd_cache = &ccc_session_kmem, + .ckd_name = "ccc_session_kmem", + .ckd_size = sizeof (struct ccc_session) + }, + { + .ckd_cache = &ccc_req_kmem, + .ckd_name = "ccc_req_kmem", + .ckd_size = sizeof (struct ccc_req) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +void *ccc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ccc_thread_info *info; + + OBD_SLAB_ALLOC_PTR(info, ccc_thread_kmem); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +void ccc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, ccc_thread_kmem); +} + +void *ccc_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct ccc_session *session; + + OBD_SLAB_ALLOC_PTR(session, ccc_session_kmem); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +void ccc_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct ccc_session *session = data; + OBD_SLAB_FREE_PTR(session, ccc_session_kmem); +} + +struct lu_context_key ccc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = ccc_key_init, + .lct_fini = ccc_key_fini +}; + +struct lu_context_key ccc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = ccc_session_key_init, + .lct_fini = ccc_session_key_fini +}; + + +/* type constructor/destructor: ccc_type_{init,fini,start,stop}(). 
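+ * These would normally be generated by the LU_TYPE_INIT_FINI() macro; its + * invocation is kept commented out below.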
*/ +// LU_TYPE_INIT_FINI(ccc, &ccc_key, &ccc_session_key); + +int ccc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct ccc_device *vdv; + int rc; + ENTRY; + + vdv = lu2ccc_dev(d); + vdv->cdv_next = lu2cl_dev(next); + + LASSERT(d->ld_site != NULL && next->ld_type != NULL); + next->ld_site = d->ld_site; + rc = next->ld_type->ldt_ops->ldto_device_init( + env, next, next->ld_type->ldt_name, NULL); + if (rc == 0) { + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + } + RETURN(rc); +} + +struct lu_device *ccc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return cl2lu_dev(lu2ccc_dev(d)->cdv_next); +} + +struct lu_device *ccc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg, + const struct lu_device_operations *luops, + const struct cl_device_operations *clops) +{ + struct ccc_device *vdv; + struct lu_device *lud; + struct cl_site *site; + int rc; + ENTRY; + + OBD_ALLOC_PTR(vdv); + if (vdv == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + lud = &vdv->cdv_cl.cd_lu_dev; + cl_device_init(&vdv->cdv_cl, t); + ccc2lu_dev(vdv)->ld_ops = luops; + vdv->cdv_cl.cd_ops = clops; + + OBD_ALLOC_PTR(site); + if (site != NULL) { + rc = cl_site_init(site, &vdv->cdv_cl); + if (rc == 0) + rc = lu_site_init_finish(&site->cs_lu); + else { + LASSERT(lud->ld_site == NULL); + CERROR("Cannot init lu_site, rc %d.\n", rc); + OBD_FREE_PTR(site); + } + } else + rc = -ENOMEM; + if (rc != 0) { + ccc_device_free(env, lud); + lud = ERR_PTR(rc); + } + RETURN(lud); +} + +struct lu_device *ccc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct ccc_device *vdv = lu2ccc_dev(d); + struct cl_site *site = lu2cl_site(d->ld_site); + struct lu_device *next = cl2lu_dev(vdv->cdv_next); + + if (d->ld_site != NULL) { + cl_site_fini(site); + OBD_FREE_PTR(site); + } + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(vdv); + return next; +} + +int ccc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct ccc_req *vrq; + int result; + + OBD_SLAB_ALLOC_PTR(vrq, ccc_req_kmem); + if (vrq != NULL) { + cl_req_slice_add(req, &vrq->crq_cl, dev, &ccc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** + * An `emergency' environment used by ccc_inode_fini() when cl_env_get() + * fails. Access to this environment is serialized by ccc_inode_fini_guard + * mutex. + */ +static struct lu_env *ccc_inode_fini_env = NULL; + +/** + * A mutex serializing calls to slp_inode_fini() under extreme memory + * pressure, when environments cannot be allocated. + */ +static DEFINE_MUTEX(ccc_inode_fini_guard); +static int dummy_refcheck; + +int ccc_global_init(struct lu_device_type *device_type) +{ + int result; + + result = lu_kmem_init(ccc_caches); + if (result == 0) { + result = lu_device_type_init(device_type); + ccc_inode_fini_env = cl_env_alloc(&dummy_refcheck, + LCT_REMEMBER|LCT_NOREF); + if (IS_ERR(ccc_inode_fini_env)) + result = PTR_ERR(ccc_inode_fini_env); + else + ccc_inode_fini_env->le_ctx.lc_cookie = 0x4; + } + return result; +} + +void ccc_global_fini(struct lu_device_type *device_type) +{ + if (ccc_inode_fini_env != NULL) { + cl_env_put(ccc_inode_fini_env, &dummy_refcheck); + ccc_inode_fini_env = NULL; + } + lu_device_type_fini(device_type); + lu_kmem_fini(ccc_caches); +} + +/***************************************************************************** + * + * Object operations. 
+ * + */ + +struct lu_object *ccc_object_alloc(const struct lu_env *env, + const struct lu_object_header *_, + struct lu_device *dev, + const struct cl_object_operations *clops, + const struct lu_object_operations *luops) +{ + struct ccc_object *vob; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR(vob, ccc_object_kmem); + if (vob != NULL) { + struct cl_object_header *hdr; + + obj = ccc2lu(vob); + hdr = &vob->cob_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + + vob->cob_cl.co_ops = clops; + obj->lo_ops = luops; + } else + obj = NULL; + return obj; +} + +int ccc_object_init0(const struct lu_env *env, + struct ccc_object *vob, + const struct cl_object_conf *conf) +{ + vob->cob_inode = conf->coc_inode; + vob->cob_transient_pages = 0; + return 0; +} + +int ccc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct ccc_device *dev = lu2ccc_dev(obj->lo_dev); + struct ccc_object *vob = lu2ccc(obj); + struct lu_object *below; + struct lu_device *under; + int result; + + under = &dev->cdv_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + const struct cl_object_conf *cconf; + + cconf = lu2cl_conf(conf); + CFS_INIT_LIST_HEAD(&vob->cob_pending_list); + lu_object_add(obj, below); + result = ccc_object_init0(env, vob, cconf); + } else + result = -ENOMEM; + return result; +} + +void ccc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct ccc_object *vob = lu2ccc(obj); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + OBD_SLAB_FREE_PTR(vob, ccc_object_kmem); +} + +int ccc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *_, + const struct cl_lock_operations *lkops) +{ + struct ccc_lock *clk; + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR(clk, ccc_lock_kmem); + if (clk != NULL) { + cl_lock_slice_add(lock, &clk->clk_cl, obj, lkops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +int ccc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + return 0; +} + +int ccc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct inode *inode = ccc_object_inode(obj); + + ENTRY; + lvb->lvb_mtime = cl_inode_mtime(inode); + lvb->lvb_atime = cl_inode_atime(inode); + lvb->lvb_ctime = cl_inode_ctime(inode); + RETURN(0); +} + + + +int ccc_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + /* TODO: destroy all pages attached to this object. */ + return 0; +} + +/***************************************************************************** + * + * Page operations. 
+ * + */ + +cfs_page_t *ccc_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2vm_page(slice); +} + +int ccc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *io) +{ + struct ccc_io *vio = ccc_env_io(env); + struct cl_lock_descr *desc = &ccc_env_info(env)->cti_descr; + struct cl_page *page = slice->cpl_page; + + int result; + + ENTRY; + + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + io->ci_type == CIT_FAULT) { + if (vio->cui_fd->fd_flags & LL_FILE_GROUP_LOCKED) + result = -EBUSY; + else { + desc->cld_start = page->cp_index; + desc->cld_end = page->cp_index; + desc->cld_obj = page->cp_obj; + desc->cld_mode = CLM_READ; + result = cl_queue_match(&io->ci_lockset.cls_done, + desc) ? -EBUSY : 0; + } + } else + result = 0; + RETURN(result); +} + +int ccc_fail(const struct lu_env *env, const struct cl_page_slice *slice) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + +void ccc_transient_page_verify(const struct cl_page *page) +{ +} + +void ccc_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + ccc_transient_page_verify(slice->cpl_page); +} + +void ccc_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + struct cl_page *page = slice->cpl_page; + + ccc_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +int ccc_transient_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + ENTRY; + /* transient page should always be sent. */ + RETURN(0); +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +void ccc_lock_fini(const struct lu_env *env, struct cl_lock_slice *slice) +{ + struct ccc_lock *clk = cl2ccc_lock(slice); + + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + OBD_SLAB_FREE_PTR(clk, ccc_lock_kmem); +} + +int ccc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *_, __u32 enqflags) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_unuse(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +int ccc_lock_wait(const struct lu_env *env, const struct cl_lock_slice *slice) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + return 0; +} + +/** + * Implementation of cl_lock_operations::clo_fits_into() methods for ccc + * layer. This function is executed every time io finds an existing lock in + * the lock cache while creating new lock. This function has to decide whether + * cached lock "fits" into io. + * + * \param slice lock to be checked + * + * \param io IO that wants a lock. + * + * \see lov_lock_fits_into(). 
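+ * + * \retval 1 the cached lock can be reused for \a io + * \retval 0 the cached lock does not fit, a new lock has to be created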
*/ +int ccc_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock *lock = slice->cls_lock; + const struct cl_lock_descr *descr = &lock->cll_descr; + const struct ccc_io *cio = ccc_env_io(env); + int result; + + ENTRY; + /* + * Work around DLM peculiarity: it assumes that a glimpse + * (LDLM_FL_HAS_INTENT) lock is always LCK_PR, and returns a read lock + * when asked for an LCK_PW lock with the LDLM_FL_HAS_INTENT flag set. + * Make sure that glimpse doesn't get a CLM_WRITE top-lock, so that it + * doesn't enqueue CLM_WRITE sub-locks. + */ + if (cio->cui_glimpse) + result = descr->cld_mode != CLM_WRITE; + /* + * Also, don't match incomplete write locks for read, otherwise a read + * would enqueue missing sub-locks in the write mode. + * + * XXX this is a candidate for a generic locking policy, to be moved + * into cl_lock_lookup(). + */ + else if (need->cld_mode != descr->cld_mode) + result = lock->cll_state >= CLS_ENQUEUED; + else + result = 1; + RETURN(result); +} + +/** + * Implements the cl_lock_operations::clo_state() method for the vvp layer, + * invoked whenever the lock state changes. Transfers object attributes that + * might have been updated as a result of lock acquisition into the inode. + */ +void ccc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct cl_lock *lock; + struct cl_object *obj; + struct inode *inode; + struct cl_attr *attr; + + ENTRY; + lock = slice->cls_lock; + + /* + * Refresh inode attributes when the lock is moving into the CLS_HELD + * state, and only when this is a result of a real enqueue, rather than + * of finding the lock in the cache. + */ + if (state == CLS_HELD && lock->cll_state < CLS_HELD) { + int rc; + + obj = slice->cls_obj; + inode = ccc_object_inode(obj); + attr = &ccc_env_info(env)->cti_attr; + + /* vmtruncate()->ll_truncate() first sets the i_size and then + * the kms under both a DLM lock and the + * ll_inode_size_lock(). If we don't get the + * ll_inode_size_lock() here we can match the DLM lock and + * reset i_size from the kms before the truncating path has + * updated the kms. generic_file_write can then trust the + * stale i_size when doing appending writes and effectively + * cancel the result of the truncate. Getting the + * ll_inode_size_lock() after the enqueue maintains the DLM + * -> ll_inode_size_lock() acquiring order. */ + cl_isize_lock(inode, 0); + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, attr); + if (rc == 0) { + if (lock->cll_descr.cld_start == 0 && + lock->cll_descr.cld_end == CL_PAGE_EOF) { + cl_isize_write(inode, attr->cat_kms); + CDEBUG(D_INODE, DFID" updating i_size %llu\n", + PFID(lu_object_fid(&obj->co_lu)), + (__u64)cl_isize_read(inode)); + } + cl_inode_mtime(inode) = attr->cat_mtime; + cl_inode_atime(inode) = attr->cat_atime; + cl_inode_ctime(inode) = attr->cat_ctime; + } else + CL_LOCK_DEBUG(D_ERROR, env, lock, "attr_get: %i\n", rc); + cl_object_attr_unlock(obj); + cl_isize_unlock(inode, 0); + } + EXIT; +} + +/***************************************************************************** + * + * io operations.
+ * + */ + +void ccc_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); +} + +int ccc_io_one_lock_index(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + pgoff_t start, pgoff_t end) +{ + struct ccc_io *vio = ccc_env_io(env); + struct cl_lock_descr *descr = &vio->cui_link.cill_descr; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + ENTRY; + + CDEBUG(D_VFSTRACE, "lock: %i [%lu, %lu]\n", mode, start, end); + + memset(&vio->cui_link, 0, sizeof vio->cui_link); + descr->cld_mode = mode; + descr->cld_obj = obj; + descr->cld_start = start; + descr->cld_end = end; + + vio->cui_link.cill_enq_flags = enqflags; + cl_io_lock_add(env, io, &vio->cui_link); + RETURN(0); +} + +int ccc_io_one_lock(const struct lu_env *env, struct cl_io *io, + __u32 enqflags, enum cl_lock_mode mode, + loff_t start, loff_t end) +{ + struct cl_object *obj = io->ci_obj; + + return ccc_io_one_lock_index(env, io, enqflags, mode, + cl_index(obj, start), cl_index(obj, end)); +} + +void ccc_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + CLOBINVRNT(env, ios->cis_io->ci_obj, + ccc_object_invariant(ios->cis_io->ci_obj)); +} + +static void ccc_object_size_lock(struct cl_object *obj, int vfslock) +{ + struct inode *inode = ccc_object_inode(obj); + + if (vfslock) + cl_isize_lock(inode, 0); + cl_object_attr_lock(obj); +} + +static void ccc_object_size_unlock(struct cl_object *obj, int vfslock) +{ + struct inode *inode = ccc_object_inode(obj); + + cl_object_attr_unlock(obj); + if (vfslock) + cl_isize_unlock(inode, 0); +} + +/** + * Helper function that, if necessary, adjusts the file size (inode->i_size) + * when the position at offset \a pos is accessed. The file size can be + * arbitrarily stale on a Lustre client, but the client at least knows the + * KMS (known minimum size). If the accessed area is inside [0, KMS], set the + * file size to the KMS; otherwise glimpse the file size. + * + * Locking: cl_isize_lock is used to serialize changes to the inode size and + * to protect consistency between the inode size and the cl_object + * attributes. ccc_object_size_lock() protects consistency between cl_attr's + * of the top-object and sub-objects. + * + * In the page fault path cl_isize_lock cannot be taken, so the client has to + * live with the resulting races. + */ +int ccc_prep_size(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io, loff_t pos, int vfslock) +{ + struct cl_attr *attr = &ccc_env_info(env)->cti_attr; + struct inode *inode = ccc_object_inode(obj); + loff_t kms; + int result; + + /* + * Consistency guarantees: the following possibilities exist for the + * relation between the region being accessed and the real file size + * at this moment: + * + * (A): the region is completely inside of the file; + * + * (B-x): x bytes of the region are inside of the file, the rest is + * outside; + * + * (C): the region is completely outside of the file. + * + * This classification is stable under the DLM lock already acquired + * by the caller, because to change the class, another client has to + * take a DLM lock conflicting with ours. Also, any updates to ->i_size + * by other threads on this client are serialized by + * ll_inode_size_lock(). This guarantees that short reads are handled + * correctly in the face of concurrent writes and truncates.
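+ * + * For example, a read of n bytes at offset pos in a file of current size s + * is in class (A) when pos + n <= s, in class (B-x) with x = s - pos when + * pos < s < pos + n, and in class (C) when pos >= s.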
+ */ + ccc_object_size_lock(obj, vfslock); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + kms = attr->cat_kms; + if (pos > kms) { + /* + * A glimpse is necessary to determine whether we + * return a short read (B) or some zeroes at the end + * of the buffer (C) + */ + ccc_object_size_unlock(obj, vfslock); + return cl_glimpse_lock(env, io, inode, obj); + } else { + /* + * region is within kms and, hence, within real file + * size (A). We need to increase i_size to cover the + * read region so that generic_file_read() will do its + * job, but that doesn't mean the kms size is + * _correct_, it is only the _minimum_ size. If + * someone does a stat they will get the correct size + * which will always be >= the kms value here. + * b=11081 + */ + /* + * XXX in a page fault path, change inode size without + * ll_inode_size_lock() held! there is a race + * condition with truncate path. (see ll_extent_lock) + */ + /* + * XXX i_size_write() is not used because it is not + * safe to take the ll_inode_size_lock() due to a + * potential lock inversion (bug 6077). And since + * it's not safe to use i_size_write() without a + * covering mutex we do the assignment directly. It + * is not critical that the size be correct. + */ + if (cl_isize_read(inode) < kms) { + if (vfslock) + cl_isize_write(inode, kms); + else + cl_isize_write_nolock(inode, kms); + } + } + } + ccc_object_size_unlock(obj, vfslock); + return result; +} + +/***************************************************************************** + * + * Transfer operations. + * + */ + +void ccc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct ccc_req *vrq; + + vrq = cl2ccc_req(slice); + OBD_SLAB_FREE_PTR(vrq, ccc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for ccc + * layer. ccc is responsible for + * + * - o_[mac]time + * + * - o_mode + * + * - o_fid (filled with inode number?!) + * + * - o_[ug]id + * + * - o_generation + * + * - and IO epoch (stored in o_easize), + * + * and capability. 
+ */ +void ccc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags) +{ + struct inode *inode; + struct obdo *oa; + obd_flag valid_flags; + + oa = attr->cra_oa; + inode = ccc_object_inode(obj); + valid_flags = OBD_MD_FLTYPE|OBD_MD_FLATIME; + + if (flags != (obd_valid)~0ULL) + valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME; + else { + LASSERT(attr->cra_capa == NULL); + attr->cra_capa = cl_capa_lookup(inode, + slice->crs_req->crq_type); + } + + if (slice->crs_req->crq_type == CRT_WRITE) { + if (flags & OBD_MD_FLEPOCH) { + oa->o_valid |= OBD_MD_FLEPOCH; + oa->o_easize = cl_i2info(inode)->lli_ioepoch; + valid_flags |= OBD_MD_FLMTIME|OBD_MD_FLCTIME| + OBD_MD_FLUID|OBD_MD_FLGID| + OBD_MD_FLFID|OBD_MD_FLGENER; + } + } + obdo_from_inode(oa, inode, valid_flags & flags); +} + +const struct cl_req_operations ccc_req_ops = { + .cro_attr_set = ccc_req_attr_set, + .cro_completion = ccc_req_completion +}; + +/* Setattr helpers */ +int cl_setattr_do_truncate(struct inode *inode, loff_t size, + struct obd_capa *capa) +{ + struct lu_env *env; + struct cl_io *io; + int result; + int refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &ccc_env_info(env)->cti_io; + io->ci_obj = cl_i2info(inode)->lli_clob; + io->u.ci_truncate.tr_size = size; + io->u.ci_truncate.tr_capa = capa; + if (cl_io_init(env, io, CIT_TRUNC, io->ci_obj) == 0) + result = cl_io_loop(env, io); + else + result = io->ci_result; + cl_io_fini(env, io); + cl_env_put(env, &refcheck); + RETURN(result); +} + +int cl_setattr_ost(struct inode *inode, struct obd_capa *capa) +{ + struct cl_inode_info *lli = cl_i2info(inode); + struct lov_stripe_md *lsm = lli->lli_smd; + int rc; + obd_flag flags; + struct obd_info oinfo = { { { 0 } } }; + struct obdo *oa; + + OBDO_ALLOC(oa); + if (oa) { + oa->o_id = lsm->lsm_object_id; + oa->o_gr = lsm->lsm_object_gr; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; + + flags = OBD_MD_FLTYPE | OBD_MD_FLATIME | + OBD_MD_FLMTIME | OBD_MD_FLCTIME | + OBD_MD_FLFID | OBD_MD_FLGENER | + OBD_MD_FLGROUP; + + obdo_from_inode(oa, inode, flags); + + oinfo.oi_oa = oa; + oinfo.oi_md = lsm; + + /* XXX: this looks unnecessary now. */ + rc = obd_setattr_rqset(cl_i2sbi(inode)->ll_dt_exp, &oinfo, + NULL); + if (rc) + CERROR("obd_setattr_async fails: rc=%d\n", rc); + OBDO_FREE(oa); + } else { + rc = -ENOMEM; + } + return rc; +} + + +/***************************************************************************** + * + * Type conversions. 
+ * + */ + +struct lu_device *ccc2lu_dev(struct ccc_device *vdv) +{ + return &vdv->cdv_cl.cd_lu_dev; +} + +struct ccc_device *lu2ccc_dev(const struct lu_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl.cd_lu_dev); +} + +struct ccc_device *cl2ccc_dev(const struct cl_device *d) +{ + return container_of0(d, struct ccc_device, cdv_cl); +} + +struct lu_object *ccc2lu(struct ccc_object *vob) +{ + return &vob->cob_cl.co_lu; +} + +struct ccc_object *lu2ccc(const struct lu_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl.co_lu); +} + +struct ccc_object *cl2ccc(const struct cl_object *obj) +{ + return container_of0(obj, struct ccc_object, cob_cl); +} + +struct ccc_lock *cl2ccc_lock(const struct cl_lock_slice *slice) +{ + return container_of(slice, struct ccc_lock, clk_cl); +} + +struct ccc_io *cl2ccc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct ccc_io *cio; + + cio = container_of(slice, struct ccc_io, cui_cl); + LASSERT(cio == ccc_env_io(env)); + return cio; +} + +struct ccc_req *cl2ccc_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct ccc_req, crq_cl); +} + +cfs_page_t *cl2vm_page(const struct cl_page_slice *slice) +{ + return cl2ccc_page(slice)->cpg_page; +} + +/***************************************************************************** + * + * Accessors. + * + */ +int ccc_object_invariant(const struct cl_object *obj) +{ + struct inode *inode = ccc_object_inode(obj); + struct cl_inode_info *lli = cl_i2info(inode); + + return (S_ISREG(cl_inode_mode(inode)) || + /* i_mode of an unlinked inode is zeroed. */ + cl_inode_mode(inode) == 0) && lli->lli_clob == obj; +} + +struct inode *ccc_object_inode(const struct cl_object *obj) +{ + return cl2ccc(obj)->cob_inode; +} + +/** + * Returns a pointer to the cl_page associated with \a vmpage, without + * acquiring an additional reference to the resulting page. This is an unsafe + * version of cl_vmpage_page() that can only be used under the vmpage lock. + */ +struct cl_page *ccc_vmpage_page_transient(cfs_page_t *vmpage) +{ + KLASSERT(PageLocked(vmpage)); + return (struct cl_page *)vmpage->private; +} + +/** + * Initializes or updates the CLIO part when new meta-data arrives from the + * server: + * + * - allocates a cl_object if necessary; + * - updates the layout, if the object was already there. + */ +int cl_inode_init(struct inode *inode, struct lustre_md *md) +{ + struct lu_env *env; + struct cl_inode_info *lli; + struct cl_object *clob; + struct lu_site *site; + struct lu_fid *fid; + const struct cl_object_conf conf = { + .coc_inode = inode, + .u = { + .coc_md = md + } + }; + int result = 0; + int refcheck; + + /* LASSERT(inode->i_state & I_NEW); */ + LASSERT(md->body->valid & OBD_MD_FLID); + + if (!S_ISREG(cl_inode_mode(inode))) + return 0; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + return PTR_ERR(env); + + site = cl_i2sbi(inode)->ll_site; + lli = cl_i2info(inode); + fid = &lli->lli_fid; + LASSERT(fid_is_sane(fid)); + + if (lli->lli_clob == NULL) { + clob = cl_object_find(env, lu2cl_dev(site->ls_top_dev), + fid, &conf); + if (!IS_ERR(clob)) { + /* + * No locking is necessary, as the new inode is + * locked by the I_NEW bit. + * + * XXX not true for call from ll_update_inode().
+ */ + lli->lli_clob = clob; + lu_object_ref_add(&clob->co_lu, "inode", inode); + } else + result = PTR_ERR(clob); + } else + result = cl_conf_set(env, lli->lli_clob, &conf); + cl_env_put(env, &refcheck); + + if (result != 0) + CERROR("Failure to initialize cl object "DFID": %d\n", + PFID(fid), result); + return result; +} + +void cl_inode_fini(struct inode *inode) +{ + struct lu_env *env; + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *clob = lli->lli_clob; + int refcheck; + int emergency; + + if (clob != NULL) { + struct lu_object_header *head = clob->co_lu.lo_header; + void *cookie; + + cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + emergency = IS_ERR(env); + if (emergency) { + mutex_lock(&ccc_inode_fini_guard); + LASSERT(ccc_inode_fini_env != NULL); + cl_env_implant(ccc_inode_fini_env, &refcheck); + env = ccc_inode_fini_env; + } + /* + * cl_object cache is a slave to inode cache (which, in turn + * is a slave to dentry cache), don't keep cl_object in memory + * when its master is evicted. + */ + cl_object_kill(env, clob); + lu_object_ref_del(&clob->co_lu, "inode", inode); + /* XXX temporary: this is racy */ + LASSERT(atomic_read(&head->loh_ref) == 1); + cl_object_put(env, clob); + lli->lli_clob = NULL; + if (emergency) { + cl_env_unplant(ccc_inode_fini_env, &refcheck); + mutex_unlock(&ccc_inode_fini_guard); + } else + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + } +} diff --git a/lustre/ldlm/interval_tree.c b/lustre/ldlm/interval_tree.c index 68480bb..60dcbeb 100644 --- a/lustre/ldlm/interval_tree.c +++ b/lustre/ldlm/interval_tree.c @@ -389,6 +389,7 @@ struct interval_node *interval_insert(struct interval_node *node, struct interval_node **p, *parent = NULL; ENTRY; + LASSERT(!interval_is_intree(node)); p = root; while (*p) { parent = *p; @@ -412,6 +413,7 @@ struct interval_node *interval_insert(struct interval_node *node, *p = node; interval_insert_color(node, root); + node->in_intree = 1; RETURN(NULL); } @@ -527,6 +529,8 @@ void interval_erase(struct interval_node *node, int color; ENTRY; + LASSERT(interval_is_intree(node)); + node->in_intree = 0; if (!node->in_left) { child = node->in_right; } else if (!node->in_right) { diff --git a/lustre/ldlm/ldlm_extent.c b/lustre/ldlm/ldlm_extent.c index 1541416..0858207 100644 --- a/lustre/ldlm/ldlm_extent.c +++ b/lustre/ldlm/ldlm_extent.c @@ -49,6 +49,7 @@ #include #include #include +#include #include #include "ldlm_internal.h" @@ -64,7 +65,7 @@ static void ldlm_extent_internal_policy_fixup(struct ldlm_lock *req, __u64 req_start = req->l_req_extent.start; __u64 req_end = req->l_req_extent.end; __u64 req_align, mask; - + if (conflicting > 32 && (req_mode == LCK_PW || req_mode == LCK_CW)) { if (req_end < req_start + LDLM_MAX_GROWN_EXTENT) new_ex->end = min(req_start + LDLM_MAX_GROWN_EXTENT, @@ -707,16 +708,31 @@ int ldlm_process_extent_lock(struct ldlm_lock *lock, int *flags, int first_enq, ldlm_resource_add_lock(res, &res->lr_waiting, lock); unlock_res(res); rc = ldlm_run_ast_work(&rpc_list, LDLM_WORK_BL_AST); - lock_res(res); + if (OBD_FAIL_CHECK(OBD_FAIL_LDLM_OST_FAIL_RACE) && + !ns_is_client(res->lr_namespace)) + class_fail_export(lock->l_export); + + lock_res(res); if (rc == -ERESTART) { + + /* 15715: The lock was granted and destroyed after + * resource lock was dropped. Interval node was freed + * in ldlm_lock_destroy. Anyway, this always happens + * when a client is being evicted. So it would be + * ok to return an error. 
-jay */ + if (lock->l_destroyed) { + *err = -EAGAIN; + GOTO(out, rc = -EAGAIN); + } + /* lock was granted while resource was unlocked. */ if (lock->l_granted_mode == lock->l_req_mode) { /* bug 11300: if the lock has been granted, * break earlier because otherwise, we will go * to restart and ldlm_resource_unlink will be * called and it causes the interval node to be - * freed. Then we will fail at + * freed. Then we will fail at * ldlm_extent_add_lock() */ *flags &= ~(LDLM_FL_BLOCK_GRANTED | LDLM_FL_BLOCK_CONV | LDLM_FL_BLOCK_WAIT); @@ -798,6 +814,7 @@ void ldlm_interval_free(struct ldlm_interval *node) { if (node) { LASSERT(list_empty(&node->li_group)); + LASSERT(!interval_is_intree(&node->li_node)); OBD_SLAB_FREE(node, ldlm_interval_slab, sizeof(*node)); } } @@ -850,6 +867,7 @@ void ldlm_extent_add_lock(struct ldlm_resource *res, node = lock->l_tree_node; LASSERT(node != NULL); + LASSERT(!interval_is_intree(&node->li_node)); idx = lock_mode_to_index(lock->l_granted_mode); LASSERT(lock->l_granted_mode == 1 << idx); @@ -877,14 +895,13 @@ void ldlm_extent_add_lock(struct ldlm_resource *res, void ldlm_extent_unlink_lock(struct ldlm_lock *lock) { struct ldlm_resource *res = lock->l_resource; - struct ldlm_interval *node; + struct ldlm_interval *node = lock->l_tree_node; struct ldlm_interval_tree *tree; int idx; - if (lock->l_granted_mode != lock->l_req_mode) + if (!node || !interval_is_intree(&node->li_node)) /* duplicate unlink */ return; - LASSERT(lock->l_tree_node != NULL); idx = lock_mode_to_index(lock->l_granted_mode); LASSERT(lock->l_granted_mode == 1 << idx); tree = &res->lr_itree[idx]; diff --git a/lustre/ldlm/ldlm_flock.c b/lustre/ldlm/ldlm_flock.c index 2e888ff..1089022 100644 --- a/lustre/ldlm/ldlm_flock.c +++ b/lustre/ldlm/ldlm_flock.c @@ -112,7 +112,7 @@ ldlm_flock_destroy(struct ldlm_lock *lock, ldlm_mode_t mode, int flags) /* client side - set a flag to prevent sending a CANCEL */ lock->l_flags |= LDLM_FL_LOCAL_ONLY | LDLM_FL_CBPENDING; - /* when reaching here, it is under lock_res_and_lock(). Thus, + /* when reaching here, it is under lock_res_and_lock(). Thus, need call the nolock version of ldlm_lock_decref_internal*/ ldlm_lock_decref_internal_nolock(lock, mode); } @@ -405,9 +405,8 @@ reprocess: &new2->l_remote_handle, &new2->l_exp_hash); } - if (*flags == LDLM_FL_WAIT_NOREPROC) { + if (*flags == LDLM_FL_WAIT_NOREPROC) ldlm_lock_addref_internal_nolock(new2, lock->l_granted_mode); - } /* insert new2 at lock */ ldlm_resource_add_lock(res, ownlocks, new2); @@ -524,7 +523,7 @@ ldlm_flock_completion_ast(struct ldlm_lock *lock, int flags, void *data) * holding the lock even if app still believes it has it, since * server already dropped it anyway. Only for granted locks too. 
*/ lock_res_and_lock(lock); - if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == + if ((lock->l_flags & (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) == (LDLM_FL_FAILED|LDLM_FL_LOCAL_ONLY)) { unlock_res_and_lock(lock); if (lock->l_req_mode == lock->l_granted_mode && diff --git a/lustre/ldlm/ldlm_inodebits.c b/lustre/ldlm/ldlm_inodebits.c index 07014fb..548ee14 100644 --- a/lustre/ldlm/ldlm_inodebits.c +++ b/lustre/ldlm/ldlm_inodebits.c @@ -86,7 +86,22 @@ ldlm_inodebits_compat_queue(struct list_head *queue, struct ldlm_lock *req, tmp = mode_tail; continue; } - + + if (lock->l_req_mode == LCK_COS) { + if (lock->l_client_cookie == req->l_client_cookie) { + tmp = mode_tail; + } else { + tmp = mode_tail; + if (!work_list) + RETURN(0); + compat = 0; + if (lock->l_blocking_ast) + ldlm_add_ast_work_item(lock, req, + work_list); + } + continue; + } + for (;;) { struct list_head *head; diff --git a/lustre/ldlm/ldlm_internal.h b/lustre/ldlm/ldlm_internal.h index a6eb21c..5332b2f 100644 --- a/lustre/ldlm/ldlm_internal.h +++ b/lustre/ldlm/ldlm_internal.h @@ -45,19 +45,19 @@ extern struct list_head ldlm_cli_namespace_list; static inline atomic_t *ldlm_namespace_nr(ldlm_side_t client) { - return client == LDLM_NAMESPACE_SERVER ? + return client == LDLM_NAMESPACE_SERVER ? &ldlm_srv_namespace_nr : &ldlm_cli_namespace_nr; } static inline struct list_head *ldlm_namespace_list(ldlm_side_t client) { - return client == LDLM_NAMESPACE_SERVER ? + return client == LDLM_NAMESPACE_SERVER ? &ldlm_srv_namespace_list : &ldlm_cli_namespace_list; } static inline struct semaphore *ldlm_namespace_lock(ldlm_side_t client) { - return client == LDLM_NAMESPACE_SERVER ? + return client == LDLM_NAMESPACE_SERVER ? &ldlm_srv_namespace_lock : &ldlm_cli_namespace_lock; } @@ -75,11 +75,11 @@ enum { LDLM_CANCEL_LRUR = 1 << 3 /* Cancel locks from lru resize. 
*/ }; -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, int flags); int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, int count, int max, int cancel_flags, int flags); -int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max, +int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max, int flags); extern int ldlm_enqueue_min; int ldlm_get_enq_timeout(struct ldlm_lock *lock); @@ -107,7 +107,7 @@ typedef enum { LDLM_WORK_BL_AST, LDLM_WORK_CP_AST, LDLM_WORK_REVOKE_AST -} ldlm_desc_ast_t; +} ldlm_desc_ast_t; void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list); struct ldlm_lock * diff --git a/lustre/ldlm/ldlm_lib.c b/lustre/ldlm/ldlm_lib.c index e739721..7ac4c93 100644 --- a/lustre/ldlm/ldlm_lib.c +++ b/lustre/ldlm/ldlm_lib.c @@ -51,7 +51,6 @@ #include #include "ldlm_internal.h" - /* @priority: if non-zero, move the selected to the list head * @create: if zero, only search in existed connections */ @@ -222,14 +221,21 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) rq_portal = OST_REQUEST_PORTAL; rp_portal = OSC_REPLY_PORTAL; connect_op = OST_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_OST; } else if (!strcmp(name, LUSTRE_MDC_NAME)) { rq_portal = MDS_REQUEST_PORTAL; rp_portal = MDC_REPLY_PORTAL; connect_op = MDS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_CLI; + cli->cl_sp_to = LUSTRE_SP_MDT; } else if (!strcmp(name, LUSTRE_MGC_NAME)) { rq_portal = MGS_REQUEST_PORTAL; rp_portal = MGC_REPLY_PORTAL; connect_op = MGS_CONNECT; + cli->cl_sp_me = LUSTRE_SP_MGC; + cli->cl_sp_to = LUSTRE_SP_MGS; + cli->cl_flvr_mgc.sf_rpc = SPTLRPC_FLVR_INVALID; } else { CERROR("unknown client OBD type \"%s\", can't setup\n", name); @@ -258,8 +264,6 @@ int client_obd_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) init_rwsem(&cli->cl_sem); sema_init(&cli->cl_mgc_sem, 1); - sptlrpc_rule_set_init(&cli->cl_sptlrpc_rset); - cli->cl_sec_part = LUSTRE_SP_ANY; cli->cl_conn_count = 0; memcpy(server_uuid.uuid, lustre_cfg_buf(lcfg, 2), min_t(unsigned int, LUSTRE_CFG_BUFLEN(lcfg, 2), @@ -375,7 +379,6 @@ err: int client_obd_cleanup(struct obd_device *obddev) { ENTRY; - sptlrpc_rule_set_free(&obddev->u.cli.cl_sptlrpc_rset); ldlm_put_ref(); RETURN(0); } @@ -518,7 +521,10 @@ int client_disconnect_export(struct obd_export *exp) * in obd_namespace. bug 14260 */ obd->obd_namespace = NULL; - ptlrpc_free_rq_pool(imp->imp_rq_pool); + if (imp->imp_rq_pool) { + ptlrpc_free_rq_pool(imp->imp_rq_pool); + imp->imp_rq_pool = NULL; + } destroy_import(imp); cli->cl_import = NULL; @@ -907,16 +913,11 @@ dont_check_exports: export->exp_connection = ptlrpc_connection_get(req->rq_peer, req->rq_self, &remote_uuid); - - spin_lock(&target->obd_dev_lock); - - /* Export might be hashed already, e.g. 
if this is reconnect */ - if (hlist_unhashed(&export->exp_nid_hash)) - lustre_hash_add(export->exp_obd->obd_nid_hash, - &export->exp_connection->c_peer.nid, - &export->exp_nid_hash); - - spin_unlock(&target->obd_dev_lock); + if (hlist_unhashed(&export->exp_nid_hash)) { + lustre_hash_add_unique(export->exp_obd->obd_nid_hash, + &export->exp_connection->c_peer.nid, + &export->exp_nid_hash); + } spin_lock_bh(&target->obd_processing_task_lock); if (target->obd_recovering && !export->exp_in_recovery) { @@ -1101,7 +1102,7 @@ void ptlrpc_free_clone(struct ptlrpc_request *req) ptlrpc_req_drop_rs(req); sptlrpc_svc_ctx_decref(req); class_export_rpc_put(req->rq_export); - list_del(&req->rq_list); + list_del_init(&req->rq_list); if (req->rq_user_desc) { int ngroups = req->rq_user_desc->pud_ngroups; @@ -1161,6 +1162,21 @@ static void target_finish_recovery(struct obd_device *obd) obd->obd_name); ldlm_reprocess_all_ns(obd->obd_namespace); + spin_lock_bh(&obd->obd_processing_task_lock); + if (list_empty(&obd->obd_req_replay_queue) && + list_empty(&obd->obd_lock_replay_queue) && + list_empty(&obd->obd_final_req_queue)) { + obd->obd_processing_task = 0; + } else { + CERROR("%s: Recovery queues ( %s%s%s) are not empty\n", + obd->obd_name, + list_empty(&obd->obd_req_replay_queue) ? "" : "req ", + list_empty(&obd->obd_lock_replay_queue) ? "" : "lock ", + list_empty(&obd->obd_final_req_queue) ? "" : "final "); + spin_unlock_bh(&obd->obd_processing_task_lock); + LBUG(); + } + spin_unlock_bh(&obd->obd_processing_task_lock); /* when recovery finished, cleanup orphans on mds and ost */ if (OBT(obd) && OBP(obd, postrecov)) { @@ -1176,8 +1192,13 @@ static void target_finish_recovery(struct obd_device *obd) static void abort_req_replay_queue(struct obd_device *obd) { struct ptlrpc_request *req, *n; + struct list_head abort_list; - list_for_each_entry_safe(req, n, &obd->obd_req_replay_queue, rq_list) { + CFS_INIT_LIST_HEAD(&abort_list); + spin_lock_bh(&obd->obd_processing_task_lock); + list_splice_init(&obd->obd_req_replay_queue, &abort_list); + spin_unlock_bh(&obd->obd_processing_task_lock); + list_for_each_entry_safe(req, n, &abort_list, rq_list) { DEBUG_REQ(D_WARNING, req, "aborted:"); req->rq_status = -ENOTCONN; if (ptlrpc_error(req)) { @@ -1192,7 +1213,12 @@ static void abort_req_replay_queue(struct obd_device *obd) static void abort_lock_replay_queue(struct obd_device *obd) { struct ptlrpc_request *req, *n; + struct list_head abort_list; + CFS_INIT_LIST_HEAD(&abort_list); + spin_lock_bh(&obd->obd_processing_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &abort_list); + spin_unlock_bh(&obd->obd_processing_task_lock); - list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){ + list_for_each_entry_safe(req, n, &abort_list, rq_list) { DEBUG_REQ(D_ERROR, req, "aborted:"); req->rq_status = -ENOTCONN; @@ -1217,10 +1243,12 @@ static void abort_lock_replay_queue(struct obd_device *obd) void target_cleanup_recovery(struct obd_device *obd) { struct ptlrpc_request *req, *n; + struct list_head clean_list; ENTRY; LASSERT(obd->obd_stopping); + CFS_INIT_LIST_HEAD(&clean_list); spin_lock_bh(&obd->obd_processing_task_lock); if (!obd->obd_recovering) { spin_unlock_bh(&obd->obd_processing_task_lock); @@ -1229,19 +1257,23 @@ void target_cleanup_recovery(struct obd_device *obd) } obd->obd_recovering = obd->obd_abort_recovery = 0; target_cancel_recovery_timer(obd); + + list_splice_init(&obd->obd_req_replay_queue, &clean_list); spin_unlock_bh(&obd->obd_processing_task_lock); - list_for_each_entry_safe(req, n, &obd->obd_req_replay_queue, rq_list) { - LASSERT
(req->rq_reply_state == 0); + list_for_each_entry_safe(req, n, &clean_list, rq_list) { + LASSERT(req->rq_reply_state == 0); target_exp_dequeue_req_replay(req); ptlrpc_free_clone(req); } - list_for_each_entry_safe(req, n, &obd->obd_lock_replay_queue, rq_list){ - LASSERT (req->rq_reply_state == 0); - ptlrpc_free_clone(req); - } - list_for_each_entry_safe(req, n, &obd->obd_final_req_queue, rq_list) { - LASSERT (req->rq_reply_state == 0); + + spin_lock_bh(&obd->obd_processing_task_lock); + list_splice_init(&obd->obd_lock_replay_queue, &clean_list); + list_splice_init(&obd->obd_final_req_queue, &clean_list); + spin_unlock_bh(&obd->obd_processing_task_lock); + + list_for_each_entry_safe(req, n, &clean_list, rq_list){ + LASSERT(req->rq_reply_state == 0); ptlrpc_free_clone(req); } @@ -1329,18 +1361,27 @@ target_start_and_reset_recovery_timer(struct obd_device *obd, struct ptlrpc_request *req, int new_client) { - int req_timeout = lustre_msg_get_timeout(req->rq_reqmsg); + int service_time = lustre_msg_get_service_time(req->rq_reqmsg); - /* teach server about old server's estimates */ - if (!new_client) + if (!new_client && service_time) + /* Teach server about old server's estimates, as first guess + * at how long new requests will take. */ at_add(&req->rq_rqbd->rqbd_service->srv_at_estimate, - at_timeout2est(req_timeout)); + service_time); check_and_start_recovery_timer(obd); - req_timeout *= OBD_RECOVERY_FACTOR; - if (req_timeout > obd->obd_recovery_timeout && !new_client) - reset_recovery_timer(obd, req_timeout, 0); + /* convert the service time to rpc timeout, + * reuse service_time to limit stack usage */ + service_time = at_est2timeout(service_time); + + /* We expect other clients to timeout within service_time, then try + * to reconnect, then try the failover server. The max delay between + * connect attempts is SWITCH_MAX + SWITCH_INC + INITIAL */ + service_time += 2 * (CONNECTION_SWITCH_MAX + CONNECTION_SWITCH_INC + + INITIAL_CONNECT_TIMEOUT); + if (service_time > obd->obd_recovery_timeout && !new_client) + reset_recovery_timer(obd, service_time, 0); } #ifdef __KERNEL__ @@ -1393,6 +1434,11 @@ static int check_for_next_transno(struct obd_device *obd) next_transno, queue_len, completed, connected, req_transno); obd->obd_next_recovery_transno = req_transno; wake_up = 1; + } else if (OBD_FAIL_CHECK(OBD_FAIL_MDS_RECOVERY_ACCEPTS_GAPS)) { + CDEBUG(D_HA, "accepting transno gaps is explicitly allowed" + " by fail_lock, waking up ("LPD64")\n", next_transno); + obd->obd_next_recovery_transno = req_transno; + wake_up = 1; } else if (queue_len == atomic_read(&obd->obd_req_replay_clients)) { /* some clients haven't connected in time, but we can try * to replay requests that demand on already committed ones @@ -1558,7 +1604,7 @@ static int handle_recovery_req(struct ptlrpc_thread *thread, if (!req_replay_done(req->rq_export) || !lock_replay_done(req->rq_export)) reset_recovery_timer(class_exp2obd(req->rq_export), - OBD_RECOVERY_FACTOR * AT_OFF ? obd_timeout : + AT_OFF ? 
obd_timeout : at_get(&req->rq_rqbd->rqbd_service->srv_at_estimate), 1); ptlrpc_free_clone(req); RETURN(0); @@ -1580,7 +1626,6 @@ static int target_recovery_thread(void *arg) unsigned long flags; struct lu_env env; struct ptlrpc_thread fake_svc_thread, *thread = &fake_svc_thread; - __u32 recov_ctx_tags = LCT_MD_THREAD; int rc = 0; ENTRY; @@ -1591,7 +1636,7 @@ static int target_recovery_thread(void *arg) RECALC_SIGPENDING; SIGNAL_MASK_UNLOCK(current, flags); - rc = lu_context_init(&env.le_ctx, recov_ctx_tags); + rc = lu_context_init(&env.le_ctx, LCT_MD_THREAD); if (rc) RETURN(rc); @@ -2044,15 +2089,19 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) struct obd_device *obd; struct obd_export *exp; struct ptlrpc_service *svc; + ENTRY; - if (req->rq_no_reply) + if (req->rq_no_reply) { + EXIT; return; + } svc = req->rq_rqbd->rqbd_service; rs = req->rq_reply_state; if (rs == NULL || !rs->rs_difficult) { /* no notifiers */ target_send_reply_msg (req, rc, fail_id); + EXIT; return; } @@ -2082,6 +2131,8 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) spin_lock(&obd->obd_uncommitted_replies_lock); + CDEBUG(D_NET, "rs transno = "LPU64", last committed = "LPU64"\n", + rs->rs_transno, obd->obd_last_committed); if (rs->rs_transno > obd->obd_last_committed) { /* not committed already */ list_add_tail (&rs->rs_obd_list, @@ -2112,9 +2163,11 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) atomic_inc (&svc->srv_outstanding_replies); } - if (!rs->rs_on_net || /* some notifier */ - list_empty(&rs->rs_exp_list) || /* completed already */ - list_empty(&rs->rs_obd_list)) { + if (rs->rs_transno <= obd->obd_last_committed || + (!rs->rs_on_net && !rs->rs_no_ack) || + list_empty(&rs->rs_exp_list) || /* completed already */ + list_empty(&rs->rs_obd_list)) { + CDEBUG(D_HA, "Schedule reply immediately\n"); list_add_tail (&rs->rs_list, &svc->srv_reply_queue); cfs_waitq_signal (&svc->srv_waitq); } else { @@ -2123,6 +2176,7 @@ void target_send_reply(struct ptlrpc_request *req, int rc, int fail_id) } spin_unlock(&svc->srv_lock); + EXIT; } int target_handle_ping(struct ptlrpc_request *req) @@ -2155,33 +2209,37 @@ void target_committed_to_req(struct ptlrpc_request *req) EXPORT_SYMBOL(target_committed_to_req); -#ifdef HAVE_QUOTA_SUPPORT int target_handle_qc_callback(struct ptlrpc_request *req) { struct obd_quotactl *oqctl; struct client_obd *cli = &req->rq_export->exp_obd->u.cli; oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); - if (oqctl == NULL) + if (oqctl == NULL) { + CERROR("Can't unpack obd_quotactl\n"); RETURN(-EPROTO); + } cli->cl_qchk_stat = oqctl->qc_stat; return 0; } +#ifdef HAVE_QUOTA_SUPPORT int target_handle_dqacq_callback(struct ptlrpc_request *req) { #ifdef __KERNEL__ struct obd_device *obd = req->rq_export->exp_obd; struct obd_device *master_obd; + struct obd_device_target *obt; struct lustre_quota_ctxt *qctxt; - struct qunit_data *qdata; - void* rep; - struct qunit_data_old *qdata_old; + struct qunit_data *qdata = NULL; int rc = 0; ENTRY; + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_DROP_QUOTA_REQ)) + RETURN(rc); + rc = req_capsule_server_pack(&req->rq_pill); if (rc) { CERROR("packing reply failed!: rc = %d\n", rc); @@ -2190,52 +2248,73 @@ int target_handle_dqacq_callback(struct ptlrpc_request *req) LASSERT(req->rq_export); - /* fixed for bug10707 */ - if ((req->rq_export->exp_connect_flags & OBD_CONNECT_QUOTA64) && - !OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) { - CDEBUG(D_QUOTA, "qd_count is 64bit!\n"); - rep = 
req_capsule_server_get(&req->rq_pill, - &RMF_QUNIT_DATA); - LASSERT(rep); - qdata = req_capsule_client_swab_get(&req->rq_pill, - &RMF_QUNIT_DATA, - (void*)lustre_swab_qdata); - } else { - CDEBUG(D_QUOTA, "qd_count is 32bit!\n"); - rep = req_capsule_server_get(&req->rq_pill, &RMF_QUNIT_DATA); - LASSERT(rep); - qdata_old = req_capsule_client_swab_get(&req->rq_pill, - &RMF_QUNIT_DATA, - (void*)lustre_swab_qdata_old); - qdata = lustre_quota_old_to_new(qdata_old); + OBD_ALLOC(qdata, sizeof(struct qunit_data)); + if (!qdata) + RETURN(-ENOMEM); + rc = quota_get_qdata(req, qdata, QUOTA_REQUEST, QUOTA_EXPORT); + if (rc < 0) { + CDEBUG(D_ERROR, "Can't unpack qunit_data(rc: %d)\n", rc); + GOTO(out, rc); } - if (qdata == NULL) - RETURN(-EPROTO); - /* we use the observer */ - LASSERT(obd->obd_observer && obd->obd_observer->obd_observer); + if (!obd->obd_observer || !obd->obd_observer->obd_observer) { + CERROR("Can't find the observer, it is recovering\n"); + req->rq_status = -EAGAIN; + GOTO(send_reply, rc = -EAGAIN); + } + master_obd = obd->obd_observer->obd_observer; - qctxt = &master_obd->u.obt.obt_qctxt; + obt = &master_obd->u.obt; + qctxt = &obt->obt_qctxt; + + if (!qctxt->lqc_setup || !qctxt->lqc_valid) { + /* quota_type has not been processed yet, return EAGAIN + * until we know whether or not quotas are supposed to + * be enabled */ + CDEBUG(D_QUOTA, "quota_type not processed yet, return " + "-EAGAIN\n"); + req->rq_status = -EAGAIN; + rc = ptlrpc_reply(req); + GOTO(out, rc); + } + + down_read(&obt->obt_rwsem); + if (qctxt->lqc_lqs_hash == NULL) { + up_read(&obt->obt_rwsem); + /* quota_type has not been processed yet, return EAGAIN + * until we know whether or not quotas are supposed to + * be enabled */ + CDEBUG(D_QUOTA, "quota_ctxt is not ready yet, return " + "-EAGAIN\n"); + req->rq_status = -EAGAIN; + rc = ptlrpc_reply(req); + GOTO(out, rc); + } LASSERT(qctxt->lqc_handler); rc = qctxt->lqc_handler(master_obd, qdata, lustre_msg_get_opc(req->rq_reqmsg)); + up_read(&obt->obt_rwsem); if (rc && rc != -EDQUOT) CDEBUG(rc == -EBUSY ? D_QUOTA : D_ERROR, "dqacq failed! (rc:%d)\n", rc); + req->rq_status = rc; - /* the qd_count might be changed in lqc_handler */ - if ((req->rq_export->exp_connect_flags & OBD_CONNECT_QUOTA64) && - !OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) { - memcpy(rep, qdata, sizeof(*qdata)); - } else { - qdata_old = lustre_quota_new_to_old(qdata); - memcpy(rep, qdata_old, sizeof(*qdata_old)); + /* there are three forms of qunit (for historical reasons), so we need + * to convert the data to whichever form each slave expects */ + rc = quota_copy_qdata(req, qdata, QUOTA_REPLY, QUOTA_EXPORT); + if (rc < 0) { + CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc); + GOTO(out, rc); } - req->rq_status = rc; - rc = ptlrpc_reply(req); + /* Block the quota req.
b=14840 */ + OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_BLOCK_QUOTA_REQ, obd_timeout); +send_reply: + rc = ptlrpc_reply(req); +out: + OBD_FREE(qdata, sizeof(struct qunit_data)); RETURN(rc); #else return 0; @@ -2250,7 +2329,8 @@ ldlm_mode_t lck_compat_array[] = { [LCK_CW] LCK_COMPAT_CW, [LCK_CR] LCK_COMPAT_CR, [LCK_NL] LCK_COMPAT_NL, - [LCK_GROUP] LCK_COMPAT_GROUP + [LCK_GROUP] LCK_COMPAT_GROUP, + [LCK_COS] LCK_COMPAT_COS, }; /** diff --git a/lustre/ldlm/ldlm_lock.c b/lustre/ldlm/ldlm_lock.c index 2b8c9bf..97d680c 100644 --- a/lustre/ldlm/ldlm_lock.c +++ b/lustre/ldlm/ldlm_lock.c @@ -62,7 +62,8 @@ char *ldlm_lockname[] = { [LCK_CW] "CW", [LCK_CR] "CR", [LCK_NL] "NL", - [LCK_GROUP] "GROUP" + [LCK_GROUP] "GROUP", + [LCK_COS] "COS" }; char *ldlm_typename[] = { @@ -267,7 +268,8 @@ int ldlm_lock_destroy_internal(struct ldlm_lock *lock) } lock->l_destroyed = 1; - if (lock->l_export && lock->l_export->exp_lock_hash) + if (lock->l_export && lock->l_export->exp_lock_hash && + !hlist_unhashed(&lock->l_exp_hash)) lustre_hash_del(lock->l_export->exp_lock_hash, &lock->l_remote_handle, &lock->l_exp_hash); @@ -367,6 +369,7 @@ static struct ldlm_lock *ldlm_lock_new(struct ldlm_resource *resource) CFS_INIT_LIST_HEAD(&lock->l_cache_locks_list); lu_ref_init(&lock->l_reference); lu_ref_add(&lock->l_reference, "hash", lock); + lock->l_callback_timeout = 0; RETURN(lock); } @@ -413,7 +416,7 @@ int ldlm_lock_change_resource(struct ldlm_namespace *ns, struct ldlm_lock *lock, lock_res(oldres); lock_res_nested(newres, LRT_NEW); } else { - lock_res(newres); + lock_res(newres); lock_res_nested(oldres, LRT_NEW); } LASSERT(memcmp(new_resid, &oldres->lr_name, @@ -592,7 +595,7 @@ void ldlm_lock_addref_internal_nolock(struct ldlm_lock *lock, __u32 mode) lock->l_readers++; lu_ref_add_atomic(&lock->l_reference, "reader", lock); } - if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP)) { + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { lock->l_writers++; lu_ref_add_atomic(&lock->l_reference, "writer", lock); } @@ -618,7 +621,8 @@ int ldlm_lock_addref_try(struct lustre_handle *lockh, __u32 mode) lock = ldlm_handle2lock(lockh); if (lock != NULL) { lock_res_and_lock(lock); - if (!(lock->l_flags & LDLM_FL_CBPENDING)) { + if (lock->l_readers != 0 || lock->l_writers != 0 || + !(lock->l_flags & LDLM_FL_CBPENDING)) { ldlm_lock_addref_internal_nolock(lock, mode); result = 0; } @@ -648,7 +652,7 @@ void ldlm_lock_decref_internal_nolock(struct ldlm_lock *lock, __u32 mode) lu_ref_del(&lock->l_reference, "reader", lock); lock->l_readers--; } - if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP)) { + if (mode & (LCK_EX | LCK_CW | LCK_PW | LCK_GROUP | LCK_COS)) { LASSERT(lock->l_writers > 0); lu_ref_del(&lock->l_reference, "writer", lock); lock->l_writers--; @@ -915,7 +919,8 @@ void ldlm_grant_lock(struct ldlm_lock *lock, struct list_head *work_list) static struct ldlm_lock *search_queue(struct list_head *queue, ldlm_mode_t *mode, ldlm_policy_data_t *policy, - struct ldlm_lock *old_lock, int flags) + struct ldlm_lock *old_lock, + int flags, int unref) { struct ldlm_lock *lock; struct list_head *tmp; @@ -937,7 +942,7 @@ static struct ldlm_lock *search_queue(struct list_head *queue, if (lock->l_flags & LDLM_FL_CBPENDING && !(flags & LDLM_FL_CBPENDING)) continue; - if (lock->l_flags & LDLM_FL_CBPENDING && + if (!unref && lock->l_flags & LDLM_FL_CBPENDING && lock->l_readers == 0 && lock->l_writers == 0) continue; @@ -964,7 +969,8 @@ static struct ldlm_lock *search_queue(struct list_head *queue, policy->l_inodebits.bits)) continue; - if 
(lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED)) + if (!unref && + (lock->l_destroyed || (lock->l_flags & LDLM_FL_FAILED))) continue; if ((flags & LDLM_FL_LOCAL_ONLY) && @@ -992,88 +998,6 @@ void ldlm_lock_allow_match(struct ldlm_lock *lock) unlock_res_and_lock(lock); } -/** - * Checks if requested extent lock is compatible with another owned lock. - * - * Checks if \a lock is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. - * - * \param lock the already owned lock - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param start start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * - * \post result == 1, *cookie == context, appropriate lock is referenced - * - * \retval 1 owned lock is reused for the request - * \retval 0 no lock reused for the request - * - * \see ldlm_lock_fast_release - */ -int ldlm_lock_fast_match(struct ldlm_lock *lock, int rw, - obd_off start, obd_off end, - void **cookie) -{ - LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE); - - if (!lock) - return 0; - - lock_res_and_lock(lock); - /* check if granted mode is compatible */ - if (rw == OBD_BRW_WRITE && - !(lock->l_granted_mode & (LCK_PW|LCK_GROUP))) - goto no_match; - - /* does the lock cover the region we would like to access? */ - if ((lock->l_policy_data.l_extent.start > start) || - (lock->l_policy_data.l_extent.end < end)) - goto no_match; - - /* if we received a blocking callback and the lock is no longer - * referenced, don't use it */ - if ((lock->l_flags & LDLM_FL_CBPENDING) && - !lock->l_writers && !lock->l_readers) - goto no_match; - - ldlm_lock_addref_internal_nolock(lock, rw == OBD_BRW_WRITE ? - LCK_PW : LCK_PR); - unlock_res_and_lock(lock); - *cookie = (void *)lock; - return 1; /* avoid using rc for stack relief */ - -no_match: - unlock_res_and_lock(lock); - return 0; -} - -/** - * Releases a reference to a lock taken in a "fast" way. - * - * Releases a read or write (specified by \a rw) lock - * referenced by \a cookie. - * - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * - * \post appropriate lock is dereferenced - * - * \see ldlm_lock_fast_lock - */ -void ldlm_lock_fast_release(void *cookie, int rw) -{ - struct ldlm_lock *lock = (struct ldlm_lock *)cookie; - - LASSERT(lock != NULL); - LASSERT(rw == OBD_BRW_READ || rw == OBD_BRW_WRITE); - LASSERT(rw == OBD_BRW_READ || - (lock->l_granted_mode & (LCK_PW | LCK_GROUP))); - ldlm_lock_decref_internal(lock, rw == OBD_BRW_WRITE ? 
LCK_PW : LCK_PR); -} - /* Can be called in two ways: * * If 'ns' is NULL, then lockh describes an existing lock that we want to look @@ -1101,7 +1025,7 @@ void ldlm_lock_fast_release(void *cookie, int rw) ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags, const struct ldlm_res_id *res_id, ldlm_type_t type, ldlm_policy_data_t *policy, ldlm_mode_t mode, - struct lustre_handle *lockh) + struct lustre_handle *lockh, int unref) { struct ldlm_resource *res; struct ldlm_lock *lock, *old_lock = NULL; @@ -1127,15 +1051,18 @@ ldlm_mode_t ldlm_lock_match(struct ldlm_namespace *ns, int flags, LDLM_RESOURCE_ADDREF(res); lock_res(res); - lock = search_queue(&res->lr_granted, &mode, policy, old_lock, flags); + lock = search_queue(&res->lr_granted, &mode, policy, old_lock, + flags, unref); if (lock != NULL) GOTO(out, rc = 1); if (flags & LDLM_FL_BLOCK_GRANTED) GOTO(out, rc = 0); - lock = search_queue(&res->lr_converting, &mode, policy, old_lock,flags); + lock = search_queue(&res->lr_converting, &mode, policy, old_lock, + flags, unref); if (lock != NULL) GOTO(out, rc = 1); - lock = search_queue(&res->lr_waiting, &mode, policy, old_lock, flags); + lock = search_queue(&res->lr_waiting, &mode, policy, old_lock, + flags, unref); if (lock != NULL) GOTO(out, rc = 1); @@ -1447,10 +1374,10 @@ ldlm_work_bl_ast_lock(struct list_head *tmp, struct ldlm_cb_set_arg *arg) ldlm_lock2desc(lock->l_blocking_lock, &d); - LDLM_LOCK_RELEASE(lock->l_blocking_lock); - lock->l_blocking_lock = NULL; lock->l_blocking_ast(lock, &d, (void *)arg, LDLM_CB_BLOCKING); + LDLM_LOCK_RELEASE(lock->l_blocking_lock); + lock->l_blocking_lock = NULL; LDLM_LOCK_RELEASE(lock); RETURN(1); @@ -1739,6 +1666,40 @@ void ldlm_cancel_locks_for_export(struct obd_export *exp) ldlm_cancel_locks_for_export_cb, exp); } +/** + * Downgrade an exclusive lock. + * + * A fast variant of ldlm_lock_convert for conversion of exclusive + * locks. The conversion is always successful. + * + * \param lock A lock to convert + * \param new_mode new lock mode + */ +void ldlm_lock_downgrade(struct ldlm_lock *lock, int new_mode) +{ + struct ldlm_namespace *ns; + ENTRY; + + LASSERT(lock->l_granted_mode & (LCK_PW | LCK_EX)); + LASSERT(new_mode == LCK_COS); + + lock_res_and_lock(lock); + ldlm_resource_unlink_lock(lock); + /* + * Remove the lock from the pool as it will be added again in + * ldlm_grant_lock() called below. + */ + ns = lock->l_resource->lr_namespace; + ldlm_pool_del(&ns->ns_pool, lock); + + lock->l_req_mode = new_mode; + ldlm_grant_lock(lock, NULL); + unlock_res_and_lock(lock); + ldlm_reprocess_all(lock->l_resource); + + EXIT; +} + struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, __u32 *flags) { @@ -1763,7 +1724,7 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, if (node == NULL) /* Actually, this causes EDEADLOCK to be returned */ RETURN(NULL); - LASSERTF(new_mode == LCK_PW && lock->l_granted_mode == LCK_PR, + LASSERTF((new_mode == LCK_PW && lock->l_granted_mode == LCK_PR), "new_mode %u, granted %u\n", new_mode, lock->l_granted_mode); lock_res_and_lock(lock); @@ -1792,6 +1753,12 @@ struct ldlm_resource *ldlm_lock_convert(struct ldlm_lock *lock, int new_mode, node = NULL; } } + + /* + * Remove the old lock from the pool before adding the lock with the + * new mode below in ->policy() + */ + ldlm_pool_del(&ns->ns_pool, lock); /* If this is a local resource, put it on the appropriate list.
*/ if (ns_is_client(res->lr_namespace)) { @@ -1922,7 +1889,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, data->msg_fn, data->msg_line, fmt, args, " ns: \?\? lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " "res: \?\? rrc=\?\? type: \?\?\? flags: %x remote: " - LPX64" expref: %d pid: %u\n", lock, + LPX64" expref: %d pid: %u timeout: %lu\n", lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, ldlm_lockname[lock->l_granted_mode], @@ -1930,7 +1897,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, lock->l_flags, lock->l_remote_handle.cookie, lock->l_export ? atomic_read(&lock->l_export->exp_refcount) : -99, - lock->l_pid); + lock->l_pid, lock->l_callback_timeout); va_end(args); return; } @@ -1942,7 +1909,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " "res: "LPU64"/"LPU64" rrc: %d type: %s ["LPU64"->"LPU64 "] (req "LPU64"->"LPU64") flags: %x remote: "LPX64 - " expref: %d pid: %u\n", + " expref: %d pid: %u timeout: %lu\n", lock->l_resource->lr_namespace->ns_name, lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -1958,7 +1925,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, lock->l_flags, lock->l_remote_handle.cookie, lock->l_export ? atomic_read(&lock->l_export->exp_refcount) : -99, - lock->l_pid); + lock->l_pid, lock->l_callback_timeout); break; case LDLM_FLOCK: @@ -1967,7 +1934,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " "res: "LPU64"/"LPU64" rrc: %d type: %s pid: %d " "["LPU64"->"LPU64"] flags: %x remote: "LPX64 - " expref: %d pid: %u\n", + " expref: %d pid: %u timeout: %lu\n", lock->l_resource->lr_namespace->ns_name, lock, lock->l_handle.h_cookie, atomic_read(&lock->l_refc), lock->l_readers, lock->l_writers, @@ -1983,7 +1950,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, lock->l_flags, lock->l_remote_handle.cookie, lock->l_export ? atomic_read(&lock->l_export->exp_refcount) : -99, - lock->l_pid); + lock->l_pid, lock->l_callback_timeout); break; case LDLM_IBITS: @@ -1992,7 +1959,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " "res: "LPU64"/"LPU64" bits "LPX64" rrc: %d type: %s " "flags: %x remote: "LPX64" expref: %d " - "pid %u\n", + "pid: %u timeout: %lu\n", lock->l_resource->lr_namespace->ns_name, lock, lock->l_handle.h_cookie, atomic_read (&lock->l_refc), @@ -2007,7 +1974,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, lock->l_flags, lock->l_remote_handle.cookie, lock->l_export ? atomic_read(&lock->l_export->exp_refcount) : -99, - lock->l_pid); + lock->l_pid, lock->l_callback_timeout); break; default: @@ -2015,7 +1982,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, data->msg_fn, data->msg_line, fmt, args, " ns: %s lock: %p/"LPX64" lrc: %d/%d,%d mode: %s/%s " "res: "LPU64"/"LPU64" rrc: %d type: %s flags: %x " - "remote: "LPX64" expref: %d pid: %u\n", + "remote: "LPX64" expref: %d pid: %u timeout: %lu\n", lock->l_resource->lr_namespace->ns_name, lock, lock->l_handle.h_cookie, atomic_read (&lock->l_refc), @@ -2029,7 +1996,7 @@ void _ldlm_lock_debug(struct ldlm_lock *lock, __u32 level, lock->l_flags, lock->l_remote_handle.cookie, lock->l_export ?
atomic_read(&lock->l_export->exp_refcount) : -99, - lock->l_pid); + lock->l_pid, lock->l_callback_timeout); break; } va_end(args); diff --git a/lustre/ldlm/ldlm_lockd.c b/lustre/ldlm/ldlm_lockd.c index ee466e8..9d6bb38 100644 --- a/lustre/ldlm/ldlm_lockd.c +++ b/lustre/ldlm/ldlm_lockd.c @@ -238,6 +238,31 @@ static int expired_lock_main(void *arg) static int ldlm_add_waiting_lock(struct ldlm_lock *lock); +/** + * Check if there is a request in the export request list + * which prevents the lock from being cancelled. + */ +static int ldlm_lock_busy(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + int match = 0; + ENTRY; + + if (lock->l_export == NULL) + return 0; + + spin_lock(&lock->l_export->exp_lock); + list_for_each_entry(req, &lock->l_export->exp_queued_rpc, rq_exp_list) { + if (req->rq_ops->hpreq_lock_match) { + match = req->rq_ops->hpreq_lock_match(req, lock); + if (match) + break; + } + } + spin_unlock(&lock->l_export->exp_lock); + RETURN(match); +} + /* This is called from within a timer interrupt and cannot schedule */ static void waiting_locks_callback(unsigned long unused) { @@ -248,7 +273,6 @@ repeat: while (!list_empty(&waiting_locks_list)) { lock = list_entry(waiting_locks_list.next, struct ldlm_lock, l_pending_chain); - if (cfs_time_after(lock->l_callback_timeout, cfs_time_current()) || (lock->l_req_mode == LCK_GROUP)) break; @@ -286,6 +310,29 @@ repeat: goto repeat; } + /* Check if we need to prolong timeout */ + if (!OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT) && + ldlm_lock_busy(lock)) { + int cont = 1; + + if (lock->l_pending_chain.next == &waiting_locks_list) + cont = 0; + + LDLM_LOCK_GET(lock); + spin_unlock_bh(&waiting_locks_spinlock); + LDLM_DEBUG(lock, "prolong the busy lock"); + ldlm_refresh_waiting_lock(lock); + spin_lock_bh(&waiting_locks_spinlock); + + if (!cont) { + LDLM_LOCK_PUT(lock); + break; + } + + LDLM_LOCK_PUT(lock); + continue; + } + lock->l_resource->lr_namespace->ns_timeouts++; LDLM_ERROR(lock, "lock callback timer expired after %lds: " "evicting client at %s ", cfs_time_current_sec()- lock->l_enqueued_time.tv_sec, @@ -335,15 +382,21 @@ repeat: */ static int __ldlm_add_waiting_lock(struct ldlm_lock *lock) { - int timeout; + cfs_time_t timeout; cfs_time_t timeout_rounded; if (!list_empty(&lock->l_pending_chain)) return 0; - timeout = ldlm_get_enq_timeout(lock); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT) || + OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + timeout = 2; + else + timeout = ldlm_get_enq_timeout(lock); - lock->l_callback_timeout = cfs_time_shift(timeout); + timeout = cfs_time_shift(timeout); + if (likely(cfs_time_after(timeout, lock->l_callback_timeout))) + lock->l_callback_timeout = timeout; timeout_rounded = round_timeout(lock->l_callback_timeout); @@ -475,7 +528,6 @@ int ldlm_refresh_waiting_lock(struct ldlm_lock *lock) LDLM_DEBUG(lock, "refreshed"); return 1; } - #else /* !__KERNEL__ */ static int ldlm_add_waiting_lock(struct ldlm_lock *lock) @@ -621,6 +673,30 @@ static inline int ldlm_bl_and_cp_ast_fini(struct ptlrpc_request *req, RETURN(rc); } +/** + * Check if there are requests in the export request list which prevent + * the lock from being cancelled, and make those requests high priority ones.
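+ *
+ * A request type opts in by supplying a lock-match callback through its
+ * request ops; only the ->hpreq_lock_match hook itself comes from this
+ * patch, the callback body below is an illustrative sketch:
+ *
+ *   static int my_rw_lock_match(struct ptlrpc_request *req,
+ *                               struct ldlm_lock *lock)
+ *   {
+ *           return rw_request_covered_by(req, lock);
+ *   }
+ *
+ * ldlm_lock_reorder_req() below then promotes every queued request whose
+ * callback matches the blocked lock, so the server serves it first.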
+ */ +static void ldlm_lock_reorder_req(struct ldlm_lock *lock) +{ + struct ptlrpc_request *req; + ENTRY; + + if (lock->l_export == NULL) { + LDLM_DEBUG(lock, "client lock: no-op"); + RETURN_EXIT; + } + + spin_lock(&lock->l_export->exp_lock); + list_for_each_entry(req, &lock->l_export->exp_queued_rpc, rq_exp_list) { + if (!req->rq_hp && req->rq_ops->hpreq_lock_match && + req->rq_ops->hpreq_lock_match(req, lock)) + ptlrpc_hpreq_reorder(req); + } + spin_unlock(&lock->l_export->exp_lock); + EXIT; +} + /* * ->l_blocking_ast() method for server-side locks. This is invoked when newly * enqueued server lock conflicts with given one. @@ -650,6 +726,8 @@ int ldlm_server_blocking_ast(struct ldlm_lock *lock, ldlm_lock_dump(D_ERROR, lock, 0); } + ldlm_lock_reorder_req(lock); + req = ptlrpc_request_alloc_pack(lock->l_export->exp_imp_reverse, &RQF_LDLM_BL_CALLBACK, LUSTRE_DLM_VERSION, LDLM_BL_CALLBACK); @@ -1882,7 +1960,8 @@ void ldlm_revoke_lock_cb(void *obj, void *data) LASSERT(!lock->l_blocking_lock); lock->l_flags |= LDLM_FL_AST_SENT; - if (lock->l_export && lock->l_export->exp_lock_hash) + if (lock->l_export && lock->l_export->exp_lock_hash && + !hlist_unhashed(&lock->l_exp_hash)) lustre_hash_del(lock->l_export->exp_lock_hash, &lock->l_remote_handle, &lock->l_exp_hash); list_add_tail(&lock->l_rk_ast, rpc_list); @@ -2131,7 +2210,7 @@ int ldlm_init_export(struct obd_export *exp) exp->exp_lock_hash = lustre_hash_init(obd_uuid2str(&exp->exp_client_uuid), - 128, 65536, &ldlm_export_lock_ops, LH_REHASH); + 7, 16, &ldlm_export_lock_ops, LH_REHASH); if (!exp->exp_lock_hash) RETURN(-ENOMEM); @@ -2192,7 +2271,7 @@ static int ldlm_setup(void) ldlm_svc_proc_dir, NULL, ldlm_min_threads, ldlm_max_threads, "ldlm_cb", - LCT_MD_THREAD|LCT_DT_THREAD); + LCT_MD_THREAD|LCT_DT_THREAD, NULL); if (!ldlm_state->ldlm_cb_service) { CERROR("failed to start service\n"); @@ -2207,7 +2286,8 @@ static int ldlm_setup(void) ldlm_svc_proc_dir, NULL, ldlm_min_threads, ldlm_max_threads, "ldlm_cn", - LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD); + LCT_MD_THREAD|LCT_DT_THREAD|LCT_CL_THREAD, + NULL); if (!ldlm_state->ldlm_cancel_service) { CERROR("failed to start service\n"); @@ -2394,8 +2474,6 @@ EXPORT_SYMBOL(ldlm_lock2handle); EXPORT_SYMBOL(__ldlm_handle2lock); EXPORT_SYMBOL(ldlm_lock_get); EXPORT_SYMBOL(ldlm_lock_put); -EXPORT_SYMBOL(ldlm_lock_fast_match); -EXPORT_SYMBOL(ldlm_lock_fast_release); EXPORT_SYMBOL(ldlm_lock_match); EXPORT_SYMBOL(ldlm_lock_cancel); EXPORT_SYMBOL(ldlm_lock_addref); @@ -2403,16 +2481,18 @@ EXPORT_SYMBOL(ldlm_lock_addref_try); EXPORT_SYMBOL(ldlm_lock_decref); EXPORT_SYMBOL(ldlm_lock_decref_and_cancel); EXPORT_SYMBOL(ldlm_lock_change_resource); -EXPORT_SYMBOL(ldlm_lock_set_data); EXPORT_SYMBOL(ldlm_it2str); EXPORT_SYMBOL(ldlm_lock_dump); EXPORT_SYMBOL(ldlm_lock_dump_handle); EXPORT_SYMBOL(ldlm_cancel_locks_for_export); EXPORT_SYMBOL(ldlm_reprocess_all_ns); EXPORT_SYMBOL(ldlm_lock_allow_match); +EXPORT_SYMBOL(ldlm_lock_downgrade); +EXPORT_SYMBOL(ldlm_lock_convert); /* ldlm_request.c */ EXPORT_SYMBOL(ldlm_completion_ast_async); +EXPORT_SYMBOL(ldlm_blocking_ast_nocheck); EXPORT_SYMBOL(ldlm_completion_ast); EXPORT_SYMBOL(ldlm_blocking_ast); EXPORT_SYMBOL(ldlm_glimpse_ast); diff --git a/lustre/ldlm/ldlm_pool.c b/lustre/ldlm/ldlm_pool.c index c870218..09b9590 100644 --- a/lustre/ldlm/ldlm_pool.c +++ b/lustre/ldlm/ldlm_pool.c @@ -38,7 +38,7 @@ * Author: Yury Umanets */ -/* +/* * Idea of this code is rather simple. 
Each second, for each server namespace * we have SLV - server lock volume which is calculated on current number of * granted locks, grant speed for past period, etc - that is, locking load. @@ -103,6 +105,8 @@ # include #endif +#include + #include #include #include "ldlm_internal.h" @@ -110,17 +112,17 @@ #ifdef HAVE_LRU_RESIZE_SUPPORT /* - * 50 ldlm locks for 1MB of RAM. + * 50 ldlm locks for 1MB of RAM. */ #define LDLM_POOL_HOST_L ((num_physpages >> (20 - CFS_PAGE_SHIFT)) * 50) /* - * Maximal possible grant step plan in %. + * Maximal possible grant step plan in %. */ #define LDLM_POOL_MAX_GSP (30) /* - * Minimal possible grant step plan in %. + * Minimal possible grant step plan in %. */ #define LDLM_POOL_MIN_GSP (1) @@ -130,13 +132,13 @@ */ #define LDLM_POOL_GSP_STEP (4) -/* - * LDLM_POOL_GSP% of all locks is default GP. +/* + * LDLM_POOL_GSP% of all locks is default GP. */ #define LDLM_POOL_GP(L) (((L) * LDLM_POOL_MAX_GSP) / 100) -/* - * Max age for locks on clients. +/* + * Max age for locks on clients. */ #define LDLM_POOL_MAX_AGE (36000) @@ -158,7 +160,7 @@ static inline __u64 ldlm_pool_slv_max(__u32 L) { /* * Allow to have all locks for 1 client for 10 hrs. - * Formula is the following: limit * 10h / 1 client. + * Formula is the following: limit * 10h / 1 client. */ __u64 lim = L * LDLM_POOL_MAX_AGE / 1; return lim; @@ -191,7 +193,7 @@ static inline struct ldlm_namespace *ldlm_pl2ns(struct ldlm_pool *pl) } /** - * Calculates suggested grant_step in % of available locks for passed + * Calculates suggested grant_step in % of available locks for passed * \a period. This is later used in grant_plan calculations. */ static inline int ldlm_pool_t2gsp(int t) @@ -199,7 +201,7 @@ static inline int ldlm_pool_t2gsp(int t) /* * This yields 1% grant step for anything below LDLM_POOL_GSP_STEP * and up to 30% for anything higher than LDLM_POOL_GSP_STEP. - * + * * How this will affect execution is the following: * * - for thread period 1s we will have grant_step 1% which is good from * @@ -211,25 +213,25 @@ static inline int ldlm_pool_t2gsp(int t) * * - for thread period 10s (which is default) we will have 23% which * means that clients will have enough of room to take some new locks - * without getting some back. All locks from this 23% which were not + * without getting some back. All locks from this 23% which were not * taken by clients in current period will contribute in SLV growing. * SLV growing means more locks cached on clients until limit or grant * plan is reached. */ - return LDLM_POOL_MAX_GSP - - (LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) / + return LDLM_POOL_MAX_GSP - + (LDLM_POOL_MAX_GSP - LDLM_POOL_MIN_GSP) / (1 << (t / LDLM_POOL_GSP_STEP)); } /** * Recalculates next grant limit on passed \a pl. * - * \pre ->pl_lock is locked. + * \pre ->pl_lock is locked. */ static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) { int granted, grant_step, limit; - + limit = ldlm_pool_get_limit(pl); granted = atomic_read(&pl->pl_granted); @@ -241,7 +243,7 @@ static inline void ldlm_pool_recalc_grant_plan(struct ldlm_pool *pl) /** * Recalculates next SLV on passed \a pl. * - * \pre ->pl_lock is locked. + * \pre ->pl_lock is locked. */ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl) { @@ -258,13 +260,13 @@ static inline void ldlm_pool_recalc_slv(struct ldlm_pool *pl) if (grant_usage <= 0) grant_usage = 1; - /* - * Find out SLV change factor which is the ratio of grant usage - * from limit. SLV changes as fast as the ratio of grant plan - * consumtion.
The more locks from grant plan are not consumed - by clients in last interval (idle time), the faster grows + /* + * Find out SLV change factor which is the ratio of grant usage + * from limit. SLV changes as fast as the ratio of grant plan + * consumption. The more locks from grant plan are not consumed + * by clients in last interval (idle time), the faster grows * SLV. And the opposite, the more grant plan is over-consumed - * (load time) the faster drops SLV. + * (load time) the faster drops SLV. */ slv_factor = (grant_usage * 100) / limit; if (2 * abs(granted - limit) > limit) { @@ -286,7 +288,7 @@ /** * Recalculates next stats on passed \a pl. * - * \pre ->pl_lock is locked. + * \pre ->pl_lock is locked. */ static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl) { @@ -296,7 +298,7 @@ static inline void ldlm_pool_recalc_stats(struct ldlm_pool *pl) int grant_rate = atomic_read(&pl->pl_grant_rate); int cancel_rate = atomic_read(&pl->pl_cancel_rate); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SLV_STAT, slv); lprocfs_counter_add(pl->pl_stats, LDLM_POOL_GRANTED_STAT, granted); @@ -315,12 +317,12 @@ static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) { struct obd_device *obd; - /* + /* * Set new SLV in obd field for using it later without accessing the * pool. This is required to avoid race between sending reply to client * with new SLV and cleanup server stack in which we can't guarantee * that namespace is still alive. We know only that obd is alive as - * long as valid export is alive. + * long as valid export is alive. */ obd = ldlm_pl2ns(pl)->ns_obd; LASSERT(obd != NULL); @@ -332,7 +334,7 @@ static void ldlm_srv_pool_push_slv(struct ldlm_pool *pl) /** * Recalculates all pool fields on passed \a pl. * - * \pre ->pl_lock is not locked. + * \pre ->pl_lock is not locked. */ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) { @@ -344,22 +346,22 @@ static int ldlm_srv_pool_recalc(struct ldlm_pool *pl) if (recalc_interval_sec >= pl->pl_recalc_period) { /* * Recalc SLV after last period. This should be done - * _before_ recalculating new grant plan. + * _before_ recalculating new grant plan. */ ldlm_pool_recalc_slv(pl); - + /* - * Make sure that pool informed obd of last SLV changes. + * Make sure that pool informed obd of last SLV changes. */ ldlm_srv_pool_push_slv(pl); /* - * Update grant_plan for new period. + * Update grant_plan for new period. */ ldlm_pool_recalc_grant_plan(pl); pl->pl_recalc_time = cfs_time_current_sec(); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, recalc_interval_sec); } @@ -371,9 +373,9 @@ * This function is used on server side as main entry point for memory * pressure handling. It decreases SLV on \a pl according to passed * \a nr and \a gfp_mask. - * + * * Our goal here is to decrease SLV in such a way that clients hold \a nr - * locks smaller in next 10h. + * fewer locks within the next 10h. */ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { @@ -381,22 +383,22 @@ static int ldlm_srv_pool_shrink(struct ldlm_pool *pl, __u32 limit; ENTRY; - /* - * VM is asking how many entries may be potentially freed. + /* + * VM is asking how many entries may be potentially freed.
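+ * (Sketch of the shrinker contract this implements, illustrative only;
+ * the wiring to the VM lives in ldlm_pools_srv_shrinker further down:
+ *
+ *   probe:  ldlm_srv_pool_shrink(pl, 0, mask)  -> count of granted locks
+ *   shrink: ldlm_srv_pool_shrink(pl, nr, mask) -> 0, after lowering SLV
+ *           by nr, clamped at ldlm_pool_slv_min(limit).)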
*/ if (nr == 0) RETURN(atomic_read(&pl->pl_granted)); - /* + * Client already canceled locks but server is already in shrinker - * and can't cancel anything. Let's catch this race. + /* * Client already canceled locks but server is already in shrinker + * and can't cancel anything. Let's catch this race. */ if (atomic_read(&pl->pl_granted) == 0) RETURN(0); spin_lock(&pl->pl_lock); - /* + * We want shrinker to possibly cause cancelation of @nr locks from * clients or grant approximately @nr locks smaller next intervals. * + /* * We want shrinker to possibly cause cancelation of @nr locks from * interval pool will either increase SLV if locks load is not high * or will keep on same level or even decrease again, thus, shrinker * decreased SLV will affect next recalc intervals and this way will - * make locking load lower. + * make locking load lower. */ if (nr < pl->pl_server_lock_volume) { pl->pl_server_lock_volume = pl->pl_server_lock_volume - nr; @@ -415,15 +417,15 @@ } else { limit = ldlm_pool_get_limit(pl); pl->pl_server_lock_volume = ldlm_pool_slv_min(limit); } - /* - * Make sure that pool informed obd of last SLV changes. + /* + * Make sure that pool informed obd of last SLV changes. */ ldlm_srv_pool_push_slv(pl); spin_unlock(&pl->pl_lock); - /* + /* * We did not really free any memory here so far, it only will be - freed later may be, so that we return 0 to not confuse VM. + * freed later maybe, so that we return 0 to not confuse VM. */ RETURN(0); } @@ -435,7 +437,7 @@ static int ldlm_srv_pool_setup(struct ldlm_pool *pl, int limit) { struct obd_device *obd; ENTRY; - + obd = ldlm_pl2ns(pl)->ns_obd; LASSERT(obd != NULL && obd != LP_POISON); LASSERT(obd->obd_type != LP_POISON); @@ -454,9 +456,9 @@ static void ldlm_cli_pool_pop_slv(struct ldlm_pool *pl) { struct obd_device *obd; - /* - * Get new SLV and Limit from obd which is updated with comming - * RPCs. + /* + * Get new SLV and Limit from obd which is updated with coming + * RPCs. */ obd = ldlm_pl2ns(pl)->ns_obd; LASSERT(obd != NULL); @@ -484,29 +486,29 @@ static int ldlm_cli_pool_recalc(struct ldlm_pool *pl) RETURN(0); } - /* - * Make sure that pool knows last SLV and Limit from obd. + /* + * Make sure that pool knows last SLV and Limit from obd. */ ldlm_cli_pool_pop_slv(pl); pl->pl_recalc_time = cfs_time_current_sec(); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_TIMING_STAT, recalc_interval_sec); spin_unlock(&pl->pl_lock); - /* - * Do not cancel locks in case lru resize is disabled for this ns. + /* + * Do not cancel locks in case lru resize is disabled for this ns. */ if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) RETURN(0); - /* + /* * In the time of canceling locks on client we do not need to maintain * sharp timing, we only want to cancel locks asap according to new SLV. * It may be called when SLV has changed much, this is why we do not - take into account pl->pl_recalc_time here. + take into account pl->pl_recalc_time here. */ - RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC, + RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), 0, LDLM_ASYNC, LDLM_CANCEL_LRUR)); } @@ -519,30 +521,30 @@ static int ldlm_cli_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { ENTRY; - - /* - * Do not cancel locks in case lru resize is disabled for this ns. + + /* + * Do not cancel locks in case lru resize is disabled for this ns. */ if (!ns_connect_lru_resize(ldlm_pl2ns(pl))) RETURN(0); - /* - * Make sure that pool knows last SLV and Limit from obd.
*/ ldlm_cli_pool_pop_slv(pl); - /* - * Find out how many locks may be released according to shrink - * policy. + /* + * Find out how many locks may be released according to shrink + * policy. */ if (nr == 0) - RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0, + RETURN(ldlm_cancel_lru_estimate(ldlm_pl2ns(pl), 0, 0, LDLM_CANCEL_SHRINK)); - /* - * Cancel @nr locks accoding to shrink policy. + /* + * Cancel @nr locks according to shrink policy. */ - RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC, + RETURN(ldlm_cancel_lru(ldlm_pl2ns(pl), nr, LDLM_SYNC, LDLM_CANCEL_SHRINK)); } @@ -575,7 +577,7 @@ int ldlm_pool_recalc(struct ldlm_pool *pl) ldlm_pool_recalc_stats(pl); /* - * Zero out all rates and speed for the last period. + * Zero out all rates and speed for the last period. */ atomic_set(&pl->pl_grant_rate, 0); atomic_set(&pl->pl_cancel_rate, 0); @@ -585,7 +587,7 @@ if (pl->pl_ops->po_recalc != NULL) { count = pl->pl_ops->po_recalc(pl); - lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_RECALC_STAT, count); return count; } @@ -602,14 +604,14 @@ int ldlm_pool_shrink(struct ldlm_pool *pl, int nr, unsigned int gfp_mask) { int cancel = 0; - + if (pl->pl_ops->po_shrink != NULL) { cancel = pl->pl_ops->po_shrink(pl, nr, gfp_mask); if (nr > 0) { - lprocfs_counter_add(pl->pl_stats, + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SHRINK_REQTD_STAT, nr); - lprocfs_counter_add(pl->pl_stats, + lprocfs_counter_add(pl->pl_stats, LDLM_POOL_SHRINK_FREED_STAT, cancel); CDEBUG(D_DLMTRACE, "%s: request to shrink %d locks, " @@ -779,10 +781,10 @@ static int ldlm_pool_proc_init(struct ldlm_pool *pl) lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANTED_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "granted", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "grant", "locks"); - lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, + lprocfs_counter_init(pl->pl_stats, LDLM_POOL_CANCEL_STAT, LPROCFS_CNTR_AVGMINMAX | LPROCFS_CNTR_STDDEV, "cancel", "locks"); lprocfs_counter_init(pl->pl_stats, LDLM_POOL_GRANT_RATE_STAT, @@ -878,8 +880,8 @@ void ldlm_pool_fini(struct ldlm_pool *pl) { ENTRY; ldlm_pool_proc_fini(pl); - - /* + + /* * Pool should not be used after this point. We can't free it here as * it lives in struct ldlm_namespace, but still interested in catching * any abnormal using cases. @@ -894,27 +896,27 @@ */ void ldlm_pool_add(struct ldlm_pool *pl, struct ldlm_lock *lock) { - /* + /* * FLOCK locks are special in a sense that they are almost never * cancelled, instead special kind of lock is used to drop them. * also there is no LRU for flock locks, so no point in tracking - * them anyway. + * them anyway. */ if (lock->l_resource->lr_type == LDLM_FLOCK) return; ENTRY; - + + LDLM_DEBUG(lock, "add lock to pool"); atomic_inc(&pl->pl_granted); atomic_inc(&pl->pl_grant_rate); atomic_inc(&pl->pl_grant_speed); lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_GRANT_STAT); - - /* + /* * Do not do pool recalc for client side as all locks which - potentially may be canceled has already been packed into + * potentially may be canceled have already been packed into * enqueue/cancel rpc. Also we do not want to run out of stack - with too long call paths. + * with too long call paths.
*/ if (ns_is_server(ldlm_pl2ns(pl))) ldlm_pool_recalc(pl); @@ -934,11 +936,12 @@ void ldlm_pool_del(struct ldlm_pool *pl, struct ldlm_lock *lock) return; ENTRY; + LDLM_DEBUG(lock, "del lock from pool"); LASSERT(atomic_read(&pl->pl_granted) > 0); atomic_dec(&pl->pl_granted); atomic_inc(&pl->pl_cancel_rate); atomic_dec(&pl->pl_grant_speed); - + lprocfs_counter_incr(pl->pl_stats, LDLM_POOL_CANCEL_STAT); if (ns_is_server(ldlm_pl2ns(pl))) @@ -950,7 +953,7 @@ EXPORT_SYMBOL(ldlm_pool_del); /** * Returns current \a pl SLV. * - * \pre ->pl_lock is not locked. + * \pre ->pl_lock is not locked. */ __u64 ldlm_pool_get_slv(struct ldlm_pool *pl) { @@ -965,7 +968,7 @@ EXPORT_SYMBOL(ldlm_pool_get_slv); /** * Sets passed \a slv to \a pl. * - * \pre ->pl_lock is not locked. + * \pre ->pl_lock is not locked. */ void ldlm_pool_set_slv(struct ldlm_pool *pl, __u64 slv) { @@ -978,7 +981,7 @@ EXPORT_SYMBOL(ldlm_pool_set_slv); /** * Returns current \a pl CLV. * - * \pre ->pl_lock is not locked. + * \pre ->pl_lock is not locked. */ __u64 ldlm_pool_get_clv(struct ldlm_pool *pl) { @@ -993,7 +996,7 @@ EXPORT_SYMBOL(ldlm_pool_get_clv); /** * Sets passed \a clv to \a pl. * - * \pre ->pl_lock is not locked. + * \pre ->pl_lock is not locked. */ void ldlm_pool_set_clv(struct ldlm_pool *pl, __u64 clv) { @@ -1041,16 +1044,17 @@ static struct shrinker *ldlm_pools_srv_shrinker; static struct shrinker *ldlm_pools_cli_shrinker; static struct completion ldlm_pools_comp; -/* +/* * Cancel \a nr locks from all namespaces (if possible). Returns number of * cached locks after shrink is finished. All namespaces are asked to * cancel approximately equal amount of locks to keep balancing. */ -static int ldlm_pools_shrink(ldlm_side_t client, int nr, +static int ldlm_pools_shrink(ldlm_side_t client, int nr, unsigned int gfp_mask) { int total = 0, cached = 0, nr_ns; struct ldlm_namespace *ns; + void *cookie; if (nr != 0 && !(gfp_mask & __GFP_FS)) return -1; @@ -1058,15 +1062,18 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, CDEBUG(D_DLMTRACE, "Request to shrink %d %s locks from all pools\n", nr, client == LDLM_NAMESPACE_CLIENT ? "client" : "server"); - /* - * Find out how many resources we may release. + cookie = cl_env_reenter(); + + /* + * Find out how many resources we may release. */ - for (nr_ns = atomic_read(ldlm_namespace_nr(client)); - nr_ns > 0; nr_ns--) + for (nr_ns = atomic_read(ldlm_namespace_nr(client)); + nr_ns > 0; nr_ns--) { mutex_down(ldlm_namespace_lock(client)); if (list_empty(ldlm_namespace_list(client))) { mutex_up(ldlm_namespace_lock(client)); + cl_env_reexit(cookie); return 0; } ns = ldlm_namespace_first_locked(client); @@ -1076,28 +1083,30 @@ static int ldlm_pools_shrink(ldlm_side_t client, int nr, total += ldlm_pool_shrink(&ns->ns_pool, 0, gfp_mask); ldlm_namespace_put(ns, 1); } - - if (nr == 0 || total == 0) + + if (nr == 0 || total == 0) { + cl_env_reexit(cookie); return total; + } - /* - * Shrink at least ldlm_namespace_nr(client) namespaces. + /* + * Shrink at least ldlm_namespace_nr(client) namespaces. 
*/ - for (nr_ns = atomic_read(ldlm_namespace_nr(client)); - nr_ns > 0; nr_ns--) + for (nr_ns = atomic_read(ldlm_namespace_nr(client)); + nr_ns > 0; nr_ns--) { int cancel, nr_locks; - /* - * Do not call shrink under ldlm_namespace_lock(client) + /* + * Do not call shrink under ldlm_namespace_lock(client) */ mutex_down(ldlm_namespace_lock(client)); if (list_empty(ldlm_namespace_list(client))) { mutex_up(ldlm_namespace_lock(client)); - /* + /* * If list is empty, we can't return any @cached > 0, * that probably would cause needless shrinker - * call. + * call. */ cached = 0; break; @@ -1106,13 +1115,14 @@ ns = ldlm_namespace_first_locked(client); ldlm_namespace_get(ns); ldlm_namespace_move_locked(ns, client); mutex_up(ldlm_namespace_lock(client)); - + nr_locks = ldlm_pool_granted(&ns->ns_pool); cancel = 1 + nr_locks * nr / total; ldlm_pool_shrink(&ns->ns_pool, cancel, gfp_mask); cached += ldlm_pool_granted(&ns->ns_pool); ldlm_namespace_put(ns, 1); } + cl_env_reexit(cookie); return cached; } @@ -1132,16 +1142,16 @@ void ldlm_pools_recalc(ldlm_side_t client) struct ldlm_namespace *ns; int nr, equal = 0; - /* + /* * No need to setup pool limit for client pools. */ if (client == LDLM_NAMESPACE_SERVER) { - /* - * Check all modest namespaces first. + /* + * Check all modest namespaces first. */ mutex_down(ldlm_namespace_lock(client)); - list_for_each_entry(ns, ldlm_namespace_list(client), - ns_list_chain) + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) { if (ns->ns_appetite != LDLM_NAMESPACE_MODEST) continue; @@ -1150,9 +1160,9 @@ void ldlm_pools_recalc(ldlm_side_t client) if (l == 0) l = 1; - /* + /* * Set the modest pools limit equal to their avg granted - * locks + 5%. + * locks + 5%. */ l += dru(l * LDLM_POOLS_MODEST_MARGIN, 100); ldlm_pool_setup(&ns->ns_pool, l); @@ -1160,9 +1170,9 @@ void ldlm_pools_recalc(ldlm_side_t client) nr_p++; } - /* - * Make sure that modest namespaces did not eat more that 2/3 - * of limit. + /* + * Make sure that modest namespaces did not eat more than 2/3 + * of limit. */ if (nr_l >= 2 * (LDLM_POOL_HOST_L / 3)) { CWARN("\"Modest\" pools eat out 2/3 of server locks " @@ -1172,25 +1182,25 @@ void ldlm_pools_recalc(ldlm_side_t client) equal = 1; } - /* - * The rest is given to greedy namespaces. + /* + * The rest is given to greedy namespaces. */ - list_for_each_entry(ns, ldlm_namespace_list(client), - ns_list_chain) + list_for_each_entry(ns, ldlm_namespace_list(client), + ns_list_chain) { if (!equal && ns->ns_appetite != LDLM_NAMESPACE_GREEDY) continue; if (equal) { - /* + /* * In the case 2/3 locks are eaten out by * modest pools, we re-setup equal limit - * for _all_ pools. + * for _all_ pools. */ l = LDLM_POOL_HOST_L / atomic_read(ldlm_namespace_nr(client)); } else { - /* + /* * All the rest of greedy pools will have * all locks in equal parts. */ @@ -1203,16 +1213,16 @@ void ldlm_pools_recalc(ldlm_side_t client) mutex_up(ldlm_namespace_lock(client)); } - /* - * Recalc at least ldlm_namespace_nr(client) namespaces. + /* + * Recalc at least ldlm_namespace_nr(client) namespaces. */ for (nr = atomic_read(ldlm_namespace_nr(client)); nr > 0; nr--) { - /* + /* * Lock the list, get first @ns in the list, getref, move it * to the tail, unlock and call pool recalc. This way we avoid * calling recalc under @ns lock which is really good as we get * rid of potential deadlock on client nodes when canceling - * locks synchronously. + * locks synchronously.
*/ mutex_down(ldlm_namespace_lock(client)); if (list_empty(ldlm_namespace_list(client))) { @@ -1224,8 +1234,8 @@ void ldlm_pools_recalc(ldlm_side_t client) ldlm_namespace_move_locked(ns, client); mutex_up(ldlm_namespace_lock(client)); - /* - * After setup is done - recalc the pool. + /* + * After setup is done - recalc the pool. */ ldlm_pool_recalc(&ns->ns_pool); ldlm_namespace_put(ns, 1); @@ -1250,14 +1260,14 @@ static int ldlm_pools_thread_main(void *arg) struct l_wait_info lwi; /* - * Recal all pools on this tick. + * Recalc all pools on this tick. */ ldlm_pools_recalc(LDLM_NAMESPACE_SERVER); ldlm_pools_recalc(LDLM_NAMESPACE_CLIENT); - + /* * Wait until the next check time, or until we're - * stopped. + * stopped. */ lwi = LWI_TIMEOUT(cfs_time_seconds(LDLM_POOLS_THREAD_PERIOD), NULL, NULL); @@ -1298,9 +1308,9 @@ static int ldlm_pools_thread_start(void) init_completion(&ldlm_pools_comp); cfs_waitq_init(&ldlm_pools_thread->t_ctl_waitq); - /* + /* * CLONE_VM and CLONE_FILES just avoid a needless copy, because we - * just drop the VM and FILES in ptlrpc_daemonize() right away. + * just drop the VM and FILES in ptlrpc_daemonize() right away. */ rc = cfs_kernel_thread(ldlm_pools_thread_main, ldlm_pools_thread, CLONE_VM | CLONE_FILES); @@ -1328,10 +1338,10 @@ static void ldlm_pools_thread_stop(void) ldlm_pools_thread->t_flags = SVC_STOPPING; cfs_waitq_signal(&ldlm_pools_thread->t_ctl_waitq); - /* + /* * Make sure that pools thread is finished before freeing @thread. * This fixes possible race and oops due to accessing freed memory - * in pools thread. + * in pools thread. */ wait_for_completion(&ldlm_pools_comp); OBD_FREE_PTR(ldlm_pools_thread); diff --git a/lustre/ldlm/ldlm_request.c b/lustre/ldlm/ldlm_request.c index 44f5fb2..1317844 100644 --- a/lustre/ldlm/ldlm_request.c +++ b/lustre/ldlm/ldlm_request.c @@ -120,6 +120,7 @@ int ldlm_get_enq_timeout(struct ldlm_lock *lock) timeout = timeout + (timeout >> 1); /* 150% */ return max(timeout, ldlm_enqueue_min); } +EXPORT_SYMBOL(ldlm_get_enq_timeout); static int is_granted_or_cancelled(struct ldlm_lock *lock) { @@ -161,8 +162,8 @@ static int ldlm_completion_tail(struct ldlm_lock *lock) } /** - * Implementation of ->l_completion_ast() for a client that doesn't wait - * until lock is granted. Suitable for locks enqueued through ptlrpcd or + * Implementation of ->l_completion_ast() for a client that doesn't wait + * until lock is granted. Suitable for locks enqueued through ptlrpcd or * other threads that cannot block for long. */ int ldlm_completion_ast_async(struct ldlm_lock *lock, int flags, void *data) @@ -183,6 +184,7 @@ int ldlm_completion_ast_async(struct ldlm_lock *lock, int flags, void *data) LDLM_DEBUG(lock, "client-side enqueue returned a blocked lock, " "going forward"); ldlm_lock_dump(D_OTHER, lock, 0); + ldlm_reprocess_all(lock->l_resource); RETURN(0); } @@ -276,31 +278,22 @@ noreproc: RETURN(ldlm_completion_tail(lock)); } -/* - * ->l_blocking_ast() callback for LDLM locks acquired by server-side OBDs. +/** + * A helper to build a blocking ast function + * + * Perform a common operation for blocking asts: + * deferred lock cancellation. + * + * \param lock the lock the blocking or canceling ast was called on + * \retval 0 + * \see mdt_blocking_ast + * \see ldlm_blocking_ast */ -int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, - void *data, int flag) +int ldlm_blocking_ast_nocheck(struct ldlm_lock *lock) { int do_ast; ENTRY; - if (flag == LDLM_CB_CANCELING) { - /* Don't need to do anything here.
*/ - RETURN(0); - } - - lock_res_and_lock(lock); - /* Get this: if ldlm_blocking_ast is racing with intent_policy, such - * that ldlm_blocking_ast is called just before intent_policy method - * takes the ns_lock, then by the time we get the lock, we might not - * be the correct blocking function anymore. So check, and return - * early, if so. */ - if (lock->l_blocking_ast != ldlm_blocking_ast) { - unlock_res_and_lock(lock); - RETURN(0); - } - lock->l_flags |= LDLM_FL_CBPENDING; do_ast = (!lock->l_readers && !lock->l_writers); unlock_res_and_lock(lock); @@ -321,6 +314,42 @@ int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, RETURN(0); } +/** + * Server blocking AST + * + * ->l_blocking_ast() callback for LDLM locks acquired by server-side + * OBDs. + * + * \param lock the lock which blocks a request or is being cancelled + * \param desc unused + * \param data unused + * \param flag indicates whether this is a cancelling or a blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck + */ +int ldlm_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + ENTRY; + + if (flag == LDLM_CB_CANCELING) { + /* Don't need to do anything here. */ + RETURN(0); + } + + lock_res_and_lock(lock); + /* Get this: if ldlm_blocking_ast is racing with intent_policy, such + * that ldlm_blocking_ast is called just before intent_policy method + * takes the ns_lock, then by the time we get the lock, we might not + * be the correct blocking function anymore. So check, and return + * early, if so. */ + if (lock->l_blocking_ast != ldlm_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + RETURN(ldlm_blocking_ast_nocheck(lock)); +} + /* * ->l_glimpse_ast() for DLM extent locks acquired on the server-side. See * comment in filter_intent_policy() on why you may need this. @@ -356,6 +385,7 @@ int ldlm_cli_enqueue_local(struct ldlm_namespace *ns, ldlm_completion_callback completion, ldlm_glimpse_callback glimpse, void *data, __u32 lvb_len, void *lvb_swabber, + const __u64 *client_cookie, struct lustre_handle *lockh) { struct ldlm_lock *lock; @@ -387,6 +417,8 @@ unlock_res_and_lock(lock); if (policy != NULL) lock->l_policy_data = *policy; + if (client_cookie != NULL) + lock->l_client_cookie = *client_cookie; if (type == LDLM_EXTENT) lock->l_req_extent = policy->l_extent; @@ -607,7 +639,10 @@ static inline int ldlm_req_handles_avail(int req_size, int off) int avail; avail = min_t(int, LDLM_MAXREQSIZE, CFS_PAGE_SIZE - 512) - req_size; - avail /= sizeof(struct lustre_handle); + if (likely(avail >= 0)) + avail /= (int)sizeof(struct lustre_handle); + else + avail = 0; avail += LDLM_LOCKREQ_HANDLES - off; return avail; @@ -650,12 +685,12 @@ int ldlm_prep_elc_req(struct obd_export *exp, struct ptlrpc_request *req, bufcount = req_capsule_filled_sizes(pill, RCL_CLIENT); avail = ldlm_capsule_handles_avail(pill, RCL_CLIENT, canceloff); - flags = ns_connect_lru_resize(ns) ? + flags = ns_connect_lru_resize(ns) ? LDLM_CANCEL_LRUR : LDLM_CANCEL_AGED; to_free = !ns_connect_lru_resize(ns) && opc == LDLM_ENQUEUE ? 1 : 0; - /* Cancel lru locks here _only_ if the server supports + /* Cancel lru locks here _only_ if the server supports * EARLY_CANCEL. Otherwise we have to send extra CANCEL * rpc, which will make us slower.
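 * (Illustrative arithmetic for ldlm_req_handles_avail() in the hunk
 * above: if R = min(LDLM_MAXREQSIZE, CFS_PAGE_SIZE - 512) and the packed
 * request already takes req_size bytes, then with the new underflow guard
 *   avail = (R >= req_size) ? (R - req_size) / sizeof(struct lustre_handle) : 0
 * and the LDLM_LOCKREQ_HANDLES - off slots built into the request body
 * are added on top.)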
*/ if (avail > count) @@ -829,7 +864,9 @@ int ldlm_cli_enqueue(struct obd_export *exp, struct ptlrpc_request **reqp, } LDLM_DEBUG(lock, "sending request"); + rc = ptlrpc_queue_wait(req); + err = ldlm_cli_enqueue_fini(exp, req, einfo->ei_type, policy ? 1 : 0, einfo->ei_mode, flags, lvb, lvb_len, lvb_swabber, lockh, rc); @@ -957,7 +994,7 @@ static int ldlm_cli_cancel_local(struct ldlm_lock *lock) { int rc = LDLM_FL_LOCAL_ONLY; ENTRY; - + if (lock->l_conn_export) { int local_only; @@ -1006,7 +1043,7 @@ static void ldlm_cancel_pack(struct ptlrpc_request *req, LASSERT(dlm != NULL); /* Check the room in the request buffer. */ - max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - + max = req_capsule_get_size(&req->rq_pill, &RMF_DLM_REQ, RCL_CLIENT) - sizeof(struct ldlm_request); max /= sizeof(struct lustre_handle); max += LDLM_LOCKREQ_HANDLES; @@ -1086,7 +1123,7 @@ int ldlm_cli_cancel_req(struct obd_export *exp, struct list_head *cancels, ptlrpc_request_set_replen(req); if (flags & LDLM_FL_ASYNC) { - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); sent = count; GOTO(out, 0); } else { @@ -1132,28 +1169,27 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) __u64 old_slv, new_slv; __u32 new_limit; ENTRY; - - if (unlikely(!req->rq_import || !req->rq_import->imp_obd || + if (unlikely(!req->rq_import || !req->rq_import->imp_obd || !imp_connect_lru_resize(req->rq_import))) { - /* - * Do nothing for corner cases. + /* + * Do nothing for corner cases. */ RETURN(0); } - /* - * In some cases RPC may contain slv and limit zeroed out. This is + /* + * In some cases RPC may contain slv and limit zeroed out. This is * the case when server does not support lru resize feature. This is * also possible in some recovery cases when server side reqs have no - * ref to obd export and thus access to server side namespace is no - * possible. + * ref to obd export and thus access to server side namespace is not + * possible. */ - if (lustre_msg_get_slv(req->rq_repmsg) == 0 || + if (lustre_msg_get_slv(req->rq_repmsg) == 0 || lustre_msg_get_limit(req->rq_repmsg) == 0) { DEBUG_REQ(D_HA, req, "Zero SLV or Limit found " - "(SLV: "LPU64", Limit: %u)", - lustre_msg_get_slv(req->rq_repmsg), + "(SLV: "LPU64", Limit: %u)", + lustre_msg_get_slv(req->rq_repmsg), lustre_msg_get_limit(req->rq_repmsg)); RETURN(0); } @@ -1162,12 +1198,12 @@ int ldlm_cli_update_pool(struct ptlrpc_request *req) new_slv = lustre_msg_get_slv(req->rq_repmsg); obd = req->rq_import->imp_obd; - /* - * Set new SLV and Limit to obd fields to make accessible for pool + /* + * Set new SLV and Limit to obd fields to make accessible for pool * thread. We do not access obd_namespace and pool directly here * as there is no reliable way to make sure that they are still * alive in cleanup time. Evil races are possible which may cause - * oops in that time. + * oops in that time. */ write_lock(&obd->obd_pool_lock); old_slv = obd->obd_pool_slv; @@ -1265,7 +1301,7 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags) RETURN(count); } -/** +/** * Callback function for shrink policy. Makes decision whether to keep * \a lock in LRU for current \a LRU size \a unused, added in current scan * \a added and number of locks to be preferably canceled \a count.
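For reference, the cost model the shrink policy below applies can be sketched standalone; this is a simplified illustration, with do_div() dividing in place exactly as in the hunk that follows:

    /* Cost of keeping a cached lock: 1 for plain locks,
     * 1 + number of pages spanned for extent locks. */
    static unsigned long lock_weight(__u64 start, __u64 end, int is_extent)
    {
            __u64 pages;

            if (!is_extent)
                    return 1;
            pages = end - start;
            do_div(pages, CFS_PAGE_SIZE);
            return 1 + (unsigned long)pages;
    }

Locks whose cost exceeds ns->ns_shrink_thumb are kept in the LRU for the LRU-resize policy to reclaim, so big extent locks survive memory pressure.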
@@ -1276,15 +1312,15 @@ static int ldlm_cancel_list(struct list_head *cancels, int count, int flags) */ static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns, struct ldlm_lock *lock, - int unused, int added, + int unused, int added, int count) { int lock_cost; __u64 page_nr; - /* - * Stop lru processing when we reached passed @count or checked all - * locks in lru. + /* + * Stop lru processing when we reached passed @count or checked all + * locks in lru. */ if (count && added >= count) return LDLM_POLICY_KEEP_LOCK; if (lock->l_resource->lr_type == LDLM_EXTENT) { if (lock->l_weigh_ast != NULL) { /* * For regular locks, ... */ page_nr = lock->l_weigh_ast(lock); } else { - struct ldlm_extent *l_extent; + struct ldlm_extent *l_extent; - /* - * For all extent locks cost is 1 + number of pages in - * their extent. - */ - l_extent = &lock->l_policy_data.l_extent; + /* + * For all extent locks cost is 1 + number of pages in + * their extent. + */ + l_extent = &lock->l_policy_data.l_extent; page_nr = l_extent->end - l_extent->start; - do_div(page_nr, CFS_PAGE_SIZE); + do_div(page_nr, CFS_PAGE_SIZE); } lock_cost = 1 + page_nr; } else { - /* - * For all locks which are not extent ones cost is 1 + /* + * For all locks which are not extent ones cost is 1 */ lock_cost = 1; } - /* + /* * Keep all expensive locks in lru for the memory pressure time * cancel policy. They anyway may be canceled by lru resize - * pplicy if they have not small enough CLV. + * policy if their CLV is not small enough. */ - return lock_cost > ns->ns_shrink_thumb ? + return lock_cost > ns->ns_shrink_thumb ? LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } @@ -1334,8 +1370,8 @@ static ldlm_policy_res_t ldlm_cancel_shrink_policy(struct ldlm_namespace *ns, * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU */ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, - int unused, int added, + struct ldlm_lock *lock, + int unused, int added, int count) { cfs_time_t cur = cfs_time_current(); @@ -1343,8 +1379,8 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, __u64 slv, lvf, lv; cfs_time_t la; - /* - * Stop lru processing when we reached passed @count or checked all + /* + * Stop lru processing when we reached passed @count or checked all * locks in lru. */ if (count && added >= count) @@ -1352,20 +1388,20 @@ static ldlm_policy_res_t ldlm_cancel_lrur_policy(struct ldlm_namespace *ns, slv = ldlm_pool_get_slv(pl); lvf = ldlm_pool_get_lvf(pl); - la = cfs_duration_sec(cfs_time_sub(cur, + la = cfs_duration_sec(cfs_time_sub(cur, lock->l_last_used)); - /* - * Stop when slv is not yet come from server or lv is smaller than + /* + * Stop when slv has not yet come from server or lv is smaller than * it is. */ lv = lvf * la * unused; - - /* - * Inform pool about current CLV to see it via proc. + + /* + * Inform pool about current CLV to see it via proc. */ ldlm_pool_set_clv(pl, lv); - return (slv == 1 || lv < slv) ? + return (slv == 1 || lv < slv) ? LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } @@ -1379,15 +1415,15 @@ * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU */ static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, + struct ldlm_lock *lock, int unused, int added, int count) { - /* - * Stop lru processing when we reached passed @count or checked all - * locks in lru.
+ /* + * Stop lru processing when we reached passed @count or checked all + * locks in lru. */ - return (added >= count) ? + return (added >= count) ? LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } @@ -1401,18 +1437,18 @@ static ldlm_policy_res_t ldlm_cancel_passed_policy(struct ldlm_namespace *ns, * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU */ static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, + struct ldlm_lock *lock, int unused, int added, int count) { - /* - * Stop lru processing if young lock is found and we reached passed - * @count. + /* + * Stop lru processing if young lock is found and we reached passed + * @count. */ - return ((added >= count) && + return ((added >= count) && cfs_time_before(cfs_time_current(), cfs_time_add(lock->l_last_used, - ns->ns_max_age))) ? + ns->ns_max_age))) ? LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } @@ -1426,20 +1462,20 @@ static ldlm_policy_res_t ldlm_cancel_aged_policy(struct ldlm_namespace *ns, * \retval LDLM_POLICY_CANCEL_LOCK cancel lock from LRU */ static ldlm_policy_res_t ldlm_cancel_default_policy(struct ldlm_namespace *ns, - struct ldlm_lock *lock, + struct ldlm_lock *lock, int unused, int added, int count) { - /* - * Stop lru processing when we reached passed @count or checked all - * locks in lru. + /* + * Stop lru processing when we reached passed @count or checked all + * locks in lru. */ - return (added >= count) ? + return (added >= count) ? LDLM_POLICY_KEEP_LOCK : LDLM_POLICY_CANCEL_LOCK; } -typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, - struct ldlm_lock *, int, +typedef ldlm_policy_res_t (*ldlm_cancel_lru_policy_t)(struct ldlm_namespace *, + struct ldlm_lock *, int, int, int); static ldlm_cancel_lru_policy_t @@ -1456,10 +1492,10 @@ ldlm_cancel_lru_policy(struct ldlm_namespace *ns, int flags) if (flags & LDLM_CANCEL_AGED) return ldlm_cancel_aged_policy; } - + return ldlm_cancel_default_policy; } - + /* - Free space in lru for @count new locks, * redundant unused locks are canceled locally; * - also cancel locally unused aged locks; @@ -1502,7 +1538,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, pf = ldlm_cancel_lru_policy(ns, flags); LASSERT(pf != NULL); - + while (!list_empty(&ns->ns_unused_list)) { /* For any flags, stop scanning if @max is reached. */ if (max && added >= max) @@ -1533,11 +1569,11 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, * we find a lock that should stay in the cache. * We should take into account lock age anyway * as new lock even if it is small of weight is - * valuable resource. + * valuable resource. * * That is, for shrinker policy we drop only * old locks, but additionally chose them by - * their weight. Big extent locks will stay in + * their weight. Big extent locks will stay in * the cache. */ if (pf(ns, lock, unused, added, count) == LDLM_POLICY_KEEP_LOCK) { @@ -1568,8 +1604,8 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, /* If we have chosen to cancel this lock voluntarily, we * better send cancel notification to server, so that it - * frees appropriate state. This might lead to a race - * where while we are doing cancel here, server is also + * frees appropriate state. This might lead to a race + * where while we are doing cancel here, server is also * silently cancelling this lock. 
*/ lock->l_flags &= ~LDLM_FL_CANCEL_ON_BLOCK; @@ -1599,7 +1635,7 @@ int ldlm_cancel_lru_local(struct ldlm_namespace *ns, struct list_head *cancels, RETURN(ldlm_cancel_list(cancels, added, cancel_flags)); } -/* Returns number of locks which could be canceled next time when +/* Returns number of locks which could be canceled next time when * ldlm_cancel_lru() is called. Used from locks pool shrinker. */ int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, int count, int max, int flags) @@ -1625,10 +1661,10 @@ int ldlm_cancel_lru_estimate(struct ldlm_namespace *ns, break; /* Somebody is already doing CANCEL or there is a - * blocking request will send cancel. Let's not count + * blocking request that will send a cancel. Let's not count * this lock. */ if ((lock->l_flags & LDLM_FL_CANCELING) || - (lock->l_flags & LDLM_FL_BL_AST)) + (lock->l_flags & LDLM_FL_BL_AST)) continue; LDLM_LOCK_GET(lock); @@ -1658,7 +1694,7 @@ * in a thread and this function will return after the thread has been * asked to call the callback. When called with LDLM_SYNC the blocking * callback will be performed in this function. */ -int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, +int ldlm_cancel_lru(struct ldlm_namespace *ns, int nr, ldlm_sync_t sync, int flags) { CFS_LIST_HEAD(cancels); @@ -1713,7 +1749,7 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res, /* If somebody is already doing CANCEL, or blocking ast came, * skip this lock. */ - if (lock->l_flags & LDLM_FL_BL_AST || + if (lock->l_flags & LDLM_FL_BL_AST || lock->l_flags & LDLM_FL_CANCELING) continue; @@ -1741,10 +1777,10 @@ int ldlm_cancel_resource_local(struct ldlm_resource *res, RETURN(ldlm_cancel_list(cancels, count, cancel_flags)); } -/* If @req is NULL, send CANCEL request to server with handles of locks - * in the @cancels. If EARLY_CANCEL is not supported, send CANCEL requests +/* If @req is NULL, send CANCEL request to server with handles of locks + * in the @cancels. If EARLY_CANCEL is not supported, send CANCEL requests * separately per lock. - If @req is not NULL, put handles of locks in @cancels into the request + If @req is not NULL, put handles of locks in @cancels into the request * buffer at the offset @off. * Destroy @cancels at the end. */ int ldlm_cli_cancel_list(struct list_head *cancels, int count, @@ -1756,8 +1792,8 @@ int ldlm_cli_cancel_list(struct list_head *cancels, int count, if (list_empty(cancels) || count == 0) RETURN(0); - - /* XXX: requests (both batched and not) could be sent in parallel. + + /* XXX: requests (both batched and not) could be sent in parallel. * Usually it is enough to have just 1 RPC, but it is possible that * there are too many locks to be cancelled in LRU or on a resource.
* It would also speed up the case when the server does not support @@ -2149,7 +2185,7 @@ static int replay_one_lock(struct obd_import *imp, struct ldlm_lock *lock) aa = ptlrpc_req_async_args(req); aa->lock_handle = body->lock_handle[0]; req->rq_interpret_reply = (ptlrpc_interpterer_t)replay_lock_interpret; - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } diff --git a/lustre/ldlm/ldlm_resource.c b/lustre/ldlm/ldlm_resource.c index 1b4e46e..320a870 100644 --- a/lustre/ldlm/ldlm_resource.c +++ b/lustre/ldlm/ldlm_resource.c @@ -287,6 +287,12 @@ void ldlm_proc_namespace(struct ldlm_namespace *ns) lock_vars[0].write_fptr = lprocfs_wr_uint; lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + snprintf(lock_name, MAX_STRING_SIZE, "%s/lock_timeouts", + ns->ns_name); + lock_vars[0].data = &ns->ns_timeouts; + lock_vars[0].read_fptr = lprocfs_rd_uint; + lprocfs_add_vars(ldlm_ns_proc_dir, lock_vars, 0); + snprintf(lock_name, MAX_STRING_SIZE, "%s/max_nolock_bytes", ns->ns_name); lock_vars[0].data = &ns->ns_max_nolock_size; @@ -370,6 +376,7 @@ struct ldlm_namespace *ldlm_namespace_new(struct obd_device *obd, char *name, ns->ns_max_unused = LDLM_DEFAULT_LRU_SIZE; ns->ns_max_age = LDLM_DEFAULT_MAX_ALIVE; ns->ns_ctime_age_limit = LDLM_CTIME_AGE_LIMIT; + ns->ns_timeouts = 0; spin_lock_init(&ns->ns_unused_lock); ns->ns_orig_connect_flags = 0; ns->ns_connect_flags = 0; diff --git a/lustre/liblustre/Makefile.am b/lustre/liblustre/Makefile.am index 21de87c..326a8c0 100644 --- a/lustre/liblustre/Makefile.am +++ b/lustre/liblustre/Makefile.am @@ -22,7 +22,7 @@ LUSTRE_LIBS = libllite.a \ $(top_builddir)/lustre/obdclass/liblustreclass.a \ $(top_builddir)/lustre/lvfs/liblvfs.a -if QUOTA +if LIBLUSTRE QUOTA_LIBS = $(top_builddir)/lustre/quota/libquota.a endif @@ -59,11 +59,12 @@ install-exec-hook: endif libllite_a_SOURCES = llite_lib.c llite_fid.c super.c namei.c rw.c file.c dir.c \ - lutil.c lutil.h llite_lib.h + lutil.c lutil.h llite_lib.h llite_cl.c \ + ../lclient/lcommon_cl.c ../lclient/glimpse.c # for make rpms -- need cleanup liblustre_a_SOURCES = llite_lib.c llite_fid.c super.c namei.c rw.c file.c dir.c \ - llite_lib.h + llite_lib.h llite_cl.c liblustre.a : $(LUSTRE_LIBS) $(LND_LIBS) $(LNET_LIBS) $(SYSIO_LIBS) $(QUOTA_LIBS) sh $(srcdir)/genlib.sh "$(SYSIO)" "$(LIBS)" "$(LND_LIBS)" "$(PTHREAD_LIBS)" "$(QUOTA_LIBS)" "$(CAP_LIBS)" "$(ZLIB)" diff --git a/lustre/liblustre/dir.c b/lustre/liblustre/dir.c index 57960a2..8999ae6 100644 --- a/lustre/liblustre/dir.c +++ b/lustre/liblustre/dir.c @@ -194,12 +194,12 @@ static int filldir(char *buf, int buflen, return 0; } -/* +/* * TODO: much of the code here is similar/identical to llite ll_readdir(). * This code can be factored out and shared in a common module. */ -ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep, +ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep, char *buf, size_t nbytes) { struct llu_inode_info *lli = llu_i2info(dir); @@ -237,9 +237,9 @@ ssize_t llu_iop_filldirentries(struct inode *dir, _SYSIO_OFF_T *basep, struct lu_dirent *ent; if (!IS_ERR(page)) { - /* + /* * If page is empty (end of directory is reached), - * use this value. + * use this value.
*/ __u64 hash = DIR_END_OFF; __u64 next; diff --git a/lustre/liblustre/file.c b/lustre/liblustre/file.c index aca3fde..38fb136 100644 --- a/lustre/liblustre/file.c +++ b/lustre/liblustre/file.c @@ -137,10 +137,10 @@ void obdo_refresh_inode(struct inode *dst, if (valid & OBD_MD_FLATIME && src->o_atime > LTIME_S(st->st_atime)) LTIME_S(st->st_atime) = src->o_atime; - + /* mtime is always updated with ctime, but can be set in past. As write and utime(2) may happen within 1 second, and utime's - mtime has a priority over write's one, leave mtime from mds + mtime has a priority over write's one, leave mtime from mds for the same ctimes. */ if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(st->st_ctime)) { LTIME_S(st->st_ctime) = src->o_ctime; @@ -320,7 +320,7 @@ int llu_objects_destroy(struct ptlrpc_request *req, struct inode *dir) } } - rc = obd_destroy(llu_i2obdexp(dir), oa, lsm, &oti, NULL); + rc = obd_destroy(llu_i2obdexp(dir), oa, lsm, &oti, NULL, NULL); OBDO_FREE(oa); if (rc) CERROR("obd destroy objid 0x"LPX64" error %d\n", @@ -340,10 +340,10 @@ int llu_sizeonmds_update(struct inode *inode, struct md_open_data *mod, struct obdo oa; int rc; ENTRY; - + LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)); LASSERT(sbi->ll_lco.lco_flags & OBD_CONNECT_SOM); - + rc = llu_inode_getattr(inode, &oa); if (rc == -ENOENT) { oa.o_valid = 0; @@ -356,7 +356,7 @@ int llu_sizeonmds_update(struct inode *inode, struct md_open_data *mod, lli->lli_st_generation); RETURN(rc); } - + md_from_obdo(&op_data, &oa, oa.o_valid); memcpy(&op_data.op_handle, fh, sizeof(*fh)); op_data.op_ioepoch = ioepoch; @@ -387,7 +387,7 @@ int llu_md_close(struct obd_export *md_exp, struct inode *inode) op_data.op_attr.ia_valid = ATTR_MODE | ATTR_ATIME_SET | ATTR_MTIME_SET | ATTR_CTIME_SET; - + if (fd->fd_flags & FMODE_WRITE) { struct llu_sb_info *sbi = llu_i2sbi(inode); if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SOM) || @@ -400,11 +400,11 @@ int llu_md_close(struct obd_export *md_exp, struct inode *inode) * are really changed. */ op_data.op_flags |= MF_SOM_CHANGE; - /* Pack Size-on-MDS attributes if we are in IO epoch and + /* Pack Size-on-MDS attributes if we are in IO epoch and * attributes are valid. */ LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)); - if (!llu_local_size(inode)) - op_data.op_attr.ia_valid |= + if (!cl_local_size(inode)) + op_data.op_attr.ia_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; } } @@ -513,71 +513,3 @@ _SYSIO_OFF_T llu_iop_pos(struct inode *ino, _SYSIO_OFF_T off) RETURN(off); } - -/* this isn't where truncate starts. roughly: - * llu_iop_{open,setattr}->llu_setattr_raw->llu_vmtruncate->llu_truncate - * we grab the lock back in setattr_raw to avoid races. 
*/ -static void llu_truncate(struct inode *inode, obd_flag flags) -{ - struct llu_inode_info *lli = llu_i2info(inode); - struct intnl_stat *st = llu_i2stat(inode); - struct obd_info oinfo = { { { 0 } } }; - struct obdo oa = { 0 }; - int rc; - ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:inode=%llu/%lu(%p) to %llu\n", - (long long)st->st_ino, lli->lli_st_generation, inode, - (long long)st->st_size); - - if (!lli->lli_smd) { - CDEBUG(D_INODE, "truncate on inode %llu with no objects\n", - (long long)st->st_ino); - EXIT; - return; - } - - oinfo.oi_md = lli->lli_smd; - oinfo.oi_policy.l_extent.start = st->st_size; - oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; - oinfo.oi_oa = &oa; - oa.o_id = lli->lli_smd->lsm_object_id; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLFLAGS; - oa.o_flags = flags; /* We don't actually want to copy inode flags */ - - obdo_from_inode(&oa, inode, - OBD_MD_FLTYPE | OBD_MD_FLMODE | OBD_MD_FLATIME | - OBD_MD_FLMTIME | OBD_MD_FLCTIME); - - obd_adjust_kms(llu_i2obdexp(inode), lli->lli_smd, st->st_size, 1); - - CDEBUG(D_INFO, "calling punch for "LPX64" (all bytes after %Lu)\n", - oa.o_id, (long long)st->st_size); - - /* truncate == punch from new size to absolute end of file */ - rc = obd_punch_rqset(llu_i2obdexp(inode), &oinfo, NULL); - if (rc) - CERROR("obd_truncate fails (%d) ino %llu\n", - rc, (long long)st->st_ino); - else - obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | - OBD_MD_FLATIME | OBD_MD_FLMTIME | - OBD_MD_FLCTIME); - - EXIT; - return; -} /* llu_truncate */ - -int llu_vmtruncate(struct inode * inode, loff_t offset, obd_flag flags) -{ - llu_i2stat(inode)->st_size = offset; - - /* - * llu_truncate() is only called from this - * point. llu_vmtruncate/llu_truncate split exists to mimic the - * structure of Linux VFS truncate code path. - */ - - llu_truncate(inode, flags); - - return 0; -} diff --git a/lustre/liblustre/llite_cl.c b/lustre/liblustre/llite_cl.c new file mode 100644 index 0000000..ed19dd3 --- /dev/null +++ b/lustre/liblustre/llite_cl.c @@ -0,0 +1,835 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * Copyright (c) 2007 Cluster File Systems, Inc. + * Author: Nikita Danilov + * + * This file is part of Lustre, http://www.lustre.org. + * + * Lustre is free software; you can redistribute it and/or + * modify it under the terms of version 2 of the GNU General Public + * License as published by the Free Software Foundation. + * + * Lustre is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + * GNU General Public License for more details. + * + * You should have received a copy of the GNU General Public License + * along with Lustre; if not, write to the Free Software + * Foundation, Inc., 675 Mass Ave, Cambridge, MA 02139, USA. + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#include +#include +#include +#include +#include +#include +#include +#include +#ifndef __CYGWIN__ +# include +#else +# include +#endif + +#include +#ifdef HAVE_XTIO_H +#include +#endif +#include +#include +#include +#ifdef HAVE_FILE_H +#include +#endif +#include + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "llite_lib.h" + +/* + * slp_ prefix stands for "Sysio Library Posix". It corresponds to historical + * "llu_" prefix. 
+ */ + +static int slp_type_init (struct lu_device_type *t); +static void slp_type_fini (struct lu_device_type *t); + +static struct cl_page * slp_page_init(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage); +static int slp_attr_get (const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr); + +static struct lu_device *slp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg); + +static int slp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +static struct slp_io *cl2slp_io(const struct lu_env *env, + const struct cl_io_slice *slice); + + +static void llu_free_user_page(struct page *page); + +static const struct lu_object_operations slp_lu_obj_ops; +static const struct lu_device_operations slp_lu_ops; +static const struct cl_device_operations slp_cl_ops; +static const struct cl_io_operations ccc_io_ops; +static const struct lu_device_type_operations slp_device_type_ops; + //struct lu_device_type slp_device_type; +static const struct cl_page_operations slp_page_ops; +static const struct cl_page_operations slp_transient_page_ops; +static const struct cl_lock_operations slp_lock_ops; + + +/***************************************************************************** + * + * Slp device and device type functions. + * + */ + +void *slp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct slp_session *session; + + OBD_ALLOC_PTR(session); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +void slp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct slp_session *session = data; + OBD_FREE_PTR(session); +} + +struct lu_context_key slp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = slp_session_key_init, + .lct_fini = slp_session_key_fini +}; + +/* type constructor/destructor: slp_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(slp, &ccc_key, &ccc_session_key, &slp_session_key); + +static struct lu_device *slp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + return ccc_device_alloc(env, t, cfg, &slp_lu_ops, &slp_cl_ops); +} + +static int slp_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io) +{ + return ccc_lock_init(env, obj, lock, io, &slp_lock_ops); +} + +static const struct cl_object_operations slp_ops = { + .coo_page_init = slp_page_init, + .coo_lock_init = slp_lock_init, + .coo_io_init = slp_io_init, + .coo_attr_get = slp_attr_get, + .coo_attr_set = ccc_attr_set, + .coo_conf_set = ccc_conf_set, + .coo_glimpse = ccc_object_glimpse +}; + +static int slp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct ccc_object *obj = lu2ccc(o); + struct inode *inode = obj->cob_inode; + struct intnl_stat *st = NULL; + + if (inode) + st = llu_i2stat(inode); + + return (*p)(env, cookie, LUSTRE_SLP_NAME"-object@%p(%p:%lu/%u)", + obj, inode, + st ? (unsigned long)st->st_ino : 0UL, + inode ? 
(unsigned int)llu_i2info(inode)->lli_st_generation + : 0); +} + +static const struct lu_object_operations slp_lu_obj_ops = { + .loo_object_init = ccc_object_init, + .loo_object_start = NULL, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = ccc_object_free, + .loo_object_print = slp_object_print, + .loo_object_invariant = NULL +}; + +static struct lu_object *slp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + return ccc_object_alloc(env, hdr, dev, &slp_ops, &slp_lu_obj_ops); +} + +static const struct lu_device_operations slp_lu_ops = { + .ldo_object_alloc = slp_object_alloc +}; + +static const struct cl_device_operations slp_cl_ops = { + .cdo_req_init = ccc_req_init +}; + +static const struct lu_device_type_operations slp_device_type_ops = { + .ldto_init = slp_type_init, + .ldto_fini = slp_type_fini, + + .ldto_start = slp_type_start, + .ldto_stop = slp_type_stop, + + .ldto_device_alloc = slp_device_alloc, + .ldto_device_free = ccc_device_free, + .ldto_device_init = ccc_device_init, + .ldto_device_fini = ccc_device_fini +}; + +struct lu_device_type slp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_SLP_NAME, + .ldt_ops = &slp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +int slp_global_init(void) +{ + int result; + + result = ccc_global_init(&slp_device_type); + return result; +} + +void slp_global_fini(void) +{ + ccc_global_fini(&slp_device_type); +} + +/***************************************************************************** + * + * Object operations. + * + */ + +static struct cl_page *slp_page_init(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage) +{ + struct ccc_page *cpg; + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + OBD_ALLOC_PTR(cpg); + if (cpg != NULL) { + cpg->cpg_page = vmpage; + + if (page->cp_type == CPT_CACHEABLE) { + LBUG(); + } else { + struct ccc_object *clobj = cl2ccc(obj); + + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &slp_transient_page_ops); + clobj->cob_transient_pages++; + } + result = 0; + } else + result = -ENOMEM; + return ERR_PTR(result); +} + +static int slp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct ccc_io *vio = ccc_env_io(env); + int result = 0; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + cl_io_slice_add(io, &vio->cui_cl, obj, &ccc_io_ops); + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + size_t count; + + count = io->u.ci_rw.crw_count; + /* "If nbyte is 0, read() will return 0 and have no other + * results." -- Single Unix Spec */ + if (count == 0) + result = 1; + else { + vio->cui_tot_count = count; + vio->cui_tot_nrsegs = 0; + } + } + return result; +} + +static int slp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = ccc_object_inode(obj); + struct intnl_stat *st = llu_i2stat(inode); + + attr->cat_size = st->st_size; + attr->cat_blocks = st->st_blocks; + attr->cat_mtime = st->st_mtime; + attr->cat_atime = st->st_atime; + attr->cat_ctime = st->st_ctime; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +/***************************************************************************** + * + * Page operations.
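liblustre runs without a page cache, which is why slp_page_init() above refuses CPT_CACHEABLE pages with LBUG(): every page is transient, created for exactly one I/O and destroyed when that I/O's queues are finalized. The whole lifecycle reduces to the sketch below (names invented, plain C):

#include <stdlib.h>

static int transient_pages; /* mirrors the cob_transient_pages bookkeeping */

struct tpage { void *buf; size_t len; }; /* stand-in for a transient page */

static struct tpage *tpage_create(void *buf, size_t len)
{
        struct tpage *p = malloc(sizeof(*p));
        if (p != NULL) {
                p->buf = buf;
                p->len = len;
                transient_pages++;
        }
        return p;
}

static void tpage_destroy(struct tpage *p)
{
        if (p != NULL) {
                transient_pages--;
                free(p); /* no cache: gone as soon as the I/O completes */
        }
}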
+ * + */ + +static void slp_page_fini_common(struct ccc_page *cp) +{ + cfs_page_t *vmpage = cp->cpg_page; + + LASSERT(vmpage != NULL); + llu_free_user_page(vmpage); + OBD_FREE_PTR(cp); +} + +static void slp_page_completion_common(const struct lu_env *env, + struct ccc_page *cp, int ioret) +{ + struct cl_sync_io *anchor = cp->cpg_sync_io; + + if (anchor) { + cp->cpg_sync_io = NULL; + cl_sync_io_note(anchor, ioret); + } else { + LBUG(); + } +} + +static void slp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + ENTRY; + + slp_page_completion_common(env, cp, ioret); + + EXIT; +} + +static void slp_page_completion_write_common(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + + if (ioret == 0) { + cp->cpg_write_queued = 0; + /* + * Only ioret == 0, write succeed, then this page could be + * deleted from the pending_writing count. + */ + } + slp_page_completion_common(env, cp, ioret); +} + +static int slp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return -EBUSY; +} + +static void slp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *clp = slice->cpl_page; + struct ccc_object *clobj = cl2ccc(clp->cp_obj); + + slp_page_fini_common(cp); + clobj->cob_transient_pages--; +} + + +static const struct cl_page_operations slp_transient_page_ops = { + .cpo_own = ccc_transient_page_own, + .cpo_assume = ccc_transient_page_assume, + .cpo_unassume = ccc_transient_page_unassume, + .cpo_disown = ccc_transient_page_disown, + .cpo_discard = ccc_transient_page_discard, + .cpo_vmpage = ccc_page_vmpage, + .cpo_is_vmlocked = slp_page_is_vmlocked, + .cpo_fini = slp_transient_page_fini, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_completion = slp_page_completion_read, + }, + [CRT_WRITE] = { + .cpo_completion = slp_page_completion_write_common, + } + } +}; + +/***************************************************************************** + * + * Lock operations. + * + */ + +static int slp_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *_, __u32 enqflags) +{ + CLOBINVRNT(env, slice->cls_obj, ccc_object_invariant(slice->cls_obj)); + + liblustre_wait_event(0); + return 0; +} + +static const struct cl_lock_operations slp_lock_ops = { + .clo_fini = ccc_lock_fini, + .clo_enqueue = slp_lock_enqueue, + .clo_wait = ccc_lock_wait, + .clo_unuse = ccc_lock_unuse, + .clo_fits_into = ccc_lock_fits_into, +}; + +/***************************************************************************** + * + * io operations. 
+ * + */ + +static int slp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (!io->u.ci_wr.wr_append) { // No lock without O_APPEND in liblustre + return 0; + } + + result = ccc_io_one_lock(env, io, 0, mode, start, end); + + return result; +} + +static int slp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + + return slp_io_rw_lock(env, io, CLM_WRITE, start, end); + +} + +static int slp_io_trunc_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + return 0; +} + +static int slp_io_trunc_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + return 0; +} + +static struct page *llu_get_user_page(int index, void *addr, int offset, + int count) +{ + struct page *page; + + OBD_ALLOC_PTR(page); + if (!page) + return NULL; + page->index = index; + page->addr = addr; + page->_offset = offset; + page->_count = count; + + CFS_INIT_LIST_HEAD(&page->list); + CFS_INIT_LIST_HEAD(&page->_node); + + return page; +} + +static void llu_free_user_page(struct page *page) +{ + OBD_FREE_PTR(page); +} + +static int llu_queue_pio(const struct lu_env *env, struct cl_io *io, + struct llu_io_group *group, + char *buf, size_t count, loff_t pos) +{ + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct intnl_stat *st = llu_i2stat(inode); + struct obd_export *exp = llu_i2obdexp(inode); + struct page *page; + int rc = 0, npages = 0, ret_bytes = 0; + int local_lock; + struct cl_page *clp; + struct ccc_page *clup; + struct cl_2queue *queue; + struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io; + ENTRY; + + if (!exp) + RETURN(-EINVAL); + + local_lock = group->lig_params->lrp_lock_mode != LCK_NL; + + queue = &io->ci_queue; + cl_2queue_init(queue); + + + /* prepare the pages array */ + do { + unsigned long index, offset, bytes; + + offset = (pos & ~CFS_PAGE_MASK); + index = pos >> CFS_PAGE_SHIFT; + bytes = CFS_PAGE_SIZE - offset; + if (bytes > count) + bytes = count; + + /* prevent read beyond file range */ + if (/* local_lock && */ + io->ci_type == CIT_READ && pos + bytes >= st->st_size) { + if (pos >= st->st_size) + break; + bytes = st->st_size - pos; + } + + /* prepare page for this index */ + page = llu_get_user_page(index, buf - offset, offset, bytes); + if (!page) { + rc = -ENOMEM; + break; + } + + clp = cl_page_find(env, obj, + cl_index(obj, pos), + page, CPT_TRANSIENT); + + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + clup = cl2ccc_page(cl_page_at(clp, &slp_device_type)); + clup->cpg_sync_io = anchor; + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. 
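Two details above deserve a note: slp_io_rw_lock() takes no DLM lock at all unless the write is O_APPEND (liblustre otherwise relies on server-side locking), and slp_io_write_lock() widens an append to the whole file because the final offset is unknown until the size is stable. The extent choice in isolation, assuming only the arithmetic:

#include <stdint.h>

#define OBD_OBJECT_EOF ((uint64_t)-1)

struct extent { uint64_t start, end; };

/* O_APPEND cannot know its final offset until the size is stable, so it
 * must lock the whole file; a plain write locks exactly its byte range */
static struct extent write_lock_extent(int append, uint64_t pos,
                                       uint64_t count)
{
        struct extent e;

        if (append) {
                e.start = 0;
                e.end = OBD_OBJECT_EOF;
        } else {
                e.start = pos;
                e.end = pos + count - 1;
        }
        return e;
}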
*/ + cl_page_put(env, clp); + + cl_page_clip(env, clp, offset, offset+bytes); + + npages++; + count -= bytes; + pos += bytes; + buf += bytes; + + group->lig_rwcount += bytes; + ret_bytes += bytes; + page++; + } while (count); + + cl_sync_io_init(anchor, npages); + /* printk("Inited anchor with %d pages\n", npages); */ + + if (rc == 0) { + rc = cl_io_submit_rw(env, io, + io->ci_type == CIT_READ ? CRT_READ : + CRT_WRITE, + queue); + if (rc == 0) { + /* If some pages weren't sent for any reason, count + * then as completed, to avoid infinite wait. */ + cl_page_list_for_each(clp, &queue->c2_qin) { + CL_PAGE_DEBUG(D_ERROR, env, clp, + "not completed\n"); + cl_sync_io_note(anchor, +1); + } + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, anchor); + } + } + + group->lig_rc = rc; + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + + RETURN(ret_bytes); +} + +static +struct llu_io_group * get_io_group(struct inode *inode, int maxpages, + struct lustre_rw_params *params) +{ + struct llu_io_group *group; + + OBD_ALLOC_PTR(group); + if (!group) + return ERR_PTR(-ENOMEM); + + group->lig_params = params; + + return group; +} + +static int max_io_pages(ssize_t len, int iovlen) +{ + return (((len + CFS_PAGE_SIZE -1) / CFS_PAGE_SIZE) + 2 + iovlen - 1); +} + +void put_io_group(struct llu_io_group *group) +{ + OBD_FREE_PTR(group); +} + +static int slp_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + int err, ret; + loff_t pos; + size_t cnt; + struct llu_io_group *iogroup; + struct lustre_rw_params p = {0}; + int iovidx; + struct intnl_stat *st = llu_i2stat(inode); + struct llu_inode_info *lli = llu_i2info(inode); + struct llu_io_session *session = cl2slp_io(env, ios)->sio_session; + int write = io->ci_type == CIT_WRITE; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + if (write) { + pos = io->u.ci_wr.wr.crw_pos; + cnt = io->u.ci_wr.wr.crw_count; + } else { + pos = io->u.ci_rd.rd.crw_pos; + cnt = io->u.ci_rd.rd.crw_count; + } + if (io->u.ci_wr.wr_append) { + p.lrp_lock_mode = LCK_PW; + } else { + p.lrp_brw_flags = OBD_BRW_SRVLOCK; + p.lrp_lock_mode = LCK_NL; + } + + iogroup = get_io_group(inode, max_io_pages(cnt, cio->cui_nrsegs), &p); + if (IS_ERR(iogroup)) + RETURN(PTR_ERR(iogroup)); + + err = ccc_prep_size(env, obj, io, pos + cnt - 1, 0); + if (err != 0) + GOTO(out, err); + + CDEBUG(D_INODE, + "%s ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n", + write?"Write":"Read", (unsigned long)st->st_ino, + cnt, (__u64)pos, (__u64)st->st_size); + + if (write && io->u.ci_wr.wr_append) + pos = io->u.ci_wr.wr.crw_pos = st->st_size; /* XXX? Do we need to change io content too here? */ + /* XXX What about if one write syscall writes at 2 different offsets? 
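The cl_sync_io anchor threaded through llu_queue_pio() is a countdown latch: it is armed with the number of pages, every page notes exactly once (on completion, or explicitly for pages that were never submitted), and the waiter returns once the count reaches zero, which is why the unsent pages in the submit path must be noted to avoid an infinite wait. A single-threaded model of that contract, with invented names; the real cl_sync_io_wait() blocks instead of asserting:

#include <assert.h>

struct sync_anchor { int nr_pending; int rc; };

static void anchor_init(struct sync_anchor *a, int npages)
{
        a->nr_pending = npages;
        a->rc = 0;
}

/* called once per page, whether it completed or was never sent */
static void anchor_note(struct sync_anchor *a, int ioret)
{
        if (ioret < 0 && a->rc == 0)
                a->rc = ioret; /* remember the first failure */
        a->nr_pending--;
}

static int anchor_wait(const struct sync_anchor *a)
{
        assert(a->nr_pending == 0); /* the real code waits here instead */
        return a->rc;
}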
*/ + + for (iovidx = 0; iovidx < cio->cui_nrsegs; iovidx++) { + char *buf = (char *) cio->cui_iov[iovidx].iov_base; + size_t count = cio->cui_iov[iovidx].iov_len; + + if (!count) + continue; + if (cnt < count) + count = cnt; + if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) { + GOTO(out, err = -EFAULT); + } + + if (io->ci_type == CIT_READ) { + if (/* local_lock && */ pos >= st->st_size) + break; + } else if (io->ci_type == CIT_WRITE) { + if (pos >= lli->lli_maxbytes) { + GOTO(out, err = -EFBIG); + } + if (pos + count >= lli->lli_maxbytes) + count = lli->lli_maxbytes - pos; + } else { + LBUG(); + } + + ret = llu_queue_pio(env, io, iogroup, buf, count, pos); + if (ret < 0) { + GOTO(out, err = ret); + } else { + io->ci_nob += ret; + pos += ret; + cnt -= ret; + if (io->ci_type == CIT_WRITE) { +// obd_adjust_kms(exp, lsm, pos, 0); // XXX + if (pos > st->st_size) + st->st_size = pos; + } + if (!cnt) + break; + } + } + LASSERT(cnt == 0 || io->ci_type == CIT_READ); /* libsysio should guarantee this */ + + session->lis_groups[session->lis_ngroups++] = iogroup; + + return 0; +out: + put_io_group(iogroup); + return err; +} + +static const struct cl_io_operations ccc_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = ccc_io_fini, + .cio_start = slp_io_start, + .cio_end = ccc_io_end + }, + [CIT_WRITE] = { + .cio_fini = ccc_io_fini, + .cio_lock = slp_io_write_lock, + .cio_start = slp_io_start, + .cio_end = ccc_io_end + }, + [CIT_TRUNC] = { + .cio_fini = ccc_io_fini, + .cio_iter_init = slp_io_trunc_iter_init, + .cio_start = slp_io_trunc_start + }, + [CIT_MISC] = { + .cio_fini = ccc_io_fini + } + } +}; + +static struct slp_io *cl2slp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + /* We call it just for assertion here */ + cl2ccc_io(env, slice); + + return slp_env_io(env); +} + +/***************************************************************************** + * + * Temporary prototype thing: mirror obd-devices into cl devices. + * + */ + +int cl_sb_init(struct llu_sb_info *sbi) +{ + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + int refcheck; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + cl = cl_type_setup(env, NULL, &slp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (IS_ERR(cl)) + GOTO(out, rc = PTR_ERR(cl)); + + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; +out: + cl_env_put(env, &refcheck); + RETURN(rc); +} + +int cl_sb_fini(struct llu_sb_info *sbi) +{ + struct lu_env *env; + int refcheck; + + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + if (sbi->ll_cl != NULL) { + cl_stack_fini(env, sbi->ll_cl); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + /* + * If mount failed (sbi->ll_cl == NULL), and this there are no other + * mounts, stop device types manually (this usually happens + * automatically when last device is destroyed). 
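The iovec walk in slp_io_start() above follows a fixed shape: skip empty segments, clamp each segment to what is left of the request (and to lli_maxbytes for writes), queue it, then advance the position and remaining count by what was actually queued. Stripped of the Lustre types, the loop looks like this; queue_segment() is a stand-in for the real submission:

#include <stdio.h>
#include <sys/uio.h>

static long queue_segment(const char *buf, size_t count, long pos)
{
        (void)buf; (void)pos;
        return (long)count; /* pretend everything was queued */
}

static long rw_iovec(const struct iovec *iov, int nrsegs, long pos,
                     size_t cnt)
{
        long nob = 0;

        for (int i = 0; i < nrsegs && cnt > 0; i++) {
                size_t count = iov[i].iov_len;
                long ret;

                if (count == 0)
                        continue;
                if (count > cnt)
                        count = cnt; /* clamp to the request size */
                ret = queue_segment(iov[i].iov_base, count, pos);
                if (ret < 0)
                        return ret;
                nob += ret;
                pos += ret;
                cnt -= ret;
        }
        return nob;
}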
+ */ + lu_types_stop(); + cl_env_cache_purge(~0); + RETURN(0); +} diff --git a/lustre/liblustre/llite_lib.c b/lustre/liblustre/llite_lib.c index 31ee23b..232ce2b 100644 --- a/lustre/liblustre/llite_lib.c +++ b/lustre/liblustre/llite_lib.c @@ -67,6 +67,8 @@ #include "lutil.h" #include "llite_lib.h" +int slp_global_init(void); + static int lllib_init(void) { if (liblustre_init_current("liblustre") || @@ -77,7 +79,8 @@ static int lllib_init(void) lmv_init() || mdc_init() || lov_init() || - osc_init()) + osc_init() || + slp_global_init()) return -1; return _sysio_fssw_register("lustre", &llu_fssw_ops); diff --git a/lustre/liblustre/llite_lib.h b/lustre/liblustre/llite_lib.h index 57b98fc..931651e 100644 --- a/lustre/liblustre/llite_lib.h +++ b/lustre/liblustre/llite_lib.h @@ -47,7 +47,11 @@ #include #include -/* This should not be "optimized" use ~0ULL because page->index is a long and +/* for struct cl_lock_descr and struct cl_io */ +#include +#include + +/* This should not be "optimized" use ~0ULL because page->index is a long and * 32-bit systems are therefore limited to 16TB in a mapping */ #define PAGE_CACHE_MAXBYTES ((__u64)(~0UL) << CFS_PAGE_SHIFT) @@ -70,6 +74,8 @@ struct llu_sb_info { struct obd_uuid ll_mds_uuid; struct obd_uuid ll_mds_peer_uuid; char *ll_instance; + struct lu_site *ll_site; + struct cl_device *ll_cl; }; #define LL_SBI_NOLCK 0x1 @@ -109,8 +115,10 @@ struct llu_inode_info { /* not for stat, change it later */ int lli_st_flags; unsigned long lli_st_generation; + struct cl_object *lli_clob; }; + static inline struct llu_sb_info *llu_fs2sbi(struct filesys *fs) { return (struct llu_sb_info*)(fs->fs_private); @@ -218,8 +226,7 @@ int ll_parse_mount_target(const char *target, char **mgsnid, extern struct mount_option_s mount_option; /* super.c */ -void llu_update_inode(struct inode *inode, struct mdt_body *body, - struct lov_stripe_md *lmm); +void llu_update_inode(struct inode *inode, struct lustre_md *md); void obdo_to_inode(struct inode *dst, struct obdo *src, obd_flag valid); void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid); int ll_it_open_error(int phase, struct lookup_intent *it); @@ -253,11 +260,7 @@ int llu_objects_destroy(struct ptlrpc_request *request, struct inode *dir); int llu_iop_read(struct inode *ino, struct ioctx *ioctxp); int llu_iop_write(struct inode *ino, struct ioctx *ioctxp); int llu_iop_iodone(struct ioctx *ioctxp); -int llu_local_size(struct inode *inode); int llu_glimpse_size(struct inode *inode); -int llu_extent_lock_cancel_cb(struct ldlm_lock *lock, - struct ldlm_lock_desc *new, void *data, - int flag); int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, struct lov_stripe_md *lsm, int mode, ldlm_policy_data_t *policy, struct lustre_handle *lockh, @@ -278,11 +281,11 @@ int llu_md_blocking_ast(struct ldlm_lock *lock, void *data, int flag); /* dir.c */ -ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep, +ssize_t llu_iop_filldirentries(struct inode *ino, _SYSIO_OFF_T *basep, char *buf, size_t nbytes); /* liblustre/llite_fid.c*/ -unsigned long llu_fid_build_ino(struct llu_sb_info *sbi, +unsigned long llu_fid_build_ino(struct llu_sb_info *sbi, struct lu_fid *fid); /* ext2 related */ @@ -306,6 +309,8 @@ static inline struct ext2_dirent *ext2_next_entry(struct ext2_dirent *p) return (struct ext2_dirent*)((char*) p + le16_to_cpu(p->rec_len)); } +int llu_merge_lvb(struct inode *inode); + static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb) { struct intnl_stat *st = 
llu_i2stat(inode); @@ -316,4 +321,91 @@ static inline void inode_init_lvb(struct inode *inode, struct ost_lvb *lvb) lvb->lvb_ctime = st->st_ctime; } +#define LLU_IO_GROUP_SIZE(x) \ + (sizeof(struct llu_io_group) + \ + (sizeof(struct ll_async_page) + \ + sizeof(cfs_page_t) + \ + llap_cookie_size) * (x)) + +#define LLU_IO_SESSION_SIZE(x) \ + (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *)) + +struct llu_io_session { + struct inode *lis_inode; + int lis_cmd; + int lis_max_groups; + int lis_ngroups; + struct llu_io_group *lis_groups[0]; +}; + +struct llu_io_group +{ + struct lustre_rw_params *lig_params; + int lig_rc; + __u64 lig_rwcount; +}; + +struct llu_io_session; +void put_io_group(struct llu_io_group *group); + +int cl_sb_init(struct llu_sb_info *sbi); +int cl_sb_fini(struct llu_sb_info *sbi); +int cl_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); + +void llu_io_init(struct cl_io *io, struct inode *inode, int write); + +struct slp_io { + struct llu_io_session *sio_session; +}; + +struct slp_session { + struct slp_io ss_ios; +}; + +static inline struct slp_session *slp_env_session(const struct lu_env *env) +{ + extern struct lu_context_key slp_session_key; + struct slp_session *ses; + ses = lu_context_key_get(env->le_ses, &slp_session_key); + LASSERT(ses != NULL); + return ses; +} +static inline struct slp_io *slp_env_io(const struct lu_env *env) +{ + return &slp_env_session(env)->ss_ios; +} + +/* lclient compat stuff */ +#define cl_inode_info llu_inode_info +#define cl_i2info(info) llu_i2info(info) +#define cl_inode_mode(inode) (llu_i2stat(inode)->st_mode) +#define cl_i2sbi llu_i2sbi +#define cl_isize_read(inode) (llu_i2stat(inode)->st_size) +#define cl_isize_write(inode,kms) do{llu_i2stat(inode)->st_size = kms;}while(0) +#define cl_isize_write_nolock(inode,kms) do{llu_i2stat(inode)->st_size = kms;}while(0) + +static inline void cl_isize_lock(struct inode *inode, int lsmlock) +{ +} + +static inline void cl_isize_unlock(struct inode *inode, int lsmlock) +{ +} + +static inline int cl_merge_lvb(struct inode *inode) +{ + return llu_merge_lvb(inode); +} + +#define cl_inode_atime(inode) (llu_i2stat(inode)->st_atime) +#define cl_inode_ctime(inode) (llu_i2stat(inode)->st_ctime) +#define cl_inode_mtime(inode) (llu_i2stat(inode)->st_mtime) + +static inline struct obd_capa *cl_capa_lookup(struct inode *inode, + enum cl_req_type crt) +{ + return NULL; +} + #endif diff --git a/lustre/liblustre/lutil.c b/lustre/liblustre/lutil.c index 0de50cf..fc9bc5d 100644 --- a/lustre/liblustre/lutil.c +++ b/lustre/liblustre/lutil.c @@ -77,14 +77,14 @@ void *inter_module_get(char *arg) return ldlm_namespace_cleanup; else if (!strcmp(arg, "ldlm_replay_locks")) return ldlm_replay_locks; -#ifdef HAVE_QUOTA_SUPPORT - else if (!strcmp(arg, "osc_quota_interface")) - return &osc_quota_interface; else if (!strcmp(arg, "mdc_quota_interface")) return &mdc_quota_interface; + else if (!strcmp(arg, "lmv_quota_interface")) + return &lmv_quota_interface; + else if (!strcmp(arg, "osc_quota_interface")) + return &osc_quota_interface; else if (!strcmp(arg, "lov_quota_interface")) return &lov_quota_interface; -#endif else return NULL; } diff --git a/lustre/liblustre/namei.c b/lustre/liblustre/namei.c index 2857baf..cfd7100 100644 --- a/lustre/liblustre/namei.c +++ b/lustre/liblustre/namei.c @@ -212,7 +212,7 @@ static int pnode_revalidate_finish(struct ptlrpc_request *req, if (rc) RETURN(rc); - llu_update_inode(inode, md.body, md.lsm); + llu_update_inode(inode, &md); 
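The "lclient compat" block defined above is what lets the shared lustre/lclient code be compiled into both llite and liblustre: the common code is written against cl_-prefixed accessors, and each client maps them onto its own inode representation with macros. A toy version of the same trick; llu_inode here is illustrative, not the real structure:

#include <stdio.h>

struct llu_inode { long st_size; }; /* liblustre-style inode stand-in */

#define cl_isize_read(inode) ((inode)->st_size)
#define cl_isize_write(inode, kms) \
        do { (inode)->st_size = (kms); } while (0)

int main(void)
{
        struct llu_inode ino = { 0 };

        cl_isize_write(&ino, 4096); /* shared code sees one accessor API */
        printf("size=%ld\n", cl_isize_read(&ino));
        return 0;
}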
RETURN(rc); } @@ -381,7 +381,7 @@ static int lookup_it_finish(struct ptlrpc_request *request, int offset, /* bug 2334: drop MDS lock before acquiring OST lock */ ll_intent_drop_lock(it); - rc = llu_glimpse_size(inode); + rc = cl_glimpse_size(inode); if (rc) { I_RELE(inode); RETURN(rc); diff --git a/lustre/liblustre/rw.c b/lustre/liblustre/rw.c index 53bda1f..21d8e42 100644 --- a/lustre/liblustre/rw.c +++ b/lustre/liblustre/rw.c @@ -65,37 +65,6 @@ #include "llite_lib.h" -struct llu_io_group -{ - struct obd_io_group *lig_oig; - struct inode *lig_inode; - struct lustre_rw_params *lig_params; - int lig_maxpages; - int lig_npages; - __u64 lig_rwcount; - struct ll_async_page *lig_llaps; - cfs_page_t *lig_pages; - void *lig_llap_cookies; -}; - -#define LLU_IO_GROUP_SIZE(x) \ - (sizeof(struct llu_io_group) + \ - (sizeof(struct ll_async_page) + \ - sizeof(cfs_page_t) + \ - llap_cookie_size) * (x)) - -struct llu_io_session -{ - struct inode *lis_inode; - int lis_cmd; - int lis_max_groups; - int lis_ngroups; - struct llu_io_group *lis_groups[0]; -}; -#define LLU_IO_SESSION_SIZE(x) \ - (sizeof(struct llu_io_session) + (x) * 2 * sizeof(void *)) - - typedef ssize_t llu_file_piov_t(const struct iovec *iovec, int iovlen, _SYSIO_OFF_T pos, ssize_t len, void *private); @@ -177,7 +146,7 @@ int llu_extent_lock_cancel_cb(struct ldlm_lock *lock, if (lsm->lsm_oinfo[stripe]->loi_kms != kms) LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64, lsm->lsm_oinfo[stripe]->loi_kms, kms); - lsm->lsm_oinfo[stripe]->loi_kms = kms; + loi_kms_set(lsm->lsm_oinfo[stripe], kms); iput: I_RELE(inode); break; @@ -222,7 +191,7 @@ static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp) lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB); lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms; - LDLM_DEBUG(lock, "i_size: "LPU64" -> stripe number %u -> kms "LPU64, + LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64, (__u64)llu_i2stat(inode)->st_size, stripe,lvb->lvb_size); iput: I_RELE(inode); @@ -236,7 +205,7 @@ static int llu_glimpse_callback(struct ldlm_lock *lock, void *reqp) return rc; } -static int llu_merge_lvb(struct inode *inode) +int llu_merge_lvb(struct inode *inode) { struct llu_inode_info *lli = llu_i2info(inode); struct llu_sb_info *sbi = llu_i2sbi(inode); @@ -259,81 +228,6 @@ static int llu_merge_lvb(struct inode *inode) RETURN(rc); } -int llu_local_size(struct inode *inode) -{ - ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } }; - struct llu_inode_info *lli = llu_i2info(inode); - struct llu_sb_info *sbi = llu_i2sbi(inode); - struct lustre_handle lockh = { 0 }; - int flags = 0; - int rc; - ENTRY; - - if (lli->lli_smd->lsm_stripe_count == 0) - RETURN(0); - - rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT, - &policy, LCK_PR, &flags, inode, &lockh); - if (rc < 0) - RETURN(rc); - else if (rc == 0) - RETURN(-ENODATA); - - rc = llu_merge_lvb(inode); - obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh); - RETURN(rc); -} - -/* NB: lov_merge_size will prefer locally cached writes if they extend the - * file (because it prefers KMS over RSS when larger) */ -int llu_glimpse_size(struct inode *inode) -{ - struct llu_inode_info *lli = llu_i2info(inode); - struct intnl_stat *st = llu_i2stat(inode); - struct llu_sb_info *sbi = llu_i2sbi(inode); - struct lustre_handle lockh = { 0 }; - struct ldlm_enqueue_info einfo = { 0 }; - struct obd_info oinfo = { { { 0 } } }; - int rc; - ENTRY; - - /* If size is cached on the mds, skip glimpse. 
*/ - if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) - RETURN(0); - - CDEBUG(D_DLMTRACE, "Glimpsing inode "LPU64"\n", (__u64)st->st_ino); - - if (!lli->lli_smd) { - CDEBUG(D_DLMTRACE, "No objects for inode "LPU64"\n", - (__u64)st->st_ino); - RETURN(0); - } - - einfo.ei_type = LDLM_EXTENT; - einfo.ei_mode = LCK_PR; - einfo.ei_cb_bl = osc_extent_blocking_cb; - einfo.ei_cb_cp = ldlm_completion_ast; - einfo.ei_cb_gl = llu_glimpse_callback; - einfo.ei_cbdata = inode; - - oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; - oinfo.oi_lockh = &lockh; - oinfo.oi_md = lli->lli_smd; - oinfo.oi_flags = LDLM_FL_HAS_INTENT; - - rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo); - if (rc) { - CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc); - RETURN(rc > 0 ? -EIO : rc); - } - - rc = llu_merge_lvb(inode); - CDEBUG(D_DLMTRACE, "glimpse: size: "LPU64", blocks: "LPU64"\n", - (__u64)st->st_size, (__u64)st->st_blocks); - - RETURN(rc); -} - int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, struct lov_stripe_md *lsm, int mode, ldlm_policy_data_t *policy, struct lustre_handle *lockh, @@ -356,12 +250,12 @@ int llu_extent_lock(struct ll_file_data *fd, struct inode *inode, RETURN(0); CDEBUG(D_DLMTRACE, "Locking inode %llu, start "LPU64" end "LPU64"\n", - (unsigned long long)st->st_ino, policy->l_extent.start, + (__u64)st->st_ino, policy->l_extent.start, policy->l_extent.end); einfo.ei_type = LDLM_EXTENT; einfo.ei_mode = mode; - einfo.ei_cb_bl = osc_extent_blocking_cb; + einfo.ei_cb_bl = llu_extent_lock_cancel_cb; einfo.ei_cb_cp = ldlm_completion_ast; einfo.ei_cb_gl = llu_glimpse_callback; einfo.ei_cbdata = inode; @@ -411,278 +305,6 @@ int llu_extent_unlock(struct ll_file_data *fd, struct inode *inode, RETURN(rc); } -#define LLAP_MAGIC 12346789 - -struct ll_async_page { - int llap_magic; - void *llap_cookie; - int llap_queued; - cfs_page_t *llap_page; - struct inode *llap_inode; -}; - -static inline struct ll_async_page *llap_from_cookie(void *ptr) -{ - struct ll_async_page *ap = ptr; - LASSERT(ap->llap_magic == LLAP_MAGIC); - return ap; -} - -static void llu_ap_fill_obdo(void *data, int cmd, struct obdo *oa) -{ - struct ll_async_page *llap; - struct inode *inode; - struct lov_stripe_md *lsm; - obd_flag valid_flags; - ENTRY; - - llap = llap_from_cookie(data); - inode = llap->llap_inode; - lsm = llu_i2info(inode)->lli_smd; - - oa->o_id = lsm->lsm_object_id; - oa->o_valid = OBD_MD_FLID; - valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME; - if (cmd & OBD_BRW_WRITE) - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLUID | OBD_MD_FLGID | - OBD_MD_FLFID | OBD_MD_FLGENER; - - obdo_from_inode(oa, inode, valid_flags); - EXIT; -} - -static void llu_ap_update_obdo(void *data, int cmd, struct obdo *oa, - obd_valid valid) -{ - struct ll_async_page *llap; - ENTRY; - - llap = llap_from_cookie(data); - obdo_from_inode(oa, llap->llap_inode, valid); - - EXIT; -} - -/* called for each page in a completed rpc.*/ -static int llu_ap_completion(void *data, int cmd, struct obdo *oa, int rc) -{ - struct ll_async_page *llap; - cfs_page_t *page; - ENTRY; - - llap = llap_from_cookie(data); - llap->llap_queued = 0; - page = llap->llap_page; - - if (rc != 0) { - if (cmd & OBD_BRW_WRITE) - CERROR("writeback error on page %p index %ld: %d\n", - page, page->index, rc); - } - RETURN(0); -} - -static struct obd_capa * llu_ap_lookup_capa(void *data, int cmd) -{ - return NULL; -} - -static struct obd_async_page_ops llu_async_page_ops = { - .ap_make_ready = NULL, - .ap_refresh_count = NULL, - .ap_fill_obdo = llu_ap_fill_obdo, 
- .ap_update_obdo = llu_ap_update_obdo, - .ap_completion = llu_ap_completion, - .ap_lookup_capa = llu_ap_lookup_capa, -}; - -static int llu_queue_pio(int cmd, struct llu_io_group *group, - char *buf, size_t count, loff_t pos) -{ - struct llu_inode_info *lli = llu_i2info(group->lig_inode); - struct intnl_stat *st = llu_i2stat(group->lig_inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_export *exp = llu_i2obdexp(group->lig_inode); - cfs_page_t *pages = &group->lig_pages[group->lig_npages],*page = pages; - struct ll_async_page *llap = &group->lig_llaps[group->lig_npages]; - void *llap_cookie = group->lig_llap_cookies + - llap_cookie_size * group->lig_npages; - int i, rc, npages = 0, ret_bytes = 0; - int local_lock; - ENTRY; - - if (!exp) - RETURN(-EINVAL); - - local_lock = group->lig_params->lrp_lock_mode != LCK_NL; - /* prepare the pages array */ - do { - unsigned long index, offset, bytes; - - offset = (pos & ~CFS_PAGE_MASK); - index = pos >> CFS_PAGE_SHIFT; - bytes = CFS_PAGE_SIZE - offset; - if (bytes > count) - bytes = count; - - /* prevent read beyond file range */ - if (/* local_lock && */ - cmd == OBD_BRW_READ && pos + bytes >= st->st_size) { - if (pos >= st->st_size) - break; - bytes = st->st_size - pos; - } - - /* prepare page for this index */ - page->index = index; - page->addr = buf - offset; - - page->_offset = offset; - page->_count = bytes; - - page++; - npages++; - count -= bytes; - pos += bytes; - buf += bytes; - - group->lig_rwcount += bytes; - ret_bytes += bytes; - } while (count); - - group->lig_npages += npages; - - for (i = 0, page = pages; i < npages; - i++, page++, llap++, llap_cookie += llap_cookie_size){ - llap->llap_magic = LLAP_MAGIC; - llap->llap_cookie = llap_cookie; - rc = obd_prep_async_page(exp, lsm, NULL, page, - (obd_off)page->index << CFS_PAGE_SHIFT, - &llu_async_page_ops, - llap, &llap->llap_cookie, - 1 /* no cache in liblustre at all */, - NULL); - if (rc) { - LASSERT(rc < 0); - llap->llap_cookie = NULL; - RETURN(rc); - } - - CDEBUG(D_CACHE, "llap %p page %p group %p obj off "LPU64"\n", - llap, page, llap->llap_cookie, - (obd_off)pages->index << CFS_PAGE_SHIFT); - page->private = (unsigned long)llap; - llap->llap_page = page; - llap->llap_inode = group->lig_inode; - - rc = obd_queue_group_io(exp, lsm, NULL, group->lig_oig, - llap->llap_cookie, cmd, - page->_offset, page->_count, - group->lig_params->lrp_brw_flags, - ASYNC_READY | ASYNC_URGENT | - ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC); - if (!local_lock && cmd == OBD_BRW_READ) { - /* - * In OST-side locking case short reads cannot be - * detected properly. - * - * The root of the problem is that - * - * kms = lov_merge_size(lsm, 1); - * if (end >= kms) - * glimpse_size(inode); - * else - * st->st_size = kms; - * - * logic in the read code (both llite and liblustre) - * only works correctly when client holds DLM lock on - * [start, end]. Without DLM lock KMS can be - * completely out of date, and client can either make - * spurious short-read (missing concurrent write), or - * return stale data (missing concurrent - * truncate). For llite client this is fatal, because - * incorrect data are cached and can be later sent - * back to the server (vide bug 5047). This is hard to - * fix by handling short-reads on the server, as there - * is no easy way to communicate file size (or amount - * of bytes read/written) back to the client, - * _especially_ because OSC pages can be sliced and - * dices into multiple RPCs arbitrary. 
Fortunately, - * liblustre doesn't cache data and the worst case is - * that we get race with concurrent write or truncate. - */ - } - if (rc) { - LASSERT(rc < 0); - RETURN(rc); - } - - llap->llap_queued = 1; - } - - RETURN(ret_bytes); -} - -static -struct llu_io_group * get_io_group(struct inode *inode, int maxpages, - struct lustre_rw_params *params) -{ - struct llu_io_group *group; - int rc; - - if (!llap_cookie_size) - llap_cookie_size = obd_prep_async_page(llu_i2obdexp(inode), - NULL, NULL, NULL, 0, - NULL, NULL, NULL, 0, - NULL); - - OBD_ALLOC(group, LLU_IO_GROUP_SIZE(maxpages)); - if (!group) - return ERR_PTR(-ENOMEM); - - I_REF(inode); - group->lig_inode = inode; - group->lig_maxpages = maxpages; - group->lig_params = params; - group->lig_llaps = (struct ll_async_page *)(group + 1); - group->lig_pages = (cfs_page_t *)(&group->lig_llaps[maxpages]); - group->lig_llap_cookies = (void *)(&group->lig_pages[maxpages]); - - rc = oig_init(&group->lig_oig); - if (rc) { - OBD_FREE(group, LLU_IO_GROUP_SIZE(maxpages)); - return ERR_PTR(rc); - } - - return group; -} - -static int max_io_pages(ssize_t len, int iovlen) -{ - return (((len + CFS_PAGE_SIZE -1) / CFS_PAGE_SIZE) + 2 + iovlen - 1); -} - -static -void put_io_group(struct llu_io_group *group) -{ - struct lov_stripe_md *lsm = llu_i2info(group->lig_inode)->lli_smd; - struct obd_export *exp = llu_i2obdexp(group->lig_inode); - struct ll_async_page *llap = group->lig_llaps; - int i; - - for (i = 0; i < group->lig_npages; i++, llap++) { - if (llap->llap_cookie) - obd_teardown_async_page(exp, lsm, NULL, - llap->llap_cookie); - } - - I_RELE(group->lig_inode); - - oig_release(group->lig_oig); - OBD_FREE(group, LLU_IO_GROUP_SIZE(group->lig_maxpages)); -} - static ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, _SYSIO_OFF_T pos, ssize_t len, @@ -691,18 +313,11 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, struct llu_io_session *session = (struct llu_io_session *) private; struct inode *inode = session->lis_inode; struct llu_inode_info *lli = llu_i2info(inode); - struct intnl_stat *st = llu_i2stat(inode); - struct ll_file_data *fd = lli->lli_file_data; - struct lustre_handle lockh = {0}; - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_export *exp = NULL; - struct llu_io_group *iogroup; - struct lustre_rw_params p; - struct ost_lvb lvb; - __u64 kms; - int err, is_read, iovidx, ret; - int local_lock; - ssize_t ret_len = len; + int err; + struct lu_env *env; + struct cl_io *io; + struct slp_io *sio; + int refcheck; ENTRY; /* in a large iov read/write we'll be repeatedly called. @@ -710,126 +325,40 @@ ssize_t llu_file_prwv(const struct iovec *iovec, int iovlen, */ liblustre_wait_event(0); - exp = llu_i2obdexp(inode); - if (exp == NULL) - RETURN(-EINVAL); - if (len == 0 || iovlen == 0) RETURN(0); if (pos + len > lli->lli_maxbytes) RETURN(-ERANGE); - lustre_build_lock_params(session->lis_cmd, lli->lli_open_flags, - lli->lli_sbi->ll_lco.lco_flags, - pos, len, &p); - - iogroup = get_io_group(inode, max_io_pages(len, iovlen), &p); - if (IS_ERR(iogroup)) - RETURN(PTR_ERR(iogroup)); - - local_lock = p.lrp_lock_mode != LCK_NL; - - err = llu_extent_lock(fd, inode, lsm, p.lrp_lock_mode, &p.lrp_policy, - &lockh, p.lrp_ast_flags); - if (err != ELDLM_OK) - GOTO(err_put, err); - - is_read = (session->lis_cmd == OBD_BRW_READ); - if (is_read) { - /* - * If OST-side locking is used, KMS can be completely out of - * date, and, hence, cannot be used for short-read - * detection. Rely in OST to handle short reads in that case. 
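The deleted comments above state the invariant twice, so it is worth keeping: under client-side DLM locking, kms (the known minimum size) is authoritative for any read ending strictly below it, while a read that reaches kms needs a glimpse before the short-read/zero-fill decision can be made. As a bare predicate:

#include <stdint.h>

/* end = last byte of the requested extent; kms = known minimum size */
static int read_needs_glimpse(uint64_t end, uint64_t kms)
{
        /* below kms the data provably exists, so kms can serve as i_size;
         * at or beyond kms the true size is unknown without a glimpse */
        return end >= kms;
}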
- */ - inode_init_lvb(inode, &lvb); - obd_merge_lvb(exp, lsm, &lvb, 1); - kms = lvb.lvb_size; - /* extent.end is last byte of the range */ - if (p.lrp_policy.l_extent.end >= kms) { - /* A glimpse is necessary to determine whether - * we return a short read or some zeroes at - * the end of the buffer - * - * In the case of OST-side locking KMS can be - * completely out of date and short-reads maybe - * mishandled. See llu_queue_pio() for more detailed - * comment. - */ - if ((err = llu_glimpse_size(inode))) { - GOTO(err_unlock, err); - } - } else { - st->st_size = kms; - } - } else if (lli->lli_open_flags & O_APPEND) { - pos = st->st_size; - } - - for (iovidx = 0; iovidx < iovlen; iovidx++) { - char *buf = (char *) iovec[iovidx].iov_base; - size_t count = iovec[iovidx].iov_len; - - if (!count) - continue; - if (len < count) - count = len; - if (IS_BAD_PTR(buf) || IS_BAD_PTR(buf + count)) { - GOTO(err_unlock, err = -EFAULT); - } - - if (is_read) { - if (/* local_lock && */ pos >= st->st_size) - break; - } else { - if (pos >= lli->lli_maxbytes) { - GOTO(err_unlock, err = -EFBIG); - } - if (pos + count >= lli->lli_maxbytes) - count = lli->lli_maxbytes - pos; - } - - ret = llu_queue_pio(session->lis_cmd, iogroup, buf, count, pos); - if (ret < 0) { - GOTO(err_unlock, err = ret); - } else { - pos += ret; - if (!is_read) { - LASSERT(ret == count); - obd_adjust_kms(exp, lsm, pos, 0); - /* file size grow immediately */ - if (pos > st->st_size) - st->st_size = pos; - } - len -= ret; - if (!len) - break; - } - } - LASSERT(len == 0 || is_read); /* libsysio should guarantee this */ - - err = obd_trigger_group_io(exp, lsm, NULL, iogroup->lig_oig); - if (err) - GOTO(err_unlock, err); - - err = oig_wait(iogroup->lig_oig); - if (err) { - CERROR("%s error: %s\n", is_read ? "read" : "write", strerror(-err)); - GOTO(err_unlock, err); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &ccc_env_info(env)->cti_io; + + if (cl_io_rw_init(env, io, session->lis_cmd == OBD_BRW_WRITE?CIT_WRITE: + CIT_READ, + pos, len) == 0) { + struct ccc_io *cio; + sio = slp_env_io(env); + cio = ccc_env_io(env); + /* XXX this is not right: cio->cui_iov can be modified. */ + cio->cui_iov = (struct iovec *)iovec; + cio->cui_nrsegs = iovlen; + sio->sio_session = session; + err = cl_io_loop(env, io); + } else { + /* XXX WTF? 
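The rewritten llu_file_prwv() above hands the whole read/write path to the generic client I/O engine; condensed from the hunk (identifiers as in the patch, error handling trimmed), the skeleton is:

env = cl_env_get(&refcheck);                    /* per-thread CLIO env */
io = &ccc_env_info(env)->cti_io;
if (cl_io_rw_init(env, io, cmd == OBD_BRW_WRITE ? CIT_WRITE : CIT_READ,
                  pos, len) == 0) {
        ccc_env_io(env)->cui_iov = (struct iovec *)iovec;
        ccc_env_io(env)->cui_nrsegs = iovlen;
        slp_env_io(env)->sio_session = session;
        err = cl_io_loop(env, io);              /* iterate/lock/start/end */
}
cl_io_fini(env, io);
cl_env_put(env, &refcheck);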
*/ + LBUG(); } + cl_io_fini(env, io); + cl_env_put(env, &refcheck); - ret = llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); - if (ret) - CERROR("extent unlock error %d\n", ret); + if (err < 0) + RETURN(err); - session->lis_groups[session->lis_ngroups++] = iogroup; - RETURN(ret_len); - -err_unlock: - llu_extent_unlock(fd, inode, lsm, p.lrp_lock_mode, &lockh); -err_put: - put_io_group(iogroup); - RETURN((ssize_t)err); + RETURN(len); } static @@ -906,34 +435,91 @@ static int llu_file_rwx(struct inode *ino, RETURN(cc); } +void llu_io_init(struct cl_io *io, struct inode *inode, int write) +{ + struct llu_inode_info *lli = llu_i2info(inode); + + memset(io, 0, sizeof *io); + + io->u.ci_rw.crw_nonblock = lli->lli_open_flags & O_NONBLOCK; + if (write) + io->u.ci_wr.wr_append = lli->lli_open_flags & O_APPEND; + io->ci_obj = llu_i2info(inode)->lli_clob; + + if (lli->lli_open_flags & O_APPEND) + io->ci_lockreq = CILR_MANDATORY; + else + io->ci_lockreq = CILR_NEVER; + +} + int llu_iop_read(struct inode *ino, struct ioctx *ioctx) { - /* BUG: 5972 */ struct intnl_stat *st = llu_i2stat(ino); + struct lu_env *env; + struct cl_io *io; + int refcheck; + int ret; + + /* BUG: 5972 */ st->st_atime = CURRENT_TIME; - return llu_file_rwx(ino, ioctx, 1); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &ccc_env_info(env)->cti_io; + llu_io_init(io, ino, 0); + + ret = llu_file_rwx(ino, ioctx, 1); + + cl_env_put(env, &refcheck); + return ret; } int llu_iop_write(struct inode *ino, struct ioctx *ioctx) { struct intnl_stat *st = llu_i2stat(ino); + struct lu_env *env; + struct cl_io *io; + int refcheck; + int ret; + st->st_mtime = st->st_ctime = CURRENT_TIME; - return llu_file_rwx(ino, ioctx, 0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &ccc_env_info(env)->cti_io; + llu_io_init(io, ino, 1); + + ret = llu_file_rwx(ino, ioctx, 0); + cl_env_put(env, &refcheck); + return ret; } int llu_iop_iodone(struct ioctx *ioctx) { struct llu_io_session *session; struct llu_io_group *group; - int i, err = 0, rc = 0; + int i, rc = 0; + struct lu_env *env; + struct cl_io *io; + int refcheck; ENTRY; liblustre_wait_event(0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &ccc_env_info(env)->cti_io; + cl_io_fini(env, io); + cl_env_put(env, &refcheck); session = (struct llu_io_session *) ioctx->ioctx_private; LASSERT(session); LASSERT(!IS_ERR(session)); @@ -941,11 +527,8 @@ int llu_iop_iodone(struct ioctx *ioctx) for (i = 0; i < session->lis_ngroups; i++) { group = session->lis_groups[i]; if (group) { - if (!rc) { - err = oig_wait(group->lig_oig); - if (err) - rc = err; - } + if (!rc) + rc = group->lig_rc; if (!rc) ioctx->ioctx_cc += group->lig_rwcount; put_io_group(group); diff --git a/lustre/liblustre/super.c b/lustre/liblustre/super.c index 258de2e..ff23356 100644 --- a/lustre/liblustre/super.c +++ b/lustre/liblustre/super.c @@ -112,8 +112,7 @@ static void llu_fsop_gone(struct filesys *fs) ENTRY; list_del(&sbi->ll_conn_chain); - obd_unregister_lock_cancel_cb(sbi->ll_dt_exp, - llu_extent_lock_cancel_cb); + cl_sb_fini(sbi); obd_disconnect(sbi->ll_dt_exp); obd_disconnect(sbi->ll_md_exp); @@ -146,15 +145,23 @@ static ldlm_mode_t llu_take_md_lock(struct inode *inode, __u64 bits, RETURN(rc); } -void llu_update_inode(struct inode *inode, struct mdt_body *body, - struct lov_stripe_md *lsm) +void llu_update_inode(struct inode *inode, struct lustre_md *md) { struct llu_inode_info *lli = llu_i2info(inode); + struct mdt_body *body 
= md->body; + struct lov_stripe_md *lsm = md->lsm; struct intnl_stat *st = llu_i2stat(inode); LASSERT ((lsm != NULL) == ((body->valid & OBD_MD_FLEASIZE) != 0)); + + if (body->valid & OBD_MD_FLMODE) + st->st_mode = (st->st_mode & S_IFMT)|(body->mode & ~S_IFMT); + if (body->valid & OBD_MD_FLTYPE) + st->st_mode = (st->st_mode & ~S_IFMT)|(body->mode & S_IFMT); + if (lsm != NULL) { if (lli->lli_smd == NULL) { + cl_inode_init(inode, md); lli->lli_smd = lsm; lli->lli_maxbytes = lsm->lsm_maxbytes; if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES) @@ -185,11 +192,7 @@ void llu_update_inode(struct inode *inode, struct mdt_body *body, if (body->valid & OBD_MD_FLMTIME) LTIME_S(st->st_mtime) = body->mtime; } - if (body->valid & OBD_MD_FLMODE) - st->st_mode = (st->st_mode & S_IFMT)|(body->mode & ~S_IFMT); - if (body->valid & OBD_MD_FLTYPE) - st->st_mode = (st->st_mode & ~S_IFMT)|(body->mode & S_IFMT); - if (S_ISREG(st->st_mode)) + if (S_ISREG(st->st_mode)) st->st_blksize = min(2UL * PTLRPC_MAX_BRW_SIZE, LL_MAX_BLKSIZE); else st->st_blksize = 4096; @@ -204,13 +207,13 @@ void llu_update_inode(struct inode *inode, struct mdt_body *body, if (body->valid & OBD_MD_FLFLAGS) lli->lli_st_flags = body->flags; if (body->valid & OBD_MD_FLSIZE) { - if ((llu_i2sbi(inode)->ll_lco.lco_flags & OBD_CONNECT_SOM) && + if ((llu_i2sbi(inode)->ll_lco.lco_flags & OBD_CONNECT_SOM) && S_ISREG(st->st_mode) && lli->lli_smd) { struct lustre_handle lockh; ldlm_mode_t mode; - + /* As it is possible a blocking ast has been processed - * by this time, we need to check there is an UPDATE + * by this time, we need to check there is an UPDATE * lock on the client and set LLIF_MDS_SIZE_LOCK holding * it. */ mode = llu_take_md_lock(inode, MDS_INODELOCK_UPDATE, @@ -223,7 +226,7 @@ void llu_update_inode(struct inode *inode, struct mdt_body *body, } else { st->st_size = body->size; } - + if (body->valid & OBD_MD_FLBLOCKS) st->st_blocks = body->blocks; } @@ -503,7 +506,7 @@ static int llu_inode_revalidate(struct inode *inode) } - llu_update_inode(inode, md.body, md.lsm); + llu_update_inode(inode, &md); if (md.lsm != NULL && llu_i2info(inode)->lli_smd != md.lsm) obd_free_memmd(sbi->ll_dt_exp, &md.lsm); ptlrpc_req_finished(req); @@ -515,7 +518,7 @@ static int llu_inode_revalidate(struct inode *inode) /* ll_glimpse_size will prefer locally cached writes if they extend * the file */ - RETURN(llu_glimpse_size(inode)); + RETURN(cl_glimpse_size(inode)); } static void copy_stat_buf(struct inode *ino, struct intnl_stat *b) @@ -583,6 +586,8 @@ void llu_clear_inode(struct inode *inode) obd_change_cbdata(sbi->ll_dt_exp, lli->lli_smd, null_if_equal, inode); + cl_inode_fini(inode); + if (lli->lli_smd) { obd_free_memmd(sbi->ll_dt_exp, &lli->lli_smd); lli->lli_smd = NULL; @@ -675,7 +680,7 @@ int llu_md_setattr(struct inode *inode, struct md_op_data *op_data, * to call vmtruncate in inode_setattr to update inode->i_size * (bug 6196) */ inode_setattr(inode, &op_data->op_attr); - llu_update_inode(inode, md.body, md.lsm); + llu_update_inode(inode, &md); ptlrpc_req_finished(request); RETURN(rc); @@ -729,7 +734,6 @@ static int llu_setattr_done_writing(struct inode *inode, int llu_setattr_raw(struct inode *inode, struct iattr *attr) { struct lov_stripe_md *lsm = llu_i2info(inode)->lli_smd; - struct llu_sb_info *sbi = llu_i2sbi(inode); struct intnl_stat *st = llu_i2stat(inode); int ia_valid = attr->ia_valid; struct md_op_data op_data = { { 0 } }; @@ -766,7 +770,7 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) if ((attr->ia_valid & ATTR_CTIME) && 
!(attr->ia_valid & ATTR_MTIME)) { /* To avoid stale mtime on mds, obtain it from ost and send to mds. */ - rc = llu_glimpse_size(inode); + rc = cl_glimpse_size(inode); if (rc) RETURN(rc); @@ -833,71 +837,11 @@ int llu_setattr_raw(struct inode *inode, struct iattr *attr) } if (ia_valid & ATTR_SIZE) { - ldlm_policy_data_t policy = { .l_extent = {attr->ia_size, - OBD_OBJECT_EOF} }; - struct lustre_handle lockh = { 0, }; - struct lustre_handle match_lockh = { 0, }; - - int err; - int flags = LDLM_FL_TEST_LOCK; /* for assertion check below */ - int lock_mode; - obd_flag obd_flags; - - /* check that there are no matching locks */ - LASSERT(obd_match(sbi->ll_dt_exp, lsm, LDLM_EXTENT, &policy, - LCK_PW, &flags, inode, &match_lockh) <= 0); - - /* XXX when we fix the AST intents to pass the discard-range - * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA - * XXX here. */ - flags = (attr->ia_size == 0) ? LDLM_AST_DISCARD_DATA : 0; - - if (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK) { - lock_mode = LCK_NL; - obd_flags = OBD_FL_TRUNCLOCK; - CDEBUG(D_INODE, "delegating locking to the OST"); - } else { - lock_mode = LCK_PW; - obd_flags = 0; - } - - /* with lock_mode == LK_NL no lock is taken. */ - rc = llu_extent_lock(NULL, inode, lsm, lock_mode, &policy, - &lockh, flags); - if (rc != ELDLM_OK) { - if (rc > 0) - GOTO(out, rc = -ENOLCK); - GOTO(out, rc); - } - rc = llu_vmtruncate(inode, attr->ia_size, obd_flags); - - /* unlock now as we don't mind others file lockers racing with - * the mds updates below? */ - err = llu_extent_unlock(NULL, inode, lsm, lock_mode, &lockh); - if (err) { - CERROR("llu_extent_unlock failed: %d\n", err); - if (!rc) - rc = err; - } + rc = cl_setattr_do_truncate(inode, attr->ia_size, NULL); } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) { - struct obd_info oinfo = { { { 0 } } }; - struct obdo oa; - - CDEBUG(D_INODE, "set mtime on OST inode %llu to "CFS_TIME_T"\n", - (long long)st->st_ino, LTIME_S(attr->ia_mtime)); - oa.o_id = lsm->lsm_object_id; - oa.o_gr = lsm->lsm_object_gr; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLATIME | - OBD_MD_FLMTIME | OBD_MD_FLCTIME); - - oinfo.oi_oa = &oa; - oinfo.oi_md = lsm; - - rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL); - if (rc) - CERROR("obd_setattr_async fails: rc=%d\n", rc); + CDEBUG(D_INODE, "set mtime on OST inode %llu to %lu\n", + (long long unsigned)st->st_ino, LTIME_S(attr->ia_mtime)); + rc = cl_setattr_ost(inode, NULL); } EXIT; out: @@ -976,7 +920,7 @@ static int llu_iop_symlink_raw(struct pnode *pno, const char *tgt) if (llu_i2stat(dir)->st_nlink >= EXT2_LINK_MAX) RETURN(err); - llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, + llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, LUSTRE_OPC_SYMLINK); err = md_create(sbi->ll_md_exp, &op_data, tgt, strlen(tgt) + 1, @@ -1135,7 +1079,7 @@ static int llu_iop_link_raw(struct pnode *old, struct pnode *new) LASSERT(dir); liblustre_wait_event(0); - llu_prep_md_op_data(&op_data, src, dir, name, namelen, 0, + llu_prep_md_op_data(&op_data, src, dir, name, namelen, 0, LUSTRE_OPC_ANY); rc = md_link(llu_i2sbi(src)->ll_md_exp, &op_data, &request); ptlrpc_req_finished(request); @@ -1162,7 +1106,7 @@ static int llu_iop_unlink_raw(struct pnode *pno) LASSERT(target); liblustre_wait_event(0); - llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, + llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, LUSTRE_OPC_ANY); rc = md_unlink(llu_i2sbi(dir)->ll_md_exp, &op_data, &request); if (!rc) @@ -1190,7 +1134,7 @@ 
static int llu_iop_rename_raw(struct pnode *old, struct pnode *new) LASSERT(tgt); liblustre_wait_event(0); - llu_prep_md_op_data(&op_data, src, tgt, NULL, 0, 0, + llu_prep_md_op_data(&op_data, src, tgt, NULL, 0, 0, LUSTRE_OPC_ANY); rc = md_rename(llu_i2sbi(src)->ll_md_exp, &op_data, oldname, oldnamelen, newname, newnamelen, @@ -1337,7 +1281,7 @@ static int llu_iop_mkdir_raw(struct pnode *pno, mode_t mode) if (st->st_nlink >= EXT2_LINK_MAX) RETURN(err); - llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, + llu_prep_md_op_data(&op_data, dir, NULL, name, len, 0, LUSTRE_OPC_MKDIR); err = md_create(llu_i2sbi(dir)->ll_md_exp, &op_data, NULL, 0, @@ -1364,7 +1308,7 @@ static int llu_iop_rmdir_raw(struct pnode *pno) (long long)llu_i2stat(dir)->st_ino, llu_i2info(dir)->lli_st_generation, dir); - llu_prep_md_op_data(&op_data, dir, NULL, name, len, S_IFDIR, + llu_prep_md_op_data(&op_data, dir, NULL, name, len, S_IFDIR, LUSTRE_OPC_ANY); rc = md_unlink(llu_i2sbi(dir)->ll_md_exp, &op_data, &request); ptlrpc_req_finished(request); @@ -1466,9 +1410,9 @@ static int llu_file_flock(struct inode *ino, if (lmv->desc.ld_tgt_count < 1) RETURN(rc = -ENODEV); - + if (lmv->tgts[0].ltd_exp != NULL) - rc = ldlm_cli_enqueue(lmv->tgts[0].ltd_exp, NULL, &einfo, &res_id, + rc = ldlm_cli_enqueue(lmv->tgts[0].ltd_exp, NULL, &einfo, &res_id, &flock, &flags, NULL, 0, NULL, &lockh, 0); else rc = -ENODEV; @@ -1720,7 +1664,7 @@ static int llu_lov_dir_setstripe(struct inode *ino, unsigned long arg) struct lov_user_md lum, *lump = (struct lov_user_md *)arg; int rc = 0; - llu_prep_md_op_data(&op_data, ino, NULL, NULL, 0, 0, + llu_prep_md_op_data(&op_data, ino, NULL, NULL, 0, 0, LUSTRE_OPC_ANY); LASSERT(sizeof(lum) == sizeof(*lump)); @@ -1826,7 +1770,6 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags, if (rc) GOTO(out, rc); - llu_update_inode(ino, md.body, md.lsm); lli->lli_smd = lli2->lli_smd; lli2->lli_smd = NULL; @@ -1842,6 +1785,8 @@ static int llu_lov_setstripe_ea_info(struct inode *ino, int flags, rc = llu_file_release(ino); out: ino->i_private = lli; + if (!rc) + llu_update_inode(ino, &md); if (lli2) OBD_FREE(lli2, sizeof(struct llu_inode_info)); if (req != NULL) @@ -1965,14 +1910,14 @@ struct inode *llu_iget(struct filesys *fs, struct lustre_md *md) I_RELE(inode); } else { - llu_update_inode(inode, md->body, md->lsm); + llu_update_inode(inode, md); return inode; } } inode = llu_new_inode(fs, &fid); if (inode) - llu_update_inode(inode, md->body, md->lsm); + llu_update_inode(inode, md); return inode; } @@ -2153,8 +2098,6 @@ llu_fsswop_mount(const char *source, sbi->ll_dt_exp = class_conn2export(&dt_conn); sbi->ll_lco.lco_flags = ocd.ocd_connect_flags; - err = obd_register_lock_cancel_cb(sbi->ll_dt_exp, - llu_extent_lock_cancel_cb); if (err) { CERROR("cannot register lock cancel callback: rc = %d\n", err); GOTO(out_dt, err); @@ -2212,6 +2155,8 @@ llu_fsswop_mount(const char *source, goto out_inode; } + cl_sb_init(sbi); + ptlrpc_req_finished(request); CDEBUG(D_SUPER, "LibLustre: %s mounted successfully!\n", source); @@ -2224,8 +2169,6 @@ out_inode: out_request: ptlrpc_req_finished(request); out_lock_cn_cb: - obd_unregister_lock_cancel_cb(sbi->ll_dt_exp, - llu_extent_lock_cancel_cb); out_dt: obd_disconnect(sbi->ll_dt_exp); out_md: diff --git a/lustre/liblustre/tests/sanity.c b/lustre/liblustre/tests/sanity.c index acb2385..070e3c6 100644 --- a/lustre/liblustre/tests/sanity.c +++ b/lustre/liblustre/tests/sanity.c @@ -1102,8 +1102,8 @@ int t52(char *name) close(fd); t_unlink(file); return -1; - } - atime = 
statbuf.st_atime; + } + atime = statbuf.st_atime; } close(fd); t_unlink(file); @@ -1117,26 +1117,26 @@ int t53(char *name) struct utimbuf times; /* struct. buffer for utime() */ struct stat stat_buf; /* struct buffer to hold file info. */ time_t mtime, atime; - + ENTER("mtime/atime should be updated by utime() call"); snprintf(file, MAX_PATH_LENGTH, "%s/test_t53_file", lustre_path); t_echo_create(file, "check mtime/atime update by utime() call"); - + /* Initialize the modification and access time in the times arg */ times.actime = NEW_TIME+10; times.modtime = NEW_TIME; - + /* file modification/access time */ utime(file, ×); - + if (stat(file, &stat_buf) < 0) { printf("stat(2) of %s failed, error:%d %s\n", - file, errno, strerror(errno)); + file, errno, strerror(errno)); } mtime = stat_buf.st_mtime; atime = stat_buf.st_atime; - + if ((mtime == NEW_TIME) && (atime == NEW_TIME + 10)) { t_unlink(file); LEAVE(); @@ -1144,7 +1144,7 @@ int t53(char *name) printf("mod time %ld, expected %ld\n", mtime, (long)NEW_TIME); printf("acc time %ld, expected %ld\n", atime, (long)NEW_TIME + 10); - + t_unlink(file); return (-1); } @@ -1170,7 +1170,7 @@ int t54(char *name) lock.l_whence = 0; lock.l_len = 1; if ((err = t_fcntl(fd, F_SETLKW, &lock)) != 0) { - fprintf(stderr, "fcntl returned: %d (%s)\n", + fprintf(stderr, "fcntl returned: %d (%s)\n", err, strerror(err)); close(fd); t_unlink(file); @@ -1203,7 +1203,7 @@ int t55(char *name) ENTER("setstripe/getstripe"); snprintf(path, MAX_PATH_LENGTH, "%s/test_t55", lustre_path); snprintf(file, MAX_PATH_LENGTH, "%s/test_t55/file_t55", lustre_path); - + buflen = sizeof(struct lov_user_md); buflen += STRIPE_COUNT * sizeof(struct lov_user_ost_data); lum = (struct lov_user_md *)malloc(buflen); @@ -1232,7 +1232,7 @@ int t55(char *name) free(lum); return -1; } - + lum->lmm_magic = LOV_USER_MAGIC; lum->lmm_stripe_count = STRIPE_COUNT; rc = ioctl(fd, LL_IOC_LOV_GETSTRIPE, lum); @@ -1255,7 +1255,7 @@ int t55(char *name) printf("lmm_stripe_count: %u\n", (int)lum->lmm_stripe_count); printf("lmm_stripe_size: %u\n", lum->lmm_stripe_size); printf("lmm_stripe_pattern: %x\n", lum->lmm_pattern); - + for (index = 0; index < lum->lmm_stripe_count; index++) { lo = lum->lmm_objects + index; printf("object %d:\n", index); @@ -1292,7 +1292,7 @@ int t55(char *name) } fd = open(file, O_RDWR, 0644); if (fd < 0) { - printf("failed to open(%s): rc = %d (%s)\n", + printf("failed to open(%s): rc = %d (%s)\n", file, fd, strerror(errno)); t_unlink(file); t_rmdir(path); @@ -1321,7 +1321,7 @@ int t55(char *name) printf("lmm_stripe_count: %u\n", (int)lum->lmm_stripe_count); printf("lmm_stripe_size: %u\n", lum->lmm_stripe_size); printf("lmm_stripe_pattern: %x\n", lum->lmm_pattern); - + for (index = 0; index < lum->lmm_stripe_count; index++) { lo = lum->lmm_objects + index; printf("object %d:\n", index); diff --git a/lustre/llite/Makefile.in b/lustre/llite/Makefile.in index 8d02c85..b06f901 100644 --- a/lustre/llite/Makefile.in +++ b/lustre/llite/Makefile.in @@ -3,6 +3,8 @@ lustre-objs := dcache.o dir.o file.o llite_close.o llite_lib.o llite_nfs.o lustre-objs += llite_fid.o rw.o lproc_llite.o namei.o symlink.o llite_mmap.o lustre-objs += xattr.o remote_perm.o llite_rmtacl.o llite_capa.o lustre-objs += rw26.o super25.o statahead.o +lustre-objs += ../lclient/glimpse.o ../lclient/lcommon_cl.o +lustre-objs += vvp_dev.o vvp_page.o vvp_lock.o vvp_io.o vvp_object.o llite_lloop-objs := lloop.o diff --git a/lustre/llite/autoMakefile.am b/lustre/llite/autoMakefile.am index 2473676..d5d1c10 100644 --- 
a/lustre/llite/autoMakefile.am +++ b/lustre/llite/autoMakefile.am @@ -40,4 +40,5 @@ endif DIST_SOURCES := $(lustre-objs:.o=.c) llite_internal.h rw26.c super25.c DIST_SOURCES += $(llite_lloop-objs:.o=.c) +DIST_SOURCES += vvp_internal.h MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ diff --git a/lustre/llite/dcache.c b/lustre/llite/dcache.c index 2627a45..95e0694 100644 --- a/lustre/llite/dcache.c +++ b/lustre/llite/dcache.c @@ -216,7 +216,7 @@ int ll_drop_dentry(struct dentry *dentry) spin_lock(&dcache_lock); return 1; } - /* disconected dentry can not be find without lookup, because we + /* disconected dentry can not be find without lookup, because we * not need his to unhash or mark invalid. */ if (dentry->d_flags & DCACHE_DISCONNECTED) { unlock_dentry(dentry); @@ -309,7 +309,7 @@ int ll_revalidate_it_finish(struct ptlrpc_request *request, if (!request) RETURN(0); - if (it_disposition(it, DISP_LOOKUP_NEG)) + if (it_disposition(it, DISP_LOOKUP_NEG)) RETURN(-ENOENT); rc = ll_prep_inode(&de->d_inode, request, NULL); @@ -346,7 +346,7 @@ void ll_frob_intent(struct lookup_intent **itp, struct lookup_intent *deft) struct lookup_intent *it = *itp; #ifdef HAVE_VFS_INTENT_PATCHES if (it) { - LASSERTF(it->it_magic == INTENT_MAGIC, + LASSERTF(it->it_magic == INTENT_MAGIC, "%p has bad intent magic: %x\n", it, it->it_magic); } @@ -505,8 +505,8 @@ revalidate_finish: GOTO(out, rc = 0); } - if ((it->it_op & IT_OPEN) && de->d_inode && - !S_ISREG(de->d_inode->i_mode) && + if ((it->it_op & IT_OPEN) && de->d_inode && + !S_ISREG(de->d_inode->i_mode) && !S_ISDIR(de->d_inode->i_mode)) { ll_release_openhandle(de, it); } diff --git a/lustre/llite/dir.c b/lustre/llite/dir.c index 061f82e..334ecfe 100644 --- a/lustre/llite/dir.c +++ b/lustre/llite/dir.c @@ -419,7 +419,7 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) if (!IS_ERR(page)) { /* - * If page is empty (end of directoryis reached), + * If page is empty (end of directory is reached), * use this value. 
*/ __u64 hash = DIR_END_OFF; @@ -500,16 +500,6 @@ int ll_readdir(struct file *filp, void *cookie, filldir_t filldir) RETURN(rc); } -#define QCTL_COPY(out, in) \ -do { \ - Q_COPY(out, in, qc_cmd); \ - Q_COPY(out, in, qc_type); \ - Q_COPY(out, in, qc_id); \ - Q_COPY(out, in, qc_stat); \ - Q_COPY(out, in, qc_dqinfo); \ - Q_COPY(out, in, qc_dqblk); \ -} while (0) - int ll_send_mgc_param(struct obd_export *mgc, char *string) { struct mgs_send_param *msp; @@ -1011,7 +1001,8 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, struct obd_quotactl *oqctl; int rc, error = 0; - if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + if (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) RETURN(-EPERM); OBD_ALLOC_PTR(oqctl); @@ -1035,7 +1026,8 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, struct if_quotacheck *check; int rc; - if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + if (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) RETURN(-EPERM); OBD_ALLOC_PTR(check); @@ -1063,47 +1055,39 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, OBD_FREE_PTR(check); RETURN(rc); } -#ifdef HAVE_QUOTA_SUPPORT case OBD_IOC_QUOTACTL: { struct if_quotactl *qctl; - struct obd_quotactl *oqctl; - - int cmd, type, id, rc = 0; + int cmd, type, id, valid, rc = 0; OBD_ALLOC_PTR(qctl); if (!qctl) RETURN(-ENOMEM); - OBD_ALLOC_PTR(oqctl); - if (!oqctl) { - OBD_FREE_PTR(qctl); - RETURN(-ENOMEM); - } if (copy_from_user(qctl, (void *)arg, sizeof(*qctl))) GOTO(out_quotactl, rc = -EFAULT); cmd = qctl->qc_cmd; type = qctl->qc_type; id = qctl->qc_id; + valid = qctl->qc_valid; + switch (cmd) { + case LUSTRE_Q_INVALIDATE: + case LUSTRE_Q_FINVALIDATE: case Q_QUOTAON: case Q_QUOTAOFF: case Q_SETQUOTA: case Q_SETINFO: - if (!cfs_capable(CFS_CAP_SYS_ADMIN)) + if (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT) GOTO(out_quotactl, rc = -EPERM); break; case Q_GETQUOTA: if (((type == USRQUOTA && current->euid != id) || (type == GRPQUOTA && !in_egroup_p(id))) && - !cfs_capable(CFS_CAP_SYS_ADMIN)) + (!cfs_capable(CFS_CAP_SYS_ADMIN) || + sbi->ll_flags & LL_SBI_RMT_CLIENT)) GOTO(out_quotactl, rc = -EPERM); - - /* XXX: dqb_valid is borrowed as a flag to mark that - * only mds quota is wanted */ - if (qctl->qc_dqblk.dqb_valid) - qctl->obd_uuid = sbi->ll_md_exp->exp_obd-> - u.cli.cl_target_uuid; break; case Q_GETINFO: break; @@ -1112,69 +1096,76 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, GOTO(out_quotactl, rc = -ENOTTY); } - QCTL_COPY(oqctl, qctl); - - if (qctl->obd_uuid.uuid[0]) { - struct obd_device *obd; - struct obd_uuid *uuid = &qctl->obd_uuid; - - obd = class_find_client_notype(uuid, - &sbi->ll_dt_exp->exp_obd->obd_uuid); - if (!obd) - GOTO(out_quotactl, rc = -ENOENT); + if (valid != QC_GENERAL) { + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + GOTO(out_quotactl, rc = -EOPNOTSUPP); if (cmd == Q_GETINFO) - oqctl->qc_cmd = Q_GETOINFO; + qctl->qc_cmd = Q_GETOINFO; else if (cmd == Q_GETQUOTA) - oqctl->qc_cmd = Q_GETOQUOTA; + qctl->qc_cmd = Q_GETOQUOTA; else GOTO(out_quotactl, rc = -EINVAL); - if (sbi->ll_md_exp->exp_obd == obd) { - rc = obd_quotactl(sbi->ll_md_exp, oqctl); - } else { - int i; - struct obd_export *exp; - struct lov_obd *lov = &sbi->ll_dt_exp-> - exp_obd->u.lov; - - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i] || - !lov->lov_tgts[i]->ltd_active) - continue; - exp = lov->lov_tgts[i]->ltd_exp; - if (exp->exp_obd == obd) { - rc = obd_quotactl(exp, oqctl); - break; - } - } + switch (valid) { + case 
QC_MDTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_OSTIDX: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, NULL); + break; + case QC_UUID: + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_md_exp, + sizeof(*qctl), qctl, NULL); + if (rc == -EAGAIN) + rc = obd_iocontrol(OBD_IOC_QUOTACTL, + sbi->ll_dt_exp, + sizeof(*qctl), qctl, + NULL); + break; + default: + rc = -EINVAL; + break; } - oqctl->qc_cmd = cmd; - QCTL_COPY(qctl, oqctl); - - if (copy_to_user((void *)arg, qctl, sizeof(*qctl))) - rc = -EFAULT; - - GOTO(out_quotactl, rc); - } - - rc = obd_quotactl(sbi->ll_md_exp, oqctl); - if (rc && rc != -EBUSY && cmd == Q_QUOTAON) { - oqctl->qc_cmd = Q_QUOTAOFF; - obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) + GOTO(out_quotactl, rc); + else + qctl->qc_cmd = cmd; + } else { + struct obd_quotactl *oqctl; + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + GOTO(out_quotactl, rc = -ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(sbi->ll_md_exp, oqctl); + if (rc) { + if (rc != -EBUSY && cmd == Q_QUOTAON) { + oqctl->qc_cmd = Q_QUOTAOFF; + obd_quotactl(sbi->ll_md_exp, oqctl); + } + OBD_FREE_PTR(oqctl); + GOTO(out_quotactl, rc); + } else { + QCTL_COPY(qctl, oqctl); + OBD_FREE_PTR(oqctl); + } } - QCTL_COPY(qctl, oqctl); - if (copy_to_user((void *)arg, qctl, sizeof(*qctl))) rc = -EFAULT; + out_quotactl: OBD_FREE_PTR(qctl); - OBD_FREE_PTR(oqctl); RETURN(rc); } -#endif /* HAVE_QUOTA_SUPPORT */ case OBD_IOC_GETNAME: { struct obd_device *obd = class_exp2obd(sbi->ll_dt_exp); if (!obd) @@ -1202,6 +1193,27 @@ static int ll_dir_ioctl(struct inode *inode, struct file *file, RETURN(0); } #endif + case LL_IOC_GETOBDCOUNT: { + int count; + + if (copy_from_user(&count, (int *)arg, sizeof(int))) + RETURN(-EFAULT); + + if (!count) { + /* get ost count */ + struct lov_obd *lov = &sbi->ll_dt_exp->exp_obd->u.lov; + count = lov->desc.ld_tgt_count; + } else { + /* get mdt count */ + struct lmv_obd *lmv = &sbi->ll_md_exp->exp_obd->u.lmv; + count = lmv->desc.ld_tgt_count; + } + + if (copy_to_user((int *)arg, &count, sizeof(int))) + RETURN(-EFAULT); + + RETURN(0); + } default: RETURN(obd_iocontrol(cmd, sbi->ll_dt_exp,0,NULL,(void *)arg)); } diff --git a/lustre/llite/file.c b/lustre/llite/file.c index e8978f3..9850774 100644 --- a/lustre/llite/file.c +++ b/lustre/llite/file.c @@ -49,7 +49,8 @@ #include "llite_internal.h" #include -/* also used by llite/special.c:ll_special_open() */ +#include "cl_object.h" + struct ll_file_data *ll_file_data_get(void) { struct ll_file_data *fd; @@ -237,10 +238,12 @@ int ll_md_close(struct obd_export *md_exp, struct inode *inode, /* clear group lock, if present */ if (unlikely(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { +#if 0 /* XXX */ struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK); rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh); +#endif } /* Let's see if we have good enough OPEN lock on the file and if @@ -328,7 +331,7 @@ int ll_file_release(struct inode *inode, struct file *file) * Different processes can open the same dir, "ll_opendir_key" means: * it is me that should stop the statahead thread. 
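The OBD_IOC_QUOTACTL rework above replaces the old uuid-based target walk with a dispatch on qctl->qc_valid: QC_GENERAL requests still go through obd_quotactl() on the MD export, while per-target queries are routed as ioctls to the MD or DT export, with QC_UUID trying the MDT first and falling back to the OSTs on -EAGAIN. A minimal sketch of that control flow, using the names from the hunk; the helper qctl_dispatch and its reduced signature are illustrative, not part of the tree:

    static int qctl_dispatch(struct ll_sb_info *sbi, struct if_quotactl *qctl)
    {
            switch (qctl->qc_valid) {
            case QC_MDTIDX:         /* a specific MDT: ask the MD export */
                    return obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
                                         sizeof(*qctl), qctl, NULL);
            case QC_OSTIDX:         /* a specific OST: ask the DT export */
                    return obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
                                         sizeof(*qctl), qctl, NULL);
            case QC_UUID: {         /* target known only by uuid: MDT, then OSTs */
                    int rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_md_exp,
                                           sizeof(*qctl), qctl, NULL);
                    if (rc == -EAGAIN)
                            rc = obd_iocontrol(OBD_IOC_QUOTACTL, sbi->ll_dt_exp,
                                               sizeof(*qctl), qctl, NULL);
                    return rc;
            }
            default:
                    return -EINVAL;
            }
    }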
*/ if (lli->lli_opendir_key == fd && lli->lli_opendir_pid != 0) - ll_stop_statahead(inode, fd); + ll_stop_statahead(inode, lli->lli_opendir_key); if (inode->i_sb->s_root == file->f_dentry) { LUSTRE_FPRIVATE(file) = NULL; @@ -506,6 +509,7 @@ int ll_file_open(struct inode *inode, struct file *file) if (fd == NULL) RETURN(-ENOMEM); + fd->fd_file = file; if (S_ISDIR(inode->i_mode)) { again: spin_lock(&lli->lli_lock); @@ -686,7 +690,7 @@ out_och_free: up(&lli->lli_och_sem); out_openerr: if (opendir_set != 0) - ll_stop_statahead(inode, fd); + ll_stop_statahead(inode, lli->lli_opendir_key); } return rc; @@ -743,281 +747,7 @@ int ll_inode_getattr(struct inode *inode, struct obdo *obdo) RETURN(0); } -static int ll_lock_to_stripe_offset(struct inode *inode, struct ldlm_lock *lock) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_export *exp = ll_i2dtexp(inode); - struct { - char name[16]; - struct ldlm_lock *lock; - } key = { .name = KEY_LOCK_TO_STRIPE, .lock = lock }; - __u32 stripe, vallen = sizeof(stripe); - struct lov_oinfo *loinfo; - int rc; - ENTRY; - - if (lsm->lsm_stripe_count == 1) - GOTO(check, stripe = 0); - - /* get our offset in the lov */ - rc = obd_get_info(exp, sizeof(key), &key, &vallen, &stripe, lsm); - if (rc != 0) { - CERROR("obd_get_info: rc = %d\n", rc); - RETURN(rc); - } - LASSERT(stripe < lsm->lsm_stripe_count); - -check: - loinfo = lsm->lsm_oinfo[stripe]; - if (!osc_res_name_eq(loinfo->loi_id, loinfo->loi_gr, - &lock->l_resource->lr_name)){ - LDLM_ERROR(lock, "resource doesn't match object "LPU64"/"LPU64, - loinfo->loi_id, loinfo->loi_gr); - RETURN(-ELDLM_NO_LOCK_DATA); - } - - RETURN(stripe); -} - -/* Get extra page reference to ensure it is not going away */ -void ll_pin_extent_cb(void *data) -{ - struct page *page = data; - - page_cache_get(page); - - return; -} - -/* Flush the page from page cache for an extent as its canceled. - * Page to remove is delivered as @data. - * - * No one can dirty the extent until we've finished our work and they cannot - * enqueue another lock. The DLM protects us from ll_file_read/write here, - * but other kernel actors could have pages locked. - * - * If @discard is set, there is no need to write the page if it is dirty. - * - * Called with the DLM lock held. 
*/ -int ll_page_removal_cb(void *data, int discard) -{ - int rc; - struct page *page = data; - struct address_space *mapping; - - ENTRY; - - /* We have page reference already from ll_pin_page */ - lock_page(page); - - /* Already truncated by somebody */ - if (!page->mapping) - GOTO(out, rc = 0); - mapping = page->mapping; - - ll_teardown_mmaps(mapping, - (__u64)page->index << PAGE_CACHE_SHIFT, - ((__u64)page->index<mapping); - rc = ll_call_writepage(page->mapping->host, page); - /* either waiting for io to complete or reacquiring - * the lock that the failed writepage released */ - lock_page(page); - wait_on_page_writeback(page); - if (rc != 0) { - CERROR("writepage inode %lu(%p) of page %p " - "failed: %d\n", mapping->host->i_ino, - mapping->host, page, rc); - if (rc == -ENOSPC) - set_bit(AS_ENOSPC, &mapping->flags); - else - set_bit(AS_EIO, &mapping->flags); - } - set_bit(AS_EIO, &mapping->flags); - } - if (page->mapping != NULL) { - struct ll_async_page *llap = llap_cast_private(page); - /* checking again to account for writeback's lock_page() */ - LL_CDEBUG_PAGE(D_PAGE, page, "truncating\n"); - if (llap) - ll_ra_accounting(llap, page->mapping); - ll_truncate_complete_page(page); - } - EXIT; -out: - LASSERT(!PageWriteback(page)); - unlock_page(page); - page_cache_release(page); - - return 0; -} - -int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, int flag) -{ - struct inode *inode; - struct ll_inode_info *lli; - struct lov_stripe_md *lsm; - int stripe; - __u64 kms; - - ENTRY; - - if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) { - LDLM_ERROR(lock, "cancelling lock with bad data %p", data); - LBUG(); - } - - inode = ll_inode_from_lock(lock); - if (inode == NULL) - RETURN(0); - lli = ll_i2info(inode); - if (lli == NULL) - GOTO(iput, 0); - if (lli->lli_smd == NULL) - GOTO(iput, 0); - lsm = lli->lli_smd; - - stripe = ll_lock_to_stripe_offset(inode, lock); - if (stripe < 0) - GOTO(iput, 0); - - lov_stripe_lock(lsm); - lock_res_and_lock(lock); - kms = ldlm_extent_shift_kms(lock, - lsm->lsm_oinfo[stripe]->loi_kms); - - if (lsm->lsm_oinfo[stripe]->loi_kms != kms) - LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64, - lsm->lsm_oinfo[stripe]->loi_kms, kms); - lsm->lsm_oinfo[stripe]->loi_kms = kms; - unlock_res_and_lock(lock); - lov_stripe_unlock(lsm); - ll_queue_done_writing(inode, 0); - EXIT; -iput: - iput(inode); - - return 0; -} - -#if 0 -int ll_async_completion_ast(struct ldlm_lock *lock, int flags, void *data) -{ - /* XXX ALLOCATE - 160 bytes */ - struct inode *inode = ll_inode_from_lock(lock); - struct ll_inode_info *lli = ll_i2info(inode); - struct lustre_handle lockh = { 0 }; - struct ost_lvb *lvb; - int stripe; - ENTRY; - - if (flags & (LDLM_FL_BLOCK_WAIT | LDLM_FL_BLOCK_GRANTED | - LDLM_FL_BLOCK_CONV)) { - LBUG(); /* not expecting any blocked async locks yet */ - LDLM_DEBUG(lock, "client-side async enqueue returned a blocked " - "lock, returning"); - ldlm_lock_dump(D_OTHER, lock, 0); - ldlm_reprocess_all(lock->l_resource); - RETURN(0); - } - - LDLM_DEBUG(lock, "client-side async enqueue: granted/glimpsed"); - - stripe = ll_lock_to_stripe_offset(inode, lock); - if (stripe < 0) - goto iput; - - if (lock->l_lvb_len) { - struct lov_stripe_md *lsm = lli->lli_smd; - __u64 kms; - lvb = lock->l_lvb_data; - lsm->lsm_oinfo[stripe].loi_rss = lvb->lvb_size; - - lock_res_and_lock(lock); - ll_inode_size_lock(inode, 1); - kms = MAX(lsm->lsm_oinfo[stripe].loi_kms, lvb->lvb_size); - kms = ldlm_extent_shift_kms(NULL, kms); - if 
(lsm->lsm_oinfo[stripe].loi_kms != kms) - LDLM_DEBUG(lock, "updating kms from "LPU64" to "LPU64, - lsm->lsm_oinfo[stripe].loi_kms, kms); - lsm->lsm_oinfo[stripe].loi_kms = kms; - ll_inode_size_unlock(inode, 1); - unlock_res_and_lock(lock); - } - -iput: - iput(inode); - wake_up(&lock->l_waitq); - - ldlm_lock2handle(lock, &lockh); - ldlm_lock_decref(&lockh, LCK_PR); - RETURN(0); -} -#endif - -static int ll_glimpse_callback(struct ldlm_lock *lock, void *reqp) -{ - struct ptlrpc_request *req = reqp; - struct inode *inode = ll_inode_from_lock(lock); - struct ll_inode_info *lli; - struct lov_stripe_md *lsm; - struct ost_lvb *lvb; - int rc, stripe; - ENTRY; - - if (inode == NULL) - GOTO(out, rc = -ELDLM_NO_LOCK_DATA); - lli = ll_i2info(inode); - if (lli == NULL) - GOTO(iput, rc = -ELDLM_NO_LOCK_DATA); - lsm = lli->lli_smd; - if (lsm == NULL) - GOTO(iput, rc = -ELDLM_NO_LOCK_DATA); - - /* First, find out which stripe index this lock corresponds to. */ - stripe = ll_lock_to_stripe_offset(inode, lock); - if (stripe < 0) - GOTO(iput, rc = -ELDLM_NO_LOCK_DATA); - - req_capsule_extend(&req->rq_pill, &RQF_LDLM_GL_CALLBACK); - req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, - sizeof(*lvb)); - rc = req_capsule_server_pack(&req->rq_pill); - if (rc) { - CERROR("lustre_pack_reply: %d\n", rc); - GOTO(iput, rc); - } - - lvb = req_capsule_server_get(&req->rq_pill, &RMF_DLM_LVB); - lvb->lvb_size = lli->lli_smd->lsm_oinfo[stripe]->loi_kms; - lvb->lvb_mtime = LTIME_S(inode->i_mtime); - lvb->lvb_atime = LTIME_S(inode->i_atime); - lvb->lvb_ctime = LTIME_S(inode->i_ctime); - - LDLM_DEBUG(lock, "i_size: %llu -> stripe number %u -> kms "LPU64 - " atime "LPU64", mtime "LPU64", ctime "LPU64, - i_size_read(inode), stripe, lvb->lvb_size, lvb->lvb_mtime, - lvb->lvb_atime, lvb->lvb_ctime); - iput: - iput(inode); - - out: - /* These errors are normal races, so we don't want to fill the console - * with messages by calling ptlrpc_error() */ - if (rc == -ELDLM_NO_LOCK_DATA) - lustre_pack_reply(req, 1, NULL, NULL); - - req->rq_status = rc; - return rc; -} - -static int ll_merge_lvb(struct inode *inode) +int ll_merge_lvb(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); struct ll_sb_info *sbi = ll_i2sbi(inode); @@ -1040,824 +770,324 @@ static int ll_merge_lvb(struct inode *inode) RETURN(rc); } -int ll_local_size(struct inode *inode) -{ - ldlm_policy_data_t policy = { .l_extent = { 0, OBD_OBJECT_EOF } }; - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lustre_handle lockh = { 0 }; - int flags = 0; - int rc; - ENTRY; - - if (lli->lli_smd->lsm_stripe_count == 0) - RETURN(0); - - rc = obd_match(sbi->ll_dt_exp, lli->lli_smd, LDLM_EXTENT, - &policy, LCK_PR, &flags, inode, &lockh); - if (rc < 0) - RETURN(rc); - else if (rc == 0) - RETURN(-ENODATA); - - rc = ll_merge_lvb(inode); - obd_cancel(sbi->ll_dt_exp, lli->lli_smd, LCK_PR, &lockh); - RETURN(rc); -} - int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, lstat_t *st) { - struct lustre_handle lockh = { 0 }; - struct ldlm_enqueue_info einfo = { 0 }; - struct obd_info oinfo = { { { 0 } } }; - struct ost_lvb lvb; - int rc; - - ENTRY; - - einfo.ei_type = LDLM_EXTENT; - einfo.ei_mode = LCK_PR; - einfo.ei_cb_bl = osc_extent_blocking_cb; - einfo.ei_cb_cp = ldlm_completion_ast; - einfo.ei_cb_gl = ll_glimpse_callback; - einfo.ei_cbdata = NULL; - - oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; - oinfo.oi_lockh = &lockh; - oinfo.oi_md = lsm; - oinfo.oi_flags = LDLM_FL_HAS_INTENT; - - rc = 
obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo); - if (rc == -ENOENT) - RETURN(rc); - if (rc != 0) { - CERROR("obd_enqueue returned rc %d, " - "returning -EIO\n", rc); - RETURN(rc > 0 ? -EIO : rc); - } - - lov_stripe_lock(lsm); - memset(&lvb, 0, sizeof(lvb)); - obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 0); - st->st_size = lvb.lvb_size; - st->st_blocks = lvb.lvb_blocks; - st->st_mtime = lvb.lvb_mtime; - st->st_atime = lvb.lvb_atime; - st->st_ctime = lvb.lvb_ctime; - lov_stripe_unlock(lsm); - - RETURN(rc); + /* XXX */ + return -ENOSYS; } -/* NB: obd_merge_lvb will prefer locally cached writes if they extend the - * file (because it prefers KMS over RSS when larger) */ -int ll_glimpse_size(struct inode *inode, int ast_flags) +void ll_io_init(struct cl_io *io, const struct file *file, int write) { - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lustre_handle lockh = { 0 }; - struct ldlm_enqueue_info einfo = { 0 }; - struct obd_info oinfo = { { { 0 } } }; - int rc; + struct inode *inode = file->f_dentry->d_inode; + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + + LASSERT(fd != NULL); + memset(io, 0, sizeof *io); + io->u.ci_rw.crw_nonblock = file->f_flags & O_NONBLOCK; + if (write) + io->u.ci_wr.wr_append = file->f_flags & O_APPEND; + io->ci_obj = ll_i2info(inode)->lli_clob; + io->ci_lockreq = CILR_MAYBE; + if (fd->fd_flags & LL_FILE_IGNORE_LOCK || sbi->ll_flags & LL_SBI_NOLCK) + io->ci_lockreq = CILR_NEVER; + else if (file->f_flags & O_APPEND) + io->ci_lockreq = CILR_MANDATORY; +} + +static ssize_t ll_file_io_generic(const struct lu_env *env, + struct ccc_io_args *args, struct file *file, + enum cl_io_type iot, loff_t *ppos, size_t count) +{ + struct cl_io *io; + ssize_t result; ENTRY; - if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) - RETURN(0); + io = &ccc_env_info(env)->cti_io; + ll_io_init(io, file, iot == CIT_WRITE); - CDEBUG(D_DLMTRACE, "Glimpsing inode %lu\n", inode->i_ino); + if (iot == CIT_READ) + io->u.ci_rd.rd_is_sendfile = args->cia_is_sendfile; - if (!lli->lli_smd) { - CDEBUG(D_DLMTRACE, "No objects for inode %lu\n", inode->i_ino); - RETURN(0); - } - - /* NOTE: this looks like DLM lock request, but it may not be one. Due - * to LDLM_FL_HAS_INTENT flag, this is glimpse request, that - * won't revoke any conflicting DLM locks held. Instead, - * ll_glimpse_callback() will be called on each client - * holding a DLM lock against this file, and resulting size - * will be returned for each stripe. DLM lock on [0, EOF] is - * acquired only if there were no conflicting locks. */ - einfo.ei_type = LDLM_EXTENT; - einfo.ei_mode = LCK_PR; - einfo.ei_cb_bl = osc_extent_blocking_cb; - einfo.ei_cb_cp = ldlm_completion_ast; - einfo.ei_cb_gl = ll_glimpse_callback; - einfo.ei_cbdata = inode; - - oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; - oinfo.oi_lockh = &lockh; - oinfo.oi_md = lli->lli_smd; - oinfo.oi_flags = ast_flags | LDLM_FL_HAS_INTENT; - - rc = obd_enqueue_rqset(sbi->ll_dt_exp, &oinfo, &einfo); - if (rc == -ENOENT) - RETURN(rc); - if (rc != 0) { - CERROR("obd_enqueue returned rc %d, returning -EIO\n", rc); - RETURN(rc > 0 ? 
-EIO : rc); + if (cl_io_rw_init(env, io, iot, *ppos, count) == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + if (cl_io_is_sendfile(io)) { + vio->u.read.cui_actor = args->cia_actor; + vio->u.read.cui_target = args->cia_target; + } else { + cio->cui_iov = args->cia_iov; + cio->cui_nrsegs = args->cia_nrsegs; +#ifndef HAVE_FILE_WRITEV + cio->cui_iocb = args->cia_iocb; +#endif + } + cio->cui_fd = LUSTRE_FPRIVATE(file); + result = cl_io_loop(env, io); + } else + /* cl_io_rw_init() handled IO */ + result = io->ci_result; + if (io->ci_nob > 0) { + result = io->ci_nob; + *ppos = io->u.ci_wr.wr.crw_pos; } - - rc = ll_merge_lvb(inode); - - CDEBUG(D_DLMTRACE, "glimpse: size: %llu, blocks: %llu\n", - i_size_read(inode), (unsigned long long)inode->i_blocks); - - RETURN(rc); + cl_io_fini(env, io); + RETURN(result); } -int ll_extent_lock(struct ll_file_data *fd, struct inode *inode, - struct lov_stripe_md *lsm, int mode, - ldlm_policy_data_t *policy, struct lustre_handle *lockh, - int ast_flags) -{ - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ost_lvb lvb; - struct ldlm_enqueue_info einfo = { 0 }; - struct obd_info oinfo = { { { 0 } } }; - int rc; - ENTRY; - - LASSERT(!lustre_handle_is_used(lockh)); - LASSERT(lsm != NULL); - - /* XXX phil: can we do this? won't it screw the file size up? */ - if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || - (sbi->ll_flags & LL_SBI_NOLCK)) - RETURN(0); - - CDEBUG(D_DLMTRACE, "Locking inode %lu, start "LPU64" end "LPU64"\n", - inode->i_ino, policy->l_extent.start, policy->l_extent.end); - einfo.ei_type = LDLM_EXTENT; - einfo.ei_mode = mode; - einfo.ei_cb_bl = osc_extent_blocking_cb; - einfo.ei_cb_cp = ldlm_completion_ast; - einfo.ei_cb_gl = ll_glimpse_callback; - einfo.ei_cbdata = inode; - - oinfo.oi_policy = *policy; - oinfo.oi_lockh = lockh; - oinfo.oi_md = lsm; - oinfo.oi_flags = ast_flags; +/* + * XXX: exact copy from kernel code (__generic_file_aio_write_nolock) + */ +static int ll_file_get_iov_count(const struct iovec *iov, + unsigned long *nr_segs, size_t *count) +{ + size_t cnt = 0; + unsigned long seg; - rc = obd_enqueue(sbi->ll_dt_exp, &oinfo, &einfo, NULL); - *policy = oinfo.oi_policy; - if (rc > 0) - rc = -EIO; + for (seg = 0; seg < *nr_segs; seg++) { + const struct iovec *iv = &iov[seg]; - ll_inode_size_lock(inode, 1); - inode_init_lvb(inode, &lvb); - obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1); - - if (policy->l_extent.start == 0 && - policy->l_extent.end == OBD_OBJECT_EOF) { - /* vmtruncate()->ll_truncate() first sets the i_size and then - * the kms under both a DLM lock and the - * ll_inode_size_lock(). If we don't get the - * ll_inode_size_lock() here we can match the DLM lock and - * reset i_size from the kms before the truncating path has - * updated the kms. generic_file_write can then trust the - * stale i_size when doing appending writes and effectively - * cancel the result of the truncate. Getting the - * ll_inode_size_lock() after the enqueue maintains the DLM - * -> ll_inode_size_lock() acquiring order. */ - i_size_write(inode, lvb.lvb_size); - CDEBUG(D_INODE, "inode=%lu, updating i_size %llu\n", - inode->i_ino, i_size_read(inode)); - } - - if (rc == 0) { - LTIME_S(inode->i_mtime) = lvb.lvb_mtime; - LTIME_S(inode->i_atime) = lvb.lvb_atime; - LTIME_S(inode->i_ctime) = lvb.lvb_ctime; + /* + * If any segment has a negative length, or the cumulative + * length ever wraps negative then return -EINVAL. 
+ */ + cnt += iv->iov_len; + if (unlikely((ssize_t)(cnt|iv->iov_len) < 0)) + return -EINVAL; + if (access_ok(VERIFY_READ, iv->iov_base, iv->iov_len)) + continue; + if (seg == 0) + return -EFAULT; + *nr_segs = seg; + cnt -= iv->iov_len; /* This segment is no good */ + break; } - ll_inode_size_unlock(inode, 1); - - RETURN(rc); + *count = cnt; + return 0; } -int ll_extent_unlock(struct ll_file_data *fd, struct inode *inode, - struct lov_stripe_md *lsm, int mode, - struct lustre_handle *lockh) +#ifdef HAVE_FILE_READV +static ssize_t ll_file_readv(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc; + struct lu_env *env; + struct ccc_io_args *args; + size_t count; + ssize_t result; + int refcheck; ENTRY; - /* XXX phil: can we do this? won't it screw the file size up? */ - if ((fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) || - (sbi->ll_flags & LL_SBI_NOLCK)) - RETURN(0); - - rc = obd_cancel(sbi->ll_dt_exp, lsm, mode, lockh); + result = ll_file_get_iov_count(iov, &nr_segs, &count); + if (result) + RETURN(result); - RETURN(rc); -} - -static void ll_set_file_contended(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); - cfs_time_t now = cfs_time_current(); - - spin_lock(&lli->lli_lock); - lli->lli_contention_time = now; - lli->lli_flags |= LLIF_CONTENDED; - spin_unlock(&lli->lli_lock); -} - -void ll_clear_file_contended(struct inode *inode) -{ - struct ll_inode_info *lli = ll_i2info(inode); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - spin_lock(&lli->lli_lock); - lli->lli_flags &= ~LLIF_CONTENDED; - spin_unlock(&lli->lli_lock); + args = &vvp_env_info(env)->vti_args; + args->cia_is_sendfile = 0; + args->cia_iov = (struct iovec *)iov; + args->cia_nrsegs = nr_segs; + result = ll_file_io_generic(env, args, file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + RETURN(result); } -static int ll_is_file_contended(struct file *file) +static ssize_t ll_file_read(struct file *file, char *buf, size_t count, + loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); + struct lu_env *env; + struct iovec *local_iov; + ssize_t result; + int refcheck; ENTRY; - if (!(sbi->ll_lco.lco_flags & OBD_CONNECT_SRVLOCK)) { - CDEBUG(D_INFO, "the server does not support SRVLOCK feature," - " osc connect flags = 0x"LPX64"\n", - sbi->ll_lco.lco_flags); - RETURN(0); - } - if (fd && (fd->fd_flags & LL_FILE_IGNORE_LOCK)) - RETURN(1); - if (lli->lli_flags & LLIF_CONTENDED) { - cfs_time_t cur_time = cfs_time_current(); - cfs_time_t retry_time; - - retry_time = cfs_time_add( - lli->lli_contention_time, - cfs_time_seconds(sbi->ll_contention_time)); - if (cfs_time_after(cur_time, retry_time)) { - ll_clear_file_contended(inode); - RETURN(0); - } - RETURN(1); - } - RETURN(0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + local_iov = &vvp_env_info(env)->vti_local_iov; + local_iov->iov_base = (void __user *)buf; + local_iov->iov_len = count; + result = ll_file_readv(file, local_iov, 1, ppos); + cl_env_put(env, &refcheck); + RETURN(result); } -static int ll_file_get_tree_lock(struct ll_lock_tree *tree, struct file *file, - const char *buf, size_t count, - loff_t start, loff_t end, int rw) -{ - int append; - int tree_locked = 0; - int rc; - struct inode * inode = file->f_dentry->d_inode; +#else +static ssize_t 
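The (ssize_t)(cnt|iv->iov_len) < 0 test above is the kernel's compact overflow guard: OR-ing the running total with the current segment length leaves the sign bit set if either value exceeds SSIZE_MAX when reinterpreted as signed, so a single comparison rejects both an oversized segment and a cumulative wrap. An equivalent, more verbose form (illustrative only):

    /* what the one-liner in ll_file_get_iov_count() checks, spelled out */
    if ((ssize_t)iv->iov_len < 0)       /* this segment alone is too large */
            return -EINVAL;
    if ((ssize_t)cnt < 0)               /* the running total has wrapped */
            return -EINVAL;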
ll_file_aio_read(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) +{ + struct lu_env *env; + struct ccc_io_args *args; + size_t count; + ssize_t result; + int refcheck; ENTRY; - append = (rw == OBD_BRW_WRITE) && (file->f_flags & O_APPEND); + result = ll_file_get_iov_count(iov, &nr_segs, &count); + if (result) + RETURN(result); - if (append || !ll_is_file_contended(file)) { - struct ll_lock_tree_node *node; - int ast_flags; + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - ast_flags = append ? 0 : LDLM_FL_DENY_ON_CONTENTION; - if (file->f_flags & O_NONBLOCK) - ast_flags |= LDLM_FL_BLOCK_NOWAIT; - node = ll_node_from_inode(inode, start, end, - (rw == OBD_BRW_WRITE) ? LCK_PW : LCK_PR); - if (IS_ERR(node)) { - rc = PTR_ERR(node); - GOTO(out, rc); - } - tree->lt_fd = LUSTRE_FPRIVATE(file); - rc = ll_tree_lock(tree, node, buf, count, ast_flags); - if (rc == 0) - tree_locked = 1; - else if (rc == -EUSERS) - ll_set_file_contended(inode); - else - GOTO(out, rc); - } - RETURN(tree_locked); -out: - return rc; + args = &vvp_env_info(env)->vti_args; + args->cia_is_sendfile = 0; + args->cia_iov = (struct iovec *)iov; + args->cia_nrsegs = nr_segs; + args->cia_iocb = iocb; + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_READ, + &iocb->ki_pos, count); + cl_env_put(env, &refcheck); + RETURN(result); } -/** - * Checks if requested extent lock is compatible with a lock under a page. - * - * Checks if the lock under \a page is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. - * - * \param page the page under which lock is considered - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param start start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * - * \post result == 1, *cookie == context, appropriate lock is referenced or - * \post result == 0 - * - * \retval 1 owned lock is reused for the request - * \retval 0 no lock reused for the request - * - * \see ll_release_short_lock - */ -static int ll_reget_short_lock(struct page *page, int rw, - obd_off start, obd_off end, - void **cookie) +static ssize_t ll_file_read(struct file *file, char *buf, size_t count, + loff_t *ppos) { - struct ll_async_page *llap; - struct obd_export *exp; - struct inode *inode = page->mapping->host; - + struct lu_env *env; + struct iovec *local_iov; + struct kiocb *kiocb; + ssize_t result; + int refcheck; ENTRY; - exp = ll_i2dtexp(inode); - if (exp == NULL) - RETURN(0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - llap = llap_cast_private(page); - if (llap == NULL) - RETURN(0); + local_iov = &vvp_env_info(env)->vti_local_iov; + kiocb = &vvp_env_info(env)->vti_kiocb; + local_iov->iov_base = (void __user *)buf; + local_iov->iov_len = count; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; + kiocb->ki_left = count; - RETURN(obd_reget_short_lock(exp, ll_i2info(inode)->lli_smd, - &llap->llap_cookie, rw, start, end, - cookie)); -} + result = ll_file_aio_read(kiocb, local_iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; -/** - * Releases a reference to a lock taken in a "fast" way. - * - * Releases a read or a write (specified by \a rw) lock - * referenced by \a cookie. 
- * - * \param inode inode to which data belong - * \param end end of the locked extent - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * - * \post appropriate lock is dereferenced - * - * \see ll_reget_short_lock - */ -static void ll_release_short_lock(struct inode *inode, obd_off end, - void *cookie, int rw) -{ - struct obd_export *exp; - int rc; - - exp = ll_i2dtexp(inode); - if (exp == NULL) - return; - - rc = obd_release_short_lock(exp, ll_i2info(inode)->lli_smd, end, - cookie, rw); - if (rc < 0) - CERROR("unlock failed (%d)\n", rc); + cl_env_put(env, &refcheck); + RETURN(result); } +#endif -/** - * Checks if requested extent lock is compatible - * with a lock under a page in page cache. - * - * Checks if a lock under some \a page is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. - * - * \param file the file under which lock is considered - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param ppos start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * \param buf userspace buffer for the data - * - * \post result == 1, *cookie == context, appropriate lock is referenced - * \post retuls == 0 - * - * \retval 1 owned lock is reused for the request - * \retval 0 no lock reused for the request - * - * \see ll_file_put_fast_lock +/* + * Write to a file (through the page cache). */ -static inline int ll_file_get_fast_lock(struct file *file, - obd_off ppos, obd_off end, - char *buf, void **cookie, int rw) -{ - int rc = 0; - struct page *page; - +#ifdef HAVE_FILE_WRITEV +static ssize_t ll_file_writev(struct file *file, const struct iovec *iov, + unsigned long nr_segs, loff_t *ppos) +{ + struct lu_env *env; + struct ccc_io_args *args; + size_t count; + ssize_t result; + int refcheck; ENTRY; - if (!ll_region_mapped((unsigned long)buf, end - ppos)) { - page = find_lock_page(file->f_dentry->d_inode->i_mapping, - ppos >> CFS_PAGE_SHIFT); - if (page) { - if (ll_reget_short_lock(page, rw, ppos, end, cookie)) - rc = 1; + result = ll_file_get_iov_count(iov, &nr_segs, &count); + if (result) + RETURN(result); - unlock_page(page); - page_cache_release(page); - } - } + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - RETURN(rc); -} - -/** - * Releases a reference to a lock taken in a "fast" way. - * - * Releases a read or a write (specified by \a rw) lock - * referenced by \a cookie. 
- * - * \param inode inode to which data belong - * \param end end of the locked extent - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * - * \post appropriate lock is dereferenced - * - * \see ll_file_get_fast_lock - */ -static inline void ll_file_put_fast_lock(struct inode *inode, obd_off end, - void *cookie, int rw) -{ - ll_release_short_lock(inode, end, cookie, rw); + args = &vvp_env_info(env)->vti_args; + args->cia_iov = (struct iovec *)iov; + args->cia_nrsegs = nr_segs; + result = ll_file_io_generic(env, args, file, CIT_WRITE, ppos, count); + cl_env_put(env, &refcheck); + RETURN(result); } -enum ll_lock_style { - LL_LOCK_STYLE_NOLOCK = 0, - LL_LOCK_STYLE_FASTLOCK = 1, - LL_LOCK_STYLE_TREELOCK = 2 -}; - -/** - * Checks if requested extent lock is compatible with a lock - * under a page cache page. - * - * Checks if the lock under \a page is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. - * - * \param file file under which I/O is processed - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param ppos start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * (only used with LL_LOCK_STYLE_FASTLOCK) - * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK) - * \param buf userspace buffer for the data - * - * \retval LL_LOCK_STYLE_FASTLOCK owned lock is reused through fast lock - * \retval LL_LOCK_STYLE_TREELOCK got a lock through tree lock - * \retval LL_LOCK_STYLE_NOLOCK got no lock - * - * \see ll_file_put_lock - */ -static inline int ll_file_get_lock(struct file *file, obd_off ppos, - obd_off end, char *buf, void **cookie, - struct ll_lock_tree *tree, int rw) +static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, + loff_t *ppos) { - int rc; - + struct lu_env *env; + struct iovec *local_iov; + ssize_t result; + int refcheck; ENTRY; - if (ll_file_get_fast_lock(file, ppos, end, buf, cookie, rw)) - RETURN(LL_LOCK_STYLE_FASTLOCK); - - rc = ll_file_get_tree_lock(tree, file, buf, ppos - end, ppos, end, rw); - /* rc: 1 for tree lock, 0 for no lock, <0 for error */ - switch (rc) { - case 1: - RETURN(LL_LOCK_STYLE_TREELOCK); - case 0: - RETURN(LL_LOCK_STYLE_NOLOCK); - } + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - /* an error happened if we reached this point, rc = -errno here */ - RETURN(rc); -} + local_iov = &vvp_env_info(env)->vti_local_iov; + local_iov->iov_base = (void __user *)buf; + local_iov->iov_len = count; -/** - * Drops the lock taken by ll_file_get_lock. - * - * Releases a read or a write (specified by \a rw) lock - * referenced by \a tree or \a cookie. 
- * - * \param inode inode to which data belong - * \param end end of the locked extent - * \param lockstyle facility through which the lock was taken - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * (only used with LL_LOCK_STYLE_FASTLOCK) - * \param tree lock tree (only used with LL_LOCK_STYLE_TREELOCK) - * - * \post appropriate lock is dereferenced - * - * \see ll_file_get_lock - */ -static inline void ll_file_put_lock(struct inode *inode, obd_off end, - enum ll_lock_style lock_style, - void *cookie, struct ll_lock_tree *tree, - int rw) - -{ - switch (lock_style) { - case LL_LOCK_STYLE_TREELOCK: - ll_tree_unlock(tree); - break; - case LL_LOCK_STYLE_FASTLOCK: - ll_file_put_fast_lock(inode, end, cookie, rw); - break; - default: - CERROR("invalid locking style (%d)\n", lock_style); - } + result = ll_file_writev(file, local_iov, 1, ppos); + cl_env_put(env, &refcheck); + RETURN(result); } -static ssize_t ll_file_read(struct file *file, char *buf, size_t count, - loff_t *ppos) +#else /* AIO stuff */ +static ssize_t ll_file_aio_write(struct kiocb *iocb, const struct iovec *iov, + unsigned long nr_segs, loff_t pos) { - struct inode *inode = file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_lock_tree tree; - struct ost_lvb lvb; - struct ll_ra_read bead; - int ra = 0; - obd_off end; - ssize_t retval, chunk, sum = 0; - int lock_style; - void *cookie; - - __u64 kms; + struct lu_env *env; + struct ccc_io_args *args; + size_t count; + ssize_t result; + int refcheck; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", - inode->i_ino, inode->i_generation, inode, count, *ppos); - /* "If nbyte is 0, read() will return 0 and have no other results." - * -- Single Unix Spec */ - if (count == 0) - RETURN(0); - - ll_stats_ops_tally(sbi, LPROC_LL_READ_BYTES, count); - - if (!lsm) { - /* Read on file with no objects should return zero-filled - * buffers up to file size (we can get non-zero sizes with - * mknod + truncate, then opening file for read. This is a - * common pattern in NFS case, it seems). 
Bug 6243 */ - int notzeroed; - /* Since there are no objects on OSTs, we have nothing to get - * lock on and so we are forced to access inode->i_size - * unguarded */ - - /* Read beyond end of file */ - if (*ppos >= i_size_read(inode)) - RETURN(0); - - if (count > i_size_read(inode) - *ppos) - count = i_size_read(inode) - *ppos; - /* Make sure to correctly adjust the file pos pointer for - * EFAULT case */ - notzeroed = clear_user(buf, count); - count -= notzeroed; - *ppos += count; - if (!count) - RETURN(-EFAULT); - RETURN(count); - } -repeat: - if (sbi->ll_max_rw_chunk != 0) { - /* first, let's know the end of the current stripe */ - end = *ppos; - obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, &end); - - /* correct, the end is beyond the request */ - if (end > *ppos + count - 1) - end = *ppos + count - 1; - - /* and chunk shouldn't be too large even if striping is wide */ - if (end - *ppos > sbi->ll_max_rw_chunk) - end = *ppos + sbi->ll_max_rw_chunk - 1; - } else { - end = *ppos + count - 1; - } - - lock_style = ll_file_get_lock(file, (obd_off)(*ppos), end, - buf, &cookie, &tree, OBD_BRW_READ); - if (lock_style < 0) - GOTO(out, retval = lock_style); - - ll_inode_size_lock(inode, 1); - /* - * Consistency guarantees: following possibilities exist for the - * relation between region being read and real file size at this - * moment: - * - * (A): the region is completely inside of the file; - * - * (B-x): x bytes of region are inside of the file, the rest is - * outside; - * - * (C): the region is completely outside of the file. - * - * This classification is stable under DLM lock acquired by - * ll_tree_lock() above, because to change class, other client has to - * take DLM lock conflicting with our lock. Also, any updates to - * ->i_size by other threads on this client are serialized by - * ll_inode_size_lock(). This guarantees that short reads are handled - * correctly in the face of concurrent writes and truncates. - */ - inode_init_lvb(inode, &lvb); - obd_merge_lvb(sbi->ll_dt_exp, lsm, &lvb, 1); - kms = lvb.lvb_size; - if (*ppos + count - 1 > kms) { - /* A glimpse is necessary to determine whether we return a - * short read (B) or some zeroes at the end of the buffer (C) */ - ll_inode_size_unlock(inode, 1); - retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED); - if (retval) { - if (lock_style != LL_LOCK_STYLE_NOLOCK) - ll_file_put_lock(inode, end, lock_style, - cookie, &tree, OBD_BRW_READ); - goto out; - } - } else { - /* region is within kms and, hence, within real file size (A). - * We need to increase i_size to cover the read region so that - * generic_file_read() will do its job, but that doesn't mean - * the kms size is _correct_, it is only the _minimum_ size. - * If someone does a stat they will get the correct size which - * will always be >= the kms value here. 
b=11081 */ - if (i_size_read(inode) < kms) - i_size_write(inode, kms); - ll_inode_size_unlock(inode, 1); - } - - chunk = end - *ppos + 1; - CDEBUG(D_INODE,"Read ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n", - inode->i_ino, chunk, *ppos, i_size_read(inode)); - - if (lock_style != LL_LOCK_STYLE_NOLOCK) { - /* turn off the kernel's read-ahead */ - file->f_ra.ra_pages = 0; - - /* initialize read-ahead window once per syscall */ - if (ra == 0) { - ra = 1; - bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; - bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; - ll_ra_read_in(file, &bead); - } - /* BUG: 5972 */ - file_accessed(file); - retval = generic_file_read(file, buf, chunk, ppos); - ll_file_put_lock(inode, end, lock_style, cookie, &tree, - OBD_BRW_READ); - } else { - retval = ll_file_lockless_io(file, buf, chunk, ppos, READ); - } + result = ll_file_get_iov_count(iov, &nr_segs, &count); + if (result) + RETURN(result); - ll_rw_stats_tally(sbi, current->pid, file, chunk, 0); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - if (retval > 0) { - buf += retval; - count -= retval; - sum += retval; - if (retval == chunk && count > 0) - goto repeat; - } - - out: - if (ra != 0) - ll_ra_read_ex(file, &bead); - retval = (sum > 0) ? sum : retval; - RETURN(retval); + args = &vvp_env_info(env)->vti_args; + args->cia_iov = (struct iovec *)iov; + args->cia_nrsegs = nr_segs; + args->cia_iocb = iocb; + result = ll_file_io_generic(env, args, iocb->ki_filp, CIT_WRITE, + &iocb->ki_pos, count); + cl_env_put(env, &refcheck); + RETURN(result); } -/* - * Write to a file (through the page cache). - */ static ssize_t ll_file_write(struct file *file, const char *buf, size_t count, loff_t *ppos) { - struct inode *inode = file->f_dentry->d_inode; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct lov_stripe_md *lsm = ll_i2info(inode)->lli_smd; - struct ll_lock_tree tree; - loff_t maxbytes = ll_file_maxbytes(inode); - loff_t lock_start, lock_end, end; - ssize_t retval, chunk, sum = 0; - int tree_locked; + struct lu_env *env; + struct iovec *local_iov; + struct kiocb *kiocb; + ssize_t result; + int refcheck; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", - inode->i_ino, inode->i_generation, inode, count, *ppos); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ + local_iov = &vvp_env_info(env)->vti_local_iov; + kiocb = &vvp_env_info(env)->vti_kiocb; + local_iov->iov_base = (void __user *)buf; + local_iov->iov_len = count; + init_sync_kiocb(kiocb, file); + kiocb->ki_pos = *ppos; + kiocb->ki_left = count; - /* POSIX, but surprised the VFS doesn't check this already */ - if (count == 0) - RETURN(0); - - /* If file was opened for LL_IOC_LOV_SETSTRIPE but the ioctl wasn't - * called on the file, don't fail the below assertion (bug 2388). 
*/ - if (file->f_flags & O_LOV_DELAY_CREATE && - ll_i2info(inode)->lli_smd == NULL) - RETURN(-EBADF); - - LASSERT(ll_i2info(inode)->lli_smd != NULL); - - down(&ll_i2info(inode)->lli_write_sem); - -repeat: - chunk = 0; /* just to fix gcc's warning */ - end = *ppos + count - 1; - - if (file->f_flags & O_APPEND) { - lock_start = 0; - lock_end = OBD_OBJECT_EOF; - } else if (sbi->ll_max_rw_chunk != 0) { - /* first, let's know the end of the current stripe */ - end = *ppos; - obd_extent_calc(sbi->ll_dt_exp, lsm, OBD_CALC_STRIPE_END, - (obd_off *)&end); - - /* correct, the end is beyond the request */ - if (end > *ppos + count - 1) - end = *ppos + count - 1; - - /* and chunk shouldn't be too large even if striping is wide */ - if (end - *ppos > sbi->ll_max_rw_chunk) - end = *ppos + sbi->ll_max_rw_chunk - 1; - lock_start = *ppos; - lock_end = end; - } else { - lock_start = *ppos; - lock_end = *ppos + count - 1; - } - - tree_locked = ll_file_get_tree_lock(&tree, file, buf, count, - lock_start, lock_end, OBD_BRW_WRITE); - if (tree_locked < 0) - GOTO(out, retval = tree_locked); - - /* This is ok, g_f_w will overwrite this under i_sem if it races - * with a local truncate, it just makes our maxbyte checking easier. - * The i_size value gets updated in ll_extent_lock() as a consequence - * of the [0,EOF] extent lock we requested above. */ - if (file->f_flags & O_APPEND) { - *ppos = i_size_read(inode); - end = *ppos + count - 1; - } + result = ll_file_aio_write(kiocb, local_iov, 1, kiocb->ki_pos); + *ppos = kiocb->ki_pos; - if (*ppos >= maxbytes) { - send_sig(SIGXFSZ, current, 0); - GOTO(out_unlock, retval = -EFBIG); - } - if (end > maxbytes - 1) - end = maxbytes - 1; - - /* generic_file_write handles O_APPEND after getting i_mutex */ - chunk = end - *ppos + 1; - CDEBUG(D_INFO, "Writing inode %lu, "LPSZ" bytes, offset %Lu\n", - inode->i_ino, chunk, *ppos); - if (tree_locked) - retval = generic_file_write(file, buf, chunk, ppos); - else - retval = ll_file_lockless_io(file, (char*)buf, chunk, - ppos, WRITE); - ll_rw_stats_tally(ll_i2sbi(inode), current->pid, file, chunk, 1); - -out_unlock: - if (tree_locked) - ll_tree_unlock(&tree); - -out: - if (retval > 0) { - buf += retval; - count -= retval; - sum += retval; - if (retval == chunk && count > 0) - goto repeat; - } - - up(&ll_i2info(inode)->lli_write_sem); - - retval = (sum > 0) ? sum : retval; - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_WRITE_BYTES, - retval > 0 ? retval : 0); - RETURN(retval); + cl_env_put(env, &refcheck); + RETURN(result); } +#endif + /* * Send file content (through pagecache) somewhere with helper @@ -1865,100 +1095,28 @@ out: static ssize_t ll_file_sendfile(struct file *in_file, loff_t *ppos,size_t count, read_actor_t actor, void *target) { - struct inode *inode = in_file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct ll_lock_tree tree; - struct ll_lock_tree_node *node; - struct ost_lvb lvb; - struct ll_ra_read bead; - int rc; - ssize_t retval; - __u64 kms; + struct lu_env *env; + struct ccc_io_args *args; + ssize_t result; + int refcheck; ENTRY; - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),size="LPSZ",offset=%Ld\n", - inode->i_ino, inode->i_generation, inode, count, *ppos); - - /* "If nbyte is 0, read() will return 0 and have no other results." 
- * -- Single Unix Spec */ - if (count == 0) - RETURN(0); - - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_READ_BYTES, count); - /* turn off the kernel's read-ahead */ - in_file->f_ra.ra_pages = 0; - - /* File with no objects, nothing to lock */ - if (!lsm) - RETURN(generic_file_sendfile(in_file, ppos,count,actor,target)); - - node = ll_node_from_inode(inode, *ppos, *ppos + count - 1, LCK_PR); - if (IS_ERR(node)) - RETURN(PTR_ERR(node)); - - tree.lt_fd = LUSTRE_FPRIVATE(in_file); - rc = ll_tree_lock(&tree, node, NULL, count, - in_file->f_flags & O_NONBLOCK?LDLM_FL_BLOCK_NOWAIT:0); - if (rc != 0) - RETURN(rc); - - ll_clear_file_contended(inode); - ll_inode_size_lock(inode, 1); - /* - * Consistency guarantees: following possibilities exist for the - * relation between region being read and real file size at this - * moment: - * - * (A): the region is completely inside of the file; - * - * (B-x): x bytes of region are inside of the file, the rest is - * outside; - * - * (C): the region is completely outside of the file. - * - * This classification is stable under DLM lock acquired by - * ll_tree_lock() above, because to change class, other client has to - * take DLM lock conflicting with our lock. Also, any updates to - * ->i_size by other threads on this client are serialized by - * ll_inode_size_lock(). This guarantees that short reads are handled - * correctly in the face of concurrent writes and truncates. - */ - inode_init_lvb(inode, &lvb); - obd_merge_lvb(ll_i2sbi(inode)->ll_dt_exp, lsm, &lvb, 1); - kms = lvb.lvb_size; - if (*ppos + count - 1 > kms) { - /* A glimpse is necessary to determine whether we return a - * short read (B) or some zeroes at the end of the buffer (C) */ - ll_inode_size_unlock(inode, 1); - retval = ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED); - if (retval) - goto out; - } else { - /* region is within kms and, hence, within real file size (A) */ - i_size_write(inode, kms); - ll_inode_size_unlock(inode, 1); - } - CDEBUG(D_INFO, "Send ino %lu, "LPSZ" bytes, offset %lld, i_size %llu\n", - inode->i_ino, count, *ppos, i_size_read(inode)); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); - bead.lrr_start = *ppos >> CFS_PAGE_SHIFT; - bead.lrr_count = (count + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT; - ll_ra_read_in(in_file, &bead); - /* BUG: 5972 */ - file_accessed(in_file); - retval = generic_file_sendfile(in_file, ppos, count, actor, target); - ll_ra_read_ex(in_file, &bead); - - out: - ll_tree_unlock(&tree); - RETURN(retval); + args = &vvp_env_info(env)->vti_args; + args->cia_is_sendfile = 1; + args->cia_target = target; + args->cia_actor = actor; + result = ll_file_io_generic(env, args, in_file, CIT_READ, ppos, count); + cl_env_put(env, &refcheck); + RETURN(result); } static int ll_lov_recreate_obj(struct inode *inode, struct file *file, unsigned long arg) { - struct ll_inode_info *lli = ll_i2info(inode); struct obd_export *exp = ll_i2dtexp(inode); struct ll_recreate_obj ucreatp; struct obd_trans_info oti = { 0 }; @@ -1979,8 +1137,8 @@ static int ll_lov_recreate_obj(struct inode *inode, struct file *file, if (oa == NULL) RETURN(-ENOMEM); - down(&lli->lli_size_sem); - lsm = lli->lli_smd; + ll_inode_size_lock(inode, 0); + lsm = ll_i2info(inode)->lli_smd; if (lsm == NULL) GOTO(out, rc = -ENOENT); lsm_size = sizeof(*lsm) + (sizeof(struct lov_oinfo) * @@ -2004,7 +1162,7 @@ static int ll_lov_recreate_obj(struct inode *inode, struct file *file, OBD_FREE(lsm2, lsm_size); GOTO(out, rc); out: - up(&lli->lli_size_sem); + ll_inode_size_unlock(inode, 0); 
OBDO_FREE(oa); return rc; } @@ -2012,16 +1170,15 @@ out: int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, int flags, struct lov_user_md *lum, int lum_size) { - struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm; struct lookup_intent oit = {.it_op = IT_OPEN, .it_flags = flags}; int rc = 0; ENTRY; - down(&lli->lli_size_sem); - lsm = lli->lli_smd; + ll_inode_size_lock(inode, 0); + lsm = ll_i2info(inode)->lli_smd; if (lsm) { - up(&lli->lli_size_sem); + ll_inode_size_unlock(inode, 0); CDEBUG(D_IOCTL, "stripe already exists for ino %lu\n", inode->i_ino); RETURN(-EEXIST); @@ -2039,7 +1196,7 @@ int ll_lov_setstripe_ea_info(struct inode *inode, struct file *file, ll_release_openhandle(file->f_dentry, &oit); out: - up(&lli->lli_size_sem); + ll_inode_size_unlock(inode, 0); ll_intent_release(&oit); RETURN(rc); out_req_free: @@ -2251,61 +1408,15 @@ static int ll_lov_getstripe(struct inode *inode, unsigned long arg) static int ll_get_grouplock(struct inode *inode, struct file *file, unsigned long arg) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - ldlm_policy_data_t policy = { .l_extent = { .start = 0, - .end = OBD_OBJECT_EOF}}; - struct lustre_handle lockh = { 0 }; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - int flags = 0, rc; - ENTRY; - - if (fd->fd_flags & LL_FILE_GROUP_LOCKED) { - RETURN(-EINVAL); - } - - policy.l_extent.gid = arg; - if (file->f_flags & O_NONBLOCK) - flags = LDLM_FL_BLOCK_NOWAIT; - - rc = ll_extent_lock(fd, inode, lsm, LCK_GROUP, &policy, &lockh, flags); - if (rc) - RETURN(rc); - - fd->fd_flags |= LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK; - fd->fd_gid = arg; - memcpy(&fd->fd_cwlockh, &lockh, sizeof(lockh)); - - RETURN(0); + /* XXX */ + return -ENOSYS; } static int ll_put_grouplock(struct inode *inode, struct file *file, unsigned long arg) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - int rc; - ENTRY; - - if (!(fd->fd_flags & LL_FILE_GROUP_LOCKED)) { - /* Ugh, it's already unlocked. */ - RETURN(-EINVAL); - } - - if (fd->fd_gid != arg) /* Ugh? Unlocking with different gid? */ - RETURN(-EINVAL); - - fd->fd_flags &= ~(LL_FILE_GROUP_LOCKED|LL_FILE_IGNORE_LOCK); - - rc = ll_extent_unlock(fd, inode, lsm, LCK_GROUP, &fd->fd_cwlockh); - if (rc) - RETURN(rc); - - fd->fd_gid = 0; - memset(&fd->fd_cwlockh, 0, sizeof(fd->fd_cwlockh)); - - RETURN(0); + /* XXX */ + return -ENOSYS; } #if LUSTRE_FIX >= 50 @@ -2745,8 +1856,6 @@ error: loff_t ll_file_seek(struct file *file, loff_t offset, int origin) { struct inode *inode = file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; loff_t retval; ENTRY; retval = offset + ((origin == 2) ? 
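The ll_lov_recreate_obj() and ll_lov_setstripe_ea_info() hunks above drop the open-coded down()/up() on lli->lli_size_sem in favour of the ll_inode_size_lock()/ll_inode_size_unlock() wrappers, so every region that needs a stable lli_smd and i_size now goes through one choke point. The resulting bracketing pattern, as a sketch with the error paths condensed:

    ll_inode_size_lock(inode, 0);           /* serializes lli_smd / i_size */
    lsm = ll_i2info(inode)->lli_smd;
    if (lsm == NULL) {
            ll_inode_size_unlock(inode, 0);
            return -ENOENT;
    }
    /* ... work that relies on the striping not changing underneath ... */
    ll_inode_size_unlock(inode, 0);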
i_size_read(inode) : @@ -2762,11 +1871,9 @@ loff_t ll_file_seek(struct file *file, loff_t offset, int origin) if (file->f_flags & O_NONBLOCK) nonblock = LDLM_FL_BLOCK_NOWAIT; - if (lsm != NULL) { - rc = ll_glimpse_size(inode, nonblock); - if (rc != 0) - RETURN(rc); - } + rc = cl_glimpse_size(inode); + if (rc != 0) + RETURN(rc); ll_inode_size_lock(inode, 0); offset += i_size_read(inode); @@ -3121,9 +2228,9 @@ int ll_inode_revalidate_it(struct dentry *dentry, struct lookup_intent *it) if (ll_i2info(inode)->lli_smd == NULL) GOTO(out, rc = 0); - /* ll_glimpse_size will prefer locally cached writes if they extend + /* cl_glimpse_size will prefer locally cached writes if they extend * the file */ - rc = ll_glimpse_size(inode, 0); + rc = cl_glimpse_size(inode); EXIT; out: ptlrpc_req_finished(req); @@ -3260,10 +2367,24 @@ check_capabilities: } #endif +#ifdef HAVE_FILE_READV +#define READ_METHOD readv +#define READ_FUNCTION ll_file_readv +#define WRITE_METHOD writev +#define WRITE_FUNCTION ll_file_writev +#else +#define READ_METHOD aio_read +#define READ_FUNCTION ll_file_aio_read +#define WRITE_METHOD aio_write +#define WRITE_FUNCTION ll_file_aio_write +#endif + /* -o localflock - only provides locally consistent flock locks */ struct file_operations ll_file_operations = { .read = ll_file_read, + .READ_METHOD = READ_FUNCTION, .write = ll_file_write, + .WRITE_METHOD = WRITE_FUNCTION, .ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, @@ -3275,7 +2396,9 @@ struct file_operations ll_file_operations = { struct file_operations ll_file_operations_flock = { .read = ll_file_read, + .READ_METHOD = READ_FUNCTION, .write = ll_file_write, + .WRITE_METHOD = WRITE_FUNCTION, .ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, @@ -3292,7 +2415,9 @@ struct file_operations ll_file_operations_flock = { /* These are for -o noflock - to return ENOSYS on flock calls */ struct file_operations ll_file_operations_noflock = { .read = ll_file_read, + .READ_METHOD = READ_FUNCTION, .write = ll_file_write, + .WRITE_METHOD = WRITE_FUNCTION, .ioctl = ll_file_ioctl, .open = ll_file_open, .release = ll_file_release, diff --git a/lustre/llite/llite_capa.c b/lustre/llite/llite_capa.c index ab515b2..1a7bd1f 100644 --- a/lustre/llite/llite_capa.c +++ b/lustre/llite/llite_capa.c @@ -157,9 +157,10 @@ static void ll_delete_capa(struct obd_capa *ocapa) } DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free client"); - list_del(&ocapa->c_list); + list_del_init(&ocapa->c_list); capa_count[CAPA_SITE_CLIENT]--; - free_capa(ocapa); + /* release the ref when alloc */ + capa_put(ocapa); } /* three places where client capa is deleted: @@ -238,7 +239,6 @@ static int capa_thread_main(void *unused) capa_get(ocapa); ll_capa_renewed++; spin_unlock(&capa_lock); - rc = md_renew_capa(ll_i2mdexp(inode), ocapa, ll_update_capa); spin_lock(&capa_lock); @@ -259,7 +259,7 @@ static int capa_thread_main(void *unused) break; } - if (atomic_read(&ocapa->c_refc)) { + if (atomic_read(&ocapa->c_refc) > 1) { DEBUG_CAPA(D_SEC, &ocapa->c_capa, "expired(c_refc %d), don't release", atomic_read(&ocapa->c_refc)); @@ -312,37 +312,17 @@ void ll_capa_thread_stop(void) ll_capa_thread.t_flags & SVC_STOPPED); } -static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_capa *ocapa; - - /* inside capa_lock */ - list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { - if ((capa_opc(&ocapa->c_capa) & opc) != opc) - continue; - - 
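The READ_METHOD/WRITE_METHOD macros above let the three file_operations tables stay textually identical across kernel generations: on kernels that still export f_op->readv/->writev the designated initializers bind ll_file_readv/ll_file_writev, and on newer kernels the very same lines bind the aio entry points. Roughly what one table looks like after preprocessing on a kernel without HAVE_FILE_READV (a sketch; the remaining members are elided):

    struct file_operations ll_file_operations = {
            .read      = ll_file_read,
            .aio_read  = ll_file_aio_read,   /* .READ_METHOD = READ_FUNCTION */
            .write     = ll_file_write,
            .aio_write = ll_file_aio_write,  /* .WRITE_METHOD = WRITE_FUNCTION */
            /* .ioctl, .open, .release, ... unchanged */
    };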
LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), - ll_inode2fid(inode))); - LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); - - DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); - return ocapa; - } - - return NULL; -} - struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc) { struct ll_inode_info *lli = ll_i2info(inode); struct obd_capa *ocapa; int found = 0; - if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0) - return NULL; ENTRY; + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_OSS_CAPA) == 0) + RETURN(NULL); + LASSERT(opc == CAPA_OPC_OSS_WRITE || opc == CAPA_OPC_OSS_RW || opc == CAPA_OPC_OSS_TRUNC); @@ -352,14 +332,17 @@ struct obd_capa *ll_osscapa_get(struct inode *inode, __u64 opc) continue; if ((opc & CAPA_OPC_OSS_WRITE) && capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_WRITE)) { - found = 1; break; + found = 1; + break; } else if ((opc & CAPA_OPC_OSS_READ) && capa_opc_supported(&ocapa->c_capa, CAPA_OPC_OSS_READ)) { - found = 1; break; + found = 1; + break; } else if ((opc & CAPA_OPC_OSS_TRUNC) && capa_opc_supported(&ocapa->c_capa, opc)) { - found = 1; break; + found = 1; + break; } } @@ -393,7 +376,7 @@ struct obd_capa *ll_mdscapa_get(struct inode *inode) ENTRY; LASSERT(inode != NULL); - + if ((ll_i2sbi(inode)->ll_flags & LL_SBI_MDS_CAPA) == 0) RETURN(NULL); @@ -428,12 +411,33 @@ static struct obd_capa *do_add_mds_capa(struct inode *inode, DEBUG_CAPA(D_SEC, capa, "update MDS"); - free_capa(ocapa); + capa_put(ocapa); ocapa = old; } return ocapa; } +static struct obd_capa *do_lookup_oss_capa(struct inode *inode, int opc) +{ + struct ll_inode_info *lli = ll_i2info(inode); + struct obd_capa *ocapa; + + /* inside capa_lock */ + list_for_each_entry(ocapa, &lli->lli_oss_capas, u.cli.lli_list) { + if ((capa_opc(&ocapa->c_capa) & opc) != opc) + continue; + + LASSERT(lu_fid_eq(capa_fid(&ocapa->c_capa), + ll_inode2fid(inode))); + LASSERT(ocapa->c_site == CAPA_SITE_CLIENT); + + DEBUG_CAPA(D_SEC, &ocapa->c_capa, "found client"); + return ocapa; + } + + return NULL; +} + static inline void inode_add_oss_capa(struct inode *inode, struct obd_capa *ocapa) { @@ -478,7 +482,7 @@ static struct obd_capa *do_add_oss_capa(struct inode *inode, DEBUG_CAPA(D_SEC, capa, "update OSS"); - free_capa(ocapa); + capa_put(ocapa); ocapa = old; } @@ -495,7 +499,7 @@ struct obd_capa *ll_add_capa(struct inode *inode, struct obd_capa *ocapa) /* truncate capa won't renew */ if (ocapa->c_capa.lc_opc != CAPA_OPC_OSS_TRUNC) { set_capa_expiry(ocapa); - list_del(&ocapa->c_list); + list_del_init(&ocapa->c_list); sort_add_capa(ocapa, ll_capa_list); update_capa_timer(ocapa, capa_renewal_time(ocapa)); @@ -546,18 +550,18 @@ int ll_update_capa(struct obd_capa *ocapa, struct lustre_capa *capa) } } - list_del(&ocapa->c_list); + list_del_init(&ocapa->c_list); sort_add_capa(ocapa, &ll_idle_capas); spin_unlock(&capa_lock); capa_put(ocapa); iput(inode); - return rc; + RETURN(rc); } spin_lock(&ocapa->c_lock); LASSERT(!memcmp(&ocapa->c_capa, capa, - offsetof(struct lustre_capa, lc_flags))); + offsetof(struct lustre_capa, lc_opc))); ocapa->c_capa = *capa; set_capa_expiry(ocapa); spin_unlock(&ocapa->c_lock); @@ -615,10 +619,13 @@ void ll_truncate_free_capa(struct obd_capa *ocapa) LASSERT(ocapa->c_capa.lc_opc & CAPA_OPC_OSS_TRUNC); DEBUG_CAPA(D_SEC, &ocapa->c_capa, "free truncate"); + /* release ref when find */ capa_put(ocapa); - spin_lock(&capa_lock); - ll_delete_capa(ocapa); - spin_unlock(&capa_lock); + if (likely(ocapa->c_capa.lc_opc == CAPA_OPC_OSS_TRUNC)) { + spin_lock(&capa_lock); + ll_delete_capa(ocapa); + spin_unlock(&capa_lock); + } 
} void ll_clear_inode_capas(struct inode *inode) @@ -630,7 +637,7 @@ void ll_clear_inode_capas(struct inode *inode) ocapa = lli->lli_mds_capa; if (ocapa) ll_delete_capa(ocapa); - + list_for_each_entry_safe(ocapa, tmp, &lli->lli_oss_capas, u.cli.lli_list) ll_delete_capa(ocapa); diff --git a/lustre/llite/llite_close.c b/lustre/llite/llite_close.c index 1c973b6..53b8514 100644 --- a/lustre/llite/llite_close.c +++ b/lustre/llite/llite_close.c @@ -46,38 +46,39 @@ #include #include "llite_internal.h" -/* record that a write is in flight */ -void llap_write_pending(struct inode *inode, struct ll_async_page *llap) +/** records that a write is in flight */ +void vvp_write_pending(struct ccc_object *club, struct ccc_page *page) { - struct ll_inode_info *lli = ll_i2info(inode); + struct ll_inode_info *lli = ll_i2info(club->cob_inode); ENTRY; spin_lock(&lli->lli_lock); lli->lli_flags |= LLIF_SOM_DIRTY; - if (llap && list_empty(&llap->llap_pending_write)) - list_add(&llap->llap_pending_write, - &lli->lli_pending_write_llaps); + if (page != NULL && list_empty(&page->cpg_pending_linkage)) + list_add(&page->cpg_pending_linkage, &club->cob_pending_list); spin_unlock(&lli->lli_lock); EXIT; } -/* record that a write has completed */ -int llap_write_complete(struct inode *inode, struct ll_async_page *llap) +/** records that a write has completed */ +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page) { - struct ll_inode_info *lli = ll_i2info(inode); + struct ll_inode_info *lli = ll_i2info(club->cob_inode); int rc = 0; - + ENTRY; spin_lock(&lli->lli_lock); - if (llap && !list_empty(&llap->llap_pending_write)) { - list_del_init(&llap->llap_pending_write); + if (page != NULL && !list_empty(&page->cpg_pending_linkage)) { + list_del_init(&page->cpg_pending_linkage); rc = 1; } spin_unlock(&lli->lli_lock); - RETURN(rc); + if (rc) + ll_queue_done_writing(club->cob_inode, 0); + EXIT; } -/* Queue DONE_WRITING if +/** Queues DONE_WRITING if * - done writing is allowed; * - inode has no no dirty pages; */ void ll_queue_done_writing(struct inode *inode, unsigned long flags) @@ -94,7 +95,7 @@ void ll_queue_done_writing(struct inode *inode, unsigned long flags) if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) CWARN("ino %lu/%u(flags %lu) som valid it just after " "recovery\n", - inode->i_ino, inode->i_generation, + inode->i_ino, inode->i_generation, lli->lli_flags); /* DONE_WRITING is allowed and inode has no dirty page. */ spin_lock(&lcq->lcq_lock); @@ -118,8 +119,8 @@ void ll_queue_done_writing(struct inode *inode, unsigned long flags) spin_unlock(&lli->lli_lock); } -/* Close epoch and send Size-on-MDS attribute update if possible. - * Call this under @lli->lli_lock spinlock. */ +/** Closes epoch and sends Size-on-MDS attribute update if possible. Call + * this under ll_inode_info::lli_lock spinlock. */ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data, struct obd_client_handle **och, unsigned long flags) { @@ -140,7 +141,7 @@ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data, inode = igrab(inode); LASSERT(inode); GOTO(out, 0); - } + } if (flags & LLIF_DONE_WRITING) { /* Some pages are still dirty, it is early to send * DONE_WRITE. Wait untill all pages will be flushed @@ -190,7 +191,7 @@ void ll_epoch_close(struct inode *inode, struct md_op_data *op_data, "recovery\n", inode->i_ino, inode->i_generation, lli->lli_flags); - if (!ll_local_size(inode)) { + if (!cl_local_size(inode)) { /* Send Size-on-MDS Attributes if valid. Atime is sent along * with all the attributes. 
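The vvp_write_pending()/vvp_write_complete() pair above is the cl_page replacement for the old llap pending-write tracking: a page sits on ccc_object::cob_pending_list while its write is in flight, and unlinking the last pending page is what lets ll_queue_done_writing() close the epoch. A hedged sketch of the calling protocol (demo_writeback is hypothetical; the real call sites live in the vvp/cl_page I/O paths):

static void demo_writeback(struct ccc_object *club, struct ccc_page *cp)
{
        vvp_write_pending(club, cp);    /* link page, set LLIF_SOM_DIRTY */

        /* ... the asynchronous write RPC runs and completes ... */

        vvp_write_complete(club, cp);   /* unlink page; if it was linked,
                                         * ll_queue_done_writing() is called
                                         * so DONE_WRITING can be sent once
                                         * the pending list drains */
}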
*/ op_data->op_attr.ia_valid |= ATTR_MTIME_SET | ATTR_CTIME_SET | @@ -209,7 +210,7 @@ int ll_sizeonmds_update(struct inode *inode, struct md_open_data *mod, struct obdo *oa; int rc; ENTRY; - + /* LASSERT(!(lli->lli_flags & LLIF_MDS_SIZE_LOCK)); */ /* After recovery that can be valid. */ if (lli->lli_flags & LLIF_MDS_SIZE_LOCK) @@ -252,7 +253,8 @@ out: return rc; } -/* Send a DONE_WRITING rpc, pack Size-on-MDS attributes into it, if possible */ +/** Sends a DONE_WRITING rpc, packs Size-on-MDS attributes into it, if + * possible */ static void ll_done_writing(struct inode *inode) { struct obd_client_handle *och = NULL; @@ -261,7 +263,7 @@ static void ll_done_writing(struct inode *inode) ENTRY; LASSERT(ll_i2mdexp(inode)->exp_connect_flags & OBD_CONNECT_SOM); - + OBD_ALLOC_PTR(op_data); if (op_data == NULL) { CERROR("can't allocate op_data\n"); @@ -273,12 +275,12 @@ static void ll_done_writing(struct inode *inode) /* If there is no @och, we do not do D_W yet. */ if (och == NULL) GOTO(out, 0); - + ll_pack_inode2opdata(inode, op_data, &och->och_fh); rc = md_done_writing(ll_i2sbi(inode)->ll_md_exp, op_data, och->och_mod); if (rc == -EAGAIN) { - /* MDS has instructed us to obtain Size-on-MDS attribute from + /* MDS has instructed us to obtain Size-on-MDS attribute from * OSTs and send setattr to back to MDS. */ rc = ll_sizeonmds_update(inode, och->och_mod, &och->och_fh, op_data->op_ioepoch); @@ -322,7 +324,7 @@ static int ll_close_thread(void *arg) snprintf(name, sizeof(name) - 1, "ll_close"); cfs_daemonize(name); } - + complete(&lcq->lcq_comp); while (1) { diff --git a/lustre/llite/llite_fid.c b/lustre/llite/llite_fid.c index 15c4021..eab0e84 100644 --- a/lustre/llite/llite_fid.c +++ b/lustre/llite/llite_fid.c @@ -77,3 +77,15 @@ ino_t ll_fid_build_ino(struct ll_sb_info *sbi, ino = ino | 0x80000000; RETURN(ino); } + +__u32 ll_fid_build_gen(struct ll_sb_info *sbi, + struct lu_fid *fid) +{ + __u32 gen = 0; + ENTRY; + + if (fid_is_igif(fid)) { + gen = lu_igif_gen(fid); + } + RETURN(gen); +} diff --git a/lustre/llite/llite_internal.h b/lustre/llite/llite_internal.h index 07e2976..a03e1bf 100644 --- a/lustre/llite/llite_internal.h +++ b/lustre/llite/llite_internal.h @@ -44,6 +44,10 @@ #include /* for s2sbi */ #include +/* for struct cl_lock_descr and struct cl_io */ +#include +#include + #ifndef FMODE_EXEC #define FMODE_EXEC 0 #endif @@ -127,7 +131,6 @@ struct ll_inode_info { * Open handle data are needed for the recovery to reconstruct * the inode state on the MDS. XXX: recovery is not ready yet. */ struct obd_client_handle *lli_pending_och; - atomic_t lli_mmap_cnt; /* for writepage() only to communicate to fsync */ int lli_async_rc; @@ -171,12 +174,13 @@ struct ll_inode_info { * dir statahead. */ pid_t lli_opendir_pid; - /* + /* * since parent-child threads can share the same @file struct, * "opendir_key" is the token when dir close for case of parent exit * before child -- it is me should cleanup the dir readahead. 
*/ void *lli_opendir_key; struct ll_statahead_info *lli_sai; + struct cl_object *lli_clob; }; /* @@ -220,28 +224,31 @@ enum ra_stat { _NR_RA_STAT, }; -#define LL_RA_STAT _NR_RA_STAT -#define LL_RA_STAT_STRINGS { \ - [RA_STAT_HIT] = "hits", \ - [RA_STAT_MISS] = "misses", \ - [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", \ - [RA_STAT_MISS_IN_WINDOW] = "miss inside window", \ - [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", \ - [RA_STAT_FAILED_MATCH] = "failed lock match", \ - [RA_STAT_DISCARDED] = "read but discarded", \ - [RA_STAT_ZERO_LEN] = "zero length file", \ - [RA_STAT_ZERO_WINDOW] = "zero size window", \ - [RA_STAT_EOF] = "read-ahead to EOF", \ - [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", \ - [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page",\ -} - struct ll_ra_info { atomic_t ra_cur_pages; unsigned long ra_max_pages; unsigned long ra_max_read_ahead_whole_pages; }; +/* ra_io_arg will be filled in the beginning of ll_readahead with + * ras_lock, then the following ll_read_ahead_pages will read RA + * pages according to this arg, all the items in this structure are + * counted by page index. + */ +struct ra_io_arg { + unsigned long ria_start; /* start offset of read-ahead*/ + unsigned long ria_end; /* end offset of read-ahead*/ + /* If stride read pattern is detected, ria_stoff means where + * stride read is started. Note: for normal read-ahead, the + * value here is meaningless, and also it will not be accessed*/ + pgoff_t ria_stoff; + /* ria_length and ria_pages are the length and pages length in the + * stride I/O mode. And they will also be used to check whether + * it is stride I/O read-ahead in the read-ahead pages*/ + unsigned long ria_length; + unsigned long ria_pages; +}; + /* LL_HIST_MAX=32 causes an overflow */ #define LL_HIST_MAX 28 #define LL_HIST_START 12 /* buckets start at 2^12 = 4k */ @@ -267,7 +274,7 @@ struct ll_rw_process_info { loff_t rw_offset; size_t rw_smallest_extent; size_t rw_largest_extent; - struct file *rw_last_file; + struct ll_file_data *rw_last_file; }; enum stats_track_type { @@ -323,30 +330,10 @@ struct eacl_table { struct list_head et_entries[EE_HASHES]; }; -/* percpu data structure for lustre lru page list */ -struct ll_pglist_data { - spinlock_t llpd_lock; /* lock to protect llpg_list */ - struct list_head llpd_list; /* all pages (llap_pglist_item) */ - unsigned long llpd_gen; /* generation # of this list */ - unsigned long llpd_count; /* How many pages in this list */ - atomic_t llpd_sample_count; - unsigned long llpd_reblnc_count; - /* the pages in this list shouldn't be over this number */ - unsigned long llpd_budget; - int llpd_cpu; - /* which page the pglist data is in */ - struct page *llpd_page; - - /* stats */ - unsigned long llpd_hit; - unsigned long llpd_miss; - unsigned long llpd_cross; -}; - struct ll_sb_info { struct list_head ll_list; - /* this protects pglist(only ll_async_page_max) and ra_info. - * It isn't safe to grab from interrupt contexts. */ + /* this protects pglist and ra_info. 
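Given the ra_io_arg fields above -- ria_stoff is where the stride pattern starts, ria_length is stride_pages + stride_gap, and ria_pages is stride_pages -- a page index belongs to a stride read iff its offset within one period lands in the data segment. A hedged illustration (ria_page_in_stride is a hypothetical helper, not part of this patch; the ria_start/ria_end window bounds are assumed checked by the caller):

static int ria_page_in_stride(const struct ra_io_arg *ria, pgoff_t idx)
{
        unsigned long off;

        if (ria->ria_length == 0)       /* normal read-ahead: no stride */
                return 1;
        off = (idx - ria->ria_stoff) % ria->ria_length;
        return off < ria->ria_pages;    /* inside the data segment? */
}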
It isn't safe to + * grab from interrupt contexts */ spinlock_t ll_lock; spinlock_t ll_pp_extent_lock; /* Lock for pp_extent entries */ spinlock_t ll_process_lock; /* Lock for ll_rw_process_info */ @@ -365,23 +352,11 @@ struct ll_sb_info { struct lprocfs_stats *ll_stats; /* lprocfs stats counter */ - /* reblnc lock protects llpd_budget */ - spinlock_t ll_async_page_reblnc_lock; - unsigned long ll_async_page_reblnc_count; - unsigned long ll_async_page_sample_max; - /* I defined this array here rather than in ll_pglist_data - * because it is always accessed by only one cpu. -jay */ - unsigned long *ll_async_page_sample; unsigned long ll_async_page_max; - unsigned long ll_async_page_clock_hand; - lcounter_t ll_async_page_count; - struct ll_pglist_data **ll_pglist; + unsigned long ll_async_page_count; struct lprocfs_stats *ll_ra_stats; - unsigned ll_contention_time; /* seconds */ - unsigned ll_lockless_truncate_enable; /* true/false */ - struct ll_ra_info ll_ra_info; unsigned int ll_namelen; struct file_operations *ll_fop; @@ -394,6 +369,8 @@ struct ll_sb_info { * >0 - max. chunk to be read/written w/o lock re-acquiring */ unsigned long ll_max_rw_chunk; + struct lu_site *ll_site; + struct cl_device *ll_cl; /* Statistics */ struct ll_rw_extents_info ll_rw_extents_info; int ll_extent_process_count; @@ -425,68 +402,6 @@ struct ll_sb_info { #define LL_DEFAULT_MAX_RW_CHUNK (32 * 1024 * 1024) -#define LL_PGLIST_DATA_CPU(sbi, cpu) ((sbi)->ll_pglist[cpu]) -#define LL_PGLIST_DATA(sbi) LL_PGLIST_DATA_CPU(sbi, smp_processor_id()) - -static inline struct ll_pglist_data *ll_pglist_cpu_lock( - struct ll_sb_info *sbi, - int cpu) -{ - spin_lock(&sbi->ll_pglist[cpu]->llpd_lock); - return LL_PGLIST_DATA_CPU(sbi, cpu); -} - -static inline void ll_pglist_cpu_unlock(struct ll_sb_info *sbi, int cpu) -{ - spin_unlock(&sbi->ll_pglist[cpu]->llpd_lock); -} - -static inline struct ll_pglist_data *ll_pglist_double_lock( - struct ll_sb_info *sbi, - int cpu, struct ll_pglist_data **pd_cpu) -{ - int current_cpu = get_cpu(); - - if (cpu == current_cpu) { - ll_pglist_cpu_lock(sbi, cpu); - } else if (current_cpu < cpu) { - ll_pglist_cpu_lock(sbi, current_cpu); - ll_pglist_cpu_lock(sbi, cpu); - } else { - ll_pglist_cpu_lock(sbi, cpu); - ll_pglist_cpu_lock(sbi, current_cpu); - } - - if (pd_cpu) - *pd_cpu = LL_PGLIST_DATA_CPU(sbi, cpu); - - return LL_PGLIST_DATA(sbi); -} - -static inline void ll_pglist_double_unlock(struct ll_sb_info *sbi, int cpu) -{ - int current_cpu = smp_processor_id(); - if (cpu == current_cpu) { - ll_pglist_cpu_unlock(sbi, cpu); - } else { - ll_pglist_cpu_unlock(sbi, cpu); - ll_pglist_cpu_unlock(sbi, current_cpu); - } - put_cpu(); -} - -static inline struct ll_pglist_data *ll_pglist_lock(struct ll_sb_info *sbi) -{ - ll_pglist_cpu_lock(sbi, get_cpu()); - return LL_PGLIST_DATA(sbi); -} - -static inline void ll_pglist_unlock(struct ll_sb_info *sbi) -{ - ll_pglist_cpu_unlock(sbi, smp_processor_id()); - put_cpu(); -} - struct ll_ra_read { pgoff_t lrr_start; pgoff_t lrr_count; @@ -557,12 +472,12 @@ struct ll_readahead_state { * protected by ->ras_lock. */ struct list_head ras_read_beads; - /* + /* * The following 3 items are used for detecting the stride I/O - * mode. - * In stride I/O mode, - * ...............|-----data-----|****gap*****|--------|******|.... - * offset |-stride_pages-|-stride_gap-| + * mode. + * In stride I/O mode, + * ...............|-----data-----|****gap*****|--------|******|.... 
+ * offset |-stride_pages-|-stride_gap-| * ras_stride_offset = offset; * ras_stride_length = stride_pages + stride_gap; * ras_stride_pages = stride_pages; @@ -571,7 +486,7 @@ struct ll_readahead_state { unsigned long ras_stride_length; unsigned long ras_stride_pages; pgoff_t ras_stride_offset; - /* + /* * number of consecutive stride request count, and it is similar as * ras_consecutive_requests, but used for stride I/O mode. * Note: only more than 2 consecutive stride request are detected, @@ -592,6 +507,7 @@ struct ll_file_data { unsigned long fd_gid; struct ll_file_dir fd_dir; __u32 fd_flags; + struct file *fd_file; }; struct lov_stripe_md; @@ -618,48 +534,6 @@ void ll_i2gids(__u32 *suppgids, struct inode *i1,struct inode *i2); extern cfs_mem_cache_t *ll_async_page_slab; extern size_t ll_async_page_slab_size; -struct ll_async_page { - int llap_magic; - /* only trust these if the page lock is providing exclusion */ - unsigned int llap_write_queued:1, - llap_defer_uptodate:1, - llap_origin:3, - llap_ra_used:1, - llap_ignore_quota:1, - llap_nocache:1, - llap_lockless_io_page:1, - llap_reserved:7; - unsigned int llap_pglist_cpu:16; - void *llap_cookie; - struct page *llap_page; - struct list_head llap_pending_write; - struct list_head llap_pglist_item; - /* checksum for paranoid I/O debugging */ - __u32 llap_checksum; -}; - -static inline struct ll_async_page *llap_from_cookie(void *ptr) -{ - struct ll_async_page *ap = ptr; - LASSERT(ap->llap_magic == LLAP_MAGIC); - return ap; -} - -/* - * enumeration of llap_from_page() call-sites. Used to export statistics in - * /proc/fs/lustre/llite/fsN/dump_page_cache. - */ -enum { - LLAP_ORIGIN_UNKNOWN = 0, - LLAP_ORIGIN_READPAGE, - LLAP_ORIGIN_READAHEAD, - LLAP_ORIGIN_COMMIT_WRITE, - LLAP_ORIGIN_WRITEPAGE, - LLAP_ORIGIN_REMOVEPAGE, - LLAP_ORIGIN_LOCKLESS_IO, - LLAP__ORIGIN_MAX, -}; -extern char *llap_origins[]; #ifdef HAVE_REGISTER_CACHE #include @@ -738,22 +612,17 @@ void ll_lookup_finish_locks(struct lookup_intent *it, struct dentry *dentry); /* llite/rw.c */ int ll_prepare_write(struct file *, struct page *, unsigned from, unsigned to); int ll_commit_write(struct file *, struct page *, unsigned from, unsigned to); -int ll_writepage(struct page *page); -void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa); -int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc); -int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction); -struct ll_async_page *llap_from_page(struct page *page, unsigned origin); -extern struct cache_definition ll_cache_definition; +int ll_writepage(struct page *page, struct writeback_control *wbc); void ll_removepage(struct page *page); int ll_readpage(struct file *file, struct page *page); -struct ll_async_page *llap_cast_private(struct page *page); void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras); -void ll_ra_accounting(struct ll_async_page *llap,struct address_space *mapping); void ll_truncate(struct inode *inode); int ll_file_punch(struct inode *, loff_t, int); ssize_t ll_file_lockless_io(struct file *, char *, size_t, loff_t *, int); void ll_clear_file_contended(struct inode*); int ll_sync_page_range(struct inode *, struct address_space *, loff_t, size_t); +int ll_readahead(const struct lu_env *env, struct cl_io *io, struct ll_readahead_state *ras, + struct address_space *mapping, struct cl_page_list *queue, int flags); /* llite/file.c */ extern struct file_operations ll_file_operations; @@ -764,19 +633,11 @@ extern int ll_inode_revalidate_it(struct dentry *, 
struct lookup_intent *); extern int ll_have_md_lock(struct inode *inode, __u64 bits); extern ldlm_mode_t ll_take_md_lock(struct inode *inode, __u64 bits, struct lustre_handle *lockh); -int ll_region_mapped(unsigned long addr, size_t count); -int ll_extent_lock(struct ll_file_data *, struct inode *, - struct lov_stripe_md *, int mode, ldlm_policy_data_t *, - struct lustre_handle *, int ast_flags); -int ll_extent_unlock(struct ll_file_data *, struct inode *, - struct lov_stripe_md *, int mode, struct lustre_handle *); int ll_file_open(struct inode *inode, struct file *file); int ll_file_release(struct inode *inode, struct file *file); int ll_lsm_getattr(struct obd_export *, struct lov_stripe_md *, struct obdo *); -int ll_local_size(struct inode *inode); int ll_glimpse_ioctl(struct ll_sb_info *sbi, struct lov_stripe_md *lsm, lstat_t *st); -int ll_glimpse_size(struct inode *inode, int ast_flags); int ll_local_open(struct file *file, struct lookup_intent *it, struct ll_file_data *fd, struct obd_client_handle *och); @@ -793,8 +654,9 @@ int ll_md_setattr(struct inode *inode, struct md_op_data *op_data, struct md_open_data **mod); void ll_pack_inode2opdata(struct inode *inode, struct md_op_data *op_data, struct lustre_handle *fh); -extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file - *file, size_t count, int rw); +extern void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw); int ll_getattr_it(struct vfsmount *mnt, struct dentry *de, struct lookup_intent *it, struct kstat *stat); int ll_getattr(struct vfsmount *mnt, struct dentry *de, struct kstat *stat); @@ -808,15 +670,12 @@ int ll_lov_getstripe_ea_info(struct inode *inode, const char *filename, struct ptlrpc_request **request); int ll_dir_setstripe(struct inode *inode, struct lov_user_md *lump, int set_default); -int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, +int ll_dir_getstripe(struct inode *inode, struct lov_mds_md **lmm, int *lmm_size, struct ptlrpc_request **request); int ll_fsync(struct file *file, struct dentry *dentry, int data); int ll_fiemap(struct inode *inode, struct ll_user_fiemap *fiemap, int num_bytes); -void ll_pin_extent_cb(void *data); -int ll_page_removal_cb(void *data, int discard); -int ll_extent_lock_cancel_cb(struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, int flag); +int ll_merge_lvb(struct inode *inode); /* llite/dcache.c */ /* llite/namei.c */ @@ -873,8 +732,6 @@ int ll_prep_inode(struct inode **inode, struct ptlrpc_request *req, struct super_block *); void lustre_dump_dentry(struct dentry *, int recur); void lustre_dump_inode(struct inode *); -struct ll_async_page *llite_pglist_next_llap(struct list_head *head, - struct list_head *list); int ll_obd_statfs(struct inode *inode, void *arg); int ll_get_max_mdsize(struct ll_sb_info *sbi, int *max_mdsize); int ll_process_config(struct lustre_cfg *lcfg); @@ -912,9 +769,96 @@ struct ll_close_queue { atomic_t lcq_stop; }; -void llap_write_pending(struct inode *inode, struct ll_async_page *llap); -int llap_write_complete(struct inode *inode, struct ll_async_page *llap); -int ll_inode_dirty(struct inode *inode, unsigned long flags); +struct vvp_thread_info { + struct ost_lvb vti_lvb; + struct cl_2queue vti_queue; + struct iovec vti_local_iov; + struct ccc_io_args vti_args; + struct ra_io_arg vti_ria; + struct kiocb vti_kiocb; +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode); + +static inline struct vvp_thread_info *vvp_env_info(const 
struct lu_env *env) +{ + extern struct lu_context_key vvp_key; + struct vvp_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &vvp_key); + LASSERT(info != NULL); + return info; +} + +void vvp_write_pending (struct ccc_object *club, struct ccc_page *page); +void vvp_write_complete(struct ccc_object *club, struct ccc_page *page); + +struct vvp_io { + union { + struct { + read_actor_t cui_actor; + void *cui_target; + } read; + struct vvp_fault_io { + /** + * Inode modification time that is checked across DLM + * lock request. + */ + time_t ft_mtime; + struct vm_area_struct *ft_vma; + /** + * Virtual address at which fault occurred. + */ + unsigned long ft_address; + /** + * Fault type, as to be supplied to filemap_nopage(). + */ + int *ft_type; + } fault; + } u; + /** + * Read-ahead state used by read and page-fault IO contexts. + */ + struct ll_ra_read cui_bead; + /** + * Set when cui_bead has been initialized. + */ + int cui_ra_window_set; + /** + * If IO was created directly in low level method like + * ->prepare_write(), this field stores the number of method calls + * that constitute this IO. This field is decremented by ll_cl_fini(), + * and cl_io is destroyed when it reaches 0. When oneshot IO + * completes, this field is set to -1. + */ + + int cui_oneshot; + /** + * Partially truncated page that vvp_io_trunc_start() keeps locked + * across truncate. + */ + struct cl_page *cui_partpage; +}; + +struct vvp_session { + struct vvp_io vs_ios; +}; + +static inline struct vvp_session *vvp_env_session(const struct lu_env *env) +{ + extern struct lu_context_key vvp_session_key; + struct vvp_session *ses; + + ses = lu_context_key_get(env->le_ses, &vvp_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct vvp_io *vvp_env_io(const struct lu_env *env) +{ + return &vvp_env_session(env)->vs_ios; +} + void ll_queue_done_writing(struct inode *inode, unsigned long flags); void ll_close_thread_shutdown(struct ll_close_queue *lcq); int ll_close_thread_start(struct ll_close_queue **lcq_ret); @@ -934,10 +878,9 @@ int ll_teardown_mmaps(struct address_space *mapping, __u64 first, __u64 last); int ll_file_mmap(struct file * file, struct vm_area_struct * vma); struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, __u64 end, ldlm_mode_t mode); -int ll_tree_lock(struct ll_lock_tree *tree, - struct ll_lock_tree_node *first_node, - const char *buf, size_t count, int ast_flags); -int ll_tree_unlock(struct ll_lock_tree *tree); +void policy_from_vma(ldlm_policy_data_t *policy, + struct vm_area_struct *vma, unsigned long addr, size_t count); +struct vm_area_struct *our_vma(unsigned long addr, size_t count); #define ll_s2sbi(sb) (s2lsi(sb)->lsi_llsbi) @@ -1021,6 +964,7 @@ int lustre_check_remote_perm(struct inode *inode, int mask); /* llite/llite_fid.c */ ino_t ll_fid_build_ino(struct ll_sb_info *sbi, struct lu_fid *fid); +__u32 ll_fid_build_gen(struct ll_sb_info *sbi, struct lu_fid *fid); /* llite/llite_capa.c */ extern cfs_timer_t ll_capa_timer; @@ -1042,6 +986,28 @@ void ll_truncate_free_capa(struct obd_capa *ocapa); void ll_clear_inode_capas(struct inode *inode); void ll_print_capa_stat(struct ll_sb_info *sbi); +/* llite/llite_cl.c */ +extern struct lu_device_type vvp_device_type; + +/** + * Common IO arguments for various VFS I/O interfaces.
+ */ + +int cl_sb_init(struct super_block *sb); +int cl_sb_fini(struct super_block *sb); +int cl_inode_init(struct inode *inode, struct lustre_md *md); +void cl_inode_fini(struct inode *inode); + +enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma); +void ll_io_init(struct cl_io *io, const struct file *file, int write); + +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit); +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len); +int ll_is_file_contended(struct file *file); +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which); + /* llite/llite_rmtacl.c */ #ifdef CONFIG_FS_POSIX_ACL obd_valid rce_ops2valid(int ops); @@ -1103,23 +1069,23 @@ struct ll_statahead_info { }; int do_statahead_enter(struct inode *dir, struct dentry **dentry, int lookup); -int ll_statahead_exit(struct dentry *dentry, int result); +void ll_statahead_exit(struct dentry *dentry, int result); void ll_stop_statahead(struct inode *inode, void *key); static inline -int ll_statahead_mark(struct dentry *dentry) +void ll_statahead_mark(struct dentry *dentry) { struct ll_inode_info *lli = ll_i2info(dentry->d_parent->d_inode); - struct ll_statahead_info *sai = lli->lli_sai; struct ll_dentry_data *ldd = ll_d2d(dentry); - int rc = 0; - if (likely(ldd != NULL)) - ldd->lld_sa_generation = sai->sai_generation; - else - rc = -ENOMEM; + /* not the same process, don't mark */ + if (lli->lli_opendir_pid != cfs_curproc_pid()) + return; - return rc; + spin_lock(&lli->lli_lock); + if (likely(lli->lli_sai != NULL && ldd != NULL)) + ldd->lld_sa_generation = lli->lli_sai->sai_generation; + spin_unlock(&lli->lli_lock); } static inline @@ -1150,7 +1116,7 @@ int ll_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) * "IT_GETATTR" for the first time, and the subsequent "IT_GETATTR" * will bypass interacting with statahead thread for checking: * "lld_sa_generation == lli_sai->sai_generation" - */ + */ if (ldd && lli->lli_sai && ldd->lld_sa_generation == lli->lli_sai->sai_generation) return -EAGAIN; @@ -1188,36 +1154,66 @@ enum llioc_iter { * Parameters: * @magic: Dynamic ioctl call routine will feed this value with the pointer * returned to ll_iocontrol_register. Callback functions should use this - * data to check the potential collasion of ioctl cmd. If collasion is + * data to check the potential collision of ioctl cmd. If collision is * found, callback function should return LLIOC_CONT. * @rcp: The result of ioctl command. * * Return values: - * If @magic matches the pointer returned by ll_iocontrol_data, the + * If @magic matches the pointer returned by ll_iocontrol_data, the * callback should return LLIOC_STOP; return LLIOC_CONT otherwise. */ -typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, +typedef enum llioc_iter (*llioc_callback_t)(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg, void *magic, int *rcp); -enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, +enum llioc_iter ll_iocontrol_call(struct inode *inode, struct file *file, unsigned int cmd, unsigned long arg, int *rcp); /* export functions */ -/* Register ioctl block dynamically for a regular file.
* * @cmd: the array of ioctl command set * @count: number of commands in the @cmd - * @cb: callback function, it will be called if an ioctl command is found to + * @cb: callback function; it will be called if an ioctl command is found to * belong to the command list @cmd. * * Return value: - * A magic pointer will be returned if success; - * otherwise, NULL will be returned. + * A magic pointer will be returned on success; + * otherwise, NULL will be returned. + * */ void *ll_iocontrol_register(llioc_callback_t cb, int count, unsigned int *cmd); void ll_iocontrol_unregister(void *magic); #endif +/* lclient compat stuff */ +#define cl_inode_info ll_inode_info +#define cl_i2info(info) ll_i2info(info) +#define cl_inode_mode(inode) ((inode)->i_mode) +#define cl_i2sbi ll_i2sbi +#define cl_isize_read(inode) i_size_read(inode) +#define cl_isize_write(inode,kms) i_size_write(inode, kms) +#define cl_isize_write_nolock(inode,kms) do {(inode)->i_size=(kms);}while(0) + +static inline void cl_isize_lock(struct inode *inode, int lsmlock) +{ + ll_inode_size_lock(inode, lsmlock); +} + +static inline void cl_isize_unlock(struct inode *inode, int lsmlock) +{ + ll_inode_size_unlock(inode, lsmlock); +} + +static inline int cl_merge_lvb(struct inode *inode) +{ + return ll_merge_lvb(inode); +} + +#define cl_inode_atime(inode) LTIME_S((inode)->i_atime) +#define cl_inode_ctime(inode) LTIME_S((inode)->i_ctime) +#define cl_inode_mtime(inode) LTIME_S((inode)->i_mtime) + +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt); + #endif /* LLITE_INTERNAL_H */ diff --git a/lustre/llite/llite_lib.c b/lustre/llite/llite_lib.c index ca9a7f8..a2a6c3e 100644 --- a/lustre/llite/llite_lib.c +++ b/lustre/llite/llite_lib.c @@ -53,8 +53,8 @@ #include #include #include +#include #include -#include #include "llite_internal.h" cfs_mem_cache_t *ll_file_data_slab; @@ -69,65 +69,6 @@ extern struct address_space_operations ll_dir_aops; #define log2(n) ffz(~(n)) #endif -static inline void ll_pglist_fini(struct ll_sb_info *sbi) -{ - struct page *page; - int i; - - if (sbi->ll_pglist == NULL) - return; - - for_each_possible_cpu(i) { - page = sbi->ll_pglist[i]->llpd_page; - if (page) { - sbi->ll_pglist[i] = NULL; - __free_page(page); - } - } - - OBD_FREE(sbi->ll_pglist, sizeof(void *)*num_possible_cpus()); - sbi->ll_pglist = NULL; -} - -static inline int ll_pglist_init(struct ll_sb_info *sbi) -{ - struct ll_pglist_data *pd; - unsigned long budget; - int i, color = 0; - ENTRY; - - OBD_ALLOC(sbi->ll_pglist, sizeof(void *) * num_possible_cpus()); - if (sbi->ll_pglist == NULL) - RETURN(-ENOMEM); - - budget = sbi->ll_async_page_max / num_online_cpus(); - for_each_possible_cpu(i) { - struct page *page = alloc_pages_node(cpu_to_node(i), - GFP_KERNEL, 0); - if (page == NULL) { - ll_pglist_fini(sbi); - RETURN(-ENOMEM); - } - - if (color + L1_CACHE_ALIGN(sizeof(*pd)) > PAGE_SIZE) - color = 0; - - pd = (struct ll_pglist_data *)(page_address(page) + color); - memset(pd, 0, sizeof(*pd)); - spin_lock_init(&pd->llpd_lock); - INIT_LIST_HEAD(&pd->llpd_list); - if (cpu_online(i)) - pd->llpd_budget = budget; - pd->llpd_cpu = i; - pd->llpd_page = page; - atomic_set(&pd->llpd_sample_count, 0); - sbi->ll_pglist[i] = pd; - color += L1_CACHE_ALIGN(sizeof(*pd)); - } - - RETURN(0); -} - static struct ll_sb_info *ll_init_sbi(void) { struct ll_sb_info *sbi = NULL; @@ -141,10 +82,6 @@ static struct ll_sb_info *ll_init_sbi(void) if (!sbi) RETURN(NULL); - OBD_ALLOC(sbi->ll_async_page_sample, sizeof(long)*num_possible_cpus()); - if
(sbi->ll_async_page_sample == NULL) - GOTO(out, 0); - spin_lock_init(&sbi->ll_lock); spin_lock_init(&sbi->ll_lco.lco_lock); spin_lock_init(&sbi->ll_pp_extent_lock); @@ -162,20 +99,11 @@ static struct ll_sb_info *ll_init_sbi(void) } else { sbi->ll_async_page_max = (pages / 4) * 3; } - lcounter_init(&sbi->ll_async_page_count); - spin_lock_init(&sbi->ll_async_page_reblnc_lock); - sbi->ll_async_page_sample_max = 64 * num_online_cpus(); - sbi->ll_async_page_reblnc_count = 0; - sbi->ll_async_page_clock_hand = 0; - if (ll_pglist_init(sbi)) - GOTO(out, 0); sbi->ll_ra_info.ra_max_pages = min(pages / 32, SBI_DEFAULT_READAHEAD_MAX); sbi->ll_ra_info.ra_max_read_ahead_whole_pages = SBI_DEFAULT_READAHEAD_WHOLE_MAX; - sbi->ll_contention_time = SBI_DEFAULT_CONTENTION_SECONDS; - sbi->ll_lockless_truncate_enable = SBI_DEFAULT_LOCKLESS_TRUNCATE_ENABLE; INIT_LIST_HEAD(&sbi->ll_conn_chain); INIT_LIST_HEAD(&sbi->ll_orphan_dentry_list); @@ -208,14 +136,6 @@ static struct ll_sb_info *ll_init_sbi(void) sbi->ll_sa_max = LL_SA_RPC_DEF; RETURN(sbi); - -out: - if (sbi->ll_async_page_sample) - OBD_FREE(sbi->ll_async_page_sample, - sizeof(long) * num_possible_cpus()); - ll_pglist_fini(sbi); - OBD_FREE(sbi, sizeof(*sbi)); - RETURN(NULL); } void ll_free_sbi(struct super_block *sb) @@ -224,13 +144,9 @@ void ll_free_sbi(struct super_block *sb) ENTRY; if (sbi != NULL) { - ll_pglist_fini(sbi); spin_lock(&ll_sb_lock); list_del(&sbi->ll_list); spin_unlock(&ll_sb_lock); - lcounter_destroy(&sbi->ll_async_page_count); - OBD_FREE(sbi->ll_async_page_sample, - sizeof(long) * num_possible_cpus()); OBD_FREE(sbi, sizeof(*sbi)); } EXIT; @@ -316,7 +232,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) OBD_CONNECT_VERSION | OBD_CONNECT_MDS_CAPA | OBD_CONNECT_OSS_CAPA | OBD_CONNECT_CANCELSET| OBD_CONNECT_FID | OBD_CONNECT_AT | - OBD_CONNECT_LOV_V3; + OBD_CONNECT_LOV_V3 | OBD_CONNECT_RMT_CLIENT; #ifdef HAVE_LRU_RESIZE_SUPPORT if (sbi->ll_flags & LL_SBI_LRU_RESIZE) @@ -347,13 +263,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) /* real client */ data->ocd_connect_flags |= OBD_CONNECT_REAL; - if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { - data->ocd_connect_flags &= ~OBD_CONNECT_LCL_CLIENT; - data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT; - } else { - data->ocd_connect_flags &= ~OBD_CONNECT_RMT_CLIENT; - data->ocd_connect_flags |= OBD_CONNECT_LCL_CLIENT; - } + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; err = obd_connect(NULL, &md_conn, obd, &sbi->ll_sb_uuid, data, NULL); if (err == -EBUSY) { @@ -431,21 +342,16 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) if (data->ocd_connect_flags & OBD_CONNECT_JOIN) sbi->ll_flags |= LL_SBI_JOIN; - if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { - if (!(data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT)) { - /* sometimes local client claims to be remote, but mdt - * will disagree when client gss not applied. */ - LCONSOLE_INFO("client claims to be remote, but server " - "rejected, forced to be local.\n"); - sbi->ll_flags &= ~LL_SBI_RMT_CLIENT; + if (data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) { + if (!(sbi->ll_flags & LL_SBI_RMT_CLIENT)) { + sbi->ll_flags |= LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client is set as remote by default.\n"); } } else { - if (!(data->ocd_connect_flags & OBD_CONNECT_LCL_CLIENT)) { - /* with gss applied, remote client can not claim to be - * local, so mdt maybe force client to be remote. 
*/ - LCONSOLE_INFO("client claims to be local, but server " - "rejected, forced to be remote.\n"); - sbi->ll_flags |= LL_SBI_RMT_CLIENT; + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) { + sbi->ll_flags &= ~LL_SBI_RMT_CLIENT; + LCONSOLE_INFO("client claims to be remote, but server " + "rejected, forced to be local.\n"); } } @@ -469,9 +375,8 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) OBD_CONNECT_REQPORTAL | OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CANCELSET | OBD_CONNECT_FID | OBD_CONNECT_SRVLOCK | OBD_CONNECT_TRUNCLOCK| - OBD_CONNECT_AT; - if (sbi->ll_flags & LL_SBI_OSS_CAPA) - data->ocd_connect_flags |= OBD_CONNECT_OSS_CAPA; + OBD_CONNECT_AT | OBD_CONNECT_RMT_CLIENT | + OBD_CONNECT_OSS_CAPA; if (!OBD_FAIL_CHECK(OBD_FAIL_OSC_CONNECT_CKSUM)) { /* OBD_CONNECT_CKSUM should always be set, even if checksums are @@ -490,6 +395,9 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; #endif + if (sbi->ll_flags & LL_SBI_RMT_CLIENT) + data->ocd_connect_flags |= OBD_CONNECT_RMT_CLIENT_FORCE; + CDEBUG(D_RPCTRACE, "ocd_connect_flags: "LPX64" ocd_version: %d " "ocd_grant: %d\n", data->ocd_connect_flags, data->ocd_version, data->ocd_grant); @@ -523,45 +431,12 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) sbi->ll_lco.lco_flags = data->ocd_connect_flags; spin_unlock(&sbi->ll_lco.lco_lock); - err = obd_register_page_removal_cb(sbi->ll_dt_exp, - ll_page_removal_cb, - ll_pin_extent_cb); - if (err) { - CERROR("cannot register page removal callback: rc = %d\n",err); - GOTO(out_dt, err); - } - err = obd_register_lock_cancel_cb(sbi->ll_dt_exp, - ll_extent_lock_cancel_cb); - if (err) { - CERROR("cannot register lock cancel callback: rc = %d\n", err); - GOTO(out_page_rm_cb, err); - } - err = ll_init_ea_size(sbi->ll_md_exp, sbi->ll_dt_exp);; if (err) { CERROR("cannot set max EA and cookie sizes: rc = %d\n", err); GOTO(out_lock_cn_cb, err); } - err = obd_prep_async_page(sbi->ll_dt_exp, NULL, NULL, NULL, - 0, NULL, NULL, NULL, 0, NULL); - if (err < 0) { - LCONSOLE_ERROR_MSG(0x151, "There are no OST's in this " - "filesystem. 
There must be at least one " - "active OST for a client to start.\n"); - GOTO(out_lock_cn_cb, err); - } - - if (!ll_async_page_slab) { - ll_async_page_slab_size = - size_round(sizeof(struct ll_async_page)) + err; - ll_async_page_slab = cfs_mem_cache_create("ll_async_page", - ll_async_page_slab_size, - 0, 0); - if (!ll_async_page_slab) - GOTO(out_lock_cn_cb, err = -ENOMEM); - } - fid_zero(&sbi->ll_root_fid); err = md_getstatus(sbi->ll_md_exp, &sbi->ll_root_fid, &oc); if (err) { @@ -585,10 +460,10 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) else if (sbi->ll_flags & LL_SBI_ACL) valid |= OBD_MD_FLACL; - err = md_getattr(sbi->ll_md_exp, &sbi->ll_root_fid, oc, valid, 0, + err = md_getattr(sbi->ll_md_exp, &sbi->ll_root_fid, oc, valid, 0, &request); if (oc) - free_capa(oc); + capa_put(oc); if (err) { CERROR("md_getattr failed for root: rc = %d\n", err); GOTO(out_lock_cn_cb, err); @@ -607,7 +482,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) md_free_lustre_md(sbi->ll_md_exp, &lmd); ptlrpc_req_finished(request); - if (root == NULL || is_bad_inode(root)) { + if (root == NULL || IS_ERR(root)) { if (lmd.lsm) obd_free_memmd(sbi->ll_dt_exp, &lmd.lsm); #ifdef CONFIG_FS_POSIX_ACL @@ -616,8 +491,10 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) lmd.posix_acl = NULL; } #endif + err = IS_ERR(root) ? PTR_ERR(root) : -EBADF; + root = NULL; CERROR("lustre_lite: bad iget4 for root\n"); - GOTO(out_root, err = -EBADF); + GOTO(out_root, err); } err = ll_close_thread_start(&sbi->ll_lcq); @@ -637,6 +514,7 @@ static int client_common_fill_super(struct super_block *sb, char *md, char *dt) err = obd_set_info_async(sbi->ll_dt_exp, sizeof(KEY_CHECKSUM), KEY_CHECKSUM, sizeof(checksum), &checksum, NULL); + cl_sb_init(sb); sb->s_root = d_alloc_root(root); if (data != NULL) @@ -660,11 +538,6 @@ out_root: if (root) iput(root); out_lock_cn_cb: - obd_unregister_lock_cancel_cb(sbi->ll_dt_exp, - ll_extent_lock_cancel_cb); -out_page_rm_cb: - obd_unregister_page_removal_cb(sbi->ll_dt_exp, - ll_page_removal_cb); obd_fid_fini(sbi->ll_dt_exp); out_dt: obd_disconnect(sbi->ll_dt_exp); @@ -1119,6 +992,8 @@ out_free: } /* ll_fill_super */ +void lu_context_keys_dump(void); + void ll_put_super(struct super_block *sb) { struct config_llog_instance cfg; @@ -1154,6 +1029,8 @@ void ll_put_super(struct super_block *sb) } } + cl_sb_fini(sb); + if (sbi->ll_lcq) { /* Only if client_common_fill_super succeeded */ client_common_put_super(sb); @@ -1171,6 +1048,9 @@ void ll_put_super(struct super_block *sb) lustre_common_put_super(sb); + cl_env_cache_purge(~0); + lu_context_keys_dump(); + LCONSOLE_WARN("client %s umount complete\n", ll_instance); cfs_module_put(); @@ -1178,32 +1058,6 @@ void ll_put_super(struct super_block *sb) EXIT; } /* client_put_super */ -#if defined(HAVE_REGISTER_CACHE) || defined(HAVE_SHRINKER_CACHE) - -#if defined(HAVE_CACHE_RETURN_INT) -static int -#else -static void -#endif -ll_shrink_cache(int priority, unsigned int gfp_mask) -{ - struct ll_sb_info *sbi; - int count = 0; - - list_for_each_entry(sbi, &ll_super_blocks, ll_list) - count += llap_shrink_cache(sbi, priority); - -#if defined(HAVE_CACHE_RETURN_INT) - return count; -#endif -} - -struct cache_definition ll_cache_definition = { - .name = "llap_cache", - .shrink = ll_shrink_cache -}; -#endif /* HAVE_REGISTER_CACHE || HAVE_SHRINKER_CACHE */ - struct inode *ll_inode_from_lock(struct ldlm_lock *lock) { struct inode *inode = NULL; @@ -1270,14 +1124,6 @@ void 
ll_clear_inode(struct inode *inode) if (lli->lli_mds_read_och) ll_md_real_close(inode, FMODE_READ); - if (lli->lli_smd) { - obd_change_cbdata(sbi->ll_dt_exp, lli->lli_smd, - null_if_equal, inode); - - obd_free_memmd(sbi->ll_dt_exp, &lli->lli_smd); - lli->lli_smd = NULL; - } - if (lli->lli_symlink_name) { OBD_FREE(lli->lli_symlink_name, strlen(lli->lli_symlink_name) + 1); @@ -1307,6 +1153,17 @@ void ll_clear_inode(struct inode *inode) spin_unlock(&sbi->ll_deathrow_lock); #endif ll_clear_inode_capas(inode); + /* + * XXX This has to be done before lsm is freed below, because + * cl_object still uses inode lsm. + */ + cl_inode_fini(inode); + + if (lli->lli_smd) { + obd_free_memmd(sbi->ll_dt_exp, &lli->lli_smd); + lli->lli_smd = NULL; + } + EXIT; } @@ -1396,86 +1253,25 @@ static int ll_setattr_done_writing(struct inode *inode, RETURN(rc); } -static int ll_setattr_do_truncate(struct inode *inode, loff_t new_size) +static int ll_setattr_do_truncate(struct inode *inode, loff_t size) { - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; + struct obd_capa *capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC); int rc; - ldlm_policy_data_t policy = { .l_extent = {new_size, - OBD_OBJECT_EOF } }; - struct lustre_handle lockh = { 0 }; - int local_lock = 0; /* 0 - no local lock; - * 1 - lock taken by lock_extent; - * 2 - by obd_match*/ - int ast_flags; - int err; - ENTRY; - UNLOCK_INODE_MUTEX(inode); - UP_WRITE_I_ALLOC_SEM(inode); - - if (sbi->ll_lockless_truncate_enable && - (sbi->ll_lco.lco_flags & OBD_CONNECT_TRUNCLOCK)) { - ast_flags = LDLM_FL_BLOCK_GRANTED; - rc = obd_match(sbi->ll_dt_exp, lsm, LDLM_EXTENT, - &policy, LCK_PW, &ast_flags, inode, &lockh); - if (rc > 0) { - local_lock = 2; - rc = 0; - } else if (rc == 0) { - rc = ll_file_punch(inode, new_size, 1); - } - } else { - /* XXX when we fix the AST intents to pass the discard-range - * XXX extent, make ast_flags always LDLM_AST_DISCARD_DATA - * XXX here. */ - ast_flags = (new_size == 0) ? LDLM_AST_DISCARD_DATA : 0; - rc = ll_extent_lock(NULL, inode, lsm, LCK_PW, &policy, - &lockh, ast_flags); - if (likely(rc == 0)) - local_lock = 1; - } - - LOCK_INODE_MUTEX(inode); - DOWN_WRITE_I_ALLOC_SEM(inode); - - if (likely(rc == 0)) { - /* Only ll_inode_size_lock is taken at this level. - * lov_stripe_lock() is grabbed by ll_truncate() only over - * call to obd_adjust_kms(). 
If vmtruncate returns 0, then - * ll_truncate dropped ll_inode_size_lock() */ - ll_inode_size_lock(inode, 0); - if (!local_lock) { - spin_lock(&lli->lli_lock); - lli->lli_flags |= LLIF_SRVLOCK; - spin_unlock(&lli->lli_lock); - } - rc = vmtruncate(inode, new_size); - if (!local_lock) { - spin_lock(&lli->lli_lock); - lli->lli_flags &= ~LLIF_SRVLOCK; - spin_unlock(&lli->lli_lock); - } - if (rc != 0) { - LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0); - ll_inode_size_unlock(inode, 0); - } - } + rc = cl_setattr_do_truncate(inode, size, capa); + ll_truncate_free_capa(capa); + return rc; +} - if (local_lock) { - if (local_lock == 2) - err = obd_cancel(sbi->ll_dt_exp, lsm, LCK_PW, &lockh); - else - err = ll_extent_unlock(NULL, inode, lsm, LCK_PW, &lockh); - if (unlikely(err != 0)){ - CERROR("extent unlock failed: err=%d," - " unlock method =%d\n", err, local_lock); - if (rc == 0) - rc = err; - } - } - RETURN(rc); +static int ll_setattr_ost(struct inode *inode) +{ + struct obd_capa *capa = ll_mdscapa_get(inode); + int rc; + + rc = cl_setattr_ost(inode, capa); + capa_put(capa); + + return rc; } /* If this inode has objects allocated to it (lsm != NULL), then the OST @@ -1495,7 +1291,6 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) { struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - struct ll_sb_info *sbi = ll_i2sbi(inode); struct md_op_data *op_data = NULL; struct md_open_data *mod = NULL; int ia_valid = attr->ia_valid; @@ -1539,7 +1334,7 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) if ((attr->ia_valid & ATTR_CTIME) && !(attr->ia_valid & ATTR_MTIME)) { /* To avoid stale mtime on mds, obtain it from ost and send to mds. */ - rc = ll_glimpse_size(inode, 0); + rc = cl_glimpse_size(inode); if (rc) RETURN(rc); @@ -1584,48 +1379,12 @@ int ll_setattr_raw(struct inode *inode, struct iattr *attr) GOTO(out, rc = 0); } - /* We really need to get our PW lock before we change inode->i_size. - * If we don't we can race with other i_size updaters on our node, like - * ll_file_read. We can also race with i_size propogation to other - * nodes through dirtying and writeback of final cached pages. This - * last one is especially bad for racing o_append users on other - * nodes. */ - if (ia_valid & ATTR_SIZE) { + if (ia_valid & ATTR_SIZE) rc = ll_setattr_do_truncate(inode, attr->ia_size); - } else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) { - obd_flag flags; - struct obd_info oinfo = { { { 0 } } }; - struct obdo *oa; - + else if (ia_valid & (ATTR_MTIME | ATTR_MTIME_SET)) { CDEBUG(D_INODE, "set mtime on OST inode %lu to %lu\n", inode->i_ino, LTIME_S(attr->ia_mtime)); - - OBDO_ALLOC(oa); - if (oa) { - oa->o_id = lsm->lsm_object_id; - oa->o_gr = lsm->lsm_object_gr; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - - flags = OBD_MD_FLTYPE | OBD_MD_FLATIME | - OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLFID | OBD_MD_FLGENER | - OBD_MD_FLGROUP; - - obdo_from_inode(oa, inode, flags); - - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - oinfo.oi_capa = ll_mdscapa_get(inode); - - /* XXX: this looks unnecessary now. 
*/ - rc = obd_setattr_rqset(sbi->ll_dt_exp, &oinfo, NULL); - capa_put(oinfo.oi_capa); - if (rc) - CERROR("obd_setattr_async fails: rc=%d\n", rc); - OBDO_FREE(oa); - } else { - rc = -ENOMEM; - } + rc = ll_setattr_ost(inode); } EXIT; out: @@ -1815,9 +1574,11 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) } CDEBUG(D_INODE, "adding lsm %p to inode %lu/%u(%p)\n", lsm, inode->i_ino, inode->i_generation, inode); - /* ll_inode_size_lock() requires it is only called - * with lli_smd != NULL or lock_lsm == 0 or we can - * race between lock/unlock. bug 9547 */ + cl_inode_init(inode, md); + /* ll_inode_size_lock() requires it is only + * called with lli_smd != NULL or lock_lsm == 0 + * or we can race between lock/unlock. + * bug 9547 */ lli->lli_smd = lsm; lli->lli_maxbytes = lsm->lsm_maxbytes; if (lli->lli_maxbytes > PAGE_CACHE_MAXBYTES) @@ -1835,8 +1596,10 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) dump_lsm(D_ERROR, lsm); LBUG(); } - } else + } else { + cl_inode_init(inode, md); ll_replace_lsm(inode, lsm); + } } if (lli->lli_smd != lsm) obd_free_memmd(ll_i2dtexp(inode), &lsm); @@ -1856,6 +1619,7 @@ void ll_update_inode(struct inode *inode, struct lustre_md *md) } #endif inode->i_ino = ll_fid_build_ino(sbi, &body->fid1); + inode->i_generation = ll_fid_build_gen(sbi, &body->fid1); if (body->valid & OBD_MD_FLATIME && body->atime > LTIME_S(inode->i_atime)) @@ -2233,7 +1997,7 @@ int ll_prep_inode(struct inode **inode, { struct ll_sb_info *sbi = NULL; struct lustre_md md; - int rc = 0; + int rc; ENTRY; LASSERT(*inode || sb); @@ -2257,8 +2021,8 @@ int ll_prep_inode(struct inode **inode, */ LASSERT(fid_is_sane(&md.body->fid1)); - *inode = ll_iget(sb, ll_fid_build_ino(sbi, &md.body->fid1), &md); - if (*inode == NULL || is_bad_inode(*inode)) { + *inode = ll_iget(sb, ll_fid_build_ino(sbi, &md.body->fid1),&md); + if (*inode == NULL || IS_ERR(*inode)) { if (md.lsm) obd_free_memmd(sbi->ll_dt_exp, &md.lsm); #ifdef CONFIG_FS_POSIX_ACL @@ -2267,7 +2031,8 @@ int ll_prep_inode(struct inode **inode, md.posix_acl = NULL; } #endif - rc = -ENOMEM; + rc = IS_ERR(*inode) ? PTR_ERR(*inode) : -ENOMEM; + *inode = NULL; CERROR("new_inode -fatal: rc %d\n", rc); GOTO(out, rc); } @@ -2280,33 +2045,6 @@ out: RETURN(rc); } -char *llap_origins[] = { - [LLAP_ORIGIN_UNKNOWN] = "--", - [LLAP_ORIGIN_READPAGE] = "rp", - [LLAP_ORIGIN_READAHEAD] = "ra", - [LLAP_ORIGIN_COMMIT_WRITE] = "cw", - [LLAP_ORIGIN_WRITEPAGE] = "wp", - [LLAP_ORIGIN_LOCKLESS_IO] = "ls" -}; - -struct ll_async_page *llite_pglist_next_llap(struct list_head *head, - struct list_head *list) -{ - struct ll_async_page *llap; - struct list_head *pos; - - list_for_each(pos, list) { - if (pos == head) - return NULL; - llap = list_entry(pos, struct ll_async_page, llap_pglist_item); - if (llap->llap_page == NULL) - continue; - return llap; - } - LBUG(); - return NULL; -} - int ll_obd_statfs(struct inode *inode, void *arg) { struct ll_sb_info *sbi = NULL; @@ -2369,6 +2107,8 @@ int ll_process_config(struct lustre_cfg *lcfg) proc fns must be able to handle that! 
*/ rc = class_process_proc_param(PARAM_LLITE, lvars.obd_vars, lcfg, sb); + if (rc > 0) + rc = 0; return(rc); } diff --git a/lustre/llite/llite_mmap.c b/lustre/llite/llite_mmap.c index f17a90d..4126a83 100644 --- a/lustre/llite/llite_mmap.c +++ b/lustre/llite/llite_mmap.c @@ -69,210 +69,10 @@ vma->vm_file->f_dentry->d_inode->i_ino, \ vma->vm_file->f_dentry->d_iname, ## arg); \ - -struct ll_lock_tree_node { - rb_node_t lt_node; - struct list_head lt_locked_item; - __u64 lt_oid; - ldlm_policy_data_t lt_policy; - struct lustre_handle lt_lockh; - ldlm_mode_t lt_mode; - struct inode *lt_inode; -}; - -int lt_get_mmap_locks(struct ll_lock_tree *tree, - unsigned long addr, size_t count); - struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address, int *type); -struct ll_lock_tree_node * ll_node_from_inode(struct inode *inode, __u64 start, - __u64 end, ldlm_mode_t mode) -{ - struct ll_lock_tree_node *node; - - OBD_ALLOC(node, sizeof(*node)); - if (node == NULL) - RETURN(ERR_PTR(-ENOMEM)); - - node->lt_inode = inode; - node->lt_oid = ll_i2info(inode)->lli_smd->lsm_object_id; - node->lt_policy.l_extent.start = start; - node->lt_policy.l_extent.end = end; - memset(&node->lt_lockh, 0, sizeof(node->lt_lockh)); - INIT_LIST_HEAD(&node->lt_locked_item); - node->lt_mode = mode; - - return node; -} - -int lt_compare(struct ll_lock_tree_node *one, struct ll_lock_tree_node *two) -{ - /* To avoid multiple fs deadlock */ - if (one->lt_inode->i_sb->s_dev < two->lt_inode->i_sb->s_dev) - return -1; - if (one->lt_inode->i_sb->s_dev > two->lt_inode->i_sb->s_dev) - return 1; - - if (one->lt_oid < two->lt_oid) - return -1; - if (one->lt_oid > two->lt_oid) - return 1; - - if (one->lt_policy.l_extent.end < two->lt_policy.l_extent.start) - return -1; - if (one->lt_policy.l_extent.start > two->lt_policy.l_extent.end) - return 1; - - return 0; /* they are the same object and overlap */ -} - -static void lt_merge(struct ll_lock_tree_node *dst, - struct ll_lock_tree_node *src) -{ - dst->lt_policy.l_extent.start = min(dst->lt_policy.l_extent.start, - src->lt_policy.l_extent.start); - dst->lt_policy.l_extent.end = max(dst->lt_policy.l_extent.end, - src->lt_policy.l_extent.end); - - /* XXX could be a real call to the dlm to find superset modes */ - if (src->lt_mode == LCK_PW && dst->lt_mode != LCK_PW) - dst->lt_mode = LCK_PW; -} - -static void lt_insert(struct ll_lock_tree *tree, - struct ll_lock_tree_node *node) -{ - struct ll_lock_tree_node *walk; - rb_node_t **p, *parent; - ENTRY; - -restart: - p = &tree->lt_root.rb_node; - parent = NULL; - while (*p) { - parent = *p; - walk = rb_entry(parent, struct ll_lock_tree_node, lt_node); - switch (lt_compare(node, walk)) { - case -1: - p = &(*p)->rb_left; - break; - case 1: - p = &(*p)->rb_right; - break; - case 0: - lt_merge(node, walk); - rb_erase(&walk->lt_node, &tree->lt_root); - OBD_FREE(walk, sizeof(*walk)); - goto restart; - break; - default: - LBUG(); - break; - } - } - rb_link_node(&node->lt_node, parent, p); - rb_insert_color(&node->lt_node, &tree->lt_root); - EXIT; -} - -static struct ll_lock_tree_node *lt_least_node(struct ll_lock_tree *tree) -{ - rb_node_t *rbnode; - struct ll_lock_tree_node *node = NULL; - - for ( rbnode = tree->lt_root.rb_node; rbnode != NULL; - rbnode = rbnode->rb_left) { - if (rbnode->rb_left == NULL) { - node = rb_entry(rbnode, struct ll_lock_tree_node, - lt_node); - break; - } - } - RETURN(node); -} - -int ll_tree_unlock(struct ll_lock_tree *tree) -{ - struct ll_lock_tree_node *node; - struct list_head *pos, *n; - struct inode *inode; 
- int rc = 0; - ENTRY; - - list_for_each_safe(pos, n, &tree->lt_locked_list) { - node = list_entry(pos, struct ll_lock_tree_node, - lt_locked_item); - - inode = node->lt_inode; - rc = ll_extent_unlock(tree->lt_fd, inode, - ll_i2info(inode)->lli_smd, node->lt_mode, - &node->lt_lockh); - if (rc != 0) { - /* XXX better message */ - CERROR("couldn't unlock %d\n", rc); - } - list_del(&node->lt_locked_item); - OBD_FREE(node, sizeof(*node)); - } - - while ((node = lt_least_node(tree))) { - rb_erase(&node->lt_node, &tree->lt_root); - OBD_FREE(node, sizeof(*node)); - } - - RETURN(rc); -} - -int ll_tree_lock(struct ll_lock_tree *tree, - struct ll_lock_tree_node *first_node, - const char *buf, size_t count, int ast_flags) -{ - struct ll_lock_tree_node *node; - int rc = 0; - ENTRY; - - tree->lt_root.rb_node = NULL; - INIT_LIST_HEAD(&tree->lt_locked_list); - if (first_node != NULL) - lt_insert(tree, first_node); - - /* To avoid such subtle deadlock case: client1 try to read file1 to - * mmapped file2, on the same time, client2 try to read file2 to - * mmapped file1.*/ - rc = lt_get_mmap_locks(tree, (unsigned long)buf, count); - if (rc) - GOTO(out, rc); - - while ((node = lt_least_node(tree))) { - struct inode *inode = node->lt_inode; - rc = ll_extent_lock(tree->lt_fd, inode, - ll_i2info(inode)->lli_smd, node->lt_mode, - &node->lt_policy, &node->lt_lockh, - ast_flags); - if (rc != 0) - GOTO(out, rc); - - rb_erase(&node->lt_node, &tree->lt_root); - list_add_tail(&node->lt_locked_item, &tree->lt_locked_list); - } - RETURN(rc); -out: - ll_tree_unlock(tree); - RETURN(rc); -} - -static ldlm_mode_t mode_from_vma(struct vm_area_struct *vma) -{ - /* we only want to hold PW locks if the mmap() can generate - * writes back to the file and that only happens in shared - * writable vmas */ - if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) - return LCK_PW; - return LCK_PR; -} - -static void policy_from_vma(ldlm_policy_data_t *policy, +void policy_from_vma(ldlm_policy_data_t *policy, struct vm_area_struct *vma, unsigned long addr, size_t count) { @@ -282,7 +82,7 @@ static void policy_from_vma(ldlm_policy_data_t *policy, ~CFS_PAGE_MASK; } -static struct vm_area_struct * our_vma(unsigned long addr, size_t count) +struct vm_area_struct * our_vma(unsigned long addr, size_t count) { struct mm_struct *mm = current->mm; struct vm_area_struct *vma, *ret = NULL; @@ -305,56 +105,19 @@ static struct vm_area_struct * our_vma(unsigned long addr, size_t count) RETURN(ret); } -int ll_region_mapped(unsigned long addr, size_t count) -{ - return !!our_vma(addr, count); -} - -int lt_get_mmap_locks(struct ll_lock_tree *tree, - unsigned long addr, size_t count) -{ - struct vm_area_struct *vma; - struct ll_lock_tree_node *node; - ldlm_policy_data_t policy; - struct inode *inode; - ENTRY; - - if (count == 0) - RETURN(0); - - /* we need to look up vmas on page aligned addresses */ - count += addr & (~CFS_PAGE_MASK); - addr &= CFS_PAGE_MASK; - - while ((vma = our_vma(addr, count)) != NULL) { - LASSERT(vma->vm_file); - - inode = vma->vm_file->f_dentry->d_inode; - policy_from_vma(&policy, vma, addr, count); - node = ll_node_from_inode(inode, policy.l_extent.start, - policy.l_extent.end, - mode_from_vma(vma)); - if (IS_ERR(node)) { - CERROR("not enough mem for lock_tree_node!\n"); - RETURN(-ENOMEM); - } - lt_insert(tree, node); - - if (vma->vm_end - addr >= count) - break; - count -= vma->vm_end - addr; - addr = vma->vm_end; - } - RETURN(0); -} - /** - * Page fault handler. 
+ * Lustre implementation of a vm_operations_struct::nopage() method, called by
+ * the VM to serve a page fault (both in kernel and user space).
+ *
+ * This function sets up a CIT_FAULT cl_io that does the job.
 *
 * \param vma - virtual area struct related to the page fault
 * \param address - address at which the fault was hit
 * \param type - type of the fault
 *
+ * XXX newer 2.6 kernels provide vm_operations_struct::fault() method with
+ * slightly different semantics instead.
+ *
 * \return allocated and filled page for address
 * \retval NOPAGE_SIGBUS if the page does not exist at this address
 * \retval NOPAGE_OOM if there is no memory to allocate a new page
@@ -362,151 +125,113 @@ int lt_get_mmap_locks(struct ll_lock_tree *tree,
 struct page *ll_nopage(struct vm_area_struct *vma, unsigned long address,
                        int *type)
 {
-        struct file *filp = vma->vm_file;
-        struct ll_file_data *fd = LUSTRE_FPRIVATE(filp);
-        struct inode *inode = filp->f_dentry->d_inode;
-        struct lustre_handle lockh = { 0 };
-        ldlm_policy_data_t policy;
-        ldlm_mode_t mode;
-        struct page *page = NULL;
-        struct ll_inode_info *lli = ll_i2info(inode);
-        struct lov_stripe_md *lsm;
-        struct ost_lvb lvb;
-        __u64 kms, old_mtime;
-        unsigned long pgoff, size, rand_read, seq_read;
-        int rc = 0;
-        ENTRY;
-
-        if (lli->lli_smd == NULL) {
-                CERROR("No lsm on fault?\n");
-                RETURN(NULL);
-        }
-
-        ll_clear_file_contended(inode);
-
-        /* start and end the lock on the first and last bytes in the page */
-        policy_from_vma(&policy, vma, address, CFS_PAGE_SIZE);
+        struct file *file = vma->vm_file;
+        struct inode *inode = file->f_dentry->d_inode;
+        struct lu_env *env;
+        struct cl_io *io;
+        struct page *page = NULL;
+        struct cl_env_nest nest;
+        int result;
 
-        CDEBUG(D_MMAP, "nopage vma %p inode %lu, locking ["LPU64", "LPU64"]\n",
-               vma, inode->i_ino, policy.l_extent.start, policy.l_extent.end);
-
-        mode = mode_from_vma(vma);
-        old_mtime = LTIME_S(inode->i_mtime);
-
-        lsm = lli->lli_smd;
-        rc = ll_extent_lock(fd, inode, lsm, mode, &policy,
-                            &lockh, LDLM_FL_CBPENDING);
-        if (rc != 0)
-                RETURN(NULL);
-
-        if (vma->vm_flags & VM_EXEC && LTIME_S(inode->i_mtime) != old_mtime)
-                CWARN("binary changed. inode %lu\n", inode->i_ino);
-
-        lov_stripe_lock(lsm);
-        inode_init_lvb(inode, &lvb);
-        obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1);
-        kms = lvb.lvb_size;
-
-        pgoff = ((address - vma->vm_start) >> CFS_PAGE_SHIFT) + vma->vm_pgoff;
-        size = (kms + CFS_PAGE_SIZE - 1) >> CFS_PAGE_SHIFT;
-
-        if (pgoff >= size) {
-                lov_stripe_unlock(lsm);
-                ll_glimpse_size(inode, LDLM_FL_BLOCK_GRANTED);
-        } else {
-                /* XXX change inode size without ll_inode_size_lock() held!
-                 * there is a race condition with truncate path. (see
-                 * ll_extent_lock) */
-                /* XXX i_size_write() is not used because it is not safe to
-                 * take the ll_inode_size_lock() due to a potential lock
-                 * inversion (bug 6077). And since it's not safe to use
-                 * i_size_write() without a covering mutex we do the
-                 * assignment directly. It is not critical that the
-                 * size be correct. */
-                /* region is within kms and, hence, within real file size (A).
-                 * We need to increase i_size to cover the read region so that
-                 * generic_file_read() will do its job, but that doesn't mean
-                 * the kms size is _correct_, it is only the _minimum_ size.
-                 * If someone does a stat they will get the correct size which
-                 * will always be >= the kms value here.
b=11081 */ - if (i_size_read(inode) < kms) { - inode->i_size = kms; - CDEBUG(D_INODE, "ino=%lu, updating i_size %llu\n", - inode->i_ino, i_size_read(inode)); - } - lov_stripe_unlock(lsm); - } + ENTRY; - /* If mapping is writeable, adjust kms to cover this page, - * but do not extend kms beyond actual file size. - * policy.l_extent.end is set to the end of the page by policy_from_vma - * bug 10919 */ - lov_stripe_lock(lsm); - if (mode == LCK_PW) - obd_adjust_kms(ll_i2dtexp(inode), lsm, - min_t(loff_t, policy.l_extent.end + 1, - i_size_read(inode)), 0); - lov_stripe_unlock(lsm); - - /* disable VM_SEQ_READ and use VM_RAND_READ to make sure that - * the kernel will not read other pages not covered by ldlm in - * filemap_nopage. we do our readahead in ll_readpage. + /* + * vm_operations_struct::nopage() can be called when lustre IO is + * already active for the current thread, e.g., when doing read/write + * against user level buffer mapped from Lustre buffer. To avoid + * stomping on existing context, optionally force an allocation of a new + * one. */ - rand_read = vma->vm_flags & VM_RAND_READ; - seq_read = vma->vm_flags & VM_SEQ_READ; - vma->vm_flags &= ~ VM_SEQ_READ; - vma->vm_flags |= VM_RAND_READ; - - page = filemap_nopage(vma, address, type); - if (page != NOPAGE_SIGBUS && page != NOPAGE_OOM) - LL_CDEBUG_PAGE(D_PAGE, page, "got addr %lu type %lx\n", address, - (long)type); - else - CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n", address, - (long)type); - - vma->vm_flags &= ~VM_RAND_READ; - vma->vm_flags |= (rand_read | seq_read); - - ll_extent_unlock(fd, inode, ll_i2info(inode)->lli_smd, mode, &lockh); + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + pgoff_t pg_offset; + const unsigned long writable = VM_SHARED|VM_WRITE; + unsigned long ra_flags; + struct cl_fault_io *fio; + + io = &ccc_env_info(env)->cti_io; + io->ci_obj = ll_i2info(inode)->lli_clob; + LASSERT(io->ci_obj != NULL); + + fio = &io->u.ci_fault; + pg_offset = (address - vma->vm_start) >> PAGE_SHIFT; + fio->ft_index = pg_offset + vma->vm_pgoff; + fio->ft_writable = (vma->vm_flags&writable) == writable; + fio->ft_executable = vma->vm_flags&VM_EXEC; + + /* + * disable VM_SEQ_READ and use VM_RAND_READ to make sure that + * the kernel will not read other pages not covered by ldlm in + * filemap_nopage. we do our readahead in ll_readpage. + */ + ra_flags = vma->vm_flags & (VM_RAND_READ|VM_SEQ_READ); + vma->vm_flags &= ~VM_SEQ_READ; + vma->vm_flags |= VM_RAND_READ; + + CDEBUG(D_INFO, "vm_flags: %lx (%lu %i %i)\n", vma->vm_flags, + fio->ft_index, fio->ft_writable, fio->ft_executable); + + if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0) { + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + + LASSERT(cio->cui_cl.cis_io == io); + + vio->u.fault.ft_vma = vma; + vio->u.fault.ft_address = address; + vio->u.fault.ft_type = type; + cio->cui_fd = LUSTRE_FPRIVATE(file); + + result = cl_io_loop(env, io); + if (result == 0) { + LASSERT(fio->ft_page != NULL); + page = cl_page_vmpage(env, fio->ft_page); + } else if (result == -EFAULT) { + page = NOPAGE_SIGBUS; + } else if (result == -ENOMEM) { + page = NOPAGE_OOM; + } + } else + result = io->ci_result; + + vma->vm_flags &= ~VM_RAND_READ; + vma->vm_flags |= ra_flags; + + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + } RETURN(page); } -/* To avoid cancel the locks covering mmapped region for lock cache pressure, - * we track the mapped vma count by lli_mmap_cnt. - * ll_vm_open(): when first vma is linked, split locks from lru. 
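The new ll_nopage() above replaces the removed ll_lock_tree machinery (an rb-tree
that ordered extent locks by device, object id and extent so that two clients
mmap-copying each other's files could not deadlock) with a single CIT_FAULT cl_io.
A condensed sketch of the pattern, using only calls that appear in this hunk, with
error handling and the read-ahead flag juggling omitted (vma, address, inode and
page as in the function above):

        struct cl_env_nest nest;
        struct lu_env *env = cl_env_nested_get(&nest); /* nested env: a fault can
                                                        * interrupt an active IO */
        struct cl_io *io = &ccc_env_info(env)->cti_io;

        io->ci_obj = ll_i2info(inode)->lli_clob;       /* cl object of the inode */
        io->u.ci_fault.ft_index = ((address - vma->vm_start) >> PAGE_SHIFT) +
                                  vma->vm_pgoff;       /* faulting page index */
        if (cl_io_init(env, io, CIT_FAULT, io->ci_obj) == 0 &&
            cl_io_loop(env, io) == 0)                  /* lock, start, end */
                page = cl_page_vmpage(env, io->u.ci_fault.ft_page);
        cl_io_fini(env, io);
        cl_env_nested_put(&nest, env);

The DLM locking now happens inside the cl_io state machine, so no
ll_tree_lock()-style global ordering is needed on this path any more.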
- * ll_vm_close(): when last vma is unlinked, join all this file's locks to lru. - * - * XXX we don't check the if the region of vma/lock for performance. +/** + * To avoid cancel the locks covering mmapped region for lock cache pressure, + * we track the mapped vma count in ccc_object::cob_mmap_cnt. */ static void ll_vm_open(struct vm_area_struct * vma) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - ENTRY; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ccc_object *vob = cl_inode2ccc(inode); + ENTRY; LASSERT(vma->vm_file); - - spin_lock(&lli->lli_lock); - LASSERT(atomic_read(&lli->lli_mmap_cnt) >= 0); - - atomic_inc(&lli->lli_mmap_cnt); - spin_unlock(&lli->lli_lock); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); + atomic_inc(&vob->cob_mmap_cnt); + EXIT; } +/** + * Dual to ll_vm_open(). + */ static void ll_vm_close(struct vm_area_struct *vma) { - struct inode *inode = vma->vm_file->f_dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); - ENTRY; + struct inode *inode = vma->vm_file->f_dentry->d_inode; + struct ccc_object *vob = cl_inode2ccc(inode); + ENTRY; LASSERT(vma->vm_file); - - spin_lock(&lli->lli_lock); - LASSERT(atomic_read(&lli->lli_mmap_cnt) > 0); - - atomic_dec(&lli->lli_mmap_cnt); - spin_unlock(&lli->lli_lock); + atomic_dec(&vob->cob_mmap_cnt); + LASSERT(atomic_read(&vob->cob_mmap_cnt) >= 0); + EXIT; } #ifndef HAVE_FILEMAP_POPULATE @@ -570,7 +295,7 @@ int ll_file_mmap(struct file * file, struct vm_area_struct * vma) vma->vm_ops = &ll_file_vm_ops; vma->vm_ops->open(vma); /* update the inode's size and mtime */ - rc = ll_glimpse_size(file->f_dentry->d_inode, 0); + rc = cl_glimpse_size(file->f_dentry->d_inode); } RETURN(rc); diff --git a/lustre/llite/lloop.c b/lustre/llite/lloop.c index 3241307..05026f1 100644 --- a/lustre/llite/lloop.c +++ b/lustre/llite/lloop.c @@ -134,8 +134,8 @@ struct lloop_device { loff_t lo_offset; loff_t lo_sizelimit; int lo_flags; - int (*ioctl)(struct lloop_device *, int cmd, - unsigned long arg); + int (*ioctl)(struct lloop_device *, int cmd, + unsigned long arg); struct file * lo_backing_file; struct block_device *lo_device; @@ -241,8 +241,8 @@ static int do_bio_filebacked(struct lloop_device *lo, struct bio *bio) oinfo.oi_md = lsm; opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; oinfo.oi_capa = ll_osscapa_get(inode, opc); - ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, - (obd_count)(i - bio->bi_idx), + ret = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, + (obd_count)(i - bio->bi_idx), lo->lo_requests[0].lrd_pages, NULL); capa_put(oinfo.oi_capa); if (ret == 0) @@ -470,7 +470,7 @@ static int loop_set_fd(struct lloop_device *lo, struct file *unused, return error; } -static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, +static int loop_clr_fd(struct lloop_device *lo, struct block_device *bdev, int count) { struct file *filp = lo->lo_backing_file; @@ -532,7 +532,7 @@ static int lo_release(struct inode *inode, struct file *file) } /* lloop device node's ioctl function. 
*/ -static int lo_ioctl(struct inode *inode, struct file *unused, +static int lo_ioctl(struct inode *inode, struct file *unused, unsigned int cmd, unsigned long arg) { struct lloop_device *lo = inode->i_bdev->bd_disk->private_data; @@ -556,7 +556,7 @@ static int lo_ioctl(struct inode *inode, struct file *unused, if (put_user(ino, (__u64 *)arg)) err = -EFAULT; - break; + break; } default: @@ -575,13 +575,13 @@ static struct block_device_operations lo_fops = { .ioctl = lo_ioctl, }; -/* dynamic iocontrol callback. - * This callback is registered in lloop_init and will be called by - * ll_iocontrol_call. - * This is a llite regular file ioctl function. It takes the responsibility - * of attaching a file, and detaching a file by a lloop's device numner. +/* dynamic iocontrol callback. + * This callback is registered in lloop_init and will be called by + * ll_iocontrol_call. + * This is a llite regular file ioctl function. It takes the responsibility + * of attaching a file, and detaching a file by a lloop's device numner. */ -static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, +static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, unsigned int cmd, unsigned long arg, void *magic, int *rcp) { @@ -611,7 +611,7 @@ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, lo_free = lo; continue; } - if (lo->lo_backing_file->f_dentry->d_inode == + if (lo->lo_backing_file->f_dentry->d_inode == file->f_dentry->d_inode) break; } @@ -641,7 +641,7 @@ static enum llioc_iter lloop_ioctl(struct inode *unused, struct file *file, case LL_IOC_LLOOP_DETACH_BYDEV: { int minor; - + dev = old_decode_dev(arg); if (MAJOR(dev) != lloop_major) GOTO(out, err = -EINVAL); diff --git a/lustre/llite/lproc_llite.c b/lustre/llite/lproc_llite.c index c0eda8c..be49e62 100644 --- a/lustre/llite/lproc_llite.c +++ b/lustre/llite/lproc_llite.c @@ -47,7 +47,7 @@ struct proc_dir_entry *proc_lustre_fs_root; #ifdef LPROCFS /* /proc/lustre/llite mount point registration */ -struct file_operations llite_dump_pgcache_fops; +extern struct file_operations vvp_dump_pgcache_file_ops; struct file_operations ll_rw_extents_stats_fops; struct file_operations ll_rw_extents_stats_pp_fops; struct file_operations ll_rw_offset_stats_fops; @@ -212,6 +212,19 @@ static int ll_rd_sb_uuid(char *page, char **start, off_t off, int count, return snprintf(page, count, "%s\n", ll_s2sbi(sb)->ll_sb_uuid.uuid); } +static int ll_rd_site_stats(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct super_block *sb = data; + + /* + * See description of statistical counters in struct cl_site, and + * struct lu_site. 
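+ *
+ * The counters are exported through the per-mount "site" proc entry
+ * registered in lprocfs_llite_obd_vars[] below, and are formatted by
+ * cl_site_stats_print().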
+ */ + return cl_site_stats_print(lu2cl_site(ll_s2sbi(sb)->ll_site), + page, count); +} + static int ll_rd_max_readahead_mb(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -318,8 +331,7 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer, { struct super_block *sb = data; struct ll_sb_info *sbi = ll_s2sbi(sb); - unsigned long budget; - int mult, rc, pages_number, cpu; + int mult, rc, pages_number; mult = 1 << (20 - CFS_PAGE_SHIFT); rc = lprocfs_write_frac_helper(buffer, count, &pages_number, mult); @@ -340,46 +352,9 @@ static int ll_wr_max_cached_mb(struct file *file, const char *buffer, /* Not set up yet, don't call llap_shrink_cache */ return count; - spin_lock(&sbi->ll_async_page_reblnc_lock); - budget = sbi->ll_async_page_max / num_online_cpus(); - for_each_online_cpu(cpu) - LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget = budget; - spin_unlock(&sbi->ll_async_page_reblnc_lock); - - if (lcounter_read(&sbi->ll_async_page_count) >= sbi->ll_async_page_max) - llap_shrink_cache(sbi, -1); - return count; } -static int ll_rd_pgcache_bnlc(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct super_block *sb = data; - struct ll_sb_info *sbi = ll_s2sbi(sb); - struct ll_pglist_data *pd; - unsigned long total_budget = 0; - int n = 0, cpu; - - n += snprintf(page +n, count - n, - "cpu\tpage count\tbudget\t\treblnc count\tgen\thit\tmiss\tcross\n"); - for_each_online_cpu(cpu) { - pd = LL_PGLIST_DATA_CPU(sbi, cpu); - n += snprintf(page + n, count - n, - "%d\t%-8lu\t%-8lu\t%-8lu\t%lu\t%lu\t%lu\t%lu\n", - cpu, pd->llpd_count, pd->llpd_budget, - pd->llpd_reblnc_count, pd->llpd_gen, - pd->llpd_hit, pd->llpd_miss, pd->llpd_cross); - total_budget += pd->llpd_budget; - } - n += snprintf(page + n, count - n, - "Total budget: %lu, page max: %lu, rebalance cnt: %lu\n", - total_budget, sbi->ll_async_page_max, - sbi->ll_async_page_reblnc_count); - *eof = 1; - return n; -} - static int ll_rd_checksum(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -558,51 +533,11 @@ static int ll_rd_statahead_stats(char *page, char **start, off_t off, sbi->ll_sa_miss); } -static int ll_rd_contention_time(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct super_block *sb = data; - - *eof = 1; - return snprintf(page, count, "%u\n", ll_s2sbi(sb)->ll_contention_time); - -} - -static int ll_wr_contention_time(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct super_block *sb = data; - struct ll_sb_info *sbi = ll_s2sbi(sb); - - return lprocfs_write_helper(buffer, count,&sbi->ll_contention_time) ?: - count; -} - -static int ll_rd_lockless_truncate(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct super_block *sb = data; - - *eof = 1; - return snprintf(page, count, "%u\n", - ll_s2sbi(sb)->ll_lockless_truncate_enable); -} - -static int ll_wr_lockless_truncate(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct super_block *sb = data; - struct ll_sb_info *sbi = ll_s2sbi(sb); - - return lprocfs_write_helper(buffer, count, - &sbi->ll_lockless_truncate_enable) - ?: count; -} - static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "uuid", ll_rd_sb_uuid, 0, 0 }, //{ "mntpt_path", ll_rd_path, 0, 0 }, { "fstype", ll_rd_fstype, 0, 0 }, + { "site", ll_rd_site_stats, 0, 0 }, { "blocksize", ll_rd_blksize, 0, 0 }, { "kbytestotal", ll_rd_kbytestotal, 0, 0 }, { "kbytesfree", ll_rd_kbytesfree, 0, 0 }, @@ -616,7 +551,6 @@ 
static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "max_read_ahead_whole_mb", ll_rd_max_read_ahead_whole_mb, ll_wr_max_read_ahead_whole_mb, 0 }, { "max_cached_mb", ll_rd_max_cached_mb, ll_wr_max_cached_mb, 0 }, - { "pgcache_balance",ll_rd_pgcache_bnlc, 0, 0 }, { "checksum_pages", ll_rd_checksum, ll_wr_checksum, 0 }, { "max_rw_chunk", ll_rd_max_rw_chunk, ll_wr_max_rw_chunk, 0 }, { "stats_track_pid", ll_rd_track_pid, ll_wr_track_pid, 0 }, @@ -624,9 +558,6 @@ static struct lprocfs_vars lprocfs_llite_obd_vars[] = { { "stats_track_gid", ll_rd_track_gid, ll_wr_track_gid, 0 }, { "statahead_max", ll_rd_statahead_max, ll_wr_statahead_max, 0 }, { "statahead_stats", ll_rd_statahead_stats, 0, 0 }, - { "contention_seconds", ll_rd_contention_time, ll_wr_contention_time, 0}, - { "lockless_truncate", ll_rd_lockless_truncate, - ll_wr_lockless_truncate, 0}, { 0 } }; @@ -706,6 +637,22 @@ void ll_stats_ops_tally(struct ll_sb_info *sbi, int op, int count) } EXPORT_SYMBOL(ll_stats_ops_tally); +static const char *ra_stat_string[] = { + [RA_STAT_HIT] = "hits", + [RA_STAT_MISS] = "misses", + [RA_STAT_DISTANT_READPAGE] = "readpage not consecutive", + [RA_STAT_MISS_IN_WINDOW] = "miss inside window", + [RA_STAT_FAILED_GRAB_PAGE] = "failed grab_cache_page", + [RA_STAT_FAILED_MATCH] = "failed lock match", + [RA_STAT_DISCARDED] = "read but discarded", + [RA_STAT_ZERO_LEN] = "zero length file", + [RA_STAT_ZERO_WINDOW] = "zero size window", + [RA_STAT_EOF] = "read-ahead to EOF", + [RA_STAT_MAX_IN_FLIGHT] = "hit max r-a issue", + [RA_STAT_WRONG_GRAB_PAGE] = "wrong page from grab_cache_page", +}; + + int lprocfs_register_mountpoint(struct proc_dir_entry *parent, struct super_block *sb, char *osc, char *mdc) { @@ -715,7 +662,6 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, struct obd_device *obd; char name[MAX_STRING_SIZE + 1], *ptr; int err, id, len, rc; - static const char *ra_stats_string[] = LL_RA_STAT_STRINGS; ENTRY; memset(lvars, 0, sizeof(lvars)); @@ -744,9 +690,8 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, RETURN(err); } - rc = lprocfs_seq_create(sbi->ll_proc_root, "dump_page_cache", 0444, - &llite_dump_pgcache_fops, sbi); + &vvp_dump_pgcache_file_ops, sbi); if (rc) CWARN("Error adding the dump_page_cache file\n"); @@ -789,14 +734,14 @@ int lprocfs_register_mountpoint(struct proc_dir_entry *parent, if (err) GOTO(out, err); - sbi->ll_ra_stats = lprocfs_alloc_stats(LL_RA_STAT, + sbi->ll_ra_stats = lprocfs_alloc_stats(ARRAY_SIZE(ra_stat_string), LPROCFS_STATS_FLAG_PERCPU); if (sbi->ll_ra_stats == NULL) GOTO(out, err = -ENOMEM); - for (id = 0; id < LL_RA_STAT; id++) + for (id = 0; id < ARRAY_SIZE(ra_stat_string); id++) lprocfs_counter_init(sbi->ll_ra_stats, id, 0, - ra_stats_string[id], "pages"); + ra_stat_string[id], "pages"); err = lprocfs_register_stats(sbi->ll_proc_root, "read_ahead_stats", sbi->ll_ra_stats); if (err) @@ -863,224 +808,6 @@ void lprocfs_unregister_mountpoint(struct ll_sb_info *sbi) } #undef MAX_STRING_SIZE -#define seq_page_flag(seq, page, flag, has_flags) do { \ - if (test_bit(PG_##flag, &(page)->flags)) { \ - if (!has_flags) \ - has_flags = 1; \ - else \ - seq_putc(seq, '|'); \ - seq_puts(seq, #flag); \ - } \ - } while(0); - -static void *llite_dump_pgcache_seq_start(struct seq_file *seq, loff_t *pos) -{ - struct ll_async_page *dummy_llap = seq->private; - - if (dummy_llap->llap_magic == 2) - return NULL; - - return (void *)1; -} - -static int llite_dump_pgcache_seq_show(struct seq_file *seq, void *v) -{ - struct ll_async_page *llap, *dummy_llap = 
seq->private; - struct ll_sb_info *sbi = dummy_llap->llap_cookie; - struct ll_pglist_data *pd; - int cpu = dummy_llap->llap_pglist_cpu; - - /* 2.4 doesn't seem to have SEQ_START_TOKEN, so we implement - * it in our own state */ - if (dummy_llap->llap_magic == 0) { - seq_printf(seq, "gener | llap cookie origin wq du wb | page " - "inode index count [ page flags ]\n"); - return 0; - } - - pd = ll_pglist_cpu_lock(sbi, cpu); - llap = llite_pglist_next_llap(&pd->llpd_list, - &dummy_llap->llap_pglist_item); - if (llap != NULL) { - int has_flags = 0, i; - struct page *page = llap->llap_page; - unsigned long gen = 0UL; - - LASSERTF(llap->llap_origin < LLAP__ORIGIN_MAX, "%u\n", - llap->llap_origin); - - for_each_online_cpu(i) - gen += LL_PGLIST_DATA_CPU(sbi, i)->llpd_gen; - - seq_printf(seq," %5lu | %p %p %s %s %s %s | %p %lu/%u(%p) " - "%lu %u [", - gen, - llap, llap->llap_cookie, - llap_origins[llap->llap_origin], - llap->llap_write_queued ? "wq" : "- ", - llap->llap_defer_uptodate ? "du" : "- ", - PageWriteback(page) ? "wb" : "-", - page, page->mapping->host->i_ino, - page->mapping->host->i_generation, - page->mapping->host, page->index, - page_count(page)); - seq_page_flag(seq, page, locked, has_flags); - seq_page_flag(seq, page, error, has_flags); - seq_page_flag(seq, page, referenced, has_flags); - seq_page_flag(seq, page, uptodate, has_flags); - seq_page_flag(seq, page, dirty, has_flags); -#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,12)) - seq_page_flag(seq, page, highmem, has_flags); -#endif - seq_page_flag(seq, page, writeback, has_flags); - if (!has_flags) - seq_puts(seq, "-]\n"); - else - seq_puts(seq, "]\n"); - } - ll_pglist_cpu_unlock(sbi, cpu); - - return 0; -} - -static void *llite_dump_pgcache_seq_next(struct seq_file *seq, void *v, - loff_t *pos) -{ - struct ll_async_page *llap, *dummy_llap = seq->private; - struct ll_sb_info *sbi = dummy_llap->llap_cookie; - struct ll_pglist_data *pd, *next; - int cpu = dummy_llap->llap_pglist_cpu; - - /* bail if we just displayed the banner */ - if (dummy_llap->llap_magic == 0) { - dummy_llap->llap_magic = 1; - return dummy_llap; - } - - /* we've just displayed the llap that is after us in the list. - * we advance to a position beyond it, returning null if there - * isn't another llap in the list beyond that new position. 
*/ - pd = ll_pglist_cpu_lock(sbi, cpu); - llap = llite_pglist_next_llap(&pd->llpd_list, - &dummy_llap->llap_pglist_item); - list_del_init(&dummy_llap->llap_pglist_item); - if (llap) { - list_add(&dummy_llap->llap_pglist_item,&llap->llap_pglist_item); - llap = llite_pglist_next_llap(&pd->llpd_list, - &dummy_llap->llap_pglist_item); - } - if (llap == NULL) { - int i = cpu + 1; - for (next = NULL; i < num_possible_cpus(); i++, next = NULL) { - next = ll_pglist_cpu_lock(sbi, i); - if (!list_empty(&next->llpd_list)) - break; - ll_pglist_cpu_unlock(sbi, i); - } - if (next != NULL) { - list_move(&dummy_llap->llap_pglist_item, - &next->llpd_list); - dummy_llap->llap_pglist_cpu = i; - ll_pglist_cpu_unlock(sbi, cpu); - llap = llite_pglist_next_llap(&next->llpd_list, - &dummy_llap->llap_pglist_item); - LASSERT(llap); - cpu = i; - } - } - ll_pglist_cpu_unlock(sbi, cpu); - - ++*pos; - if (llap == NULL) { - dummy_llap->llap_magic = 2; - return NULL; - } - return dummy_llap; -} - -static void null_stop(struct seq_file *seq, void *v) -{ -} - -struct seq_operations llite_dump_pgcache_seq_sops = { - .start = llite_dump_pgcache_seq_start, - .stop = null_stop, - .next = llite_dump_pgcache_seq_next, - .show = llite_dump_pgcache_seq_show, -}; - -/* we're displaying llaps in a list_head list. we don't want to hold a lock - * while we walk the entire list, and we don't want to have to seek into - * the right position in the list as an app advances with many syscalls. we - * allocate a dummy llap and hang it off file->private. its position in - * the list records where the app is currently displaying. this way our - * seq .start and .stop don't actually do anything. .next returns null - * when the dummy hits the end of the list which eventually leads to .release - * where we tear down. this kind of displaying is super-racey, so we put - * a generation counter on the list so the output shows when the list - * changes between reads. 
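The comment above describes a reusable trick: instead of holding a lock across
the whole walk or re-seeking on every read(2), a payload-free dummy node stays
linked into the live list and records the reader's position; walkers simply skip
nodes whose magic marks them as cursors. A generic, self-contained sketch of the
idea (the item/cursor_next names are illustrative, not Lustre API):

        #include <linux/list.h>

        /* real elements and cursors share a type; payload == 0 marks a cursor */
        struct item {
                struct list_head link;
                int              payload;
        };

        /* first real element after the cursor, skipping other readers' cursors */
        static struct item *cursor_next(struct list_head *head, struct item *cur)
        {
                struct list_head *pos;

                for (pos = cur->link.next; pos != head; pos = pos->next) {
                        struct item *it = list_entry(pos, struct item, link);
                        if (it->payload != 0)
                                return it;
                }
                return NULL;
        }

After showing an element, the reader re-links its cursor just past that element
with list_move(&cur->link, &it->link), much as the removed ->next() method above
does with its dummy llap.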
- */ -static int llite_dump_pgcache_seq_open(struct inode *inode, struct file *file) -{ - struct proc_dir_entry *dp = PDE(inode); - struct ll_async_page *dummy_llap; - struct seq_file *seq; - struct ll_sb_info *sbi = dp->data; - struct ll_pglist_data *pd; - int rc = -ENOMEM; - - LPROCFS_ENTRY_AND_CHECK(dp); - - OBD_ALLOC_PTR_WAIT(dummy_llap); - if (dummy_llap == NULL) - GOTO(out, rc); - dummy_llap->llap_page = NULL; - dummy_llap->llap_cookie = sbi; - dummy_llap->llap_magic = 0; - dummy_llap->llap_pglist_cpu = 0; - - rc = seq_open(file, &llite_dump_pgcache_seq_sops); - if (rc) { - OBD_FREE(dummy_llap, sizeof(*dummy_llap)); - GOTO(out, rc); - } - seq = file->private_data; - seq->private = dummy_llap; - - pd = ll_pglist_cpu_lock(sbi, 0); - list_add(&dummy_llap->llap_pglist_item, &pd->llpd_list); - ll_pglist_cpu_unlock(sbi, 0); - -out: - if (rc) - LPROCFS_EXIT(); - return rc; -} - -static int llite_dump_pgcache_seq_release(struct inode *inode, - struct file *file) -{ - struct seq_file *seq = file->private_data; - struct ll_async_page *dummy_llap = seq->private; - struct ll_sb_info *sbi = dummy_llap->llap_cookie; - int cpu = dummy_llap->llap_pglist_cpu; - - ll_pglist_cpu_lock(sbi, cpu); - if (!list_empty(&dummy_llap->llap_pglist_item)) - list_del_init(&dummy_llap->llap_pglist_item); - ll_pglist_cpu_unlock(sbi, cpu); - OBD_FREE(dummy_llap, sizeof(*dummy_llap)); - - return lprocfs_seq_release(inode, file); -} - -struct file_operations llite_dump_pgcache_fops = { - .owner = THIS_MODULE, - .open = llite_dump_pgcache_seq_open, - .read = seq_read, - .release = llite_dump_pgcache_seq_release, -}; - #define pct(a,b) (b ? a * 100 / b : 0) static void ll_display_extents_info(struct ll_rw_extents_info *io_extents, @@ -1248,8 +975,9 @@ static ssize_t ll_rw_extents_stats_seq_write(struct file *file, const char *buf, LPROC_SEQ_FOPS(ll_rw_extents_stats); -void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file - *file, size_t count, int rw) +void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, + struct ll_file_data *file, loff_t pos, + size_t count, int rw) { int i, cur = -1; struct ll_rw_process_info *process; @@ -1298,9 +1026,8 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file for (i = 0; i < LL_PROCESS_HIST_MAX; i++) { if (process[i].rw_pid == pid) { if (process[i].rw_last_file != file) { - process[i].rw_range_start = file->f_pos; - process[i].rw_last_file_pos = - file->f_pos + count; + process[i].rw_range_start = pos; + process[i].rw_last_file_pos = pos + count; process[i].rw_smallest_extent = count; process[i].rw_largest_extent = count; process[i].rw_offset = 0; @@ -1308,7 +1035,7 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file spin_unlock(&sbi->ll_process_lock); return; } - if (process[i].rw_last_file_pos != file->f_pos) { + if (process[i].rw_last_file_pos != pos) { *off_count = (*off_count + 1) % LL_OFFSET_HIST_MAX; offset[*off_count].rw_op = process[i].rw_op; @@ -1324,17 +1051,17 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file offset[*off_count].rw_offset = process[i].rw_offset; process[i].rw_op = rw; - process[i].rw_range_start = file->f_pos; + process[i].rw_range_start = pos; process[i].rw_smallest_extent = count; process[i].rw_largest_extent = count; - process[i].rw_offset = file->f_pos - + process[i].rw_offset = pos - process[i].rw_last_file_pos; } if(process[i].rw_smallest_extent > count) process[i].rw_smallest_extent = count; if(process[i].rw_largest_extent < count) process[i].rw_largest_extent = count; - 
process[i].rw_last_file_pos = file->f_pos + count; + process[i].rw_last_file_pos = pos + count; spin_unlock(&sbi->ll_process_lock); return; } @@ -1342,8 +1069,8 @@ void ll_rw_stats_tally(struct ll_sb_info *sbi, pid_t pid, struct file *process_count = (*process_count + 1) % LL_PROCESS_HIST_MAX; process[*process_count].rw_pid = pid; process[*process_count].rw_op = rw; - process[*process_count].rw_range_start = file->f_pos; - process[*process_count].rw_last_file_pos = file->f_pos + count; + process[*process_count].rw_range_start = pos; + process[*process_count].rw_last_file_pos = pos + count; process[*process_count].rw_smallest_extent = count; process[*process_count].rw_largest_extent = count; process[*process_count].rw_offset = 0; diff --git a/lustre/llite/namei.c b/lustre/llite/namei.c index a7d87c6..91c81c3 100644 --- a/lustre/llite/namei.c +++ b/lustre/llite/namei.c @@ -123,22 +123,28 @@ struct inode *ll_iget(struct super_block *sb, ino_t hash, if (inode) { lli = ll_i2info(inode); if (inode->i_state & I_NEW) { + int rc; + ll_read_inode2(inode, md); - unlock_new_inode(inode); - } else { - if (!(inode->i_state & (I_FREEING | I_CLEAR))) + rc = cl_inode_init(inode, md); + if (rc != 0) { + md->lsm = NULL; + make_bad_inode(inode); + unlock_new_inode(inode); + iput(inode); + inode = ERR_PTR(rc); + } else + unlock_new_inode(inode); + } else if (!(inode->i_state & (I_FREEING | I_CLEAR))) ll_update_inode(inode, md); - } - CDEBUG(D_VFSTRACE, "got inode: %lu/%u(%p) for "DFID"\n", - inode->i_ino, inode->i_generation, inode, - PFID(&lli->lli_fid)); + CDEBUG(D_VFSTRACE, "got inode: %p for "DFID"\n", + inode, PFID(&md->body->fid1)); } - RETURN(inode); } static void ll_drop_negative_dentry(struct inode *dir) -{ +{ struct dentry *dentry, *tmp_alias, *tmp_subdir; spin_lock(&ll_lookup_lock); @@ -438,7 +444,7 @@ int ll_lookup_it_finish(struct ptlrpc_request *request, 2.4 and vfs_getattr_it->ll_getattr()->ll_inode_revalidate_it() in 2.6 Everybody else who needs correct file size would call - ll_glimpse_size or some equivalent themselves anyway. + cl_glimpse_size or some equivalent themselves anyway. Also see bug 7198. 
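As the comment notes, callers that need an accurate cluster-wide file size now go
through the cl_io-based glimpse. The pattern, as in the ll_file_mmap() hunk
earlier in this patch, is simply:

        /* refresh i_size and mtime from the OSTs before trusting them */
        rc = cl_glimpse_size(file->f_dentry->d_inode);

The remaining ll_glimpse_size() call sites in llite are converted the same way
throughout this patch.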
*/ ll_dops_init(*de, 1); @@ -461,7 +467,7 @@ int ll_lookup_it_finish(struct ptlrpc_request *request, might get picked up later when UPDATE lock will appear */ if (ll_have_md_lock(parent, MDS_INODELOCK_UPDATE)) { spin_lock(&dcache_lock); - ll_d_add(*de, inode); + ll_d_add(*de, NULL); spin_unlock(&dcache_lock); } else { (*de)->d_inode = NULL; @@ -996,7 +1002,7 @@ static void ll_get_child_fid(struct inode * dir, struct qstr *name, struct lu_fid *fid) { struct dentry *parent, *child; - + parent = list_entry(dir->i_dentry.next, struct dentry, d_alias); child = d_lookup(parent, name); if (child) { @@ -1013,7 +1019,7 @@ static int ll_rmdir_generic(struct inode *dir, struct dentry *dparent, struct md_op_data *op_data; int rc; ENTRY; - + CDEBUG(D_VFSTRACE, "VFS Op:name=%.*s,dir=%lu/%u(%p)\n", name->len, name->name, dir->i_ino, dir->i_generation, dir); @@ -1041,6 +1047,7 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) struct lov_stripe_md *lsm = NULL; struct obd_trans_info oti = { 0 }; struct obdo *oa; + struct obd_capa *oc = NULL; int rc; ENTRY; @@ -1084,7 +1091,7 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) if (body->valid & OBD_MD_FLCOOKIE) { oa->o_valid |= OBD_MD_FLCOOKIE; - oti.oti_logcookies = + oti.oti_logcookies = req_capsule_server_sized_get(&request->rq_pill, &RMF_LOGCOOKIES, sizeof(struct llog_cookie) * @@ -1095,7 +1102,14 @@ int ll_objects_destroy(struct ptlrpc_request *request, struct inode *dir) } } - rc = obd_destroy(ll_i2dtexp(dir), oa, lsm, &oti, ll_i2mdexp(dir)); + if (body->valid & OBD_MD_FLOSSCAPA) { + rc = md_unpack_capa(ll_i2mdexp(dir), request, &RMF_CAPA2, &oc); + if (rc) + GOTO(out_free_memmd, rc); + } + + rc = obd_destroy(ll_i2dtexp(dir), oa, lsm, &oti, ll_i2mdexp(dir), oc); + capa_put(oc); OBDO_FREE(oa); if (rc) CERROR("obd destroy objid "LPX64" error %d\n", diff --git a/lustre/llite/rw.c b/lustre/llite/rw.c index 6a2be0a..4fb44d13 100644 --- a/lustre/llite/rw.c +++ b/lustre/llite/rw.c @@ -56,6 +56,8 @@ #include #include #include +/* current_is_kswapd() */ +#include #define DEBUG_SUBSYSTEM S_LLITE @@ -65,107 +67,6 @@ #include "llite_internal.h" #include -#ifndef list_for_each_prev_safe -#define list_for_each_prev_safe(pos, n, head) \ - for (pos = (head)->prev, n = pos->prev; pos != (head); \ - pos = n, n = pos->prev ) -#endif - -cfs_mem_cache_t *ll_async_page_slab = NULL; -size_t ll_async_page_slab_size = 0; - -/* SYNCHRONOUS I/O to object storage for an inode */ -static int ll_brw(int cmd, struct inode *inode, struct obdo *oa, - struct page *page, int flags) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_info oinfo = { { { 0 } } }; - struct brw_page pg; - int opc, rc; - ENTRY; - - pg.pg = page; - pg.off = ((obd_off)page->index) << CFS_PAGE_SHIFT; - - if ((cmd & OBD_BRW_WRITE) && (pg.off+CFS_PAGE_SIZE>i_size_read(inode))) - pg.count = i_size_read(inode) % CFS_PAGE_SIZE; - else - pg.count = CFS_PAGE_SIZE; - - LL_CDEBUG_PAGE(D_PAGE, page, "%s %d bytes ino %lu at "LPU64"/"LPX64"\n", - cmd & OBD_BRW_WRITE ? 
"write" : "read", pg.count, - inode->i_ino, pg.off, pg.off); - if (pg.count == 0) { - CERROR("ZERO COUNT: ino %lu: size %p:%Lu(%p:%Lu) idx %lu off " - LPU64"\n", inode->i_ino, inode, i_size_read(inode), - page->mapping->host, i_size_read(page->mapping->host), - page->index, pg.off); - } - - pg.flag = flags; - - if (cmd & OBD_BRW_WRITE) - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_WRITE, - pg.count); - else - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_BRW_READ, - pg.count); - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - /* NB partial write, so we might not have CAPA_OPC_OSS_READ capa */ - opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; - oinfo.oi_capa = ll_osscapa_get(inode, opc); - rc = obd_brw(cmd, ll_i2dtexp(inode), &oinfo, 1, &pg, NULL); - capa_put(oinfo.oi_capa); - if (rc == 0) - obdo_to_inode(inode, oa, OBD_MD_FLBLOCKS); - else if (rc != -EIO) - CERROR("error from obd_brw: rc = %d\n", rc); - RETURN(rc); -} - -int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock) -{ - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_info oinfo = { { { 0 } } }; - struct obdo oa; - int rc; - - ENTRY; - CDEBUG(D_INFO, "calling punch for "LPX64" (new size %Lu=%#Lx)\n", - lli->lli_smd->lsm_object_id, i_size_read(inode), i_size_read(inode)); - - oinfo.oi_md = lli->lli_smd; - oinfo.oi_policy.l_extent.start = new_size; - oinfo.oi_policy.l_extent.end = OBD_OBJECT_EOF; - oinfo.oi_oa = &oa; - oa.o_id = lli->lli_smd->lsm_object_id; - oa.o_gr = lli->lli_smd->lsm_object_gr; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - if (srvlock) { - /* set OBD_MD_FLFLAGS in o_valid, only if we - * set OBD_FL_TRUNCLOCK, otherwise ost_punch - * and filter_setattr get confused, see the comment - * in ost_punch */ - oa.o_flags = OBD_FL_TRUNCLOCK; - oa.o_valid |= OBD_MD_FLFLAGS; - } - obdo_from_inode(&oa, inode, OBD_MD_FLTYPE | OBD_MD_FLMODE | - OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLFID | OBD_MD_FLGENER); - - oinfo.oi_capa = ll_osscapa_get(inode, CAPA_OPC_OSS_TRUNC); - rc = obd_punch_rqset(ll_i2dtexp(inode), &oinfo, NULL); - ll_truncate_free_capa(oinfo.oi_capa); - if (rc) - CERROR("obd_truncate fails (%d) ino %lu\n", rc, inode->i_ino); - else - obdo_to_inode(inode, &oa, OBD_MD_FLSIZE | OBD_MD_FLBLOCKS | - OBD_MD_FLATIME | OBD_MD_FLMTIME | OBD_MD_FLCTIME); - RETURN(rc); -} - /* this isn't where truncate starts. roughly: * sys_truncate->ll_setattr_raw->vmtruncate->ll_truncate. setattr_raw grabs * DLM lock on [size, EOF], i_mutex, ->lli_size_sem, and WRITE_I_ALLOC_SEM to @@ -175,7 +76,6 @@ int ll_file_punch(struct inode * inode, loff_t new_size, int srvlock) void ll_truncate(struct inode *inode) { struct ll_inode_info *lli = ll_i2info(inode); - int srvlock = !!(lli->lli_flags & LLIF_SRVLOCK); loff_t new_size; ENTRY; CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p) to %Lu=%#Lx\n",inode->i_ino, @@ -183,7 +83,7 @@ void ll_truncate(struct inode *inode) i_size_read(inode)); ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_TRUNC, 1); - if (lli->lli_size_sem_owner != current) { + if (lli->lli_size_sem_owner != cfs_current()) { EXIT; return; } @@ -193,29 +93,7 @@ void ll_truncate(struct inode *inode) inode->i_ino); GOTO(out_unlock, 0); } - - LASSERT(atomic_read(&lli->lli_size_sem.count) <= 0); - - if (!srvlock) { - struct ost_lvb lvb; - int rc; - - /* XXX I'm pretty sure this is a hack to paper - * over a more fundamental race condition. 
*/ - lov_stripe_lock(lli->lli_smd); - inode_init_lvb(inode, &lvb); - rc = obd_merge_lvb(ll_i2dtexp(inode), lli->lli_smd, &lvb, 0); - if (lvb.lvb_size == i_size_read(inode) && rc == 0) { - CDEBUG(D_VFSTRACE, "skipping punch for obj "LPX64 - ",%Lu=%#Lx\n", lli->lli_smd->lsm_object_id, - i_size_read(inode), i_size_read(inode)); - lov_stripe_unlock(lli->lli_smd); - GOTO(out_unlock, 0); - } - obd_adjust_kms(ll_i2dtexp(inode), lli->lli_smd, - i_size_read(inode), 1); - lov_stripe_unlock(lli->lli_smd); - } + LASSERT_SEM_LOCKED(&lli->lli_size_sem); if (unlikely((ll_i2sbi(inode)->ll_flags & LL_SBI_CHECKSUM) && (i_size_read(inode) & ~CFS_PAGE_MASK))) { @@ -224,6 +102,7 @@ void ll_truncate(struct inode *inode) i_size_read(inode) >> CFS_PAGE_SHIFT); if (page != NULL) { +#if 0 /* XXX */ struct ll_async_page *llap = llap_cast_private(page); if (llap != NULL) { char *kaddr = kmap_atomic(page, KM_USER0); @@ -236,15 +115,12 @@ void ll_truncate(struct inode *inode) kunmap_atomic(kaddr, KM_USER0); } page_cache_release(page); +#endif } } new_size = i_size_read(inode); ll_inode_size_unlock(inode, 0); - if (!srvlock) - ll_file_punch(inode, new_size, 0); - else - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_LOCKLESS_TRUNC, 1); EXIT; return; @@ -253,848 +129,231 @@ void ll_truncate(struct inode *inode) ll_inode_size_unlock(inode, 0); } /* ll_truncate */ -int ll_prepare_write(struct file *file, struct page *page, unsigned from, - unsigned to) -{ - struct inode *inode = page->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - obd_off offset = ((obd_off)page->index) << CFS_PAGE_SHIFT; - struct obd_info oinfo = { { { 0 } } }; - struct brw_page pga; - struct obdo oa; - struct ost_lvb lvb; - int rc = 0; - ENTRY; - - LASSERT(PageLocked(page)); - (void)llap_cast_private(page); /* assertion */ - - /* Check to see if we should return -EIO right away */ - pga.pg = page; - pga.off = offset; - pga.count = CFS_PAGE_SIZE; - pga.flag = 0; - - oa.o_mode = inode->i_mode; - oa.o_id = lsm->lsm_object_id; - oa.o_gr = lsm->lsm_object_gr; - oa.o_valid = OBD_MD_FLID | OBD_MD_FLMODE | - OBD_MD_FLTYPE | OBD_MD_FLGROUP; - obdo_from_inode(&oa, inode, OBD_MD_FLFID | OBD_MD_FLGENER); - - oinfo.oi_oa = &oa; - oinfo.oi_md = lsm; - rc = obd_brw(OBD_BRW_CHECK, ll_i2dtexp(inode), &oinfo, 1, &pga, NULL); - if (rc) - RETURN(rc); - - if (PageUptodate(page)) { - LL_CDEBUG_PAGE(D_PAGE, page, "uptodate\n"); - RETURN(0); - } - - /* We're completely overwriting an existing page, so _don't_ set it up - * to date until commit_write */ - if (from == 0 && to == CFS_PAGE_SIZE) { - LL_CDEBUG_PAGE(D_PAGE, page, "full page write\n"); - POISON_PAGE(page, 0x11); - RETURN(0); - } - - /* If are writing to a new page, no need to read old data. The extent - * locking will have updated the KMS, and for our purposes here we can - * treat it like i_size. */ - lov_stripe_lock(lsm); - inode_init_lvb(inode, &lvb); - obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1); - lov_stripe_unlock(lsm); - if (lvb.lvb_size <= offset) { - char *kaddr = kmap_atomic(page, KM_USER0); - LL_CDEBUG_PAGE(D_PAGE, page, "kms "LPU64" <= offset "LPU64"\n", - lvb.lvb_size, offset); - memset(kaddr, 0, CFS_PAGE_SIZE); - kunmap_atomic(kaddr, KM_USER0); - GOTO(prepare_done, rc = 0); - } - - /* XXX could be an async ocp read.. read-ahead? 
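The lov_stripe_lock()/obd_merge_lvb() block deleted from ll_truncate() above
short-circuited the punch: it merged the per-stripe lock value blocks into a
known minimum size (KMS) and skipped the OST punch when that size already matched
i_size. A condensed restatement of the deleted logic (the skip_punch flag is
illustrative, not from the source):

        lov_stripe_lock(lli->lli_smd);
        inode_init_lvb(inode, &lvb);                       /* seed lvb from inode */
        rc = obd_merge_lvb(ll_i2dtexp(inode), lli->lli_smd, &lvb, 0);
        if (rc == 0 && lvb.lvb_size == i_size_read(inode))
                skip_punch = 1;                            /* nothing to shrink */
        else
                obd_adjust_kms(ll_i2dtexp(inode), lli->lli_smd,
                               i_size_read(inode), 1);
        lov_stripe_unlock(lli->lli_smd);

In the rewritten code these decisions are left to the cl_io layers; note the
"#if 0 /* XXX */" placeholder above where the old per-page checksum handling used
to live.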
*/ - rc = ll_brw(OBD_BRW_READ, inode, &oa, page, 0); - if (rc == 0) { - /* bug 1598: don't clobber blksize */ - oa.o_valid &= ~(OBD_MD_FLSIZE | OBD_MD_FLBLKSZ); - obdo_refresh_inode(inode, &oa, oa.o_valid); - } - - EXIT; - prepare_done: - if (rc == 0) - SetPageUptodate(page); - - return rc; -} - /** - * make page ready for ASYNC write - * \param data - pointer to llap cookie - * \param cmd - is OBD_BRW_* macroses - * - * \retval 0 is page successfully prepared to send - * \retval -EAGAIN is page not need to send + * Initializes common cl-data at the typical address_space operation entry + * point. */ -static int ll_ap_make_ready(void *data, int cmd) +static int ll_cl_init(struct file *file, struct page *vmpage, + struct lu_env **env, + struct cl_io **io, struct cl_page **page, int *refcheck) { - struct ll_async_page *llap; - struct page *page; - ENTRY; - - llap = llap_from_cookie(data); - page = llap->llap_page; - - /* we're trying to write, but the page is locked.. come back later */ - if (TryLockPage(page)) - RETURN(-EAGAIN); - - LASSERTF(!(cmd & OBD_BRW_READ) || !PageWriteback(page), - "cmd %x page %p ino %lu index %lu fl %lx\n", cmd, page, - page->mapping->host->i_ino, page->index, page->flags); - - /* if we left PageDirty we might get another writepage call - * in the future. list walkers are bright enough - * to check page dirty so we can leave it on whatever list - * its on. XXX also, we're called with the cli list so if - * we got the page cache list we'd create a lock inversion - * with the removepage path which gets the page lock then the - * cli lock */ - LASSERTF(!PageWriteback(page),"cmd %x page %p ino %lu index %lu\n", cmd, page, - page->mapping->host->i_ino, page->index); - if(!clear_page_dirty_for_io(page)) { - unlock_page(page); - RETURN(-EAGAIN); - } - - /* This actually clears the dirty bit in the radix tree.*/ - set_page_writeback(page); - - LL_CDEBUG_PAGE(D_PAGE, page, "made ready\n"); - page_cache_get(page); - - RETURN(0); + struct lu_env *_env; + struct cl_io *_io; + struct cl_page *_page; + struct cl_object *clob; + + int result; + + *env = NULL; + *io = NULL; + *page = NULL; + + clob = ll_i2info(vmpage->mapping->host)->lli_clob; + LASSERT(clob != NULL); + + _env = cl_env_get(refcheck); + if (!IS_ERR(env)) { + struct ccc_io *cio = ccc_env_io(_env); + + *env = _env; + *io = _io = cio->cui_cl.cis_io; + if (_io != NULL) { + LASSERT(_io->ci_state == CIS_IO_GOING); + LASSERT(cio->cui_fd == LUSTRE_FPRIVATE(file)); + _page = cl_page_find(_env, clob, vmpage->index, vmpage, + CPT_CACHEABLE); + if (!IS_ERR(_page)) { + *page = _page; + lu_ref_add(&_page->cp_reference, "cl_io", _io); + result = 0; + } else + result = PTR_ERR(_page); + } else + /* + * This is for a case where operation can be called + * either with or without cl_io created by the upper + * layer (e.g., ->prepare_write() called directly from + * loop-back driver). + */ + result = -EALREADY; + } else + result = PTR_ERR(_env); + CDEBUG(D_VFSTRACE, "%lu@"DFID" -> %i %p %p %p\n", + vmpage->index, PFID(lu_object_fid(&clob->co_lu)), result, + *env, *io, *page); + return result; } -/* We have two reasons for giving llite the opportunity to change the - * write length of a given queued page as it builds the RPC containing - * the page: - * - * 1) Further extending writes may have landed in the page cache - * since a partial write first queued this page requiring us - * to write more from the page cache. (No further races are possible, since - * by the time this is called, the page is locked.) 
- * 2) We might have raced with truncate and want to avoid performing - * write RPCs that are just going to be thrown away by the - * truncate's punch on the storage targets. - * - * The kms serves these purposes as it is set at both truncate and extending - * writes. +/** + * Finalizes cl-data before exiting typical address_space operation. Dual to + * ll_cl_init(). */ -static int ll_ap_refresh_count(void *data, int cmd) -{ - struct ll_inode_info *lli; - struct ll_async_page *llap; - struct lov_stripe_md *lsm; - struct page *page; - struct inode *inode; - struct ost_lvb lvb; - __u64 kms; - ENTRY; - - /* readpage queues with _COUNT_STABLE, shouldn't get here. */ - LASSERT(cmd != OBD_BRW_READ); - - llap = llap_from_cookie(data); - page = llap->llap_page; - inode = page->mapping->host; - lli = ll_i2info(inode); - lsm = lli->lli_smd; - - lov_stripe_lock(lsm); - inode_init_lvb(inode, &lvb); - obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1); - kms = lvb.lvb_size; - lov_stripe_unlock(lsm); - - /* catch race with truncate */ - if (((__u64)page->index << CFS_PAGE_SHIFT) >= kms) - return 0; - - /* catch sub-page write at end of file */ - if (((__u64)page->index << CFS_PAGE_SHIFT) + CFS_PAGE_SIZE > kms) - return kms % CFS_PAGE_SIZE; - - return CFS_PAGE_SIZE; -} - -void ll_inode_fill_obdo(struct inode *inode, int cmd, struct obdo *oa) -{ - struct lov_stripe_md *lsm; - obd_flag valid_flags; - - lsm = ll_i2info(inode)->lli_smd; - - oa->o_id = lsm->lsm_object_id; - oa->o_gr = lsm->lsm_object_gr; - oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - valid_flags = OBD_MD_FLTYPE | OBD_MD_FLATIME; - if (cmd & OBD_BRW_WRITE) { - oa->o_valid |= OBD_MD_FLEPOCH; - oa->o_easize = ll_i2info(inode)->lli_ioepoch; - - valid_flags |= OBD_MD_FLMTIME | OBD_MD_FLCTIME | - OBD_MD_FLUID | OBD_MD_FLGID | - OBD_MD_FLFID | OBD_MD_FLGENER; - } - - obdo_from_inode(oa, inode, valid_flags); -} - -static void ll_ap_fill_obdo(void *data, int cmd, struct obdo *oa) -{ - struct ll_async_page *llap; - ENTRY; - - llap = llap_from_cookie(data); - ll_inode_fill_obdo(llap->llap_page->mapping->host, cmd, oa); - - EXIT; -} - -static void ll_ap_update_obdo(void *data, int cmd, struct obdo *oa, - obd_valid valid) -{ - struct ll_async_page *llap; - ENTRY; - - llap = llap_from_cookie(data); - obdo_from_inode(oa, llap->llap_page->mapping->host, valid); - - EXIT; -} - -static struct obd_capa *ll_ap_lookup_capa(void *data, int cmd) -{ - int opc = cmd & OBD_BRW_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; - struct ll_async_page *llap = llap_from_cookie(data); - - return ll_osscapa_get(llap->llap_page->mapping->host, opc); -} - -static struct obd_async_page_ops ll_async_page_ops = { - .ap_make_ready = ll_ap_make_ready, - .ap_refresh_count = ll_ap_refresh_count, - .ap_fill_obdo = ll_ap_fill_obdo, - .ap_update_obdo = ll_ap_update_obdo, - .ap_completion = ll_ap_completion, - .ap_lookup_capa = ll_ap_lookup_capa, -}; - -struct ll_async_page *llap_cast_private(struct page *page) -{ - struct ll_async_page *llap = (struct ll_async_page *)page_private(page); - - LASSERTF(llap == NULL || llap->llap_magic == LLAP_MAGIC, - "page %p private %lu gave magic %d which != %d\n", - page, page_private(page), llap->llap_magic, LLAP_MAGIC); - - return llap; -} - -/* Try to reap @target pages in the specific @cpu's async page list. - * - * There is an llap attached onto every page in lustre, linked off @sbi. - * We add an llap to the list so we don't lose our place during list walking. 
- * If llaps in the list are being moved they will only move to the end - * of the LRU, and we aren't terribly interested in those pages here (we - * start at the beginning of the list where the least-used llaps are. */ -static inline int llap_shrink_cache_internal(struct ll_sb_info *sbi, - int cpu, int target) +static void ll_cl_fini(struct lu_env *env, + struct cl_io *io, struct cl_page *page, int *refcheck) { - struct ll_async_page *llap, dummy_llap = { .llap_magic = 0xd11ad11a }; - struct ll_pglist_data *pd; - struct list_head *head; - int count = 0; - - pd = ll_pglist_cpu_lock(sbi, cpu); - head = &pd->llpd_list; - list_add(&dummy_llap.llap_pglist_item, head); - while (count < target) { - struct page *page; - int keep; - - if (unlikely(need_resched())) { - ll_pglist_cpu_unlock(sbi, cpu); - cond_resched(); - ll_pglist_cpu_lock(sbi, cpu); - } - - llap = llite_pglist_next_llap(head, - &dummy_llap.llap_pglist_item); - list_del_init(&dummy_llap.llap_pglist_item); - if (llap == NULL) - break; - - page = llap->llap_page; - LASSERT(page != NULL); - - list_add(&dummy_llap.llap_pglist_item, &llap->llap_pglist_item); - - /* Page needs/undergoing IO */ - if (TryLockPage(page)) { - LL_CDEBUG_PAGE(D_PAGE, page, "can't lock\n"); - continue; - } - - keep = (llap->llap_write_queued || PageDirty(page) || - PageWriteback(page) || (!PageUptodate(page) && - llap->llap_origin != LLAP_ORIGIN_READAHEAD)); - - LL_CDEBUG_PAGE(D_PAGE, page,"%s LRU page: %s%s%s%s%s origin %s\n", - keep ? "keep" : "drop", - llap->llap_write_queued ? "wq " : "", - PageDirty(page) ? "pd " : "", - PageUptodate(page) ? "" : "!pu ", - PageWriteback(page) ? "wb" : "", - llap->llap_defer_uptodate ? "" : "!du", - llap_origins[llap->llap_origin]); - - /* If page is dirty or undergoing IO don't discard it */ - if (keep) { - unlock_page(page); - continue; - } - - page_cache_get(page); - ll_pglist_cpu_unlock(sbi, cpu); - - if (page->mapping != NULL) { - ll_teardown_mmaps(page->mapping, - (__u64)page->index << CFS_PAGE_SHIFT, - ((__u64)page->index << CFS_PAGE_SHIFT)| - ~CFS_PAGE_MASK); - if (!PageDirty(page) && !page_mapped(page)) { - ll_ra_accounting(llap, page->mapping); - ll_truncate_complete_page(page); - ++count; - } else { - LL_CDEBUG_PAGE(D_PAGE, page, "Not dropping page" - " because it is " - "%s\n", - PageDirty(page)? - "dirty":"mapped"); + if (page != NULL) { + lu_ref_del(&page->cp_reference, "cl_io", io); + cl_page_put(env, page); + } + if (env != NULL) { + struct vvp_io *vio; + + vio = vvp_env_io(env); + LASSERT(vio->cui_oneshot >= 0); + if (vio->cui_oneshot > 0) { + if (--vio->cui_oneshot == 0) { + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_iter_fini(env, io); + cl_io_fini(env, io); + /* to trigger assertion above, if ll_cl_fini() + * is called against freed io. */ + vio->cui_oneshot = -1; } + /* additional reference on env was acquired by io, + * disable refcheck */ + refcheck = NULL; } - unlock_page(page); - page_cache_release(page); - - ll_pglist_cpu_lock(sbi, cpu); - } - list_del(&dummy_llap.llap_pglist_item); - ll_pglist_cpu_unlock(sbi, cpu); - - CDEBUG(D_CACHE, "shrank %d, expected %d however. \n", count, target); - return count; -} - - -/* Try to shrink the page cache for the @sbi filesystem by 1/@shrink_fraction. - * - * At first, this code calculates total pages wanted by @shrink_fraction, then - * it deduces how many pages should be reaped from each cpu in proportion as - * their own # of page count(llpd_count). 
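The per-CPU share computed below follows
percpu_want = want / ((total / (c + 1)) + 1); a quick worked example with
assumed numbers:

        /* total = 12000 cached pages overall, want = 3000 to reap:
         *   a cpu holding c = 5999 pages: 3000 / (12000/6000 + 1) = 1000
         *   a cpu holding c = 2999 pages: 3000 / (12000/3000 + 1) =  600
         * so busier CPUs are asked to give up proportionally more. */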
- */ -int llap_shrink_cache(struct ll_sb_info *sbi, int shrink_fraction) -{ - unsigned long total, want, percpu_want, count = 0; - int cpu, nr_cpus; - - total = lcounter_read(&sbi->ll_async_page_count); - if (total == 0) - return 0; - -#ifdef HAVE_SHRINKER_CACHE - want = shrink_fraction; - if (want == 0) - return total; -#else - /* There can be a large number of llaps (600k or more in a large - * memory machine) so the VM 1/6 shrink ratio is likely too much. - * Since we are freeing pages also, we don't necessarily want to - * shrink so much. Limit to 40MB of pages + llaps per call. */ - if (shrink_fraction <= 0) - want = total - sbi->ll_async_page_max + 32*num_online_cpus(); - else - want = (total + shrink_fraction - 1) / shrink_fraction; -#endif - - if (want > 40 << (20 - CFS_PAGE_SHIFT)) - want = 40 << (20 - CFS_PAGE_SHIFT); - - CDEBUG(D_CACHE, "shrinking %lu of %lu pages (1/%d)\n", - want, total, shrink_fraction); - - nr_cpus = num_possible_cpus(); - cpu = sbi->ll_async_page_clock_hand; - /* we at most do one round */ - do { - int c; - - cpu = (cpu + 1) % nr_cpus; - c = LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_count; - if (!cpu_online(cpu)) - percpu_want = c; - else - percpu_want = want / ((total / (c + 1)) + 1); - if (percpu_want == 0) - continue; - - count += llap_shrink_cache_internal(sbi, cpu, percpu_want); - if (count >= want) - sbi->ll_async_page_clock_hand = cpu; - } while (cpu != sbi->ll_async_page_clock_hand); - - CDEBUG(D_CACHE, "shrank %lu/%lu and left %lu unscanned\n", - count, want, total); - -#ifdef HAVE_SHRINKER_CACHE - return lcounter_read(&sbi->ll_async_page_count); -#else - return count; -#endif + cl_env_put(env, refcheck); + } else + LASSERT(io == NULL); } -/* Rebalance the async page queue len for each cpu. We hope that the cpu - * which do much IO job has a relative longer queue len. - * This function should be called with preempt disabled. +/** + * Initializes one-shot cl_io for the case when loop driver calls + * ->{prepare,commit}_write() methods directly. */ -static inline int llap_async_cache_rebalance(struct ll_sb_info *sbi) +static int ll_prepare_loop(struct lu_env *env, struct cl_io *io, + struct file *file, struct page *vmpage, + unsigned from, unsigned to) { - unsigned long sample = 0, *cpu_sample, bias, slice; - struct ll_pglist_data *pd; - cpumask_t mask; - int cpu, surplus; - int w1 = 7, w2 = 3, base = (w1 + w2); /* weight value */ - atomic_t *pcnt; - - if (!spin_trylock(&sbi->ll_async_page_reblnc_lock)) { - /* someone else is doing the job */ - return 1; - } - - pcnt = &LL_PGLIST_DATA(sbi)->llpd_sample_count; - if (!atomic_read(pcnt)) { - /* rare case, somebody else has gotten this job done */ - spin_unlock(&sbi->ll_async_page_reblnc_lock); - return 1; - } - - sbi->ll_async_page_reblnc_count++; - cpu_sample = sbi->ll_async_page_sample; - memset(cpu_sample, 0, num_possible_cpus() * sizeof(unsigned long)); - for_each_online_cpu(cpu) { - pcnt = &LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_sample_count; - cpu_sample[cpu] = atomic_read(pcnt); - atomic_set(pcnt, 0); - sample += cpu_sample[cpu]; - } + struct vvp_io *vio; + struct ccc_io *cio; + int result; + loff_t pos; - cpus_clear(mask); - surplus = sbi->ll_async_page_max; - slice = surplus / sample + 1; - sample /= num_online_cpus(); - bias = sample >> 4; - for_each_online_cpu(cpu) { - pd = LL_PGLIST_DATA_CPU(sbi, cpu); - if (labs((long int)sample - cpu_sample[cpu]) > bias) { - unsigned long budget = pd->llpd_budget; - /* weighted original queue length and expected queue - * length to avoid thrashing. 
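The "weighted" update mentioned above is a 7:3 blend of the old budget and the
freshly sampled demand, i.e. a crude exponential smoothing so that one bursty
sample cannot swing a CPU's budget wildly. With assumed numbers plugged into the
formula just below:

        /* w1 = 7, w2 = 3, base = 10; old budget 2000, sampled share 4000:
         *   new budget = 2000*7/10 + 4000*3/10 = 1400 + 1200 = 2600
         * i.e. the budget moves only 30% of the way toward the new demand
         * on each rebalance pass. */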
*/ - pd->llpd_budget = (budget * w1) / base + - (slice * cpu_sample[cpu]) * w2 / base; - cpu_set(cpu, mask); - } - surplus -= pd->llpd_budget; - } - surplus /= cpus_weight(mask) ?: 1; - for_each_cpu_mask(cpu, mask) - LL_PGLIST_DATA_CPU(sbi, cpu)->llpd_budget += surplus; - spin_unlock(&sbi->ll_async_page_reblnc_lock); - - /* TODO: do we really need to call llap_shrink_cache_internal - * for every cpus with its page_count greater than budget? - * for_each_cpu_mask(cpu, mask) - * ll_shrink_cache_internal(...) + vio = vvp_env_io(env); + cio = ccc_env_io(env); + ll_io_init(io, file, 1); + pos = (vmpage->index << CFS_PAGE_SHIFT) + from; + /* + * Create IO and quickly drive it through CIS_{INIT,IT_STARTED,LOCKED} + * states. DLM locks are not taken for vio->cui_oneshot IO---we cannot + * take DLM locks here, because page is already locked. With new + * ->write_{being,end}() address_space operations lustre might be + * luckier. */ - - return 0; -} - -static struct ll_async_page *llap_from_page_with_lockh(struct page *page, - unsigned origin, - struct lustre_handle *lockh) -{ - struct ll_async_page *llap; - struct obd_export *exp; - struct inode *inode = page->mapping->host; - struct ll_sb_info *sbi; - struct ll_pglist_data *pd; - int rc, cpu, target; - ENTRY; - - if (!inode) { - static int triggered; - - if (!triggered) { - LL_CDEBUG_PAGE(D_ERROR, page, "Bug 10047. Wrong anon " - "page received\n"); - libcfs_debug_dumpstack(NULL); - triggered = 1; - } - RETURN(ERR_PTR(-EINVAL)); - } - sbi = ll_i2sbi(inode); - LASSERT(ll_async_page_slab); - LASSERTF(origin < LLAP__ORIGIN_MAX, "%u\n", origin); - - llap = llap_cast_private(page); - if (llap != NULL) { - /* move to end of LRU list, except when page is just about to - * die */ - if (origin != LLAP_ORIGIN_REMOVEPAGE) { - int old_cpu = llap->llap_pglist_cpu; - struct ll_pglist_data *old_pd; - - pd = ll_pglist_double_lock(sbi, old_cpu, &old_pd); - pd->llpd_hit++; - while (old_cpu != llap->llap_pglist_cpu) { - /* rarely case, someone else is touching this - * page too. */ - ll_pglist_double_unlock(sbi, old_cpu); - old_cpu = llap->llap_pglist_cpu; - pd=ll_pglist_double_lock(sbi, old_cpu, &old_pd); - } - - list_move(&llap->llap_pglist_item, - &pd->llpd_list); - old_pd->llpd_gen++; - if (pd->llpd_cpu != old_cpu) { - pd->llpd_count++; - old_pd->llpd_count--; - old_pd->llpd_gen++; - llap->llap_pglist_cpu = pd->llpd_cpu; - pd->llpd_cross++; - } - ll_pglist_double_unlock(sbi, old_cpu); - } - GOTO(out, llap); - } - - exp = ll_i2dtexp(page->mapping->host); - if (exp == NULL) - RETURN(ERR_PTR(-EINVAL)); - - /* limit the number of lustre-cached pages */ - cpu = get_cpu(); - pd = LL_PGLIST_DATA(sbi); - target = pd->llpd_count - pd->llpd_budget; - if (target > 0) { - rc = 0; - atomic_inc(&pd->llpd_sample_count); - if (atomic_read(&pd->llpd_sample_count) > - sbi->ll_async_page_sample_max) { - pd->llpd_reblnc_count++; - rc = llap_async_cache_rebalance(sbi); - if (rc == 0) - target = pd->llpd_count - pd->llpd_budget; - } - /* if rc equals 1, it means other cpu is doing the rebalance - * job, and our budget # would be modified when we read it. - * Furthermore, it is much likely being increased because - * we have already reached the rebalance threshold. In this - * case, we skip to shrink cache here. 
*/ - if ((rc == 0) && target > 0) - llap_shrink_cache_internal(sbi, cpu, target + 32); - } - put_cpu(); - - OBD_SLAB_ALLOC(llap, ll_async_page_slab, CFS_ALLOC_STD, - ll_async_page_slab_size); - if (llap == NULL) - RETURN(ERR_PTR(-ENOMEM)); - llap->llap_magic = LLAP_MAGIC; - llap->llap_cookie = (void *)llap + size_round(sizeof(*llap)); - - /* XXX: for bug 11270 - check for lockless origin here! */ - if (origin == LLAP_ORIGIN_LOCKLESS_IO) - llap->llap_nocache = 1; - - rc = obd_prep_async_page(exp, ll_i2info(inode)->lli_smd, NULL, page, - (obd_off)page->index << CFS_PAGE_SHIFT, - &ll_async_page_ops, llap, &llap->llap_cookie, - llap->llap_nocache, lockh); - if (rc) { - OBD_SLAB_FREE(llap, ll_async_page_slab, - ll_async_page_slab_size); - RETURN(ERR_PTR(rc)); - } - - CDEBUG(D_CACHE, "llap %p page %p cookie %p obj off "LPU64"\n", llap, - page, llap->llap_cookie, (obd_off)page->index << CFS_PAGE_SHIFT); - /* also zeroing the PRIVBITS low order bitflags */ - __set_page_ll_data(page, llap); - llap->llap_page = page; - - lcounter_inc(&sbi->ll_async_page_count); - pd = ll_pglist_lock(sbi); - list_add_tail(&llap->llap_pglist_item, &pd->llpd_list); - INIT_LIST_HEAD(&llap->llap_pending_write); - pd->llpd_count++; - pd->llpd_gen++; - pd->llpd_miss++; - llap->llap_pglist_cpu = pd->llpd_cpu; - ll_pglist_unlock(sbi); - - out: - if (unlikely(sbi->ll_flags & LL_SBI_CHECKSUM)) { - __u32 csum; - char *kaddr = kmap_atomic(page, KM_USER0); - csum = init_checksum(OSC_DEFAULT_CKSUM); - csum = compute_checksum(csum, kaddr, CFS_PAGE_SIZE, - OSC_DEFAULT_CKSUM); - kunmap_atomic(kaddr, KM_USER0); - if (origin == LLAP_ORIGIN_READAHEAD || - origin == LLAP_ORIGIN_READPAGE || - origin == LLAP_ORIGIN_LOCKLESS_IO) { - llap->llap_checksum = 0; - } else if (origin == LLAP_ORIGIN_COMMIT_WRITE || - llap->llap_checksum == 0) { - llap->llap_checksum = csum; - CDEBUG(D_PAGE, "page %p cksum %x\n", page, csum); - } else if (llap->llap_checksum == csum) { - /* origin == LLAP_ORIGIN_WRITEPAGE */ - CDEBUG(D_PAGE, "page %p cksum %x confirmed\n", - page, csum); - } else { - /* origin == LLAP_ORIGIN_WRITEPAGE */ - LL_CDEBUG_PAGE(D_ERROR, page, "old cksum %x != new " - "%x!\n", llap->llap_checksum, csum); + result = cl_io_rw_init(env, io, CIT_WRITE, pos, from - to); + if (result == 0) { + cio->cui_fd = LUSTRE_FPRIVATE(file); + vio->cui_oneshot = 1; + result = cl_io_iter_init(env, io); + if (result == 0) { + result = cl_io_lock(env, io); + if (result == 0) + result = cl_io_start(env, io); } - } - - llap->llap_origin = origin; - RETURN(llap); + } else + result = io->ci_result; + return result; } -struct ll_async_page *llap_from_page(struct page *page, - unsigned origin) -{ - return llap_from_page_with_lockh(page, origin, NULL); -} - -static int queue_or_sync_write(struct obd_export *exp, struct inode *inode, - struct ll_async_page *llap, - unsigned to, obd_flag async_flags) +/** + * ->prepare_write() address space operation called by generic_file_write() + * for every page during write. + */ +int ll_prepare_write(struct file *file, struct page *vmpage, unsigned from, + unsigned to) { - unsigned long size_index = i_size_read(inode) >> CFS_PAGE_SHIFT; - struct obd_io_group *oig; - struct ll_sb_info *sbi = ll_i2sbi(inode); - int rc, noquot = llap->llap_ignore_quota ? 
OBD_BRW_NOQUOTA : 0; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result; + int refcheck; ENTRY; - /* _make_ready only sees llap once we've unlocked the page */ - llap->llap_write_queued = 1; - rc = obd_queue_async_io(exp, ll_i2info(inode)->lli_smd, NULL, - llap->llap_cookie, OBD_BRW_WRITE | noquot, - 0, 0, 0, async_flags); - if (rc == 0) { - LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "write queued\n"); - GOTO(out, 0); - } - - llap->llap_write_queued = 0; - /* Do not pass llap here as it is sync write. */ - llap_write_pending(inode, NULL); - - rc = oig_init(&oig); - if (rc) - GOTO(out, rc); - - /* make full-page requests if we are not at EOF (bug 4410) */ - if (to != CFS_PAGE_SIZE && llap->llap_page->index < size_index) { - LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, - "sync write before EOF: size_index %lu, to %d\n", - size_index, to); - to = CFS_PAGE_SIZE; - } else if (to != CFS_PAGE_SIZE && llap->llap_page->index == size_index){ - int size_to = i_size_read(inode) & ~CFS_PAGE_MASK; - LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, - "sync write at EOF: size_index %lu, to %d/%d\n", - size_index, to, size_to); - if (to < size_to) - to = size_to; - } - - /* compare the checksum once before the page leaves llite */ - if (unlikely((sbi->ll_flags & LL_SBI_CHECKSUM) && - llap->llap_checksum != 0)) { - __u32 csum; - struct page *page = llap->llap_page; - char *kaddr = kmap_atomic(page, KM_USER0); - csum = init_checksum(OSC_DEFAULT_CKSUM); - csum = compute_checksum(csum, kaddr, CFS_PAGE_SIZE, - OSC_DEFAULT_CKSUM); - kunmap_atomic(kaddr, KM_USER0); - if (llap->llap_checksum == csum) { - CDEBUG(D_PAGE, "page %p cksum %x confirmed\n", - page, csum); - } else { - CERROR("page %p old cksum %x != new cksum %x!\n", - page, llap->llap_checksum, csum); + result = ll_cl_init(file, vmpage, &env, &io, &page, &refcheck); + /* + * Loop-back driver calls ->prepare_write() and ->sendfile() methods + * directly, bypassing file system ->write() operation, so cl_io has + * to be created here. + */ + if (result == -EALREADY) { + io = &ccc_env_info(env)->cti_io; + result = ll_prepare_loop(env, io, file, vmpage, from, to); + if (result == 0) { + result = ll_cl_init(file, vmpage, + &env, &io, &page, &refcheck); + cl_env_put(env, NULL); } } - - rc = obd_queue_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig, - llap->llap_cookie, OBD_BRW_WRITE | noquot, - 0, to, 0, ASYNC_READY | ASYNC_URGENT | - ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC); - if (rc) - GOTO(free_oig, rc); - - rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig); - if (rc) - GOTO(free_oig, rc); - - rc = oig_wait(oig); - - if (!rc && async_flags & ASYNC_READY) { - unlock_page(llap->llap_page); - if (PageWriteback(llap->llap_page)) - end_page_writeback(llap->llap_page); - } - - if (rc == 0 && llap_write_complete(inode, llap)) - ll_queue_done_writing(inode, 0); - - LL_CDEBUG_PAGE(D_PAGE, llap->llap_page, "sync write returned %d\n", rc); - -free_oig: - oig_release(oig); -out: - RETURN(rc); + if (result == 0) { + cl_page_assume(env, io, page); + result = cl_io_prepare_write(env, io, page, from, to); + if (result == 0) { + struct vvp_io *vio; + + /* + * Add a reference, so that page is not evicted from + * the cache until ->commit_write() is called. 
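[editor's note] The reference taken here is dropped in ll_commit_write() further below; the two address_space methods must stay balanced or the cl_page leaks. The pairing, distilled from the two functions in this hunk:

/* Acquire side, in ->prepare_write(): */
cl_page_get(page);
lu_ref_add(&page->cp_reference, "prepare_write", cfs_current());

/* Release side, in ->commit_write(), after cl_io_commit_write(): */
lu_ref_del(&page->cp_reference, "prepare_write", cfs_current());
cl_page_put(env, page);

The lu_ref tag string and owner have to match on both sides, which is why both functions pass the same "prepare_write" literal and cfs_current().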
+ */ + cl_page_get(page); + lu_ref_add(&page->cp_reference, "prepare_write", + cfs_current()); + vio = vvp_env_io(env); + if (vio->cui_oneshot > 0) + vio->cui_oneshot++; + } else + cl_page_unassume(env, io, page); + } + ll_cl_fini(env, io, page, &refcheck); + RETURN(result); } -/* update our write count to account for i_size increases that may have - * happened since we've queued the page for io. */ - -/* be careful not to return success without setting the page Uptodate or - * the next pass through prepare_write will read in stale data from disk. */ -int ll_commit_write(struct file *file, struct page *page, unsigned from, +int ll_commit_write(struct file *file, struct page *vmpage, unsigned from, unsigned to) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(file); - struct inode *inode = page->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct lov_stripe_md *lsm = lli->lli_smd; - struct obd_export *exp; - struct ll_async_page *llap; - loff_t size; - struct lustre_handle *lockh = NULL; - int rc = 0; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result; + int refcheck; ENTRY; - SIGNAL_MASK_ASSERT(); /* XXX BUG 1511 */ - LASSERT(inode == file->f_dentry->d_inode); - LASSERT(PageLocked(page)); - - CDEBUG(D_INODE, "inode %p is writing page %p from %d to %d at %lu\n", - inode, page, from, to, page->index); - - if (fd->fd_flags & LL_FILE_GROUP_LOCKED) - lockh = &fd->fd_cwlockh; - - llap = llap_from_page_with_lockh(page, LLAP_ORIGIN_COMMIT_WRITE, lockh); - if (IS_ERR(llap)) - RETURN(PTR_ERR(llap)); - - exp = ll_i2dtexp(inode); - if (exp == NULL) - RETURN(-EINVAL); - - llap->llap_ignore_quota = cfs_capable(CFS_CAP_SYS_RESOURCE); - - /* - * queue a write for some time in the future the first time we - * dirty the page. - * - * This is different from what other file systems do: they usually - * just mark page (and some of its buffers) dirty and rely on - * balance_dirty_pages() to start a write-back. Lustre wants write-back - * to be started earlier for the following reasons: - * - * (1) with a large number of clients we need to limit the amount - * of cached data on the clients a lot; - * - * (2) large compute jobs generally want compute-only then io-only - * and the IO should complete as quickly as possible; - * - * (3) IO is batched up to the RPC size and is async until the - * client max cache is hit - * (/proc/fs/lustre/osc/OSC.../max_dirty_mb) - * - */ - if (!PageDirty(page)) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRTY_MISSES, 1); - - rc = queue_or_sync_write(exp, inode, llap, to, 0); - if (rc) - GOTO(out, rc); - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRTY_HITS, 1); + result = ll_cl_init(file, vmpage, &env, &io, &page, &refcheck); + LASSERT(result != -EALREADY); + if (result == 0) { + LASSERT(cl_page_is_owned(page, io)); + result = cl_io_commit_write(env, io, page, from, to); + if (cl_page_is_owned(page, io)) + cl_page_unassume(env, io, page); + /* + * Release reference acquired by cl_io_prepare_write(). + */ + lu_ref_del(&page->cp_reference, "prepare_write", cfs_current()); + cl_page_put(env, page); } + ll_cl_fini(env, io, page, &refcheck); + RETURN(result); +} - /* put the page in the page cache, from now on ll_removepage is - * responsible for cleaning up the llap. 
- * only set page dirty when it's queued to be write out */ - if (llap->llap_write_queued) - set_page_dirty(page); +struct obd_capa *cl_capa_lookup(struct inode *inode, enum cl_req_type crt) +{ + __u64 opc; -out: - size = (((obd_off)page->index) << CFS_PAGE_SHIFT) + to; - ll_inode_size_lock(inode, 0); - if (rc == 0) { - lov_stripe_lock(lsm); - obd_adjust_kms(exp, lsm, size, 0); - lov_stripe_unlock(lsm); - if (size > i_size_read(inode)) - i_size_write(inode, size); - SetPageUptodate(page); - } else if (size > i_size_read(inode)) { - /* this page beyond the pales of i_size, so it can't be - * truncated in ll_p_r_e during lock revoking. we must - * teardown our book-keeping here. */ - ll_removepage(page); - } - ll_inode_size_unlock(inode, 0); - RETURN(rc); + opc = crt == CRT_WRITE ? CAPA_OPC_OSS_WRITE : CAPA_OPC_OSS_RW; + return ll_osscapa_get(inode, opc); } static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which); -/* WARNING: This algorithm is used to reduce the contention on - * sbi->ll_lock. It should work well if the ra_max_pages is much +/* WARNING: This algorithm is used to reduce the contention on + * sbi->ll_lock. It should work well if the ra_max_pages is much * greater than the single file's read-ahead window. * - * TODO: There may exist a `global sync problem' in this implementation. + * TODO: There may exist a `global sync problem' in this implementation. * Considering the global ra window is 100M, and each file's ra window is 10M, - * there are over 10 files trying to get its ra budget and reach + * there are over 10 files trying to get its ra budget and reach * ll_ra_count_get at the exactly same time. All of them will get a zero ra * window, although the global window is 100M. -jay */ @@ -1116,187 +375,24 @@ out: RETURN(ret); } -static void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) +void ll_ra_count_put(struct ll_sb_info *sbi, unsigned long len) { struct ll_ra_info *ra = &sbi->ll_ra_info; atomic_sub(len, &ra->ra_cur_pages); } -/* called for each page in a completed rpc.*/ -int ll_ap_completion(void *data, int cmd, struct obdo *oa, int rc) -{ - struct ll_async_page *llap; - struct page *page; - int ret = 0; - ENTRY; - - llap = llap_from_cookie(data); - page = llap->llap_page; - LASSERT(PageLocked(page)); - LASSERT(CheckWriteback(page,cmd)); - - LL_CDEBUG_PAGE(D_PAGE, page, "completing cmd %d with %d\n", cmd, rc); - - if (cmd & OBD_BRW_READ && llap->llap_defer_uptodate) - ll_ra_count_put(ll_i2sbi(page->mapping->host), 1); - - if (rc == 0) { - if (cmd & OBD_BRW_READ) { - if (!llap->llap_defer_uptodate) - SetPageUptodate(page); - } else { - llap->llap_write_queued = 0; - } - ClearPageError(page); - } else { - if (cmd & OBD_BRW_READ) { - llap->llap_defer_uptodate = 0; - } - SetPageError(page); - if (rc == -ENOSPC) - set_bit(AS_ENOSPC, &page->mapping->flags); - else - set_bit(AS_EIO, &page->mapping->flags); - } - - /* be carefull about clear WB. 
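[editor's note] ll_ra_count_put() above returns pages to the global read-ahead budget with a bare atomic_sub(); the matching ll_ra_count_get() (its body is elided in this hunk) reserves from the same ll_ra_info::ra_cur_pages counter. Below is a standalone user-space model of that reserve/release pattern, including the racy over-reservation that produces the `global sync problem' the TODO describes. All names here are illustrative, not the kernel API:

#include <stdatomic.h>
#include <stdio.h>

static atomic_ulong ra_cur_pages;                 /* models ra_cur_pages */
static const unsigned long ra_max_pages = 25600;  /* 100 MB of 4 KB pages */

/* Reserve up to 'len' pages; racing readers can each end up with 0,
 * which is exactly the zero-window scenario the comment above warns of. */
static unsigned long ra_count_get(unsigned long len)
{
        unsigned long cur = atomic_fetch_add(&ra_cur_pages, len) + len;

        if (cur > ra_max_pages) {
                unsigned long over = cur - ra_max_pages;

                if (over > len)
                        over = len;
                atomic_fetch_sub(&ra_cur_pages, over); /* give back excess */
                len -= over;
        }
        return len;
}

static void ra_count_put(unsigned long len)
{
        atomic_fetch_sub(&ra_cur_pages, len);     /* mirrors ll_ra_count_put */
}

int main(void)
{
        unsigned long got = ra_count_get(512);

        printf("reserved %lu pages\n", got);
        ra_count_put(got);
        return 0;
}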
- * if WB will cleared after page lock is released - paralel IO can be - * started before ap_make_ready is finished - so we will be have page - * with PG_Writeback set from ->writepage() and completed READ which - * clear this flag */ - if ((cmd & OBD_BRW_WRITE) && PageWriteback(page)) - end_page_writeback(page); - - unlock_page(page); - - if (cmd & OBD_BRW_WRITE) { - /* Only rc == 0, write succeed, then this page could be deleted - * from the pending_writing list - */ - if (rc == 0 && llap_write_complete(page->mapping->host, llap)) - ll_queue_done_writing(page->mapping->host, 0); - } - - page_cache_release(page); - - RETURN(ret); -} - -static void __ll_put_llap(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct obd_export *exp; - struct ll_async_page *llap; - struct ll_sb_info *sbi = ll_i2sbi(inode); - struct ll_pglist_data *pd; - int rc, cpu; - ENTRY; - - exp = ll_i2dtexp(inode); - if (exp == NULL) { - CERROR("page %p ind %lu gave null export\n", page, page->index); - EXIT; - return; - } - - llap = llap_from_page(page, LLAP_ORIGIN_REMOVEPAGE); - if (IS_ERR(llap)) { - CERROR("page %p ind %lu couldn't find llap: %ld\n", page, - page->index, PTR_ERR(llap)); - EXIT; - return; - } - - if (llap_write_complete(inode, llap)) - ll_queue_done_writing(inode, 0); - - rc = obd_teardown_async_page(exp, ll_i2info(inode)->lli_smd, NULL, - llap->llap_cookie); - if (rc != 0) - CERROR("page %p ind %lu failed: %d\n", page, page->index, rc); - - /* this unconditional free is only safe because the page lock - * is providing exclusivity to memory pressure/truncate/writeback..*/ - __clear_page_ll_data(page); - - lcounter_dec(&sbi->ll_async_page_count); - cpu = llap->llap_pglist_cpu; - pd = ll_pglist_cpu_lock(sbi, cpu); - pd->llpd_gen++; - pd->llpd_count--; - if (!list_empty(&llap->llap_pglist_item)) - list_del_init(&llap->llap_pglist_item); - ll_pglist_cpu_unlock(sbi, cpu); - OBD_SLAB_FREE(llap, ll_async_page_slab, ll_async_page_slab_size); - EXIT; -} - -/* the kernel calls us here when a page is unhashed from the page cache. - * the page will be locked and the kernel is holding a spinlock, so - * we need to be careful. we're just tearing down our book-keeping - * here. 
*/ -void ll_removepage(struct page *page) -{ - struct ll_async_page *llap = llap_cast_private(page); - ENTRY; - - LASSERT(!in_interrupt()); - - /* sync pages or failed read pages can leave pages in the page - * cache that don't have our data associated with them anymore */ - if (page_private(page) == 0) { - EXIT; - return; - } - - LASSERT(!llap->llap_lockless_io_page); - LASSERT(!llap->llap_nocache); - LL_CDEBUG_PAGE(D_PAGE, page, "being evicted\n"); - __ll_put_llap(page); - EXIT; -} - -static int ll_issue_page_read(struct obd_export *exp, - struct ll_async_page *llap, - struct obd_io_group *oig, int defer) -{ - struct page *page = llap->llap_page; - int rc; - - page_cache_get(page); - llap->llap_defer_uptodate = defer; - llap->llap_ra_used = 0; - rc = obd_queue_group_io(exp, ll_i2info(page->mapping->host)->lli_smd, - NULL, oig, llap->llap_cookie, OBD_BRW_READ, 0, - CFS_PAGE_SIZE, 0, ASYNC_COUNT_STABLE | - ASYNC_READY | ASYNC_URGENT); - if (rc) { - LL_CDEBUG_PAGE(D_ERROR, page, "read queue failed: rc %d\n", rc); - page_cache_release(page); - } - RETURN(rc); -} - static void ll_ra_stats_inc_sbi(struct ll_sb_info *sbi, enum ra_stat which) { LASSERTF(which >= 0 && which < _NR_RA_STAT, "which: %u\n", which); lprocfs_counter_incr(sbi->ll_ra_stats, which); } -static void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which) +void ll_ra_stats_inc(struct address_space *mapping, enum ra_stat which) { struct ll_sb_info *sbi = ll_i2sbi(mapping->host); ll_ra_stats_inc_sbi(sbi, which); } -void ll_ra_accounting(struct ll_async_page *llap, struct address_space *mapping) -{ - if (!llap->llap_defer_uptodate || llap->llap_ra_used) - return; - - ll_ra_stats_inc(mapping, RA_STAT_DISCARDED); -} - #define RAS_CDEBUG(ras) \ CDEBUG(D_READA, \ "lrp %lu cr %lu cp %lu ws %lu wl %lu nra %lu r %lu ri %lu" \ @@ -1380,86 +476,102 @@ struct ll_ra_read *ll_ra_read_get(struct file *f) return bead; } -static int ll_read_ahead_page(struct obd_export *exp, struct obd_io_group *oig, +static int cl_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_page *page, + struct page *vmpage) +{ + struct ccc_page *cp; + int rc; + + ENTRY; + + rc = 0; + cl_page_assume(env, io, page); + lu_ref_add(&page->cp_reference, "ra", cfs_current()); + cp = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + if (!cp->cpg_defer_uptodate && !Page_Uptodate(vmpage)) { + rc = cl_page_is_under_lock(env, io, page); + if (rc == -EBUSY) { + cp->cpg_defer_uptodate = 1; + cp->cpg_ra_used = 0; + cl_page_list_add(queue, page); + rc = 1; + } else { + cl_page_delete(env, page); + rc = -ENOLCK; + } + } else + /* skip completed pages */ + cl_page_unassume(env, io, page); + lu_ref_del(&page->cp_reference, "ra", cfs_current()); + cl_page_put(env, page); + RETURN(rc); +} + +/** + * Initiates read-ahead of a page with given index. + * + * \retval +ve: page was added to \a queue. + * + * \retval -ENOLCK: there is no extent lock for this part of a file, stop + * read-ahead. + * + * \retval -ve, 0: page wasn't added to \a queue for other reason. 
+ */ +static int ll_read_ahead_page(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, int index, struct address_space *mapping) { - struct ll_async_page *llap; - struct page *page; - unsigned int gfp_mask = 0; - int rc = 0; + struct page *vmpage; + struct cl_object *clob = ll_i2info(mapping->host)->lli_clob; + struct cl_page *page; + enum ra_stat which = _NR_RA_STAT; /* keep gcc happy */ + unsigned int gfp_mask; + int rc = 0; + const char *msg = NULL; + + ENTRY; gfp_mask = GFP_HIGHUSER & ~__GFP_WAIT; #ifdef __GFP_NOWARN gfp_mask |= __GFP_NOWARN; #endif - page = grab_cache_page_nowait_gfp(mapping, index, gfp_mask); - if (page == NULL) { - ll_ra_stats_inc(mapping, RA_STAT_FAILED_GRAB_PAGE); - CDEBUG(D_READA, "g_c_p_n failed\n"); - return 0; - } - - /* Check if page was truncated or reclaimed */ - if (page->mapping != mapping) { - ll_ra_stats_inc(mapping, RA_STAT_WRONG_GRAB_PAGE); - CDEBUG(D_READA, "g_c_p_n returned invalid page\n"); - GOTO(unlock_page, rc = 0); - } - - /* we do this first so that we can see the page in the /proc - * accounting */ - llap = llap_from_page(page, LLAP_ORIGIN_READAHEAD); - if (IS_ERR(llap) || llap->llap_defer_uptodate) { - if (PTR_ERR(llap) == -ENOLCK) { - ll_ra_stats_inc(mapping, RA_STAT_FAILED_MATCH); - CDEBUG(D_READA | D_PAGE, - "Adding page to cache failed index " - "%d\n", index); - CDEBUG(D_READA, "nolock page\n"); - GOTO(unlock_page, rc = -ENOLCK); + vmpage = grab_cache_page_nowait_gfp(mapping, index, gfp_mask); + if (vmpage != NULL) { + /* Check if vmpage was truncated or reclaimed */ + if (vmpage->mapping == mapping) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + rc = cl_read_ahead_page(env, io, queue, + page, vmpage); + if (rc == -ENOLCK) { + which = RA_STAT_FAILED_MATCH; + msg = "lock match failed"; + } + } else { + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "cl_page_find failed"; + } + } else { + which = RA_STAT_WRONG_GRAB_PAGE; + msg = "g_c_p_n returned invalid page"; } - CDEBUG(D_READA, "read-ahead page\n"); - GOTO(unlock_page, rc = 0); - } - - /* skip completed pages */ - if (Page_Uptodate(page)) - GOTO(unlock_page, rc = 0); - - /* bail out when we hit the end of the lock. */ - rc = ll_issue_page_read(exp, llap, oig, 1); - if (rc == 0) { - LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "started read-ahead\n"); - rc = 1; + if (rc != 1) + unlock_page(vmpage); + page_cache_release(vmpage); } else { -unlock_page: - unlock_page(page); - LL_CDEBUG_PAGE(D_READA | D_PAGE, page, "skipping read-ahead\n"); + which = RA_STAT_FAILED_GRAB_PAGE; + msg = "g_c_p_n failed"; + } + if (msg != NULL) { + ll_ra_stats_inc(mapping, which); + CDEBUG(D_READA, "%s\n", msg); } - page_cache_release(page); - return rc; + RETURN(rc); } -/* ra_io_arg will be filled in the beginning of ll_readahead with - * ras_lock, then the following ll_read_ahead_pages will read RA - * pages according to this arg, all the items in this structure are - * counted by page index. - */ -struct ra_io_arg { - unsigned long ria_start; /* start offset of read-ahead*/ - unsigned long ria_end; /* end offset of read-ahead*/ - /* If stride read pattern is detected, ria_stoff means where - * stride read is started. Note: for normal read-ahead, the - * value here is meaningless, and also it will not be accessed*/ - pgoff_t ria_stoff; - /* ria_length and ria_pages are the length and pages length in the - * stride I/O mode. 
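[editor's note] The \retval convention documented above (+1 page queued, -ENOLCK stop, any other value skip) is what the batching loop keys on; ll_read_ahead_pages() in the next hunk consumes it in essentially this shape:

/* Sketch of a consumer of ll_read_ahead_page()'s return convention. */
for (page_idx = start; page_idx <= end && reserved > 0; page_idx++) {
        rc = ll_read_ahead_page(env, io, queue, page_idx, mapping);
        if (rc == 1) {
                reserved--;     /* page queued: one reservation consumed */
                count++;
        } else if (rc == -ENOLCK) {
                break;          /* no extent lock past this index: stop */
        }
        /* any other rc: this index is skipped, the scan continues */
}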
And they will also be used to check whether - * it is stride I/O read-ahead in the read-ahead pages*/ - unsigned long ria_length; - unsigned long ria_pages; -}; - -#define RIA_DEBUG(ria) \ +#define RIA_DEBUG(ria) \ CDEBUG(D_READA, "rs %lu re %lu ro %lu rl %lu rp %lu\n", \ ria->ria_start, ria->ria_end, ria->ria_stoff, ria->ria_length,\ ria->ria_pages) @@ -1522,9 +634,9 @@ static int ras_inside_ra_window(unsigned long idx, struct ra_io_arg *ria) (idx - ria->ria_stoff) % ria->ria_length < ria->ria_pages; } -static int ll_read_ahead_pages(struct obd_export *exp, - struct obd_io_group *oig, - struct ra_io_arg *ria, +static int ll_read_ahead_pages(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *queue, + struct ra_io_arg *ria, unsigned long *reserved_pages, struct address_space *mapping, unsigned long *ra_end) @@ -1540,17 +652,18 @@ static int ll_read_ahead_pages(struct obd_export *exp, *reserved_pages > 0; page_idx++) { if (ras_inside_ra_window(page_idx, ria)) { /* If the page is inside the read-ahead window*/ - rc = ll_read_ahead_page(exp, oig, page_idx, mapping); - if (rc == 1) { - (*reserved_pages)--; - count ++; - } else if (rc == -ENOLCK) - break; + rc = ll_read_ahead_page(env, io, queue, + page_idx, mapping); + if (rc == 1) { + (*reserved_pages)--; + count ++; + } else if (rc == -ENOLCK) + break; } else if (stride_ria) { /* If it is not in the read-ahead window, and it is * read-ahead mode, then check whether it should skip * the stride gap */ - pgoff_t offset; + pgoff_t offset; /* FIXME: This assertion only is valid when it is for * forward read-ahead, it will be fixed when backward * read-ahead is implemented */ @@ -1559,9 +672,9 @@ static int ll_read_ahead_pages(struct obd_export *exp, " offset %lu \n", page_idx, ria->ria_stoff); offset = page_idx - ria->ria_stoff; - offset = offset % (ria->ria_length); - if (offset > ria->ria_pages) { - page_idx += ria->ria_length - offset; + offset = offset % (ria->ria_length); + if (offset > ria->ria_pages) { + page_idx += ria->ria_length - offset; CDEBUG(D_READA, "i %lu skip %lu \n", page_idx, ria->ria_length - offset); continue; @@ -1572,43 +685,56 @@ static int ll_read_ahead_pages(struct obd_export *exp, return count; } -static int ll_readahead(struct ll_readahead_state *ras, - struct obd_export *exp, struct address_space *mapping, - struct obd_io_group *oig, int flags) +int ll_readahead(const struct lu_env *env, struct cl_io *io, + struct ll_readahead_state *ras, struct address_space *mapping, + struct cl_page_list *queue, int flags) { + struct vvp_io *vio = vvp_env_io(env); + struct vvp_thread_info *vti = vvp_env_info(env); + struct ccc_thread_info *cti = ccc_env_info(env); unsigned long start = 0, end = 0, reserved; unsigned long ra_end, len; struct inode *inode; - struct lov_stripe_md *lsm; struct ll_ra_read *bead; - struct ost_lvb lvb; - struct ra_io_arg ria = { 0 }; + struct ra_io_arg *ria = &vti->vti_ria; + struct ll_inode_info *lli; + struct cl_object *clob; + struct cl_attr *attr = &cti->cti_attr; int ret = 0; __u64 kms; ENTRY; inode = mapping->host; - lsm = ll_i2info(inode)->lli_smd; + lli = ll_i2info(inode); + clob = lli->lli_clob; + + memset(ria, 0, sizeof *ria); + + cl_object_attr_lock(clob); + ret = cl_object_attr_get(env, clob, attr); + cl_object_attr_unlock(clob); - lov_stripe_lock(lsm); - inode_init_lvb(inode, &lvb); - obd_merge_lvb(ll_i2dtexp(inode), lsm, &lvb, 1); - kms = lvb.lvb_size; - lov_stripe_unlock(lsm); + if (ret != 0) + RETURN(ret); + kms = attr->cat_kms; if (kms == 0) { ll_ra_stats_inc(mapping, 
RA_STAT_ZERO_LEN); RETURN(0); } spin_lock(&ras->ras_lock); - bead = ll_ra_read_get_locked(ras); + if (vio->cui_ra_window_set) + bead = &vio->cui_bead; + else + bead = NULL; + /* Enlarge the RA window to encompass the full read */ if (bead != NULL && ras->ras_window_start + ras->ras_window_len < bead->lrr_start + bead->lrr_count) { ras->ras_window_len = bead->lrr_start + bead->lrr_count - ras->ras_window_start; } - /* Reserve a part of the read-ahead window that we'll be issuing */ + /* Reserve a part of the read-ahead window that we'll be issuing */ if (ras->ras_window_len) { start = ras->ras_next_readahead; end = ras->ras_window_start + ras->ras_window_len - 1; @@ -1619,13 +745,13 @@ static int ll_readahead(struct ll_readahead_state *ras, ras->ras_next_readahead = max(end, end + 1); RAS_CDEBUG(ras); } - ria.ria_start = start; - ria.ria_end = end; + ria->ria_start = start; + ria->ria_end = end; /* If stride I/O mode is detected, get stride window*/ if (stride_io_mode(ras)) { - ria.ria_stoff = ras->ras_stride_offset; - ria.ria_length = ras->ras_stride_length; - ria.ria_pages = ras->ras_stride_pages; + ria->ria_stoff = ras->ras_stride_offset; + ria->ria_length = ras->ras_stride_length; + ria->ria_pages = ras->ras_stride_pages; } spin_unlock(&ras->ras_lock); @@ -1633,7 +759,7 @@ static int ll_readahead(struct ll_readahead_state *ras, ll_ra_stats_inc(mapping, RA_STAT_ZERO_WINDOW); RETURN(0); } - len = ria_page_count(&ria); + len = ria_page_count(ria); if (len == 0) RETURN(0); @@ -1643,8 +769,9 @@ static int ll_readahead(struct ll_readahead_state *ras, ll_ra_stats_inc(mapping, RA_STAT_MAX_IN_FLIGHT); CDEBUG(D_READA, "reserved page %lu \n", reserved); - - ret = ll_read_ahead_pages(exp, oig, &ria, &reserved, mapping, &ra_end); + + ret = ll_read_ahead_pages(env, io, queue, + ria, &reserved, mapping, &ra_end); LASSERTF(reserved >= 0, "reserved %lu\n", reserved); if (reserved != 0) @@ -1659,15 +786,15 @@ static int ll_readahead(struct ll_readahead_state *ras, * if the region we failed to issue read-ahead on is still ahead * of the app and behind the next index to start read-ahead from */ CDEBUG(D_READA, "ra_end %lu end %lu stride end %lu \n", - ra_end, end, ria.ria_end); + ra_end, end, ria->ria_end); - if (ra_end != (end + 1)) { + if (ra_end != end + 1) { spin_lock(&ras->ras_lock); if (ra_end < ras->ras_next_readahead && index_in_window(ra_end, ras->ras_window_start, 0, ras->ras_window_len)) { - ras->ras_next_readahead = ra_end; - RAS_CDEBUG(ras); + ras->ras_next_readahead = ra_end; + RAS_CDEBUG(ras); } spin_unlock(&ras->ras_lock); } @@ -1710,7 +837,7 @@ void ll_readahead_init(struct inode *inode, struct ll_readahead_state *ras) INIT_LIST_HEAD(&ras->ras_read_beads); } -/* +/* * Check whether the read request is in the stride window. * If it is in the stride window, return 1, otherwise return 0. 
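[editor's note] Both the stride detector and the window test below reduce to small integer arithmetic: ras_update_stride_detector() derives stride_length as gap plus consecutive pages, and ras_inside_ra_window() (earlier in this file) tests (idx - stoff) % length < pages. A standalone walk-through with made-up numbers:

#include <stdio.h>

/* Same predicate ras_inside_ra_window() uses for stride IO. */
static int inside_stride(unsigned long idx, unsigned long stoff,
                         unsigned long length, unsigned long pages)
{
        return (idx - stoff) % length < pages;
}

int main(void)
{
        /* A reader touches pages 100..103, then jumps to 116: at index
         * 116, last_readpage = 103 and consecutive_pages = 4. */
        unsigned long stride_gap    = 116 - 103 - 1;    /* 12 */
        unsigned long stride_pages  = 4;                /* burst size */
        unsigned long stride_length = stride_gap + 4;   /* 16 = burst + gap */

        /* Bursts are therefore 100..103, 116..119, 132..135, ... */
        printf("117 inside: %d\n",
               inside_stride(117, 100, stride_length, stride_pages)); /* 1 */
        printf("110 inside: %d\n",
               inside_stride(110, 100, stride_length, stride_pages)); /* 0 */
        return 0;
}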
*/ @@ -1719,7 +846,7 @@ static int index_in_stride_window(unsigned long index, struct inode *inode) { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - + if (ras->ras_stride_length == 0 || ras->ras_stride_pages == 0) return 0; @@ -1729,7 +856,7 @@ static int index_in_stride_window(unsigned long index, /* Otherwise check the stride by itself */ return (ras->ras_stride_length - ras->ras_stride_pages) == stride_gap && - ras->ras_consecutive_pages == ras->ras_stride_pages; + ras->ras_consecutive_pages == ras->ras_stride_pages; } static void ras_update_stride_detector(struct ll_readahead_state *ras, @@ -1737,7 +864,7 @@ static void ras_update_stride_detector(struct ll_readahead_state *ras, { unsigned long stride_gap = index - ras->ras_last_readpage - 1; - if (!stride_io_mode(ras) && (stride_gap != 0 || + if (!stride_io_mode(ras) && (stride_gap != 0 || ras->ras_consecutive_stride_requests == 0)) { ras->ras_stride_pages = ras->ras_consecutive_pages; ras->ras_stride_length = stride_gap +ras->ras_consecutive_pages; @@ -1809,14 +936,15 @@ static void ras_set_stride_offset(struct ll_readahead_state *ras) RAS_CDEBUG(ras); } -static void ras_update(struct ll_sb_info *sbi, struct inode *inode, - struct ll_readahead_state *ras, unsigned long index, - unsigned hit) +void ras_update(struct ll_sb_info *sbi, struct inode *inode, + struct ll_readahead_state *ras, unsigned long index, + unsigned hit) { struct ll_ra_info *ra = &sbi->ll_ra_info; int zero = 0, stride_detect = 0, ra_miss = 0; ENTRY; + spin_lock(&sbi->ll_lock); spin_lock(&ras->ras_lock); ll_ra_stats_inc_sbi(sbi, hit ? RA_STAT_HIT : RA_STAT_MISS); @@ -1879,14 +1007,14 @@ static void ras_update(struct ll_sb_info *sbi, struct inode *inode, if (ra_miss) { if (index_in_stride_window(index, ras, inode) && stride_io_mode(ras)) { - /*If stride-RA hit cache miss, the stride dector + /*If stride-RA hit cache miss, the stride dector *will not be reset to avoid the overhead of *redetecting read-ahead mode */ if (index != ras->ras_last_readpage + 1) ras->ras_consecutive_pages = 0; RAS_CDEBUG(ras); } else { - /* Reset both stride window and normal RA window */ + /* Reset both stride window and normal RA window */ ras_reset(ras, index); ras->ras_consecutive_pages++; ras_stride_reset(ras); @@ -1940,433 +1068,107 @@ out_unlock: RAS_CDEBUG(ras); ras->ras_request_index++; spin_unlock(&ras->ras_lock); + spin_unlock(&sbi->ll_lock); return; } -int ll_writepage(struct page *page) -{ - struct inode *inode = page->mapping->host; - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_export *exp; - struct ll_async_page *llap; - int rc = 0; - ENTRY; - - LASSERT(PageLocked(page)); - - exp = ll_i2dtexp(inode); - if (exp == NULL) - GOTO(out, rc = -EINVAL); - - llap = llap_from_page(page, LLAP_ORIGIN_WRITEPAGE); - if (IS_ERR(llap)) - GOTO(out, rc = PTR_ERR(llap)); - - LASSERT(!llap->llap_nocache); - LASSERT(!PageWriteback(page)); - set_page_writeback(page); - - page_cache_get(page); - if (llap->llap_write_queued) { - LL_CDEBUG_PAGE(D_PAGE, page, "marking urgent\n"); - rc = obd_set_async_flags(exp, lli->lli_smd, NULL, - llap->llap_cookie, - ASYNC_READY | ASYNC_URGENT); - } else { - rc = queue_or_sync_write(exp, inode, llap, CFS_PAGE_SIZE, - ASYNC_READY | ASYNC_URGENT); - } - if (rc) { - /* re-dirty page on error so it retries write */ - if (PageWriteback(page)) - end_page_writeback(page); - - /* resend page only for not started IO*/ - if (!PageError(page)) - ll_redirty_page(page); - - page_cache_release(page); - } -out: - if (rc) { - if (!lli->lli_async_rc) - 
lli->lli_async_rc = rc; - /* resend page only for not started IO*/ - unlock_page(page); - } - RETURN(rc); -} - -/* - * for now we do our readpage the same on both 2.4 and 2.5. The kernel's - * read-ahead assumes it is valid to issue readpage all the way up to - * i_size, but our dlm locks make that not the case. We disable the - * kernel's read-ahead and do our own by walking ahead in the page cache - * checking for dlm lock coverage. the main difference between 2.4 and - * 2.6 is how read-ahead gets batched and issued, but we're using our own, - * so they look the same. - */ -int ll_readpage(struct file *filp, struct page *page) +int ll_writepage(struct page *vmpage, struct writeback_control *_) { - struct ll_file_data *fd = LUSTRE_FPRIVATE(filp); - struct inode *inode = page->mapping->host; - struct obd_export *exp; - struct ll_async_page *llap; - struct obd_io_group *oig = NULL; - struct lustre_handle *lockh = NULL; - int rc; + struct inode *inode = vmpage->mapping->host; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + struct cl_object *clob; + struct cl_2queue *queue; + struct cl_env_nest nest; + int result; ENTRY; - LASSERT(PageLocked(page)); - LASSERT(!PageUptodate(page)); - CDEBUG(D_VFSTRACE, "VFS Op:inode=%lu/%u(%p),offset=%Lu=%#Lx\n", - inode->i_ino, inode->i_generation, inode, - (((loff_t)page->index) << CFS_PAGE_SHIFT), - (((loff_t)page->index) << CFS_PAGE_SHIFT)); - LASSERT(atomic_read(&filp->f_dentry->d_inode->i_count) > 0); - - if (!ll_i2info(inode)->lli_smd) { - /* File with no objects - one big hole */ - /* We use this just for remove_from_page_cache that is not - * exported, we'd make page back up to date. */ - ll_truncate_complete_page(page); - clear_page(kmap(page)); - kunmap(page); - SetPageUptodate(page); - unlock_page(page); - RETURN(0); - } - - rc = oig_init(&oig); - if (rc < 0) - GOTO(out, rc); - - exp = ll_i2dtexp(inode); - if (exp == NULL) - GOTO(out, rc = -EINVAL); + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); - if (fd->fd_flags & LL_FILE_GROUP_LOCKED) - lockh = &fd->fd_cwlockh; - - llap = llap_from_page_with_lockh(page, LLAP_ORIGIN_READPAGE, lockh); - if (IS_ERR(llap)) { - if (PTR_ERR(llap) == -ENOLCK) { - CWARN("ino %lu page %lu (%llu) not covered by " - "a lock (mmap?). check debug logs.\n", - inode->i_ino, page->index, - (long long)page->index << PAGE_CACHE_SHIFT); - } - GOTO(out, rc = PTR_ERR(llap)); - } - - if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) - ras_update(ll_i2sbi(inode), inode, &fd->fd_ras, page->index, - llap->llap_defer_uptodate); - - - if (llap->llap_defer_uptodate) { - /* This is the callpath if we got the page from a readahead */ - llap->llap_ra_used = 1; - rc = ll_readahead(&fd->fd_ras, exp, page->mapping, oig, - fd->fd_flags); - if (rc > 0) - obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, - NULL, oig); - LL_CDEBUG_PAGE(D_PAGE, page, "marking uptodate from defer\n"); - SetPageUptodate(page); - unlock_page(page); - GOTO(out_oig, rc = 0); - } - - rc = ll_issue_page_read(exp, llap, oig, 0); - if (rc) - GOTO(out, rc); - - LL_CDEBUG_PAGE(D_PAGE, page, "queued readpage\n"); - /* We have just requested the actual page we want, see if we can tack - * on some readahead to that page's RPC before it is sent. 
*/ - if (ll_i2sbi(inode)->ll_ra_info.ra_max_pages) - ll_readahead(&fd->fd_ras, exp, page->mapping, oig, - fd->fd_flags); - - rc = obd_trigger_group_io(exp, ll_i2info(inode)->lli_smd, NULL, oig); - -out: - if (rc) - unlock_page(page); -out_oig: - if (oig != NULL) - oig_release(oig); - RETURN(rc); -} - -static void ll_file_put_pages(struct page **pages, int numpages) -{ - int i; - struct page **pp; - ENTRY; - - for (i = 0, pp = pages; i < numpages; i++, pp++) { - if (*pp) { - LL_CDEBUG_PAGE(D_PAGE, (*pp), "free\n"); - __ll_put_llap(*pp); - if (page_private(*pp)) - CERROR("the llap wasn't freed\n"); - (*pp)->mapping = NULL; - if (page_count(*pp) != 1) - CERROR("page %p, flags %#lx, count %i, private %p\n", - (*pp), (unsigned long)(*pp)->flags, page_count(*pp), - (void*)page_private(*pp)); - __free_pages(*pp, 0); - } - } - OBD_FREE(pages, numpages * sizeof(struct page*)); - EXIT; -} - -static struct page **ll_file_prepare_pages(int numpages, struct inode *inode, - unsigned long first) -{ - struct page **pages; - int i; - int rc = 0; - ENTRY; - - OBD_ALLOC(pages, sizeof(struct page *) * numpages); - if (pages == NULL) - RETURN(ERR_PTR(-ENOMEM)); - for (i = 0; i < numpages; i++) { - struct page *page; - struct ll_async_page *llap; - - page = alloc_pages(GFP_HIGHUSER, 0); - if (page == NULL) - GOTO(err, rc = -ENOMEM); - pages[i] = page; - /* llap_from_page needs page index and mapping to be set */ - page->index = first++; - page->mapping = inode->i_mapping; - llap = llap_from_page(page, LLAP_ORIGIN_LOCKLESS_IO); - if (IS_ERR(llap)) - GOTO(err, rc = PTR_ERR(llap)); - llap->llap_lockless_io_page = 1; - } - RETURN(pages); -err: - ll_file_put_pages(pages, numpages); - RETURN(ERR_PTR(rc)); - } - -static ssize_t ll_file_copy_pages(struct page **pages, int numpages, - char *buf, loff_t pos, size_t count, int rw) -{ - ssize_t amount = 0; - int i; - int updatechecksum = ll_i2sbi(pages[0]->mapping->host)->ll_flags & - LL_SBI_CHECKSUM; - ENTRY; + if (ll_i2dtexp(inode) == NULL) + RETURN(-EINVAL); - for (i = 0; i < numpages; i++) { - unsigned offset, bytes, left; - char *vaddr; - - vaddr = kmap(pages[i]); - offset = pos & (CFS_PAGE_SIZE - 1); - bytes = min_t(unsigned, CFS_PAGE_SIZE - offset, count); - LL_CDEBUG_PAGE(D_PAGE, pages[i], "op = %s, addr = %p, " - "buf = %p, bytes = %u\n", - (rw == WRITE) ? "CFU" : "CTU", - vaddr + offset, buf, bytes); - if (rw == WRITE) { - left = copy_from_user(vaddr + offset, buf, bytes); - if (updatechecksum) { - struct ll_async_page *llap; - - llap = llap_cast_private(pages[i]); - llap->llap_checksum = crc32_le(0, vaddr, - CFS_PAGE_SIZE); + env = cl_env_nested_get(&nest); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + io = &ccc_env_info(env)->cti_io; + queue = &vvp_env_info(env)->vti_queue; + clob = ll_i2info(inode)->lli_clob; + LASSERT(clob != NULL); + + io->ci_obj = clob; + result = cl_io_init(env, io, CIT_MISC, clob); + if (result == 0) { + page = cl_page_find(env, clob, vmpage->index, + vmpage, CPT_CACHEABLE); + if (!IS_ERR(page)) { + lu_ref_add(&page->cp_reference, "writepage", + cfs_current()); + cl_page_assume(env, io, page); + /* + * Mark page dirty, because this is what + * ->vio_submit()->cpo_prep_write() assumes. + * + * XXX better solution is to detect this from within + * cl_io_submit_rw() somehow. 
+ */ + set_page_dirty(vmpage); + cl_2queue_init_page(queue, page); + result = cl_io_submit_rw(env, io, CRT_WRITE, queue); + cl_page_list_disown(env, io, &queue->c2_qin); + if (result != 0) { + /* + * There is no need to clear PG_writeback, as + * cl_io_submit_rw() calls completion callback + * on failure. + */ + /* + * Re-dirty page on error so it retries write, + * but not in case when IO has actually + * occurred and completed with an error. + */ + if (!PageError(vmpage)) + set_page_dirty(vmpage); } - } else { - left = copy_to_user(buf, vaddr + offset, bytes); - } - kunmap(pages[i]); - amount += bytes; - if (left) { - amount -= left; - break; + LASSERT(!cl_page_is_owned(page, io)); + lu_ref_del(&page->cp_reference, + "writepage", cfs_current()); + cl_page_put(env, page); + cl_2queue_fini(env, queue); } - buf += bytes; - count -= bytes; - pos += bytes; } - if (amount == 0) - RETURN(-EFAULT); - RETURN(amount); + cl_io_fini(env, io); + cl_env_nested_put(&nest, env); + RETURN(result); } -static int ll_file_oig_pages(struct inode * inode, struct page **pages, - int numpages, loff_t pos, size_t count, int rw) +int ll_readpage(struct file *file, struct page *vmpage) { - struct obd_io_group *oig; - struct ll_inode_info *lli = ll_i2info(inode); - struct obd_export *exp; - loff_t org_pos = pos; - obd_flag brw_flags; - int rc; - int i; + struct lu_env *env; + struct cl_io *io; + struct cl_page *page; + int result; + int refcheck; ENTRY; - exp = ll_i2dtexp(inode); - if (exp == NULL) - RETURN(-EINVAL); - rc = oig_init(&oig); - if (rc) - RETURN(rc); - brw_flags = OBD_BRW_SRVLOCK; - if (cfs_capable(CFS_CAP_SYS_RESOURCE)) - brw_flags |= OBD_BRW_NOQUOTA; - - for (i = 0; i < numpages; i++) { - struct ll_async_page *llap; - unsigned from, bytes; - - from = pos & (CFS_PAGE_SIZE - 1); - bytes = min_t(unsigned, CFS_PAGE_SIZE - from, - count - pos + org_pos); - llap = llap_cast_private(pages[i]); - LASSERT(llap); - - lock_page(pages[i]); - - LL_CDEBUG_PAGE(D_PAGE, pages[i], "offset "LPU64"," - " from %u, bytes = %u\n", - (__u64)pos, from, bytes); - LASSERTF(pos >> CFS_PAGE_SHIFT == pages[i]->index, - "wrong page index %lu (%lu)\n", - pages[i]->index, - (unsigned long)(pos >> CFS_PAGE_SHIFT)); - rc = obd_queue_group_io(exp, lli->lli_smd, NULL, oig, - llap->llap_cookie, - (rw == WRITE) ? - OBD_BRW_WRITE:OBD_BRW_READ, - from, bytes, brw_flags, - ASYNC_READY | ASYNC_URGENT | - ASYNC_COUNT_STABLE | ASYNC_GROUP_SYNC); - if (rc) { - i++; - GOTO(out, rc); + result = ll_cl_init(file, vmpage, &env, &io, &page, &refcheck); + if (result == 0) { + LASSERT(page->cp_type == CPT_CACHEABLE); + if (likely(!PageUptodate(vmpage))) { + cl_page_assume(env, io, page); + result = cl_io_read_page(env, io, page); + } else { + /* Page from a non-object file. 
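[editor's note] ll_writepage() above is the template for single-page synchronous submission in this patch: take ownership, wrap the VM page into a cl_2queue, submit, then disown and tear the queue down regardless of the result. The skeleton, using only the calls visible above:

/* Skeleton of the single-page submit in ll_writepage(). */
cl_page_assume(env, io, page);          /* take ownership for this IO   */
set_page_dirty(vmpage);                 /* what cpo_prep_write() expects */
cl_2queue_init_page(queue, page);       /* queue := { page }            */
result = cl_io_submit_rw(env, io, CRT_WRITE, queue);
cl_page_list_disown(env, io, &queue->c2_qin);  /* whatever was not sent */
/* on error: re-dirty vmpage unless PageError() says IO really ran */
cl_2queue_fini(env, queue);             /* drop the queue's references  */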
*/ + LASSERT(!ll_i2info(vmpage->mapping->host)->lli_smd); + unlock_page(vmpage); + result = 0; } - pos += bytes; } - rc = obd_trigger_group_io(exp, lli->lli_smd, NULL, oig); - if (rc) - GOTO(out, rc); - rc = oig_wait(oig); -out: - while(--i >= 0) - unlock_page(pages[i]); - oig_release(oig); - RETURN(rc); + LASSERT(!cl_page_is_owned(page, io)); + ll_cl_fini(env, io, page, &refcheck); + RETURN(result); } -ssize_t ll_file_lockless_io(struct file *file, char *buf, size_t count, - loff_t *ppos, int rw) -{ - loff_t pos; - struct inode *inode = file->f_dentry->d_inode; - ssize_t rc = 0; - int max_pages; - size_t amount = 0; - unsigned long first, last; - ENTRY; - - if (rw == READ) { - loff_t isize; - - ll_inode_size_lock(inode, 0); - isize = i_size_read(inode); - ll_inode_size_unlock(inode, 0); - if (*ppos >= isize) - GOTO(out, rc = 0); - if (*ppos + count >= isize) - count -= *ppos + count - isize; - if (count == 0) - GOTO(out, rc); - } else { - rc = generic_write_checks(file, ppos, &count, 0); - if (rc) - GOTO(out, rc); - rc = ll_remove_suid(file->f_dentry, file->f_vfsmnt); - if (rc) - GOTO(out, rc); - } - pos = *ppos; - first = pos >> CFS_PAGE_SHIFT; - last = (pos + count - 1) >> CFS_PAGE_SHIFT; - max_pages = PTLRPC_MAX_BRW_PAGES * - ll_i2info(inode)->lli_smd->lsm_stripe_count; - CDEBUG(D_INFO, "%u, stripe_count = %u\n", - PTLRPC_MAX_BRW_PAGES /* max_pages_per_rpc */, - ll_i2info(inode)->lli_smd->lsm_stripe_count); - - while (first <= last && rc >= 0) { - int pages_for_io; - struct page **pages; - size_t bytes = count - amount; - - pages_for_io = min_t(int, last - first + 1, max_pages); - pages = ll_file_prepare_pages(pages_for_io, inode, first); - if (IS_ERR(pages)) { - rc = PTR_ERR(pages); - break; - } - if (rw == WRITE) { - rc = ll_file_copy_pages(pages, pages_for_io, buf, - pos + amount, bytes, rw); - if (rc < 0) - GOTO(put_pages, rc); - bytes = rc; - } - rc = ll_file_oig_pages(inode, pages, pages_for_io, - pos + amount, bytes, rw); - if (rc) - GOTO(put_pages, rc); - if (rw == READ) { - rc = ll_file_copy_pages(pages, pages_for_io, buf, - pos + amount, bytes, rw); - if (rc < 0) - GOTO(put_pages, rc); - bytes = rc; - } - amount += bytes; - buf += bytes; -put_pages: - ll_file_put_pages(pages, pages_for_io); - first += pages_for_io; - /* a short read/write check */ - if (pos + amount < ((loff_t)first << CFS_PAGE_SHIFT)) - break; - } - /* NOTE: don't update i_size and KMS in absence of LDLM locks even - * write makes the file large */ - file_accessed(file); - if (rw == READ && amount < count && rc == 0) { - unsigned long not_cleared; - - not_cleared = clear_user(buf, count - amount); - amount = count - not_cleared; - if (not_cleared) - rc = -EFAULT; - } - if (amount > 0) { - lprocfs_counter_add(ll_i2sbi(inode)->ll_stats, - (rw == WRITE) ? - LPROC_LL_LOCKLESS_WRITE : - LPROC_LL_LOCKLESS_READ, - (long)amount); - *ppos += amount; - RETURN(amount); - } -out: - RETURN(rc); -} diff --git a/lustre/llite/rw26.c b/lustre/llite/rw26.c index 1b0d1bc..031b1ab 100644 --- a/lustre/llite/rw26.c +++ b/lustre/llite/rw26.c @@ -68,28 +68,68 @@ #include "llite_internal.h" #include -static int ll_writepage_26(struct page *page, struct writeback_control *wbc) +/** + * Implements Linux VM address_space::invalidatepage() method. This method is + * called when the page is truncate from a file, either as a result of + * explicit truncate, or when inode is removed from memory (as a result of + * final iput(), umount, or memory pressure induced icache shrinking). 
+ * + * [0, offset] bytes of the page remain valid (this is for a case of not-page + * aligned truncate). Lustre leaves partially truncated page in the cache, + * relying on struct inode::i_size to limit further accesses. + */ +static int cl_invalidatepage(struct page *vmpage, unsigned long offset) { - return ll_writepage(page); + struct inode *inode; + struct lu_env *env; + struct cl_page *page; + struct cl_object *obj; + + int result; + int refcheck; + + LASSERT(PageLocked(vmpage)); + LASSERT(!PageWriteback(vmpage)); + + /* + * It is safe to not check anything in invalidatepage/releasepage + * below because they are run with page locked and all our io is + * happening with locked page too + */ + result = 0; + if (offset == 0) { + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + inode = vmpage->mapping->host; + obj = ll_i2info(inode)->lli_clob; + if (obj != NULL) { + page = cl_vmpage_page(vmpage, obj); + if (page != NULL) { + lu_ref_add(&page->cp_reference, + "delete", vmpage); + cl_page_delete(env, page); + result = 1; + lu_ref_del(&page->cp_reference, + "delete", vmpage); + cl_page_put(env, page); + } + } else + LASSERT(vmpage->private == 0); + cl_env_put(env, &refcheck); + } + } + return result; } -/* It is safe to not check anything in invalidatepage/releasepage below - because they are run with page locked and all our io is happening with - locked page too */ #ifdef HAVE_INVALIDATEPAGE_RETURN_INT static int ll_invalidatepage(struct page *page, unsigned long offset) { - if (offset) - return 0; - if (PagePrivate(page)) - ll_removepage(page); - return 1; + return cl_invalidatepage(page, offset); } -#else +#else /* !HAVE_INVALIDATEPAGE_RETURN_INT */ static void ll_invalidatepage(struct page *page, unsigned long offset) { - if (offset == 0 && PagePrivate(page)) - ll_removepage(page); + cl_invalidatepage(page, offset); } #endif @@ -100,22 +140,34 @@ static void ll_invalidatepage(struct page *page, unsigned long offset) #endif static int ll_releasepage(struct page *page, RELEASEPAGE_ARG_TYPE gfp_mask) { - if (PagePrivate(page)) - ll_removepage(page); + void *cookie; + + cookie = cl_env_reenter(); + ll_invalidatepage(page, 0); + cl_env_reexit(cookie); return 1; } -static int ll_set_page_dirty(struct page *page) +static int ll_set_page_dirty(struct page *vmpage) { - struct ll_async_page *llap; - ENTRY; - - llap = llap_from_page(page, LLAP_ORIGIN_UNKNOWN); - if (IS_ERR(llap)) - RETURN(PTR_ERR(llap)); - - llap_write_pending(page->mapping->host, llap); - RETURN(__set_page_dirty_nobuffers(page)); +#if 0 + struct cl_page *page = vvp_vmpage_page_transient(vmpage); + struct vvp_object *obj = cl_inode2vvp(vmpage->mapping->host); + struct vvp_page *cpg; + + /* + * XXX should page method be called here? + */ + LASSERT(&obj->co_cl == page->cp_obj); + cpg = cl2vvp_page(cl_page_at(page, &vvp_device_type)); + /* + * XXX cannot do much here, because page is possibly not locked: + * sys_munmap()->... + * ->unmap_page_range()->zap_pte_range()->set_page_dirty(). 
+ */ + vvp_write_pending(obj, cpg); +#endif + RETURN(__set_page_dirty_nobuffers(vmpage)); } #define MAX_DIRECTIO_SIZE 2*1024*1024*1024UL @@ -164,45 +216,116 @@ static void ll_free_user_pages(struct page **pages, int npages, int do_dirty) OBD_FREE(pages, npages * sizeof(*pages)); } -static ssize_t ll_direct_IO_26_seg(int rw, struct inode *inode, +static ssize_t ll_direct_IO_26_seg(const struct lu_env *env, struct cl_io *io, + int rw, struct inode *inode, struct address_space *mapping, - struct obd_info *oinfo, - struct ptlrpc_request_set *set, size_t size, loff_t file_offset, struct page **pages, int page_count) { - struct brw_page *pga; - int i, rc = 0; - size_t length; + struct cl_page *clp; + struct ccc_page *clup; + struct cl_2queue *queue; + struct cl_object *obj = io->ci_obj; + struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io; + int i; + ssize_t rc = 0; + ssize_t size_orig = size; + size_t page_size = cl_page_size(obj); ENTRY; - OBD_ALLOC(pga, sizeof(*pga) * page_count); - if (!pga) { - CDEBUG(D_VFSTRACE, "sizeof(*pga) = %u page_count = %u\n", - (int)sizeof(*pga), page_count); - RETURN(-ENOMEM); - } + cl_sync_io_init(anchor, page_count); - for (i = 0, length = size; length > 0; - length -=pga[i].count, file_offset +=pga[i].count,i++) {/*i last!*/ - pga[i].pg = pages[i]; - pga[i].off = file_offset; - /* To the end of the page, or the length, whatever is less */ - pga[i].count = min_t(int, CFS_PAGE_SIZE - - (file_offset & ~CFS_PAGE_MASK), - length); - pga[i].flag = 0; - if (rw == READ) - POISON_PAGE(pages[i], 0x0d); + queue = &io->ci_queue; + cl_2queue_init(queue); + for (i = 0; i < page_count; i++) { + clp = cl_page_find(env, obj, cl_index(obj, file_offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + + /* check the page type: if the page is a host page, then do + * write directly */ + /* + * Very rare case that the host pages can be found for + * directIO case, since linux kernel truncated all covered + * pages before getting here. So, to make the OST happy(to + * write a contiguous region), all pages are issued + * here. -jay */ + if (clp->cp_type == CPT_CACHEABLE) { + cfs_page_t *vmpage = cl_page_vmpage(env, clp); + cfs_page_t *src_page; + cfs_page_t *dst_page; + void *src; + void *dst; + + src_page = (rw == WRITE) ? pages[i] : vmpage; + dst_page = (rw == WRITE) ? vmpage : pages[i]; + + src = kmap_atomic(src_page, KM_USER0); + dst = kmap_atomic(dst_page, KM_USER1); + memcpy(dst, (const void *)src, min(page_size, size)); + kunmap_atomic(dst, KM_USER1); + kunmap_atomic(src, KM_USER0); + + /* make sure page will be added to the transfer by + * cl_io_submit()->...->vvp_page_prep_write(). */ + if (rw == WRITE) + set_page_dirty(vmpage); + /* + * If direct-io read finds up-to-date page in the + * cache, just copy it to the user space. Page will be + * filtered out by vvp_page_prep_read(). This + * preserves an invariant, that page is read at most + * once, see cl_page_flags::CPF_READ_COMPLETED. + */ + } + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + clup = cl2ccc_page(cl_page_at(clp, &vvp_device_type)); + clup->cpg_sync_io = anchor; + cl_2queue_add(queue, clp); + + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + /* + * Set page clip to tell transfer formation engine that page + * has to be sent even if it is beyond KMS. 
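[editor's note] The CPT_CACHEABLE branch above covers the rare case where direct IO meets a page that is still in the cache: data is copied between the user page and the cached page so the OST still receives one contiguous region. Mapping two highmem pages at once needs two distinct kmap slots, which gives the copy its shape (user_page here stands for the patch's pages[i]):

/* Distilled from the CPT_CACHEABLE branch above. */
src_page = (rw == WRITE) ? user_page : vmpage;
dst_page = (rw == WRITE) ? vmpage : user_page;

src = kmap_atomic(src_page, KM_USER0);   /* slot 0 */
dst = kmap_atomic(dst_page, KM_USER1);   /* slot 1: both mapped at once */
memcpy(dst, src, min(page_size, size));
kunmap_atomic(dst, KM_USER1);            /* unmap in reverse order */
kunmap_atomic(src, KM_USER0);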
+ */ + cl_page_clip(env, clp, 0, min(size, page_size)); + size -= page_size; + file_offset += page_size; } - rc = obd_brw_async(rw == WRITE ? OBD_BRW_WRITE : OBD_BRW_READ, - ll_i2dtexp(inode), oinfo, page_count, - pga, NULL, set); - if (rc == 0) - rc = size; + if (rc == 0) { + rc = cl_io_submit_rw(env, io, rw == READ ? CRT_READ : CRT_WRITE, + queue); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * direct-io read found up-to-date pages in the + * cache), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(clp, &queue->c2_qin) + cl_sync_io_note(anchor, +1); + /* wait for the IO to be finished. */ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, + anchor) ?: size_orig; + } + } - OBD_FREE(pga, sizeof(*pga) * page_count); + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); RETURN(rc); } @@ -216,17 +339,17 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, const struct iovec *iov, loff_t file_offset, unsigned long nr_segs) { + struct lu_env *env; + struct cl_io *io; struct file *file = iocb->ki_filp; struct inode *inode = file->f_mapping->host; + struct ccc_object *obj = cl_inode2ccc(inode); ssize_t count = iov_length(iov, nr_segs), tot_bytes = 0; struct ll_inode_info *lli = ll_i2info(inode); struct lov_stripe_md *lsm = lli->lli_smd; - struct ptlrpc_request_set *set; - struct obd_info oinfo; - struct obdo oa; unsigned long seg = 0; size_t size = MAX_DIO_SIZE; - int opc; + int refcheck; ENTRY; if (!lli->lli_smd || !lli->lli_smd->lsm_object_id) @@ -242,15 +365,6 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, file_offset, file_offset, count >> CFS_PAGE_SHIFT, MAX_DIO_SIZE >> CFS_PAGE_SHIFT); - if (rw == WRITE) { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRECT_WRITE, count); - opc = CAPA_OPC_OSS_WRITE; - llap_write_pending(inode, NULL); - } else { - ll_stats_ops_tally(ll_i2sbi(inode), LPROC_LL_DIRECT_READ, count); - opc = CAPA_OPC_OSS_RW; - } - /* Check that all user buffers are aligned as well */ for (seg = 0; seg < nr_segs; seg++) { if (((unsigned long)iov[seg].iov_base & ~CFS_PAGE_MASK) || @@ -258,20 +372,18 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, RETURN(-EINVAL); } - set = ptlrpc_prep_set(); - if (set == NULL) - RETURN(-ENOMEM); + env = cl_env_get(&refcheck); + LASSERT(!IS_ERR(env)); + io = ccc_env_io(env)->cui_cl.cis_io; + LASSERT(io != NULL); - ll_inode_fill_obdo(inode, rw, &oa); - oinfo.oi_oa = &oa; - oinfo.oi_md = lsm; - oinfo.oi_capa = ll_osscapa_get(inode, opc); - - /* need locking between buffered and direct access. and race with - *size changing by concurrent truncates and writes. */ + /* 0. Need locking between buffered and direct access. and race with + *size changing by concurrent truncates and writes. + * 1. Need inode sem to operate transient pages. 
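[editor's note] cl_sync_io is the completion barrier that makes this direct-IO submission synchronous: the anchor starts at page_count, each page that completes (or is deliberately skipped) contributes one note, and cl_sync_io_wait() returns once all notes have arrived. The skeleton, as used in ll_direct_IO_26_seg() above:

/* Completion-barrier skeleton from ll_direct_IO_26_seg(). */
cl_sync_io_init(anchor, page_count);     /* expect page_count notes */

rc = cl_io_submit_rw(env, io, crt, queue);
if (rc == 0) {
        /* pages the transfer layer did not send stay on c2_qin and
         * must be noted by hand, or the wait below never returns: */
        cl_page_list_for_each(clp, &queue->c2_qin)
                cl_sync_io_note(anchor, +1);
        rc = cl_sync_io_wait(env, io, &queue->c2_qout, anchor);
}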
*/ if (rw == READ) LOCK_INODE_MUTEX(inode); + LASSERT(obj->cob_transient_pages == 0); for (seg = 0; seg < nr_segs; seg++) { size_t iov_left = iov[seg].iov_len; unsigned long user_addr = (unsigned long)iov[seg].iov_base; @@ -293,9 +405,8 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, &pages); LASSERT(page_count != 0); if (page_count > 0) { - result = ll_direct_IO_26_seg(rw, inode, + result = ll_direct_IO_26_seg(env, io, rw, inode, file->f_mapping, - &oinfo, set, min(size,iov_left), file_offset, pages, page_count); @@ -332,24 +443,19 @@ static ssize_t ll_direct_IO_26(int rw, struct kiocb *iocb, } } out: + LASSERT(obj->cob_transient_pages == 0); if (rw == READ) UNLOCK_INODE_MUTEX(inode); if (tot_bytes > 0) { - int rc; - - rc = ptlrpc_set_wait(set); - if (rc) { - tot_bytes = rc; - } else if (rw == WRITE) { + if (rw == WRITE) { lov_stripe_lock(lsm); obd_adjust_kms(ll_i2dtexp(inode), lsm, file_offset, 0); lov_stripe_unlock(lsm); } } - capa_put(oinfo.oi_capa); - ptlrpc_set_destroy(set); + cl_env_put(env, &refcheck); RETURN(tot_bytes); } @@ -357,13 +463,13 @@ struct address_space_operations ll_aops = { .readpage = ll_readpage, // .readpages = ll_readpages, .direct_IO = ll_direct_IO_26, - .writepage = ll_writepage_26, + .writepage = ll_writepage, .writepages = generic_writepages, .set_page_dirty = ll_set_page_dirty, .sync_page = NULL, .prepare_write = ll_prepare_write, .commit_write = ll_commit_write, .invalidatepage = ll_invalidatepage, - .releasepage = ll_releasepage, + .releasepage = (void *)ll_releasepage, .bmap = NULL }; diff --git a/lustre/llite/statahead.c b/lustre/llite/statahead.c index e5fff85..e341675 100644 --- a/lustre/llite/statahead.c +++ b/lustre/llite/statahead.c @@ -741,8 +741,8 @@ static int ll_statahead_thread(void *arg) struct inode *dir = parent->d_inode; struct ll_inode_info *lli = ll_i2info(dir); struct ll_sb_info *sbi = ll_i2sbi(dir); - struct ll_statahead_info *sai = ll_sai_get(lli->lli_sai); - struct ptlrpc_thread *thread = &sai->sai_thread; + struct ll_statahead_info *sai; + struct ptlrpc_thread *thread; struct page *page; __u64 pos = 0; int first = 0; @@ -750,12 +750,23 @@ static int ll_statahead_thread(void *arg) struct ll_dir_chain chain; ENTRY; + spin_lock(&lli->lli_lock); + if (unlikely(lli->lli_sai == NULL)) { + spin_unlock(&lli->lli_lock); + dput(parent); + RETURN(-EAGAIN); + } else { + sai = ll_sai_get(lli->lli_sai); + spin_unlock(&lli->lli_lock); + } + { char pname[16]; snprintf(pname, 15, "ll_sa_%u", sta->sta_pid); cfs_daemonize(pname); } + thread = &sai->sai_thread; sbi->ll_sa_total++; spin_lock(&lli->lli_lock); thread->t_flags = SVC_RUNNING; @@ -1055,6 +1066,7 @@ out: * \retval 1 -- stat ahead thread process such dentry, for lookup, it hit * \retval -EEXIST -- stat ahead thread started, and this is the first dentry * \retval -EBADFD -- statahead thread exit and not dentry available + * \retval -EAGAIN -- try to stat by caller * \retval others -- error */ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) @@ -1175,7 +1187,7 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) sai->sai_thread.t_flags = SVC_STOPPED; ll_sai_put(sai); LASSERT(lli->lli_sai == NULL); - RETURN(rc); + RETURN(-EAGAIN); } l_wait_event(sai->sai_thread.t_ctl_waitq, @@ -1192,20 +1204,16 @@ int do_statahead_enter(struct inode *dir, struct dentry **dentryp, int lookup) /** * update hit/miss count. 
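[editor's note] The statahead hunk above closes a startup race: by the time the ll_sa_%u thread runs, the opener may already have torn down lli->lli_sai, so both the NULL test and the ll_sai_get() reference now happen under lli->lli_lock, and -EAGAIN tells the caller to stat on its own. This is the usual check-then-reference-under-lock idiom:

/* Check-then-reference under the lock, as in ll_statahead_thread(). */
spin_lock(&lli->lli_lock);
if (unlikely(lli->lli_sai == NULL)) {
        spin_unlock(&lli->lli_lock);
        dput(parent);
        RETURN(-EAGAIN);        /* statahead is gone; caller stats itself */
}
sai = ll_sai_get(lli->lli_sai); /* take the reference while still locked */
spin_unlock(&lli->lli_lock);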
*/ -int ll_statahead_exit(struct dentry *dentry, int result) +void ll_statahead_exit(struct dentry *dentry, int result) { - struct dentry *parent = dentry->d_parent; - struct ll_inode_info *lli = ll_i2info(parent->d_inode); - struct ll_sb_info *sbi = ll_i2sbi(parent->d_inode); - int rc = 0; + struct dentry *parent = dentry->d_parent; + struct ll_inode_info *lli = ll_i2info(parent->d_inode); + struct ll_sb_info *sbi = ll_i2sbi(parent->d_inode); + struct ll_statahead_info *sai = lli->lli_sai; + struct ll_dentry_data *ldd = ll_d2d(dentry); ENTRY; - if (lli->lli_opendir_pid != cfs_curproc_pid()) - RETURN(-EBADFD); - - if (lli->lli_sai) { - struct ll_statahead_info *sai = lli->lli_sai; - + if (lli->lli_opendir_pid == cfs_curproc_pid() && sai) { if (result >= 1) { sbi->ll_sa_hit++; sai->sai_hit++; @@ -1235,7 +1243,8 @@ int ll_statahead_exit(struct dentry *dentry, int result) if (!sa_is_stopped(sai)) cfs_waitq_signal(&sai->sai_thread.t_ctl_waitq); ll_sai_entry_fini(sai); - rc = ll_statahead_mark(dentry); + if (likely(ldd != NULL)) + ldd->lld_sa_generation = sai->sai_generation; } - RETURN(rc); + EXIT; } diff --git a/lustre/llite/super25.c b/lustre/llite/super25.c index 979f0c4..b61e7fc 100644 --- a/lustre/llite/super25.c +++ b/lustre/llite/super25.c @@ -105,12 +105,21 @@ struct super_operations lustre_super_operations = void lustre_register_client_process_config(int (*cpc)(struct lustre_cfg *lcfg)); +int vvp_global_init(void); +void vvp_global_fini(void); + static int __init init_lustre_lite(void) { int i, rc, seed[2]; struct timeval tv; lnet_process_id_t lnet_id; - + + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_CONSOLE, "Lustre client module (%p).\n", + &lustre_super_operations); + rc = ll_init_inodecache(); if (rc) return -ENOMEM; @@ -148,8 +157,6 @@ static int __init init_lustre_lite(void) proc_lustre_fs_root = proc_lustre_root ? lprocfs_register("llite", proc_lustre_root, NULL, NULL) : NULL; - ll_register_cache(&ll_cache_definition); - lustre_register_client_fill_super(ll_fill_super); lustre_register_kill_super_cb(ll_kill_super); @@ -174,13 +181,20 @@ static int __init init_lustre_lite(void) init_timer(&ll_capa_timer); ll_capa_timer.function = ll_capa_timer_callback; rc = ll_capa_thread_start(); + /* + * XXX normal cleanup is needed here. 
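[editor's note] One way to resolve the XXX above is the conventional unwind on failure, so that a vvp_global_init() error undoes the capa timer and thread set up just before it. A partial sketch only, not part of this patch; it reuses del_timer() and ll_capa_thread_stop() exactly as exit_lustre_lite() below calls them, and the earlier registrations in init_lustre_lite() would need the same treatment:

        rc = ll_capa_thread_start();
        if (rc == 0) {
                rc = vvp_global_init();
                if (rc != 0) {
                        /* unwind the capa setup, mirroring
                         * exit_lustre_lite() */
                        del_timer(&ll_capa_timer);
                        ll_capa_thread_stop();
                }
        }
        return rc;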
+ */ + if (rc == 0) + rc = vvp_global_init(); + return rc; } static void __exit exit_lustre_lite(void) { int rc; - + + vvp_global_fini(); del_timer(&ll_capa_timer); ll_capa_thread_stop(); LASSERTF(capa_count[CAPA_SITE_CLIENT] == 0, @@ -192,8 +206,6 @@ static void __exit exit_lustre_lite(void) lustre_register_client_process_config(NULL); - ll_unregister_cache(&ll_cache_definition); - ll_destroy_inodecache(); rc = cfs_mem_cache_destroy(ll_rmtperm_hash_cachep); @@ -206,11 +218,6 @@ static void __exit exit_lustre_lite(void) rc = cfs_mem_cache_destroy(ll_file_data_slab); LASSERTF(rc == 0, "couldn't destroy ll_file_data slab\n"); - if (ll_async_page_slab) { - rc = cfs_mem_cache_destroy(ll_async_page_slab); - LASSERTF(rc == 0, "couldn't destroy ll_async_page slab\n"); - } - if (proc_lustre_fs_root) lprocfs_remove(&proc_lustre_fs_root); } diff --git a/lustre/llite/symlink.c b/lustre/llite/symlink.c index b1754da..ff1c99a 100644 --- a/lustre/llite/symlink.c +++ b/lustre/llite/symlink.c @@ -78,7 +78,7 @@ static int ll_readlink_internal(struct inode *inode, CERROR("OBD_MD_LINKNAME not set on reply\n"); GOTO(failed, rc = -EPROTO); } - + LASSERT(symlen != 0); if (body->eadatasize != symlen) { CERROR("inode %lu: symlink length %d not expected %d\n", @@ -114,7 +114,6 @@ static int ll_readlink_internal(struct inode *inode, static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) { struct inode *inode = dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); struct ptlrpc_request *request; char *symname; int rc; @@ -122,7 +121,7 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) CDEBUG(D_VFSTRACE, "VFS Op\n"); /* on symlinks lli_open_sem protects lli_symlink_name allocation/data */ - down(&lli->lli_size_sem); + ll_inode_size_lock(inode, 0); rc = ll_readlink_internal(inode, &request, &symname); if (rc) GOTO(out, rc); @@ -130,7 +129,7 @@ static int ll_readlink(struct dentry *dentry, char *buffer, int buflen) rc = vfs_readlink(dentry, buffer, buflen, symname); ptlrpc_req_finished(request); out: - up(&lli->lli_size_sem); + ll_inode_size_unlock(inode, 0); RETURN(rc); } @@ -144,11 +143,10 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry, struct nameidata *nd) { struct inode *inode = dentry->d_inode; - struct ll_inode_info *lli = ll_i2info(inode); #ifdef HAVE_VFS_INTENT_PATCHES struct lookup_intent *it = ll_nd2it(nd); #endif - struct ptlrpc_request *request; + struct ptlrpc_request *request = NULL; int rc; char *symname; ENTRY; @@ -165,24 +163,16 @@ static LL_FOLLOW_LINK_RETURN_TYPE ll_follow_link(struct dentry *dentry, #endif CDEBUG(D_VFSTRACE, "VFS Op\n"); -#if THREAD_SIZE < 8192 - /* - * We set the limits recursive symlink to 5 - * instead of default 8 when kernel has 4k stack - * to prevent stack overflow. - */ - if (current->link_count >= 5) { + /* Limit the recursive symlink depth to 5 instead of default + * 8 links when kernel has 4k stack to prevent stack overflow. 
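[editor's note] The symlink hunk above replaces the #if THREAD_SIZE < 8192 preprocessor guard with an ordinary runtime condition. Because THREAD_SIZE is a compile-time constant, the compiler folds the test away entirely on 8k-stack kernels, so the change costs nothing at runtime while removing the conditionally compiled goto label:

/* Runtime form of the old preprocessor guard, as in ll_follow_link(): */
if (THREAD_SIZE < 8192 && current->link_count >= 5) {
        rc = -ELOOP;            /* refuse deep recursion on 4k stacks */
} else {
        ll_inode_size_lock(inode, 0);
        rc = ll_readlink_internal(inode, &request, &symname);
        ll_inode_size_unlock(inode, 0);
}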
*/ + if (THREAD_SIZE < 8192 && current->link_count >= 5) { rc = -ELOOP; - GOTO(out_release, rc); + } else { + ll_inode_size_lock(inode, 0); + rc = ll_readlink_internal(inode, &request, &symname); + ll_inode_size_unlock(inode, 0); } -#endif - down(&lli->lli_size_sem); - rc = ll_readlink_internal(inode, &request, &symname); - up(&lli->lli_size_sem); if (rc) { -#if THREAD_SIZE < 8192 -out_release: -#endif path_release(nd); /* Kernel assumes that ->follow_link() releases nameidata on error */ GOTO(out, rc); diff --git a/lustre/llite/vvp_dev.c b/lustre/llite/vvp_dev.c new file mode 100644 index 0000000..8347789 --- /dev/null +++ b/lustre/llite/vvp_dev.c @@ -0,0 +1,559 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_device and cl_device_type implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifndef __KERNEL__ +# error This file is kernel only. +#endif + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp device and device type functions. + * + */ + +/* + * vvp_ prefix stands for "Vfs Vm Posix". It corresponds to historical + * "llite_" (var. "ll_") prefix. 
+ */ + +cfs_mem_cache_t *vvp_page_kmem; +cfs_mem_cache_t *vvp_thread_kmem; +static cfs_mem_cache_t *vvp_session_kmem; +static struct lu_kmem_descr vvp_caches[] = { + { + .ckd_cache = &vvp_page_kmem, + .ckd_name = "vvp_page_kmem", + .ckd_size = sizeof (struct ccc_page) + }, + { + .ckd_cache = &vvp_thread_kmem, + .ckd_name = "vvp_thread_kmem", + .ckd_size = sizeof (struct vvp_thread_info), + }, + { + .ckd_cache = &vvp_session_kmem, + .ckd_name = "vvp_session_kmem", + .ckd_size = sizeof (struct vvp_session) + }, + { + .ckd_cache = NULL + } +}; + +static void *vvp_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_thread_info *info; + + OBD_SLAB_ALLOC_PTR(info, vvp_thread_kmem); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void vvp_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, vvp_thread_kmem); +} + +static void *vvp_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct vvp_session *session; + + OBD_SLAB_ALLOC_PTR(session, vvp_session_kmem); + if (session == NULL) + session = ERR_PTR(-ENOMEM); + return session; +} + +static void vvp_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct vvp_session *session = data; + OBD_SLAB_FREE_PTR(session, vvp_session_kmem); +} + + +struct lu_context_key vvp_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = vvp_key_init, + .lct_fini = vvp_key_fini +}; + +struct lu_context_key vvp_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = vvp_session_key_init, + .lct_fini = vvp_session_key_fini +}; + +/* type constructor/destructor: vvp_type_{init,fini,start,stop}(). */ +LU_TYPE_INIT_FINI(vvp, &ccc_key, &ccc_session_key, &vvp_key, &vvp_session_key); + +static const struct lu_device_operations vvp_lu_ops = { + .ldo_object_alloc = vvp_object_alloc +}; + +static const struct cl_device_operations vvp_cl_ops = { + .cdo_req_init = ccc_req_init +}; + +static struct lu_device *vvp_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + return ccc_device_alloc(env, t, cfg, &vvp_lu_ops, &vvp_cl_ops); +} + +static const struct lu_device_type_operations vvp_device_type_ops = { + .ldto_init = vvp_type_init, + .ldto_fini = vvp_type_fini, + + .ldto_start = vvp_type_start, + .ldto_stop = vvp_type_stop, + + .ldto_device_alloc = vvp_device_alloc, + .ldto_device_free = ccc_device_free, + .ldto_device_init = ccc_device_init, + .ldto_device_fini = ccc_device_fini +}; + +struct lu_device_type vvp_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_VVP_NAME, + .ldt_ops = &vvp_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** + * Module-wide initialization of the VVP layer: sets up the slab caches + * declared above and registers the vvp device type with the cl framework. + */ +int vvp_global_init(void) +{ + int result; + + result = lu_kmem_init(vvp_caches); + if (result == 0) { + result = ccc_global_init(&vvp_device_type); + if (result != 0) + lu_kmem_fini(vvp_caches); + } + return result; +} + +void vvp_global_fini(void) +{ + ccc_global_fini(&vvp_device_type); + lu_kmem_fini(vvp_caches); +} + + +/***************************************************************************** + * + * mirror obd-devices into cl devices.
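+ * cl_sb_init() and cl_sb_fini() below attach a cl device stack to, and detach it from, a mounted super block.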
+ * + */ + +int cl_sb_init(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct cl_device *cl; + struct lu_env *env; + int rc = 0; + int refcheck; + + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cl = cl_type_setup(env, NULL, &vvp_device_type, + sbi->ll_dt_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + cl2ccc_dev(cl)->cdv_sb = sb; + sbi->ll_cl = cl; + sbi->ll_site = cl2lu_dev(cl)->ld_site; + } + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); + RETURN(rc); +} + +int cl_sb_fini(struct super_block *sb) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + struct cl_device *cld; + int refcheck; + int result; + + ENTRY; + sbi = ll_s2sbi(sb); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + cld = sbi->ll_cl; + + if (cld != NULL) { + cl_stack_fini(env, cld); + sbi->ll_cl = NULL; + sbi->ll_site = NULL; + } + cl_env_put(env, &refcheck); + result = 0; + } else { + CERROR("Cannot cleanup cl-stack due to memory shortage.\n"); + result = PTR_ERR(env); + } + /* + * If the mount failed (sbi->ll_cl == NULL) and there are no other + * mounts, stop device types manually (this usually happens + * automatically when the last device is destroyed). + */ + lu_types_stop(); + RETURN(result); +} + +/**************************************************************************** + * + * /proc/fs/lustre/llite/$MNT/dump_page_cache + * + ****************************************************************************/ + +/* + * To represent the contents of a page cache as a byte stream, the following + * information is encoded in a 64-bit offset: + * + * - file hash bucket in lu_site::ls_hash[] 28bits + * + * - how far file is from bucket head 4bits + * + * - page index 32bits + * + * The first two fields identify a file in the cache uniquely. For example, + * bucket 2, depth 3, page index 7 pack to pos 0x2300000007. + */ + +#define PGC_OBJ_SHIFT (32 + 4) +#define PGC_DEPTH_SHIFT (32) + +struct vvp_pgcache_id { + unsigned vpi_bucket; + unsigned vpi_depth; + uint32_t vpi_index; +}; + +static void vvp_pgcache_id_unpack(loff_t pos, struct vvp_pgcache_id *id) +{ + CLASSERT(sizeof(pos) == sizeof(__u64)); + + id->vpi_index = pos & 0xffffffff; + id->vpi_depth = (pos >> PGC_DEPTH_SHIFT) & 0xf; + id->vpi_bucket = ((unsigned long long)pos >> PGC_OBJ_SHIFT); +} + +static loff_t vvp_pgcache_id_pack(struct vvp_pgcache_id *id) +{ + return + ((__u64)id->vpi_index) | + ((__u64)id->vpi_depth << PGC_DEPTH_SHIFT) | + ((__u64)id->vpi_bucket << PGC_OBJ_SHIFT); +} + +static struct cl_object *vvp_pgcache_obj(const struct lu_env *env, + struct lu_device *dev, + struct vvp_pgcache_id *id) +{ + struct hlist_head *bucket; + struct lu_object_header *hdr; + struct lu_site *site; + struct hlist_node *scan; + struct lu_object_header *found; + struct cl_object *clob; + unsigned depth; + + LASSERT(lu_device_is_cl(dev)); + + site = dev->ld_site; + bucket = site->ls_hash + (id->vpi_bucket & site->ls_hash_mask); + depth = id->vpi_depth & 0xf; + found = NULL; + clob = NULL; + + /* XXX copy of lu_object.c:htable_lookup() */ + read_lock(&site->ls_guard); + hlist_for_each_entry(hdr, scan, bucket, loh_hash) { + if (depth-- == 0) { + if (!lu_object_is_dying(hdr)) { + if (atomic_add_return(1, &hdr->loh_ref) == 1) + ++ site->ls_busy; + found = hdr; + } + break; + } + } + read_unlock(&site->ls_guard); + + if (found != NULL) { + struct lu_object *lu_obj; + + lu_obj = lu_object_locate(found, dev->ld_type); + if (lu_obj != NULL) { + lu_object_ref_add(lu_obj, "dump", cfs_current()); + clob = lu2cl(lu_obj); + } else + lu_object_put(env, lu_object_top(found)); + } else if (depth > 0) + id->vpi_depth = 0xf; +
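+ /* + * A NULL clob means that no live object sits at this bucket and + * depth; vvp_pgcache_find() then advances to the next slot. + */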
return clob; +} + +static loff_t vvp_pgcache_find(const struct lu_env *env, + struct lu_device *dev, loff_t pos) +{ + struct cl_object *clob; + struct lu_site *site; + struct vvp_pgcache_id id; + + site = dev->ld_site; + vvp_pgcache_id_unpack(pos, &id); + + while (1) { + if (id.vpi_bucket >= site->ls_hash_size) + return ~0ULL; + clob = vvp_pgcache_obj(env, dev, &id); + if (clob != NULL) { + struct cl_object_header *hdr; + int nr; + struct cl_page *pg; + + /* got an object. Find next page. */ + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + nr = radix_tree_gang_lookup(&hdr->coh_tree, + (void **)&pg, + id.vpi_index, 1); + if (nr > 0) { + id.vpi_index = pg->cp_index; + /* Can't support files over 16TB: the page index would overflow 32 bits. */ + nr = !(pg->cp_index > 0xffffffff); + } + spin_unlock(&hdr->coh_page_guard); + + lu_object_ref_del(&clob->co_lu, "dump", cfs_current()); + cl_object_put(env, clob); + if (nr > 0) + return vvp_pgcache_id_pack(&id); + } + /* to the next object. */ + ++id.vpi_depth; + id.vpi_depth &= 0xf; + if (id.vpi_depth == 0 && ++id.vpi_bucket == 0) + return ~0ULL; + id.vpi_index = 0; + } +} + +#define seq_page_flag(seq, page, flag, has_flags) do { \ + if (test_bit(PG_##flag, &(page)->flags)) { \ + seq_printf(seq, "%s"#flag, has_flags ? "|" : ""); \ + has_flags = 1; \ + } \ +} while (0) + +static void vvp_pgcache_page_show(const struct lu_env *env, + struct seq_file *seq, struct cl_page *page) +{ + struct ccc_page *cpg; + cfs_page_t *vmpage; + int has_flags; + + cpg = cl2ccc_page(cl_page_at(page, &vvp_device_type)); + vmpage = cpg->cpg_page; + seq_printf(seq, " %5i | %p %p %s %s %s %s | %p %lu/%u(%p) %lu %u [", + 0 /* gen */, + cpg, page, + "none", + cpg->cpg_write_queued ? "wq" : "- ", + cpg->cpg_defer_uptodate ? "du" : "- ", + PageWriteback(vmpage) ? "wb" : "-", + vmpage, vmpage->mapping->host->i_ino, + vmpage->mapping->host->i_generation, + vmpage->mapping->host, vmpage->index, + page_count(vmpage)); + has_flags = 0; + seq_page_flag(seq, vmpage, locked, has_flags); + seq_page_flag(seq, vmpage, error, has_flags); + seq_page_flag(seq, vmpage, referenced, has_flags); + seq_page_flag(seq, vmpage, uptodate, has_flags); + seq_page_flag(seq, vmpage, dirty, has_flags); +#if (LINUX_VERSION_CODE <= KERNEL_VERSION(2,6,12)) + seq_page_flag(seq, vmpage, highmem, has_flags); +#endif + seq_page_flag(seq, vmpage, writeback, has_flags); + seq_printf(seq, "%s]\n", has_flags ?
"" : "-"); +} + +static int vvp_pgcache_show(struct seq_file *f, void *v) +{ + loff_t pos; + struct ll_sb_info *sbi; + struct cl_object *clob; + struct lu_env *env; + struct cl_page *page; + struct cl_object_header *hdr; + struct vvp_pgcache_id id; + int refcheck; + int result; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + pos = *(loff_t *) v; + vvp_pgcache_id_unpack(pos, &id); + sbi = f->private; + clob = vvp_pgcache_obj(env, &sbi->ll_cl->cd_lu_dev, &id); + if (clob != NULL) { + hdr = cl_object_header(clob); + + spin_lock(&hdr->coh_page_guard); + page = cl_page_lookup(hdr, id.vpi_index); + spin_unlock(&hdr->coh_page_guard); + + seq_printf(f, "%8x@"DFID": ", + id.vpi_index, PFID(&hdr->coh_lu.loh_fid)); + if (page != NULL) { + vvp_pgcache_page_show(env, f, page); + cl_page_put(env, page); + } else + seq_puts(f, "missing\n"); + lu_object_ref_del(&clob->co_lu, "dump", cfs_current()); + cl_object_put(env, clob); + } else + seq_printf(f, "%llx missing\n", pos); + cl_env_put(env, &refcheck); + result = 0; + } else + result = PTR_ERR(env); + return result; +} + +static void *vvp_pgcache_start(struct seq_file *f, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + sbi = f->private; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + if (sbi->ll_site->ls_hash_bits > 64 - PGC_OBJ_SHIFT) + pos = ERR_PTR(-EFBIG); + else { + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, + *pos); + if (*pos == ~0ULL) + pos = NULL; + } + cl_env_put(env, &refcheck); + } + return pos; +} + +static void *vvp_pgcache_next(struct seq_file *f, void *v, loff_t *pos) +{ + struct ll_sb_info *sbi; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + sbi = f->private; + *pos = vvp_pgcache_find(env, &sbi->ll_cl->cd_lu_dev, *pos + 1); + if (*pos == ~0ULL) + pos = NULL; + cl_env_put(env, &refcheck); + } + return pos; +} + +static void vvp_pgcache_stop(struct seq_file *f, void *v) +{ + /* Nothing to do */ +} + +static struct seq_operations vvp_pgcache_ops = { + .start = vvp_pgcache_start, + .next = vvp_pgcache_next, + .stop = vvp_pgcache_stop, + .show = vvp_pgcache_show +}; + +static int vvp_dump_pgcache_seq_open(struct inode *inode, struct file *filp) +{ + struct proc_dir_entry *dp = PDE(inode); + struct ll_sb_info *sbi = dp->data; + struct seq_file *seq; + int result; + + result = seq_open(filp, &vvp_pgcache_ops); + if (result == 0) { + seq = filp->private_data; + seq->private = sbi; + } + return result; +} + +struct file_operations vvp_dump_pgcache_file_ops = { + .owner = THIS_MODULE, + .open = vvp_dump_pgcache_seq_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; diff --git a/lustre/llite/vvp_internal.h b/lustre/llite/vvp_internal.h new file mode 100644 index 0000000..42042a9 --- /dev/null +++ b/lustre/llite/vvp_internal.h @@ -0,0 +1,68 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal definitions for VVP layer. + * + * Author: Nikita Danilov + */ + +#ifndef VVP_INTERNAL_H +#define VVP_INTERNAL_H + +#ifndef __KERNEL__ +# error This file is kernel only. +#endif + +#include +#include "llite_internal.h" + +int vvp_io_init (const struct lu_env *env, + struct cl_object *obj, struct cl_io *io); +int vvp_lock_init (const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io); +struct cl_page *vvp_page_init (const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage); +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct ccc_object *cl_inode2ccc(struct inode *inode); + +extern cfs_mem_cache_t *vvp_page_kmem; +extern cfs_mem_cache_t *vvp_thread_kmem; + +#endif /* VVP_INTERNAL_H */ diff --git a/lustre/llite/vvp_io.c b/lustre/llite/vvp_io.c new file mode 100644 index 0000000..aa2dcf0 --- /dev/null +++ b/lustre/llite/vvp_io.c @@ -0,0 +1,996 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifndef __KERNEL__ +# error This file is kernel only. 
+#endif + +#include +#include + +#include "vvp_internal.h" + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice); + +/***************************************************************************** + * + * io operations. + * + */ + +static int vvp_io_fault_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct inode *inode = ccc_object_inode(ios->cis_obj); + + LASSERT(inode == + cl2ccc_io(env, ios)->cui_fd->fd_file->f_dentry->d_inode); + vio->u.fault.ft_mtime = LTIME_S(inode->i_mtime); + return 0; +} + +static void vvp_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + if (io->ci_type == CIT_WRITE) + up(&ll_i2info(ccc_object_inode(obj))->lli_write_sem); + else { + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + + if (vio->cui_ra_window_set) + ll_ra_read_ex(cio->cui_fd->fd_file, &vio->cui_bead); + } + +} + +static void vvp_io_fault_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct cl_page *page = io->u.ci_fault.ft_page; + + CLOBINVRNT(env, io->ci_obj, ccc_object_invariant(io->ci_obj)); + + if (page != NULL) { + lu_ref_del(&page->cp_reference, "fault", io); + cl_page_put(env, page); + io->u.ci_fault.ft_page = NULL; + } + vvp_io_fini(env, ios); +} + +enum cl_lock_mode vvp_mode_from_vma(struct vm_area_struct *vma) +{ + /* + * we only want to hold PW locks if the mmap() can generate + * writes back to the file and that only happens in shared + * writable vmas + */ + if ((vma->vm_flags & VM_SHARED) && (vma->vm_flags & VM_WRITE)) + return CLM_WRITE; + return CLM_READ; +} + +static int vvp_mmap_locks(const struct lu_env *env, + struct ccc_io *vio, struct cl_io *io) +{ + struct ccc_thread_info *cti = ccc_env_info(env); + struct vm_area_struct *vma; + struct cl_lock_descr *descr = &cti->cti_descr; + ldlm_policy_data_t policy; + struct inode *inode; + unsigned long addr; + unsigned long seg; + ssize_t count; + int result; + ENTRY; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + if (cl_io_is_sendfile(io)) + RETURN(0); + + for (seg = 0; seg < vio->cui_nrsegs; seg++) { + const struct iovec *iv = &vio->cui_iov[seg]; + + addr = (unsigned long)iv->iov_base; + count = iv->iov_len; + if (count == 0) + continue; + + count += addr & (~CFS_PAGE_MASK); + addr &= CFS_PAGE_MASK; + while((vma = our_vma(addr, count)) != NULL) { + LASSERT(vma->vm_file); + + inode = vma->vm_file->f_dentry->d_inode; + /* + * XXX: Required lock mode can be weakened: CIT_WRITE + * io only ever reads user level buffer, and CIT_READ + * only writes on it. 
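+ * As written, the mode is taken from the vma flags + * (vvp_mode_from_vma() below), so a shared writable mapping is + * locked in CLM_WRITE mode even when the io only reads the + * user buffer.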
+ */ + policy_from_vma(&policy, vma, addr, count); + descr->cld_mode = vvp_mode_from_vma(vma); + descr->cld_obj = ll_i2info(inode)->lli_clob; + descr->cld_start = cl_index(descr->cld_obj, + policy.l_extent.start); + descr->cld_end = cl_index(descr->cld_obj, + policy.l_extent.end); + result = cl_io_lock_alloc_add(env, io, descr); + if (result < 0) + RETURN(result); + if (vma->vm_end - addr >= count) + break; + count -= vma->vm_end - addr; + addr = vma->vm_end; + } + } + RETURN(0); +} + +static void vvp_io_update_iov(const struct lu_env *env, + struct ccc_io *vio, struct cl_io *io) +{ + int i; + size_t size = io->u.ci_rw.crw_count; + + vio->cui_iov_olen = 0; + if (cl_io_is_sendfile(io) || size == vio->cui_tot_count) + return; + + if (vio->cui_tot_nrsegs == 0) + vio->cui_tot_nrsegs = vio->cui_nrsegs; + + for (i = 0; i < vio->cui_tot_nrsegs; i++) { + struct iovec *iv = &vio->cui_iov[i]; + + if (iv->iov_len < size) + size -= iv->iov_len; + else { + if (iv->iov_len > size) { + vio->cui_iov_olen = iv->iov_len; + iv->iov_len = size; + } + break; + } + } + + vio->cui_nrsegs = i + 1; +} + +static int vvp_io_rw_lock(const struct lu_env *env, struct cl_io *io, + enum cl_lock_mode mode, loff_t start, loff_t end) +{ + struct ccc_io *cio = ccc_env_io(env); + int result; + int ast_flags = 0; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + LASSERT(vvp_env_io(env)->cui_oneshot == 0); + ENTRY; + + vvp_io_update_iov(env, cio, io); + + if (io->u.ci_rw.crw_nonblock) + ast_flags |= CEF_NONBLOCK; + result = vvp_mmap_locks(env, cio, io); + if (result == 0) + result = ccc_io_one_lock(env, io, ast_flags, mode, start, end); + RETURN(result); +} + +static int vvp_io_read_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct ll_inode_info *lli = ll_i2info(ccc_object_inode(io->ci_obj)); + int result; + + ENTRY; + /* XXX: Layer violation, we shouldn't see lsm at llite level. */ + if (lli->lli_smd != NULL) /* files without an lsm need no lock */ + result = vvp_io_rw_lock(env, io, CLM_READ, + io->u.ci_rd.rd.crw_pos, + io->u.ci_rd.rd.crw_pos + + io->u.ci_rd.rd.crw_count - 1); + else + result = 0; + RETURN(result); +} + +static int vvp_io_fault_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + struct vvp_io *vio = cl2vvp_io(env, ios); + /* + * XXX LDLM_FL_CBPENDING + */ + return ccc_io_one_lock_index + (env, io, 0, vvp_mode_from_vma(vio->u.fault.ft_vma), + io->u.ci_fault.ft_index, io->u.ci_fault.ft_index); +} + +static int vvp_io_write_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io = ios->cis_io; + loff_t start; + loff_t end; + int result; + + if (cl2vvp_io(env, ios)->cui_oneshot == 0) { + if (io->u.ci_wr.wr_append) { + start = 0; + end = OBD_OBJECT_EOF; + } else { + start = io->u.ci_wr.wr.crw_pos; + end = start + io->u.ci_wr.wr.crw_count - 1; + } + result = vvp_io_rw_lock(env, io, CLM_WRITE, start, end); + } else + result = 0; + return result; +} + +static int vvp_io_trunc_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *vio = cl2ccc_io(env, ios); + struct inode *inode = ccc_object_inode(ios->cis_obj); + + /* + * We really need to get our PW lock before we change inode->i_size. + * If we don't, we can race with other i_size updaters on our node, + * like ll_file_read. We can also race with i_size propagation to + * other nodes through dirtying and writeback of final cached pages.
+ * This last one is especially bad for racing o_append users on other + * nodes. + */ + + UNLOCK_INODE_MUTEX(inode); + UP_WRITE_I_ALLOC_SEM(inode); + vio->u.trunc.cui_locks_released = 1; + return 0; +} + +/** + * Implementation of cl_io_operations::vio_lock() method for CIT_TRUNC io. + * + * Handles "lockless io" mode when extent locking is done by server. + */ +static int vvp_io_trunc_lock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *vio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + size_t new_size = io->u.ci_truncate.tr_size; + __u32 enqflags = new_size == 0 ? CEF_DISCARD_DATA : 0; + int result; + + vio->u.trunc.cui_local_lock = TRUNC_EXTENT; + result = ccc_io_one_lock(env, io, enqflags, CLM_WRITE, + new_size, OBD_OBJECT_EOF); + return result; +} + +static int vvp_io_trunc_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct inode *inode = ccc_object_inode(io->ci_obj); + struct cl_object *obj = ios->cis_obj; + size_t size = io->u.ci_truncate.tr_size; + pgoff_t start = cl_index(obj, size); + int result; + + LASSERT(cio->u.trunc.cui_locks_released); + LASSERT(vio->cui_oneshot == 0); + + LOCK_INODE_MUTEX(inode); + DOWN_WRITE_I_ALLOC_SEM(inode); + cio->u.trunc.cui_locks_released = 0; + + /* + * Only ll_inode_size_lock is taken at this level. lov_stripe_lock() + * is grabbed by ll_truncate() only over call to obd_adjust_kms(). If + * vmtruncate returns 0, then ll_truncate dropped ll_inode_size_lock() + */ + ll_inode_size_lock(inode, 0); + result = vmtruncate(inode, size); + if (result != 0) + ll_inode_size_unlock(inode, 0); + /* + * If a page is partially truncated, keep it owned across truncate to + * prevent... races. + * + * XXX this properly belongs to osc, because races in question are OST + * specific. + */ + if (cl_offset(obj, start) != size) { + struct cl_object_header *hdr; + + hdr = cl_object_header(obj); + spin_lock(&hdr->coh_page_guard); + vio->cui_partpage = cl_page_lookup(hdr, start); + spin_unlock(&hdr->coh_page_guard); + + if (vio->cui_partpage != NULL) + /* + * Wait for the transfer completion for a partially + * truncated page to avoid dead-locking an OST with + * the concurrent page-wise overlapping WRITE and + * PUNCH requests. + * + * Partial page is disowned in vvp_io_trunc_end(). 
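+ * cl_page_own() below blocks until the current owner + * releases the page, i.e. until any transfer in flight + * has completed.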
+ */ + cl_page_own(env, io, vio->cui_partpage); + } else + vio->cui_partpage = NULL; + return result; +} + +static void vvp_io_trunc_end(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + + if (vio->cui_partpage != NULL) { + cl_page_disown(env, ios->cis_io, vio->cui_partpage); + cl_page_put(env, vio->cui_partpage); + vio->cui_partpage = NULL; + } +} + +static void vvp_io_trunc_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = ccc_env_io(env); + struct inode *inode = ccc_object_inode(ios->cis_io->ci_obj); + + if (cio->u.trunc.cui_locks_released) { + LOCK_INODE_MUTEX(inode); + DOWN_WRITE_I_ALLOC_SEM(inode); + cio->u.trunc.cui_locks_released = 0; + } + vvp_io_fini(env, ios); +} + +#ifdef HAVE_FILE_READV +static ssize_t lustre_generic_file_read(struct file *file, + struct ccc_io *vio, loff_t *ppos) +{ + return generic_file_readv(file, vio->cui_iov, vio->cui_nrsegs, ppos); +} + +static ssize_t lustre_generic_file_write(struct file *file, + struct ccc_io *vio, loff_t *ppos) +{ + return generic_file_writev(file, vio->cui_iov, vio->cui_nrsegs, ppos); +} +#else +static ssize_t lustre_generic_file_read(struct file *file, + struct ccc_io *vio, loff_t *ppos) +{ + return generic_file_aio_read(vio->cui_iocb, vio->cui_iov, + vio->cui_nrsegs, *ppos); +} + +static ssize_t lustre_generic_file_write(struct file *file, + struct ccc_io *vio, loff_t *ppos) +{ + return generic_file_aio_write(vio->cui_iocb, vio->cui_iov, + vio->cui_nrsegs, *ppos); +} +#endif + +static int vvp_io_read_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct ll_ra_read *bead = &vio->cui_bead; + struct file *file = cio->cui_fd->fd_file; + + int result; + loff_t pos = io->u.ci_rd.rd.crw_pos; + size_t cnt = io->u.ci_rd.rd.crw_count; + size_t tot = cio->cui_tot_count; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + LASSERT(vio->cui_oneshot == 0); + + CDEBUG(D_VFSTRACE, "read: -> [%lli, %lli)\n", pos, pos + cnt); + + result = ccc_prep_size(env, obj, io, pos + tot - 1, 1); + if (result != 0) + return result; + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, + "Read ino %lu, "LPSZ" bytes, offset %lld, size %llu\n", + inode->i_ino, cnt, pos, i_size_read(inode)); + + /* turn off the kernel's read-ahead */ + cio->cui_fd->fd_file->f_ra.ra_pages = 0; + + /* initialize read-ahead window once per syscall */ + if (!vio->cui_ra_window_set) { + vio->cui_ra_window_set = 1; + bead->lrr_start = cl_index(obj, pos); + /* + * XXX: explicit CFS_PAGE_SIZE + */ + bead->lrr_count = cl_index(obj, tot + CFS_PAGE_SIZE - 1); + ll_ra_read_in(file, bead); + } + + /* BUG: 5972 */ + file_accessed(file); + if (cl_io_is_sendfile(io)) { + result = generic_file_sendfile(file, &pos, cnt, + vio->u.read.cui_actor, vio->u.read.cui_target); + } else { + result = lustre_generic_file_read(file, cio, &pos); + } + + if (result >= 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, 0); + result = 0; + } + return result; +} + +static int vvp_io_write_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct ccc_io *cio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = 
ccc_object_inode(obj); + struct file *file = cio->cui_fd->fd_file; + ssize_t result = 0; + loff_t pos = io->u.ci_wr.wr.crw_pos; + size_t cnt = io->u.ci_wr.wr.crw_count; + + ENTRY; + + if (cl_io_is_append(io)) + /* + * PARALLEL IO This has to be changed for parallel IO doing + * out-of-order writes. + */ + pos = io->u.ci_wr.wr.crw_pos = i_size_read(inode); + + CDEBUG(D_VFSTRACE, "write: [%lli, %lli)\n", pos, pos + cnt); + + if (cl2vvp_io(env, ios)->cui_oneshot > 0) + result = 0; + else + result = lustre_generic_file_write(file, cio, &pos); + + if (result > 0) { + if (result < cnt) + io->ci_continue = 0; + io->ci_nob += result; + ll_rw_stats_tally(ll_i2sbi(inode), current->pid, + cio->cui_fd, pos, result, 0); + result = 0; + } + RETURN(result); +} + +static int vvp_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct vvp_io *vio = cl2vvp_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = io->ci_obj; + struct inode *inode = ccc_object_inode(obj); + struct cl_fault_io *fio = &io->u.ci_fault; + struct vvp_fault_io *cfio = &vio->u.fault; + cfs_page_t *vmpage; + loff_t offset; + int result = 0; + + LASSERT(vio->cui_oneshot == 0); + + if (fio->ft_executable && + LTIME_S(inode->i_mtime) != vio->u.fault.ft_mtime) + CWARN("binary "DFID + " changed while waiting for the page fault lock\n", + PFID(lu_object_fid(&obj->co_lu))); + + /* offset of the last byte on the page */ + offset = cl_offset(obj, fio->ft_index + 1) - 1; + LASSERT(cl_index(obj, offset) == fio->ft_index); + result = ccc_prep_size(env, obj, io, offset, 0); + if (result != 0) + return result; + + vmpage = filemap_nopage(cfio->ft_vma, cfio->ft_address, cfio->ft_type); + if (vmpage != NOPAGE_SIGBUS && vmpage != NOPAGE_OOM) + LL_CDEBUG_PAGE(D_PAGE, vmpage, + "got addr %lu type %lx\n", + cfio->ft_address, (long)cfio->ft_type); + else + CDEBUG(D_PAGE, "got addr %lu type %lx - SIGBUS\n", + cfio->ft_address, (long)cfio->ft_type); + + if (vmpage == NOPAGE_SIGBUS) + result = -EFAULT; + else if (vmpage == NOPAGE_OOM) + result = -ENOMEM; + else { + struct cl_page *page; + loff_t size; + pgoff_t last; /* last page in a file data region */ + + /* Temporarily lock vmpage to keep cl_page_find() happy. */ + lock_page(vmpage); + page = cl_page_find(env, obj, fio->ft_index, vmpage, + CPT_CACHEABLE); + unlock_page(vmpage); + if (!IS_ERR(page)) { + size = i_size_read(inode); + last = cl_index(obj, size - 1); + if (fio->ft_index == last) + /* + * Last page is mapped partially. + */ + fio->ft_nob = size - cl_offset(obj, + fio->ft_index); + else + fio->ft_nob = cl_page_size(obj); + lu_ref_add(&page->cp_reference, "fault", io); + fio->ft_page = page; + /* + * Certain 2.6 kernels return not-NULL from + * filemap_nopage() when page is beyond the file size, + * on the grounds that "An external ptracer can access + * pages that normally aren't accessible.." Don't + * propagate such page fault to the lower layers to + * avoid side-effects like KMS updates. 
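+ * A positive return value below flags the past-EOF + * case without turning it into an error.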
+ */ + if (fio->ft_index > last) + result = +1; + } else + result = PTR_ERR(page); + } + return result; +} + +static void vvp_io_advance(const struct lu_env *env, + const struct cl_io_slice *ios, size_t nob) +{ + struct ccc_io *vio = cl2ccc_io(env, ios); + struct cl_io *io = ios->cis_io; + struct cl_object *obj = ios->cis_io->ci_obj; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + if (!cl_io_is_sendfile(io) && io->ci_continue) { + /* update the iov */ + LASSERT(vio->cui_tot_nrsegs >= vio->cui_nrsegs); + LASSERT(vio->cui_tot_count >= nob); + + vio->cui_iov += vio->cui_nrsegs; + vio->cui_tot_nrsegs -= vio->cui_nrsegs; + vio->cui_tot_count -= nob; + + if (vio->cui_iov_olen) { + struct iovec *iv; + + vio->cui_iov--; + vio->cui_tot_nrsegs++; + iv = &vio->cui_iov[0]; + iv->iov_base += iv->iov_len; + LASSERT(vio->cui_iov_olen > iv->iov_len); + iv->iov_len = vio->cui_iov_olen - iv->iov_len; + } + } +} + +static int vvp_io_read_page(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice) +{ + struct cl_io *io = ios->cis_io; + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *page = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + struct ll_file_data *fd = cl2ccc_io(env, ios)->cui_fd; + struct ll_readahead_state *ras = &fd->fd_ras; + cfs_page_t *vmpage = cp->cpg_page; + struct cl_2queue *queue = &io->ci_queue; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + LASSERT(cl2vvp_io(env, ios)->cui_oneshot == 0); + LASSERT(slice->cpl_obj == obj); + + ENTRY; + + if (sbi->ll_ra_info.ra_max_pages) + ras_update(sbi, inode, ras, page->cp_index, + cp->cpg_defer_uptodate); + + /* Sanity check whether the page is protected by a lock. */ + if (likely(!(fd->fd_flags & LL_FILE_IGNORE_LOCK))) { + int rc; + + rc = cl_page_is_under_lock(env, io, page); + if (rc != -EBUSY) { + CL_PAGE_HEADER(D_WARNING, env, page, "%s: %i\n", + rc == -ENODATA ? "without a lock" : + "match failed", rc); + if (rc != -ENODATA) + RETURN(rc); + } + } + + if (cp->cpg_defer_uptodate) { + cp->cpg_ra_used = 1; + cl_page_export(env, page); + } + /* + * Add page into the queue even when it is marked uptodate above. + * this will unlock it automatically as part of cl_page_list_disown(). + */ + cl_2queue_add(queue, page); + if (sbi->ll_ra_info.ra_max_pages) + ll_readahead(env, io, ras, + vmpage->mapping, &queue->c2_qin, fd->fd_flags); + + RETURN(0); +} + +static int vvp_page_sync_io(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, struct ccc_page *cp, + int to, enum cl_req_type crt) +{ + struct cl_2queue *queue; + struct ccc_object *cobo = cl2ccc(page->cp_obj); + struct cl_sync_io *anchor = &ccc_env_info(env)->cti_sync_io; + + int writing = io->ci_type == CIT_WRITE; + int result; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + + queue = &io->ci_queue; + + cl_2queue_init_page(queue, page); + + if (writing) + /* Do not pass llap here as it is sync write. */ + vvp_write_pending(cobo, cp); + + cl_sync_io_init(anchor, 1); + cp->cpg_sync_io = anchor; + cl_page_clip(env, page, 0, to); + result = cl_io_submit_rw(env, io, crt, queue); + if (result == 0) + result = cl_sync_io_wait(env, io, &queue->c2_qout, anchor); + else + cp->cpg_sync_io = NULL; + LASSERT(cl_page_is_owned(page, io)); + cl_page_clip(env, page, 0, CFS_PAGE_SIZE); + + if (crt == CRT_READ) + /* + * in CRT_WRITE case page is left locked even in case of + * error. 
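+ * Read pages, by contrast, are disowned (and thus unlocked) + * here as soon as the synchronous io has completed.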
+ */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + + return result; +} + +/** + * Prepare a partially written-to page for a write. + */ +static int vvp_io_prepare_partial(const struct lu_env *env, struct cl_io *io, + struct cl_object *obj, struct cl_page *pg, + struct ccc_page *cp, + unsigned from, unsigned to) +{ + struct cl_attr *attr = &ccc_env_info(env)->cti_attr; + loff_t offset = cl_offset(obj, pg->cp_index); + int result; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result == 0) { + /* + * If we are writing to a new page, no need to read old data. + * The extent locking will have updated the KMS, and for our + * purposes here we can treat it like i_size. + */ + if (attr->cat_kms <= offset) { + char *kaddr = kmap_atomic(cp->cpg_page, KM_USER0); + + memset(kaddr, 0, cl_page_size(obj)); + kunmap_atomic(kaddr, KM_USER0); + } else if (cp->cpg_defer_uptodate) + cp->cpg_ra_used = 1; + else + result = vvp_page_sync_io(env, io, pg, cp, + CFS_PAGE_SIZE, CRT_READ); + /* + * In older implementations, obdo_refresh_inode was called here + * to update the inode because the write might modify the + * object info at the OST. However, this has been proven useless, + * since LVB functions will be called when a user space program + * tries to retrieve inode attributes. Also, see bug 15909 for + * details. -jay + */ + if (result == 0) + cl_page_export(env, pg); + } + return result; +} + +static int vvp_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + cfs_page_t *vmpage = cp->cpg_page; + + int result; + + ENTRY; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == ccc_object_inode(obj)); + + result = 0; + + CL_PAGE_HEADER(D_PAGE, env, pg, "preparing: [%d, %d]\n", from, to); + if (!PageUptodate(vmpage)) { + /* + * We're completely overwriting an existing page, so _don't_ + * set it up to date until commit_write + */ + if (from == 0 && to == CFS_PAGE_SIZE) { + CL_PAGE_HEADER(D_PAGE, env, pg, "full page write\n"); + POISON_PAGE(page, 0x11); + } else + result = vvp_io_prepare_partial(env, ios->cis_io, obj, + pg, cp, from, to); + } else + CL_PAGE_HEADER(D_PAGE, env, pg, "uptodate\n"); + RETURN(result); +} + +static int vvp_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct cl_object *obj = slice->cpl_obj; + struct cl_io *io = ios->cis_io; + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + cfs_page_t *vmpage = cp->cpg_page; + + int result; + int tallyop; + loff_t size; + + ENTRY; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(vmpage->mapping->host == inode); + + LU_OBJECT_HEADER(D_INODE, env, &obj->co_lu, "committing page write\n"); + CL_PAGE_HEADER(D_PAGE, env, pg, "committing: [%d, %d]\n", from, to); + + /* + * Queue a write for some time in the future the first time we + * dirty the page. + * + * This is different from what other file systems do: they usually + * just mark the page (and some of its buffers) dirty and rely on + * balance_dirty_pages() to start a write-back.
Lustre wants write-back + * to be started earlier for the following reasons: + * + * (1) with a large number of clients we need to limit the amount + * of cached data on the clients a lot; + * + * (2) large compute jobs generally want compute-only then io-only + * and the IO should complete as quickly as possible; + * + * (3) IO is batched up to the RPC size and is async until the + * client max cache is hit + * (/proc/fs/lustre/osc/OSC.../max_dirty_mb) + * + */ + if (!PageDirty(vmpage)) { + tallyop = LPROC_LL_DIRTY_MISSES; + vvp_write_pending(cl2ccc(obj), cp); + set_page_dirty(vmpage); + result = cl_page_cache_add(env, io, pg, CRT_WRITE); + if (result == -EDQUOT) + /* + * Client ran out of disk space grant. Possible + * strategies are: + * + * (a) do a sync write, renewing grant; + * + * (b) stop writing on this stripe, switch to the + * next one. + * + * (b) is a part of "parallel io" design that is the + * ultimate goal. (a) is what "old" client did, and + * what the new code continues to do for the time + * being. + */ + result = vvp_page_sync_io(env, io, pg, cp, + to, CRT_WRITE); + } else { + tallyop = LPROC_LL_DIRTY_HITS; + result = 0; + } + ll_stats_ops_tally(sbi, tallyop, 1); + + size = cl_offset(obj, pg->cp_index) + to; + + if (result == 0) { + if (size > i_size_read(inode)) + i_size_write(inode, size); + cl_page_export(env, pg); + } else if (size > i_size_read(inode)) + cl_page_discard(env, io, pg); + RETURN(result); +} + +static const struct cl_io_operations vvp_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_read_lock, + .cio_start = vvp_io_read_start, + .cio_advance = vvp_io_advance + }, + [CIT_WRITE] = { + .cio_fini = vvp_io_fini, + .cio_lock = vvp_io_write_lock, + .cio_start = vvp_io_write_start, + .cio_advance = vvp_io_advance + }, + [CIT_TRUNC] = { + .cio_fini = vvp_io_trunc_fini, + .cio_iter_init = vvp_io_trunc_iter_init, + .cio_lock = vvp_io_trunc_lock, + .cio_start = vvp_io_trunc_start, + .cio_end = vvp_io_trunc_end + }, + [CIT_FAULT] = { + .cio_fini = vvp_io_fault_fini, + .cio_iter_init = vvp_io_fault_iter_init, + .cio_lock = vvp_io_fault_lock, + .cio_start = vvp_io_fault_start, + .cio_end = ccc_io_end + }, + [CIT_MISC] = { + .cio_fini = vvp_io_fini + } + }, + .cio_read_page = vvp_io_read_page, + .cio_prepare_write = vvp_io_prepare_write, + .cio_commit_write = vvp_io_commit_write +}; + +int vvp_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct vvp_io *vio = vvp_env_io(env); + struct ccc_io *cio = ccc_env_io(env); + struct inode *inode = ccc_object_inode(obj); + struct ll_sb_info *sbi = ll_i2sbi(inode); + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + ENTRY; + + CL_IO_SLICE_CLEAN(cio, cui_cl); + cl_io_slice_add(io, &cio->cui_cl, obj, &vvp_io_ops); + vio->cui_oneshot = 0; + vio->cui_ra_window_set = 0; + result = 0; + if (io->ci_type == CIT_READ || io->ci_type == CIT_WRITE) { + int op; + size_t count; + + count = io->u.ci_rw.crw_count; + op = io->ci_type == CIT_READ ? + LPROC_LL_READ_BYTES : LPROC_LL_WRITE_BYTES; + if (io->ci_type == CIT_WRITE) + down(&ll_i2info(inode)->lli_write_sem); + /* "If nbyte is 0, read() will return 0 and have no other + * results." -- Single Unix Spec */ + if (count == 0) + result = 1; + else { + cio->cui_tot_count = count; + cio->cui_tot_nrsegs = 0; + ll_stats_ops_tally(sbi, op, count); + } + } else if (io->ci_type == CIT_TRUNC) { + /* lockless truncate? 
*/ + ll_stats_ops_tally(sbi, LPROC_LL_TRUNC, 1); + } + RETURN(result); +} + +static struct vvp_io *cl2vvp_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + /* Calling just for the assertion */ + cl2ccc_io(env, slice); + return vvp_env_io(env); +} + diff --git a/lustre/llite/vvp_lock.c b/lustre/llite/vvp_lock.c new file mode 100644 index 0000000..f0c487d --- /dev/null +++ b/lustre/llite/vvp_lock.c @@ -0,0 +1,89 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifndef __KERNEL__ +# error This file is kernel only. +#endif + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Vvp lock functions. + * + */ + +/** + * Estimates lock value for the purpose of managing the lock cache during + * memory shortages. + * + * Locks for memory mapped files are almost infinitely precious, others are + * junk. "Mapped locks" are heavy, but not infinitely heavy, so that they are + * ordered within themselves by weights assigned from other layers. + */ +static unsigned long vvp_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct ccc_object *cob = cl2ccc(slice->cls_obj); + + ENTRY; + RETURN(atomic_read(&cob->cob_mmap_cnt) > 0 ? ~0UL >> 2 : 0); +} + +static const struct cl_lock_operations vvp_lock_ops = { + .clo_fini = ccc_lock_fini, + .clo_enqueue = ccc_lock_enqueue, + .clo_wait = ccc_lock_wait, + .clo_unuse = ccc_lock_unuse, + .clo_fits_into = ccc_lock_fits_into, + .clo_state = ccc_lock_state, + .clo_weigh = vvp_lock_weigh +}; + +int vvp_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + return ccc_lock_init(env, obj, lock, io, &vvp_lock_ops); +} diff --git a/lustre/llite/vvp_object.c b/lustre/llite/vvp_object.c new file mode 100644 index 0000000..412a877 --- /dev/null +++ b/lustre/llite/vvp_object.c @@ -0,0 +1,153 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * cl_object implementation for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifndef __KERNEL__ +# error This file is kernel only. +#endif + +#include + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Object operations. + * + */ + +static int vvp_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct ccc_object *obj = lu2ccc(o); + struct inode *inode = obj->cob_inode; + struct ll_inode_info *lli; + + (*p)(env, cookie, "(%s %i %i) inode: %p ", + list_empty(&obj->cob_pending_list) ? "-" : "+", + obj->cob_transient_pages, atomic_read(&obj->cob_mmap_cnt), inode); + if (inode) { + lli = ll_i2info(inode); + (*p)(env, cookie, "%lu/%u %o %u %i %p "DFID, + inode->i_ino, inode->i_generation, inode->i_mode, + inode->i_nlink, atomic_read(&inode->i_count), + lli->lli_clob, PFID(&lli->lli_fid)); + } + return 0; +} + +static int vvp_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct inode *inode = ccc_object_inode(obj); + + /* + * lov overwrites most of these fields in + * lov_attr_get()->...lov_merge_lvb_kms(), except when inode + * attributes are newer. 
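+ * Only the attributes known at the VFS level are filled in here; + * cat_kms in particular is left to the layers below.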
+ */ + + attr->cat_size = i_size_read(inode); + attr->cat_mtime = LTIME_S(inode->i_mtime); + attr->cat_atime = LTIME_S(inode->i_atime); + attr->cat_ctime = LTIME_S(inode->i_ctime); + attr->cat_blocks = inode->i_blocks; + attr->cat_uid = inode->i_uid; + attr->cat_gid = inode->i_gid; + /* KMS is not known by this layer */ + return 0; /* layers below have to fill in the rest */ +} + +static int vvp_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct inode *inode = ccc_object_inode(obj); + + if (valid & CAT_UID) + inode->i_uid = attr->cat_uid; + if (valid & CAT_GID) + inode->i_gid = attr->cat_gid; + if (0 && valid & CAT_SIZE) + i_size_write(inode, attr->cat_size); + /* not currently necessary */ + if (0 && valid & (CAT_UID|CAT_GID|CAT_SIZE)) + mark_inode_dirty(inode); + return 0; +} + +static const struct cl_object_operations vvp_ops = { + .coo_page_init = vvp_page_init, + .coo_lock_init = vvp_lock_init, + .coo_io_init = vvp_io_init, + .coo_attr_get = vvp_attr_get, + .coo_attr_set = vvp_attr_set, + .coo_conf_set = ccc_conf_set, + .coo_glimpse = ccc_object_glimpse +}; + +static const struct lu_object_operations vvp_lu_obj_ops = { + .loo_object_init = ccc_object_init, + .loo_object_free = ccc_object_free, + .loo_object_print = vvp_object_print +}; + +struct ccc_object *cl_inode2ccc(struct inode *inode) +{ + struct cl_inode_info *lli = cl_i2info(inode); + struct cl_object *obj = lli->lli_clob; + struct lu_object *lu; + + LASSERT(obj != NULL); + lu = lu_object_locate(obj->co_lu.lo_header, &vvp_device_type); + LASSERT(lu != NULL); + return lu2ccc(lu); +} + +struct lu_object *vvp_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + return ccc_object_alloc(env, hdr, dev, &vvp_ops, &vvp_lu_obj_ops); +} + diff --git a/lustre/llite/vvp_page.c b/lustre/llite/vvp_page.c new file mode 100644 index 0000000..3d9ed9c --- /dev/null +++ b/lustre/llite/vvp_page.c @@ -0,0 +1,556 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for VVP layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LLITE + +#ifndef __KERNEL__ +# error This file is kernel only. 
+#endif + +#include +#include + +#include "vvp_internal.h" + +/***************************************************************************** + * + * Page operations. + * + */ + +static void vvp_page_fini_common(struct ccc_page *cp) +{ + cfs_page_t *vmpage = cp->cpg_page; + + LASSERT(vmpage != NULL); + page_cache_release(vmpage); + OBD_SLAB_FREE_PTR(cp, vvp_page_kmem); +} + +static void vvp_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + cfs_page_t *vmpage = cp->cpg_page; + + /* + * vmpage->private was already cleared when page was moved into + * VPG_FREEING state. + */ + LASSERT((struct cl_page *)vmpage->private != slice->cpl_page); + vvp_page_fini_common(cp); +} + +static void vvp_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *_) +{ + struct ccc_page *vpg = cl2ccc_page(slice); + cfs_page_t *vmpage = vpg->cpg_page; + + LASSERT(vmpage != NULL); + lock_page(vmpage); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *_) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + wait_on_page_writeback(vmpage); +} + +static void vvp_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); +} + +static void vvp_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + unlock_page(cl2vm_page(slice)); +} + +static void vvp_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *_) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + struct address_space *mapping = vmpage->mapping; + struct ccc_page *cpg = cl2ccc_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + + if (cpg->cpg_defer_uptodate && !cpg->cpg_ra_used) + ll_ra_stats_inc(mapping, RA_STAT_DISCARDED); + + /* + * truncate_complete_page() calls + * a_ops->invalidatepage()->cl_page_delete()->vvp_page_delete(). + */ + truncate_complete_page(mapping, vmpage); +} + +static int vvp_page_unmap(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *_) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + __u64 offset = vmpage->index << CFS_PAGE_SHIFT; + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + /* + * XXX is it safe to call this with the page lock held? + */ + ll_teardown_mmaps(vmpage->mapping, offset, offset + CFS_PAGE_SIZE); + return 0; +} + +static void vvp_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + struct inode *inode = vmpage->mapping->host; + struct cl_object *obj = slice->cpl_obj; + + LASSERT(PageLocked(vmpage)); + LASSERT((struct cl_page *)vmpage->private == slice->cpl_page); + LASSERT(inode == ccc_object_inode(obj)); + + vvp_write_complete(cl2ccc(obj), cl2ccc_page(slice)); + ClearPagePrivate(vmpage); + vmpage->private = 0; + /* + * Reference from vmpage to cl_page is removed, but the reference back + * is still here. It is removed later in vvp_page_fini(). 
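+ * vvp_page_fini_common() drops that backward reference together + * with the vmpage reference, via page_cache_release().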
+ */ +} + +static void vvp_page_export(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + + LASSERT(vmpage != NULL); + LASSERT(PageLocked(vmpage)); + SetPageUptodate(vmpage); +} + +static int vvp_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return PageLocked(cl2vm_page(slice)) ? -EBUSY : -ENODATA; +} + +static int vvp_page_prep_read(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + ENTRY; + /* Skip the page already marked as PG_uptodate. */ + RETURN(PageUptodate(cl2vm_page(slice)) ? -EALREADY : 0); +} + +static int vvp_page_prep_write(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + int result; + + if (clear_page_dirty_for_io(vmpage)) { + set_page_writeback(vmpage); + result = 0; + } else + result = -EALREADY; + return result; +} + +/** + * Handles page transfer errors at VM level. + * + * This takes inode as a separate argument, because inode on which error is to + * be set can be different from \a vmpage inode in case of direct-io. + */ +static void vvp_vmpage_error(struct inode *inode, cfs_page_t *vmpage, int ioret) +{ + if (ioret == 0) + ClearPageError(vmpage); + else if (ioret != -EINTR) { + SetPageError(vmpage); + if (ioret == -ENOSPC) + set_bit(AS_ENOSPC, &inode->i_mapping->flags); + else + set_bit(AS_EIO, &inode->i_mapping->flags); + } +} + +static void vvp_page_completion_common(const struct lu_env *env, + struct ccc_page *cp, int ioret) +{ + struct cl_page *clp = cp->cpg_cl.cpl_page; + cfs_page_t *vmpage = cp->cpg_page; + struct inode *inode = ccc_object_inode(clp->cp_obj); + struct cl_sync_io *anchor = cp->cpg_sync_io; + + LINVRNT(cl_page_is_vmlocked(env, clp)); + KLASSERT(!PageWriteback(vmpage)); + + vvp_vmpage_error(inode, vmpage, ioret); + + if (anchor != NULL) { + cp->cpg_sync_io = NULL; + cl_sync_io_note(anchor, ioret); + } else if (clp->cp_type == CPT_CACHEABLE) + unlock_page(vmpage); +} + +static void vvp_page_completion_read(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *page = cl_page_top(slice->cpl_page); + struct inode *inode = ccc_object_inode(page->cp_obj); + ENTRY; + + CL_PAGE_HEADER(D_PAGE, env, page, "completing READ with %d\n", ioret); + + if (cp->cpg_defer_uptodate) + ll_ra_count_put(ll_i2sbi(inode), 1); + + if (ioret == 0) { + /* XXX: do we need this for transient pages? */ + if (!cp->cpg_defer_uptodate) + cl_page_export(env, page); + } else + cp->cpg_defer_uptodate = 0; + vvp_page_completion_common(env, cp, ioret); + + EXIT; +} + +static void vvp_page_completion_write_common(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + + if (ioret == 0) { + cp->cpg_write_queued = 0; + /* + * Only if ioret == 0, i.e. the write succeeded, may this page + * be removed from the count of pending writes.
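+ *
+ * The vvp_write_complete() call below undoes the accounting that
+ * was set up when the page was queued for write-back (presumably
+ * by a vvp_write_pending() counterpart; that helper is outside
+ * this hunk). A failed write leaves cpg_write_queued set, so the
+ * page remains accounted as pending.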
+ */ + vvp_write_complete(cl2ccc(slice->cpl_obj), cp); + } + vvp_page_completion_common(env, cp, ioret); +} + +static void vvp_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *pg = slice->cpl_page; + cfs_page_t *vmpage = cp->cpg_page; + + ENTRY; + + LINVRNT(cl_page_is_vmlocked(env, pg)); + LASSERT(PageWriteback(vmpage)); + + CL_PAGE_HEADER(D_PAGE, env, pg, "completing WRITE with %d\n", ioret); + + end_page_writeback(vmpage); + LASSERT(!PageWriteback(vmpage)); + + vvp_page_completion_write_common(env, slice, ioret); + EXIT; +} + +/** + * Implements cl_page_operations::cpo_make_ready() method. + * + * This is called to yank a page from the transfer and to send it out as a + * part of the transfer. This function try-locks the page. If the try-lock + * fails, the page is owned by some concurrent IO and should be skipped (this + * is bad, but hopefully a rare situation, as it usually results in the + * transfer being shorter than possible). + * + * \retval 0 success, page can be placed into transfer + * + * \retval -EAGAIN page is either used by concurrent IO or has been + * truncated. Skip it. + */ +static int vvp_page_make_ready(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + cfs_page_t *vmpage = cl2vm_page(slice); + struct cl_page *pg = slice->cpl_page; + int result; + + result = -EAGAIN; + /* we're trying to write, but the page is locked... come back later */ + if (!TestSetPageLocked(vmpage)) { + if (pg->cp_state == CPS_CACHED) { + /* + * We can cancel IO if page wasn't dirty after all. + */ + clear_page_dirty_for_io(vmpage); + /* + * This actually clears the dirty bit in the radix + * tree. + */ + set_page_writeback(vmpage); + + CL_PAGE_HEADER(D_PAGE, env, pg, "readied\n"); + result = 0; + } else + /* + * Page was concurrently truncated. + */ + LASSERT(pg->cp_state == CPS_FREEING); + } + RETURN(result); +} + +static int vvp_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct ccc_page *vp = cl2ccc_page(slice); + cfs_page_t *vmpage = vp->cpg_page; + + (*printer)(env, cookie, LUSTRE_VVP_NAME"-page@%p(%d:%d:%d) " + "vm@%p ", + vp, vp->cpg_defer_uptodate, vp->cpg_ra_used, + vp->cpg_write_queued, vmpage); + if (vmpage != NULL) { + (*printer)(env, cookie, "%lx %d:%d %lx %lu %slru", + (long)vmpage->flags, page_count(vmpage), + page_mapcount(vmpage), vmpage->private, + page_index(vmpage), + list_empty(&vmpage->lru) ?
"not-" : ""); + } + (*printer)(env, cookie, "\n"); + return 0; +} + +static const struct cl_page_operations vvp_page_ops = { + .cpo_own = vvp_page_own, + .cpo_assume = vvp_page_assume, + .cpo_unassume = vvp_page_unassume, + .cpo_disown = vvp_page_disown, + .cpo_vmpage = ccc_page_vmpage, + .cpo_discard = vvp_page_discard, + .cpo_delete = vvp_page_delete, + .cpo_unmap = vvp_page_unmap, + .cpo_export = vvp_page_export, + .cpo_is_vmlocked = vvp_page_is_vmlocked, + .cpo_fini = vvp_page_fini, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = vvp_page_prep_read, + .cpo_completion = vvp_page_completion_read, + .cpo_make_ready = ccc_fail, + }, + [CRT_WRITE] = { + .cpo_prep = vvp_page_prep_write, + .cpo_completion = vvp_page_completion_write, + .cpo_make_ready = vvp_page_make_ready, + } + } +}; + +static void vvp_transient_page_verify(const struct cl_page *page) +{ + struct inode *inode = ccc_object_inode(page->cp_obj); + + LASSERT(!TRYLOCK_INODE_MUTEX(inode)); + /* LASSERT_SEM_LOCKED(&inode->i_alloc_sem); */ +} + +static void vvp_transient_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_unassume(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_disown(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + vvp_transient_page_verify(slice->cpl_page); +} + +static void vvp_transient_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + struct cl_page *page = slice->cpl_page; + + vvp_transient_page_verify(slice->cpl_page); + + /* + * For transient pages, remove it from the radix tree. + */ + cl_page_delete(env, page); +} + +static int vvp_transient_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct inode *inode = ccc_object_inode(slice->cpl_obj); + int locked; + + locked = !TRYLOCK_INODE_MUTEX(inode); + if (!locked) + UNLOCK_INODE_MUTEX(inode); + return locked ? 
-EBUSY : -ENODATA; +} + +static void +vvp_transient_page_completion_write(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + vvp_transient_page_verify(slice->cpl_page); + vvp_page_completion_write_common(env, slice, ioret); +} + + +static void vvp_transient_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct ccc_page *cp = cl2ccc_page(slice); + struct cl_page *clp = slice->cpl_page; + struct ccc_object *clobj = cl2ccc(clp->cp_obj); + + vvp_page_fini_common(cp); + LASSERT(!TRYLOCK_INODE_MUTEX(clobj->cob_inode)); + clobj->cob_transient_pages--; +} + +static const struct cl_page_operations vvp_transient_page_ops = { + .cpo_own = vvp_transient_page_own, + .cpo_assume = vvp_transient_page_assume, + .cpo_unassume = vvp_transient_page_unassume, + .cpo_disown = vvp_transient_page_disown, + .cpo_discard = vvp_transient_page_discard, + .cpo_vmpage = ccc_page_vmpage, + .cpo_fini = vvp_transient_page_fini, + .cpo_is_vmlocked = vvp_transient_page_is_vmlocked, + .cpo_print = vvp_page_print, + .cpo_is_under_lock = ccc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_page_completion_read, + }, + [CRT_WRITE] = { + .cpo_prep = ccc_transient_page_prep, + .cpo_completion = vvp_transient_page_completion_write, + } + } +}; + +struct cl_page *vvp_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage) +{ + struct ccc_page *cpg; + int result; + + CLOBINVRNT(env, obj, ccc_object_invariant(obj)); + + OBD_SLAB_ALLOC_PTR(cpg, vvp_page_kmem); + if (cpg != NULL) { + cpg->cpg_page = vmpage; + page_cache_get(vmpage); + + CFS_INIT_LIST_HEAD(&cpg->cpg_pending_linkage); + if (page->cp_type == CPT_CACHEABLE) { + SetPagePrivate(vmpage); + vmpage->private = (unsigned long)page; + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_page_ops); + } else { + struct ccc_object *clobj = cl2ccc(obj); + + LASSERT(!TRYLOCK_INODE_MUTEX(clobj->cob_inode)); + cl_page_slice_add(page, &cpg->cpg_cl, obj, + &vvp_transient_page_ops); + clobj->cob_transient_pages++; + } + result = 0; + } else + result = -ENOMEM; + return ERR_PTR(result); +} + diff --git a/lustre/llite/xattr.c b/lustre/llite/xattr.c index e2835d9..d66a025 100644 --- a/lustre/llite/xattr.c +++ b/lustre/llite/xattr.c @@ -238,7 +238,9 @@ int ll_setxattr(struct dentry *dentry, const char *name, } return rc; - } + + } else if (strcmp(name, "trusted.lma") == 0) /* b17288: ignore common_ea */ + return 0; return ll_setxattr_common(inode, name, value, size, flags, OBD_MD_FLXATTR); diff --git a/lustre/lmv/lmv_fld.c b/lustre/lmv/lmv_fld.c index c2b9757..8f4f94c 100644 --- a/lustre/lmv/lmv_fld.c +++ b/lustre/lmv/lmv_fld.c @@ -75,11 +75,11 @@ int lmv_fld_lookup(struct lmv_obd *lmv, RETURN(rc); } - CDEBUG(D_INODE, "FLD lookup got mds #"LPU64" for fid="DFID"\n", + CDEBUG(D_INODE, "FLD lookup got mds #%x for fid="DFID"\n", *mds, PFID(fid)); if (*mds >= lmv->desc.ld_tgt_count) { - CERROR("FLD lookup got invalid mds #"LPU64" (max: %d) " + CERROR("FLD lookup got invalid mds #%x (max: %x) " "for fid="DFID"\n", *mds, lmv->desc.ld_tgt_count, PFID(fid)); rc = -EINVAL; diff --git a/lustre/lmv/lmv_intent.c b/lustre/lmv/lmv_intent.c index 7e35465..cb1273b 100644 --- a/lustre/lmv/lmv_intent.c +++ b/lustre/lmv/lmv_intent.c @@ -768,7 +768,7 @@ int lmv_allocate_slaves(struct obd_device *obd, struct lu_fid *pid, } CDEBUG(D_INODE, "Allocate new fid "DFID" for slave " - "obj -> mds #"LPU64"\n", PFID(fid), mds); + "obj -> mds #%x\n", PFID(fid), mds); 
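+/*
+ * Every "#"LPU64 in these mds-number messages is switched to %x in
+ * this patch: the mds index is apparently a 32-bit quantity in this
+ * tree, so the old 64-bit specifier made the message format fetch
+ * eight bytes of varargs for a four-byte argument. The same
+ * correction appears in lmv_fld_lookup() above and in the
+ * CREATE/"Forward to mds" messages below.
+ */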
RETURN(rc); } diff --git a/lustre/lmv/lmv_internal.h b/lustre/lmv/lmv_internal.h index 818d53d..2311f4e 100644 --- a/lustre/lmv/lmv_internal.h +++ b/lustre/lmv/lmv_internal.h @@ -49,15 +49,15 @@ ((it) ? ldlm_it2str((it)->it_op) : "0") struct lmv_stripe { - /** - * Dir stripe fid. + /** + * Dir stripe fid. */ struct lu_fid ls_fid; - /** - * Cached home mds number for @li_fid. + /** + * Cached home mds number for @li_fid. */ mdsno_t ls_mds; - /** + /** * Stripe object size. */ unsigned long ls_size; @@ -78,15 +78,15 @@ struct lmv_object { * Sema for protecting fields. */ struct semaphore lo_guard; - /** + /** * Object state like O_FREEING. */ int lo_state; - /** - * Object ref counter. + /** + * Object ref counter. */ atomic_t lo_count; - /** + /** * Object master fid. */ struct lu_fid lo_fid; @@ -94,15 +94,15 @@ struct lmv_object { * Object hash type to find stripe by name. */ __u32 lo_hashtype; - /** - * Number of stripes. + /** + * Number of stripes. */ int lo_objcount; - /** - * Array of sub-objs. + /** + * Array of sub-objs. */ struct lmv_stripe *lo_stripes; - /** + /** * Pointer to LMV obd. */ struct obd_device *lo_obd; @@ -233,7 +233,7 @@ lmv_find_target(struct lmv_obd *lmv, const struct lu_fid *fid) { mdsno_t mds; int rc; - + rc = lmv_fld_lookup(lmv, fid, &mds); if (rc) return ERR_PTR(rc); diff --git a/lustre/lmv/lmv_obd.c b/lustre/lmv/lmv_obd.c index 1c8f940..7543a8c 100644 --- a/lustre/lmv/lmv_obd.c +++ b/lustre/lmv/lmv_obd.c @@ -127,7 +127,7 @@ static int lmv_set_mdc_active(struct lmv_obd *lmv, struct obd_uuid *uuid, GOTO(out_lmv_lock, rc); } - CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, + CDEBUG(D_INFO, "Marking OBD %p %sactive\n", obd, activate ? "" : "in"); lmv_activate_target(lmv, tgt, activate); EXIT; @@ -199,7 +199,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched, } else if (ev == OBD_NOTIFY_OCD) { conn_data = &watched->u.cli.cl_import->imp_connect_data; - /* + /* * Set connect data to desired target, update exp_connect_flags. */ rc = lmv_set_mdc_data(lmv, uuid, conn_data); @@ -219,14 +219,14 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched, } #if 0 else if (ev == OBD_NOTIFY_DISCON) { - /* - * For disconnect event, flush fld cache for failout MDS case. + /* + * For disconnect event, flush fld cache for failout MDS case. */ fld_client_flush(&lmv->lmv_fld); } #endif - /* - * Pass the notification up the chain. + /* + * Pass the notification up the chain. */ if (obd->obd_observer) rc = obd_notify(obd->obd_observer, watched, ev, data); @@ -236,7 +236,7 @@ static int lmv_notify(struct obd_device *obd, struct obd_device *watched, /** * This is fake connect function. Its purpose is to initialize lmv and say - * caller that everything is okay. Real connection will be performed later. + * caller that everything is okay. Real connection will be performed later. */ static int lmv_connect(const struct lu_env *env, struct lustre_handle *conn, struct obd_device *obd, @@ -259,9 +259,9 @@ static int lmv_connect(const struct lu_env *env, exp = class_conn2export(conn); - /* + /* * We don't want to actually do the underlying connections more than - * once, so keep track. + * once, so keep track. 
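+ * The counting pairs with lmv_disconnect(): only the first connect
+ * performs the setup, and only the last disconnect tears the
+ * targets down, roughly:
+ *
+ *      lmv->refcount++;                 (below)
+ *      if (lmv->refcount > 1) ...       (already connected, return)
+ *      ...
+ *      lmv->refcount--;                 (in lmv_disconnect())
+ *      if (lmv->refcount != 0) ...      (still in use, keep targets)
+ *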
*/ lmv->refcount++; if (lmv->refcount > 1) { @@ -286,11 +286,11 @@ static int lmv_connect(const struct lu_env *env, } #endif - /* + /* * All real clients should perform actual connection right away, because * it is possible, that LMV will not have opportunity to connect targets * and MDC stuff will be called directly, for instance while reading - * ../mdc/../kbytesfree procfs file, etc. + * ../mdc/../kbytesfree procfs file, etc. */ if (data->ocd_connect_flags & OBD_CONNECT_REAL) rc = lmv_check_connect(obd); @@ -416,7 +416,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) mdc_exp = class_conn2export(&conn); - /* + /* * Init fid sequence client for this mdc and add new fld target. */ rc = obd_fid_init(mdc_exp); @@ -440,7 +440,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) } if (obd->obd_observer) { - /* + /* * Tell the observer about the new target. */ rc = obd_notify(obd->obd_observer, mdc_exp->exp_obd, @@ -455,7 +455,7 @@ int lmv_connect_mdc(struct obd_device *obd, struct lmv_tgt_desc *tgt) tgt->ltd_exp = mdc_exp; lmv->desc.ld_active_tgt_count++; - /* + /* * Copy connect data, it may be used later. */ lmv->datas[tgt->ltd_idx] = *mdc_data; @@ -684,8 +684,8 @@ static int lmv_disconnect(struct obd_export *exp) if (!lmv->tgts) goto out_local; - /* - * Only disconnect the underlying layers on the final disconnect. + /* + * Only disconnect the underlying layers on the final disconnect. */ lmv->refcount--; if (lmv->refcount != 0) @@ -725,12 +725,13 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, { struct obd_device *obddev = class_exp2obd(exp); struct lmv_obd *lmv = &obddev->u.lmv; - int i; + int i = 0; int rc = 0; int set = 0; + int count = lmv->desc.ld_tgt_count; ENTRY; - if (lmv->desc.ld_tgt_count == 0) + if (count == 0) RETURN(-ENOTTY); switch (cmd) { @@ -743,7 +744,7 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, memcpy(&index, data->ioc_inlbuf2, sizeof(__u32)); LASSERT(data->ioc_plen1 == sizeof(struct obd_statfs)); - if ((index >= lmv->desc.ld_tgt_count)) + if ((index >= count)) RETURN(-ENODEV); if (!lmv->tgts[index].ltd_active) @@ -764,8 +765,54 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, RETURN(-EFAULT); break; } + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lmv_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_MDTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = &lmv->tgts[qctl->qc_idx]; + if (!tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = &lmv->tgts[i]; + if (!obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_MDTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } default : { - for (i = 0; i < lmv->desc.ld_tgt_count; i++) { + for (i = 0; i < count; i++) { int err; if (lmv->tgts[i].ltd_exp == NULL) @@ -773,7 +820,9 @@ static int lmv_iocontrol(unsigned int cmd, struct obd_export *exp, err = obd_iocontrol(cmd, lmv->tgts[i].ltd_exp, len, karg, uarg); - if (err) { + if (err == -ENODATA && cmd == 
OBD_IOC_POLL_QUOTACHECK) { + RETURN(err); + } else if (err) { if (lmv->tgts[i].ltd_active) { CERROR("error: iocontrol MDC %s on MDT" "idx %d cmd %x: err = %d\n", @@ -807,7 +856,7 @@ static int lmv_nid_policy(struct lmv_obd *lmv) { struct obd_import *imp; __u32 id; - + /* * XXX: To get nid we assume that underlying obd device is mdc. */ @@ -836,7 +885,7 @@ static int lmv_choose_mds(struct lmv_obd *lmv, struct md_op_data *op_data, } /** - * This is _inode_ placement policy function (not name). + * This is _inode_ placement policy function (not name). */ static int lmv_placement_policy(struct obd_device *obd, struct md_op_data *op_data, @@ -913,8 +962,8 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, ENTRY; tgt = lmv_get_target(lmv, mds); - - /* + + /* * New seq alloc and FLD setup should be atomic. Otherwise we may find * on server that seq in new allocated fid is not yet known. */ @@ -923,26 +972,13 @@ int __lmv_fid_alloc(struct lmv_obd *lmv, struct lu_fid *fid, if (!tgt->ltd_active) GOTO(out, rc = -ENODEV); - /* - * Asking underlaying tgt layer to allocate new fid. + /* + * Asking underlaying tgt layer to allocate new fid. */ rc = obd_fid_alloc(tgt->ltd_exp, fid, NULL); if (rc > 0) { LASSERT(fid_is_sane(fid)); - - /* - * Client switches to new sequence, setup FLD. - */ - rc = fld_client_create(&lmv->lmv_fld, fid_seq(fid), - mds, NULL); - if (rc) { - /* - * Delete just allocated fid sequence in case - * of fail back. - */ - CERROR("Can't create fld entry, rc %d\n", rc); - obd_fid_delete(tgt->ltd_exp, NULL); - } + rc = 0; } EXIT; @@ -1218,7 +1254,7 @@ static int lmv_setxattr(struct obd_export *exp, const struct lu_fid *fid, if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input, + rc = md_setxattr(tgt->ltd_exp, fid, oc, valid, name, input, input_size, output_size, flags, suppgid, request); @@ -1280,7 +1316,7 @@ static int lmv_getattr(struct obd_export *exp, const struct lu_fid *fid, continue; } - /* + /* * Skip master object. */ if (lu_fid_eq(&obj->lo_fid, &obj->lo_stripes[i].ls_fid)) @@ -1369,8 +1405,8 @@ int lmv_handle_split(struct obd_export *exp, const struct lu_fid *fid) if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); - /* - * Time to update mea of parent fid. + /* + * Time to update mea of parent fid. */ rc = md_getattr(tgt->ltd_exp, fid, NULL, valid, mealen, &req); if (rc) { @@ -1449,7 +1485,7 @@ repeat: else if (rc) RETURN(rc); - CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #"LPU64"\n", + CDEBUG(D_INODE, "CREATE '%*s' on "DFID" -> mds #%x\n", op_data->op_namelen, op_data->op_name, PFID(&op_data->op_fid1), op_data->op_mds); @@ -1560,8 +1596,8 @@ cleanup: OBD_FREE_PTR(op_data2); if (rc != 0) { - /* - * Drop all taken locks. + /* + * Drop all taken locks. */ while (--i >= 0) { if (lockh[i].cookie) @@ -1599,8 +1635,8 @@ lmv_enqueue_remote(struct obd_export *exp, struct ldlm_enqueue_info *einfo, CDEBUG(D_INODE, "REMOTE_ENQUEUE '%s' on "DFID" -> "DFID"\n", LL_IT2STR(it), PFID(&op_data->op_fid1), PFID(&body->fid1)); - /* - * We got LOOKUP lock, but we really need attrs. + /* + * We got LOOKUP lock, but we really need attrs. 
*/ pmode = it->d.lustre.it_lock_mode; LASSERT(pmode != 0); @@ -1678,7 +1714,7 @@ lmv_enqueue(struct obd_export *exp, struct ldlm_enqueue_info *einfo, CDEBUG(D_INODE, "ENQUEUE '%s' on "DFID" -> mds #%d\n", LL_IT2STR(it), PFID(&op_data->op_fid1), tgt->ltd_idx); - + rc = md_enqueue(tgt->ltd_exp, einfo, it, op_data, lockh, lmm, lmmsize, req, extra_lock_flags); @@ -1803,7 +1839,7 @@ static int lmv_early_cancel_slaves(struct obd_export *exp, obj = lmv_object_find(obd, op_fid); if (obj == NULL) RETURN(-EALREADY); - + policy.l_inodebits.bits = bits; for (i = 0; i < obj->lo_objcount; i++) { tgt = lmv_get_target(lmv, obj->lo_stripes[i].ls_mds); @@ -1811,12 +1847,12 @@ static int lmv_early_cancel_slaves(struct obd_export *exp, if (op_tgt != tgt->ltd_idx) { CDEBUG(D_INODE, "EARLY_CANCEL slave "DFID" -> mds #%d\n", PFID(st_fid), tgt->ltd_idx); - rc = md_cancel_unused(tgt->ltd_exp, st_fid, &policy, + rc = md_cancel_unused(tgt->ltd_exp, st_fid, &policy, mode, LDLM_FL_ASYNC, NULL); if (rc) GOTO(out_put_obj, rc); } else { - CDEBUG(D_INODE, + CDEBUG(D_INODE, "EARLY_CANCEL skip operation target %d on "DFID"\n", op_tgt, PFID(st_fid)); /* @@ -1865,7 +1901,7 @@ static int lmv_early_cancel(struct obd_export *exp, struct md_op_data *op_data, rc = md_cancel_unused(tgt->ltd_exp, fid, &policy, mode, LDLM_FL_ASYNC, NULL); } else { - CDEBUG(D_INODE, + CDEBUG(D_INODE, "EARLY_CANCEL skip operation target %d on "DFID"\n", op_tgt, PFID(fid)); op_data->op_flags |= flag; @@ -1921,7 +1957,7 @@ repeat: RETURN(rc); } - CDEBUG(D_INODE, "Forward to mds #"LPU64" ("DFID")\n", + CDEBUG(D_INODE, "Forward to mds #%x ("DFID")\n", mds, PFID(&op_data->op_fid1)); op_data->op_fsuid = current->fsuid; @@ -1929,8 +1965,8 @@ repeat: op_data->op_cap = cfs_curproc_cap_pack(); tgt = lmv_get_target(lmv, mds); - /* - * Cancel UPDATE lock on child (fid1). + /* + * Cancel UPDATE lock on child (fid1). */ op_data->op_flags |= MF_MDC_CANCEL_FID2; rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, @@ -2025,34 +2061,34 @@ repeat: src_tgt = lmv_get_target(lmv, mds1); tgt_tgt = lmv_get_target(lmv, mds2); - /* + /* * LOOKUP lock on src child (fid3) should also be cancelled for - * src_tgt in mdc_rename. + * src_tgt in mdc_rename. */ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - /* + /* * Cancel UPDATE locks on tgt parent (fid2), tgt_tgt is its - * own target. + * own target. */ - rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, - LCK_EX, MDS_INODELOCK_UPDATE, + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + LCK_EX, MDS_INODELOCK_UPDATE, MF_MDC_CANCEL_FID2); - /* + /* * Cancel LOOKUP locks on tgt child (fid4) for parent tgt_tgt. */ if (rc == 0) { - rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, LCK_EX, MDS_INODELOCK_LOOKUP, MF_MDC_CANCEL_FID4); } - /* - * Cancel all the locks on tgt child (fid4). + /* + * Cancel all the locks on tgt child (fid4). 
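+ *
+ * Taken together, the cancellation plan for a cross-MDS rename is
+ * (fid1 = src parent, fid2 = tgt parent, fid3 = src child,
+ * fid4 = tgt child):
+ *
+ *      fid1, fid3   handled inside mdc_rename(), requested via
+ *                   MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3
+ *      fid2 UPDATE  lmv_early_cancel(..., MF_MDC_CANCEL_FID2)
+ *      fid4 LOOKUP  lmv_early_cancel(..., MF_MDC_CANCEL_FID4)
+ *      fid4 FULL    lmv_early_cancel(..., MF_MDC_CANCEL_FID4)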
*/ if (rc == 0) - rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, + rc = lmv_early_cancel(exp, op_data, src_tgt->ltd_idx, LCK_EX, MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID4); @@ -2062,7 +2098,7 @@ repeat: if (rc == -ERESTART) { LASSERT(*request != NULL); - DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, + DEBUG_REQ(D_WARNING|D_RPCTRACE, *request, "Got -ERESTART during rename!\n"); ptlrpc_req_finished(*request); *request = NULL; @@ -2164,7 +2200,7 @@ static int lmv_sync(struct obd_export *exp, const struct lu_fid *fid, /** * Main purpose of LMV blocking ast is to remove split directory LMV - * presentation object (struct lmv_object) attached to the lock being revoked. + * presentation object (struct lmv_object) attached to the lock being revoked. */ int lmv_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, void *data, int flag) @@ -2184,7 +2220,7 @@ int lmv_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, } break; case LDLM_CB_CANCELING: - /* + /* * Time to drop cached attrs for split directory object */ obj = lock->l_ast_data; @@ -2312,7 +2348,7 @@ static int lmv_readpage(struct obd_export *exp, const struct lu_fid *fid, hash_adj += rank * seg_size; CDEBUG(D_INODE, "Readpage hash adjustment: %x "LPX64" " - LPX64"/%x -> "LPX64"/%x\n", rank, hash_adj, + LPX64"/%x -> "LPX64"/%x\n", rank, hash_adj, offset, tgt0_idx, offset + hash_adj, tgt_idx); offset = (offset + hash_adj) & MAX_HASH_SIZE; @@ -2411,17 +2447,17 @@ repeat: op_data->op_fsgid = current->fsgid; op_data->op_cap = cfs_curproc_cap_pack(); - /* + /* * If child's fid is given, cancel unused locks for it if it is from * another export than parent. * - * LOOKUP lock for child (fid3) should also be cancelled on parent - * tgt_tgt in mdc_unlink(). + * LOOKUP lock for child (fid3) should also be cancelled on parent + * tgt_tgt in mdc_unlink(). */ op_data->op_flags |= MF_MDC_CANCEL_FID1 | MF_MDC_CANCEL_FID3; - /* - * Cancel FULL locks on child (fid3). + /* + * Cancel FULL locks on child (fid3). */ rc = lmv_early_cancel(exp, op_data, tgt->ltd_idx, LCK_EX, MDS_INODELOCK_FULL, MF_MDC_CANCEL_FID3); @@ -2468,7 +2504,7 @@ static int lmv_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) } static int lmv_get_info(struct obd_export *exp, __u32 keylen, - void *key, __u32 *vallen, void *val, + void *key, __u32 *vallen, void *val, struct lov_stripe_md *lsm) { struct obd_device *obd; @@ -2496,8 +2532,8 @@ static int lmv_get_info(struct obd_export *exp, __u32 keylen, for (i = 0, tgts = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgts++) { - /* - * All tgts should be connected when this gets called. + /* + * All tgts should be connected when this gets called. */ if (!tgts || !tgts->ltd_exp) { CERROR("target not setup?\n"); @@ -2514,9 +2550,9 @@ static int lmv_get_info(struct obd_export *exp, __u32 keylen, if (rc) RETURN(rc); - /* + /* * Forwarding this request to first MDS, it should know LOV - * desc. + * desc. */ rc = obd_get_info(lmv->tgts[0].ltd_exp, keylen, key, vallen, val, NULL); @@ -2657,8 +2693,8 @@ int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, { magic = le32_to_cpu(mea->mea_magic); } else { - /* - * Old mea is not handled here. + /* + * Old mea is not handled here. 
*/ CERROR("Old not supportable EA is found\n"); LBUG(); @@ -2676,7 +2712,7 @@ int lmv_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, } static int lmv_cancel_unused(struct obd_export *exp, const struct lu_fid *fid, - ldlm_policy_data_t *policy, ldlm_mode_t mode, + ldlm_policy_data_t *policy, ldlm_mode_t mode, int flags, void *opaque) { struct obd_device *obd = exp->exp_obd; @@ -2723,11 +2759,11 @@ ldlm_mode_t lmv_lock_match(struct obd_export *exp, int flags, CDEBUG(D_INODE, "Lock match for "DFID"\n", PFID(fid)); - /* + /* * With CMD every object can have two locks in different namespaces: * lookup lock in space of mds storing direntry and update/open lock in * space of mds storing inode. Thus we check all targets, not only that - * one fid was created in. + * one fid was created in. */ for (i = 0; i < lmv->desc.ld_tgt_count; i++) { rc = md_lock_match(lmv->tgts[i].ltd_exp, flags, fid, @@ -2837,6 +2873,18 @@ static int lmv_renew_capa(struct obd_export *exp, struct obd_capa *oc, RETURN(rc); } +int lmv_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa **oc) +{ + struct obd_device *obd = exp->exp_obd; + struct lmv_obd *lmv = &obd->u.lmv; + int rc; + + ENTRY; + rc = md_unpack_capa(lmv->tgts[0].ltd_exp, req, field, oc); + RETURN(rc); +} + int lmv_intent_getattr_async(struct obd_export *exp, struct md_enqueue_info *minfo, struct ldlm_enqueue_info *einfo) @@ -2862,10 +2910,10 @@ int lmv_intent_getattr_async(struct obd_export *exp, (char *)op_data->op_name, op_data->op_namelen); op_data->op_fid1 = obj->lo_stripes[sidx].ls_fid; - tgt = lmv_get_target(lmv, + tgt = lmv_get_target(lmv, obj->lo_stripes[sidx].ls_mds); CDEBUG(D_INODE, - "Choose slave dir ("DFID") -> mds #%d\n", + "Choose slave dir ("DFID") -> mds #%d\n", PFID(&op_data->op_fid1), tgt->ltd_idx); } else { tgt = lmv_find_target(lmv, &op_data->op_fid1); @@ -2883,7 +2931,7 @@ int lmv_intent_getattr_async(struct obd_export *exp, if (minfo->mi_it.it_op & IT_LOOKUP) minfo->mi_it.it_op = IT_GETATTR; } - + if (IS_ERR(tgt)) RETURN(PTR_ERR(tgt)); @@ -2960,11 +3008,15 @@ struct md_ops lmv_md_ops = { .m_set_open_replay_data = lmv_set_open_replay_data, .m_clear_open_replay_data = lmv_clear_open_replay_data, .m_renew_capa = lmv_renew_capa, + .m_unpack_capa = lmv_unpack_capa, .m_get_remote_perm = lmv_get_remote_perm, .m_intent_getattr_async = lmv_intent_getattr_async, .m_revalidate_lock = lmv_revalidate_lock }; +static quota_interface_t *quota_interface; +extern quota_interface_t lmv_quota_interface; + int __init lmv_init(void) { struct lprocfs_static_vars lvars; @@ -2979,10 +3031,18 @@ int __init lmv_init(void) } lprocfs_lmv_init_vars(&lvars); + + request_module("lquota"); + quota_interface = PORTAL_SYMBOL_GET(lmv_quota_interface); + init_obd_quota_ops(quota_interface, &lmv_obd_ops); + rc = class_register_type(&lmv_obd_ops, &lmv_md_ops, lvars.module_vars, LUSTRE_LMV_NAME, NULL); - if (rc) + if (rc) { + if (quota_interface) + PORTAL_SYMBOL_PUT(lmv_quota_interface); cfs_mem_cache_destroy(lmv_object_cache); + } return rc; } @@ -2990,6 +3050,9 @@ int __init lmv_init(void) #ifdef __KERNEL__ static void lmv_exit(void) { + if (quota_interface) + PORTAL_SYMBOL_PUT(lmv_quota_interface); + class_unregister_type(LUSTRE_LMV_NAME); LASSERTF(atomic_read(&lmv_object_count) == 0, diff --git a/lustre/lmv/lproc_lmv.c b/lustre/lmv/lproc_lmv.c index e880d23..555f4dc 100644 --- a/lustre/lmv/lproc_lmv.c +++ b/lustre/lmv/lproc_lmv.c @@ -90,7 +90,7 @@ static int lmv_rd_placement(char *page, char 
**start, off_t off, int count, LASSERT(dev != NULL); lmv = &dev->u.lmv; *eof = 1; - return snprintf(page, count, "%s\n", + return snprintf(page, count, "%s\n", placement_policy2name(lmv->lmv_placement)); } @@ -182,7 +182,7 @@ static int lmv_tgt_seq_show(struct seq_file *p, void *v) struct obd_device *dev = p->private; struct lmv_obd *lmv = &dev->u.lmv; int idx = tgt - &(lmv->tgts[0]); - + return seq_printf(p, "%d: %s %sACTIVE\n", idx, tgt->ltd_uuid.uuid, tgt->ltd_active ? "" : "IN"); } @@ -199,7 +199,7 @@ static int lmv_target_seq_open(struct inode *inode, struct file *file) struct proc_dir_entry *dp = PDE(inode); struct seq_file *seq; int rc; - + rc = seq_open(file, &lmv_tgt_sops); if (rc) return rc; diff --git a/lustre/lov/Makefile.in b/lustre/lov/Makefile.in index 0f223f8..5a2aad7 100644 --- a/lustre/lov/Makefile.in +++ b/lustre/lov/Makefile.in @@ -1,4 +1,4 @@ MODULES := lov -lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_pool.o +lov-objs := lov_log.o lov_obd.o lov_pack.o lproc_lov.o lov_offset.o lov_merge.o lov_request.o lov_qos.o lov_ea.o lov_dev.o lov_object.o lov_page.o lov_lock.o lov_io.o lovsub_dev.o lovsub_object.o lovsub_page.o lovsub_lock.o lovsub_io.o lov_pool.o @INCLUDE_RULES@ diff --git a/lustre/lov/autoMakefile.am b/lustre/lov/autoMakefile.am index c65e095..e18070c 100644 --- a/lustre/lov/autoMakefile.am +++ b/lustre/lov/autoMakefile.am @@ -36,7 +36,7 @@ if LIBLUSTRE noinst_LIBRARIES = liblov.a -liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h +liblov_a_SOURCES = lov_log.c lov_pool.c lov_obd.c lov_pack.c lov_request.c lov_offset.c lov_qos.c lov_merge.c lov_ea.c lov_internal.h lov_cl_internal.h lov_dev.c lov_object.c lov_page.c lov_lock.c lov_io.c lovsub_dev.c lovsub_object.c lovsub_page.c lovsub_lock.c lovsub_io.c liblov_a_CPPFLAGS = $(LLCPPFLAGS) liblov_a_CFLAGS = $(LLCFLAGS) endif @@ -51,12 +51,22 @@ macos_PROGRAMS := lov lov_SOURCES := \ lov_log.c \ - lov_pool.c \ + lov_pool.c \ lov_obd.c \ lov_pack.c \ lov_request.c \ lov_merge.c \ lov_qos.c \ + lov_dev.c \ + lov_object.c \ + lov_page.c \ + lov_lock.c \ + lov_io.c \ + lovsub_dev.c \ + lovsub_object.c \ + lovsub_page.c \ + lovsub_lock.c \ + lovsub_io.c \ lov_offset.c \ lov_internal.h @@ -74,5 +84,5 @@ endif # MODULES install-data-hook: $(install_data_hook) -DIST_SOURCES = $(lov-objs:.o=.c) lov_internal.h +DIST_SOURCES = $(lov-objs:.o=.c) lov_internal.h lov_cl_internal.h MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ diff --git a/lustre/lov/lov_cl_internal.h b/lustre/lov/lov_cl_internal.h new file mode 100644 index 0000000..90f53fb --- /dev/null +++ b/lustre/lov/lov_cl_internal.h @@ -0,0 +1,798 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). 
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of LOV layer. + * + * Author: Nikita Danilov + */ + +#ifndef LOV_CL_INTERNAL_H +#define LOV_CL_INTERNAL_H + +#ifdef __KERNEL__ +# include +#else +# include +#endif + +#include +#include +#include "lov_internal.h" + +/** \addtogroup lov lov @{ */ + +/** \defgroup lov lov + * Logical object volume layer. This layer implements data striping (raid0). + * + * At the lov layer top-entity (object, page, lock, io) is connected to one or + * more sub-entities: top-object, representing a file is connected to a set of + * sub-objects, each representing a stripe, file-level top-lock is connected + * to a set of per-stripe sub-locks, top-page is connected to a (single) + * sub-page, and a top-level IO is connected to a set of (potentially + * concurrent) sub-IO's. + * + * Sub-object, sub-page, and sub-io have well-defined top-object and top-page + * respectively, while a single sub-lock can be part of multiple top-locks. + * + * Reference counting models are different for different types of entities: + * + * - top-object keeps a reference to its sub-objects, and destroys them + * when it is destroyed. + * + * - top-page keeps a reference to its sub-page, and destroys it when it + * is destroyed. + * + * - sub-lock keep a reference to its top-locks. Top-lock keeps a + * reference (and a hold, see cl_lock_hold()) on its sub-locks when it + * actively using them (that is, in cl_lock_state::CLS_QUEUING, + * cl_lock_state::CLS_ENQUEUED, cl_lock_state::CLS_HELD states). When + * moving into cl_lock_state::CLS_CACHED state, top-lock releases a + * hold. From this moment top-lock has only a 'weak' reference to its + * sub-locks. This reference is protected by top-lock + * cl_lock::cll_guard, and will be automatically cleared by the sub-lock + * when the latter is destroyed. When a sub-lock is canceled, a + * reference to it is removed from the top-lock array, and top-lock is + * moved into CLS_NEW state. It is guaranteed that all sub-locks exits + * while their top-lock is in CLS_HELD or CLS_CACHED states. + * + * - IO's are not reference counted. + * + * To implement a connection between top and sub entities, lov layer is split + * into two pieces: lov ("upper half"), and lovsub ("bottom half"), both + * implementing full set of cl-interfaces. For example, top-object has clu and + * lov layers, and it's sub-object has lovsub and osc layers. lovsub layer is + * used to track child-parent relationship. + * + * @{ + */ + +struct lovsub_device; +struct lovsub_object; +struct lovsub_lock; + +enum lov_device_flags { + LOV_DEV_INITIALIZED = 1 << 0 +}; + +/* + * Upper half. + */ + +/** + * Resources that are used in memory-cleaning path, and whose allocation + * cannot fail even when memory is tight. 
They are preallocated in sufficient + * quantities in lov_device::ld_emerg[], and access to them is serialized + * lov_device::ld_mutex. + */ +struct lov_device_emerg { + /** + * Page list used to submit IO when memory is in pressure. + */ + struct cl_page_list emrg_page_list; + /** + * sub-io's shared by all threads accessing this device when memory is + * too low to allocate sub-io's dynamically. + */ + struct cl_io emrg_subio; + /** + * Environments used by sub-io's in + * lov_device_emerg::emrg_subio. + */ + struct lu_env *emrg_env; + /** + * Refchecks for lov_device_emerg::emrg_env. + * + * \see cl_env_get() + */ + int emrg_refcheck; +}; + +struct lov_device { + /* + * XXX Locking of lov-private data is missing. + */ + struct cl_device ld_cl; + struct lov_obd *ld_lov; + /** size of lov_device::ld_target[] array */ + __u32 ld_target_nr; + struct lovsub_device **ld_target; + __u32 ld_flags; + + /** Emergency resources used in memory-cleansing paths. */ + struct lov_device_emerg **ld_emrg; + /** + * Serializes access to lov_device::ld_emrg in low-memory + * conditions. + */ + struct mutex ld_mutex; +}; + +/** + * Layout type. + */ +enum lov_layout_type { + /** empty file without body */ + LLT_EMPTY, + /** striped file */ + LLT_RAID0, + /** join file */ + LLT_JOIN, + LLT_NR +}; + +/** + * lov-specific file state. + * + * lov object has particular layout type, determining how top-object is built + * on top of sub-objects. Layout type can change dynamically. When this + * happens, lov_object::lo_type_guard semaphore is taken in exclusive mode, + * all state pertaining to the old layout type is destroyed, and new state is + * constructed. All object methods take said semaphore in the shared mode, + * providing serialization against transition between layout types. + * + * To avoid multiple `if' or `switch' statements, selecting behavior for the + * current layout type, object methods perform double-dispatch, invoking + * function corresponding to the current layout type. + */ +struct lov_object { + struct cl_object lo_cl; + /** + * Serializes object operations with transitions between layout types. + * + * This semaphore is taken in shared mode by all object methods, and + * is taken in exclusive mode when object type is changed. + * + * \see lov_object::lo_type + */ + struct rw_semaphore lo_type_guard; + /** + * Type of an object. Protected by lov_object::lo_type_guard. + */ + enum lov_layout_type lo_type; + + union lov_layout_state { + struct lov_layout_raid0 { + unsigned lo_nr; + struct lov_stripe_md *lo_lsm; + /** + * Array of sub-objects. Allocated when top-object is + * created (lov_init_raid0()). + * + * Top-object is a strict master of its sub-objects: + * it is created before them, and outlives its + * children (this later is necessary so that basic + * functions like cl_object_top() always + * work). Top-object keeps a reference on every + * sub-object. + * + * When top-object is destroyed (lov_delete_raid0()) + * it releases its reference to a sub-object and waits + * until the latter is finally destroyed. + */ + struct lovsub_object **lo_sub; + /** + * When this is true, lov_object::lo_attr contains + * valid up to date attributes for a top-level + * object. This field is reset to 0 when attributes of + * any sub-object change. + */ + int lo_attr_valid; + /** + * Cached object attribute, built from sub-object + * attributes. 
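+ *
+ * The intended consumer pattern is roughly (a sketch, not code
+ * from this patch):
+ *
+ *      if (!raid0->lo_attr_valid) {
+ *              ... recompute from all sub-object attributes ...
+ *              raid0->lo_attr_valid = 1;
+ *      }
+ *      *attr = raid0->lo_attr;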
+ */ + struct cl_attr lo_attr; + } raid0; + struct lov_layout_state_empty { + } empty; + struct lov_layout_state_join { + } join; + } u; + /** + * Thread that acquired lov_object::lo_type_guard in an exclusive + * mode. + */ + cfs_task_t *lo_owner; +}; + +/** + * Flags that top-lock can set on each of its sub-locks. + */ +enum lov_sub_flags { + /** Top-lock acquired a hold (cl_lock_hold()) on a sub-lock. */ + LSF_HELD = 1 << 0 +}; + +/** + * State lov_lock keeps for each sub-lock. + */ +struct lov_lock_sub { + /** sub-lock itself */ + struct lovsub_lock *sub_lock; + /** An array of per-sub-lock flags, taken from enum lov_sub_flags */ + unsigned sub_flags; + int sub_stripe; + struct cl_lock_descr sub_descr; + struct cl_lock_descr sub_got; +}; + +/** + * lov-specific lock state. + */ +struct lov_lock { + struct cl_lock_slice lls_cl; + /** Number of sub-locks in this lock */ + int lls_nr; + /** + * Number of existing sub-locks. + */ + unsigned lls_nr_filled; + /** + * Set when sub-lock was canceled, while top-lock was being + * unlocked. + */ + int lls_unuse_race; + /** + * An array of sub-locks + * + * There are two issues with managing sub-locks: + * + * - sub-locks are concurrently canceled, and + * + * - sub-locks are shared with other top-locks. + * + * To manage cancellation, top-lock acquires a hold on a sublock + * (lov_sublock_adopt()) when the latter is inserted into + * lov_lock::lls_sub[]. This hold is released (lov_sublock_release()) + * when top-lock is going into CLS_CACHED state or destroyed. Hold + * prevents sub-lock from cancellation. + * + * Sub-lock sharing means, among other things, that top-lock that is + * in the process of creation (i.e., not yet inserted into lock list) + * is already accessible to other threads once at least one of its + * sub-locks is created, see lov_lock_sub_init(). + * + * Sub-lock can be in one of the following states: + * + * - doesn't exist, lov_lock::lls_sub[]::sub_lock == NULL. Such + * sub-lock was either never created (top-lock is in CLS_NEW + * state), or it was created, then canceled, then destroyed + * (lov_lock_unlink() cleared sub-lock pointer in the top-lock). + * + * - sub-lock exists and is on + * hold. (lov_lock::lls_sub[]::sub_flags & LSF_HELD). This is a + * normal state of a sub-lock in CLS_HELD and CLS_CACHED states + * of a top-lock. + * + * - sub-lock exists, but is not held by the top-lock. This + * happens after top-lock released a hold on sub-locks before + * going into cache (lov_lock_unuse()). + * + * \todo To support wide-striping, array has to be replaced with a set + * of queues to avoid scanning. + */ + struct lov_lock_sub *lls_sub; + /** + * Original description with which lock was enqueued. + */ + struct cl_lock_descr lls_orig; +}; + +struct lov_page { + struct cl_page_slice lps_cl; + int lps_invalid; +}; + +/* + * Bottom half. + */ + +struct lovsub_device { + struct cl_device acid_cl; + struct lov_device *acid_super; + int acid_idx; + struct cl_device *acid_next; +}; + +struct lovsub_object { + struct cl_object_header lso_header; + struct cl_object lso_cl; + struct lov_object *lso_super; + int lso_index; +}; + +/** + * A link between a top-lock and a sub-lock. Separate data-structure is + * necessary, because top-locks and sub-locks are in M:N relationship. + * + * \todo This can be optimized for a (by far) most frequent case of a single + * top-lock per sub-lock. + */ +struct lov_lock_link { + struct lov_lock *lll_super; + /** An index within parent lock. 
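+ *
+ * Together with lll_super this pins down the exact top-lock slot
+ * the link stands for, i.e. (a sketch):
+ *
+ *      struct lov_lock_sub *sub =
+ *              &lll->lll_super->lls_sub[lll->lll_idx];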
*/ + int lll_idx; + /** + * A linkage into per sub-lock list of all corresponding top-locks, + * hanging off lovsub_lock::lss_parents. + */ + struct list_head lll_list; +}; + +/** + * Lock state at lovsub layer. + */ +struct lovsub_lock { + struct cl_lock_slice lss_cl; + /** + * List of top-locks that have given sub-lock as their part. Protected + * by cl_lock::cll_guard mutex. + */ + struct list_head lss_parents; + /** + * Top-lock that initiated current operation on this sub-lock. This is + * only set during top-to-bottom lock operations like enqueue, and is + * used to optimize state change notification. Protected by + * cl_lock::cll_guard mutex. + * + * \see lovsub_lock_state_one(). + */ + struct cl_lock *lss_active; +}; + +struct lovsub_page { + struct cl_page_slice lsb_cl; +}; + + +struct lov_thread_info { + struct cl_object_conf lti_stripe_conf; + struct lu_fid lti_fid; + struct cl_lock_descr lti_ldescr; + struct ost_lvb lti_lvb; + struct cl_2queue lti_cl2q; + union lov_layout_state lti_state; + struct cl_lock_closure lti_closure; + cfs_waitlink_t lti_waiter; +}; + +/** + * State that lov_io maintains for every sub-io. + */ +struct lov_io_sub { + int sub_stripe; + /** + * sub-io for a stripe. Ideally sub-io's can be stopped and resumed + * independently, with lov acting as a scheduler to maximize overall + * throughput. + */ + struct cl_io *sub_io; + /** + * Linkage into a list (hanging off lov_io::lis_active) of all + * sub-io's active for the current IO iteration. + */ + struct list_head sub_linkage; + /** + * true, iff cl_io_init() was successfully executed against + * lov_io_sub::sub_io. + */ + int sub_io_initialized; + /** + * True, iff lov_io_sub::sub_io and lov_io_sub::sub_env weren't + * allocated, but borrowed from a per-device emergency pool. + */ + int sub_borrowed; + /** + * environment, in which sub-io executes. + */ + struct lu_env *sub_env; + /** + * environment's refcheck. + * + * \see cl_env_get() + */ + int sub_refcheck; + int sub_refcheck2; + int sub_reenter; + void *sub_cookie; +}; + +/** + * IO state private for LOV. + */ +struct lov_io { + /** super-class */ + struct cl_io_slice lis_cl; + /** + * Pointer to the object slice. This is a duplicate of + * lov_io::lis_cl::cis_object. + */ + struct lov_object *lis_object; + /** + * Original end-of-io position for this IO, set by the upper layer as + * cl_io::u::ci_rw::pos + cl_io::u::ci_rw::count. lov remembers this, + * changes pos and count to fit IO into a single stripe and uses saved + * value to determine when IO iterations have to stop. + * + * This is used only for CIT_READ and CIT_WRITE io's. + */ + loff_t lis_io_endpos; + + /** + * starting position within a file, for the current io loop iteration + * (stripe), used by ci_io_loop(). + */ + obd_off lis_pos; + /** + * end position with in a file, for the current stripe io. This is + * exclusive (i.e., next offset after last byte affected by io). + */ + obd_off lis_endpos; + + int lis_mem_frozen; + int lis_stripe_count; + int lis_active_subios; + + /** + * the index of ls_single_subio in ls_subios array + */ + int lis_single_subio_index; + struct cl_io lis_single_subio; + + /** + * size of ls_subios array, actually the highest stripe # + */ + int lis_nr_subios; + struct lov_io_sub *lis_subs; + /** + * List of active sub-io's. + */ + struct list_head lis_active; +}; + +struct lov_session { + struct lov_io ls_io; +}; + +/** + * State of transfer for lov. + */ +struct lov_req { + struct cl_req_slice lr_cl; +}; + +/** + * State of transfer for lovsub. 
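+ *
+ * Like lov_req above, this is an empty wrapper around the generic
+ * cl_req_slice; it exists only so that a request slice can be
+ * attached at this layer. The lov side is wired up in lov_dev.c
+ * (copied from lov_req_init() later in this patch):
+ *
+ *      OBD_SLAB_ALLOC_PTR(lr, lov_req_kmem);
+ *      cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops);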
+ */ +struct lovsub_req { + struct cl_req_slice lsrq_cl; +}; + +extern struct lu_device_type lov_device_type; +extern struct lu_device_type lovsub_device_type; + +extern struct lu_context_key lov_key; +extern struct lu_context_key lov_session_key; + +extern cfs_mem_cache_t *lov_page_kmem; +extern cfs_mem_cache_t *lov_lock_kmem; +extern cfs_mem_cache_t *lov_object_kmem; +extern cfs_mem_cache_t *lov_thread_kmem; +extern cfs_mem_cache_t *lov_session_kmem; +extern cfs_mem_cache_t *lov_req_kmem; + +extern cfs_mem_cache_t *lovsub_page_kmem; +extern cfs_mem_cache_t *lovsub_lock_kmem; +extern cfs_mem_cache_t *lovsub_object_kmem; +extern cfs_mem_cache_t *lovsub_req_kmem; + +extern cfs_mem_cache_t *lov_lock_link_kmem; + +int lov_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lovsub_object_init (const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf); +int lov_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lovsub_lock_init (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); + +int lov_lock_init_raid0 (const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io); +int lov_io_init_raid0 (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +int lov_io_init_empty (const struct lu_env *env, struct cl_object *obj, + struct cl_io *io); +void lov_lock_unlink (const struct lu_env *env, struct lov_lock_link *link, + struct lovsub_lock *sub); + +void lov_sub_put (struct lov_io_sub *sub); +int lov_sublock_modify (const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx); + + +struct cl_page *lov_page_init (const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, cfs_page_t *vmpage); +struct cl_page *lovsub_page_init(const struct lu_env *env, struct cl_object *ob, + struct cl_page *page, cfs_page_t *vmpage); + +struct cl_page *lov_page_init_empty(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage); +struct cl_page *lov_page_init_raid0(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage); +struct lu_object *lov_object_alloc (const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev); + +struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub); +struct lov_io_sub *lov_page_subio (const struct lu_env *env, + struct lov_io *lio, + const struct cl_page_slice *slice); + + +#define lov_foreach_target(lov, var) \ + for (var = 0; var < lov_targets_nr(lov); ++var) + +/***************************************************************************** + * + * Type conversions. + * + * Accessors. 
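+ *
+ * All converters below share one shape: an invariant check that the
+ * generic entity really belongs to this layer, then container_of0()
+ * back to the layer-private structure, e.g. cl2lov():
+ *
+ *      LINVRNT(lov_is_object(&obj->co_lu));
+ *      return container_of0(obj, struct lov_object, lo_cl);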
+ * + */ + +static inline struct lov_session *lov_env_session(const struct lu_env *env) +{ + struct lov_session *ses; + + ses = lu_context_key_get(env->le_ses, &lov_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct lov_io *lov_env_io(const struct lu_env *env) +{ + return &lov_env_session(env)->ls_io; +} + +static inline int lov_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lov_device_type; +} + +static inline int lovsub_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &lovsub_device_type; +} + +static inline struct lu_device *lov2lu_dev(struct lov_device *lov) +{ + return &lov->ld_cl.cd_lu_dev; +} + +static inline struct lov_device *lu2lov_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lov_device_type); + return container_of0(d, struct lov_device, ld_cl.cd_lu_dev); +} + +static inline struct cl_device *lovsub2cl_dev(struct lovsub_device *lovsub) +{ + return &lovsub->acid_cl; +} + +static inline struct lu_device *lovsub2lu_dev(struct lovsub_device *lovsub) +{ + return &lovsub2cl_dev(lovsub)->cd_lu_dev; +} + +static inline struct lovsub_device *lu2lovsub_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl.cd_lu_dev); +} + +static inline struct lovsub_device *cl2lovsub_dev(const struct cl_device *d) +{ + LINVRNT(d->cd_lu_dev.ld_type == &lovsub_device_type); + return container_of0(d, struct lovsub_device, acid_cl); +} + +static inline struct lu_object *lov2lu(struct lov_object *lov) +{ + return &lov->lo_cl.co_lu; +} + +static inline struct cl_object *lov2cl(struct lov_object *lov) +{ + return &lov->lo_cl; +} + +static inline struct lov_object *lu2lov(const struct lu_object *obj) +{ + LINVRNT(lov_is_object(obj)); + return container_of0(obj, struct lov_object, lo_cl.co_lu); +} + +static inline struct lov_object *cl2lov(const struct cl_object *obj) +{ + LINVRNT(lov_is_object(&obj->co_lu)); + return container_of0(obj, struct lov_object, lo_cl); +} + +static inline struct lu_object *lovsub2lu(struct lovsub_object *los) +{ + return &los->lso_cl.co_lu; +} + +static inline struct cl_object *lovsub2cl(struct lovsub_object *los) +{ + return &los->lso_cl; +} + +static inline struct lovsub_object *cl2lovsub(const struct cl_object *obj) +{ + LINVRNT(lovsub_is_object(&obj->co_lu)); + return container_of0(obj, struct lovsub_object, lso_cl); +} + +static inline struct lovsub_object *lu2lovsub(const struct lu_object *obj) +{ + LINVRNT(lovsub_is_object(obj)); + return container_of0(obj, struct lovsub_object, lso_cl.co_lu); +} + +static inline struct lovsub_lock * +cl2lovsub_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lovsub_lock, lss_cl); +} + +static inline struct lovsub_lock *cl2sub_lock(const struct cl_lock *lock) +{ + const struct cl_lock_slice *slice; + + slice = cl_lock_at(lock, &lovsub_device_type); + LASSERT(slice != NULL); + return cl2lovsub_lock(slice); +} + +static inline struct lov_lock *cl2lov_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cls_obj->co_lu)); + return container_of(slice, struct lov_lock, lls_cl); +} + +static inline struct lov_page *cl2lov_page(const struct cl_page_slice *slice) +{ + LINVRNT(lov_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lov_page, lps_cl); +} + +static inline struct lov_req *cl2lov_req(const struct cl_req_slice *slice) +{ + return 
container_of0(slice, struct lov_req, lr_cl); +} + +static inline struct lovsub_page * +cl2lovsub_page(const struct cl_page_slice *slice) +{ + LINVRNT(lovsub_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct lovsub_page, lsb_cl); +} + +static inline struct lovsub_req *cl2lovsub_req(const struct cl_req_slice *slice) +{ + return container_of0(slice, struct lovsub_req, lsrq_cl); +} + +static inline struct cl_page *lov_sub_page(const struct cl_page_slice *slice) +{ + return slice->cpl_page->cp_child; +} + +static inline struct lov_io *cl2lov_io(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio; + + lio = container_of(ios, struct lov_io, lis_cl); + LASSERT(lio == lov_env_io(env)); + return lio; +} + +static inline int lov_targets_nr(const struct lov_device *lov) +{ + return lov->ld_lov->desc.ld_tgt_count; +} + +static inline struct lov_thread_info *lov_env_info(const struct lu_env *env) +{ + struct lov_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &lov_key); + LASSERT(info != NULL); + return info; +} + +static inline struct lov_layout_raid0 *lov_r0(struct lov_object *lov) +{ + struct lov_layout_raid0 *raid0; + + LASSERT(lov->lo_type == LLT_RAID0); + raid0 = &lov->u.raid0; + LASSERT(raid0->lo_lsm->lsm_wire.lw_magic == LOV_MAGIC); + return raid0; +} + +/** @} lov */ + +#endif + diff --git a/lustre/lov/lov_dev.c b/lustre/lov/lov_dev.c new file mode 100644 index 0000000..32dfe3a --- /dev/null +++ b/lustre/lov/lov_dev.c @@ -0,0 +1,540 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +/* class_name2obd() */ +#include + +#include "lov_cl_internal.h" + +cfs_mem_cache_t *lov_page_kmem; +cfs_mem_cache_t *lov_lock_kmem; +cfs_mem_cache_t *lov_object_kmem; +cfs_mem_cache_t *lov_thread_kmem; +cfs_mem_cache_t *lov_session_kmem; +cfs_mem_cache_t *lov_req_kmem; + +cfs_mem_cache_t *lovsub_page_kmem; +cfs_mem_cache_t *lovsub_lock_kmem; +cfs_mem_cache_t *lovsub_object_kmem; +cfs_mem_cache_t *lovsub_req_kmem; + +cfs_mem_cache_t *lov_lock_link_kmem; + +/** Lock class of lov_device::ld_mutex. 
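+ *
+ * A dedicated key gives ld_mutex its own lockdep class, as it is
+ * taken in the low-memory IO path. The init site is outside this
+ * hunk, but presumably along the lines of:
+ *
+ *      mutex_init(&ld->ld_mutex);
+ *      lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class);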
*/ +struct lock_class_key cl_lov_device_mutex_class; + +struct lu_kmem_descr lov_caches[] = { + { + .ckd_cache = &lov_page_kmem, + .ckd_name = "lov_page_kmem", + .ckd_size = sizeof (struct lov_page) + }, + { + .ckd_cache = &lov_lock_kmem, + .ckd_name = "lov_lock_kmem", + .ckd_size = sizeof (struct lov_lock) + }, + { + .ckd_cache = &lov_object_kmem, + .ckd_name = "lov_object_kmem", + .ckd_size = sizeof (struct lov_object) + }, + { + .ckd_cache = &lov_thread_kmem, + .ckd_name = "lov_thread_kmem", + .ckd_size = sizeof (struct lov_thread_info) + }, + { + .ckd_cache = &lov_session_kmem, + .ckd_name = "lov_session_kmem", + .ckd_size = sizeof (struct lov_session) + }, + { + .ckd_cache = &lov_req_kmem, + .ckd_name = "lov_req_kmem", + .ckd_size = sizeof (struct lov_req) + }, + { + .ckd_cache = &lovsub_page_kmem, + .ckd_name = "lovsub_page_kmem", + .ckd_size = sizeof (struct lovsub_page) + }, + { + .ckd_cache = &lovsub_lock_kmem, + .ckd_name = "lovsub_lock_kmem", + .ckd_size = sizeof (struct lovsub_lock) + }, + { + .ckd_cache = &lovsub_object_kmem, + .ckd_name = "lovsub_object_kmem", + .ckd_size = sizeof (struct lovsub_object) + }, + { + .ckd_cache = &lovsub_req_kmem, + .ckd_name = "lovsub_req_kmem", + .ckd_size = sizeof (struct lovsub_req) + }, + { + .ckd_cache = &lov_lock_link_kmem, + .ckd_name = "lov_lock_link_kmem", + .ckd_size = sizeof (struct lov_lock_link) + }, + { + .ckd_cache = NULL + } +}; + +/***************************************************************************** + * + * Lov transfer operations. + * + */ + +static void lov_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lov_req *lr; + + ENTRY; + lr = cl2lov_req(slice); + OBD_SLAB_FREE_PTR(lr, lov_req_kmem); + EXIT; +} + +static const struct cl_req_operations lov_req_ops = { + .cro_completion = lov_req_completion +}; + +/***************************************************************************** + * + * Lov device and device type functions. 
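+ * (per-thread and per-session context keys, device init/fini, and
+ * dynamic addition/removal of sub-device targets)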
+ * + */ + +static void *lov_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_thread_info *info; + + OBD_SLAB_ALLOC_PTR(info, lov_thread_kmem); + if (info != NULL) + CFS_INIT_LIST_HEAD(&info->lti_closure.clc_list); + else + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_thread_info *info = data; + LINVRNT(list_empty(&info->lti_closure.clc_list)); + OBD_SLAB_FREE_PTR(info, lov_thread_kmem); +} + +struct lu_context_key lov_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = lov_key_init, + .lct_fini = lov_key_fini +}; + +static void *lov_session_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct lov_session *info; + + OBD_SLAB_ALLOC_PTR(info, lov_session_kmem); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void lov_session_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct lov_session *info = data; + OBD_SLAB_FREE_PTR(info, lov_session_kmem); +} + +struct lu_context_key lov_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = lov_session_key_init, + .lct_fini = lov_session_key_fini +}; + +/* type constructor/destructor: lov_type_{init,fini,start,stop}() */ +LU_TYPE_INIT_FINI(lov, &lov_key, &lov_session_key); + +static struct lu_device *lov_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + int i; + struct lov_device *ld = lu2lov_dev(d); + + LASSERT(ld->ld_lov != NULL); + if (ld->ld_target == NULL) + RETURN(NULL); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + + lsd = ld->ld_target[i]; + if (lsd != NULL) { + cl_stack_fini(env, lovsub2cl_dev(lsd)); + ld->ld_target[i] = NULL; + } + } + RETURN(NULL); +} + +static int lov_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lov_device *ld = lu2lov_dev(d); + int i; + int rc = 0; + + LASSERT(d->ld_site != NULL); + if (ld->ld_target == NULL) + RETURN(rc); + + lov_foreach_target(ld, i) { + struct lovsub_device *lsd; + struct cl_device *cl; + struct lov_tgt_desc *desc; + + desc = ld->ld_lov->lov_tgts[i]; + if (desc->ltd_active) { + cl = cl_type_setup(env, d->ld_site, &lovsub_device_type, + desc->ltd_exp->exp_obd->obd_lu_dev); + if (IS_ERR(cl)) { + rc = PTR_ERR(cl); + break; + } + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = i; + lsd->acid_super = ld; + ld->ld_target[i] = lsd; + } + } + + if (rc) + lov_device_fini(env, d); + else + ld->ld_flags |= LOV_DEV_INITIALIZED; + + RETURN(rc); +} + +static int lov_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lov_req *lr; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR(lr, lov_req_kmem); + if (lr != NULL) { + cl_req_slice_add(req, &lr->lr_cl, dev, &lov_req_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +static const struct cl_device_operations lov_cl_ops = { + .cdo_req_init = lov_req_init +}; + +static void lov_emerg_free(struct lov_device_emerg **emrg, int nr) +{ + int i; + + for (i = 0; i < nr; ++i) { + struct lov_device_emerg *em; + + em = emrg[i]; + if (em != NULL) { + LASSERT(em->emrg_page_list.pl_nr == 0); + if (em->emrg_env != NULL) + cl_env_put(em->emrg_env, &em->emrg_refcheck); + OBD_FREE_PTR(em); + } + } + OBD_FREE(emrg, nr * sizeof emrg[0]); +} + +static struct lu_device *lov_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lov_device *ld = lu2lov_dev(d); + const int nr = 
ld->ld_target_nr; + + cl_device_fini(lu2cl_dev(d)); + if (ld->ld_target != NULL) + OBD_FREE(ld->ld_target, nr * sizeof ld->ld_target[0]); + if (ld->ld_emrg != NULL) + lov_emerg_free(ld->ld_emrg, nr); + OBD_FREE_PTR(ld); + return NULL; +} + +static void lov_cl_del_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct lov_device *ld = lu2lov_dev(dev); + ENTRY; + + if (ld->ld_target[index] != NULL) { + cl_stack_fini(env, lovsub2cl_dev(ld->ld_target[index])); + ld->ld_target[index] = NULL; + } + EXIT; +} + +static struct lov_device_emerg **lov_emerg_alloc(int nr) +{ + struct lov_device_emerg **emerg; + int i; + int result; + + OBD_ALLOC(emerg, nr * sizeof emerg[0]); + if (emerg == NULL) + return ERR_PTR(-ENOMEM); + for (result = i = 0; i < nr && result == 0; i++) { + struct lov_device_emerg *em; + void *cookie; + + OBD_ALLOC_PTR(em); + if (em != NULL) { + emerg[i] = em; + cl_page_list_init(&em->emrg_page_list); + cookie = cl_env_reenter(); + em->emrg_env = cl_env_alloc(&em->emrg_refcheck, + LCT_REMEMBER|LCT_NOREF); + cl_env_reexit(cookie); + if (!IS_ERR(em->emrg_env)) + em->emrg_env->le_ctx.lc_cookie = 0x2; + else { + result = PTR_ERR(em->emrg_env); + em->emrg_env = NULL; + } + } else + result = -ENOMEM; + } + if (result != 0) { + lov_emerg_free(emerg, nr); + emerg = ERR_PTR(result); + } + return emerg; +} + +static int lov_expand_targets(const struct lu_env *env, struct lov_device *dev) +{ + int result; + __u32 tgt_size; + __u32 sub_size; + + ENTRY; + result = 0; + tgt_size = dev->ld_lov->lov_tgt_size; + sub_size = dev->ld_target_nr; + if (sub_size < tgt_size) { + struct lovsub_device **newd; + struct lov_device_emerg **emerg; + const size_t sz = sizeof newd[0]; + + emerg = lov_emerg_alloc(tgt_size); + if (IS_ERR(emerg)) + RETURN(PTR_ERR(emerg)); + + OBD_ALLOC(newd, tgt_size * sz); + if (newd != NULL) { + mutex_lock(&dev->ld_mutex); + if (sub_size > 0) { + memcpy(newd, dev->ld_target, sub_size * sz); + OBD_FREE(dev->ld_target, sub_size * sz); + } + dev->ld_target = newd; + dev->ld_target_nr = tgt_size; + + if (dev->ld_emrg != NULL) + lov_emerg_free(dev->ld_emrg, sub_size); + dev->ld_emrg = emerg; + mutex_unlock(&dev->ld_mutex); + } else { + lov_emerg_free(emerg, tgt_size); + result = -ENOMEM; + } + } + RETURN(result); +} + +static int lov_cl_add_target(const struct lu_env *env, struct lu_device *dev, + __u32 index) +{ + struct obd_device *obd = dev->ld_obd; + struct lov_device *ld = lu2lov_dev(dev); + struct lov_tgt_desc *tgt; + struct lovsub_device *lsd; + struct cl_device *cl; + int rc; + ENTRY; + + lov_getref(obd); + + tgt = obd->u.lov.lov_tgts[index]; + LASSERT(tgt != NULL); + + rc = lov_expand_targets(env, ld); + if (rc == 0 && ld->ld_flags & LOV_DEV_INITIALIZED) { + LASSERT(dev->ld_site != NULL); + cl = cl_type_setup(env, dev->ld_site, &lovsub_device_type, + tgt->ltd_exp->exp_obd->obd_lu_dev); + if (!IS_ERR(cl)) { + lsd = cl2lovsub_dev(cl); + lsd->acid_idx = index; + lsd->acid_super = ld; + ld->ld_target[index] = lsd; + } else { + CERROR("add failed (%d), deleting %s\n", rc, + obd_uuid2str(&tgt->ltd_uuid)); + lov_cl_del_target(env, dev, index); + rc = PTR_ERR(cl); + } + } + lov_putref(obd); + RETURN(rc); +} + +static int lov_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + struct obd_device *obd = d->ld_obd; + int cmd; + int rc; + int gen; + __u32 index; + + lov_getref(obd); + + cmd = cfg->lcfg_command; + rc = lov_process_config_base(d->ld_obd, cfg, &index, &gen); + if (rc == 0) { + switch(cmd) { + case 
LCFG_LOV_ADD_OBD: + case LCFG_LOV_ADD_INA: + rc = lov_cl_add_target(env, d, index); + if (rc != 0) + lov_del_target(d->ld_obd, index, 0, 0); + break; + case LCFG_LOV_DEL_OBD: + lov_cl_del_target(env, d, index); + break; + } + } + lov_putref(obd); + RETURN(rc); +} + +static const struct lu_device_operations lov_lu_ops = { + .ldo_object_alloc = lov_object_alloc, + .ldo_process_config = lov_process_config, +}; + +static struct lu_device *lov_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lov_device *ld; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(ld); + if (ld == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&ld->ld_cl, t); + d = lov2lu_dev(ld); + d->ld_ops = &lov_lu_ops; + ld->ld_cl.cd_ops = &lov_cl_ops; + + mutex_init(&ld->ld_mutex); + lockdep_set_class(&ld->ld_mutex, &cl_lov_device_mutex_class); + + /* setup the LOV OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = lov_setup(obd, cfg); + if (rc) { + lov_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + + ld->ld_lov = &obd->u.lov; + RETURN(d); +} + +static const struct lu_device_type_operations lov_device_type_ops = { + .ldto_init = lov_type_init, + .ldto_fini = lov_type_fini, + + .ldto_start = lov_type_start, + .ldto_stop = lov_type_stop, + + .ldto_device_alloc = lov_device_alloc, + .ldto_device_free = lov_device_free, + + .ldto_device_init = lov_device_init, + .ldto_device_fini = lov_device_fini +}; + +struct lu_device_type lov_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOV_NAME, + .ldt_ops = &lov_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; +EXPORT_SYMBOL(lov_device_type); + +/** @} lov */ diff --git a/lustre/lov/lov_ea.c b/lustre/lov/lov_ea.c index 457d9dc..564246a 100755 --- a/lustre/lov/lov_ea.c +++ b/lustre/lov/lov_ea.c @@ -500,7 +500,7 @@ static int lsm_revalidate_join(struct lov_stripe_md *lsm, OBD_ALLOC(lsm->lsm_array->lai_ext_array,lsm->lsm_array->lai_ext_count * sizeof (struct lov_extent)); if (!lsm->lsm_array->lai_ext_array) - GOTO(release_ctxt, rc = -ENOMEM); + GOTO(release_ctxt, rc = -ENOMEM); CDEBUG(D_INFO, "get lsm logid: "LPU64":"LPU64"\n", lsm->lsm_array->lai_array_id.lgl_oid, @@ -526,7 +526,7 @@ release_ctxt: RETURN(rc); } -int lsm_destroy_join(struct lov_stripe_md *lsm, struct obdo *oa, +int lsm_destroy_join(struct lov_stripe_md *lsm, struct obdo *oa, struct obd_export *md_exp) { struct llog_ctxt *ctxt; diff --git a/lustre/lov/lov_internal.h b/lustre/lov/lov_internal.h index c9468ae..b7e49c9 100644 --- a/lustre/lov/lov_internal.h +++ b/lustre/lov/lov_internal.h @@ -82,26 +82,6 @@ struct lov_request_set { struct list_head set_list; }; -#define LOV_AP_MAGIC 8200 - -struct lov_async_page { - int lap_magic; - int lap_stripe; - obd_off lap_sub_offset; - obd_id lap_loi_id; - obd_gr lap_loi_gr; - void *lap_sub_cookie; - struct obd_async_page_ops *lap_caller_ops; - void *lap_caller_data; -}; - -static inline struct lov_async_page *lap_from_cookie(void *ptr) -{ - struct lov_async_page *ap = ptr; - LASSERT(ap->lap_magic == LOV_AP_MAGIC); - return ap; -} - extern cfs_mem_cache_t *lov_oinfo_slab; static inline void lov_llh_addref(void *llhp) @@ -142,7 +122,7 @@ static inline void lov_llh_put(struct lov_lock_handles *llh) atomic_read(&llh->llh_refcount) < 0x5a5a); if (atomic_dec_and_test(&llh->llh_refcount)) { class_handle_unhash(&llh->llh_handle); - /* The structure may be held by other threads because RCU. + /* The structure may be held by other threads because RCU. 
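+ * The re-check below guards against exactly that: the handle was just
+ * unhashed, but a lookup that began earlier may have re-taken a
+ * reference, in which case freeing it now would be premature.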
* -jxiong */ if (atomic_read(&llh->llh_refcount)) return; @@ -163,6 +143,8 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only); int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, obd_off size, int shrink); +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place); /* lov_offset.c */ obd_size lov_stripe_size(struct lov_stripe_md *lsm, obd_size ost_size, @@ -264,10 +246,16 @@ void lov_fix_desc_qos_maxage(__u32 *val); int lov_get_stripecnt(struct lov_obd *lov, __u32 stripe_count); void lov_getref(struct obd_device *obd); void lov_putref(struct obd_device *obd); - +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data); +int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg); +int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg, + __u32 *indexp, int *genp); +int lov_del_target(struct obd_device *obd, __u32 index, + struct obd_uuid *uuidp, int gen); /* lov_log.c */ int lov_llog_init(struct obd_device *obd, struct obd_llog_group *olg, - struct obd_device *tgt, int count, struct llog_catid *logid, + struct obd_device *tgt, int count, struct llog_catid *logid, struct obd_uuid *uuid); int lov_llog_finish(struct obd_device *obd, int count); @@ -312,6 +300,9 @@ static inline void lprocfs_lov_init_vars(struct lprocfs_static_vars *lvars) } #endif +/* lov_cl.c */ +extern struct lu_device_type lov_device_type; + /* pools */ extern lustre_hash_ops_t pool_hash_operations; /* ost_pool methods */ @@ -330,5 +321,4 @@ void lov_dump_pool(int level, struct pool_desc *pool); struct pool_desc *lov_find_pool(struct lov_obd *lov, char *poolname); int lov_check_index_in_pool(__u32 idx, struct pool_desc *pool); - #endif diff --git a/lustre/lov/lov_io.c b/lustre/lov/lov_io.c new file mode 100644 index 0000000..346a992 --- /dev/null +++ b/lustre/lov/lov_io.c @@ -0,0 +1,894 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOV layer. 
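+ *
+ * A single top-level cl_io issued against a striped object is fanned out
+ * into one sub-io per intersecting stripe; most methods in this file
+ * simply iterate a callback over the active sub-ios (see lov_io_call()).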
+ * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +static void lov_sub_enter(struct lov_io_sub *sub) +{ + ENTRY; + if (sub->sub_reenter++ == 0) { + sub->sub_cookie = cl_env_reenter(); + cl_env_implant(sub->sub_env, &sub->sub_refcheck2); + } + EXIT; +} + +static void lov_sub_exit(struct lov_io_sub *sub) +{ + ENTRY; + if (--sub->sub_reenter == 0) { + cl_env_unplant(sub->sub_env, &sub->sub_refcheck2); + cl_env_reexit(sub->sub_cookie); + } + EXIT; +} + +static void lov_io_sub_fini(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + ENTRY; + if (sub->sub_io != NULL) { + if (sub->sub_io_initialized) { + lov_sub_enter(sub); + cl_io_fini(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + sub->sub_io_initialized = 0; + lio->lis_active_subios--; + } + if (sub->sub_stripe == lio->lis_single_subio_index) + lio->lis_single_subio_index = -1; + else if (!sub->sub_borrowed) + OBD_FREE_PTR(sub->sub_io); + sub->sub_io = NULL; + } + if (sub->sub_env != NULL && !IS_ERR(sub->sub_env)) { + if (!sub->sub_borrowed) + cl_env_put(sub->sub_env, &sub->sub_refcheck); + sub->sub_env = NULL; + } + EXIT; +} + +static void lov_io_sub_inherit(struct cl_io *io, struct lov_io *lio, + int stripe, loff_t start, loff_t end) +{ + struct lov_stripe_md *lsm = lov_r0(lio->lis_object)->lo_lsm; + struct cl_io *parent = lio->lis_cl.cis_io; + + switch(io->ci_type) { + case CIT_TRUNC: { + size_t new_size = parent->u.ci_truncate.tr_size; + + new_size = lov_size_to_stripe(lsm, new_size, stripe); + io->u.ci_truncate.tr_capa = parent->u.ci_truncate.tr_capa; + io->u.ci_truncate.tr_size = new_size; + break; + } + case CIT_FAULT: { + struct cl_object *obj = parent->ci_obj; + loff_t off = cl_offset(obj, parent->u.ci_fault.ft_index); + + io->u.ci_fault = parent->u.ci_fault; + off = lov_size_to_stripe(lsm, off, stripe); + io->u.ci_fault.ft_index = cl_index(obj, off); + break; + } + case CIT_READ: + case CIT_WRITE: { + io->u.ci_rw.crw_pos = start; + io->u.ci_rw.crw_count = end - start; + break; + } + default: + break; + } +} + +static int lov_io_sub_init(const struct lu_env *env, struct lov_io *lio, + struct lov_io_sub *sub) +{ + struct lov_object *lov = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(lov)->co_lu.lo_dev); + struct cl_io *sub_io; + struct cl_object *sub_obj; + struct cl_io *io = lio->lis_cl.cis_io; + + int stripe = sub->sub_stripe; + int result; + + LASSERT(sub->sub_io == NULL); + LASSERT(sub->sub_env == NULL); + LASSERT(sub->sub_stripe < lio->lis_stripe_count); + ENTRY; + + result = 0; + sub->sub_io_initialized = 0; + sub->sub_borrowed = 0; + + /* + * First sub-io. Use ->lis_single_subio and current environment, to + * avoid dynamic allocation. 
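+ * Two fall-back branches follow: under memory pressure (lis_mem_frozen)
+ * the pre-allocated per-stripe emergency io and environment are borrowed
+ * under ld_mutex; otherwise a fresh environment and cl_io are allocated.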
+ */ + if (lio->lis_active_subios == 0) { + sub->sub_io = &lio->lis_single_subio; + lio->lis_single_subio_index = stripe; + sub->sub_env = cl_env_get(&sub->sub_refcheck); + LASSERT(sub->sub_env == env); + } else if (lio->lis_mem_frozen) { + LASSERT(mutex_is_locked(&ld->ld_mutex)); + sub->sub_io = &ld->ld_emrg[stripe]->emrg_subio; + sub->sub_env = ld->ld_emrg[stripe]->emrg_env; + sub->sub_borrowed = 1; + } else { + void *cookie; + + /* obtain new environment */ + cookie = cl_env_reenter(); + sub->sub_env = cl_env_get(&sub->sub_refcheck); + cl_env_reexit(cookie); + + OBD_ALLOC_PTR(sub->sub_io); + if (IS_ERR(sub->sub_env)) + result = PTR_ERR(sub->sub_env); + else if (sub->sub_io == NULL) + result = -ENOMEM; + } + + if (result == 0) { + sub_obj = lovsub2cl(lov_r0(lov)->lo_sub[stripe]); + sub_io = sub->sub_io; + + sub_io->ci_obj = sub_obj; + sub_io->ci_result = 0; + + sub_io->ci_parent = io; + sub_io->ci_lockreq = io->ci_lockreq; + sub_io->ci_type = io->ci_type; + + lov_sub_enter(sub); + result = cl_io_sub_init(sub->sub_env, sub_io, + io->ci_type, sub_obj); + lov_sub_exit(sub); + if (result >= 0) { + lio->lis_active_subios++; + sub->sub_io_initialized = 1; + result = 0; + } + } + if (result != 0) + lov_io_sub_fini(env, lio, sub); + RETURN(result); +} + +static struct lov_io_sub *lov_sub_get(const struct lu_env *env, + struct lov_io *lio, int stripe) +{ + int rc; + struct lov_io_sub *sub = &lio->lis_subs[stripe]; + + LASSERT(stripe < lio->lis_stripe_count); + ENTRY; + + if (!sub->sub_io_initialized) { + sub->sub_stripe = stripe; + rc = lov_io_sub_init(env, lio, sub); + } else + rc = 0; + if (rc == 0) + lov_sub_enter(sub); + else + sub = ERR_PTR(rc); + RETURN(sub); +} + +void lov_sub_put(struct lov_io_sub *sub) +{ + lov_sub_exit(sub); +} + +/***************************************************************************** + * + * Lov io operations. + * + */ + +static int lov_page_stripe(const struct cl_page *page) +{ + struct lovsub_object *subobj; + + ENTRY; + subobj = lu2lovsub( + lu_object_locate(page->cp_child->cp_obj->co_lu.lo_header, + &lovsub_device_type)); + LASSERT(subobj != NULL); + RETURN(subobj->lso_index); +} + +struct lov_io_sub *lov_page_subio(const struct lu_env *env, struct lov_io *lio, + const struct cl_page_slice *slice) +{ + struct lov_stripe_md *lsm = lov_r0(lio->lis_object)->lo_lsm; + struct cl_page *page = slice->cpl_page; + int stripe; + + LASSERT(lio->lis_cl.cis_io != NULL); + LASSERT(cl2lov(slice->cpl_obj) == lio->lis_object); + LASSERT(lsm != NULL); + LASSERT(lio->lis_nr_subios > 0); + ENTRY; + + stripe = lov_page_stripe(page); + RETURN(lov_sub_get(env, lio, stripe)); +} + + +static int lov_io_subio_init(const struct lu_env *env, struct lov_io *lio, + struct cl_io *io) +{ + struct lov_object *lov = lio->lis_object; + struct lov_stripe_md *lsm = lov_r0(lov)->lo_lsm; + int result; + + LASSERT(lio->lis_object != NULL); + ENTRY; + + /* + * Need to be optimized, we can't afford to allocate a piece of memory + * when writing a page. 
-jay + */ + OBD_ALLOC(lio->lis_subs, + lsm->lsm_stripe_count * sizeof lio->lis_subs[0]); + if (lio->lis_subs != NULL) { + lio->lis_nr_subios = lio->lis_stripe_count; + lio->lis_single_subio_index = -1; + lio->lis_active_subios = 0; + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +static void lov_io_slice_init(struct lov_io *lio, + struct lov_object *obj, struct cl_io *io) +{ + struct lov_stripe_md *lsm = lov_r0(obj)->lo_lsm; + + LASSERT(lsm != NULL); + ENTRY; + + io->ci_result = 0; + lio->lis_object = obj; + lio->lis_stripe_count = lsm->lsm_stripe_count; + + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + lio->lis_io_endpos = lio->lis_endpos; + if (cl_io_is_append(io)) { + LASSERT(io->ci_type == CIT_WRITE); + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + } + break; + + case CIT_TRUNC: + lio->lis_pos = io->u.ci_truncate.tr_size; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + case CIT_FAULT: { + pgoff_t index = io->u.ci_fault.ft_index; + lio->lis_pos = cl_offset(io->ci_obj, index); + lio->lis_endpos = cl_offset(io->ci_obj, index + 1); + break; + } + + case CIT_MISC: + lio->lis_pos = 0; + lio->lis_endpos = OBD_OBJECT_EOF; + break; + + default: + LBUG(); + } + + EXIT; +} + +static void lov_io_fini(const struct lu_env *env, const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int i; + + ENTRY; + if (lio->lis_subs != NULL) { + for (i = 0; i < lio->lis_nr_subios; i++) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + OBD_FREE(lio->lis_subs, + lio->lis_nr_subios * sizeof lio->lis_subs[0]); + lio->lis_nr_subios = 0; + } + EXIT; +} + +static obd_off lov_offset_mod(obd_off val, int delta) +{ + if (val != OBD_OBJECT_EOF) + val += delta; + return val; +} + +static int lov_io_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_stripe_md *lsm = lov_r0(lio->lis_object)->lo_lsm; + struct lov_io_sub *sub; + obd_off endpos; + obd_off start; + obd_off end; + int stripe; + int rc = 0; + + ENTRY; + endpos = lov_offset_mod(lio->lis_endpos, -1); + for (stripe = 0; stripe < lio->lis_stripe_count; stripe++) { + if (!lov_stripe_intersects(lsm, stripe, lio->lis_pos, + endpos, &start, &end)) + continue; + + end = lov_offset_mod(end, +1); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + lov_io_sub_inherit(sub->sub_io, lio, stripe, + start, end); + rc = cl_io_iter_init(sub->sub_env, sub->sub_io); + lov_sub_put(sub); + CDEBUG(D_VFSTRACE, "shrink: %i [%llu, %llu)\n", + stripe, start, end); + } else + rc = PTR_ERR(sub); + if (!rc) + list_add_tail(&sub->sub_linkage, &lio->lis_active); + else + break; + } + RETURN(rc); +} + +static int lov_io_rw_iter_init(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_io *io = ios->cis_io; + struct lov_stripe_md *lsm = lov_r0(cl2lov(ios->cis_obj))->lo_lsm; + loff_t start = io->u.ci_rw.crw_pos; + loff_t next; + int ssize = lsm->lsm_stripe_size; + + LASSERT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE); + ENTRY; + + /* fast path for common case. 
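+ * For a multi-stripe object the [crw_pos, lis_io_endpos) range is
+ * clipped below to the chunk falling on the current stripe and
+ * ci_continue is set when further iterations remain; single-stripe and
+ * append ios skip the clipping and go straight to lov_io_iter_init().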
*/ + if (lio->lis_nr_subios != 1 && !cl_io_is_append(io)) { + + do_div(start, ssize); + next = (start + 1) * ssize; + if (next <= start * ssize) + next = ~0ull; + + io->ci_continue = next < lio->lis_io_endpos; + io->u.ci_rw.crw_count = min_t(loff_t, lio->lis_io_endpos, + next) - io->u.ci_rw.crw_pos; + lio->lis_pos = io->u.ci_rw.crw_pos; + lio->lis_endpos = io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + CDEBUG(D_VFSTRACE, "stripe: %llu chunk: [%llu, %llu) %llu\n", + (__u64)start, lio->lis_pos, lio->lis_endpos, + (__u64)lio->lis_io_endpos); + } + /* + * XXX The following call should be optimized: we know, that + * [lio->lis_pos, lio->lis_endpos) intersects with exactly one stripe. + */ + RETURN(lov_io_iter_init(env, ios)); +} + +static int lov_io_call(const struct lu_env *env, struct lov_io *lio, + int (*iofunc)(const struct lu_env *, struct cl_io *)) +{ + struct lov_io_sub *sub; + int rc = 0; + + ENTRY; + list_for_each_entry(sub, &lio->lis_active, sub_linkage) { + lov_sub_enter(sub); + rc = iofunc(sub->sub_env, sub->sub_io); + lov_sub_exit(sub); + if (rc) + break; + } + RETURN(rc); +} + +static int lov_io_lock(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_lock)); +} + +static int lov_io_start(const struct lu_env *env, const struct cl_io_slice *ios) +{ + ENTRY; + RETURN(lov_io_call(env, cl2lov_io(env, ios), cl_io_start)); +} + +static int lov_io_end_wrapper(const struct lu_env *env, struct cl_io *io) +{ + ENTRY; + /* + * It's possible that lov_io_start() wasn't called against this + * sub-io, either because previous sub-io failed, or upper layer + * completed IO. + */ + if (io->ci_state == CIS_IO_GOING) + cl_io_end(env, io); + else + io->ci_state = CIS_IO_FINISHED; + RETURN(0); +} + +static int lov_io_iter_fini_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_iter_fini(env, io); + RETURN(0); +} + +static int lov_io_unlock_wrapper(const struct lu_env *env, struct cl_io *io) +{ + cl_io_unlock(env, io); + RETURN(0); +} + +static void lov_io_end(const struct lu_env *env, const struct cl_io_slice *ios) +{ + int rc; + + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_end_wrapper); + LASSERT(rc == 0); +} + +static void lov_io_iter_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct lov_io *lio = cl2lov_io(env, ios); + int rc; + + ENTRY; + rc = lov_io_call(env, lio, lov_io_iter_fini_wrapper); + LASSERT(rc == 0); + while (!list_empty(&lio->lis_active)) + list_del_init(lio->lis_active.next); + EXIT; +} + +static void lov_io_unlock(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + int rc; + + ENTRY; + rc = lov_io_call(env, cl2lov_io(env, ios), lov_io_unlock_wrapper); + LASSERT(rc == 0); + EXIT; +} + + +static struct cl_page_list *lov_io_submit_qin(struct lov_device *ld, + struct cl_page_list *qin, + int idx, int alloc) +{ + return alloc ? &qin[idx] : &ld->ld_emrg[idx]->emrg_page_list; +} + +/** + * lov implementation of cl_operations::cio_submit() method. It takes a list + * of pages in \a queue, splits it into per-stripe sub-lists, invokes + * cl_io_submit() on underlying devices to submit sub-lists, and then splices + * everything back. + * + * Major complication of this function is a need to handle memory cleansing: + * cl_io_submit() is called to write out pages as a part of VM memory + * reclamation, and hence it may not fail due to memory shortages (system + * dead-locks otherwise). To deal with this, some resources (sub-lists, + * sub-environment, etc.) 
are allocated per-device on "startup" (i.e., in a + * not-memory cleansing context), and in case of memory shortage, these + * pre-allocated resources are used by lov_io_submit() under + * lov_device::ld_mutex mutex. + */ +static int lov_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct lov_object *obj = lio->lis_object; + struct lov_device *ld = lu2lov_dev(lov2cl(obj)->co_lu.lo_dev); + struct cl_page_list *qin = &queue->c2_qin; + struct cl_2queue *cl2q = &lov_env_info(env)->lti_cl2q; + struct cl_page_list *stripes_qin = NULL; + struct cl_page *page; + struct cl_page *tmp; + int stripe; + +#define QIN(stripe) lov_io_submit_qin(ld, stripes_qin, stripe, alloc) + + int rc = 0; + int alloc = +#if defined(__KERNEL__) && defined(__linux__) + !(current->flags & PF_MEMALLOC); +#else + 1; +#endif + ENTRY; + if (lio->lis_active_subios == 1) { + int idx = lio->lis_single_subio_index; + struct lov_io_sub *sub; + + LASSERT(idx < lio->lis_nr_subios); + sub = lov_sub_get(env, lio, idx); + LASSERT(!IS_ERR(sub)); + LASSERT(sub->sub_io == &lio->lis_single_subio); + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, crt, queue); + lov_sub_put(sub); + RETURN(rc); + } + + LASSERT(lio->lis_subs != NULL); + if (alloc) { + OBD_ALLOC(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + if (stripes_qin == NULL) + RETURN(-ENOMEM); + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) + cl_page_list_init(&stripes_qin[stripe]); + } else { + /* + * If we get here, it means pageout & swap doesn't help. + * In order to not make things worse, even don't try to + * allocate the memory with __GFP_NOWARN. -jay + */ + mutex_lock(&ld->ld_mutex); + lio->lis_mem_frozen = 1; + } + + cl_2queue_init(cl2q); + cl_page_list_for_each_safe(page, tmp, qin) { + stripe = lov_page_stripe(page); + cl_page_list_move(QIN(stripe), qin, page); + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct lov_io_sub *sub; + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, &cl2q->c2_qin); + sub = lov_sub_get(env, lio, stripe); + if (!IS_ERR(sub)) { + rc = cl_io_submit_rw(sub->sub_env, sub->sub_io, + crt, cl2q); + lov_sub_put(sub); + } else + rc = PTR_ERR(sub); + cl_page_list_splice(&cl2q->c2_qin, &queue->c2_qin); + cl_page_list_splice(&cl2q->c2_qout, &queue->c2_qout); + if (rc != 0) + break; + } + + for (stripe = 0; stripe < lio->lis_nr_subios; stripe++) { + struct cl_page_list *sub_qin = QIN(stripe); + + if (list_empty(&sub_qin->pl_pages)) + continue; + + cl_page_list_splice(sub_qin, qin); + } + + if (alloc) { + OBD_FREE(stripes_qin, + sizeof(*stripes_qin) * lio->lis_nr_subios); + } else { + int i; + + for (i = 0; i < lio->lis_nr_subios; i++) { + struct cl_io *cio = lio->lis_subs[i].sub_io; + + if (cio && cio == &ld->ld_emrg[i]->emrg_subio) + lov_io_sub_fini(env, lio, &lio->lis_subs[i]); + } + lio->lis_mem_frozen = 0; + mutex_unlock(&ld->ld_mutex); + } + + RETURN(rc); +#undef QIN +} + +static int lov_io_prepare_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + ENTRY; + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_prepare_write(sub->sub_env, sub->sub_io, + sub_page, from, 
to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + RETURN(result); +} + +static int lov_io_commit_write(const struct lu_env *env, + const struct cl_io_slice *ios, + const struct cl_page_slice *slice, + unsigned from, unsigned to) +{ + struct lov_io *lio = cl2lov_io(env, ios); + struct cl_page *sub_page = lov_sub_page(slice); + struct lov_io_sub *sub; + int result; + + ENTRY; + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + result = cl_io_commit_write(sub->sub_env, sub->sub_io, + sub_page, from, to); + lov_sub_put(sub); + } else + result = PTR_ERR(sub); + RETURN(result); +} + +static int lov_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_fault_io *fio; + struct lov_io *lio; + struct lov_io_sub *sub; + + ENTRY; + fio = &ios->cis_io->u.ci_fault; + lio = cl2lov_io(env, ios); + sub = lov_sub_get(env, lio, lov_page_stripe(fio->ft_page)); + sub->sub_io->u.ci_fault.ft_nob = fio->ft_nob; + lov_sub_put(sub); + RETURN(lov_io_start(env, ios)); +} + +static const struct cl_io_operations lov_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_WRITE] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_rw_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_TRUNC] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_start, + .cio_end = lov_io_end + }, + [CIT_FAULT] = { + .cio_fini = lov_io_fini, + .cio_iter_init = lov_io_iter_init, + .cio_iter_fini = lov_io_iter_fini, + .cio_lock = lov_io_lock, + .cio_unlock = lov_io_unlock, + .cio_start = lov_io_fault_start, + .cio_end = lov_io_end + }, + [CIT_MISC] = { + .cio_fini = lov_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = lov_io_submit + }, + [CRT_WRITE] = { + .cio_submit = lov_io_submit + } + }, + .cio_prepare_write = lov_io_prepare_write, + .cio_commit_write = lov_io_commit_write +}; + +/***************************************************************************** + * + * Empty lov io operations. + * + */ + +static void lov_empty_io_fini(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + ENTRY; + EXIT; +} + +static void lov_empty_impossible(const struct lu_env *env, + struct cl_io_slice *ios) +{ + LBUG(); +} + +#define LOV_EMPTY_IMPOSSIBLE ((void *)lov_empty_impossible) + +/** + * An io operation vector for files without stripes. 
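+ * Only CIT_READ and CIT_MISC io reach this vector (lov_io_init_empty()
+ * rejects every other type up front), so the slots for the impossible
+ * types are wired to LOV_EMPTY_IMPOSSIBLE and must never be invoked.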
+ */ +static const struct cl_io_operations lov_empty_io_ops = { + .op = { + [CIT_READ] = { +#if 0 + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE +#endif + }, + [CIT_WRITE] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_TRUNC] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_FAULT] = { + .cio_fini = lov_empty_io_fini, + .cio_iter_init = LOV_EMPTY_IMPOSSIBLE, + .cio_lock = LOV_EMPTY_IMPOSSIBLE, + .cio_start = LOV_EMPTY_IMPOSSIBLE, + .cio_end = LOV_EMPTY_IMPOSSIBLE + }, + [CIT_MISC] = { + .cio_fini = lov_empty_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + }, + [CRT_WRITE] = { + .cio_submit = LOV_EMPTY_IMPOSSIBLE + } + }, + .cio_commit_write = LOV_EMPTY_IMPOSSIBLE +}; + +int lov_io_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_object *lov = cl2lov(obj); + + ENTRY; + CFS_INIT_LIST_HEAD(&lio->lis_active); + lov_io_slice_init(lio, lov, io); + if (io->ci_result == 0) { + LASSERT(lov_r0(lov)->lo_lsm != NULL); + io->ci_result = lov_io_subio_init(env, lio, io); + if (io->ci_result == 0) + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_io_ops); + } + RETURN(io->ci_result); +} + +int lov_io_init_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + int result; + + ENTRY; + switch (io->ci_type) { + default: + LBUG(); + case CIT_MISC: + case CIT_READ: + result = 0; + break; + case CIT_WRITE: + case CIT_TRUNC: + result = -EBADF; + break; + case CIT_FAULT: + result = -EFAULT; + CERROR("Page fault on a file without stripes: "DFID"\n", + PFID(lu_object_fid(&obj->co_lu))); + break; + } + if (result == 0) + cl_io_slice_add(io, &lio->lis_cl, obj, &lov_empty_io_ops); + io->ci_result = result; + RETURN(result != 0); +} + +/** @} lov */ diff --git a/lustre/lov/lov_lock.c b/lustre/lov/lov_lock.c new file mode 100644 index 0000000..14ecd68 --- /dev/null +++ b/lustre/lov/lov_lock.c @@ -0,0 +1,935 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. 
+ * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent); + +/***************************************************************************** + * + * Lov lock operations. + * + */ + +static void lov_sublock_adopt(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, int idx, + struct lov_lock_link *link) +{ + struct lovsub_lock *lsl; + struct cl_lock *parent = lck->lls_cl.cls_lock; + int rc; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sublock)); + ENTRY; + + lsl = cl2sub_lock(sublock); + /* + * check that sub-lock doesn't have lock link to this top-lock. + */ + LASSERT(lov_lock_link_find(env, lck, lsl) == NULL); + LASSERT(idx < lck->lls_nr); + + lck->lls_sub[idx].sub_lock = lsl; + lck->lls_nr_filled++; + LASSERT(lck->lls_nr_filled <= lck->lls_nr); + list_add_tail(&link->lll_list, &lsl->lss_parents); + link->lll_idx = idx; + link->lll_super = lck; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lov-child", sublock); + lck->lls_sub[idx].sub_flags |= LSF_HELD; + cl_lock_user_add(env, sublock); + + rc = lov_sublock_modify(env, lck, lsl, &sublock->cll_descr, idx); + LASSERT(rc == 0); /* there is no way this can fail, currently */ + EXIT; +} + +static struct cl_lock *lov_sublock_alloc(const struct lu_env *env, + const struct cl_io *io, + struct lov_lock *lck, + int idx, struct lov_lock_link **out) +{ + struct cl_lock *sublock; + struct cl_lock *parent; + struct lov_lock_link *link; + + LASSERT(idx < lck->lls_nr); + ENTRY; + + OBD_SLAB_ALLOC_PTR(link, lov_lock_link_kmem); + if (link != NULL) { + struct lov_lock_sub *sub; + struct cl_lock_descr *descr; + + parent = lck->lls_cl.cls_lock; + sub = &lck->lls_sub[idx]; + descr = &sub->sub_descr; + + /* XXX maybe sub-io? */ + sublock = cl_lock_hold(env, io, descr, "lov-parent", parent); + if (!IS_ERR(sublock)) + *out = link; + else + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + } else + sublock = ERR_PTR(-ENOMEM); + RETURN(sublock); +} + +static void lov_sublock_unlock(const struct lu_env *env, + struct lovsub_lock *lsl, + struct cl_lock_closure *closure) +{ + ENTRY; + lsl->lss_active = NULL; + cl_lock_disclosure(env, closure); + EXIT; +} + +static int lov_sublock_lock(const struct lu_env *env, struct lovsub_lock *lsl, + struct cl_lock_closure *closure) +{ + struct cl_lock *child; + int result; + + LASSERT(list_empty(&closure->clc_list)); + + ENTRY; + child = lsl->lss_cl.cls_lock; + result = cl_lock_closure_build(env, child, closure); + if (result == 0) { + LASSERT(cl_lock_is_mutexed(child)); + lsl->lss_active = closure->clc_origin; + } + RETURN(result); +} + +/** + * Updates the result of a top-lock operation from a result of sub-lock + * sub-operations. Top-operations like lov_lock_{enqueue,use,unuse}() iterate + * over sub-locks and lov_subresult() is used to calculate return value of a + * top-operation. 
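+ * For instance (illustrative values, assuming the ranking spelled out
+ * below), successive calls accumulate like:
+ *
+ *     lov_subresult(0, CLO_WAIT)          == CLO_WAIT
+ *     lov_subresult(CLO_WAIT, CLO_REPEAT) == CLO_REPEAT
+ *     lov_subresult(CLO_REPEAT, -EIO)     == -EIO
+ *     lov_subresult(-EIO, CLO_WAIT)       == -EIO
+ *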
To this end, possible return values of sub-operations are + * ordered as + * + * - 0 success + * - CLO_WAIT wait for event + * - CLO_REPEAT repeat top-operation + * - -ne fundamental error + * + * Top-level return code can only go down through this list. CLO_REPEAT + * overwrites CLO_WAIT, because lock mutex was released and sleeping condition + * has to be rechecked by the upper layer. + */ +static int lov_subresult(int result, int rc) +{ + int result_rank; + int rc_rank; + + LASSERT(result <= 0 || result == CLO_REPEAT || result == CLO_WAIT); + LASSERT(rc <= 0 || rc == CLO_REPEAT || rc == CLO_WAIT); + CLASSERT(CLO_WAIT < CLO_REPEAT); + + ENTRY; + + /* calculate ranks in the ordering above */ + result_rank = result < 0 ? 1 + CLO_REPEAT : result; + rc_rank = rc < 0 ? 1 + CLO_REPEAT : rc; + + if (result_rank < rc_rank) + result = rc; + RETURN(result); +} + +/** + * Creates sub-locks for a given lov_lock for the first time. + * + * Goes through all sub-objects of top-object, and creates sub-locks on every + * sub-object intersecting with top-lock extent. This is complicated by the + * fact that top-lock (that is being created) can be accessed concurrently + * through already created sub-locks (possibly shared with other top-locks). + */ +static int lov_lock_sub_init(const struct lu_env *env, + struct lov_lock *lck, const struct cl_io *io) +{ + int result = 0; + int i; + int j; + int nr; + int stripe; + int start_stripe; + obd_off start; + obd_off end; + obd_off file_start; + obd_off file_end; + + struct lov_object *loo = cl2lov(lck->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct cl_lock *parent = lck->lls_cl.cls_lock; + + ENTRY; + + lck->lls_orig = parent->cll_descr; + file_start = cl_offset(lov2cl(loo), parent->cll_descr.cld_start); + file_end = cl_offset(lov2cl(loo), parent->cll_descr.cld_end + 1) - 1; + + start_stripe = lov_stripe_number(r0->lo_lsm, file_start); + for (i = 0, nr = 0; i < r0->lo_nr; i++) { + /* + * XXX for wide striping smarter algorithm is desirable, + * breaking out of the loop, early. + */ + stripe = (start_stripe + i) % r0->lo_nr; + if (lov_stripe_intersects(r0->lo_lsm, stripe, + file_start, file_end, &start, &end)) + nr++; + } + LASSERT(nr > 0); + OBD_ALLOC(lck->lls_sub, nr * sizeof lck->lls_sub[0]); + if (lck->lls_sub == NULL) + RETURN(-ENOMEM); + + lck->lls_nr = nr; + /* + * First, fill in sub-lock descriptions in + * lck->lls_sub[].sub_descr. They are used by lov_sublock_alloc() + * (called below in this function, and by lov_lock_enqueue()) to + * create sub-locks. At this moment, no other thread can access + * top-lock. + */ + for (j = 0, nr = 0; j < i; ++j) { + stripe = (start_stripe + j) % r0->lo_nr; + if (lov_stripe_intersects(r0->lo_lsm, stripe, + file_start, file_end, &start, &end)) { + struct cl_lock_descr *descr; + + descr = &lck->lls_sub[nr].sub_descr; + + LASSERT(descr->cld_obj == NULL); + descr->cld_obj = lovsub2cl(r0->lo_sub[stripe]); + descr->cld_start = cl_index(descr->cld_obj, start); + descr->cld_end = cl_index(descr->cld_obj, end); + descr->cld_mode = parent->cll_descr.cld_mode; + lck->lls_sub[nr].sub_got = *descr; + lck->lls_sub[nr].sub_stripe = stripe; + nr++; + } + } + LASSERT(nr == lck->lls_nr); + /* + * Then, create sub-locks. Once at least one sub-lock was created, + * top-lock can be reached by other threads. 
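+ * That is why the loop below re-checks, under both mutexes, that the
+ * slot is still empty and that the top-lock has not advanced to
+ * CLS_FREEING before adopting a freshly created sub-lock.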
+ */ + for (i = 0; i < lck->lls_nr; ++i) { + struct cl_lock *sublock; + struct lov_lock_link *link; + + if (lck->lls_sub[i].sub_lock == NULL) { + sublock = lov_sublock_alloc(env, io, lck, i, &link); + if (IS_ERR(sublock)) { + result = PTR_ERR(sublock); + break; + } + cl_lock_mutex_get(env, sublock); + cl_lock_mutex_get(env, parent); + /* + * recheck under mutex that sub-lock wasn't created + * concurrently, and that top-lock is still alive. + */ + if (lck->lls_sub[i].sub_lock == NULL && + parent->cll_state < CLS_FREEING) { + lov_sublock_adopt(env, lck, sublock, i, link); + cl_lock_mutex_put(env, parent); + } else { + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, + "lov-parent", parent); + } + cl_lock_mutex_put(env, sublock); + } + } + /* + * Some sub-locks can be missing at this point. This is not a problem, + * because enqueue will create them anyway. The main duty of this function + * is to fill in sub-lock descriptions in a race-free manner. + */ + RETURN(result); +} + +static int lov_sublock_release(const struct lu_env *env, struct lov_lock *lck, + int i, int deluser, int rc) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + ENTRY; + + if (lck->lls_sub[i].sub_flags & LSF_HELD) { + struct cl_lock *sublock; + int dying; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + + lck->lls_sub[i].sub_flags &= ~LSF_HELD; + if (deluser) + cl_lock_user_del(env, sublock); + /* + * If the last hold is released and cancellation is pending + * for a sub-lock, release the parent mutex to avoid holding it + * while the sub-lock is being paged out. + */ + dying = (sublock->cll_descr.cld_mode == CLM_PHANTOM || + (sublock->cll_flags & (CLF_CANCELPEND|CLF_DOOMED))) && + sublock->cll_holds == 1; + if (dying) + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + if (dying) { + cl_lock_mutex_get(env, parent); + rc = lov_subresult(rc, CLO_REPEAT); + } + /* + * From now on lck->lls_sub[i].sub_lock is a "weak" pointer, + * not backed by a reference on a + * sub-lock. lovsub_lock_delete() will clear + * lck->lls_sub[i].sub_lock under semaphores, just before + * sub-lock is destroyed. + */ + } + RETURN(rc); +} + +static void lov_sublock_hold(const struct lu_env *env, struct lov_lock *lck, + int i) +{ + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + ENTRY; + + if (!(lck->lls_sub[i].sub_flags & LSF_HELD)) { + struct cl_lock *sublock; + + LASSERT(lck->lls_sub[i].sub_lock != NULL); + sublock = lck->lls_sub[i].sub_lock->lss_cl.cls_lock; + LASSERT(cl_lock_is_mutexed(sublock)); + LASSERT(sublock->cll_state != CLS_FREEING); + + lck->lls_sub[i].sub_flags |= LSF_HELD; + + cl_lock_get_trust(sublock); + cl_lock_hold_add(env, sublock, "lov-parent", parent); + cl_lock_user_add(env, sublock); + cl_lock_put(env, sublock); + } + EXIT; +} + +static void lov_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lov_lock *lck; + int i; + + ENTRY; + lck = cl2lov_lock(slice); + LASSERT(lck->lls_nr_filled == 0); + if (lck->lls_sub != NULL) { + for (i = 0; i < lck->lls_nr; ++i) + /* + * No sub-locks exist at this point, as sub-lock has + * a reference on its parent.
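+ * (each adopted sub-lock pins the top-lock through the "lov-child"
+ * reference taken in lov_sublock_adopt(), so fini cannot run while any
+ * sub-lock remains)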
+ */ + LASSERT(lck->lls_sub[i].sub_lock == NULL); + OBD_FREE(lck->lls_sub, lck->lls_nr * sizeof lck->lls_sub[0]); + } + OBD_SLAB_FREE_PTR(lck, lov_lock_kmem); + EXIT; +} + +/** + * Tries to advance a state machine of a given sub-lock toward enqueuing of + * the top-lock. + * + * \retval 0 if state-transition can proceed + * \retval -ve otherwise. + */ +static int lov_lock_enqueue_one(const struct lu_env *env, struct lov_lock *lck, + struct cl_lock *sublock, + struct cl_io *io, __u32 enqflags, int last) +{ + int result; + + ENTRY; + /* first, try to enqueue a sub-lock ... */ + result = cl_enqueue_try(env, sublock, io, enqflags); + if (sublock->cll_state == CLS_ENQUEUED) + /* if it is enqueued, try to `wait' on it---maybe it's already + * granted */ + result = cl_wait_try(env, sublock); + /* + * If CEF_ASYNC flag is set, then all sub-locks can be enqueued in + * parallel, otherwise---enqueue has to wait until sub-lock is granted + * before proceeding to the next one. + */ + if (result == CLO_WAIT && sublock->cll_state <= CLS_HELD && + enqflags & CEF_ASYNC && !last) + result = 0; + RETURN(result); +} + +/** + * Helper function for lov_lock_enqueue() that creates a missing sub-lock. + */ +static int lov_sublock_fill(const struct lu_env *env, struct cl_lock *parent, + struct cl_io *io, struct lov_lock *lck, int idx) +{ + struct lov_lock_link *link; + struct cl_lock *sublock; + int result; + + LASSERT(parent->cll_depth == 1); + cl_lock_mutex_put(env, parent); + sublock = lov_sublock_alloc(env, io, lck, idx, &link); + if (!IS_ERR(sublock)) + cl_lock_mutex_get(env, sublock); + cl_lock_mutex_get(env, parent); + + if (!IS_ERR(sublock)) { + if (parent->cll_state == CLS_QUEUING && + lck->lls_sub[idx].sub_lock == NULL) + lov_sublock_adopt(env, lck, sublock, idx, link); + else { + /* other thread allocated sub-lock, or enqueue is no + * longer going on */ + cl_lock_mutex_put(env, parent); + cl_lock_unhold(env, sublock, "lov-parent", parent); + cl_lock_mutex_get(env, parent); + } + cl_lock_mutex_put(env, sublock); + result = CLO_REPEAT; + } else + result = PTR_ERR(sublock); + return result; +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() for lov layer. This + * function is rather subtle, as it enqueues top-lock (i.e., advances top-lock + * state machine from CLS_QUEUING to CLS_ENQUEUED states) by juggling sub-lock + * state machines in the face of sub-locks sharing (by multiple top-locks), + * and concurrent sub-lock cancellations. + */ +static int lov_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *io, __u32 enqflags) +{ + struct cl_lock *lock = slice->cls_lock; + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, lock); + int i; + int result; + enum cl_lock_state minstate; + + ENTRY; + + for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + + if (lock->cll_state != CLS_QUEUING) { + /* + * Lock might have left QUEUING state if previous + * iteration released its mutex. Stop enqueuing in this + * case and let the upper layer decide what to do. + */ + LASSERT(i > 0 && result != 0); + break; + } + + sub = lck->lls_sub[i].sub_lock; + /* + * Sub-lock might have been canceled while the top-lock was + * cached. + */ + if (sub == NULL) { + result = lov_sublock_fill(env, lock, io, lck, i); + /* lov_sublock_fill() released @lock mutex, + * restart.
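+ * On success it returns CLO_REPEAT, which propagates out of this
+ * function and makes the upper layer restart the whole enqueue pass.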
*/ + break; + } + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, sub, closure); + if (rc == 0) { + lov_sublock_hold(env, lck, i); + rc = lov_lock_enqueue_one(env, lck, sublock, io, + enqflags, + i == lck->lls_nr - 1); + minstate = min(minstate, sublock->cll_state); + /* + * Don't hold a sub-lock in CLS_CACHED state, see + * description for lov_lock::lls_sub. + */ + if (sublock->cll_state > CLS_HELD) + rc = lov_sublock_release(env, lck, i, 1, rc); + lov_sublock_unlock(env, sub, closure); + } + result = lov_subresult(result, rc); + if (result < 0) + break; + } + cl_lock_closure_fini(closure); + RETURN(result ?: minstate >= CLS_ENQUEUED ? 0 : CLO_WAIT); +} + +static int lov_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + int result; + + ENTRY; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + + /* top-lock state cannot change concurrently, because single + * thread (one that released the last hold) carries unlocking + * to the completion. */ + LASSERT(slice->cls_lock->cll_state == CLS_UNLOCKING); + sub = lck->lls_sub[i].sub_lock; + if (sub == NULL) + continue; + + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, sub, closure); + if (rc == 0) { + if (lck->lls_sub[i].sub_flags & LSF_HELD) { + LASSERT(sublock->cll_state == CLS_HELD); + rc = cl_unuse_try(env, sublock); + if (rc != CLO_WAIT) + rc = lov_sublock_release(env, lck, + i, 0, rc); + } + lov_sublock_unlock(env, sub, closure); + } + result = lov_subresult(result, rc); + if (result < 0) + break; + } + if (result == 0 && lck->lls_unuse_race) { + lck->lls_unuse_race = 0; + result = -ESTALE; + } + cl_lock_closure_fini(closure); + RETURN(result); +} + +static int lov_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + enum cl_lock_state minstate; + int result; + int i; + + ENTRY; + + for (result = 0, minstate = CLS_FREEING, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + + sub = lck->lls_sub[i].sub_lock; + LASSERT(sub != NULL); + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, sub, closure); + if (rc == 0) { + LASSERT(sublock->cll_state >= CLS_ENQUEUED); + if (sublock->cll_state < CLS_HELD) + rc = cl_wait_try(env, sublock); + minstate = min(minstate, sublock->cll_state); + lov_sublock_unlock(env, sub, closure); + } + result = lov_subresult(result, rc); + if (result < 0) + break; + } + cl_lock_closure_fini(closure); + RETURN(result ?: minstate >= CLS_HELD ? 0 : CLO_WAIT); +} + +static int lov_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int result; + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_CACHED); + ENTRY; + + for (result = 0, i = 0; i < lck->lls_nr; ++i) { + int rc; + struct lovsub_lock *sub; + struct cl_lock *sublock; + + if (slice->cls_lock->cll_state != CLS_CACHED) { + /* see comment in lov_lock_enqueue(). */ + LASSERT(i > 0 && result != 0); + break; + } + /* + * if a sub-lock was destroyed while top-lock was in + * CLS_CACHED state, top-lock would have been moved into + * CLS_NEW state, so all sub-locks have to be in place. 
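+ * Hence the unconditional LASSERT(sub != NULL) below, unlike
+ * lov_lock_enqueue() where an empty slot is legal and gets refilled.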
+ */ + sub = lck->lls_sub[i].sub_lock; + LASSERT(sub != NULL); + sublock = sub->lss_cl.cls_lock; + rc = lov_sublock_lock(env, sub, closure); + if (rc == 0) { + LASSERT(sublock->cll_state != CLS_FREEING); + lov_sublock_hold(env, lck, i); + if (sublock->cll_state == CLS_CACHED) { + rc = cl_use_try(env, sublock); + if (rc != 0) + rc = lov_sublock_release(env, lck, + i, 1, rc); + } else + rc = 0; + lov_sublock_unlock(env, sub, closure); + } + result = lov_subresult(result, rc); + if (result < 0) + break; + } + cl_lock_closure_fini(closure); + RETURN(result); +} + +#if 0 +static int lock_lock_multi_match() +{ + struct cl_lock *lock = slice->cls_lock; + struct cl_lock_descr *subneed = &lov_env_info(env)->lti_ldescr; + struct lov_object *loo = cl2lov(lov->lls_cl.cls_obj); + struct lov_layout_raid0 *r0 = lov_r0(loo); + struct lov_lock_sub *sub; + struct cl_object *subobj; + obd_off fstart; + obd_off fend; + obd_off start; + obd_off end; + int i; + + fstart = cl_offset(need->cld_obj, need->cld_start); + fend = cl_offset(need->cld_obj, need->cld_end + 1) - 1; + subneed->cld_mode = need->cld_mode; + cl_lock_mutex_get(env, lock); + for (i = 0; i < lov->lls_nr; ++i) { + sub = &lov->lls_sub[i]; + if (sub->sub_lock == NULL) + continue; + subobj = sub->sub_descr.cld_obj; + if (!lov_stripe_intersects(r0->lo_lsm, sub->sub_stripe, + fstart, fend, &start, &end)) + continue; + subneed->cld_start = cl_index(subobj, start); + subneed->cld_end = cl_index(subobj, end); + subneed->cld_obj = subobj; + if (!cl_lock_ext_match(&sub->sub_got, subneed)) { + result = 0; + break; + } + } + cl_lock_mutex_put(env, lock); +} +#endif + +static int lov_is_same_stripe(struct lov_object *lov, int stripe, + const struct cl_lock_descr *descr) +{ + struct lov_stripe_md *lsm = lov_r0(lov)->lo_lsm; + obd_off start; + obd_off end; + + start = cl_offset(&lov->lo_cl, descr->cld_start); + end = cl_offset(&lov->lo_cl, descr->cld_end + 1) - 1; + return + end - start <= lsm->lsm_stripe_size && + stripe == lov_stripe_number(lsm, start) && + stripe == lov_stripe_number(lsm, end); +} + +/** + * An implementation of cl_lock_operations::clo_fits_into() method. + * + * Checks whether a lock (given by \a slice) is suitable for \a + * io. Multi-stripe locks can be used only for "quick" io, like truncate, or + * O_APPEND write. + * + * \see ccc_lock_fits_into(). + */ +static int lov_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + struct lov_lock *lov = cl2lov_lock(slice); + struct lov_object *obj = cl2lov(slice->cls_obj); + int result; + + LASSERT(cl_object_same(need->cld_obj, slice->cls_obj)); + LASSERT(lov->lls_nr > 0); + + ENTRY; + + if (lov->lls_nr == 1) { + /* + * If a lock is on a single stripe, it's enough to check that + * @need lock matches actually granted stripe lock, and... + */ + result = cl_lock_ext_match(&lov->lls_sub[0].sub_got, need); + if (result && lov_r0(obj)->lo_nr > 1) + /* + * ... @need is on the same stripe, if multiple + * stripes are possible at all for this object. + */ + result = lov_is_same_stripe(cl2lov(slice->cls_obj), + lov->lls_sub[0].sub_stripe, + need); + } else if (io->ci_type != CIT_TRUNC && io->ci_type != CIT_MISC && + !cl_io_is_append(io) && need->cld_mode != CLM_PHANTOM) + /* + * Multi-stripe locks are only suitable for `quick' IO and for + * glimpse. + */ + result = 0; + else + /* + * Most general case: multi-stripe existing lock, and + * (potentially) multi-stripe @need lock. 
Check that @need is + * covered by @lov's sub-locks. + * + * For now, ignore lock expansions made by the server, and + * match against original lock extent. + */ + result = cl_lock_ext_match(&lov->lls_orig, need); + CDEBUG(D_DLMTRACE, DDESCR"/"DDESCR" %i %i/%i: %i\n", + PDESCR(&lov->lls_orig), PDESCR(&lov->lls_sub[0].sub_got), + lov->lls_sub[0].sub_stripe, lov->lls_nr, lov_r0(obj)->lo_nr, + result); + RETURN(result); +} + +void lov_lock_unlink(const struct lu_env *env, + struct lov_lock_link *link, struct lovsub_lock *sub) +{ + struct lov_lock *lck = link->lll_super; + struct cl_lock *parent = lck->lls_cl.cls_lock; + + LASSERT(cl_lock_is_mutexed(parent)); + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + ENTRY; + + list_del_init(&link->lll_list); + LASSERT(lck->lls_sub[link->lll_idx].sub_lock == sub); + /* yank this sub-lock from parent's array */ + lck->lls_sub[link->lll_idx].sub_lock = NULL; + LASSERT(lck->lls_nr_filled > 0); + lck->lls_nr_filled--; + lu_ref_del(&parent->cll_reference, "lov-child", sub->lss_cl.cls_lock); + cl_lock_put(env, parent); + OBD_SLAB_FREE_PTR(link, lov_lock_link_kmem); + EXIT; +} + +struct lov_lock_link *lov_lock_link_find(const struct lu_env *env, + struct lov_lock *lck, + struct lovsub_lock *sub) +{ + struct lov_lock_link *scan; + + LASSERT(cl_lock_is_mutexed(sub->lss_cl.cls_lock)); + ENTRY; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + if (scan->lll_super == lck) + RETURN(scan); + } + RETURN(NULL); +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. This is + * invoked for "top-to-bottom" delete, when lock destruction starts from the + * top-lock, e.g., as a result of inode destruction. + * + * Unlinks top-lock from all its sub-locks. Sub-locks are not deleted there: + * this is done separately elsewhere: + * + * - for inode destruction, lov_object_delete() calls cl_object_kill() for + * each sub-object, purging its locks; + * + * - in other cases (e.g., a fatal error with a top-lock) sub-locks are + * left in the cache. 
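lov_lock_unlink() and lov_lock_link_find() above maintain a two-way association: the parent lock holds a sub-lock pointer in its lls_sub[] array, while each sub-lock keeps a list of lov_lock_link records naming its parents, and unlinking must clear both sides together under both mutexes. A toy rendering of that bookkeeping in plain C follows; the types and names are invented for the example, and the kernel code additionally uses list_head chains, reference counting, and slab allocation.

#include <stddef.h>
#include <stdio.h>

/* Toy parent<->sub-lock linkage: the parent's array slot and the
 * child's back-pointer must be cleared as a unit. */
struct toy_sub;
struct toy_parent {
        struct toy_sub *sub[4];
        int             nr_filled;
};
struct toy_sub {
        struct toy_parent *parent;
        int                idx;
};

static void toy_unlink(struct toy_parent *p, struct toy_sub *s)
{
        /* mirrors lov_lock_unlink(): drop the array slot, fix the
         * fill count, detach the child's record of the parent. */
        p->sub[s->idx] = NULL;
        p->nr_filled--;
        s->parent = NULL;
}

int main(void)
{
        struct toy_parent p = { .nr_filled = 1 };
        struct toy_sub s = { .parent = &p, .idx = 2 };

        p.sub[2] = &s;
        toy_unlink(&p, &s);
        printf("filled=%d slot2=%p\n", p.nr_filled, (void *)p.sub[2]);
        return 0;
}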
+ */ +static void lov_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + struct cl_lock_closure *closure = lov_closure_get(env, slice->cls_lock); + int i; + + LASSERT(slice->cls_lock->cll_state == CLS_FREEING); + ENTRY; + + for (i = 0; i < lck->lls_nr; ++i) { + struct lovsub_lock *lsl; + struct cl_lock *sublock; + int rc; + + lsl = lck->lls_sub[i].sub_lock; + if (lsl == NULL) + continue; + + sublock = lsl->lss_cl.cls_lock; + rc = lov_sublock_lock(env, lsl, closure); + if (rc == 0) { + if (lck->lls_sub[i].sub_flags & LSF_HELD) + lov_sublock_release(env, lck, i, 1, 0); + if (sublock->cll_state < CLS_FREEING) { + struct lov_lock_link *link; + + link = lov_lock_link_find(env, lck, lsl); + LASSERT(link != NULL); + lov_lock_unlink(env, link, lsl); + LASSERT(lck->lls_sub[i].sub_lock == NULL); + } + lov_sublock_unlock(env, lsl, closure); + } else if (rc == CLO_REPEAT) { + --i; /* repeat with this lock */ + } else { + CL_LOCK_DEBUG(D_ERROR, env, sublock, + "Cannot get sub-lock for delete: %i\n", + rc); + } + } + cl_lock_closure_fini(closure); + EXIT; +} + +static int lov_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lov_lock *lck = cl2lov_lock(slice); + int i; + + (*p)(env, cookie, "%d\n", lck->lls_nr); + for (i = 0; i < lck->lls_nr; ++i) { + struct lov_lock_sub *sub; + + sub = &lck->lls_sub[i]; + (*p)(env, cookie, " %d %x: ", i, sub->sub_flags); + if (sub->sub_lock != NULL) + cl_lock_print(env, cookie, p, + sub->sub_lock->lss_cl.cls_lock); + else + (*p)(env, cookie, "---\n"); + } + return 0; +} + +static const struct cl_lock_operations lov_lock_ops = { + .clo_fini = lov_lock_fini, + .clo_enqueue = lov_lock_enqueue, + .clo_wait = lov_lock_wait, + .clo_use = lov_lock_use, + .clo_unuse = lov_lock_unuse, + .clo_fits_into = lov_lock_fits_into, + .clo_delete = lov_lock_delete, + .clo_print = lov_lock_print +}; + +int lov_lock_init_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lov_lock *lck; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR(lck, lov_lock_kmem); + if (lck != NULL) { + cl_lock_slice_add(lock, &lck->lls_cl, obj, &lov_lock_ops); + result = lov_lock_sub_init(env, lck, io); + } else + result = -ENOMEM; + RETURN(result); +} + +static struct cl_lock_closure *lov_closure_get(const struct lu_env *env, + struct cl_lock *parent) +{ + struct cl_lock_closure *closure; + + closure = &lov_env_info(env)->lti_closure; + LINVRNT(list_empty(&closure->clc_list)); + cl_lock_closure_init(env, closure, parent, 1); + return closure; +} + + +/** @} lov */ diff --git a/lustre/lov/lov_log.c b/lustre/lov/lov_log.c index ea45e84..223234b 100644 --- a/lustre/lov/lov_log.c +++ b/lustre/lov/lov_log.c @@ -101,6 +101,12 @@ static int lov_llog_origin_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec, lsr->lsr_ogen = loi->loi_gr; break; } + case MDS_SETATTR64_REC: { + struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec; + lsr->lsr_oid = loi->loi_id; + lsr->lsr_ogen = loi->loi_gr; + break; + } default: break; } diff --git a/lustre/lov/lov_merge.c b/lustre/lov/lov_merge.c index 47e87e0..20abe46 100644 --- a/lustre/lov/lov_merge.c +++ b/lustre/lov/lov_merge.c @@ -50,19 +50,16 @@ #include "lov_internal.h" -/* Merge the lock value block(&lvb) attributes from each of the stripes in a - * file into a single lvb. 
It is expected that the caller initializes the - * current atime, mtime, ctime to avoid regressing a more uptodate time on - * the local client. - * - * If @kms_only is set then we do not consider the recently seen size (rss) - * when updating the known minimum size (kms). Even when merging RSS, we will - * take the KMS value if it's larger. This prevents getattr from stomping on - * dirty cached pages which extend the file size. */ -int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, - struct ost_lvb *lvb, int kms_only) +/** Merge the lock value block(&lvb) attributes and KMS from each of the + * stripes in a file into a single lvb. It is expected that the caller + * initializes the current atime, mtime, ctime to avoid regressing a more + * uptodate time on the local client. + */ +int lov_merge_lvb_kms(struct lov_stripe_md *lsm, + struct ost_lvb *lvb, __u64 *kms_place) { __u64 size = 0; + __u64 kms = 0; __u64 blocks = 0; __u64 current_mtime = lvb->lvb_mtime; __u64 current_atime = lvb->lvb_atime; @@ -85,7 +82,11 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, } tmpsize = loi->loi_kms; - if (kms_only == 0 && loi->loi_lvb.lvb_size > tmpsize) + lov_size = lov_stripe_size(lsm, tmpsize, i); + if (lov_size > kms) + kms = lov_size; + + if (loi->loi_lvb.lvb_size > tmpsize) tmpsize = loi->loi_lvb.lvb_size; lov_size = lov_stripe_size(lsm, tmpsize, i); @@ -98,7 +99,7 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, /* mtime is always updated with ctime, but can be set in past. As write and utime(2) may happen within 1 second, and utime's - mtime has a priority over write's one, leave mtime from mds + mtime has a priority over write's one, leave mtime from mds for the same ctimes. */ if (loi->loi_lvb.lvb_ctime > current_ctime) { current_ctime = loi->loi_lvb.lvb_ctime; @@ -106,6 +107,7 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, } } + *kms_place = kms; lvb->lvb_size = size; lvb->lvb_blocks = blocks; lvb->lvb_mtime = current_mtime; @@ -114,6 +116,31 @@ int lov_merge_lvb(struct obd_export *exp, struct lov_stripe_md *lsm, RETURN(rc); } +/** Merge the lock value block(&lvb) attributes from each of the stripes in a + * file into a single lvb. It is expected that the caller initializes the + * current atime, mtime, ctime to avoid regressing a more uptodate time on + * the local client. + * + * If @kms_only is set then we do not consider the recently seen size (rss) + * when updating the known minimum size (kms). Even when merging RSS, we will + * take the KMS value if it's larger. This prevents getattr from stomping on + * dirty cached pages which extend the file size. 
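The split above gives lov_merge_lvb_kms() one job: walk the stripes, map each stripe-local size back to a file-level size with lov_stripe_size(), and keep the maxima, so the caller gets both an RSS-based size and a KMS-based size and can choose between them. A stand-alone sketch of the size-merging step follows; it is user-space code with lov_stripe_size() simplified to a dense RAID0 formula, and all names and constants are illustrative rather than taken from the patch.

#include <stdio.h>
#include <stdint.h>

#define STRIPE_SIZE  (1ULL << 20)   /* 1 MiB, illustrative */
#define STRIPE_COUNT 3

/* Map a stripe-local object size to the file size it implies, for a
 * dense RAID0 layout (simplified stand-in for lov_stripe_size()). */
static uint64_t file_size_from(uint64_t local_size, int stripe)
{
        uint64_t last, chunk, within;

        if (local_size == 0)
                return 0;
        last   = local_size - 1;
        chunk  = last / STRIPE_SIZE;
        within = last % STRIPE_SIZE;
        return chunk * STRIPE_SIZE * STRIPE_COUNT +
               (uint64_t)stripe * STRIPE_SIZE + within + 1;
}

int main(void)
{
        /* per-stripe known-minimum sizes, as loi_kms would hold them */
        uint64_t kms_local[STRIPE_COUNT] = { STRIPE_SIZE, 4096, 0 };
        uint64_t kms = 0;
        int i;

        for (i = 0; i < STRIPE_COUNT; i++) {
                uint64_t sz = file_size_from(kms_local[i], i);
                if (sz > kms)
                        kms = sz;   /* keep the maximum, as the merge does */
        }
        printf("merged kms: %llu\n", (unsigned long long)kms);
        return 0;
}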
*/ +int lov_merge_lvb(struct obd_export *exp, + struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only) +{ + int rc; + __u64 kms; + + ENTRY; + rc = lov_merge_lvb_kms(lsm, lvb, &kms); + if (kms_only) + lvb->lvb_size = kms; + CDEBUG(D_INODE, "merged: %llu %llu %llu %llu %llu\n", + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); + RETURN(rc); +} + /* Must be called under the lov_stripe_lock() */ int lov_adjust_kms(struct obd_export *exp, struct lov_stripe_md *lsm, obd_off size, int shrink) diff --git a/lustre/lov/lov_obd.c b/lustre/lov/lov_obd.c index 84835e5..eb32c62 100644 --- a/lustre/lov/lov_obd.c +++ b/lustre/lov/lov_obd.c @@ -63,7 +63,7 @@ #include #include #include -#include +#include #include #include "lov_internal.h" @@ -104,97 +104,15 @@ void lov_putref(struct obd_device *obd) mutex_up(&lov->lov_lock); } -static int lov_register_page_removal_cb(struct obd_export *exp, - obd_page_removal_cb_t func, - obd_pin_extent_cb pin_cb) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - int i, rc = 0; - - if (lov->lov_page_removal_cb && lov->lov_page_removal_cb != func) - return -EBUSY; - - if (lov->lov_page_pin_cb && lov->lov_page_pin_cb != pin_cb) - return -EBUSY; - - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) - continue; - rc |= obd_register_page_removal_cb(lov->lov_tgts[i]->ltd_exp, - func, pin_cb); - } - - lov->lov_page_removal_cb = func; - lov->lov_page_pin_cb = pin_cb; - - return rc; -} - -static int lov_unregister_page_removal_cb(struct obd_export *exp, - obd_page_removal_cb_t func) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - int i, rc = 0; - - if (lov->lov_page_removal_cb && lov->lov_page_removal_cb != func) - return -EINVAL; - - lov->lov_page_removal_cb = NULL; - lov->lov_page_pin_cb = NULL; - - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) - continue; - rc |= obd_unregister_page_removal_cb(lov->lov_tgts[i]->ltd_exp, - func); - } - - return rc; -} - -static int lov_register_lock_cancel_cb(struct obd_export *exp, - obd_lock_cancel_cb func) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - int i, rc = 0; - - if (lov->lov_lock_cancel_cb && lov->lov_lock_cancel_cb != func) - return -EBUSY; - - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) - continue; - rc |= obd_register_lock_cancel_cb(lov->lov_tgts[i]->ltd_exp, - func); - } - - lov->lov_lock_cancel_cb = func; - - return rc; -} - -static int lov_unregister_lock_cancel_cb(struct obd_export *exp, - obd_lock_cancel_cb func) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - int i, rc = 0; - - if (lov->lov_lock_cancel_cb && lov->lov_lock_cancel_cb != func) - return -EINVAL; +static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid, + int activate); +static int lov_notify(struct obd_device *obd, struct obd_device *watched, + enum obd_notify_event ev, void *data); - for (i = 0; i < lov->desc.ld_tgt_count; i++) { - if (!lov->lov_tgts[i] || !lov->lov_tgts[i]->ltd_exp) - continue; - rc |= obd_unregister_lock_cancel_cb(lov->lov_tgts[i]->ltd_exp, - func); - } - lov->lov_lock_cancel_cb = NULL; - return rc; -} #define MAX_STRING_SIZE 128 -static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, - struct obd_connect_data *data) +int lov_connect_obd(struct obd_device *obd, __u32 index, int activate, + struct obd_connect_data *data) { struct lov_obd *lov = &obd->u.lov; struct obd_uuid tgt_uuid; @@ 
-226,6 +144,9 @@ static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
RETURN(-EINVAL);
}
+ /* override the sp_me from lov */
+ tgt_obd->u.cli.cl_sp_me = lov->lov_sp_me;
+
if (data && (data->ocd_connect_flags & OBD_CONNECT_INDEX))
data->ocd_index = index;
@@ -236,7 +157,7 @@ static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
if (activate) {
tgt_obd->obd_no_recov = 0;
- /* FIXME this is probably supposed to be
+ /* FIXME this is probably supposed to be
ptlrpc_set_import_active. Horrible naming. */
ptlrpc_activate_import(imp);
}
@@ -265,33 +186,10 @@ static int lov_connect_obd(struct obd_device *obd, __u32 index, int activate,
RETURN(-ENODEV);
}
- rc = obd_register_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_page_removal_cb,
- lov->lov_page_pin_cb);
- if (rc) {
- obd_disconnect(lov->lov_tgts[index]->ltd_exp);
- lov->lov_tgts[index]->ltd_exp = NULL;
- RETURN(rc);
- }
-
- rc = obd_register_lock_cancel_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_lock_cancel_cb);
- if (rc) {
- obd_unregister_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_page_removal_cb);
- obd_disconnect(lov->lov_tgts[index]->ltd_exp);
- lov->lov_tgts[index]->ltd_exp = NULL;
- RETURN(rc);
- }
-
rc = obd_register_observer(tgt_obd, obd);
if (rc) {
CERROR("Target %s register_observer error %d\n",
obd_uuid2str(&tgt_uuid), rc);
- obd_unregister_lock_cancel_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_lock_cancel_cb);
- obd_unregister_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_page_removal_cb);
obd_disconnect(lov->lov_tgts[index]->ltd_exp);
lov->lov_tgts[index]->ltd_exp = NULL;
RETURN(rc);
@@ -374,6 +272,16 @@ static int lov_connect(const struct lu_env *env,
obd->obd_name, i, rc);
continue;
}
+ /* an administratively disabled OST has no export; skip the notify */
+ if (!lov->lov_tgts[i]->ltd_exp)
+ continue;
+
+ rc = lov_notify(obd, lov->lov_tgts[i]->ltd_exp->exp_obd,
+ OBD_NOTIFY_ACTIVE, (void *)&i);
+ if (rc) {
+ CERROR("%s error sending notify %d\n",
+ obd->obd_name, rc);
+ }
}
lov_putref(obd);
@@ -396,11 +304,6 @@ static int lov_disconnect_obd(struct obd_device *obd, __u32 index)
CDEBUG(D_CONFIG, "%s: disconnecting target %s\n",
obd->obd_name, osc_obd->obd_name);
- obd_unregister_lock_cancel_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_lock_cancel_cb);
- obd_unregister_page_removal_cb(lov->lov_tgts[index]->ltd_exp,
- lov->lov_page_removal_cb);
-
if (lov->lov_tgts[index]->ltd_active) {
lov->lov_tgts[index]->ltd_active = 0;
lov->desc.ld_active_tgt_count--;
@@ -446,9 +349,6 @@ static int lov_disconnect_obd(struct obd_device *obd, __u32 index)
RETURN(0);
}
-static int lov_del_target(struct obd_device *obd, __u32 index,
- struct obd_uuid *uuidp, int gen);
-
static int lov_disconnect(struct obd_export *exp)
{
struct obd_device *obd = class_exp2obd(exp);
@@ -487,7 +387,9 @@ out:
/* Error codes:
*
* -EINVAL : UUID can't be found in the LOV's target list
- * - any other is lov index
+ * -ENOTCONN: The UUID is found, but the target connection is bad (!)
+ * -EBADF : The UUID is found, but the OBD is the wrong type (!)
+ * any >= 0 : lov target index */
static int lov_set_osc_active(struct obd_device *obd, struct obd_uuid *uuid,
int activate)
@@ -547,7 +449,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
{
int rc = 0;
ENTRY;
-
+
if (ev == OBD_NOTIFY_ACTIVE || ev == OBD_NOTIFY_INACTIVE) {
struct obd_uuid *uuid;
@@ -571,6 +473,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
obd_uuid2str(uuid), rc);
RETURN(rc);
}
+ /* an active event should pass the lov target index as data */
data = &rc;
}
@@ -593,6 +496,7 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
data = &i;
tgt_obd = class_exp2obd(lov->lov_tgts[i]->ltd_exp);
+
rc = obd_notify_observer(obd, tgt_obd, ev, data);
if (rc) {
CERROR("%s: notify %s of %s failed %d\n",
@@ -608,8 +512,8 @@ static int lov_notify(struct obd_device *obd, struct obd_device *watched,
RETURN(rc);
}
-static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
- __u32 index, int gen, int active)
+int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
+ __u32 index, int gen, int active)
{
struct lov_obd *lov = &obd->u.lov;
struct lov_tgt_desc *tgt;
@@ -706,6 +610,10 @@ static int lov_add_target(struct obd_device *obd, struct obd_uuid *uuidp,
if (rc)
GOTO(out, rc);
+ /* an administratively disabled OST has no export; skip the notify */
+ if (!tgt->ltd_exp)
+ GOTO(out, rc = 0);
+
rc = lov_notify(obd, tgt->ltd_exp->exp_obd,
active ? OBD_NOTIFY_ACTIVE : OBD_NOTIFY_INACTIVE,
(void *)&index);
@@ -721,8 +629,8 @@ out:
}
/* Schedule a target for deletion */
-static int lov_del_target(struct obd_device *obd, __u32 index,
- struct obd_uuid *uuidp, int gen)
+int lov_del_target(struct obd_device *obd, __u32 index,
+ struct obd_uuid *uuidp, int gen)
{
struct lov_obd *lov = &obd->u.lov;
int count = lov->desc.ld_tgt_count;
@@ -841,7 +749,7 @@ void lov_fix_desc(struct lov_desc *desc)
lov_fix_desc_qos_maxage(&desc->ld_qos_maxage);
}
-static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
+int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
{
struct lprocfs_static_vars lvars = { 0 };
struct lov_desc *desc;
@@ -896,13 +804,14 @@ static int lov_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
atomic_set(&lov->lov_refcount, 0);
CFS_INIT_LIST_HEAD(&lov->lov_qos.lq_oss_list);
init_rwsem(&lov->lov_qos.lq_rw_sem);
+ lov->lov_sp_me = LUSTRE_SP_CLI;
lov->lov_qos.lq_dirty = 1;
lov->lov_qos.lq_rr.lqr_dirty = 1;
lov->lov_qos.lq_reset = 1;
/* Default priority is toward free space balance */
lov->lov_qos.lq_prio_free = 232;
- lov->lov_pools_hash_body = lustre_hash_init("POOLS", 128, 128,
+ lov->lov_pools_hash_body = lustre_hash_init("POOLS", 7, 7,
&pool_hash_operations, 0);
CFS_INIT_LIST_HEAD(&lov->lov_pool_list);
lov->lov_pool_count = 0;
@@ -1005,9 +914,9 @@ static int lov_cleanup(struct obd_device *obd)
RETURN(0);
}
-static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
+int lov_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg,
+ __u32 *indexp, int *genp)
{
- struct lustre_cfg *lcfg = buf;
struct obd_uuid obd_uuid;
int cmd;
int rc = 0;
@@ -1025,10 +934,12 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf)
obd_str2uuid(&obd_uuid, lustre_cfg_buf(lcfg, 1));
- if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", &index) != 1)
+ if (sscanf(lustre_cfg_buf(lcfg, 2), "%d", indexp) != 1)
GOTO(out, rc = -EINVAL);
- if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", &gen) != 1)
+ if (sscanf(lustre_cfg_buf(lcfg, 3), "%d", genp) != 1)
GOTO(out, rc =
-EINVAL); + index = *indexp; + gen = *genp; if (cmd == LCFG_LOV_ADD_OBD) rc = lov_add_target(obd, &obd_uuid, index, gen, 1); else if (cmd == LCFG_LOV_ADD_INA) @@ -1048,6 +959,8 @@ static int lov_process_config(struct obd_device *obd, obd_count len, void *buf) rc = class_process_proc_param(PARAM_LOV, lvars.obd_vars, lcfg, obd); + if (rc > 0) + rc = 0; GOTO(out, rc); } case LCFG_POOL_NEW: @@ -1247,7 +1160,7 @@ do { static int lov_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *lsm, struct obd_trans_info *oti, - struct obd_export *md_exp) + struct obd_export *md_exp, void *capa) { struct lov_request_set *set; struct obd_info oinfo; @@ -1280,7 +1193,7 @@ static int lov_destroy(struct obd_export *exp, struct obdo *oa, oti->oti_logcookies = set->set_cookies + req->rq_stripe; err = obd_destroy(lov->lov_tgts[req->rq_idx]->ltd_exp, - req->rq_oi.oi_oa, NULL, oti, NULL); + req->rq_oi.oi_oa, NULL, oti, NULL, capa); err = lov_update_common_set(set, req, err); if (err) { CERROR("error: destroying objid "LPX64" subobj " @@ -1678,7 +1591,7 @@ static int lov_brw_check(struct lov_obd *lov, struct obd_info *lov_oinfo, obd_off start, end; if (!lov_stripe_intersects(lov_oinfo->oi_md, i, pga[i].off, - pga[i].off + pga[i].count, + pga[i].off + pga[i].count - 1, &start, &end)) continue; @@ -1737,330 +1650,6 @@ static int lov_brw(int cmd, struct obd_export *exp, struct obd_info *oinfo, RETURN(rc); } -static int lov_brw_interpret(struct ptlrpc_request_set *reqset, void *data, - int rc) -{ - struct lov_request_set *lovset = (struct lov_request_set *)data; - ENTRY; - - if (rc) { - lovset->set_completes = 0; - lov_fini_brw_set(lovset); - } else { - rc = lov_fini_brw_set(lovset); - } - - RETURN(rc); -} - -static int lov_brw_async(int cmd, struct obd_export *exp, - struct obd_info *oinfo, obd_count oa_bufs, - struct brw_page *pga, struct obd_trans_info *oti, - struct ptlrpc_request_set *set) -{ - struct lov_request_set *lovset; - struct lov_request *req; - struct list_head *pos; - struct lov_obd *lov = &exp->exp_obd->u.lov; - int rc = 0; - ENTRY; - - LASSERT(oinfo); - ASSERT_LSM_MAGIC(oinfo->oi_md); - - if (cmd == OBD_BRW_CHECK) { - rc = lov_brw_check(lov, oinfo, oa_bufs, pga); - RETURN(rc); - } - - rc = lov_prep_brw_set(exp, oinfo, oa_bufs, pga, oti, &lovset); - if (rc) - RETURN(rc); - - list_for_each (pos, &lovset->set_list) { - struct obd_export *sub_exp; - struct brw_page *sub_pga; - req = list_entry(pos, struct lov_request, rq_link); - - sub_exp = lov->lov_tgts[req->rq_idx]->ltd_exp; - sub_pga = lovset->set_pga + req->rq_pgaidx; - rc = obd_brw_async(cmd, sub_exp, &req->rq_oi, req->rq_oabufs, - sub_pga, oti, set); - if (rc) - GOTO(out, rc); - lov_update_common_set(lovset, req, rc); - } - LASSERT(rc == 0); - LASSERT(set->set_interpret == NULL); - LASSERT(set->set_arg == NULL); - rc = ptlrpc_set_add_cb(set, lov_brw_interpret, lovset); - if (rc) - GOTO(out, rc); - - RETURN(rc); -out: - lov_fini_brw_set(lovset); - RETURN(rc); -} - -static int lov_ap_make_ready(void *data, int cmd) -{ - struct lov_async_page *lap = lap_from_cookie(data); - - return lap->lap_caller_ops->ap_make_ready(lap->lap_caller_data, cmd); -} - -static int lov_ap_refresh_count(void *data, int cmd) -{ - struct lov_async_page *lap = lap_from_cookie(data); - - return lap->lap_caller_ops->ap_refresh_count(lap->lap_caller_data, - cmd); -} - -static void lov_ap_fill_obdo(void *data, int cmd, struct obdo *oa) -{ - struct lov_async_page *lap = lap_from_cookie(data); - - lap->lap_caller_ops->ap_fill_obdo(lap->lap_caller_data, cmd, oa); 
- /* XXX woah, shouldn't we be altering more here? size? */ - oa->o_id = lap->lap_loi_id; - oa->o_gr = lap->lap_loi_gr; - oa->o_valid |= OBD_MD_FLGROUP; - oa->o_stripe_idx = lap->lap_stripe; -} - -static void lov_ap_update_obdo(void *data, int cmd, struct obdo *oa, - obd_valid valid) -{ - struct lov_async_page *lap = lap_from_cookie(data); - - lap->lap_caller_ops->ap_update_obdo(lap->lap_caller_data, cmd,oa,valid); -} - -static int lov_ap_completion(void *data, int cmd, struct obdo *oa, int rc) -{ - struct lov_async_page *lap = lap_from_cookie(data); - - /* in a raid1 regime this would down a count of many ios - * in flight, onl calling the caller_ops completion when all - * the raid1 ios are complete */ - rc = lap->lap_caller_ops->ap_completion(lap->lap_caller_data,cmd,oa,rc); - return rc; -} - -static struct obd_capa *lov_ap_lookup_capa(void *data, int cmd) -{ - struct lov_async_page *lap = lap_from_cookie(data); - return lap->lap_caller_ops->ap_lookup_capa(lap->lap_caller_data, cmd); -} - -static struct obd_async_page_ops lov_async_page_ops = { - .ap_make_ready = lov_ap_make_ready, - .ap_refresh_count = lov_ap_refresh_count, - .ap_fill_obdo = lov_ap_fill_obdo, - .ap_update_obdo = lov_ap_update_obdo, - .ap_completion = lov_ap_completion, - .ap_lookup_capa = lov_ap_lookup_capa, -}; - -int lov_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, cfs_page_t *page, - obd_off offset, struct obd_async_page_ops *ops, - void *data, void **res, int nocache, - struct lustre_handle *lockh) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - struct lov_async_page *lap; - struct lov_lock_handles *lov_lockh = NULL; - int rc = 0; - ENTRY; - - if (!page) { - int i = 0; - /* Find an existing osc so we can get it's stupid sizeof(*oap). 
- Only because of this layering limitation will a client - mount with no osts fail */ - while (!lov->lov_tgts || !lov->lov_tgts[i] || - !lov->lov_tgts[i]->ltd_exp) { - i++; - if (i >= lov->desc.ld_tgt_count) - RETURN(-ENOMEDIUM); - } - rc = size_round(sizeof(*lap)) + - obd_prep_async_page(lov->lov_tgts[i]->ltd_exp, NULL, - NULL, NULL, 0, NULL, NULL, NULL, 0, - NULL); - RETURN(rc); - } - ASSERT_LSM_MAGIC(lsm); - LASSERT(loi == NULL); - - lap = *res; - lap->lap_magic = LOV_AP_MAGIC; - lap->lap_caller_ops = ops; - lap->lap_caller_data = data; - - /* for now only raid 0 which passes through */ - lap->lap_stripe = lov_stripe_number(lsm, offset); - lov_stripe_offset(lsm, offset, lap->lap_stripe, &lap->lap_sub_offset); - loi = lsm->lsm_oinfo[lap->lap_stripe]; - - /* so the callback doesn't need the lsm */ - lap->lap_loi_id = loi->loi_id; - lap->lap_loi_gr = lsm->lsm_object_gr; - LASSERT(lsm->lsm_object_gr > 0); - - lap->lap_sub_cookie = (void *)lap + size_round(sizeof(*lap)); - - if (lockh) { - lov_lockh = lov_handle2llh(lockh); - if (lov_lockh) { - lockh = lov_lockh->llh_handles + lap->lap_stripe; - } - } - - rc = obd_prep_async_page(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, - lsm, loi, page, lap->lap_sub_offset, - &lov_async_page_ops, lap, - &lap->lap_sub_cookie, nocache, lockh); - if (lov_lockh) - lov_llh_put(lov_lockh); - if (rc) - RETURN(rc); - CDEBUG(D_CACHE, "lap %p page %p cookie %p off "LPU64"\n", lap, page, - lap->lap_sub_cookie, offset); - RETURN(0); -} - -static int lov_queue_async_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - struct lov_async_page *lap; - int rc; - - LASSERT(loi == NULL); - - ASSERT_LSM_MAGIC(lsm); - - lap = lap_from_cookie(cookie); - - loi = lsm->lsm_oinfo[lap->lap_stripe]; - - rc = obd_queue_async_io(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, lsm, - loi, lap->lap_sub_cookie, cmd, off, count, - brw_flags, async_flags); - RETURN(rc); -} - -static int lov_set_async_flags(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - obd_flag async_flags) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - struct lov_async_page *lap; - int rc; - - LASSERT(loi == NULL); - - ASSERT_LSM_MAGIC(lsm); - - lap = lap_from_cookie(cookie); - - loi = lsm->lsm_oinfo[lap->lap_stripe]; - - rc = obd_set_async_flags(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, - lsm, loi, lap->lap_sub_cookie, async_flags); - RETURN(rc); -} - -static int lov_queue_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - struct lov_async_page *lap; - int rc; - - LASSERT(loi == NULL); - - ASSERT_LSM_MAGIC(lsm); - - lap = lap_from_cookie(cookie); - - loi = lsm->lsm_oinfo[lap->lap_stripe]; - - rc = obd_queue_group_io(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, lsm, - loi, oig, lap->lap_sub_cookie, cmd, off, count, - brw_flags, async_flags); - RETURN(rc); -} - -/* this isn't exactly optimal. we may have queued sync io in oscs on - * all stripes, but we don't record that fact at queue time. so we - * trigger sync io on all stripes. 
*/ -static int lov_trigger_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - int rc = 0, i, err; - - LASSERT(loi == NULL); - - ASSERT_LSM_MAGIC(lsm); - - for (i = 0; i < lsm->lsm_stripe_count; i++) { - loi = lsm->lsm_oinfo[i]; - if (!lov->lov_tgts[loi->loi_ost_idx] || - !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) { - CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); - continue; - } - - err = obd_trigger_group_io(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, - lsm, loi, oig); - if (rc == 0 && err != 0) - rc = err; - }; - RETURN(rc); -} - -static int lov_teardown_async_page(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie) -{ - struct lov_obd *lov = &exp->exp_obd->u.lov; - struct lov_async_page *lap; - int rc; - - LASSERT(loi == NULL); - - ASSERT_LSM_MAGIC(lsm); - - lap = lap_from_cookie(cookie); - - loi = lsm->lsm_oinfo[lap->lap_stripe]; - - rc = obd_teardown_async_page(lov->lov_tgts[loi->loi_ost_idx]->ltd_exp, - lsm, loi, lap->lap_sub_cookie); - if (rc) { - CERROR("unable to teardown sub cookie %p: %d\n", - lap->lap_sub_cookie, rc); - RETURN(rc); - } - RETURN(rc); -} - static int lov_enqueue_interpret(struct ptlrpc_request_set *rqset, void *data, int rc) { @@ -2118,50 +1707,6 @@ out: RETURN(rc); } -static int lov_match(struct obd_export *exp, struct lov_stripe_md *lsm, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *data, struct lustre_handle *lockh) -{ - struct lov_request_set *set; - struct obd_info oinfo; - struct lov_request *req; - struct list_head *pos; - struct lov_obd *lov = &exp->exp_obd->u.lov; - struct lustre_handle *lov_lockhp; - int lov_flags, rc = 0; - ENTRY; - - ASSERT_LSM_MAGIC(lsm); - LASSERT((*flags & LDLM_FL_TEST_LOCK) || mode == (mode & -mode)); - - if (!exp || !exp->exp_obd) - RETURN(-ENODEV); - - lov = &exp->exp_obd->u.lov; - rc = lov_prep_match_set(exp, &oinfo, lsm, policy, mode, lockh, &set); - if (rc) - RETURN(rc); - - list_for_each (pos, &set->set_list) { - ldlm_policy_data_t sub_policy; - req = list_entry(pos, struct lov_request, rq_link); - lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; - LASSERT(lov_lockhp); - - lov_flags = *flags; - sub_policy.l_extent = req->rq_oi.oi_policy.l_extent; - - rc = obd_match(lov->lov_tgts[req->rq_idx]->ltd_exp, - req->rq_oi.oi_md, type, &sub_policy, - mode, &lov_flags, data, lov_lockhp); - rc = lov_update_match_set(set, req, rc); - if (rc <= 0) - break; - } - lov_fini_match_set(set, mode, *flags); - RETURN(rc); -} - static int lov_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, ldlm_iterator_t it, void *data) @@ -2175,7 +1720,7 @@ static int lov_change_cbdata(struct obd_export *exp, if (!exp || !exp->exp_obd) RETURN(-ENODEV); - LASSERT(lsm->lsm_object_gr > 0); + LASSERT_MDS_GROUP(lsm->lsm_object_gr); lov = &exp->exp_obd->u.lov; for (i = 0; i < lsm->lsm_stripe_count; i++) { @@ -2186,7 +1731,7 @@ static int lov_change_cbdata(struct obd_export *exp, CDEBUG(D_HA, "lov idx %d NULL \n", loi->loi_ost_idx); continue; } - + submd.lsm_object_id = loi->loi_id; submd.lsm_object_gr = lsm->lsm_object_gr; submd.lsm_stripe_count = 0; @@ -2213,7 +1758,7 @@ static int lov_cancel(struct obd_export *exp, struct lov_stripe_md *lsm, if (!exp || !exp->exp_obd) RETURN(-ENODEV); - LASSERT(lsm->lsm_object_gr > 0); + LASSERT_MDS_GROUP(lsm->lsm_object_gr); LASSERT(lockh); lov = &exp->exp_obd->u.lov; rc = lov_prep_cancel_set(exp, &oinfo, lsm, 
mode, lockh, &set); @@ -2269,7 +1814,7 @@ static int lov_cancel_unused(struct obd_export *exp, ASSERT_LSM_MAGIC(lsm); - LASSERT(lsm->lsm_object_gr > 0); + LASSERT_MDS_GROUP(lsm->lsm_object_gr); for (i = 0; i < lsm->lsm_stripe_count; i++) { struct lov_stripe_md submd; struct lov_oinfo *loi = lsm->lsm_oinfo[i]; @@ -2386,7 +1931,7 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, { struct obd_device *obddev = class_exp2obd(exp); struct lov_obd *lov = &obddev->u.lov; - int i, rc = 0, count = lov->desc.ld_tgt_count; + int i = 0, rc = 0, count = lov->desc.ld_tgt_count; struct obd_uuid *uuidp; ENTRY; @@ -2480,6 +2025,53 @@ static int lov_iocontrol(unsigned int cmd, struct obd_export *exp, int len, case LL_IOC_LOV_SETEA: rc = lov_setea(exp, karg, uarg); break; + case OBD_IOC_QUOTACTL: { + struct if_quotactl *qctl = karg; + struct lov_tgt_desc *tgt = NULL; + struct obd_quotactl *oqctl; + + if (qctl->qc_valid == QC_OSTIDX) { + if (qctl->qc_idx < 0 || count <= qctl->qc_idx) + RETURN(-EINVAL); + + tgt = lov->lov_tgts[qctl->qc_idx]; + if (!tgt || !tgt->ltd_exp) + RETURN(-EINVAL); + } else if (qctl->qc_valid == QC_UUID) { + for (i = 0; i < count; i++) { + tgt = lov->lov_tgts[i]; + if (!tgt || + !obd_uuid_equals(&tgt->ltd_uuid, + &qctl->obd_uuid)) + continue; + + if (tgt->ltd_exp == NULL) + RETURN(-EINVAL); + + break; + } + } else { + RETURN(-EINVAL); + } + + if (i >= count) + RETURN(-EAGAIN); + + LASSERT(tgt && tgt->ltd_exp); + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + QCTL_COPY(oqctl, qctl); + rc = obd_quotactl(tgt->ltd_exp, oqctl); + if (rc == 0) { + QCTL_COPY(qctl, oqctl); + qctl->qc_valid = QC_OSTIDX; + qctl->obd_uuid = tgt->ltd_uuid; + } + OBD_FREE_PTR(oqctl); + break; + } default: { int set = 0; @@ -3149,7 +2741,7 @@ int lov_complete_many(struct obd_export *exp, struct lov_stripe_md *lsm, struct ldlm_lock *lock; struct obd_device *obd; - lock = ldlm_handle2lock(lov_lockhp); + lock = ldlm_handle2lock_long(lov_lockhp, 0); if (lock == NULL) { CDEBUG(D_HA, "lov idx %d subobj "LPX64" no lock?\n", loi->loi_ost_idx, loi->loi_id); @@ -3207,93 +2799,13 @@ void lov_stripe_unlock(struct lov_stripe_md *md) } EXPORT_SYMBOL(lov_stripe_unlock); -/** - * Checks if requested extent lock is compatible with a lock under the page. - * - * Checks if the lock under \a page is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. 
- * - * \param exp lov export - * \param lsm striping information for the file - * \param res lov_async_page placeholder - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param start start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * - * \post result == 1, *cookie == context, appropriate lock is referenced or - * \post result == 0 - * - * \retval 1 owned lock is reused for the request - * \retval 0 no lock reused for the request - * - * \see lov_release_short_lock - */ -static int lov_reget_short_lock(struct obd_export *exp, - struct lov_stripe_md *lsm, - void **res, int rw, - obd_off start, obd_off end, - void **cookie) -{ - struct lov_async_page *l = *res; - obd_off stripe_start, stripe_end = start; - - ENTRY; - - /* ensure we don't cross stripe boundaries */ - lov_extent_calc(exp, lsm, OBD_CALC_STRIPE_END, &stripe_end); - if (stripe_end <= end) - RETURN(0); - - /* map the region limits to the object limits */ - lov_stripe_offset(lsm, start, l->lap_stripe, &stripe_start); - lov_stripe_offset(lsm, end, l->lap_stripe, &stripe_end); - - RETURN(obd_reget_short_lock(exp->exp_obd->u.lov.lov_tgts[lsm-> - lsm_oinfo[l->lap_stripe]->loi_ost_idx]-> - ltd_exp, NULL, &l->lap_sub_cookie, - rw, stripe_start, stripe_end, cookie)); -} - -/** - * Releases a reference to a lock taken in a "fast" way. - * - * Releases a read or a write (specified by \a rw) lock - * referenced by \a cookie. - * - * \param exp lov export - * \param lsm striping information for the file - * \param end end of the locked extent - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * - * \post appropriate lock is dereferenced - * - * \see lov_reget_short_lock - */ -static int lov_release_short_lock(struct obd_export *exp, - struct lov_stripe_md *lsm, obd_off end, - void *cookie, int rw) -{ - int stripe; - - ENTRY; - - stripe = lov_stripe_number(lsm, end); - - RETURN(obd_release_short_lock(exp->exp_obd->u.lov.lov_tgts[lsm-> - lsm_oinfo[stripe]->loi_ost_idx]-> - ltd_exp, NULL, end, cookie, rw)); -} struct obd_ops lov_obd_ops = { .o_owner = THIS_MODULE, .o_setup = lov_setup, .o_precleanup = lov_precleanup, .o_cleanup = lov_cleanup, - .o_process_config = lov_process_config, + //.o_process_config = lov_process_config, .o_connect = lov_connect, .o_disconnect = lov_disconnect, .o_statfs = lov_statfs, @@ -3308,21 +2820,11 @@ struct obd_ops lov_obd_ops = { .o_setattr = lov_setattr, .o_setattr_async = lov_setattr_async, .o_brw = lov_brw, - .o_brw_async = lov_brw_async, - .o_prep_async_page = lov_prep_async_page, - .o_reget_short_lock = lov_reget_short_lock, - .o_release_short_lock = lov_release_short_lock, - .o_queue_async_io = lov_queue_async_io, - .o_set_async_flags = lov_set_async_flags, - .o_queue_group_io = lov_queue_group_io, - .o_trigger_group_io = lov_trigger_group_io, - .o_teardown_async_page = lov_teardown_async_page, .o_merge_lvb = lov_merge_lvb, .o_adjust_kms = lov_adjust_kms, .o_punch = lov_punch, .o_sync = lov_sync, .o_enqueue = lov_enqueue, - .o_match = lov_match, .o_change_cbdata = lov_change_cbdata, .o_cancel = lov_cancel, .o_cancel_unused = lov_cancel_unused, @@ -3333,10 +2835,6 @@ struct obd_ops lov_obd_ops = { .o_llog_init = lov_llog_init, .o_llog_finish = lov_llog_finish, .o_notify = lov_notify, - .o_register_page_removal_cb = lov_register_page_removal_cb, - 
.o_unregister_page_removal_cb = lov_unregister_page_removal_cb, - .o_register_lock_cancel_cb = lov_register_lock_cancel_cb, - .o_unregister_lock_cancel_cb = lov_unregister_lock_cancel_cb, .o_pool_new = lov_pool_new, .o_pool_rem = lov_pool_remove, .o_pool_add = lov_pool_add, @@ -3348,17 +2846,30 @@ extern quota_interface_t lov_quota_interface; cfs_mem_cache_t *lov_oinfo_slab; +extern struct lu_kmem_descr lov_caches[]; + int __init lov_init(void) { struct lprocfs_static_vars lvars = { 0 }; int rc, rc2; ENTRY; + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_CONSOLE, "Lustre LOV module (%p).\n", &lov_caches); + + rc = lu_kmem_init(lov_caches); + if (rc) + return rc; + lov_oinfo_slab = cfs_mem_cache_create("lov_oinfo", - sizeof(struct lov_oinfo), + sizeof(struct lov_oinfo), 0, SLAB_HWCACHE_ALIGN); - if (lov_oinfo_slab == NULL) + if (lov_oinfo_slab == NULL) { + lu_kmem_fini(lov_caches); return -ENOMEM; + } lprocfs_lov_init_vars(&lvars); request_module("lquota"); @@ -3366,12 +2877,14 @@ int __init lov_init(void) init_obd_quota_ops(quota_interface, &lov_obd_ops); rc = class_register_type(&lov_obd_ops, NULL, lvars.module_vars, - LUSTRE_LOV_NAME, NULL); + LUSTRE_LOV_NAME, &lov_device_type); + if (rc) { if (quota_interface) PORTAL_SYMBOL_PUT(lov_quota_interface); rc2 = cfs_mem_cache_destroy(lov_oinfo_slab); LASSERT(rc2 == 0); + lu_kmem_fini(lov_caches); } RETURN(rc); @@ -3381,7 +2894,10 @@ int __init lov_init(void) static void /*__exit*/ lov_exit(void) { int rc; - + + lu_device_type_fini(&lov_device_type); + lu_kmem_fini(lov_caches); + if (quota_interface) PORTAL_SYMBOL_PUT(lov_quota_interface); diff --git a/lustre/lov/lov_object.c b/lustre/lov/lov_object.c new file mode 100644 index 0000000..d5781b4 --- /dev/null +++ b/lustre/lov/lov_object.c @@ -0,0 +1,700 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOV layer. + * + * Author: Nikita Danilov + */ + +/** \addtogroup lov lov @{ */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/***************************************************************************** + * + * Layout operations. 
+ *
+ */
+
+struct lov_layout_operations {
+ int (*llo_init)(const struct lu_env *env, struct lov_device *dev,
+ struct lov_object *lov,
+ const struct cl_object_conf *conf,
+ union lov_layout_state *state);
+ void (*llo_delete)(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state);
+ void (*llo_fini)(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state);
+ void (*llo_install)(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state);
+ int (*llo_print)(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o);
+ struct cl_page *(*llo_page_init)(const struct lu_env *env,
+ struct cl_object *obj,
+ struct cl_page *page,
+ cfs_page_t *vmpage);
+ int (*llo_lock_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_lock *lock,
+ const struct cl_io *io);
+ int (*llo_io_init)(const struct lu_env *env,
+ struct cl_object *obj, struct cl_io *io);
+ int (*llo_getattr)(const struct lu_env *env, struct cl_object *obj,
+ struct cl_attr *attr);
+};
+
+/*****************************************************************************
+ *
+ * Lov object layout operations.
+ *
+ */
+
+static void lov_install_empty(const struct lu_env *env,
+ struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ /*
+ * File without objects.
+ */
+}
+
+static int lov_init_empty(const struct lu_env *env,
+ struct lov_device *dev, struct lov_object *lov,
+ const struct cl_object_conf *conf,
+ union lov_layout_state *state)
+{
+ return 0;
+}
+
+static void lov_install_raid0(const struct lu_env *env,
+ struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ lov->u = *state;
+}
+
+static void oinfo_get_fid(const struct lov_oinfo *oinfo, struct lu_fid *fid)
+{
+ __u64 idx = oinfo->loi_id;
+
+ /* See idif definition in wiki:CMD3_interoperability_architecture */
+
+ LASSERT(oinfo->loi_gr < 1ULL << 16);
+ LASSERT(oinfo->loi_id < 1ULL << 49);
+ ENTRY;
+
+ /*
+ * The fid of a stripe is not unique by itself, so the ost_idx has to
+ * be used to make it unique. This is OK because the stripe fids are
+ * only used on the client side (to locate the objects).
-jay + */ + fid->f_seq = ((__u64)oinfo->loi_ost_idx) << 32 | + oinfo->loi_gr << 16 | idx >> 32; + fid->f_oid = idx; /* truncated to 32 bits by assignment */ + fid->f_ver = 0; + EXIT; +} + +static struct cl_object *lov_sub_find(const struct lu_env *env, + struct cl_device *dev, + const struct lu_fid *fid, + const struct cl_object_conf *conf) +{ + struct lu_object *o; + + ENTRY; + o = lu_object_find_at(env, cl2lu_dev(dev), fid, &conf->coc_lu); + LASSERT(ergo(!IS_ERR(o), o->lo_dev->ld_type == &lovsub_device_type)); + RETURN(lu2cl(o)); +} + +static int lov_init_sub(const struct lu_env *env, struct lov_object *lov, + struct cl_object *stripe, + struct lov_layout_raid0 *r0, int idx) +{ + struct cl_object_header *hdr; + struct cl_object_header *subhdr; + struct cl_object_header *parent; + struct lov_oinfo *oinfo; + int result; + + hdr = cl_object_header(lov2cl(lov)); + subhdr = cl_object_header(stripe); + parent = subhdr->coh_parent; + + oinfo = r0->lo_lsm->lsm_oinfo[idx]; + CDEBUG(D_INODE, DFID"@%p[%d] -> "DFID"@%p: id: "LPU64" gr: "LPU64 + " idx: %d gen: %d\n", + PFID(&subhdr->coh_lu.loh_fid), subhdr, idx, + PFID(&hdr->coh_lu.loh_fid), hdr, + oinfo->loi_id, oinfo->loi_gr, + oinfo->loi_ost_idx, oinfo->loi_ost_gen); + + if (parent == NULL) { + subhdr->coh_parent = hdr; + subhdr->coh_nesting = hdr->coh_nesting + 1; + lu_object_ref_add(&stripe->co_lu, "lov-parent", lov); + r0->lo_sub[idx] = cl2lovsub(stripe); + r0->lo_sub[idx]->lso_super = lov; + r0->lo_sub[idx]->lso_index = idx; + result = 0; + } else { + CERROR("Stripe is already owned by other file (%i).\n", idx); + LU_OBJECT_DEBUG(D_ERROR, env, &stripe->co_lu, "\n"); + LU_OBJECT_DEBUG(D_ERROR, env, lu_object_top(&parent->coh_lu), + "old\n"); + LU_OBJECT_HEADER(D_ERROR, env, lov2lu(lov), "new\n"); + cl_object_put(env, stripe); + result = -EIO; + } + return result; +} + +static int lov_init_raid0(const struct lu_env *env, + struct lov_device *dev, struct lov_object *lov, + const struct cl_object_conf *conf, + union lov_layout_state *state) +{ + int result; + int i; + + struct cl_object *stripe; + struct lov_thread_info *lti = lov_env_info(env); + struct cl_object_conf *subconf = <i->lti_stripe_conf; + struct lov_stripe_md *lsm = conf->u.coc_md->lsm; + struct lu_fid *ofid = <i->lti_fid; + struct lov_layout_raid0 *r0 = &state->raid0; + + ENTRY; + r0->lo_nr = conf->u.coc_md->lsm->lsm_stripe_count; + r0->lo_lsm = conf->u.coc_md->lsm; + LASSERT(r0->lo_nr <= lov_targets_nr(dev)); + + OBD_ALLOC(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]); + if (r0->lo_sub != NULL) { + result = 0; + subconf->coc_inode = conf->coc_inode; + /* + * Create stripe cl_objects. 
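The packing in oinfo_get_fid() above is worth spelling out: under the asserted field widths (loi_gr below 2^16, loi_id below 2^49), f_seq carries the OST index in its top 32 bits, the group in the next 16, and the object id's high bits at the bottom, while f_oid takes the id's low 32 bits. A quick user-space check of that packing follows; the struct and helper names are made up for the sketch, and the shift layout is copied from the function above rather than from any header.

#include <stdio.h>
#include <stdint.h>

/* Rebuild the stripe "fid" packing from oinfo_get_fid() above. */
struct sketch_fid { uint64_t f_seq; uint32_t f_oid; uint32_t f_ver; };

static void pack_fid(struct sketch_fid *fid, uint32_t ost_idx,
                     uint64_t group, uint64_t id)
{
        fid->f_seq = ((uint64_t)ost_idx << 32) | (group << 16) | (id >> 32);
        fid->f_oid = (uint32_t)id;      /* low 32 bits of the object id */
        fid->f_ver = 0;
}

int main(void)
{
        struct sketch_fid fid;

        pack_fid(&fid, 7, 0, (1ULL << 33) + 42);
        printf("seq=%#llx oid=%#x\n",
               (unsigned long long)fid.f_seq, fid.f_oid);
        return 0;
}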
+ */
+ for (i = 0; i < r0->lo_nr && result == 0; ++i) {
+ struct cl_device *subdev;
+ struct lov_oinfo *oinfo = lsm->lsm_oinfo[i];
+ int ost_idx = oinfo->loi_ost_idx;
+
+ oinfo_get_fid(oinfo, ofid);
+ subdev = lovsub2cl_dev(dev->ld_target[ost_idx]);
+ subconf->u.coc_oinfo = oinfo;
+ stripe = lov_sub_find(env, subdev, ofid, subconf);
+ if (!IS_ERR(stripe))
+ result = lov_init_sub(env, lov, stripe, r0, i);
+ else
+ result = PTR_ERR(stripe);
+ }
+ } else
+ result = -ENOMEM;
+ RETURN(result);
+}
+
+static void lov_delete_empty(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+static void lov_subobject_kill(const struct lu_env *env, struct lov_object *lov,
+ struct lovsub_object *los, int idx)
+{
+ struct cl_object *sub;
+ struct lov_layout_raid0 *r0;
+ struct lu_site *site;
+ cfs_waitlink_t *waiter;
+
+ r0 = &lov->u.raid0;
+ sub = lovsub2cl(los);
+ LASSERT(r0->lo_sub[idx] == los);
+
+ cl_object_kill(env, sub);
+ /* release a reference to the sub-object and ... */
+ lu_object_ref_del(&sub->co_lu, "lov-parent", lov);
+ cl_object_put(env, sub);
+
+ /* ... wait until it is actually destroyed---sub-object clears its
+ * ->lo_sub[] slot in lovsub_object_fini() */
+ if (r0->lo_sub[idx] == los) {
+ waiter = &lov_env_info(env)->lti_waiter;
+ site = sub->co_lu.lo_dev->ld_site;
+ cfs_waitlink_init(waiter);
+ cfs_waitq_add(&site->ls_marche_funebre, waiter);
+ set_current_state(CFS_TASK_UNINT);
+
+ while (r0->lo_sub[idx] == los)
+ /* this wait-queue is signaled at the end of
+ * lu_object_free(). */
+ cfs_waitq_wait(waiter, CFS_TASK_UNINT);
+ cfs_waitq_del(&site->ls_marche_funebre, waiter);
+ }
+ LASSERT(r0->lo_sub[idx] == NULL);
+}
+
+static void lov_delete_raid0(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ struct lov_layout_raid0 *r0 = &state->raid0;
+ int i;
+
+ ENTRY;
+ if (r0->lo_sub != NULL) {
+ for (i = 0; i < r0->lo_nr; ++i) {
+ struct lovsub_object *los = r0->lo_sub[i];
+
+ if (los != NULL)
+ /*
+ * If top-level object is to be evicted from
+ * the cache, so are its sub-objects.
+ */
+ lov_subobject_kill(env, lov, los, i);
+ }
+ }
+ EXIT;
+}
+
+static void lov_fini_empty(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ LASSERT(lov->lo_type == LLT_EMPTY);
+}
+
+static void lov_fini_raid0(const struct lu_env *env, struct lov_object *lov,
+ union lov_layout_state *state)
+{
+ struct lov_layout_raid0 *r0 = &state->raid0;
+
+ ENTRY;
+ if (r0->lo_sub != NULL) {
+ OBD_FREE(r0->lo_sub, r0->lo_nr * sizeof r0->lo_sub[0]);
+ r0->lo_sub = NULL;
+ }
+ EXIT;
+}
+
+static int lov_print_empty(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ (*p)(env, cookie, "empty\n");
+ return 0;
+}
+
+static int lov_print_raid0(const struct lu_env *env, void *cookie,
+ lu_printer_t p, const struct lu_object *o)
+{
+ struct lov_object *lov = lu2lov(o);
+ struct lov_layout_raid0 *r0 = lov_r0(lov);
+ int i;
+
+ (*p)(env, cookie, "stripes: %d:\n", r0->lo_nr);
+ for (i = 0; i < r0->lo_nr; ++i) {
+ struct lu_object *sub;
+
+ if (r0->lo_sub[i] != NULL) {
+ sub = lovsub2lu(r0->lo_sub[i]);
+ lu_object_print(env, cookie, p, sub);
+ } else
+ (*p)(env, cookie, "sub %d absent\n", i);
+ }
+ return 0;
+}
+
+/**
+ * Implements cl_object_operations::coo_attr_get() method for an object
+ * without stripes (LLT_EMPTY layout type).
+ *
+ * The only attribute this layer is authoritative for in this case is
+ * cl_attr::cat_blocks, which is 0.
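lov_subobject_kill() above uses a classic kernel idiom: drop the last reference, then park on a wait queue that the allocator signals from lu_object_free(), re-checking the ->lo_sub[] slot on every wakeup until the sub-object is really gone. A condensed user-space analogue using pthreads follows; the condition variable stands in for ls_marche_funebre, and everything here is illustrative rather than a translation of the kernel primitives.

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t freed = PTHREAD_COND_INITIALIZER; /* ~ls_marche_funebre */
static int dummy;
static void *slot = &dummy;                             /* ~r0->lo_sub[idx] */

static void *destructor(void *arg)
{
        pthread_mutex_lock(&lock);
        slot = NULL;                 /* ~lovsub_object_fini() clearing slot */
        pthread_cond_broadcast(&freed);
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t;

        pthread_create(&t, NULL, destructor, NULL);
        pthread_mutex_lock(&lock);
        while (slot != NULL)         /* re-check, as the kernel loop does */
                pthread_cond_wait(&freed, &lock);
        pthread_mutex_unlock(&lock);
        pthread_join(t, NULL);
        puts("sub-object really gone");
        return 0;
}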
+ */ +static int lov_attr_get_empty(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + attr->cat_blocks = 0; + return 0; +} + +static int lov_attr_get_raid0(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_object *lov = cl2lov(obj); + struct lov_layout_raid0 *r0 = lov_r0(lov); + struct lov_stripe_md *lsm = lov->u.raid0.lo_lsm; + struct ost_lvb *lvb = &lov_env_info(env)->lti_lvb; + __u64 kms; + int result = 0; + + ENTRY; + if (!r0->lo_attr_valid) { + /* + * Fill LVB with attributes already initialized by the upper + * layer. + */ + cl_attr2lvb(lvb, attr); + kms = attr->cat_kms; + + /* + * XXX that should be replaced with a loop over sub-objects, + * doing cl_object_attr_get() on them. But for now, let's + * reuse old lov code. + */ + + /* + * XXX take lsm spin-lock to keep lov_merge_lvb_kms() + * happy. It's not needed, because new code uses + * ->coh_attr_guard spin-lock to protect consistency of + * sub-object attributes. + */ + lov_stripe_lock(lsm); + result = lov_merge_lvb_kms(lsm, lvb, &kms); + lov_stripe_unlock(lsm); + if (result == 0) { + cl_lvb2attr(attr, lvb); + attr->cat_kms = kms; + r0->lo_attr_valid = 1; + r0->lo_attr = *attr; + } + } else + *attr = r0->lo_attr; + RETURN(result); +} + +const static struct lov_layout_operations lov_dispatch[] = { + [LLT_EMPTY] = { + .llo_init = lov_init_empty, + .llo_delete = lov_delete_empty, + .llo_fini = lov_fini_empty, + .llo_install = lov_install_empty, + .llo_print = lov_print_empty, + .llo_page_init = lov_page_init_empty, + .llo_lock_init = NULL, + .llo_io_init = lov_io_init_empty, + .llo_getattr = lov_attr_get_empty + }, + [LLT_RAID0] = { + .llo_init = lov_init_raid0, + .llo_delete = lov_delete_raid0, + .llo_fini = lov_fini_raid0, + .llo_install = lov_install_raid0, + .llo_print = lov_print_raid0, + .llo_page_init = lov_page_init_raid0, + .llo_lock_init = lov_lock_init_raid0, + .llo_io_init = lov_io_init_raid0, + .llo_getattr = lov_attr_get_raid0 + } +}; + + +/** + * Performs a double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH_NOLOCK(obj, op, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ +}) + +#define LOV_2DISPATCH_MAYLOCK(obj, op, lock, ...) \ +({ \ + struct lov_object *__obj = (obj); \ + int __lock = !!(lock); \ + typeof(lov_dispatch[0].op(__VA_ARGS__)) __result; \ + \ + __lock &= __obj->lo_owner != cfs_current(); \ + if (__lock) \ + down_read(&__obj->lo_type_guard); \ + __result = LOV_2DISPATCH_NOLOCK(obj, op, __VA_ARGS__); \ + if (__lock) \ + up_read(&__obj->lo_type_guard); \ + __result; \ +}) + +/** + * Performs a locked double-dispatch based on the layout type of an object. + */ +#define LOV_2DISPATCH(obj, op, ...) \ + LOV_2DISPATCH_MAYLOCK(obj, op, 1, __VA_ARGS__) + +#define LOV_2DISPATCH_VOID(obj, op, ...) 
\ +do { \ + struct lov_object *__obj = (obj); \ + enum lov_layout_type __llt; \ + \ + if (__obj->lo_owner != cfs_current()) \ + down_read(&__obj->lo_type_guard); \ + __llt = __obj->lo_type; \ + LASSERT(0 <= __llt && __llt < ARRAY_SIZE(lov_dispatch)); \ + lov_dispatch[__llt].op(__VA_ARGS__); \ + if (__obj->lo_owner != cfs_current()) \ + up_read(&__obj->lo_type_guard); \ +} while (0) + +static int lov_layout_change(const struct lu_env *env, + struct lov_object *obj, enum lov_layout_type llt, + const struct cl_object_conf *conf) +{ + int result; + union lov_layout_state *state = &lov_env_info(env)->lti_state; + const struct lov_layout_operations *old_ops; + const struct lov_layout_operations *new_ops; + + LASSERT(0 <= obj->lo_type && obj->lo_type < ARRAY_SIZE(lov_dispatch)); + LASSERT(0 <= llt && llt < ARRAY_SIZE(lov_dispatch)); + ENTRY; + + old_ops = &lov_dispatch[obj->lo_type]; + new_ops = &lov_dispatch[llt]; + + result = new_ops->llo_init(env, lu2lov_dev(obj->lo_cl.co_lu.lo_dev), + obj, conf, state); + if (result == 0) { + struct cl_object_header *hdr = cl_object_header(&obj->lo_cl); + void *cookie; + struct lu_env *nested; + int refcheck; + + cookie = cl_env_reenter(); + nested = cl_env_get(&refcheck); + if (!IS_ERR(nested)) + cl_object_prune(nested, &obj->lo_cl); + else + result = PTR_ERR(nested); + cl_env_put(nested, &refcheck); + cl_env_reexit(cookie); + + old_ops->llo_fini(env, obj, &obj->u); + LASSERT(list_empty(&hdr->coh_locks)); + LASSERT(hdr->coh_tree.rnode == NULL); + LASSERT(hdr->coh_pages == 0); + + new_ops->llo_install(env, obj, state); + obj->lo_type = llt; + } else + new_ops->llo_fini(env, obj, state); + RETURN(result); +} + +/***************************************************************************** + * + * Lov object operations. + * + */ + +int lov_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lov_device *dev = lu2lov_dev(obj->lo_dev); + struct lov_object *lov = lu2lov(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + union lov_layout_state *set = &lov_env_info(env)->lti_state; + const struct lov_layout_operations *ops; + int result; + + ENTRY; + init_rwsem(&lov->lo_type_guard); + + /* no locking is necessary, as object is being created */ + lov->lo_type = cconf->u.coc_md->lsm != NULL ? LLT_RAID0 : LLT_EMPTY; + ops = &lov_dispatch[lov->lo_type]; + result = ops->llo_init(env, dev, lov, cconf, set); + if (result == 0) + ops->llo_install(env, lov, set); + else + ops->llo_fini(env, lov, set); + RETURN(result); +} + +static int lov_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + struct lov_object *lov = cl2lov(obj); + int result; + + ENTRY; + /* + * Currently only LLT_EMPTY -> LLT_RAID0 transition is supported. 
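The LOV_2DISPATCH_* macros above implement dispatch-by-layout on top of the lov_dispatch[] table, optionally taking lo_type_guard in read mode unless the current thread already holds it in write mode (tracked via lo_owner). The core table-dispatch idea, stripped of the locking, looks like this in plain C; the enum, struct, and function names are invented for the sketch.

#include <stdio.h>

enum layout { SK_EMPTY, SK_RAID0, SK_NR };

struct layout_ops {
        const char *(*describe)(void);
};

static const char *empty_describe(void) { return "no objects"; }
static const char *raid0_describe(void) { return "striped"; }

/* per-layout operation table, like lov_dispatch[] */
static const struct layout_ops dispatch[SK_NR] = {
        [SK_EMPTY] = { .describe = empty_describe },
        [SK_RAID0] = { .describe = raid0_describe },
};

struct object { enum layout type; };

int main(void)
{
        struct object o = { .type = SK_RAID0 };

        /* double dispatch: first on the object's layout, then the op */
        printf("%s\n", dispatch[o.type].describe());
        return 0;
}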
+ */ + LASSERT(lov->lo_owner != cfs_current()); + down_write(&lov->lo_type_guard); + LASSERT(lov->lo_owner == NULL); + lov->lo_owner = cfs_current(); + if (lov->lo_type == LLT_EMPTY && conf->u.coc_md->lsm != NULL) + result = lov_layout_change(env, lov, LLT_RAID0, conf); + else + result = -EOPNOTSUPP; + lov->lo_owner = NULL; + up_write(&lov->lo_type_guard); + RETURN(result); +} + +static void lov_object_delete(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_delete, env, lov, &lov->u); + EXIT; +} + +static void lov_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lov_object *lov = lu2lov(obj); + + ENTRY; + LOV_2DISPATCH_VOID(lov, llo_fini, env, lov, &lov->u); + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(lov, lov_object_kmem); + EXIT; +} + +static int lov_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + return LOV_2DISPATCH(lu2lov(o), llo_print, env, cookie, p, o); +} + +struct cl_page *lov_page_init(const struct lu_env *env, struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage) +{ + return LOV_2DISPATCH(cl2lov(obj), + llo_page_init, env, obj, page, vmpage); +} + +/** + * Implements cl_object_operations::clo_io_init() method for lov + * layer. Dispatches to the appropriate layout io initialization method. + */ +int lov_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + CL_IO_SLICE_CLEAN(lov_env_io(env), lis_cl); + /* + * Do not take lock in case of CIT_MISC io, because + * + * - if this is an io for a glimpse, then we don't care; + * + * - if this not a glimpse (writepage or lock cancellation), then + * layout change cannot happen because a page or a lock + * already exist; and + * + * - lock ordering (lock mutex nests within layout rw-semaphore) + * is obeyed in case of lock cancellation. + */ + return LOV_2DISPATCH_MAYLOCK(cl2lov(obj), llo_io_init, + io->ci_type != CIT_MISC, env, obj, io); +} + +/** + * An implementation of cl_object_operations::clo_attr_get() method for lov + * layer. For raid0 layout this collects and merges attributes of all + * sub-objects. + */ +static int lov_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + /* do not take lock, as this function is called under a + * spin-lock. Layout is protected from changing by ongoing IO. */ + return LOV_2DISPATCH_NOLOCK(cl2lov(obj), llo_getattr, env, obj, attr); +} + +static int lov_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + /* + * No dispatch is required here, as no layout implements this. 
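lov_conf_set() above takes lo_type_guard for write and records itself in lo_owner; the dispatch macros then skip their read-lock when the caller is that same thread, which lets the layout-change path call back into dispatched methods without self-deadlocking on the rw-semaphore. A minimal pthreads rendering of the owner-check idea follows; it is illustrative only, since the kernel code uses down_read()/down_write() on a rw_semaphore and cfs_current() for identity.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t type_guard = PTHREAD_RWLOCK_INITIALIZER;
static pthread_t owner;              /* ~lo_owner */
static int owner_set;

static void dispatch_op(void)
{
        /* ~LOV_2DISPATCH: lock only if we are not the write owner */
        int need_lock = !owner_set || !pthread_equal(owner, pthread_self());

        if (need_lock)
                pthread_rwlock_rdlock(&type_guard);
        puts("dispatched under the right locking");
        if (need_lock)
                pthread_rwlock_unlock(&type_guard);
}

int main(void)
{
        pthread_rwlock_wrlock(&type_guard);   /* ~lov_conf_set() */
        owner = pthread_self();
        owner_set = 1;
        dispatch_op();                        /* re-entry: no self-deadlock */
        owner_set = 0;
        pthread_rwlock_unlock(&type_guard);
        return 0;
}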
+ */ + return 0; +} + +int lov_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + return LOV_2DISPATCH(cl2lov(obj), llo_lock_init, env, obj, lock, io); +} + +static const struct cl_object_operations lov_ops = { + .coo_page_init = lov_page_init, + .coo_lock_init = lov_lock_init, + .coo_io_init = lov_io_init, + .coo_attr_get = lov_attr_get, + .coo_attr_set = lov_attr_set, + .coo_conf_set = lov_conf_set +}; + +static const struct lu_object_operations lov_lu_obj_ops = { + .loo_object_init = lov_object_init, + .loo_object_delete = lov_object_delete, + .loo_object_release = NULL, + .loo_object_free = lov_object_free, + .loo_object_print = lov_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lov_object_alloc(const struct lu_env *env, + const struct lu_object_header *_, + struct lu_device *dev) +{ + struct lov_object *lov; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR(lov, lov_object_kmem); + if (lov != NULL) { + obj = lov2lu(lov); + lu_object_init(obj, NULL, dev); + lov->lo_cl.co_ops = &lov_ops; + lov->lo_type = -1; /* invalid, to catch uninitialized type */ + /* + * object io operation vector (cl_object::co_iop) is installed + * later in lov_object_init(), as different vectors are used + * for object with different layouts. + */ + obj->lo_ops = &lov_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/lustre/lov/lov_page.c b/lustre/lov/lov_page.c new file mode 100644 index 0000000..3efbc41 --- /dev/null +++ b/lustre/lov/lov_page.c @@ -0,0 +1,227 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOV layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +/***************************************************************************** + * + * Lov page operations. 
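+ *
+ * A lov page of a RAID0 object couples the top-level cl_page with the
+ * sub-page of the single stripe that covers it, while a page of an
+ * LLT_EMPTY object is simply zero-filled and exported (see
+ * lov_page_init_raid0() and lov_page_init_empty() below).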
+ * + */ + +static int lov_page_invariant(const struct cl_page_slice *slice) +{ + const struct cl_page *page = slice->cpl_page; + const struct cl_page *sub = lov_sub_page(slice); + + return ergo(sub != NULL, + page->cp_child == sub && + sub->cp_parent == page && + page->cp_state == sub->cp_state); +} + +static void lov_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct lov_page *lp = cl2lov_page(slice); + struct cl_page *sub = lov_sub_page(slice); + + LINVRNT(lov_page_invariant(slice)); + ENTRY; + + if (sub != NULL) { + LASSERT(sub->cp_state == CPS_FREEING); + lu_ref_del(&sub->cp_reference, "lov", sub->cp_parent); + sub->cp_parent = NULL; + slice->cpl_page->cp_child = NULL; + cl_page_put(env, sub); + } + OBD_SLAB_FREE_PTR(lp, lov_page_kmem); + EXIT; +} + +static void lov_page_own(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + struct lov_io *lio = lov_env_io(env); + struct lov_io_sub *sub; + + LINVRNT(lov_page_invariant(slice)); + LINVRNT(!cl2lov_page(slice)->lps_invalid); + ENTRY; + + sub = lov_page_subio(env, lio, slice); + if (!IS_ERR(sub)) { + lov_sub_page(slice)->cp_owner = sub->sub_io; + lov_sub_put(sub); + } else + LBUG(); /* Arrgh */ + EXIT; +} + +static void lov_page_assume(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *io) +{ + return lov_page_own(env, slice, io); +} + +static int lov_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct lov_page *lp = cl2lov_page(slice); + + return (*printer)(env, cookie, LUSTRE_LOV_NAME"-page@%p\n", lp); +} + +static const struct cl_page_operations lov_page_ops = { + .cpo_fini = lov_page_fini, + .cpo_own = lov_page_own, + .cpo_assume = lov_page_assume, + .cpo_print = lov_page_print +}; + +static void lov_empty_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct lov_page *lp = cl2lov_page(slice); + + LASSERT(slice->cpl_page->cp_child == NULL); + ENTRY; + OBD_SLAB_FREE_PTR(lp, lov_page_kmem); + EXIT; +} + +struct cl_page *lov_page_init_raid0(const struct lu_env *env, + struct cl_object *obj, struct cl_page *page, + cfs_page_t *vmpage) +{ + struct lov_page *lpg; + struct lov_object *loo = cl2lov(obj); + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR(lpg, lov_page_kmem); + if (lpg != NULL) { + loff_t offset; + int stripe; + obd_off suboff; + struct cl_page *subpage; + struct cl_object *subobj; + struct lov_layout_raid0 *r0 = lov_r0(loo); + + offset = cl_offset(obj, page->cp_index); + stripe = lov_stripe_number(r0->lo_lsm, offset); + result = lov_stripe_offset(r0->lo_lsm, offset, stripe, + &suboff); + LASSERT(stripe < r0->lo_nr); + LASSERT(result == 0); + + subobj = lovsub2cl(r0->lo_sub[stripe]); + subpage = cl_page_find(env, subobj, + cl_index(subobj, suboff), vmpage, + page->cp_type); + if (!IS_ERR(subpage)) { + if (subpage->cp_parent != NULL) { + /* + * This is only possible when TRANSIENT page + * is being created, and CACHEABLE sub-page + * (attached to already existing top-page) has + * been found. Tell cl_page_find() to use + * existing page. + */ + LASSERT(subpage->cp_type == CPT_CACHEABLE); + LASSERT(page->cp_type == CPT_TRANSIENT); + lpg->lps_invalid = 1; + cl_page_put(env, subpage); + /* + * XXX This assumes that lov is in the topmost + * cl_page. 
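+ * cl_page_top(subpage) is the pre-existing top page; returning it
+ * through PTR_ERR() is the signal for cl_page_find() to reuse that
+ * existing page rather than the one being initialized here.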
+ */ + result = PTR_ERR(cl_page_top(subpage)); + } else { + lu_ref_add(&subpage->cp_reference, "lov", page); + subpage->cp_parent = page; + page->cp_child = subpage; + } + cl_page_slice_add(page, &lpg->lps_cl, + obj, &lov_page_ops); + } else + result = PTR_ERR(subpage); + } else + result = -ENOMEM; + RETURN(ERR_PTR(result)); +} + + +static const struct cl_page_operations lov_empty_page_ops = { + .cpo_fini = lov_empty_page_fini, + .cpo_print = lov_page_print +}; + +struct cl_page *lov_page_init_empty(const struct lu_env *env, + struct cl_object *obj, struct cl_page *page, + cfs_page_t *vmpage) +{ + struct lov_page *lpg; + int result = -ENOMEM; + ENTRY; + + OBD_SLAB_ALLOC_PTR(lpg, lov_page_kmem); + if (lpg != NULL) { + void *addr; + cl_page_slice_add(page, &lpg->lps_cl, + obj, &lov_empty_page_ops); + addr = cfs_kmap(vmpage); + memset(addr, 0, cl_page_size(obj)); + cfs_kunmap(vmpage); + cl_page_export(env, page); + result = 0; + } + RETURN(ERR_PTR(result)); +} + + +/** @} lov */ + diff --git a/lustre/lov/lov_pool.c b/lustre/lov/lov_pool.c index 12115eb..f63fac7 100644 --- a/lustre/lov/lov_pool.c +++ b/lustre/lov/lov_pool.c @@ -400,18 +400,16 @@ int lov_pool_new(struct obd_device *obd, char *poolname) GOTO(out_err, rc); } - spin_lock(&obd->obd_dev_lock); - /* check if pool already exists */ - if (lustre_hash_lookup(lov->lov_pools_hash_body, poolname) != NULL) { - spin_unlock(&obd->obd_dev_lock); + INIT_HLIST_NODE(&new_pool->pool_hash); + rc = lustre_hash_add_unique(lov->lov_pools_hash_body, poolname, + &new_pool->pool_hash); + if (rc) { lov_ost_pool_free(&new_pool->pool_rr.lqr_pool); lov_ost_pool_free(&new_pool->pool_obds); GOTO(out_err, rc = -EEXIST); } - INIT_HLIST_NODE(&new_pool->pool_hash); - lustre_hash_add_unique(lov->lov_pools_hash_body, poolname, - &new_pool->pool_hash); + spin_lock(&obd->obd_dev_lock); list_add_tail(&new_pool->pool_list, &lov->lov_pool_list); lov->lov_pool_count++; spin_unlock(&obd->obd_dev_lock); diff --git a/lustre/lov/lov_qos.c b/lustre/lov/lov_qos.c index b292fc3..5ad5848 100644 --- a/lustre/lov/lov_qos.c +++ b/lustre/lov/lov_qos.c @@ -161,6 +161,7 @@ static int qos_calc_ppo(struct obd_device *obd) __u64 ba_max, ba_min, temp; __u32 num_active; int rc, i, prio_wide; + time_t now, age; ENTRY; if (!lov->lov_qos.lq_dirty) @@ -183,6 +184,7 @@ static int qos_calc_ppo(struct obd_device *obd) ba_min = (__u64)(-1); ba_max = 0; + now = cfs_time_current_sec(); /* Calculate OST penalty per object */ /* (lov ref taken in alloc_qos) */ for (i = 0; i < lov->desc.ld_tgt_count; i++) { @@ -205,8 +207,17 @@ static int qos_calc_ppo(struct obd_device *obd) lov->lov_tgts[i]->ltd_qos.ltq_penalty_per_obj = (temp * prio_wide) >> 8; - if (lov->lov_qos.lq_reset == 0) + age = (now - lov->lov_tgts[i]->ltd_qos.ltq_used) >> 3; + if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage) lov->lov_tgts[i]->ltd_qos.ltq_penalty = 0; + else if (age > lov->desc.ld_qos_maxage) + /* Decay the penalty by half for every 8x the update + * interval that the device has been idle. That gives + * lots of time for the statfs information to be + * updated (which the penalty is only a proxy for), + * and avoids penalizing OSS/OSTs under light load. 
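+ * For example, with ld_qos_maxage of (say) 5 seconds, an OST idle for
+ * 400 seconds has age = 400 >> 3 = 50, so its penalty is shifted right
+ * by 50 / 5 = 10 places, all but clearing it; once idle time exceeds
+ * 8 * 32 * 5 = 1280 seconds the penalty is zeroed outright.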
*/ + lov->lov_tgts[i]->ltd_qos.ltq_penalty >>= + (age / lov->desc.ld_qos_maxage); } num_active = lov->lov_qos.lq_active_oss_count - 1; @@ -226,8 +237,17 @@ static int qos_calc_ppo(struct obd_device *obd) temp = oss->lqo_bavail >> 1; do_div(temp, oss->lqo_ost_count * num_active); oss->lqo_penalty_per_obj = (temp * prio_wide) >> 8; - if (lov->lov_qos.lq_reset == 0) + + age = (now - oss->lqo_used) >> 3; + if (lov->lov_qos.lq_reset || age > 32 * lov->desc.ld_qos_maxage) oss->lqo_penalty = 0; + else if (age > lov->desc.ld_qos_maxage) + /* Decay the penalty by half for every 8x the update + * interval that the device has been idle. That gives + * lots of time for the statfs information to be + * updated (which the penalty is only a proxy for), + * and avoids penalizing OSS/OSTs under light load. */ + oss->lqo_penalty >>= (age / lov->desc.ld_qos_maxage); } lov->lov_qos.lq_dirty = 0; @@ -242,7 +262,7 @@ static int qos_calc_ppo(struct obd_device *obd) /* Difference is less than 20% */ lov->lov_qos.lq_same_space = 1; /* Reset weights for the next time we enter qos mode */ - lov->lov_qos.lq_reset = 0; + lov->lov_qos.lq_reset = 1; } rc = 0; @@ -285,6 +305,10 @@ static int qos_used(struct lov_obd *lov, struct ost_pool *osts, lov->lov_tgts[index]->ltd_qos.ltq_penalty >>= 1; oss->lqo_penalty >>= 1; + /* mark the OSS and OST as recently used */ + lov->lov_tgts[index]->ltd_qos.ltq_used = + oss->lqo_used = cfs_time_current_sec(); + /* Set max penalties for this OST and OSS */ lov->lov_tgts[index]->ltd_qos.ltq_penalty += lov->lov_tgts[index]->ltd_qos.ltq_penalty_per_obj * @@ -669,9 +693,13 @@ repeat_find: if (OBD_FAIL_CHECK(OBD_FAIL_MDS_OSC_PRECREATE) && ost_idx == 0) continue; - /* Drop slow OSCs if we can, but not for requested start idx */ + /* Drop slow OSCs if we can, but not for requested start idx. + * + * This means "if OSC is slow and it is not the requested + * start OST, then it can be skipped, otherwise skip it only + * if it is inactive/recovering/out-of-space." 
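+ * In code terms the skip test is (precreate > speed && (i != 0 ||
+ * speed >= 2)), so the requested start OST (i == 0) survives every
+ * pass with speed < 2 no matter how slow it looks, and is dropped
+ * only once speed reaches 2, i.e. when it is genuinely unusable.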
*/ if ((obd_precreate(lov->lov_tgts[ost_idx]->ltd_exp) > speed) && - (i != 0 || speed < 2)) + (i != 0 || speed >= 2)) continue; *idx_pos = ost_idx; diff --git a/lustre/lov/lov_request.c b/lustre/lov/lov_request.c index 00805bb..2c8c0ad 100644 --- a/lustre/lov/lov_request.c +++ b/lustre/lov/lov_request.c @@ -113,7 +113,7 @@ int lov_update_common_set(struct lov_request_set *set, lov_update_set(set, req, rc); /* grace error on inactive ost */ - if (rc && !(lov->lov_tgts[req->rq_idx] && + if (rc && !(lov->lov_tgts[req->rq_idx] && lov->lov_tgts[req->rq_idx]->ltd_active)) rc = 0; @@ -127,18 +127,44 @@ void lov_set_add_req(struct lov_request *req, struct lov_request_set *set) set->set_count++; } +extern void osc_update_enqueue(struct lustre_handle *lov_lockhp, + struct lov_oinfo *loi, int flags, + struct ost_lvb *lvb, __u32 mode, int rc); + +static int lov_update_enqueue_lov(struct obd_export *exp, + struct lustre_handle *lov_lockhp, + struct lov_oinfo *loi, int flags, int idx, + __u64 oid, int rc) +{ + struct lov_obd *lov = &exp->exp_obd->u.lov; + + if (rc != ELDLM_OK && + !(rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT))) { + memset(lov_lockhp, 0, sizeof(*lov_lockhp)); + if (lov->lov_tgts[idx] && lov->lov_tgts[idx]->ltd_active) { + /* -EUSERS used by OST to report file contention */ + if (rc != -EINTR && rc != -EUSERS) + CERROR("enqueue objid "LPX64" subobj " + LPX64" on OST idx %d: rc %d\n", + oid, loi->loi_id, loi->loi_ost_idx, rc); + } else + rc = ELDLM_OK; + } + return rc; +} + int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc) { struct lov_request_set *set = req->rq_rqset; struct lustre_handle *lov_lockhp; + struct obd_info *oi = set->set_oi; struct lov_oinfo *loi; ENTRY; - LASSERT(set != NULL); - LASSERT(set->set_oi != NULL); + LASSERT(oi != NULL); lov_lockhp = set->set_lockh->llh_handles + req->rq_stripe; - loi = set->set_oi->oi_md->lsm_oinfo[req->rq_stripe]; + loi = oi->oi_md->lsm_oinfo[req->rq_stripe]; /* XXX LOV STACKING: OSC gets a copy, created in lov_prep_enqueue_set * and that copy can be arbitrarily out of date. @@ -146,65 +172,22 @@ int lov_update_enqueue_set(struct lov_request *req, __u32 mode, int rc) * The LOV API is due for a serious rewriting anyways, and this * can be addressed then. */ - if (rc == ELDLM_OK) { - struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp); - __u64 tmp; - - LASSERT(lock != NULL); - lov_stripe_lock(set->set_oi->oi_md); - loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb; - tmp = loi->loi_lvb.lvb_size; - /* Extend KMS up to the end of this lock and no further - * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ - if (tmp > lock->l_policy_data.l_extent.end) - tmp = lock->l_policy_data.l_extent.end + 1; - if (tmp >= loi->loi_kms) { - LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64 - ", kms="LPU64, loi->loi_lvb.lvb_size, tmp); - loi->loi_kms = tmp; - loi->loi_kms_valid = 1; - } else { - LDLM_DEBUG(lock, "lock acquired, setting rss=" - LPU64"; leaving kms="LPU64", end="LPU64, - loi->loi_lvb.lvb_size, loi->loi_kms, - lock->l_policy_data.l_extent.end); - } - lov_stripe_unlock(set->set_oi->oi_md); - ldlm_lock_allow_match(lock); - LDLM_LOCK_PUT(lock); - } else if ((rc == ELDLM_LOCK_ABORTED) && - (set->set_oi->oi_flags & LDLM_FL_HAS_INTENT)) { - memset(lov_lockhp, 0, sizeof(*lov_lockhp)); - lov_stripe_lock(set->set_oi->oi_md); - loi->loi_lvb = req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb; - lov_stripe_unlock(set->set_oi->oi_md); - CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" - " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms); - rc = ELDLM_OK; - } else { - struct obd_export *exp = set->set_exp; - struct lov_obd *lov = &exp->exp_obd->u.lov; - - memset(lov_lockhp, 0, sizeof(*lov_lockhp)); - if (lov->lov_tgts[req->rq_idx] && - lov->lov_tgts[req->rq_idx]->ltd_active) { - /* -EUSERS used by OST to report file contention */ - if (rc != -EINTR && rc != -EUSERS) - CERROR("enqueue objid "LPX64" subobj " - LPX64" on OST idx %d: rc %d\n", - set->set_oi->oi_md->lsm_object_id, - loi->loi_id, loi->loi_ost_idx, rc); - } else { - rc = ELDLM_OK; - } - } + lov_stripe_lock(oi->oi_md); + osc_update_enqueue(lov_lockhp, loi, oi->oi_flags, + &req->rq_oi.oi_md->lsm_oinfo[0]->loi_lvb, mode, rc); + if (rc == ELDLM_LOCK_ABORTED && (oi->oi_flags & LDLM_FL_HAS_INTENT)) + memset(lov_lockhp, 0, sizeof *lov_lockhp); + rc = lov_update_enqueue_lov(set->set_exp, lov_lockhp, loi, oi->oi_flags, + req->rq_idx, oi->oi_md->lsm_object_id, rc); + lov_stripe_unlock(oi->oi_md); lov_update_set(set, req, rc); RETURN(rc); } /* The callback for osc_enqueue that updates lov info for every OSC request. */ -static int cb_update_enqueue(struct obd_info *oinfo, int rc) +static int cb_update_enqueue(void *cookie, int rc) { + struct obd_info *oinfo = cookie; struct ldlm_enqueue_info *einfo; struct lov_request *lovreq; @@ -637,7 +620,8 @@ cleanup: continue; sub_exp = lov->lov_tgts[req->rq_idx]->ltd_exp; - err = obd_destroy(sub_exp, req->rq_oi.oi_oa, NULL, oti, NULL); + err = obd_destroy(sub_exp, req->rq_oi.oi_oa, NULL, oti, NULL, + NULL); if (err) CERROR("Failed to uncreate objid "LPX64" subobj " LPX64" on OST idx %d: rc = %d\n", @@ -877,9 +861,9 @@ int lov_prep_brw_set(struct obd_export *exp, struct obd_info *oinfo, if (info[i].count == 0) continue; - + loi = oinfo->oi_md->lsm_oinfo[i]; - if (!lov->lov_tgts[loi->loi_ost_idx] || + if (!lov->lov_tgts[loi->loi_ost_idx] || !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); GOTO(out, rc = -EIO); @@ -972,8 +956,9 @@ int lov_fini_getattr_set(struct lov_request_set *set) /* The callback for osc_getattr_async that finilizes a request info when a * response is recieved. 
*/ -static int cb_getattr_update(struct obd_info *oinfo, int rc) +static int cb_getattr_update(void *cookie, int rc) { + struct obd_info *oinfo = cookie; struct lov_request *lovreq; lovreq = container_of(oinfo, struct lov_request, rq_oi); return lov_update_common_set(lovreq->rq_rqset, lovreq, rc); @@ -1081,7 +1066,7 @@ int lov_prep_destroy_set(struct obd_export *exp, struct obd_info *oinfo, struct lov_request *req; loi = lsm->lsm_oinfo[i]; - if (!lov->lov_tgts[loi->loi_ost_idx] || + if (!lov->lov_tgts[loi->loi_ost_idx] || !lov->lov_tgts[loi->loi_ost_idx]->ltd_active) { CDEBUG(D_HA, "lov idx %d inactive\n", loi->loi_ost_idx); continue; @@ -1140,7 +1125,7 @@ int lov_update_setattr_set(struct lov_request_set *set, lov_update_set(set, req, rc); /* grace error on inactive ost */ - if (rc && !(lov->lov_tgts[req->rq_idx] && + if (rc && !(lov->lov_tgts[req->rq_idx] && lov->lov_tgts[req->rq_idx]->ltd_active)) rc = 0; @@ -1161,8 +1146,9 @@ int lov_update_setattr_set(struct lov_request_set *set, /* The callback for osc_setattr_async that finilizes a request info when a * response is recieved. */ -static int cb_setattr_update(struct obd_info *oinfo, int rc) +static int cb_setattr_update(void *cookie, int rc) { + struct obd_info *oinfo = cookie; struct lov_request *lovreq; lovreq = container_of(oinfo, struct lov_request, rq_oi); return lov_update_setattr_set(lovreq->rq_rqset, lovreq, rc); @@ -1212,8 +1198,11 @@ int lov_prep_setattr_set(struct obd_export *exp, struct obd_info *oinfo, memcpy(req->rq_oi.oi_oa, oinfo->oi_oa, sizeof(*req->rq_oi.oi_oa)); req->rq_oi.oi_oa->o_id = loi->loi_id; - LASSERT(!(req->rq_oi.oi_oa->o_valid & OBD_MD_FLGROUP) - || req->rq_oi.oi_oa->o_gr>0); + LASSERTF(!(req->rq_oi.oi_oa->o_valid & OBD_MD_FLGROUP) || + CHECK_MDS_GROUP(req->rq_oi.oi_oa->o_gr), + "req->rq_oi.oi_oa->o_valid="LPX64" " + "req->rq_oi.oi_oa->o_gr="LPU64"\n", + req->rq_oi.oi_oa->o_valid, req->rq_oi.oi_oa->o_gr); req->rq_oi.oi_oa->o_stripe_idx = i; req->rq_oi.oi_cb_up = cb_setattr_update; req->rq_oi.oi_capa = oinfo->oi_capa; @@ -1293,8 +1282,9 @@ int lov_update_punch_set(struct lov_request_set *set, /* The callback for osc_punch that finilizes a request info when a response * is recieved. */ -static int cb_update_punch(struct obd_info *oinfo, int rc) +static int cb_update_punch(void *cookie, int rc) { + struct obd_info *oinfo = cookie; struct lov_request *lovreq; lovreq = container_of(oinfo, struct lov_request, rq_oi); return lov_update_punch_set(lovreq->rq_rqset, lovreq, rc); @@ -1576,8 +1566,9 @@ void lov_update_statfs(struct obd_statfs *osfs, struct obd_statfs *lov_sfs, /* The callback for osc_statfs_async that finilizes a request info when a * response is recieved. */ -static int cb_statfs_update(struct obd_info *oinfo, int rc) +static int cb_statfs_update(void *cookie, int rc) { + struct obd_info *oinfo = cookie; struct lov_request *lovreq; struct obd_statfs *osfs, *lov_sfs; struct obd_device *obd; diff --git a/lustre/lov/lovsub_dev.c b/lustre/lov/lovsub_dev.c new file mode 100644 index 0000000..359def4 --- /dev/null +++ b/lustre/lov/lovsub_dev.c @@ -0,0 +1,212 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device and cl_device_type for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +/***************************************************************************** + * + * Lovsub transfer operations. + * + */ + +static void lovsub_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct lovsub_req *lsr; + + ENTRY; + lsr = cl2lovsub_req(slice); + OBD_SLAB_FREE_PTR(lsr, lovsub_req_kmem); + EXIT; +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for lovsub + * layer. Lov and lovsub are responsible only for struct obdo::o_stripe_idx + * field, which is filled there. + */ +static void lovsub_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags) +{ + struct lovsub_object *subobj; + + ENTRY; + subobj = cl2lovsub(obj); + /* + * There is no OBD_MD_* flag for obdo::o_stripe_idx, so set it + * unconditionally. It never changes anyway. + */ + attr->cra_oa->o_stripe_idx = subobj->lso_index; + EXIT; +} + +static const struct cl_req_operations lovsub_req_ops = { + .cro_attr_set = lovsub_req_attr_set, + .cro_completion = lovsub_req_completion +}; + +/***************************************************************************** + * + * Lov-sub device and device type functions. 
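+ *
+ * A lovsub device wraps the cl_device stacked below the lov device;
+ * acid_next caches that lower device, while acid_super is understood
+ * here to point back at the owning lov device.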
+ * + */ + +static int lovsub_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device_type *ldt; + int rc; + + ENTRY; + next->ld_site = d->ld_site; + ldt = next->ld_type; + LASSERT(ldt != NULL); + rc = ldt->ldt_ops->ldto_device_init(env, next, ldt->ldt_name, NULL); + if (rc) { + next->ld_site = NULL; + RETURN(rc); + } + + lu_device_get(next); + lu_ref_add(&next->ld_reference, "lu-stack", &lu_site_init); + lsd->acid_next = lu2cl_dev(next); + RETURN(rc); +} + +static struct lu_device *lovsub_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + struct lu_device *next; + struct lovsub_device *lsd; + + ENTRY; + lsd = lu2lovsub_dev(d); + next = cl2lu_dev(lsd->acid_next); + lsd->acid_super = NULL; + lsd->acid_next = NULL; + RETURN(next); +} + +static struct lu_device *lovsub_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct lovsub_device *lsd = lu2lovsub_dev(d); + struct lu_device *next = cl2lu_dev(lsd->acid_next); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(lsd); + return next; +} + +static int lovsub_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct lovsub_req *lsr; + int result; + + OBD_SLAB_ALLOC_PTR(lsr, lovsub_req_kmem); + if (lsr != NULL) { + cl_req_slice_add(req, &lsr->lsrq_cl, dev, &lovsub_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +static const struct lu_device_operations lovsub_lu_ops = { + .ldo_object_alloc = lovsub_object_alloc, + .ldo_process_config = NULL, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations lovsub_cl_ops = { + .cdo_req_init = lovsub_req_init +}; + +static struct lu_device *lovsub_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct lovsub_device *lsd; + + OBD_ALLOC_PTR(lsd); + if (lsd != NULL) { + int result; + + result = cl_device_init(&lsd->acid_cl, t); + if (result == 0) { + d = lovsub2lu_dev(lsd); + d->ld_ops = &lovsub_lu_ops; + lsd->acid_cl.cd_ops = &lovsub_cl_ops; + } else + d = ERR_PTR(result); + } else + d = ERR_PTR(-ENOMEM); + return d; +} + +static const struct lu_device_type_operations lovsub_device_type_ops = { + .ldto_device_alloc = lovsub_device_alloc, + .ldto_device_free = lovsub_device_free, + + .ldto_device_init = lovsub_device_init, + .ldto_device_fini = lovsub_device_fini +}; + +#define LUSTRE_LOVSUB_NAME "lovsub" + +struct lu_device_type lovsub_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_LOVSUB_NAME, + .ldt_ops = &lovsub_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + + +/** @} lov */ + diff --git a/lustre/include/obd_echo.h b/lustre/lov/lovsub_io.c similarity index 58% rename from lustre/include/obd_echo.h rename to lustre/lov/lovsub_io.c index 7465b68..d8cfe0e 100644 --- a/lustre/include/obd_echo.h +++ b/lustre/lov/lovsub_io.c @@ -26,40 +26,30 @@ * GPL HEADER END */ /* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. * Use is subject to license terms. */ /* * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for LOVSUB layer. + * + * Author: Nikita Danilov */ -#ifndef _OBD_ECHO_H -#define _OBD_ECHO_H +#define DEBUG_SUBSYSTEM S_LOV -/* The persistent object (i.e. actually stores stuff!) 
*/ -#define ECHO_PERSISTENT_OBJID 1ULL -#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) +#include "lov_cl_internal.h" -/* block size to use for data verification */ -#define OBD_ECHO_BLOCK_SIZE (4<<10) +/** \addtogroup lov lov @{ */ -struct ec_object { - struct list_head eco_obj_chain; - struct obd_device *eco_device; - int eco_refcount; - int eco_deleted; - obd_id eco_id; - struct lov_stripe_md *eco_lsm; -}; +/***************************************************************************** + * + * Lovsub io operations. + * + */ -struct ec_lock { - struct list_head ecl_exp_chain; - struct ec_object *ecl_object; - __u64 ecl_cookie; - struct lustre_handle ecl_lock_handle; - ldlm_policy_data_t ecl_policy; - __u32 ecl_mode; -}; +/* All trivial */ -#endif +/** @} lov */ diff --git a/lustre/lov/lovsub_lock.c b/lustre/lov/lovsub_lock.c new file mode 100644 index 0000000..f02a2ce --- /dev/null +++ b/lustre/lov/lovsub_lock.c @@ -0,0 +1,430 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +/***************************************************************************** + * + * Lovsub lock operations. 
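+ *
+ * A lovsub lock is the per-stripe slice of one or more top-level lov
+ * locks; the lss_parents list links it back to every lov_lock using
+ * it, and state changes are propagated to all of those parents.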
+ * + */ + +static void lovsub_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct lovsub_lock *lsl; + + ENTRY; + lsl = cl2lovsub_lock(slice); + LASSERT(list_empty(&lsl->lss_parents)); + OBD_SLAB_FREE_PTR(lsl, lovsub_lock_kmem); + EXIT; +} + +static void lovsub_parent_lock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + ENTRY; + parent = lov->lls_cl.cls_lock; + cl_lock_get(parent); + lu_ref_add(&parent->cll_reference, "lovsub-parent", cfs_current()); + cl_lock_mutex_get(env, parent); + EXIT; +} + +static void lovsub_parent_unlock(const struct lu_env *env, struct lov_lock *lov) +{ + struct cl_lock *parent; + + ENTRY; + parent = lov->lls_cl.cls_lock; + cl_lock_mutex_put(env, lov->lls_cl.cls_lock); + lu_ref_del(&parent->cll_reference, "lovsub-parent", cfs_current()); + cl_lock_put(env, parent); + EXIT; +} + +static void lovsub_lock_state_one(const struct lu_env *env, + const struct lovsub_lock *lovsub, + struct lov_lock *lov) +{ + struct cl_lock *parent; + const struct cl_lock *child; + + ENTRY; + parent = lov->lls_cl.cls_lock; + child = lovsub->lss_cl.cls_lock; + + if (lovsub->lss_active != parent) { + lovsub_parent_lock(env, lov); + if (child->cll_error != 0) + cl_lock_error(env, parent, child->cll_error); + else + cl_lock_signal(env, parent); + lovsub_parent_unlock(env, lov); + } + EXIT; +} + +/** + * Implements cl_lock_operations::clo_state() method for lovsub layer, which + * method is called whenever sub-lock state changes. Propagates state change + * to the top-locks. + */ +static void lovsub_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock_link *scan; + struct lov_lock_link *temp; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + ENTRY; + + /* + * Use _safe() version, because + * + * lovsub_lock_state_one() + * ->cl_lock_error() + * ->cl_lock_delete() + * ->lov_lock_delete() + * + * can unlink parent from the parent list. + */ + list_for_each_entry_safe(scan, temp, &sub->lss_parents, lll_list) + lovsub_lock_state_one(env, sub, scan->lll_super); + EXIT; +} + +/** + * Implementation of cl_lock_operation::clo_weigh() estimating lock weight by + * asking parent lock. + */ +static unsigned long lovsub_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lovsub_lock *lock = cl2lovsub_lock(slice); + struct lov_lock *lov; + unsigned long dumbbell; + + ENTRY; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + + if (!list_empty(&lock->lss_parents)) { + /* + * It is not clear whether all parents have to be asked and + * their estimations summed, or it is enough to ask one. For + * the current usages, one is always enough. + */ + lov = container_of(lock->lss_parents.next, + struct lov_lock_link, lll_list)->lll_super; + + lovsub_parent_lock(env, lov); + dumbbell = cl_lock_weigh(env, lov->lls_cl.cls_lock); + lovsub_parent_unlock(env, lov); + } else + dumbbell = 0; + + RETURN(dumbbell); +} + +/** + * Maps start/end offsets within a stripe, to offsets within a file. 
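+ * For a layout of N stripes of S pages each, page p of stripe k maps
+ * to file page (p / S) * N * S + k * S + (p % S); the code below
+ * computes the same value as p plus a skip of (N - 1) * S pages for
+ * every full chunk, plus the stripe's own k * S offset.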
+ */ +static void lovsub_lock_descr_map(const struct cl_lock_descr *in, + struct lov_object *obj, + int stripe, struct cl_lock_descr *out) +{ + struct lov_stripe_md *lsm = lov_r0(obj)->lo_lsm; + pgoff_t size; /* stripe size in pages */ + pgoff_t skip; /* how many pages in every stripe are occupied by + * "other" stripes */ + pgoff_t start; + pgoff_t end; + + ENTRY; + start = in->cld_start; + end = in->cld_end; + + /* + * XXX join file support. + */ + if (lsm->lsm_stripe_count > 1) { + size = cl_index(lov2cl(obj), lsm->lsm_stripe_size); + skip = (lsm->lsm_stripe_count - 1) * size; + + /* XXX overflow check here? */ + start += start/size * skip + stripe * size; + + if (end != CL_PAGE_EOF) { + end += end/size * skip + stripe * size; + /* + * And check for overflow... + */ + if (end < in->cld_end) + end = CL_PAGE_EOF; + } + } + out->cld_start = start; + out->cld_end = end; + EXIT; +} + +/** + * Adjusts parent lock extent when a sub-lock is attached to a parent. This is + * called in two ways: + * + * - as part of receive call-back, when server returns granted extent to + * the client, and + * + * - when top-lock finds existing sub-lock in the cache. + * + * Note, that lock mode is not propagated to the parent: i.e., if CLM_READ + * top-lock matches CLM_WRITE sub-lock, top-lock is still CLM_READ. + */ +int lov_sublock_modify(const struct lu_env *env, struct lov_lock *lov, + struct lovsub_lock *sublock, + const struct cl_lock_descr *d, int idx) +{ + struct cl_lock *parent; + struct cl_lock *child; + struct lovsub_object *subobj; + struct cl_lock_descr *pd; + struct cl_lock_descr *parent_descr; + int result; + + parent = lov->lls_cl.cls_lock; + parent_descr = &parent->cll_descr; + LASSERT(cl_lock_mode_match(d->cld_mode, parent_descr->cld_mode)); + + child = sublock->lss_cl.cls_lock; + subobj = cl2lovsub(sublock->lss_cl.cls_obj); + pd = &lov_env_info(env)->lti_ldescr; + + pd->cld_obj = parent_descr->cld_obj; + pd->cld_mode = parent_descr->cld_mode; + lovsub_lock_descr_map(d, subobj->lso_super, subobj->lso_index, pd); + lov->lls_sub[idx].sub_got = *d; + /* + * Notify top-lock about modification, if lock description changes + * materially. + */ + if (!cl_lock_ext_match(parent_descr, pd)) + result = cl_lock_modify(env, parent, pd); + else + result = 0; + return result; +} + +static int lovsub_lock_modify(const struct lu_env *env, + const struct cl_lock_slice *s, + const struct cl_lock_descr *d) +{ + struct lovsub_lock *lock = cl2lovsub_lock(s); + struct lov_lock_link *scan; + struct lov_lock *lov; + int result = 0; + + ENTRY; + + LASSERT(cl_lock_mode_match(d->cld_mode, + s->cls_lock->cll_descr.cld_mode)); + list_for_each_entry(scan, &lock->lss_parents, lll_list) { + int rc; + + lov = scan->lll_super; + lovsub_parent_lock(env, lov); + rc = lov_sublock_modify(env, lov, lock, d, scan->lll_idx); + lovsub_parent_unlock(env, lov); + result = result ?: rc; + } + RETURN(result); +} + +static int lovsub_lock_closure(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_lock_closure *closure) +{ + struct lovsub_lock *sub; + struct cl_lock *parent; + struct lov_lock_link *scan; + int result; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + ENTRY; + + sub = cl2lovsub_lock(slice); + result = 0; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + parent = scan->lll_super->lls_cl.cls_lock; + result = cl_lock_closure_build(env, parent, closure); + if (result != 0) + break; + } + RETURN(result); +} + +/** + * An implementation of cl_lock_operations::clo_delete() method. 
This is + * invoked in "bottom-to-top" delete, when lock destruction starts from the + * sub-lock (e.g, as a result of ldlm lock LRU policy). + */ +static void lovsub_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock *lov; + struct cl_lock *parent; + struct lov_lock_link *scan; + struct lov_lock_link *temp; + struct lov_lock_sub *subdata; + + LASSERT(cl_lock_is_mutexed(slice->cls_lock)); + ENTRY; + + list_for_each_entry_safe(scan, temp, &sub->lss_parents, lll_list) { + lov = scan->lll_super; + subdata = &lov->lls_sub[scan->lll_idx]; + parent = lov->lls_cl.cls_lock; + lovsub_parent_lock(env, lov); + subdata->sub_got = subdata->sub_descr; + lov_lock_unlink(env, scan, sub); + CDEBUG(D_DLMTRACE, "%p %p %i %i\n", parent, sub, + lov->lls_nr_filled, parent->cll_state); + switch (parent->cll_state) { + case CLS_NEW: + case CLS_QUEUING: + case CLS_ENQUEUED: + case CLS_FREEING: + cl_lock_signal(env, parent); + break; + case CLS_UNLOCKING: + /* + * Here lies a problem: a sub-lock is canceled while + * top-lock is being unlocked. Top-lock cannot be + * moved into CLS_NEW state, because unlocking has to + * succeed eventually by placing lock into CLS_CACHED + * (or failing it), see cl_unuse_try(). Nor can + * top-lock be left in CLS_CACHED state, because lov + * maintains an invariant that all sub-locks exist in + * CLS_CACHED (this allows cached top-lock to be + * reused immediately). Nor can we wait for top-lock + * state to change, because this can be synchronous to + * the current thread. + * + * We know for sure that lov_lock_unuse() will be + * called at least one more time to finish un-using, + * so leave a mark on the top-lock, that will be seen + * by the next call to lov_lock_unuse(). 
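+ * (The mark in question is the lls_unuse_race flag set immediately
+ * below.)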
+ */ + lov->lls_unuse_race = 1; + break; + case CLS_CACHED: + cl_lock_state_set(env, parent, CLS_NEW); + if (lov->lls_nr_filled == 0) { + cl_lock_cancel(env, parent); + cl_lock_delete(env, parent); + cl_lock_signal(env, parent); + } + break; + case CLS_HELD: + default: + CERROR("Impossible state: %i\n", parent->cll_state); + LBUG(); + } + lovsub_parent_unlock(env, lov); + } + EXIT; +} + +static int lovsub_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct lovsub_lock *sub = cl2lovsub_lock(slice); + struct lov_lock *lov; + struct lov_lock_link *scan; + + list_for_each_entry(scan, &sub->lss_parents, lll_list) { + lov = scan->lll_super; + (*p)(env, cookie, "[%d %p ", scan->lll_idx, lov); + if (lov != NULL) + cl_lock_descr_print(env, cookie, p, + &lov->lls_cl.cls_lock->cll_descr); + (*p)(env, cookie, "] "); + } + return 0; +} + +static const struct cl_lock_operations lovsub_lock_ops = { + .clo_fini = lovsub_lock_fini, + .clo_state = lovsub_lock_state, + .clo_delete = lovsub_lock_delete, + .clo_modify = lovsub_lock_modify, + .clo_closure = lovsub_lock_closure, + .clo_weigh = lovsub_lock_weigh, + .clo_print = lovsub_lock_print +}; + +int lovsub_lock_init(const struct lu_env *env, struct cl_object *obj, + struct cl_lock *lock, const struct cl_io *io) +{ + struct lovsub_lock *lsk; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR(lsk, lovsub_lock_kmem); + if (lsk != NULL) { + CFS_INIT_LIST_HEAD(&lsk->lss_parents); + cl_lock_slice_add(lock, &lsk->lss_cl, obj, &lovsub_lock_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(result); +} + +/** @} lov */ diff --git a/lustre/lov/lovsub_object.c b/lustre/lov/lovsub_object.c new file mode 100644 index 0000000..a2c4f061 --- /dev/null +++ b/lustre/lov/lovsub_object.c @@ -0,0 +1,160 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +/***************************************************************************** + * + * Lovsub object operations. 
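+ *
+ * A lovsub object is the per-stripe child of a lov object: lso_super
+ * points to the parent object and lso_index gives the stripe number,
+ * as lovsub_object_free() below relies on.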
+ * + */ + +int lovsub_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct lovsub_device *dev = lu2lovsub_dev(obj->lo_dev); + struct lu_object *below; + struct lu_device *under; + + int result; + + ENTRY; + under = &dev->acid_next->cd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, under); + if (below != NULL) { + lu_object_add(obj, below); + result = 0; + } else + result = -ENOMEM; + RETURN(result); + +} + +static void lovsub_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + struct lov_object *lov = los->lso_super; + + LASSERT(lov->lo_type == LLT_RAID0); + LASSERT(lov->u.raid0.lo_sub[los->lso_index] == los); + + ENTRY; + lov->u.raid0.lo_sub[los->lso_index] = NULL; + lu_object_fini(obj); + lu_object_header_fini(&los->lso_header.coh_lu); + OBD_SLAB_FREE_PTR(los, lovsub_object_kmem); + EXIT; +} + +static int lovsub_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct lovsub_object *los = lu2lovsub(obj); + + return (*p)(env, cookie, "[%i]", los->lso_index); +} + +static int lovsub_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_object *lov = cl2lovsub(obj)->lso_super; + + ENTRY; + lov_r0(lov)->lo_attr_valid = 0; + RETURN(0); +} + +static int lovsub_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, + struct ost_lvb *lvb) +{ + struct lovsub_object *los = cl2lovsub(obj); + + ENTRY; + RETURN(cl_object_glimpse(env, &los->lso_super->lo_cl, lvb)); +} + + + +static const struct cl_object_operations lovsub_ops = { + .coo_page_init = lovsub_page_init, + .coo_lock_init = lovsub_lock_init, + .coo_attr_set = lovsub_attr_set, + .coo_glimpse = lovsub_object_glimpse +}; + +static const struct lu_object_operations lovsub_lu_obj_ops = { + .loo_object_init = lovsub_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = lovsub_object_free, + .loo_object_print = lovsub_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *lovsub_object_alloc(const struct lu_env *env, + const struct lu_object_header *_, + struct lu_device *dev) +{ + struct lovsub_object *los; + struct lu_object *obj; + + ENTRY; + OBD_SLAB_ALLOC_PTR(los, lovsub_object_kmem); + if (los != NULL) { + struct cl_object_header *hdr; + + obj = lovsub2lu(los); + hdr = &los->lso_header; + cl_object_header_init(hdr); + lu_object_init(obj, &hdr->coh_lu, dev); + lu_object_add_top(&hdr->coh_lu, obj); + los->lso_cl.co_ops = &lovsub_ops; + obj->lo_ops = &lovsub_lu_obj_ops; + } else + obj = NULL; + RETURN(obj); +} + +/** @} lov */ diff --git a/lustre/lov/lovsub_page.c b/lustre/lov/lovsub_page.c new file mode 100644 index 0000000..70e1f56 --- /dev/null +++ b/lustre/lov/lovsub_page.c @@ -0,0 +1,83 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for LOVSUB layer. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_LOV + +#include "lov_cl_internal.h" + +/** \addtogroup lov lov @{ */ + +/***************************************************************************** + * + * Lovsub page operations. + * + */ + +static void lovsub_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct lovsub_page *lsb = cl2lovsub_page(slice); + ENTRY; + OBD_SLAB_FREE_PTR(lsb, lovsub_page_kmem); + EXIT; +} + +static const struct cl_page_operations lovsub_page_ops = { + .cpo_fini = lovsub_page_fini +}; + +struct cl_page *lovsub_page_init(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *_) +{ + struct lovsub_page *lsb; + int result; + + ENTRY; + OBD_SLAB_ALLOC_PTR(lsb, lovsub_page_kmem); + if (lsb != NULL) { + cl_page_slice_add(page, &lsb->lsb_cl, obj, &lovsub_page_ops); + result = 0; + } else + result = -ENOMEM; + RETURN(ERR_PTR(result)); +} + +/** @} lov */ diff --git a/lustre/lvfs/autoMakefile.am b/lustre/lvfs/autoMakefile.am index 1b5311b..b80a28d 100644 --- a/lustre/lvfs/autoMakefile.am +++ b/lustre/lvfs/autoMakefile.am @@ -60,7 +60,7 @@ sources: fsfilt_$(BACKINGFS).c else #SERVER sources: -endif +endif #SERVER ldiskfs_sed_flags = \ -e "s/dx_hash_info/ext3_dx_hash_info/g" \ @@ -104,8 +104,7 @@ install-data-hook: $(install_data_hook) DIST_SOURCES = fsfilt.c fsfilt_ext3.c fsfilt_reiserfs.c lvfs_common.c \ lvfs_internal.h lvfs_linux.c lvfs_userfs.c \ upcall_cache.c prng.c lvfs_lib.c \ - lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c \ - # quotacheck_test.c quotactl_test.c fsfilt_ext3_quota.h + lustre_quota_fmt.c lustre_quota_fmt.h quotafmt_test.c MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ CLEANFILES = fsfilt-*.c fsfilt_ldiskfs*.c fsfilt_extN.c sources diff --git a/lustre/lvfs/fsfilt_ext3.c b/lustre/lvfs/fsfilt_ext3.c index 1577be7..26ed65b 100644 --- a/lustre/lvfs/fsfilt_ext3.c +++ b/lustre/lvfs/fsfilt_ext3.c @@ -67,6 +67,8 @@ #include #endif +#include "lustre_quota_fmt.h" + #if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) #define FSFILT_DATA_TRANS_BLOCKS(sb) EXT3_DATA_TRANS_BLOCKS #define FSFILT_DELETE_TRANS_BLOCKS(sb) EXT3_DELETE_TRANS_BLOCKS @@ -723,9 +725,7 @@ static int fsfilt_ext3_statfs(struct super_block *sb, struct obd_statfs *osfs) int rc; memset(&sfs, 0, sizeof(sfs)); - rc = ll_do_statfs(sb, &sfs); - if (!rc && sfs.f_bfree < sfs.f_ffree) { sfs.f_files = (sfs.f_files - sfs.f_ffree) + sfs.f_bfree; sfs.f_ffree = sfs.f_bfree; @@ -883,7 +883,6 @@ static unsigned long new_blocks(handle_t *handle, struct ext3_ext_base *base, pblock = ext3_mb_new_blocks(handle, &ar, err); *count = ar.len; return pblock; - } #endif @@ -1315,19 +1314,37 @@ static int fsfilt_ext3_write_record(struct file *file, void 
*buf, int bufsize, static int fsfilt_ext3_setup(struct super_block *sb) { +#if ((LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6)) && \ + defined(HAVE_QUOTA_SUPPORT)) || defined(S_PDIROPS) + struct ext3_sb_info *sbi = EXT3_SB(sb); #if 0 - EXT3_SB(sb)->dx_lock = fsfilt_ext3_dx_lock; - EXT3_SB(sb)->dx_unlock = fsfilt_ext3_dx_unlock; + sbi->dx_lock = fsfilt_ext3_dx_lock; + sbi->dx_unlock = fsfilt_ext3_dx_unlock; +#endif #endif #ifdef S_PDIROPS CWARN("Enabling PDIROPS\n"); - set_opt(EXT3_SB(sb)->s_mount_opt, PDIROPS); + set_opt(sbi->s_mount_opt, PDIROPS); sb->s_flags |= S_PDIROPS; #endif if (!EXT3_HAS_COMPAT_FEATURE(sb, EXT3_FEATURE_COMPAT_DIR_INDEX)) CWARN("filesystem doesn't have dir_index feature enabled\n"); -#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)) && defined(HAVE_QUOTA_SUPPORT) - set_opt(EXT3_SB(sb)->s_mount_opt, QUOTA); +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,6)) && defined(HAVE_QUOTA_SUPPORT) + /* enable journaled quota support */ + /* kfreed in ext3_put_super() */ + sbi->s_qf_names[USRQUOTA] = kstrdup("lquota.user.reserved", GFP_KERNEL); + if (!sbi->s_qf_names[USRQUOTA]) + return -ENOMEM; + sbi->s_qf_names[GRPQUOTA] = kstrdup("lquota.group.reserved", GFP_KERNEL); + if (!sbi->s_qf_names[GRPQUOTA]) { + kfree(sbi->s_qf_names[USRQUOTA]); + sbi->s_qf_names[USRQUOTA] = NULL; + return -ENOMEM; + } + sbi->s_jquota_fmt = QFMT_VFS_V0; +#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,13)) + set_opt(sbi->s_mount_opt, QUOTA); +#endif #endif return 0; } @@ -1363,8 +1380,7 @@ static int fsfilt_ext3_get_op_len(int op, struct fsfilt_objinfo *fso, int logs) return 0; } -static const char *op_quotafile[] = { "lquota.user", "lquota.group" }; - +#ifdef HAVE_QUOTA_SUPPORT #define DQINFO_COPY(out, in) \ do { \ Q_COPY(out, in, dqi_bgrace); \ @@ -1386,8 +1402,6 @@ do { \ Q_COPY(out, in, dqb_valid); \ } while (0) - - static int fsfilt_ext3_quotactl(struct super_block *sb, struct obd_quotactl *oqc) { @@ -1419,10 +1433,15 @@ static int fsfilt_ext3_quotactl(struct super_block *sb, continue; if (oqc->qc_cmd == Q_QUOTAON) { + char *name[MAXQUOTAS] = LUSTRE_OPQFILES_NAMES_V2; + + LASSERT(oqc->qc_id == LUSTRE_QUOTA_V2); + if (!qcop->quota_on) GOTO(out, rc = -ENOSYS); - rc = qcop->quota_on(sb, i, oqc->qc_id, - (char *)op_quotafile[i]); + + rc = qcop->quota_on(sb, i, QFMT_VFS_V0, + name[i]); } else if (oqc->qc_cmd == Q_QUOTAOFF) { if (!qcop->quota_off) GOTO(out, rc = -ENOSYS); @@ -1455,14 +1474,38 @@ static int fsfilt_ext3_quotactl(struct super_block *sb, if (!qcop->get_dqblk) GOTO(out, rc = -ENOSYS); rc = qcop->get_dqblk(sb, oqc->qc_type, oqc->qc_id, dqblk); + if (!rc) + dqblk->dqb_valid = QIF_LIMITS | QIF_USAGE; break; case Q_SYNC: if (!sb->s_qcop->quota_sync) GOTO(out, rc = -ENOSYS); qcop->quota_sync(sb, oqc->qc_type); break; + case Q_FINVALIDATE: + CDEBUG(D_WARNING, "invalidating operational quota files\n"); + for (i = 0; i < MAXQUOTAS; i++) { + struct file *fp; + char *name[MAXQUOTAS] = LUSTRE_OPQFILES_NAMES_V2; + + LASSERT(oqc->qc_id == LUSTRE_QUOTA_V2); + + if (!Q_TYPESET(oqc, i)) + continue; + + fp = filp_open(name[i], O_CREAT | O_TRUNC | O_RDWR, 0644); + if (IS_ERR(fp)) { + rc = PTR_ERR(fp); + CERROR("error invalidating operational quota file" + " %s (rc:%d)\n", name[i], rc); + } else { + filp_close(fp, 0); + } + + } + break; default: - CERROR("unsupported quotactl command: %d", oqc->qc_cmd); + CERROR("unsupported quotactl command: %d\n", oqc->qc_cmd); LBUG(); } out: @@ -1473,26 +1516,26 @@ out: OBD_FREE_PTR(dqblk); if (rc) - CDEBUG(D_QUOTA, "quotactl command %#x, id %u, type %d " + CDEBUG(D_QUOTA, 
"quotactl command %#x, id %u, type %u " "failed: %d\n", oqc->qc_cmd, oqc->qc_id, oqc->qc_type, rc); RETURN(rc); } struct chk_dqblk{ - struct hlist_node dqb_hash; /* quotacheck hash */ - struct list_head dqb_list; /* in list also */ - qid_t dqb_id; /* uid/gid */ - short dqb_type; /* USRQUOTA/GRPQUOTA */ - __u32 dqb_bhardlimit; /* block hard limit */ - __u32 dqb_bsoftlimit; /* block soft limit */ - qsize_t dqb_curspace; /* current space */ - __u32 dqb_ihardlimit; /* inode hard limit */ - __u32 dqb_isoftlimit; /* inode soft limit */ - __u32 dqb_curinodes; /* current inodes */ - __u64 dqb_btime; /* block grace time */ - __u64 dqb_itime; /* inode grace time */ - __u32 dqb_valid; /* flag for above fields */ + struct hlist_node dqb_hash; /** quotacheck hash */ + struct list_head dqb_list; /** in list also */ + qid_t dqb_id; /** uid/gid */ + short dqb_type; /** USRQUOTA/GRPQUOTA */ + qsize_t dqb_bhardlimit; /** block hard limit */ + qsize_t dqb_bsoftlimit; /** block soft limit */ + qsize_t dqb_curspace; /** current space */ + qsize_t dqb_ihardlimit; /** inode hard limit */ + qsize_t dqb_isoftlimit; /** inode soft limit */ + qsize_t dqb_curinodes; /** current inodes */ + __u64 dqb_btime; /** block grace time */ + __u64 dqb_itime; /** inode grace time */ + __u32 dqb_valid; /** flag for above fields */ }; static inline unsigned int chkquot_hash(qid_t id, int type) @@ -1568,7 +1611,7 @@ cqget(struct super_block *sb, struct hlist_head *hash, struct list_head *list, return cdqb; } -static inline int quota_onoff(struct super_block *sb, int cmd, int type) +static inline int quota_onoff(struct super_block *sb, int cmd, int type, int qfmt) { struct obd_quotactl *oqctl; int rc; @@ -1578,7 +1621,7 @@ static inline int quota_onoff(struct super_block *sb, int cmd, int type) RETURN(-ENOMEM); oqctl->qc_cmd = cmd; - oqctl->qc_id = QFMT_LDISKFS; + oqctl->qc_id = qfmt; oqctl->qc_type = type; rc = fsfilt_ext3_quotactl(sb, oqctl); @@ -1700,24 +1743,8 @@ static int add_inode_quota(struct inode *inode, struct qchk_ctxt *qctxt, return rc; } -static int v2_write_dqheader(struct file *f, int type) -{ - static const __u32 quota_magics[] = V2_INITQMAGICS; - static const __u32 quota_versions[] = V2_INITQVERSIONS; - struct v2_disk_dqheader dqhead; - loff_t offset = 0; - - CLASSERT(ARRAY_SIZE(quota_magics) == ARRAY_SIZE(quota_versions)); - LASSERT(0 <= type && type < ARRAY_SIZE(quota_magics)); - - dqhead.dqh_magic = cpu_to_le32(quota_magics[type]); - dqhead.dqh_version = cpu_to_le32(quota_versions[type]); - - return cfs_user_write(f, (char *)&dqhead, sizeof(dqhead), &offset); -} - /* write dqinfo struct in a new quota file */ -static int v2_write_dqinfo(struct file *f, int type, struct if_dqinfo *info) +static int v3_write_dqinfo(struct file *f, int type, struct if_dqinfo *info) { struct v2_disk_dqinfo dqinfo; __u32 blocks = V2_DQTREEOFF + 1; @@ -1741,6 +1768,22 @@ static int v2_write_dqinfo(struct file *f, int type, struct if_dqinfo *info) return cfs_user_write(f, (char *)&dqinfo, sizeof(dqinfo), &offset); } +static int v3_write_dqheader(struct file *f, int type) +{ + static const __u32 quota_magics[] = V2_INITQMAGICS; + static const __u32 quota_versions[] = V2_INITQVERSIONS_R1; + struct v2_disk_dqheader dqhead; + loff_t offset = 0; + + CLASSERT(ARRAY_SIZE(quota_magics) == ARRAY_SIZE(quota_versions)); + LASSERT(0 <= type && type < ARRAY_SIZE(quota_magics)); + + dqhead.dqh_magic = cpu_to_le32(quota_magics[type]); + dqhead.dqh_version = cpu_to_le32(quota_versions[type]); + + return cfs_user_write(f, (char *)&dqhead, 
sizeof(dqhead), &offset); +} + static int create_new_quota_files(struct qchk_ctxt *qctxt, struct obd_quotactl *oqc) { @@ -1751,32 +1794,36 @@ static int create_new_quota_files(struct qchk_ctxt *qctxt, struct if_dqinfo *info = qctxt->qckt_first_check[i]? NULL : &qctxt->qckt_dqinfo[i]; struct file *file; + const char *name[MAXQUOTAS] = LUSTRE_OPQFILES_NAMES_V2; if (!Q_TYPESET(oqc, i)) continue; - file = filp_open(op_quotafile[i], O_RDWR | O_CREAT | O_TRUNC, - 0644); + LASSERT(oqc->qc_id == LUSTRE_QUOTA_V2); + + file = filp_open(name[i], O_RDWR | O_CREAT | O_TRUNC, 0644); if (IS_ERR(file)) { rc = PTR_ERR(file); CERROR("can't create %s file: rc = %d\n", - op_quotafile[i], rc); + name[i], rc); GOTO(out, rc); } if (!S_ISREG(file->f_dentry->d_inode->i_mode)) { - CERROR("file %s is not regular", op_quotafile[i]); + CERROR("file %s is not regular", name[i]); filp_close(file, 0); GOTO(out, rc = -EINVAL); } - rc = v2_write_dqheader(file, i); + DQUOT_DROP(file->f_dentry->d_inode); + + rc = v3_write_dqheader(file, i); if (rc) { filp_close(file, 0); GOTO(out, rc); } - rc = v2_write_dqinfo(file, i, info); + rc = v3_write_dqinfo(file, i, info); filp_close(file, 0); if (rc) GOTO(out, rc); @@ -1804,13 +1851,13 @@ static int commit_chkquot(struct super_block *sb, struct qchk_ctxt *qctxt, if (cdqb->dqb_bsoftlimit && toqb(cdqb->dqb_curspace) >= cdqb->dqb_bsoftlimit && !cdqb->dqb_btime) - cdqb->dqb_btime = + cdqb->dqb_btime = now + qctxt->qckt_dqinfo[cdqb->dqb_type].dqi_bgrace; if (cdqb->dqb_isoftlimit && cdqb->dqb_curinodes >= cdqb->dqb_isoftlimit && !cdqb->dqb_itime) - cdqb->dqb_itime = + cdqb->dqb_itime = now + qctxt->qckt_dqinfo[cdqb->dqb_type].dqi_igrace; cdqb->dqb_valid = QIF_ALL; @@ -1872,12 +1919,12 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb, if (!Q_TYPESET(oqc, i)) continue; - rc = quota_onoff(sb, Q_QUOTAON, i); + rc = quota_onoff(sb, Q_QUOTAON, i, oqc->qc_id); if (!rc || rc == -EBUSY) { rc = read_old_dqinfo(sb, i, qctxt->qckt_dqinfo); if (rc) GOTO(out, rc); - } else if (rc == -ENOENT) { + } else if (rc == -ENOENT || rc == -EINVAL || rc == -EEXIST) { qctxt->qckt_first_check[i] = 1; } else if (rc) { GOTO(out, rc); @@ -1925,7 +1972,7 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb, LASSERT(sb_dqopt(sb)->files[i] != NULL); INIT_LIST_HEAD(&id_list); -#ifndef KERNEL_SUPPORTS_QUOTA_READ +#ifndef KERNEL_SUPPORTS_QUOTA_READ rc = lustre_get_qids(sb_dqopt(sb)->files[i], NULL, i, &id_list); #else rc = lustre_get_qids(NULL, sb_dqopt(sb)->files[i], i, &id_list); @@ -1945,14 +1992,14 @@ static int fsfilt_ext3_quotacheck(struct super_block *sb, } #endif /* turn off quota cause we are to dump chk_dqblk to files */ - quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type); + quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type, oqc->qc_id); rc = create_new_quota_files(qctxt, oqc); if (rc) GOTO(out, rc); /* we use vfs functions to set dqblk, so turn quota on */ - rc = quota_onoff(sb, Q_QUOTAON, oqc->qc_type); + rc = quota_onoff(sb, Q_QUOTAON, oqc->qc_type, oqc->qc_id); out: /* dump and free chk_dqblk */ rc = prune_chkquots(sb, qctxt, rc); @@ -1960,7 +2007,7 @@ out: /* turn off quota, `lfs quotacheck` will turn on when all * nodes quotacheck finish. 
*/ - quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type); + quota_onoff(sb, Q_QUOTAOFF, oqc->qc_type, oqc->qc_id); oqc->qc_stat = rc; if (rc) @@ -1969,7 +2016,6 @@ out: RETURN(rc); } -#ifdef HAVE_QUOTA_SUPPORT static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, int cmd) { @@ -1994,9 +2040,15 @@ static int fsfilt_ext3_quotainfo(struct lustre_quota_info *lqi, int type, case QFILE_INIT_INFO: rc = lustre_init_quota_info(lqi, type); break; + case QFILE_CONVERT: + rc = -ENOTSUPP; + CERROR("quota CONVERT command is not supported\n"); + break; default: - CERROR("Unsupported admin quota file cmd %d\n", cmd); - LBUG(); + rc = -ENOTSUPP; + CERROR("Unsupported admin quota file cmd %d\n" + "Are lquota.ko and fsfilt_ldiskfs.ko modules in sync?\n", + cmd); break; } RETURN(rc); @@ -2076,13 +2128,13 @@ static struct fsfilt_operations fsfilt_ext3_ops = { .fs_setup = fsfilt_ext3_setup, .fs_send_bio = fsfilt_ext3_send_bio, .fs_get_op_len = fsfilt_ext3_get_op_len, - .fs_quotactl = fsfilt_ext3_quotactl, - .fs_quotacheck = fsfilt_ext3_quotacheck, #ifdef HAVE_DISK_INODE_VERSION .fs_get_version = fsfilt_ext3_get_version, .fs_set_version = fsfilt_ext3_set_version, #endif #ifdef HAVE_QUOTA_SUPPORT + .fs_quotactl = fsfilt_ext3_quotactl, + .fs_quotacheck = fsfilt_ext3_quotacheck, .fs_quotainfo = fsfilt_ext3_quotainfo, .fs_qids = fsfilt_ext3_qids, .fs_dquot = fsfilt_ext3_dquot, diff --git a/lustre/lvfs/fsfilt_reiserfs.c b/lustre/lvfs/fsfilt_reiserfs.c index 2f58e2a..83db369 100644 --- a/lustre/lvfs/fsfilt_reiserfs.c +++ b/lustre/lvfs/fsfilt_reiserfs.c @@ -184,9 +184,7 @@ static int fsfilt_reiserfs_statfs(struct super_block *sb, int rc; memset(&sfs, 0, sizeof(sfs)); - rc = ll_do_statfs(sb, &sfs); - statfs_pack(osfs, &sfs); return rc; } diff --git a/lustre/lvfs/lustre_quota_fmt.c b/lustre/lvfs/lustre_quota_fmt.c index b0ddb5c..ee713e4 100644 --- a/lustre/lvfs/lustre_quota_fmt.c +++ b/lustre/lvfs/lustre_quota_fmt.c @@ -39,7 +39,6 @@ * from linux/fs/quota_v2.c */ - #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif @@ -57,21 +56,32 @@ #include #include +#include #include "lustre_quota_fmt.h" -typedef char *dqbuf_t; +#ifdef HAVE_QUOTA_SUPPORT + +static const uint lustre_initqversions[][MAXQUOTAS] = { + [LUSTRE_QUOTA_V2] = LUSTRE_INITQVERSIONS_V2 +}; + +static const int lustre_dqstrinblk[] = { + [LUSTRE_QUOTA_V2] = LUSTRE_DQSTRINBLK_V2 +}; -#define GETIDINDEX(id, depth) (((id) >> ((LUSTRE_DQTREEDEPTH-(depth)-1)*8)) & 0xff) -#define GETENTRIES(buf) ((struct lustre_disk_dqblk *)(((char *)buf)+sizeof(struct lustre_disk_dqdbheader))) +static const int lustre_disk_dqblk_sz[] = { + [LUSTRE_QUOTA_V2] = sizeof(struct lustre_disk_dqblk_v2) +}; -static int check_quota_file(struct file *f, struct inode *inode, int type) +int check_quota_file(struct file *f, struct inode *inode, int type, + lustre_quota_version_t version) { struct lustre_disk_dqheader dqhead; mm_segment_t fs; ssize_t size; loff_t offset = 0; static const uint quota_magics[] = LUSTRE_INITQMAGICS; - static const uint quota_versions[] = LUSTRE_INITQVERSIONS; + const uint *quota_versions = lustre_initqversions[version]; if (f) { fs = get_fs(); @@ -90,27 +100,26 @@ static int check_quota_file(struct file *f, struct inode *inode, int type) #endif } if (size != sizeof(struct lustre_disk_dqheader)) - return 0; + return -EINVAL; if (le32_to_cpu(dqhead.dqh_magic) != quota_magics[type] || le32_to_cpu(dqhead.dqh_version) != quota_versions[type]) - return 0; - return 1; + return -EINVAL; + return 0; } -/* Check whether given file is really lustre admin quotafile */ +/** 
+ * Check whether given file is really lustre admin quotafile + */ int lustre_check_quota_file(struct lustre_quota_info *lqi, int type) { struct file *f = lqi->qi_files[type]; - return check_quota_file(f, NULL, type); + return check_quota_file(f, NULL, type, lqi->qi_version); } -/* Read information header from quota file */ -int lustre_read_quota_info(struct lustre_quota_info *lqi, int type) +int lustre_read_quota_file_info(struct file* f, struct lustre_mem_dqinfo* info) { mm_segment_t fs; struct lustre_disk_dqinfo dinfo; - struct lustre_mem_dqinfo *info = &lqi->qi_info[type]; - struct file *f = lqi->qi_files[type]; ssize_t size; loff_t offset = LUSTRE_DQINFOOFF; @@ -120,9 +129,9 @@ int lustre_read_quota_info(struct lustre_quota_info *lqi, int type) sizeof(struct lustre_disk_dqinfo), &offset); set_fs(fs); if (size != sizeof(struct lustre_disk_dqinfo)) { - CDEBUG(D_WARNING, "Can't read info structure on device %s.\n", + CDEBUG(D_ERROR, "Can't read info structure on device %s.\n", f->f_vfsmnt->mnt_sb->s_id); - return -1; + return -EINVAL; } info->dqi_bgrace = le32_to_cpu(dinfo.dqi_bgrace); info->dqi_igrace = le32_to_cpu(dinfo.dqi_igrace); @@ -133,7 +142,17 @@ int lustre_read_quota_info(struct lustre_quota_info *lqi, int type) return 0; } -/* Write information header to quota file */ +/** + * Read information header from quota file + */ +int lustre_read_quota_info(struct lustre_quota_info *lqi, int type) +{ + return lustre_read_quota_file_info(lqi->qi_files[type], &lqi->qi_info[type]); +} + +/** + * Write information header to quota file + */ int lustre_write_quota_info(struct lustre_quota_info *lqi, int type) { mm_segment_t fs; @@ -164,33 +183,44 @@ int lustre_write_quota_info(struct lustre_quota_info *lqi, int type) return 0; } -static void disk2memdqb(struct mem_dqblk *m, struct lustre_disk_dqblk *d) +void disk2memdqb(struct lustre_mem_dqblk *m, void *d, + lustre_quota_version_t version) { - m->dqb_ihardlimit = le32_to_cpu(d->dqb_ihardlimit); - m->dqb_isoftlimit = le32_to_cpu(d->dqb_isoftlimit); - m->dqb_curinodes = le32_to_cpu(d->dqb_curinodes); - m->dqb_itime = le64_to_cpu(d->dqb_itime); - m->dqb_bhardlimit = le32_to_cpu(d->dqb_bhardlimit); - m->dqb_bsoftlimit = le32_to_cpu(d->dqb_bsoftlimit); - m->dqb_curspace = le64_to_cpu(d->dqb_curspace); - m->dqb_btime = le64_to_cpu(d->dqb_btime); + struct lustre_disk_dqblk_v2 *dqblk = (struct lustre_disk_dqblk_v2 *)d; + + LASSERT(version == LUSTRE_QUOTA_V2); + + m->dqb_ihardlimit = le64_to_cpu(dqblk->dqb_ihardlimit); + m->dqb_isoftlimit = le64_to_cpu(dqblk->dqb_isoftlimit); + m->dqb_curinodes = le64_to_cpu(dqblk->dqb_curinodes); + m->dqb_itime = le64_to_cpu(dqblk->dqb_itime); + m->dqb_bhardlimit = le64_to_cpu(dqblk->dqb_bhardlimit); + m->dqb_bsoftlimit = le64_to_cpu(dqblk->dqb_bsoftlimit); + m->dqb_curspace = le64_to_cpu(dqblk->dqb_curspace); + m->dqb_btime = le64_to_cpu(dqblk->dqb_btime); } -static void mem2diskdqb(struct lustre_disk_dqblk *d, struct mem_dqblk *m, - qid_t id) +static int mem2diskdqb(void *d, struct lustre_mem_dqblk *m, + qid_t id, lustre_quota_version_t version) { - d->dqb_ihardlimit = cpu_to_le32(m->dqb_ihardlimit); - d->dqb_isoftlimit = cpu_to_le32(m->dqb_isoftlimit); - d->dqb_curinodes = cpu_to_le32(m->dqb_curinodes); - d->dqb_itime = cpu_to_le64(m->dqb_itime); - d->dqb_bhardlimit = cpu_to_le32(m->dqb_bhardlimit); - d->dqb_bsoftlimit = cpu_to_le32(m->dqb_bsoftlimit); - d->dqb_curspace = cpu_to_le64(m->dqb_curspace); - d->dqb_btime = cpu_to_le64(m->dqb_btime); - d->dqb_id = cpu_to_le32(id); + struct lustre_disk_dqblk_v2 *dqblk = 
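The disk2memdqb conversion above is where the v2 widening actually lands: every limit and usage field is read with le64_to_cpu instead of le32_to_cpu, which is what lifts the old 32-bit block and inode ceilings. The same conversion shape as a standalone sketch, with hypothetical type names mirroring lustre_mem_dqblk and lustre_disk_dqblk_v2:

        #include <stdint.h>
        #include <endian.h>

        struct mem_dqb {
                uint64_t ihard, isoft, curino, itime;
                uint64_t bhard, bsoft, curspace, btime;
        };

        struct disk_dqb_v2 {                    /* mirrors lustre_disk_dqblk_v2 */
                uint32_t id, padding;
                uint64_t ihard, isoft, curino;
                uint64_t bhard, bsoft, curspace;
                uint64_t btime, itime;
        };

        static void disk2mem(struct mem_dqb *m, const struct disk_dqb_v2 *d)
        {
                m->ihard    = le64toh(d->ihard);   /* every field is 64 bits now */
                m->isoft    = le64toh(d->isoft);
                m->curino   = le64toh(d->curino);
                m->itime    = le64toh(d->itime);
                m->bhard    = le64toh(d->bhard);
                m->bsoft    = le64toh(d->bsoft);
                m->curspace = le64toh(d->curspace);
                m->btime    = le64toh(d->btime);
        }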
(struct lustre_disk_dqblk_v2 *)d; + + LASSERT(version == LUSTRE_QUOTA_V2); + + dqblk->dqb_ihardlimit = cpu_to_le64(m->dqb_ihardlimit); + dqblk->dqb_isoftlimit = cpu_to_le64(m->dqb_isoftlimit); + dqblk->dqb_curinodes = cpu_to_le64(m->dqb_curinodes); + dqblk->dqb_itime = cpu_to_le64(m->dqb_itime); + dqblk->dqb_bhardlimit = cpu_to_le64(m->dqb_bhardlimit); + dqblk->dqb_bsoftlimit = cpu_to_le64(m->dqb_bsoftlimit); + dqblk->dqb_curspace = cpu_to_le64(m->dqb_curspace); + dqblk->dqb_btime = cpu_to_le64(m->dqb_btime); + dqblk->dqb_id = cpu_to_le32(id); + + return 0; } -static dqbuf_t getdqbuf(void) +dqbuf_t getdqbuf(void) { dqbuf_t buf = kmalloc(LUSTRE_DQBLKSIZE, GFP_NOFS); if (!buf) @@ -199,12 +229,12 @@ static dqbuf_t getdqbuf(void) return buf; } -static inline void freedqbuf(dqbuf_t buf) +void freedqbuf(dqbuf_t buf) { kfree(buf); } -static ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf) +ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf) { mm_segment_t fs; ssize_t ret; @@ -218,7 +248,7 @@ static ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf) return ret; } -static ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf) +ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf) { mm_segment_t fs; ssize_t ret; @@ -229,18 +259,17 @@ static ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf) ret = filp->f_op->write(filp, (char *)buf, LUSTRE_DQBLKSIZE, &offset); set_fs(fs); return ret; - } -static void lustre_mark_info_dirty(struct lustre_mem_dqinfo *info) +void lustre_mark_info_dirty(struct lustre_mem_dqinfo *info) { set_bit(DQF_INFO_DIRTY_B, &info->dqi_flags); } -#define lustre_info_dirty(info) test_bit(DQF_INFO_DIRTY_B, &(info)->dqi_flags) - -/* Remove empty block from list and return it */ -static int get_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info) +/** + * Remove empty block from list and return it + */ +int get_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info) { dqbuf_t buf = getdqbuf(); struct lustre_disk_dqdbheader *dh = @@ -256,7 +285,8 @@ static int get_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info) info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free); } else { memset(buf, 0, LUSTRE_DQBLKSIZE); - if ((ret = write_blk(filp, info->dqi_blocks, buf)) < 0) /* Assure block allocation... */ + /* Assure block allocation... 
*/ + if ((ret = write_blk(filp, info->dqi_blocks, buf)) < 0) goto out_buf; blk = info->dqi_blocks++; } @@ -267,9 +297,11 @@ out_buf: return ret; } -/* Insert empty block to the list */ -static int put_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info, - dqbuf_t buf, uint blk) +/** + * Insert empty block to the list + */ +int put_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info, + dqbuf_t buf, uint blk) { struct lustre_disk_dqdbheader *dh = (struct lustre_disk_dqdbheader *)buf; @@ -286,10 +318,12 @@ static int put_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info, return 0; } -/* Remove given block from the list of blocks with free entries */ -static int remove_free_dqentry(struct file *filp, - struct lustre_mem_dqinfo *info, dqbuf_t buf, - uint blk) +/** + * Remove given block from the list of blocks with free entries + */ +int remove_free_dqentry(struct file *filp, + struct lustre_mem_dqinfo *info, dqbuf_t buf, + uint blk) { dqbuf_t tmpbuf = getdqbuf(); struct lustre_disk_dqdbheader *dh = @@ -321,7 +355,8 @@ static int remove_free_dqentry(struct file *filp, } freedqbuf(tmpbuf); dh->dqdh_next_free = dh->dqdh_prev_free = cpu_to_le32(0); - if (write_blk(filp, blk, buf) < 0) /* No matter whether write succeeds block is out of list */ + if (write_blk(filp, blk, buf) < 0) + /* No matter whether write succeeds block is out of list */ CDEBUG(D_ERROR, "VFS: Can't write block (%u) with free entries.\n", blk); return 0; @@ -330,10 +365,12 @@ out_buf: return err; } -/* Insert given block to the beginning of list with free entries */ -static int insert_free_dqentry(struct file *filp, - struct lustre_mem_dqinfo *info, dqbuf_t buf, - uint blk) +/** + * Insert given block to the beginning of list with free entries + */ +int insert_free_dqentry(struct file *filp, + struct lustre_mem_dqinfo *info, dqbuf_t buf, + uint blk) { dqbuf_t tmpbuf = getdqbuf(); struct lustre_disk_dqdbheader *dh = @@ -363,16 +400,23 @@ out_buf: return err; } -/* Find space for dquot */ -static uint find_free_dqentry(struct lustre_dquot *dquot, int *err) + + +/** + * Find space for dquot + */ +static uint find_free_dqentry(struct lustre_dquot *dquot, int *err, + lustre_quota_version_t version) { struct lustre_quota_info *lqi = dquot->dq_info; struct file *filp = lqi->qi_files[dquot->dq_type]; struct lustre_mem_dqinfo *info = &lqi->qi_info[dquot->dq_type]; uint blk, i; struct lustre_disk_dqdbheader *dh; - struct lustre_disk_dqblk *ddquot; - struct lustre_disk_dqblk fakedquot; + void *ddquot; + int dqblk_sz = lustre_disk_dqblk_sz[version]; + int dqstrinblk = lustre_dqstrinblk[version]; + char fakedquot[dqblk_sz]; dqbuf_t buf; *err = 0; @@ -381,7 +425,7 @@ static uint find_free_dqentry(struct lustre_dquot *dquot, int *err) return 0; } dh = (struct lustre_disk_dqdbheader *)buf; - ddquot = GETENTRIES(buf); + ddquot = GETENTRIES(buf, version); if (info->dqi_free_entry) { blk = info->dqi_free_entry; if ((*err = read_blk(filp, blk, buf)) < 0) @@ -394,10 +438,14 @@ static uint find_free_dqentry(struct lustre_dquot *dquot, int *err) return 0; } memset(buf, 0, LUSTRE_DQBLKSIZE); - info->dqi_free_entry = blk; /* This is enough as block is already zeroed and entry list is empty... */ + info->dqi_free_entry = blk; /* This is enough as block is + already zeroed and entry list + is empty... */ lustre_mark_info_dirty(info); } - if (le16_to_cpu(dh->dqdh_entries) + 1 >= LUSTRE_DQSTRINBLK) /* Block will be full? 
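get_free_dqblk and put_free_dqblk, now exported above, manage an intrusive free list threaded through the 1 KB quota blocks: dqi_free_blk heads the list, each free block's header names the next, and an empty list grows the file instead. A hypothetical in-memory analog (the on-disk version keeps next_free little-endian and flushes blocks through write_blk):

        #include <stdint.h>
        #include <string.h>

        #define DQBLKSIZE 1024

        struct dqdbheader { uint32_t next_free, prev_free; uint16_t entries; };
        struct qinfo { uint32_t free_blk, blocks; };

        /* pop a block off the free list, or grow the "file" by one zeroed block */
        static uint32_t get_free_blk(struct qinfo *info, uint8_t (*file)[DQBLKSIZE])
        {
                uint32_t blk;

                if (info->free_blk) {
                        struct dqdbheader *dh =
                                (struct dqdbheader *)file[info->free_blk];

                        blk = info->free_blk;
                        info->free_blk = dh->next_free; /* unlink the head */
                } else {
                        blk = info->blocks++;           /* extend past the end */
                        memset(file[blk], 0, DQBLKSIZE);
                }
                return blk;
        }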
*/ + + /* Will block be full */ + if (le16_to_cpu(dh->dqdh_entries) + 1 >= dqstrinblk) if ((*err = remove_free_dqentry(filp, info, buf, blk)) < 0) { CDEBUG(D_ERROR, "VFS: find_free_dqentry(): Can't remove block (%u) from entry free list.\n", @@ -405,12 +453,13 @@ static uint find_free_dqentry(struct lustre_dquot *dquot, int *err) goto out_buf; } dh->dqdh_entries = cpu_to_le16(le16_to_cpu(dh->dqdh_entries) + 1); - memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk)); + memset(fakedquot, 0, dqblk_sz); /* Find free structure in block */ - for (i = 0; i < LUSTRE_DQSTRINBLK && - memcmp(&fakedquot, ddquot + i, sizeof(fakedquot)); i++) ; + for (i = 0; i < dqstrinblk && + memcmp(fakedquot, (char*)ddquot + i * dqblk_sz, + sizeof(fakedquot)); i++); - if (i == LUSTRE_DQSTRINBLK) { + if (i == dqstrinblk) { CDEBUG(D_ERROR, "VFS: find_free_dqentry(): Data block full but it shouldn't.\n"); *err = -EIO; @@ -426,7 +475,7 @@ static uint find_free_dqentry(struct lustre_dquot *dquot, int *err) dquot->dq_off = (blk << LUSTRE_DQBLKSIZE_BITS) + sizeof(struct lustre_disk_dqdbheader) + - i * sizeof(struct lustre_disk_dqblk); + i * dqblk_sz; freedqbuf(buf); return blk; out_buf: @@ -434,8 +483,11 @@ out_buf: return 0; } -/* Insert reference to structure into the trie */ -static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth) +/** + * Insert reference to structure into the trie + */ +static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth, + lustre_quota_version_t version) { struct lustre_quota_info *lqi = dquot->dq_info; struct file *filp = lqi->qi_files[dquot->dq_type]; @@ -476,9 +528,9 @@ static int do_insert_tree(struct lustre_dquot *dquot, uint * treeblk, int depth) goto out_buf; } - newblk = find_free_dqentry(dquot, &ret); + newblk = find_free_dqentry(dquot, &ret, version); } else - ret = do_insert_tree(dquot, &newblk, depth + 1); + ret = do_insert_tree(dquot, &newblk, depth + 1, version); if (newson && ret >= 0) { ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(newblk); ret = write_blk(filp, *treeblk, buf); @@ -489,27 +541,37 @@ out_buf: return ret; } -/* Wrapper for inserting quota structure into tree */ -static inline int dq_insert_tree(struct lustre_dquot *dquot) +/** + * Wrapper for inserting quota structure into tree + */ +static inline int dq_insert_tree(struct lustre_dquot *dquot, + lustre_quota_version_t version) { int tmp = LUSTRE_DQTREEOFF; - return do_insert_tree(dquot, &tmp, 0); + return do_insert_tree(dquot, &tmp, 0, version); } -/* - * We don't have to be afraid of deadlocks as we never have quotas on quota files... +/** + * We don't have to be afraid of deadlocks as we never have quotas on + * quota files... 
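The scan in find_free_dqentry is now size-driven rather than type-driven: a slot is free iff it is all zeroes, so each dqblk_sz-sized entry is compared against a zeroed template. Reduced to its essentials as a hypothetical helper (the VLA mirrors the patch's char fakedquot[dqblk_sz]):

        #include <stddef.h>
        #include <string.h>

        /* returns the index of the first all-zero (free) entry, or -1 if full */
        static int find_free_slot(const char *entries, size_t nr, size_t sz)
        {
                char zero[sz];          /* VLA, like fakedquot[dqblk_sz] above */
                size_t i;

                memset(zero, 0, sz);
                for (i = 0; i < nr; i++)
                        if (memcmp(zero, entries + i * sz, sz) == 0)
                                return (int)i;
                return -1;
        }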
*/ -static int lustre_write_dquot(struct lustre_dquot *dquot) +static int lustre_write_dquot(struct lustre_dquot *dquot, + lustre_quota_version_t version) { int type = dquot->dq_type; struct file *filp; mm_segment_t fs; loff_t offset; ssize_t ret; - struct lustre_disk_dqblk ddquot, empty; + int dqblk_sz = lustre_disk_dqblk_sz[version]; + char ddquot[dqblk_sz], empty[dqblk_sz]; + + ret = mem2diskdqb(ddquot, &dquot->dq_dqb, dquot->dq_id, version); + if (ret < 0) + return ret; if (!dquot->dq_off) - if ((ret = dq_insert_tree(dquot)) < 0) { + if ((ret = dq_insert_tree(dquot, version)) < 0) { CDEBUG(D_ERROR, "VFS: Error %Zd occurred while creating quota.\n", ret); @@ -517,19 +579,18 @@ static int lustre_write_dquot(struct lustre_dquot *dquot) } filp = dquot->dq_info->qi_files[type]; offset = dquot->dq_off; - mem2diskdqb(&ddquot, &dquot->dq_dqb, dquot->dq_id); /* Argh... We may need to write structure full of zeroes but that would be * treated as an empty place by the rest of the code. Format change would * be definitely cleaner but the problems probably are not worth it */ - memset(&empty, 0, sizeof(struct lustre_disk_dqblk)); - if (!memcmp(&empty, &ddquot, sizeof(struct lustre_disk_dqblk))) - ddquot.dqb_itime = cpu_to_le64(1); + memset(empty, 0, dqblk_sz); + if (!memcmp(empty, ddquot, dqblk_sz)) + ((struct lustre_disk_dqblk_v2 *)ddquot)->dqb_itime = cpu_to_le64(1); fs = get_fs(); set_fs(KERNEL_DS); - ret = filp->f_op->write(filp, (char *)&ddquot, - sizeof(struct lustre_disk_dqblk), &offset); + ret = filp->f_op->write(filp, ddquot, + dqblk_sz, &offset); set_fs(fs); - if (ret != sizeof(struct lustre_disk_dqblk)) { + if (ret != dqblk_sz) { CDEBUG(D_WARNING, "VFS: dquota write failed on dev %s\n", filp->f_dentry->d_sb->s_id); if (ret >= 0) @@ -540,14 +601,18 @@ static int lustre_write_dquot(struct lustre_dquot *dquot) return ret; } -/* Free dquot entry in data block */ -static int free_dqentry(struct lustre_dquot *dquot, uint blk) +/** + * Free dquot entry in data block + */ +static int free_dqentry(struct lustre_dquot *dquot, uint blk, + lustre_quota_version_t version) { struct file *filp = dquot->dq_info->qi_files[dquot->dq_type]; struct lustre_mem_dqinfo *info = &dquot->dq_info->qi_info[dquot->dq_type]; struct lustre_disk_dqdbheader *dh; dqbuf_t buf = getdqbuf(); + int dqstrinblk = lustre_dqstrinblk[version]; int ret = 0; if (!buf) @@ -573,10 +638,9 @@ static int free_dqentry(struct lustre_dquot *dquot, uint blk) goto out_buf; } } else { - memset(buf + - (dquot->dq_off & ((1 << LUSTRE_DQBLKSIZE_BITS) - 1)), 0, - sizeof(struct lustre_disk_dqblk)); - if (le16_to_cpu(dh->dqdh_entries) == LUSTRE_DQSTRINBLK - 1) { + memset(buf + (dquot->dq_off & ((1 << LUSTRE_DQBLKSIZE_BITS) - 1)), + 0, dqblk_sz); + if (le16_to_cpu(dh->dqdh_entries) == dqstrinblk - 1) { /* Insert will write block itself */ if ((ret = insert_free_dqentry(filp, info, buf, blk)) < 0) { @@ -597,8 +661,11 @@ out_buf: return ret; } -/* Remove reference to dquot from tree */ -static int remove_tree(struct lustre_dquot *dquot, uint * blk, int depth) +/** + * Remove reference to dquot from tree + */ +static int remove_tree(struct lustre_dquot *dquot, uint * blk, int depth, + lustre_quota_version_t version) { struct file *filp = dquot->dq_info->qi_files[dquot->dq_type]; struct lustre_mem_dqinfo *info = @@ -616,14 +683,15 @@ static int remove_tree(struct lustre_dquot *dquot, uint * blk, int depth) { } newblk = le32_to_cpu(ref[GETIDINDEX(dquot->dq_id, depth)]); if (depth == LUSTRE_DQTREEDEPTH - 1) { - ret = free_dqentry(dquot, newblk, version); newblk = 0; } else - ret =
remove_tree(dquot, &newblk, depth + 1); + ret = remove_tree(dquot, &newblk, depth + 1, version); if (ret >= 0 && !newblk) { int i; ref[GETIDINDEX(dquot->dq_id, depth)] = cpu_to_le32(0); - for (i = 0; i < LUSTRE_DQBLKSIZE && !buf[i]; i++) ; /* Block got empty? */ + for (i = 0; i < LUSTRE_DQBLKSIZE && !buf[i]; i++) + /* Block got empty? */ ; /* don't put the root block into free blk list! */ if (i == LUSTRE_DQBLKSIZE && *blk != LUSTRE_DQTREEOFF) { put_free_dqblk(filp, info, buf, *blk); @@ -637,24 +705,34 @@ out_buf: return ret; } -/* Delete dquot from tree */ -static int lustre_delete_dquot(struct lustre_dquot *dquot) +/** + * Delete dquot from tree + */ +static int lustre_delete_dquot(struct lustre_dquot *dquot, + lustre_quota_version_t version) { uint tmp = LUSTRE_DQTREEOFF; if (!dquot->dq_off) /* Even not allocated? */ return 0; - return remove_tree(dquot, &tmp, 0); + return remove_tree(dquot, &tmp, 0, version); } -/* Find entry in block */ -static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk) +/** + * Find entry in block + */ +static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk, + lustre_quota_version_t version) { struct file *filp = dquot->dq_info->qi_files[dquot->dq_type]; dqbuf_t buf = getdqbuf(); loff_t ret = 0; int i; - struct lustre_disk_dqblk *ddquot = GETENTRIES(buf); + struct lustre_disk_dqblk_v2 *ddquot = (struct lustre_disk_dqblk_v2 *)GETENTRIES(buf, version); + int dqblk_sz = lustre_disk_dqblk_sz[version]; + int dqstrinblk = lustre_dqstrinblk[version]; + + LASSERT(version == LUSTRE_QUOTA_V2); if (!buf) return -ENOMEM; @@ -663,20 +741,20 @@ static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk) goto out_buf; } if (dquot->dq_id) - for (i = 0; - i < LUSTRE_DQSTRINBLK - && le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; i++) ; + for (i = 0; i < dqstrinblk && + le32_to_cpu(ddquot[i].dqb_id) != dquot->dq_id; + i++) ; else { /* ID 0 as a bit more complicated searching... */ - struct lustre_disk_dqblk fakedquot; + char fakedquot[dqblk_sz]; - memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk)); - for (i = 0; i < LUSTRE_DQSTRINBLK; i++) + memset(fakedquot, 0, sizeof(fakedquot)); + for (i = 0; i < dqstrinblk; i++) if (!le32_to_cpu(ddquot[i].dqb_id) - && memcmp(&fakedquot, ddquot + i, - sizeof(struct lustre_disk_dqblk))) + && memcmp(fakedquot, ddquot + i, + dqblk_sz)) break; } - if (i == LUSTRE_DQSTRINBLK) { + if (i == dqstrinblk) { CDEBUG(D_ERROR, "VFS: Quota for id %u referenced but not present.\n", dquot->dq_id); @@ -686,14 +764,17 @@ static loff_t find_block_dqentry(struct lustre_dquot *dquot, uint blk) ret = (blk << LUSTRE_DQBLKSIZE_BITS) + sizeof(struct lustre_disk_dqdbheader) + - i * sizeof(struct lustre_disk_dqblk); + i * dqblk_sz; out_buf: freedqbuf(buf); return ret; } -/* Find entry for given id in the tree */ -static loff_t find_tree_dqentry(struct lustre_dquot *dquot, uint blk, int depth) +/** + * Find entry for given id in the tree + */ +static loff_t find_tree_dqentry(struct lustre_dquot *dquot, uint blk, int depth, + lustre_quota_version_t version) { struct file *filp = dquot->dq_info->qi_files[dquot->dq_type]; dqbuf_t buf = getdqbuf(); @@ -711,18 +792,21 @@ static loff_t find_tree_dqentry(struct lustre_dquot *dquot, uint blk, int depth) if (!blk) /* No reference? 
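Both the removal path above and the lookup path below walk the same four-level radix: GETIDINDEX, which this patch moves into lustre_quota_fmt.h, peels one byte off the 32-bit id per level, most significant byte first, and LUSTRE_DQTREEDEPTH is 4. A tiny demonstration with a made-up id:

        #include <stdint.h>
        #include <stdio.h>

        #define DQTREEDEPTH 4
        #define IDINDEX(id, depth) \
                (((id) >> ((DQTREEDEPTH - (depth) - 1) * 8)) & 0xff)

        int main(void)
        {
                uint32_t id = 0x00012a05;       /* hypothetical uid */
                int d;

                /* prints slots 0, 1, 42, 5: the path from root to leaf block */
                for (d = 0; d < DQTREEDEPTH; d++)
                        printf("level %d -> slot %u\n", d, IDINDEX(id, d));
                return 0;
        }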
*/ goto out_buf; if (depth < LUSTRE_DQTREEDEPTH - 1) - ret = find_tree_dqentry(dquot, blk, depth + 1); + ret = find_tree_dqentry(dquot, blk, depth + 1, version); else - ret = find_block_dqentry(dquot, blk); + ret = find_block_dqentry(dquot, blk, version); out_buf: freedqbuf(buf); return ret; } -/* Find entry for given id in the tree - wrapper function */ -static inline loff_t find_dqentry(struct lustre_dquot *dquot) +/** + * Find entry for given id in the tree - wrapper function + */ +static inline loff_t find_dqentry(struct lustre_dquot *dquot, + lustre_quota_version_t version) { - return find_tree_dqentry(dquot, LUSTRE_DQTREEOFF, 0); + return find_tree_dqentry(dquot, LUSTRE_DQTREEOFF, 0, version); } int lustre_read_dquot(struct lustre_dquot *dquot) @@ -731,8 +815,8 @@ int lustre_read_dquot(struct lustre_dquot *dquot) struct file *filp; mm_segment_t fs; loff_t offset; - struct lustre_disk_dqblk ddquot, empty; - int ret = 0; + int ret = 0, dqblk_sz; + lustre_quota_version_t version; /* Invalidated quota? */ if (!dquot->dq_info || !(filp = dquot->dq_info->qi_files[type])) { @@ -740,7 +824,11 @@ int lustre_read_dquot(struct lustre_dquot *dquot) return -EIO; } - offset = find_dqentry(dquot); + version = dquot->dq_info->qi_version; + LASSERT(version == LUSTRE_QUOTA_V2); + dqblk_sz = lustre_disk_dqblk_sz[version]; + + offset = find_dqentry(dquot, version); if (offset <= 0) { /* Entry not present? */ if (offset < 0) CDEBUG(D_ERROR, @@ -748,42 +836,46 @@ int lustre_read_dquot(struct lustre_dquot *dquot) dquot->dq_id); dquot->dq_off = 0; set_bit(DQ_FAKE_B, &dquot->dq_flags); - memset(&dquot->dq_dqb, 0, sizeof(struct mem_dqblk)); + memset(&dquot->dq_dqb, 0, sizeof(struct lustre_mem_dqblk)); ret = offset; } else { + char ddquot[dqblk_sz], empty[dqblk_sz]; + dquot->dq_off = offset; fs = get_fs(); set_fs(KERNEL_DS); - if ((ret = filp->f_op->read(filp, (char *)&ddquot, - sizeof(struct lustre_disk_dqblk), - &offset)) != - sizeof(struct lustre_disk_dqblk)) { + if ((ret = filp->f_op->read(filp, ddquot, dqblk_sz, &offset)) != + dqblk_sz) { if (ret >= 0) ret = -EIO; CDEBUG(D_ERROR, "VFS: Error while reading quota structure for id %u.\n", dquot->dq_id); - memset(&ddquot, 0, sizeof(struct lustre_disk_dqblk)); + memset(ddquot, 0, dqblk_sz); } else { ret = 0; /* We need to escape back all-zero structure */ - memset(&empty, 0, sizeof(struct lustre_disk_dqblk)); - empty.dqb_itime = cpu_to_le64(1); - if (!memcmp(&empty, &ddquot, - sizeof(struct lustre_disk_dqblk))) - ddquot.dqb_itime = 0; + memset(empty, 0, dqblk_sz); + ((struct lustre_disk_dqblk_v2 *)empty)->dqb_itime = cpu_to_le64(1); + if (!memcmp(empty, ddquot, dqblk_sz)) + ((struct lustre_disk_dqblk_v2 *)empty)->dqb_itime = cpu_to_le64(0); } set_fs(fs); - disk2memdqb(&dquot->dq_dqb, &ddquot); + disk2memdqb(&dquot->dq_dqb, ddquot, version); } return ret; } -/* Commit changes of dquot to disk - it might also mean deleting it when quota became fake */ +/** + * Commit changes of dquot to disk - it might also mean deleting + * it when quota became fake. + */ int lustre_commit_dquot(struct lustre_dquot *dquot) { int rc = 0; + lustre_quota_version_t version = dquot->dq_info->qi_version; + /* always clear the flag so we don't loop on an IO error... 
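lustre_read_dquot above has to undo the write side's escape: an all-zero entry on disk means "free slot", so a dquot whose fields are genuinely all zero is stored with dqb_itime == 1 and mapped back on read. The round trip as a self-contained sketch (hypothetical struct; the kernel compares raw little-endian bytes of the dqblk_sz-sized entry):

        #include <stdint.h>
        #include <string.h>

        struct dqb { uint64_t itime; uint64_t other[8]; };

        static void escape_on_write(struct dqb *d)
        {
                struct dqb zero;

                memset(&zero, 0, sizeof(zero));
                if (memcmp(&zero, d, sizeof(*d)) == 0)
                        d->itime = 1;           /* "real but all-zero" marker */
        }

        static void unescape_on_read(struct dqb *d)
        {
                struct dqb probe;

                memset(&probe, 0, sizeof(probe));
                probe.itime = 1;
                if (memcmp(&probe, d, sizeof(*d)) == 0)
                        d->itime = 0;           /* restore the true value */
        }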
*/ clear_bit(DQ_MOD_B, &dquot->dq_flags); @@ -791,9 +883,9 @@ int lustre_commit_dquot(struct lustre_dquot *dquot) * over all cluster, so keep the fake dquot entry on disk is * meaningless, just remove it */ if (test_bit(DQ_FAKE_B, &dquot->dq_flags)) - rc = lustre_delete_dquot(dquot); + rc = lustre_delete_dquot(dquot, version); else - rc = lustre_write_dquot(dquot); + rc = lustre_write_dquot(dquot, version); if (rc < 0) return rc; @@ -804,21 +896,20 @@ int lustre_commit_dquot(struct lustre_dquot *dquot) return rc; } -/* We need to export this function to initialize quotafile, because we haven't - * user level check utility */ -int lustre_init_quota_info(struct lustre_quota_info *lqi, int type) +int lustre_init_quota_header(struct lustre_quota_info *lqi, int type, int fakemagics) { - struct lustre_mem_dqinfo *dqinfo = &lqi->qi_info[type]; + static const uint quota_magics[] = LUSTRE_INITQMAGICS; + static const uint fake_magics[] = LUSTRE_BADQMAGICS; + const uint* quota_versions = lustre_initqversions[lqi->qi_version]; struct lustre_disk_dqheader dqhead; - struct file *fp = lqi->qi_files[type]; ssize_t size; loff_t offset = 0; + struct file *fp = lqi->qi_files[type]; int rc = 0; - static const uint quota_magics[] = LUSTRE_INITQMAGICS; - static const uint quota_versions[] = LUSTRE_INITQVERSIONS; /* write quotafile header */ - dqhead.dqh_magic = cpu_to_le32(quota_magics[type]); + dqhead.dqh_magic = cpu_to_le32(fakemagics ? + fake_magics[type] : quota_magics[type]); dqhead.dqh_version = cpu_to_le32(quota_versions[type]); size = fp->f_op->write(fp, (char *)&dqhead, sizeof(struct lustre_disk_dqheader), &offset); @@ -827,6 +918,21 @@ int lustre_init_quota_info(struct lustre_quota_info *lqi, int type) CDEBUG(D_ERROR, "error writing quoafile header (rc:%d)\n", rc); rc = size; } + + return rc; +} + +/** + * We need to export this function to initialize quotafile, because we haven't + * user level check utility + */ +int lustre_init_quota_info_generic(struct lustre_quota_info *lqi, int type, + int fakemagics) +{ + struct lustre_mem_dqinfo *dqinfo = &lqi->qi_info[type]; + int rc; + + rc = lustre_init_quota_header(lqi, type, fakemagics); if (rc) return rc; @@ -839,13 +945,13 @@ int lustre_init_quota_info(struct lustre_quota_info *lqi, int type) return lustre_write_quota_info(lqi, type); } -struct dqblk { - struct list_head link; - uint blk; -}; +int lustre_init_quota_info(struct lustre_quota_info *lqi, int type) +{ + return lustre_init_quota_info_generic(lqi, type, 0); +} -static ssize_t quota_read(struct file *file, struct inode *inode, int type, - uint blk, dqbuf_t buf) +ssize_t quota_read(struct file *file, struct inode *inode, int type, + uint blk, dqbuf_t buf) { if (file) { return read_blk(file, blk, buf); @@ -913,8 +1019,8 @@ out_buf: return ret; } -static int walk_tree_dqentry(struct file *filp, struct inode *inode, int type, - uint blk, int depth, struct list_head *list) +int walk_tree_dqentry(struct file *filp, struct inode *inode, int type, + uint blk, int depth, struct list_head *list) { dqbuf_t buf = getdqbuf(); loff_t ret = 0; @@ -935,7 +1041,7 @@ static int walk_tree_dqentry(struct file *filp, struct inode *inode, int type, continue; if (depth < LUSTRE_DQTREEDEPTH - 1) - ret = walk_tree_dqentry(filp, inode, type, blk, + ret = walk_tree_dqentry(filp, inode, type, blk, depth + 1, list); else ret = walk_block_dqentry(filp, inode, type, blk, list); @@ -945,67 +1051,71 @@ out_buf: return ret; } -/* Walk through the quota file (v2 format) to get all ids with quota limit */ +/** + * Walk through 
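The fakemagics switch in lustre_init_quota_header above pairs with the LUSTRE_BADQMAGICS table defined in the header file below: while a quota file is being rebuilt it carries an invalid magic, so a crash mid-rebuild cannot leave behind a file that check_quota_file would accept. The selection logic, reduced to a sketch with the magic values taken from those tables:

        #include <stdint.h>

        static const uint32_t good_magics[] = { 0xd9c01f11, 0xd9c01927 };
        static const uint32_t bad_magics[]  = { 0xbadbadba, 0xbadbadba };

        /* a file in flight keeps the bad magic until the rebuild completes */
        static uint32_t pick_magic(int type, int fakemagics)
        {
                return fakemagics ? bad_magics[type] : good_magics[type];
        }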
the quota file (v2 format) to get all ids with quota limit + */ int lustre_get_qids(struct file *fp, struct inode *inode, int type, struct list_head *list) { struct list_head blk_list; struct dqblk *blk_item, *tmp; dqbuf_t buf = NULL; - struct lustre_disk_dqblk *ddquot; + struct lustre_disk_dqblk_v2 *ddquot; int rc; + lustre_quota_version_t version; + + ENTRY; - if (!check_quota_file(fp, inode, type)) { + if (check_quota_file(fp, inode, type, LUSTRE_QUOTA_V2) == 0) + version = LUSTRE_QUOTA_V2; + else { CDEBUG(D_ERROR, "unknown quota file format!\n"); - return -EINVAL; + RETURN(-EINVAL); } + if (!list_empty(list)) { CDEBUG(D_ERROR, "not empty list\n"); - return -EINVAL; + RETURN(-EINVAL); } INIT_LIST_HEAD(&blk_list); rc = walk_tree_dqentry(fp, inode, type, LUSTRE_DQTREEOFF, 0, &blk_list); if (rc) { CDEBUG(D_ERROR, "walk through quota file failed!(%d)\n", rc); - goto out_free; + GOTO(out_free, rc); } if (list_empty(&blk_list)) - return 0; + RETURN(0); buf = getdqbuf(); if (!buf) - return -ENOMEM; - ddquot = GETENTRIES(buf); + RETURN(-ENOMEM); + ddquot = (struct lustre_disk_dqblk_v2 *)GETENTRIES(buf, version); list_for_each_entry(blk_item, &blk_list, link) { loff_t ret = 0; - int i; - struct lustre_disk_dqblk fakedquot; + int i, dqblk_sz = lustre_disk_dqblk_sz[version]; + char fakedquot[dqblk_sz]; memset(buf, 0, LUSTRE_DQBLKSIZE); if ((ret = quota_read(fp, inode, type, blk_item->blk, buf))<0) { CDEBUG(D_ERROR, "VFS: Can't read quota tree block %u.\n", blk_item->blk); - rc = ret; - goto out_free; + GOTO(out_free, rc = ret); } - memset(&fakedquot, 0, sizeof(struct lustre_disk_dqblk)); - for (i = 0; i < LUSTRE_DQSTRINBLK; i++) { + memset(fakedquot, 0, dqblk_sz); + for (i = 0; i < lustre_dqstrinblk[version]; i++) { struct dquot_id *dqid; /* skip empty entry */ - if (!memcmp - (&fakedquot, ddquot + i, - sizeof(struct lustre_disk_dqblk))) + if (!memcmp(fakedquot, ddquot + i, dqblk_sz)) continue; dqid = kmalloc(sizeof(*dqid), GFP_NOFS); - if (!dqid) { - rc = -ENOMEM; - goto out_free; - } + if (!dqid) + GOTO(out_free, rc = -ENOMEM); + dqid->di_id = le32_to_cpu(ddquot[i].dqb_id); INIT_LIST_HEAD(&dqid->di_link); list_add(&dqid->di_link, list); @@ -1019,13 +1129,16 @@ out_free: } if (buf) freedqbuf(buf); - return rc; + + RETURN(rc); } -EXPORT_SYMBOL(lustre_check_quota_file); + EXPORT_SYMBOL(lustre_read_quota_info); EXPORT_SYMBOL(lustre_write_quota_info); +EXPORT_SYMBOL(lustre_check_quota_file); EXPORT_SYMBOL(lustre_read_dquot); EXPORT_SYMBOL(lustre_commit_dquot); EXPORT_SYMBOL(lustre_init_quota_info); EXPORT_SYMBOL(lustre_get_qids); +#endif diff --git a/lustre/lvfs/lustre_quota_fmt.h b/lustre/lvfs/lustre_quota_fmt.h index ffdac51..4072509 100644 --- a/lustre/lvfs/lustre_quota_fmt.h +++ b/lustre/lvfs/lustre_quota_fmt.h @@ -41,6 +41,8 @@ #ifndef _LUSTRE_QUOTA_FMT_H #define _LUSTRE_QUOTA_FMT_H +#ifdef HAVE_QUOTA_SUPPORT + #include #include @@ -49,32 +51,49 @@ * Same with quota v2's magic */ #define LUSTRE_INITQMAGICS {\ - 0xd9c01f11, /* USRQUOTA */\ - 0xd9c01927 /* GRPQUOTA */\ + 0xd9c01f11, /** USRQUOTA */\ + 0xd9c01927 /** GRPQUOTA */\ +} + +/* Invalid magics that mark quota file as inconsistent */ +#define LUSTRE_BADQMAGICS {\ + 0xbadbadba, /** USRQUOTA */\ + 0xbadbadba /** GRPQUOTA */\ } -#define LUSTRE_INITQVERSIONS {\ - 0, /* USRQUOTA */\ - 0 /* GRPQUOTA */\ +/* for the verson 2 of lustre_disk_dqblk*/ +#define LUSTRE_INITQVERSIONS_V2 {\ + 1, /* USRQUOTA */\ + 1 /* GRPQUOTA */\ } /* * The following structure defines the format of the disk quota file * (as it appears on disk) - the file is a radix tree 
whose leaves point - * to blocks of these structures. + * to blocks of these structures. for the version 2. */ -struct lustre_disk_dqblk { - __u32 dqb_id; /* id this quota applies to */ - __u32 dqb_ihardlimit; /* absolute limit on allocated inodes */ - __u32 dqb_isoftlimit; /* preferred inode limit */ - __u32 dqb_curinodes; /* current # allocated inodes */ - __u32 dqb_bhardlimit; /* absolute limit on disk space (in QUOTABLOCK_SIZE) */ - __u32 dqb_bsoftlimit; /* preferred limit on disk space (in QUOTABLOCK_SIZE) */ - __u64 dqb_curspace; /* current space occupied (in bytes) */ - __u64 dqb_btime; /* time limit for excessive disk use */ - __u64 dqb_itime; /* time limit for excessive inode use */ +struct lustre_disk_dqblk_v2 { + __u32 dqb_id; /**< id this quota applies to */ + __u32 padding; + __u64 dqb_ihardlimit; /**< absolute limit on allocated inodes */ + __u64 dqb_isoftlimit; /**< preferred inode limit */ + __u64 dqb_curinodes; /**< current # allocated inodes */ + __u64 dqb_bhardlimit; /**< absolute limit on disk space (in QUOTABLOCK_SIZE) */ + __u64 dqb_bsoftlimit; /**< preferred limit on disk space (in QUOTABLOCK_SIZE) */ + __u64 dqb_curspace; /**< current space occupied (in bytes) */ + __u64 dqb_btime; /**< time limit for excessive disk use */ + __u64 dqb_itime; /**< time limit for excessive inode use */ }; +/* Number of entries in one blocks(14 entries) */ +#define LUSTRE_DQSTRINBLK_V2 \ + ((LUSTRE_DQBLKSIZE - sizeof(struct lustre_disk_dqdbheader)) \ + / sizeof(struct lustre_disk_dqblk_v2)) +#define GETENTRIES_V2(buf) (((char *)buf)+sizeof(struct lustre_disk_dqdbheader)) + +#define GETENTRIES(buf,version) ((version == LUSTRE_QUOTA_V2) ? \ + GETENTRIES_V2(buf) : 0) + /* * Here are header structures as written on disk and their in-memory copies */ @@ -117,6 +136,62 @@ static void lprocfs_quotfmt_test_init_vars(struct lprocfs_static_vars *lvars) {} #define LUSTRE_DQBLKSIZE (1 << LUSTRE_DQBLKSIZE_BITS) /* Size of block with quota structures */ #define LUSTRE_DQTREEOFF 1 /* Offset of tree in file in blocks */ #define LUSTRE_DQTREEDEPTH 4 /* Depth of quota tree */ -#define LUSTRE_DQSTRINBLK ((LUSTRE_DQBLKSIZE - sizeof(struct lustre_disk_dqdbheader)) / sizeof(struct lustre_disk_dqblk)) /* Number of entries in one blocks */ +typedef char *dqbuf_t; + +#define GETIDINDEX(id, depth) (((id) >> ((LUSTRE_DQTREEDEPTH-(depth)-1)*8)) & 0xff) + +#define MAX_UL (0xffffffffUL) + +#define lustre_info_dirty(info) test_bit(DQF_INFO_DIRTY_B, &(info)->dqi_flags) + +struct dqblk { + struct list_head link; + uint blk; +}; + +/* come from lustre_fmt_common.c */ +dqbuf_t getdqbuf(void); +void freedqbuf(dqbuf_t buf); +void disk2memdqb(struct lustre_mem_dqblk *m, void *d, + enum lustre_quota_version version); +void lustre_mark_info_dirty(struct lustre_mem_dqinfo *info); +int lustre_init_quota_header(struct lustre_quota_info *lqi, int type, + int fakemagics); +int lustre_init_quota_info_generic(struct lustre_quota_info *lqi, int type, + int fakemagics); +int lustre_read_quota_info(struct lustre_quota_info *lqi, int type); +int lustre_read_quota_file_info(struct file* f, struct lustre_mem_dqinfo* info); +int lustre_write_quota_info(struct lustre_quota_info *lqi, int type); +ssize_t read_blk(struct file *filp, uint blk, dqbuf_t buf); +ssize_t write_blk(struct file *filp, uint blk, dqbuf_t buf); +int get_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info); +int put_free_dqblk(struct file *filp, struct lustre_mem_dqinfo *info, + dqbuf_t buf, uint blk); +int remove_free_dqentry(struct file *filp, + struct 
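The "(14 entries)" in the LUSTRE_DQSTRINBLK_V2 comment follows directly from the sizes: a 1 KB block, minus the data-block header, divided by the 72-byte lustre_disk_dqblk_v2. A quick check of the arithmetic, assuming the usual 16-byte lustre_disk_dqdbheader (its definition is not shown in this hunk):

        #include <assert.h>

        int main(void)
        {
                int blk = 1024;         /* LUSTRE_DQBLKSIZE */
                int hdr = 16;           /* sizeof(lustre_disk_dqdbheader), assumed */
                int dqb = 4 + 4 + 8 * 8;/* lustre_disk_dqblk_v2: 72 bytes */

                assert((blk - hdr) / dqb == 14);
                return 0;
        }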
lustre_mem_dqinfo *info, dqbuf_t buf, + uint blk); +int insert_free_dqentry(struct file *filp, + struct lustre_mem_dqinfo *info, dqbuf_t buf, + uint blk); +ssize_t quota_read(struct file *file, struct inode *inode, int type, + uint blk, dqbuf_t buf); +int walk_tree_dqentry(struct file *filp, struct inode *inode, int type, + uint blk, int depth, struct list_head *list); +int check_quota_file(struct file *f, struct inode *inode, int type, + lustre_quota_version_t version); +int lustre_check_quota_file(struct lustre_quota_info *lqi, int type); +int lustre_read_dquot(struct lustre_dquot *dquot); +int lustre_commit_dquot(struct lustre_dquot *dquot); +int lustre_init_quota_info(struct lustre_quota_info *lqi, int type); +int lustre_get_qids(struct file *fp, struct inode *inode, int type, + struct list_head *list); + +#define LUSTRE_ADMIN_QUOTAFILES_V2 {\ + "admin_quotafile_v2.usr", /* user admin quotafile */\ + "admin_quotafile_v2.grp" /* group admin quotafile */\ +} + +#define LUSTRE_OPQFILES_NAMES_V2 { "lquota_v2.user", "lquota_v2.group" } #endif /* lustre_quota_fmt.h */ +#endif diff --git a/lustre/lvfs/lvfs_linux.c b/lustre/lvfs/lvfs_linux.c index 0520730..5d07875 100644 --- a/lustre/lvfs/lvfs_linux.c +++ b/lustre/lvfs/lvfs_linux.c @@ -231,7 +231,7 @@ EXPORT_SYMBOL(simple_mknod); /* utility to make a directory */ struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, - char *name, int mode, int fix) + const char *name, int mode, int fix) { struct dentry *dchild; int err = 0; @@ -254,7 +254,7 @@ struct dentry *simple_mkdir(struct dentry *dir, struct vfsmount *mnt, /* Fixup directory permissions if necessary */ if (fix && (old_mode & S_IALLUGO) != (mode & S_IALLUGO)) { - CDEBUG(D_CONFIG, + CDEBUG(D_CONFIG, "fixing permissions on %s from %o to %o\n", name, old_mode, mode); dchild->d_inode->i_mode = (mode & S_IALLUGO) | @@ -279,7 +279,7 @@ out_up: EXPORT_SYMBOL(simple_mkdir); /* utility to rename a file */ -int lustre_rename(struct dentry *dir, struct vfsmount *mnt, +int lustre_rename(struct dentry *dir, struct vfsmount *mnt, char *oldname, char *newname) { struct dentry *dchild_old, *dchild_new; @@ -287,21 +287,21 @@ int lustre_rename(struct dentry *dir, struct vfsmount *mnt, ENTRY; ASSERT_KERNEL_CTXT("kernel doing rename outside kernel context\n"); - CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", + CDEBUG(D_INODE, "renaming file %.*s to %.*s\n", (int)strlen(oldname), oldname, (int)strlen(newname), newname); dchild_old = ll_lookup_one_len(oldname, dir, strlen(oldname)); if (IS_ERR(dchild_old)) RETURN(PTR_ERR(dchild_old)); - if (!dchild_old->d_inode) + if (!dchild_old->d_inode) GOTO(put_old, err = -ENOENT); dchild_new = ll_lookup_one_len(newname, dir, strlen(newname)); if (IS_ERR(dchild_new)) GOTO(put_old, err = PTR_ERR(dchild_new)); - err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, + err = ll_vfs_rename(dir->d_inode, dchild_old, mnt, dir->d_inode, dchild_new, mnt); dput(dchild_new); @@ -430,7 +430,6 @@ int dev_check_rdonly(lvfs_sbdev_type dev); void __lvfs_set_rdonly(lvfs_sbdev_type dev, lvfs_sbdev_type jdev) { - lvfs_sbdev_sync(dev); if (jdev && (jdev != dev)) { CDEBUG(D_IOCTL | D_HA, "set journal dev %lx rdonly\n", (long)jdev); @@ -482,7 +481,7 @@ void obd_update_maxusage() if (max2 > obd_max_alloc) obd_max_alloc = max2; spin_unlock(&obd_updatemax_lock); - + } __u64 obd_memory_max(void) diff --git a/lustre/lvfs/quotafmt_test.c b/lustre/lvfs/quotafmt_test.c index 2b37387..b360570 100644 --- a/lustre/lvfs/quotafmt_test.c +++ b/lustre/lvfs/quotafmt_test.c @@ -57,6 +57,8 @@ 
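The quotfmt_test.c hunks that follow flip the sense of several checks because check_quota_file and lustre_check_quota_file now return 0 on success and a negative errno on mismatch, instead of the old 1/0 boolean. The new convention in miniature (hypothetical stand-ins):

        #include <assert.h>

        static int check_v2_good(void) { return 0;   }  /* header matched */
        static int check_v2_bad(void)  { return -22; }  /* -EINVAL: bad magic/version */

        int main(void)
        {
                assert(check_v2_good() == 0);   /* success is now rc == 0 ... */
                assert(check_v2_bad() < 0);     /* ... and failure a negative errno */
                return 0;
        }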
#include "lustre_quota_fmt.h" +#ifdef HAVE_QUOTA_SUPPORT + char *test_quotafile[2] = { "usrquota_test", "grpquota_test" }; static int quotfmt_initialize(struct lustre_quota_info *lqi, @@ -65,7 +67,7 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi, { struct lustre_disk_dqheader dqhead; static const uint quota_magics[] = LUSTRE_INITQMAGICS; - static const uint quota_versions[] = LUSTRE_INITQVERSIONS; + static const uint quota_versions[] = LUSTRE_INITQVERSIONS_V2; struct file *fp; struct inode *parent_inode = tgt->obd_lvfs_ctxt.pwd->d_inode; size_t size; @@ -84,7 +86,7 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi, LOCK_INODE_MUTEX_PARENT(parent_inode); de = lookup_one_len(name, tgt->obd_lvfs_ctxt.pwd, namelen); if (!IS_ERR(de) && de->d_inode) - ll_vfs_unlink(parent_inode, de, + ll_vfs_unlink(parent_inode, de, tgt->obd_lvfs_ctxt.pwdmnt); if (!IS_ERR(de)) dput(de); @@ -107,7 +109,7 @@ static int quotfmt_initialize(struct lustre_quota_info *lqi, sizeof(struct lustre_disk_dqheader), &offset); if (size != sizeof(struct lustre_disk_dqheader)) { - CERROR("error writing quoafile header %s (rc = %d)\n", + CERROR("error writing quotafile header %s (rc = %d)\n", name, rc); rc = size; break; @@ -166,7 +168,7 @@ static int quotfmt_test_1(struct lustre_quota_info *lqi) ENTRY; for (i = 0; i < MAXQUOTAS; i++) { - if (!lustre_check_quota_file(lqi, i)) + if (lustre_check_quota_file(lqi, i)) RETURN(-EINVAL); } RETURN(0); @@ -256,7 +258,7 @@ static void put_rand_dquot(struct lustre_dquot *dquot) static int write_check_dquot(struct lustre_quota_info *lqi) { struct lustre_dquot *dquot; - struct mem_dqblk dqblk; + struct lustre_mem_dqblk dqblk; int rc = 0; ENTRY; @@ -380,7 +382,7 @@ static int quotfmt_test_4(struct lustre_quota_info *lqi) static int quotfmt_test_5(struct lustre_quota_info *lqi) { -#ifndef KERNEL_SUPPORTS_QUOTA_READ +#ifndef KERNEL_SUPPORTS_QUOTA_READ int i, rc = 0; for (i = USRQUOTA; i < MAXQUOTAS && !rc; i++) { @@ -541,3 +543,5 @@ MODULE_LICENSE("GPL"); module_init(quotfmt_test_init); module_exit(quotfmt_test_exit); + +#endif /* HAVE_QUOTA_SUPPORT */ diff --git a/lustre/mdc/mdc_internal.h b/lustre/mdc/mdc_internal.h index 62b85bf..c259f2b 100644 --- a/lustre/mdc/mdc_internal.h +++ b/lustre/mdc/mdc_internal.h @@ -78,25 +78,6 @@ void mdc_close_pack(struct ptlrpc_request *req, struct md_op_data *op_data); void mdc_enter_request(struct client_obd *cli); void mdc_exit_request(struct client_obd *cli); -static inline int client_is_remote(struct obd_export *exp) -{ - struct obd_import *imp = class_exp2cliimp(exp); - - if (imp->imp_connect_flags_orig & OBD_CONNECT_RMT_CLIENT) { - if (!(imp->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_RMT_CLIENT)) - return 0; - else - return 1; - } else { - if (!(imp->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_LCL_CLIENT)) - return 1; - else - return 0; - } -} - /* mdc/mdc_locks.c */ int mdc_set_lock_data(struct obd_export *exp, __u64 *lockh, void *data); diff --git a/lustre/mdc/mdc_locks.c b/lustre/mdc/mdc_locks.c index da91c7e..bf90727 100644 --- a/lustre/mdc/mdc_locks.c +++ b/lustre/mdc/mdc_locks.c @@ -155,7 +155,7 @@ ldlm_mode_t mdc_lock_match(struct obd_export *exp, int flags, fid_build_reg_res_name(fid, &res_id); rc = ldlm_lock_match(class_exp2obd(exp)->obd_namespace, flags, - &res_id, type, policy, mode, lockh); + &res_id, type, policy, mode, lockh, 0); RETURN(rc); } @@ -241,7 +241,7 @@ static struct ptlrpc_request *mdc_intent_open_pack(struct obd_export *exp, struct ptlrpc_request *req; struct obd_device *obddev = 
class_exp2obd(exp); struct ldlm_intent *lit; - int joinfile = !!((it->it_flags & O_JOIN_FILE) && + int joinfile = !!((it->it_flags & O_JOIN_FILE) && op_data->op_data); CFS_LIST_HEAD(cancels); int count = 0; @@ -812,7 +812,7 @@ static int mdc_finish_intent_lock(struct obd_export *exp, memcpy(&old_lock, lockh, sizeof(*lockh)); if (ldlm_lock_match(NULL, LDLM_FL_BLOCK_GRANTED, NULL, - LDLM_IBITS, &policy, LCK_NL, &old_lock)) { + LDLM_IBITS, &policy, LCK_NL, &old_lock, 0)) { ldlm_lock_decref_and_cancel(lockh, it->d.lustre.it_lock_mode); memcpy(lockh, &old_lock, sizeof(old_lock)); @@ -1024,7 +1024,7 @@ int mdc_intent_getattr_async(struct obd_export *exp, req->rq_async_args.pointer_arg[1] = minfo; req->rq_async_args.pointer_arg[2] = einfo; req->rq_interpret_reply = mdc_intent_getattr_async_interpret; - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } @@ -1043,8 +1043,8 @@ int mdc_revalidate_lock(struct obd_export *exp, ENTRY; fid_build_reg_res_name(fid, &res_id); - /* As not all attributes are kept under update lock, e.g. - owner/group/acls are under lookup lock, we need both + /* As not all attributes are kept under update lock, e.g. + owner/group/acls are under lookup lock, we need both ibits for GETATTR. */ policy.l_inodebits.bits = (it->it_op == IT_GETATTR) ? MDS_INODELOCK_UPDATE | MDS_INODELOCK_LOOKUP : @@ -1052,7 +1052,7 @@ int mdc_revalidate_lock(struct obd_export *exp, mode = ldlm_lock_match(exp->exp_obd->obd_namespace, LDLM_FL_BLOCK_GRANTED, &res_id, LDLM_IBITS, - &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh); + &policy, LCK_CR|LCK_CW|LCK_PR|LCK_PW, &lockh, 0); if (mode) { it->d.lustre.it_lock_handle = lockh.cookie; it->d.lustre.it_lock_mode = mode; diff --git a/lustre/mdc/mdc_reint.c b/lustre/mdc/mdc_reint.c index 12465cd..881a223 100644 --- a/lustre/mdc/mdc_reint.c +++ b/lustre/mdc/mdc_reint.c @@ -131,7 +131,7 @@ int mdc_setattr(struct obd_export *exp, struct md_op_data *op_data, bits = MDS_INODELOCK_UPDATE; if (op_data->op_attr.ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) bits |= MDS_INODELOCK_LOOKUP; - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && (fid_is_sane(&op_data->op_fid1))) count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, LCK_EX, bits); @@ -228,7 +228,7 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data, } } - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && (fid_is_sane(&op_data->op_fid1))) count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, LCK_EX, @@ -264,7 +264,7 @@ int mdc_create(struct obd_export *exp, struct md_op_data *op_data, level = LUSTRE_IMP_FULL; resend: rc = mdc_reint(req, exp->exp_obd->u.cli.cl_rpc_lock, level); - + /* Resend if we were told to. 
*/ if (rc == -ERESTARTSYS) { level = LUSTRE_IMP_RECOVER; @@ -298,12 +298,12 @@ int mdc_unlink(struct obd_export *exp, struct md_op_data *op_data, LASSERT(req == NULL); - if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && + if ((op_data->op_flags & MF_MDC_CANCEL_FID1) && (fid_is_sane(&op_data->op_fid1))) count = mdc_resource_get_unused(exp, &op_data->op_fid1, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && (fid_is_sane(&op_data->op_fid3))) count += mdc_resource_get_unused(exp, &op_data->op_fid3, &cancels, LCK_EX, @@ -407,7 +407,7 @@ int mdc_rename(struct obd_export *exp, struct md_op_data *op_data, count += mdc_resource_get_unused(exp, &op_data->op_fid2, &cancels, LCK_EX, MDS_INODELOCK_UPDATE); - if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && + if ((op_data->op_flags & MF_MDC_CANCEL_FID3) && (fid_is_sane(&op_data->op_fid3))) count += mdc_resource_get_unused(exp, &op_data->op_fid3, &cancels, LCK_EX, diff --git a/lustre/mdc/mdc_request.c b/lustre/mdc/mdc_request.c index e5b01f0..ff4cc21 100644 --- a/lustre/mdc/mdc_request.c +++ b/lustre/mdc/mdc_request.c @@ -57,8 +57,6 @@ #include #include "mdc_internal.h" -static quota_interface_t *quota_interface; - #define REQUEST_MINOR 244 static quota_interface_t *quota_interface; @@ -66,25 +64,27 @@ extern quota_interface_t mdc_quota_interface; static int mdc_cleanup(struct obd_device *obd); -static struct obd_capa *mdc_unpack_capa(struct ptlrpc_request *req, - const struct req_msg_field *field) +int mdc_unpack_capa(struct obd_export *exp, struct ptlrpc_request *req, + const struct req_msg_field *field, struct obd_capa **oc) { struct lustre_capa *capa; - struct obd_capa *oc; + struct obd_capa *c; + ENTRY; /* swabbed already in mdc_enqueue */ capa = req_capsule_server_get(&req->rq_pill, field); if (capa == NULL) - return ERR_PTR(-EPROTO); + RETURN(-EPROTO); - oc = alloc_capa(CAPA_SITE_CLIENT); - if (!oc) { + c = alloc_capa(CAPA_SITE_CLIENT); + if (IS_ERR(c)) { CDEBUG(D_INFO, "alloc capa failed!\n"); - return ERR_PTR(-ENOMEM); + RETURN(PTR_ERR(c)); + } else { + c->c_capa = *capa; + *oc = c; + RETURN(0); } - oc->c_capa = *capa; - - return oc; } /* Helper that implements most of mdc_getstatus and signal_completed_replay. */ @@ -117,12 +117,9 @@ static int send_getstatus(struct obd_import *imp, struct lu_fid *rootfid, GOTO(out, rc = -EPROTO); if (body->valid & OBD_MD_FLMDSCAPA) { - struct obd_capa *oc; - - oc = mdc_unpack_capa(req, &RMF_CAPA1); - if (IS_ERR(oc)) - GOTO(out, rc = PTR_ERR(oc)); - *pc = oc; + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, pc); + if (rc) + GOTO(out, rc); } *rootfid = body->fid1; @@ -150,7 +147,7 @@ int mdc_getstatus(struct obd_export *exp, struct lu_fid *rootfid, * from server. Even for cases when acl_size and md_size is zero, RPC header * will contain 4 fields and RPC itself will contain zero size fields. This is * because mdt_getattr*() _always_ returns 4 fields, but if acl is not needed - * and thus zero, it shirinks it, making zero size. The same story about + * and thus zero, it shrinks it, making zero size. The same story about * md_size. And this is course of problem when client waits for smaller number * of fields. This issue will be fixed later when client gets aware of RPC * layouts. 
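mdc_unpack_capa above is converted from the ERR_PTR style to a plain int return with an out parameter, which is what lets the md_ops table further down export it as m_unpack_capa. A generic sketch of that refactor pattern, using userspace stand-ins (EPROTO and ENOMEM match the errnos in the patched code):

        #include <errno.h>
        #include <stdlib.h>

        struct capa { int site; };

        /* before: returned struct capa * or an ERR_PTR-encoded errno;
         * after: plain rc plus an out parameter the caller tests directly */
        static int unpack_capa(const void *wire, struct capa **out)
        {
                struct capa *c;

                if (wire == NULL)
                        return -EPROTO;         /* field missing from the reply */

                c = malloc(sizeof(*c));
                if (c == NULL)
                        return -ENOMEM;

                c->site = 1;                    /* stands in for CAPA_SITE_CLIENT */
                *out = c;
                return 0;
        }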
--umka @@ -585,28 +582,34 @@ int mdc_get_lustre_md(struct obd_export *exp, struct ptlrpc_request *req, } } if (md->body->valid & OBD_MD_FLMDSCAPA) { - struct obd_capa *oc = mdc_unpack_capa(req, &RMF_CAPA1); + struct obd_capa *oc = NULL; - if (IS_ERR(oc)) - GOTO(out, rc = PTR_ERR(oc)); + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA1, &oc); + if (rc) + GOTO(out, rc); md->mds_capa = oc; } if (md->body->valid & OBD_MD_FLOSSCAPA) { - struct obd_capa *oc = mdc_unpack_capa(req, &RMF_CAPA2); + struct obd_capa *oc = NULL; - if (IS_ERR(oc)) - GOTO(out, rc = PTR_ERR(oc)); + rc = mdc_unpack_capa(NULL, req, &RMF_CAPA2, &oc); + if (rc) + GOTO(out, rc); md->oss_capa = oc; } EXIT; out: if (rc) { - if (md->oss_capa) - free_capa(md->oss_capa); - if (md->mds_capa) - free_capa(md->mds_capa); + if (md->oss_capa) { + capa_put(md->oss_capa); + md->oss_capa = NULL; + } + if (md->mds_capa) { + capa_put(md->mds_capa); + md->mds_capa = NULL; + } #ifdef CONFIG_FS_POSIX_ACL posix_acl_release(md->posix_acl); #endif @@ -1172,6 +1175,10 @@ int mdc_set_info_async(struct obd_export *exp, rc = do_set_info_async(exp, keylen, key, vallen, val, set); RETURN(rc); } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(exp->exp_obd); + RETURN(0); + } if (KEY_IS(KEY_FLUSH_CTX)) { sptlrpc_import_flush_my_ctx(imp); RETURN(0); @@ -1610,11 +1617,12 @@ static int mdc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) client import will not have been cleaned. */ if (obd->u.cli.cl_import) { struct obd_import *imp; + down_write(&obd->u.cli.cl_sem); imp = obd->u.cli.cl_import; CERROR("client import never connected\n"); ptlrpc_invalidate_import(imp); - ptlrpc_free_rq_pool(imp->imp_rq_pool); class_destroy_import(imp); + up_write(&obd->u.cli.cl_sem); obd->u.cli.cl_import = NULL; } rc = obd_llog_finish(obd, 0); @@ -1682,14 +1690,12 @@ static int mdc_process_config(struct obd_device *obd, obd_count len, void *buf) int rc = 0; lprocfs_mdc_init_vars(&lvars); - switch (lcfg->lcfg_command) { - case LCFG_SPTLRPC_CONF: - rc = sptlrpc_cliobd_process_config(obd, lcfg); - break; default: rc = class_process_proc_param(PARAM_MDC, lvars.obd_vars, lcfg, obd); + if (rc > 0) + rc = 0; break; } return(rc); @@ -1784,7 +1790,7 @@ static int mdc_renew_capa(struct obd_export *exp, struct obd_capa *oc, req->rq_async_args.pointer_arg[0] = oc; req->rq_async_args.pointer_arg[1] = cb; req->rq_interpret_reply = mdc_interpret_renew_capa; - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } @@ -1863,13 +1869,12 @@ struct md_ops mdc_md_ops = { .m_set_open_replay_data = mdc_set_open_replay_data, .m_clear_open_replay_data = mdc_clear_open_replay_data, .m_renew_capa = mdc_renew_capa, + .m_unpack_capa = mdc_unpack_capa, .m_get_remote_perm = mdc_get_remote_perm, .m_intent_getattr_async = mdc_intent_getattr_async, .m_revalidate_lock = mdc_revalidate_lock }; -extern quota_interface_t mdc_quota_interface; - int __init mdc_init(void) { int rc; diff --git a/lustre/mdd/Makefile.in b/lustre/mdd/Makefile.in index f1568ea..bfecc0c 100644 --- a/lustre/mdd/Makefile.in +++ b/lustre/mdd/Makefile.in @@ -1,6 +1,6 @@ MODULES := mdd mdd-objs := mdd_object.o mdd_lov.o mdd_orphans.o mdd_lproc.o mdd_dir.o -mdd-objs += mdd_device.o mdd_trans.o mdd_permission.o mdd_lock.o +mdd-objs += mdd_device.o mdd_trans.o mdd_permission.o mdd_lock.o mdd_quota.o EXTRA_PRE_CFLAGS := -I@LINUX@/fs -I@LDISKFS_DIR@ -I@LDISKFS_DIR@/ldiskfs diff --git a/lustre/mdd/mdd_device.c b/lustre/mdd/mdd_device.c index 92b1077..76ef1c6 100644 --- a/lustre/mdd/mdd_device.c +++ 
b/lustre/mdd/mdd_device.c @@ -53,6 +53,8 @@ #include #include +#include +#include #include #include #include @@ -62,7 +64,8 @@ const struct md_device_operations mdd_ops; -static const char *mdd_root_dir_name = "root"; +static const char mdd_root_dir_name[] = "ROOT"; + static int mdd_device_init(const struct lu_env *env, struct lu_device *d, const char *name, struct lu_device *next) { @@ -87,7 +90,7 @@ static int mdd_device_init(const struct lu_env *env, struct lu_device *d, static struct lu_device *mdd_device_fini(const struct lu_env *env, struct lu_device *d) { - struct mdd_device *mdd = lu2mdd_dev(d); + struct mdd_device *mdd = lu2mdd_dev(d); struct lu_device *next = &mdd->mdd_child->dd_lu_dev; int rc; @@ -99,25 +102,6 @@ static struct lu_device *mdd_device_fini(const struct lu_env *env, return next; } -static int mdd_mount(const struct lu_env *env, struct mdd_device *mdd) -{ - int rc; - struct dt_object *root; - ENTRY; - - dt_txn_callback_add(mdd->mdd_child, &mdd->mdd_txn_cb); - root = dt_store_open(env, mdd->mdd_child, mdd_root_dir_name, - &mdd->mdd_root_fid); - if (!IS_ERR(root)) { - LASSERT(root != NULL); - lu_object_put(env, &root->do_lu); - rc = orph_index_init(env, mdd); - } else - rc = PTR_ERR(root); - - RETURN(rc); -} - static void mdd_device_shutdown(const struct lu_env *env, struct mdd_device *m, struct lustre_cfg *cfg) { @@ -146,7 +130,7 @@ static int mdd_process_config(const struct lu_env *env, lprocfs_mdd_init_vars(&lvars); rc = class_process_proc_param(PARAM_MDD, lvars.obd_vars, cfg,m); - if (rc == -ENOSYS) + if (rc > 0 || rc == -ENOSYS) /* we don't understand; pass it on */ rc = next->ld_ops->ldo_process_config(env, next, cfg); break; @@ -162,9 +146,6 @@ static int mdd_process_config(const struct lu_env *env, CERROR("lov init error %d \n", rc); GOTO(out, rc); } - rc = mdd_mount(env, m); - if (rc) - GOTO(out, rc); rc = mdd_txn_init_credits(env, m); break; case LCFG_CLEANUP: @@ -243,10 +224,39 @@ static int mdd_recovery_complete(const struct lu_env *env, RETURN(rc); } +static int mdd_prepare(const struct lu_env *env, + struct lu_device *pdev, + struct lu_device *cdev) +{ + struct mdd_device *mdd = lu2mdd_dev(cdev); + struct lu_device *next = &mdd->mdd_child->dd_lu_dev; + struct dt_object *root; + int rc; + + ENTRY; + rc = next->ld_ops->ldo_prepare(env, cdev, next); + if (rc) + GOTO(out, rc); + + dt_txn_callback_add(mdd->mdd_child, &mdd->mdd_txn_cb); + root = dt_store_open(env, mdd->mdd_child, "", mdd_root_dir_name, + &mdd->mdd_root_fid); + if (!IS_ERR(root)) { + LASSERT(root != NULL); + lu_object_put(env, &root->do_lu); + rc = orph_index_init(env, mdd); + } else + rc = PTR_ERR(root); + +out: + RETURN(rc); +} + const struct lu_device_operations mdd_lu_ops = { - .ldo_object_alloc = mdd_object_alloc, + .ldo_object_alloc = mdd_object_alloc, .ldo_process_config = mdd_process_config, - .ldo_recovery_complete = mdd_recovery_complete + .ldo_recovery_complete = mdd_recovery_complete, + .ldo_prepare = mdd_prepare, }; /* @@ -268,7 +278,7 @@ static int mdd_root_get(const struct lu_env *env, static int mdd_statfs(const struct lu_env *env, struct md_device *m, struct kstatfs *sfs) { - struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); int rc; ENTRY; @@ -284,7 +294,7 @@ static int mdd_statfs(const struct lu_env *env, struct md_device *m, static int mdd_maxsize_get(const struct lu_env *env, struct md_device *m, int *md_size, int *cookie_size) { - struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct mdd_device *mdd = 
lu2mdd_dev(&m->md_lu_dev); ENTRY; *md_size = mdd_lov_mdsize(env, mdd); @@ -297,7 +307,7 @@ static int mdd_init_capa_ctxt(const struct lu_env *env, struct md_device *m, int mode, unsigned long timeout, __u32 alg, struct lustre_capa_key *keys) { - struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); struct mds_obd *mds = &mdd2obd_dev(mdd)->u.mds; int rc; ENTRY; @@ -312,7 +322,7 @@ static int mdd_update_capa_key(const struct lu_env *env, struct md_device *m, struct lustre_capa_key *key) { - struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); struct obd_export *lov_exp = mdd2obd_dev(mdd)->u.mds.mds_osc_exp; int rc; ENTRY; @@ -406,6 +416,25 @@ const struct md_device_operations mdd_ops = { .mdo_maxsize_get = mdd_maxsize_get, .mdo_init_capa_ctxt = mdd_init_capa_ctxt, .mdo_update_capa_key= mdd_update_capa_key, +#ifdef HAVE_QUOTA_SUPPORT + .mdo_quota = { + .mqo_notify = mdd_quota_notify, + .mqo_setup = mdd_quota_setup, + .mqo_cleanup = mdd_quota_cleanup, + .mqo_recovery = mdd_quota_recovery, + .mqo_check = mdd_quota_check, + .mqo_on = mdd_quota_on, + .mqo_off = mdd_quota_off, + .mqo_setinfo = mdd_quota_setinfo, + .mqo_getinfo = mdd_quota_getinfo, + .mqo_setquota = mdd_quota_setquota, + .mqo_getquota = mdd_quota_getquota, + .mqo_getoinfo = mdd_quota_getoinfo, + .mqo_getoquota = mdd_quota_getoquota, + .mqo_invalidate = mdd_quota_invalidate, + .mqo_finvalidate = mdd_quota_finvalidate + } +#endif }; static struct lu_device_type_operations mdd_device_type_ops = { @@ -446,16 +475,45 @@ static void mdd_key_fini(const struct lu_context *ctx, /* context key: mdd_thread_key */ LU_CONTEXT_KEY_DEFINE(mdd, LCT_MD_THREAD); +static struct lu_local_obj_desc llod_capa_key = { + .llod_name = CAPA_KEYS, + .llod_oid = MDD_CAPA_KEYS_OID, + .llod_is_index = 0, +}; + +static struct lu_local_obj_desc llod_mdd_orphan = { + .llod_name = orph_index_name, + .llod_oid = MDD_ORPHAN_OID, + .llod_is_index = 1, + .llod_feat = &dt_directory_features, +}; + +static struct lu_local_obj_desc llod_mdd_root = { + .llod_name = mdd_root_dir_name, + .llod_oid = MDD_ROOT_INDEX_OID, + .llod_is_index = 1, + .llod_feat = &dt_directory_features, +}; + static int __init mdd_mod_init(void) { struct lprocfs_static_vars lvars; lprocfs_mdd_init_vars(&lvars); + + llo_local_obj_register(&llod_capa_key); + llo_local_obj_register(&llod_mdd_orphan); + llo_local_obj_register(&llod_mdd_root); + return class_register_type(&mdd_obd_device_ops, NULL, lvars.module_vars, LUSTRE_MDD_NAME, &mdd_device_type); } static void __exit mdd_mod_exit(void) { + llo_local_obj_unregister(&llod_capa_key); + llo_local_obj_unregister(&llod_mdd_orphan); + llo_local_obj_unregister(&llod_mdd_root); + class_unregister_type(LUSTRE_MDD_NAME); } diff --git a/lustre/mdd/mdd_dir.c b/lustre/mdd/mdd_dir.c index b67784a..fa56fcd 100644 --- a/lustre/mdd/mdd_dir.c +++ b/lustre/mdd/mdd_dir.c @@ -75,7 +75,7 @@ static int __mdd_lookup_locked(const struct lu_env *env, struct md_object *pobj, const struct lu_name *lname, struct lu_fid* fid, int mask) { - char *name = lname->ln_name; + const char *name = lname->ln_name; struct mdd_object *mdd_obj = md2mdd_obj(pobj); struct dynlock_handle *dlh; int rc; @@ -232,7 +232,7 @@ static int mdd_dir_is_empty(const struct lu_env *env, RETURN(-ENOTDIR); iops = &obj->do_index_ops->dio_it; - it = iops->init(env, obj, 0, BYPASS_CAPA); + it = iops->init(env, obj, BYPASS_CAPA); if (it != NULL) { result = iops->get(env, it, (const void *)""); if (result > 0) { 
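mdd_mod_init above now registers descriptors for the well-known local objects (ROOT, the orphan index, the capa keys) instead of creating them ad hoc; llo_local_obj_register presumably just queues each descriptor on a global list that the generic setup path walks. A hypothetical sketch of that register/unregister pattern:

        #include <stddef.h>

        struct local_obj_desc {
                const char *name;
                unsigned int oid;
                int is_index;
                struct local_obj_desc *next;
        };

        static struct local_obj_desc *llo_list;

        static void llo_register(struct local_obj_desc *d)
        {
                d->next = llo_list;             /* push onto the global list */
                llo_list = d;
        }

        static void llo_unregister(struct local_obj_desc *d)
        {
                struct local_obj_desc **p;

                for (p = &llo_list; *p; p = &(*p)->next)
                        if (*p == d) {
                                *p = d->next;   /* unlink */
                                break;
                        }
        }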
@@ -269,7 +269,7 @@ static int __mdd_may_link(const struct lu_env *env, struct mdd_object *obj) /* * Subdir count limitation can be broken through. - */ + */ if (la->la_nlink >= m->mdd_dt_conf.ddp_max_nlink && !S_ISDIR(la->la_mode)) RETURN(-EMLINK); @@ -348,19 +348,19 @@ static inline int mdd_is_sticky(const struct lu_env *env, rc = mdd_la_get(env, pobj, tmp_la, BYPASS_CAPA); if (rc) return rc; - + if (!(tmp_la->la_mode & S_ISVTX) || (tmp_la->la_uid == uc->mu_fsuid)) return 0; } rc = mdd_la_get(env, cobj, tmp_la, BYPASS_CAPA); - if (rc) + if (rc) return rc; - + if (tmp_la->la_uid == uc->mu_fsuid) return 0; - + return !mdd_capable(uc, CFS_CAP_FOWNER); } @@ -458,15 +458,6 @@ int mdd_link_sanity_check(const struct lu_env *env, RETURN(rc); } -const struct dt_rec *__mdd_fid_rec(const struct lu_env *env, - const struct lu_fid *fid) -{ - struct lu_fid_pack *pack = &mdd_env_info(env)->mti_pack; - - fid_pack(pack, fid, &mdd_env_info(env)->mti_fid2); - return (const struct dt_rec *)pack; -} - /** * If subdir count reaches ddp_max_nlink, then enable MNLINK_OBJ flag and * assign i_nlink to 1, which means the i_nlink for subdir count is no longer reliable @@ -514,10 +505,13 @@ static int __mdd_index_insert(const struct lu_env *env, struct mdd_object *pobj, ENTRY; if (dt_try_as_dir(env, next)) { + struct md_ucred *uc = md_ucred(env); + rc = next->do_index_ops->dio_insert(env, next, __mdd_fid_rec(env, lf), (const struct dt_key *)name, - handle, capa); + handle, capa, uc->mu_cap & + CFS_CAP_SYS_RESOURCE_MASK); } else { rc = -ENOTDIR; } @@ -570,10 +564,13 @@ __mdd_index_insert_only(const struct lu_env *env, struct mdd_object *pobj, ENTRY; if (dt_try_as_dir(env, next)) { + struct md_ucred *uc = md_ucred(env); + rc = next->do_index_ops->dio_insert(env, next, __mdd_fid_rec(env, lf), (const struct dt_key *)name, - handle, capa); + handle, capa, uc->mu_cap & + CFS_CAP_SYS_RESOURCE_MASK); } else { rc = -ENOTDIR; } @@ -584,20 +581,42 @@ static int mdd_link(const struct lu_env *env, struct md_object *tgt_obj, struct md_object *src_obj, const struct lu_name *lname, struct md_attr *ma) { - char *name = lname->ln_name; + const char *name = lname->ln_name; struct lu_attr *la = &mdd_env_info(env)->mti_la_for_fix; struct mdd_object *mdd_tobj = md2mdd_obj(tgt_obj); struct mdd_object *mdd_sobj = md2mdd_obj(src_obj); struct mdd_device *mdd = mdo2mdd(src_obj); struct dynlock_handle *dlh; struct thandle *handle; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, rec_pending = 0; +#endif int rc; ENTRY; +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_tobj, la_tmp, BYPASS_CAPA); + if (!rc) { + quota_opc = FSFILT_OP_LINK; + mdd_quota_wrapper(la_tmp, qids); + /* get block quota for parent */ + lquota_chkquota(mds_quota_interface_ref, obd, + qids[USRQUOTA], qids[GRPQUOTA], 1, + &rec_pending, NULL, LQUOTA_FLAGS_BLK); + } + } +#endif + mdd_txn_param_build(env, mdd, MDD_TXN_LINK_OP); handle = mdd_trans_start(env, mdd); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(out_pending, rc = PTR_ERR(handle)); dlh = mdd_pdo_write_lock(env, mdd_tobj, name, MOR_TGT_CHILD); if (dlh == NULL) @@ -632,6 +651,19 @@ out_unlock: mdd_pdo_write_unlock(env, mdd_tobj, dlh); out_trans: mdd_trans_stop(env, mdd, rc, handle); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) { + if (rec_pending) + lquota_pending_commit(mds_quota_interface_ref, obd,
+ qids[USRQUOTA], qids[GRPQUOTA], + 1, 1); + /* Trigger dqacq for the parent owner. If failed, + * the next call for lquota_chkquota will process it. */ + lquota_adjust(mds_quota_interface_ref, obd, 0, qids, rc, + quota_opc); + } +#endif return rc; } @@ -641,23 +673,28 @@ int mdd_finish_unlink(const struct lu_env *env, struct thandle *th) { int rc; + int reset = 1; ENTRY; rc = mdd_iattr_get(env, obj, ma); if (rc == 0 && ma->ma_attr.la_nlink == 0) { /* add new orphan and the object - * will be deleted during the object_put() */ - if (__mdd_orphan_add(env, obj, th) == 0) - obj->mod_flags |= ORPHAN_OBJ; + * will be deleted during mdd_close() */ + if (obj->mod_count) { + rc = __mdd_orphan_add(env, obj, th); + if (rc == 0) + obj->mod_flags |= ORPHAN_OBJ; + } obj->mod_flags |= DEAD_OBJ; - if (obj->mod_count == 0) + if (!(obj->mod_flags & ORPHAN_OBJ)) { rc = mdd_object_kill(env, obj, ma); - else - /* clear MA_LOV | MA_COOKIE, if we do not - * unlink it in case we get it somewhere */ - ma->ma_valid &= ~(MA_LOV | MA_COOKIE); - } else + if (rc == 0) + reset = 0; + } + + } + if (reset) ma->ma_valid &= ~(MA_LOV | MA_COOKIE); RETURN(rc); @@ -682,13 +719,20 @@ static int mdd_unlink(const struct lu_env *env, struct md_object *pobj, struct md_object *cobj, const struct lu_name *lname, struct md_attr *ma) { - char *name = lname->ln_name; + const char *name = lname->ln_name; struct lu_attr *la = &mdd_env_info(env)->mti_la_for_fix; struct mdd_object *mdd_pobj = md2mdd_obj(pobj); struct mdd_object *mdd_cobj = md2mdd_obj(cobj); struct mdd_device *mdd = mdo2mdd(pobj); struct dynlock_handle *dlh; struct thandle *handle; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qcids[MAXQUOTAS] = { 0, 0 }; + unsigned int qpids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0; +#endif int rc, is_dir; ENTRY; @@ -738,6 +782,23 @@ static int mdd_unlink(const struct lu_env *env, struct md_object *pobj, GOTO(cleanup, rc); rc = mdd_finish_unlink(env, mdd_cobj, ma, handle); +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota && ma->ma_valid & MA_INODE && + ma->ma_attr.la_nlink == 0) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_pobj, la_tmp, BYPASS_CAPA); + if (!rc) { + mdd_quota_wrapper(la_tmp, qpids); + if (mdd_cobj->mod_count == 0) { + quota_opc = FSFILT_OP_UNLINK; + mdd_quota_wrapper(&ma->ma_attr, qcids); + } else { + quota_opc = FSFILT_OP_UNLINK_PARTIAL_PARENT; + } + } + } +#endif if (rc == 0) obd_set_info_async(mdd2obd_dev(mdd)->u.mds.mds_osc_exp, @@ -749,6 +810,13 @@ cleanup: mdd_pdo_write_unlock(env, mdd_pobj, dlh); out_trans: mdd_trans_stop(env, mdd, rc, handle); +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) + /* Trigger dqrel on the owner of child and parent. If failed, + * the next call for lquota_chkquota will process it. 
*/ + lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids, rc, + quota_opc); +#endif return rc; } @@ -778,20 +846,48 @@ static int mdd_name_insert(const struct lu_env *env, const struct lu_fid *fid, const struct md_attr *ma) { - char *name = lname->ln_name; + const char *name = lname->ln_name; struct lu_attr *la = &mdd_env_info(env)->mti_la_for_fix; struct mdd_object *mdd_obj = md2mdd_obj(pobj); struct mdd_device *mdd = mdo2mdd(pobj); struct dynlock_handle *dlh; struct thandle *handle; int is_dir = S_ISDIR(ma->ma_attr.la_mode); +#ifdef HAVE_QUOTA_SUPPORT + struct md_ucred *uc = md_ucred(env); + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, rec_pending = 0; + cfs_cap_t save = uc->mu_cap; +#endif int rc; ENTRY; +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + if (!(ma->ma_attr_flags & MDS_QUOTA_IGNORE)) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_obj, la_tmp, BYPASS_CAPA); + if (!rc) { + quota_opc = FSFILT_OP_LINK; + mdd_quota_wrapper(la_tmp, qids); + /* get block quota for parent */ + lquota_chkquota(mds_quota_interface_ref, obd, + qids[USRQUOTA], qids[GRPQUOTA], + 1, &rec_pending, NULL, + LQUOTA_FLAGS_BLK); + } + } else { + uc->mu_cap |= CFS_CAP_SYS_RESOURCE_MASK; + } + } +#endif mdd_txn_param_build(env, mdd, MDD_TXN_INDEX_INSERT_OP); handle = mdd_trans_start(env, mdo2mdd(pobj)); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(out_pending, rc = PTR_ERR(handle)); dlh = mdd_pdo_write_lock(env, mdd_obj, name, MOR_TGT_PARENT); if (dlh == NULL) @@ -823,6 +919,23 @@ out_unlock: mdd_pdo_write_unlock(env, mdd_obj, dlh); out_trans: mdd_trans_stop(env, mdo2mdd(pobj), rc, handle); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + if (quota_opc) { + if (rec_pending) + lquota_pending_commit(mds_quota_interface_ref, + obd, qids[USRQUOTA], + qids[GRPQUOTA], 1, 1); + /* Trigger dqacq for the parent owner. 
If failed, + * the next call for lquota_chkquota will process it*/ + lquota_adjust(mds_quota_interface_ref, obd, 0, qids, + rc, quota_opc); + } else { + uc->mu_cap = save; + } + } +#endif return rc; } @@ -851,20 +964,37 @@ static int mdd_name_remove(const struct lu_env *env, const struct lu_name *lname, const struct md_attr *ma) { - char *name = lname->ln_name; + const char *name = lname->ln_name; struct lu_attr *la = &mdd_env_info(env)->mti_la_for_fix; struct mdd_object *mdd_obj = md2mdd_obj(pobj); struct mdd_device *mdd = mdo2mdd(pobj); struct dynlock_handle *dlh; struct thandle *handle; int is_dir = S_ISDIR(ma->ma_attr.la_mode); +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0; +#endif int rc; ENTRY; +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_obj, la_tmp, BYPASS_CAPA); + if (!rc) { + quota_opc = FSFILT_OP_UNLINK_PARTIAL_PARENT; + mdd_quota_wrapper(la_tmp, qids); + } + } +#endif mdd_txn_param_build(env, mdd, MDD_TXN_INDEX_DELETE_OP); handle = mdd_trans_start(env, mdd); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(out_pending, rc = PTR_ERR(handle)); dlh = mdd_pdo_write_lock(env, mdd_obj, name, MOR_TGT_PARENT); if (dlh == NULL) @@ -896,6 +1026,14 @@ out_unlock: mdd_pdo_write_unlock(env, mdd_obj, dlh); out_trans: mdd_trans_stop(env, mdd, rc, handle); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + /* Trigger dqrel for the parent owner. + * If failed, the next call for lquota_chkquota will process it. */ + if (quota_opc) + lquota_adjust(mds_quota_interface_ref, obd, 0, qids, rc, + quota_opc); +#endif return rc; } @@ -932,20 +1070,42 @@ static int mdd_rename_tgt(const struct lu_env *env, const struct lu_fid *lf, const struct lu_name *lname, struct md_attr *ma) { - char *name = lname->ln_name; + const char *name = lname->ln_name; struct lu_attr *la = &mdd_env_info(env)->mti_la_for_fix; struct mdd_object *mdd_tpobj = md2mdd_obj(pobj); struct mdd_object *mdd_tobj = md2mdd_obj(tobj); struct mdd_device *mdd = mdo2mdd(pobj); struct dynlock_handle *dlh; struct thandle *handle; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qcids[MAXQUOTAS] = { 0, 0 }; + unsigned int qpids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, rec_pending = 0; +#endif int rc; ENTRY; +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota && !tobj) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_tpobj, la_tmp, BYPASS_CAPA); + if (!rc) { + quota_opc = FSFILT_OP_LINK; + mdd_quota_wrapper(la_tmp, qpids); + /* get block quota for target parent */ + lquota_chkquota(mds_quota_interface_ref, obd, + qpids[USRQUOTA], qpids[GRPQUOTA], 1, + &rec_pending, NULL, LQUOTA_FLAGS_BLK); + } + } +#endif mdd_txn_param_build(env, mdd, MDD_TXN_RENAME_TGT_OP); handle = mdd_trans_start(env, mdd); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(out_pending, rc = PTR_ERR(handle)); dlh = mdd_pdo_write_lock(env, mdd_tpobj, name, MOR_TGT_PARENT); if (dlh == NULL) @@ -978,7 +1138,7 @@ static int mdd_rename_tgt(const struct lu_env *env, if (rc) GOTO(cleanup, rc); - /* + /* * For tobj is remote case cmm layer has processed * and pass NULL tobj to here. So when tobj is NOT NULL, * it must be local one. 
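All of the HAVE_QUOTA_SUPPORT hunks in mdd_dir.c follow one protocol: lquota_chkquota() takes a pending quota reservation before the transaction starts, lquota_pending_commit() releases that reservation once the operation completes, and lquota_adjust() then triggers dqacq/dqrel according to the FSFILT_OP_* code. A condensed sketch of the pattern, with names taken from the mdd_link() and mdd_rename_tgt() hunks above (the real callers inline this around their transaction; the metadata operation between the two blocks is elided):

#ifdef HAVE_QUOTA_SUPPORT
/* Sketch only, not a helper that exists in the patch. */
static int mdd_quota_protocol_sketch(struct obd_device *obd,
                                     struct mds_obd *mds,
                                     struct lu_attr *la_parent, int rc)
{
        unsigned int qids[MAXQUOTAS] = { 0, 0 };
        int quota_opc = 0, rec_pending = 0;

        if (mds->mds_quota) {
                quota_opc = FSFILT_OP_LINK;
                mdd_quota_wrapper(la_parent, qids); /* [0]=uid, [1]=gid */
                /* reserve one block of quota before the transaction */
                lquota_chkquota(mds_quota_interface_ref, obd,
                                qids[USRQUOTA], qids[GRPQUOTA], 1,
                                &rec_pending, NULL, LQUOTA_FLAGS_BLK);
        }

        /* ... mdd_trans_start() / index update / mdd_trans_stop() ... */

        if (quota_opc) {
                if (rec_pending)        /* release the reservation */
                        lquota_pending_commit(mds_quota_interface_ref, obd,
                                              qids[USRQUOTA], qids[GRPQUOTA],
                                              1, 1);
                /* trigger dqacq for the parent owner; on failure the next
                 * lquota_chkquota() call retries it */
                lquota_adjust(mds_quota_interface_ref, obd, 0, qids, rc,
                              quota_opc);
        }
        return rc;
}
#endif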
@@ -998,6 +1158,14 @@ static int mdd_rename_tgt(const struct lu_env *env, rc = mdd_finish_unlink(env, mdd_tobj, ma, handle); if (rc) GOTO(cleanup, rc); + +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota && ma->ma_valid & MA_INODE && + ma->ma_attr.la_nlink == 0 && mdd_tobj->mod_count == 0) { + quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD; + mdd_quota_wrapper(&ma->ma_attr, qcids); + } +#endif } EXIT; cleanup: @@ -1006,6 +1174,22 @@ cleanup: mdd_pdo_write_unlock(env, mdd_tpobj, dlh); out_trans: mdd_trans_stop(env, mdd, rc, handle); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + if (rec_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qpids[USRQUOTA], + qpids[GRPQUOTA], + 1, 1); + if (quota_opc) + /* Trigger dqrel/dqacq on the target owner of child and + * parent. If failed, the next call for lquota_chkquota + * will process it. */ + lquota_adjust(mds_quota_interface_ref, obd, qcids, + qpids, rc, quota_opc); + } +#endif return rc; } @@ -1063,7 +1247,7 @@ static int mdd_create_data(const struct lu_env *env, struct md_object *pobj, /* Replay creates has objects already */ #if 0 - if (spec->u.sp_ea.no_lov_create) { + if (spec->no_create) { CDEBUG(D_INFO, "we already have lov ea\n"); rc = mdd_lov_set_md(env, mdd_pobj, son, (struct lov_mds_md *)spec->u.sp_ea.eadata, @@ -1092,7 +1276,7 @@ static int __mdd_lookup(const struct lu_env *env, struct md_object *pobj, const struct lu_name *lname, struct lu_fid* fid, int mask) { - char *name = lname->ln_name; + const char *name = lname->ln_name; const struct dt_key *key = (const struct dt_key *)name; struct mdd_object *mdd_obj = md2mdd_obj(pobj); struct mdd_device *m = mdo2mdd(pobj); @@ -1127,8 +1311,10 @@ __mdd_lookup(const struct lu_env *env, struct md_object *pobj, rc = dir->do_index_ops->dio_lookup(env, dir, (struct dt_rec *)pack, key, mdd_object_capa(env, mdd_obj)); - if (rc == 0) + if (rc > 0) rc = fid_unpack(pack, fid); + else if (rc == 0) + rc = -ENOENT; } else rc = -ENOTDIR; @@ -1137,7 +1323,7 @@ __mdd_lookup(const struct lu_env *env, struct md_object *pobj, int mdd_object_initialize(const struct lu_env *env, const struct lu_fid *pfid, struct mdd_object *child, struct md_attr *ma, - struct thandle *handle) + struct thandle *handle, const struct md_op_spec *spec) { int rc; ENTRY; @@ -1281,9 +1467,17 @@ static int mdd_create(const struct lu_env *env, struct lov_mds_md *lmm = NULL; struct thandle *handle; struct dynlock_handle *dlh; - char *name = lname->ln_name; + const char *name = lname->ln_name; int rc, created = 0, initialized = 0, inserted = 0, lmm_size = 0; int got_def_acl = 0; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qcids[MAXQUOTAS] = { 0, 0 }; + unsigned int qpids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, block_count = 0; + int inode_pending = 0, block_pending = 0, parent_pending = 0; +#endif ENTRY; /* @@ -1327,6 +1521,51 @@ static int mdd_create(const struct lu_env *env, if (rc) RETURN(rc); +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_pobj, la_tmp, BYPASS_CAPA); + if (!rc) { + int same = 0; + + quota_opc = FSFILT_OP_CREATE; + mdd_quota_wrapper(&ma->ma_attr, qcids); + mdd_quota_wrapper(la_tmp, qpids); + /* get file quota for child */ + lquota_chkquota(mds_quota_interface_ref, obd, + qcids[USRQUOTA], qcids[GRPQUOTA], 1, + &inode_pending, NULL, 0); + switch (ma->ma_attr.la_mode & S_IFMT) { + case S_IFLNK: + case S_IFDIR: + block_count = 2; 
+ break; + case S_IFREG: + block_count = 1; + break; + } + if (qcids[USRQUOTA] == qpids[USRQUOTA] && + qcids[GRPQUOTA] == qpids[GRPQUOTA]) { + block_count += 1; + same = 1; + } + /* get block quota for child and parent */ + if (block_count) + lquota_chkquota(mds_quota_interface_ref, obd, + qcids[USRQUOTA], qcids[GRPQUOTA], + block_count, + &block_pending, NULL, + LQUOTA_FLAGS_BLK); + if (!same) + lquota_chkquota(mds_quota_interface_ref, obd, + qpids[USRQUOTA], qpids[GRPQUOTA], 1, + &parent_pending, NULL, + LQUOTA_FLAGS_BLK); + } + } +#endif + /* * No RPC inside the transaction, so OST objects should be created at * first. @@ -1335,7 +1574,7 @@ static int mdd_create(const struct lu_env *env, rc = mdd_lov_create(env, mdd, mdd_pobj, son, &lmm, &lmm_size, spec, attr); if (rc) - RETURN(rc); + GOTO(out_pending, rc); } if (!S_ISLNK(attr->la_mode)) { @@ -1363,7 +1602,7 @@ static int mdd_create(const struct lu_env *env, GOTO(out_trans, rc = -ENOMEM); mdd_write_lock(env, son, MOR_TGT_CHILD); - rc = mdd_object_create_internal(env, mdd_pobj, son, ma, handle); + rc = mdd_object_create_internal(env, mdd_pobj, son, ma, handle, spec); if (rc) { mdd_write_unlock(env, son); GOTO(cleanup, rc); @@ -1388,7 +1627,7 @@ static int mdd_create(const struct lu_env *env, #endif rc = mdd_object_initialize(env, mdo2fid(mdd_pobj), - son, ma, handle); + son, ma, handle, spec); mdd_write_unlock(env, son); if (rc) /* @@ -1422,6 +1661,7 @@ static int mdd_create(const struct lu_env *env, } if (S_ISLNK(attr->la_mode)) { + struct md_ucred *uc = md_ucred(env); struct dt_object *dt = mdd_object_child(son); const char *target_name = spec->u.sp_symname; int sym_len = strlen(target_name); @@ -1430,7 +1670,9 @@ static int mdd_create(const struct lu_env *env, buf = mdd_buf_get_const(env, target_name, sym_len); rc = dt->do_body_ops->dbo_write(env, dt, buf, &pos, handle, - mdd_object_capa(env, son)); + mdd_object_capa(env, son), + uc->mu_cap & + CFS_CAP_SYS_RESOURCE_MASK); if (rc == sym_len) rc = 0; @@ -1479,6 +1721,27 @@ out_trans: out_free: /* finish lov_create stuff, free all temporary data */ mdd_lov_create_finish(env, mdd, lmm, lmm_size, spec); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) { + if (inode_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qcids[USRQUOTA], qcids[GRPQUOTA], + 1, 0); + if (block_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qcids[USRQUOTA], qcids[GRPQUOTA], + block_count, 1); + if (parent_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qpids[USRQUOTA], qpids[GRPQUOTA], + 1, 1); + /* Trigger dqacq on the owner of child and parent. If failed, + * the next call for lquota_chkquota will process it.
*/ + lquota_adjust(mds_quota_interface_ref, obd, qcids, qpids, rc, + quota_opc); + } +#endif return rc; } @@ -1571,8 +1834,8 @@ static int mdd_rename(const struct lu_env *env, struct md_object *tobj, const struct lu_name *ltname, struct md_attr *ma) { - char *sname = lsname->ln_name; - char *tname = ltname->ln_name; + const char *sname = lsname->ln_name; + const char *tname = ltname->ln_name; struct lu_attr *la = &mdd_env_info(env)->mti_la_for_fix; struct mdd_object *mdd_spobj = md2mdd_obj(src_pobj); struct mdd_object *mdd_tpobj = md2mdd_obj(tgt_pobj); @@ -1581,8 +1844,18 @@ static int mdd_rename(const struct lu_env *env, struct mdd_object *mdd_tobj = NULL; struct dynlock_handle *sdlh, *tdlh; struct thandle *handle; + const struct lu_fid *tpobj_fid = mdo2fid(mdd_tpobj); int is_dir; int rc; + +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qspids[MAXQUOTAS] = { 0, 0 }; + unsigned int qtcids[MAXQUOTAS] = { 0, 0 }; + unsigned int qtpids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, rec_pending = 0; +#endif ENTRY; LASSERT(ma->ma_attr.la_mode & S_IFMT); @@ -1591,10 +1864,34 @@ static int mdd_rename(const struct lu_env *env, if (tobj) mdd_tobj = md2mdd_obj(tobj); +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_spobj, la_tmp, BYPASS_CAPA); + if (!rc) { + mdd_quota_wrapper(la_tmp, qspids); + if (!tobj) { + rc = mdd_la_get(env, mdd_tpobj, la_tmp, + BYPASS_CAPA); + if (!rc) { + quota_opc = FSFILT_OP_LINK; + mdd_quota_wrapper(la_tmp, qtpids); + /* get block quota for target parent */ + lquota_chkquota(mds_quota_interface_ref, + obd, qtpids[USRQUOTA], + qtpids[GRPQUOTA], 1, + &rec_pending, NULL, + LQUOTA_FLAGS_BLK); + } + } + } + } +#endif mdd_txn_param_build(env, mdd, MDD_TXN_RENAME_OP); handle = mdd_trans_start(env, mdd); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(out_pending, rc = PTR_ERR(handle)); /* FIXME: Should consider tobj and sobj too in rename_lock. */ rc = mdd_rename_order(env, mdd, mdd_spobj, mdd_tpobj); @@ -1632,6 +1929,20 @@ static int mdd_rename(const struct lu_env *env, if (rc) GOTO(cleanup, rc); + /* "mv dir1 dir2" needs "dir1/.." link update */ + if (is_dir) { + rc = __mdd_index_delete(env, mdd_sobj, dotdot, is_dir, handle, + mdd_object_capa(env, mdd_spobj)); + if (rc) + GOTO(cleanup, rc); + + rc = __mdd_index_insert(env, mdd_sobj, tpobj_fid, dotdot, + is_dir, handle, + mdd_object_capa(env, mdd_tpobj)); + if (rc) + GOTO(cleanup, rc); + } + /* * Here tobj can be remote one, so we do index_delete unconditionally * and -ENOENT is allowed. @@ -1658,7 +1969,7 @@ static int mdd_rename(const struct lu_env *env, GOTO(cleanup, rc); } - /* + /* * For tobj is remote case cmm layer has processed * and set tobj to NULL then. So when tobj is NOT NULL, * it must be local one. 
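The rename path spanning these hunks is the most involved quota consumer in the patch: it can adjust up to three owners. The helper below is hypothetical and merely groups the lquota_adjust() calls that mdd_rename() issues inline, to make the bookkeeping visible:

#ifdef HAVE_QUOTA_SUPPORT
/* Hypothetical grouping of the adjustments mdd_rename() makes inline. */
static void mdd_rename_quota_sketch(struct obd_device *obd, int rc,
                                    unsigned int *qspids, /* source parent */
                                    unsigned int *qtpids, /* target parent */
                                    unsigned int *qtcids, /* target child */
                                    int quota_opc)
{
        /* the source parent always loses an entry: release its quota */
        lquota_adjust(mds_quota_interface_ref, obd, 0, qspids, rc,
                      FSFILT_OP_UNLINK_PARTIAL_PARENT);

        /* quota_opc is FSFILT_OP_LINK when no target existed (the target
         * parent gains an entry), or FSFILT_OP_UNLINK_PARTIAL_CHILD when
         * the replaced target dropped its last link and was not open */
        if (quota_opc)
                lquota_adjust(mds_quota_interface_ref, obd, qtcids, qtpids,
                              rc, quota_opc);
}
#endif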
@@ -1680,6 +1991,14 @@ static int mdd_rename(const struct lu_env *env, mdd_write_unlock(env, mdd_tobj); if (rc) GOTO(cleanup, rc); + +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota && ma->ma_valid & MA_INODE && + ma->ma_attr.la_nlink == 0 && mdd_tobj->mod_count == 0) { + quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD; + mdd_quota_wrapper(&ma->ma_attr, qtcids); + } +#endif } la->la_valid = LA_CTIME | LA_MTIME; @@ -1703,6 +2022,27 @@ cleanup_unlocked: mdd_trans_stop(env, mdd, rc, handle); if (mdd_sobj) mdd_object_put(env, mdd_sobj); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + if (rec_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qtpids[USRQUOTA], + qtpids[GRPQUOTA], + 1, 1); + /* Trigger dqrel on the source owner of parent. + * If failed, the next call for lquota_chkquota will + * process it. */ + lquota_adjust(mds_quota_interface_ref, obd, 0, qspids, rc, + FSFILT_OP_UNLINK_PARTIAL_PARENT); + if (quota_opc) + /* Trigger dqrel/dqacq on the target owner of child and + * parent. If failed, the next call for lquota_chkquota + * will process it. */ + lquota_adjust(mds_quota_interface_ref, obd, qtcids, + qtpids, rc, quota_opc); + } +#endif return rc; } diff --git a/lustre/mdd/mdd_internal.h b/lustre/mdd/mdd_internal.h index 205cce1..d6729e2 100644 --- a/lustre/mdd/mdd_internal.h +++ b/lustre/mdd/mdd_internal.h @@ -51,6 +51,21 @@ #include #include #include +#ifdef HAVE_QUOTA_SUPPORT +# include +#endif +#include + +#ifdef HAVE_QUOTA_SUPPORT +/* quota stuff */ +extern quota_interface_t *mds_quota_interface_ref; + +static inline void mdd_quota_wrapper(struct lu_attr *la, unsigned int *qids) +{ + qids[0] = la->la_uid; + qids[1] = la->la_gid; +} +#endif enum mdd_txn_op { MDD_TXN_OBJECT_DESTROY_OP = 0, @@ -108,25 +123,18 @@ enum mdd_object_role { }; struct mdd_object { - struct md_object mod_obj; + struct md_object mod_obj; /* open count */ - __u32 mod_count; - __u32 mod_valid; - unsigned long mod_flags; - struct dynlock mod_pdlock; + __u32 mod_count; + __u32 mod_valid; + unsigned long mod_flags; + struct dynlock mod_pdlock; #ifdef CONFIG_LOCKDEP /* "dep_map" name is assumed by lockdep.h macros. 
*/ struct lockdep_map dep_map; #endif }; -struct orph_key { - /* fid of the object*/ - struct lu_fid ok_fid; - /* type of operation: unlink, truncate */ - __u32 ok_op; -} __attribute__((packed)); - struct mdd_thread_info { struct txn_param mti_param; struct lu_fid mti_fid; @@ -134,7 +142,7 @@ struct mdd_thread_info { struct md_attr mti_ma; struct lu_attr mti_la_for_fix; struct obd_info mti_oi; - struct orph_key mti_orph_key; + char mti_orph_key[NAME_MAX + 1]; struct obd_trans_info mti_oti; struct lu_buf mti_buf; struct obdo mti_oa; @@ -146,8 +154,14 @@ struct mdd_thread_info { int mti_max_lmm_size; struct llog_cookie *mti_max_cookie; int mti_max_cookie_size; + struct dt_object_format mti_dof; + struct obd_quotactl mti_oqctl; }; +extern const char orph_index_name[]; + +extern const struct dt_index_features orph_index_features; + struct lov_mds_md *mdd_max_lmm_get(const struct lu_env *env, struct mdd_device *mdd); @@ -169,6 +183,7 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, struct mdd_object *parent, struct mdd_object *child, struct lov_mds_md **lmm, int *lmm_size, const struct md_op_spec *spec, struct lu_attr *la); +int mdd_lov_objid_prepare(struct mdd_device *mdd, struct lov_mds_md *lmm); void mdd_lov_objid_update(struct mdd_device *mdd, struct lov_mds_md *lmm); void mdd_lov_create_finish(const struct lu_env *env, struct mdd_device *mdd, struct lov_mds_md *lmm, int lmm_size, @@ -198,7 +213,8 @@ int mdd_attr_get_internal_locked(const struct lu_env *env, struct md_attr *ma); int mdd_object_create_internal(const struct lu_env *env, struct mdd_object *p, struct mdd_object *c, struct md_attr *ma, - struct thandle *handle); + struct thandle *handle, + const struct md_op_spec *spec); int mdd_attr_check_set_internal_locked(const struct lu_env *env, struct mdd_object *obj, struct lu_attr *attr, @@ -246,7 +262,7 @@ int mdd_finish_unlink(const struct lu_env *env, struct mdd_object *obj, struct md_attr *ma, struct thandle *th); int mdd_object_initialize(const struct lu_env *env, const struct lu_fid *pfid, struct mdd_object *child, struct md_attr *ma, - struct thandle *handle); + struct thandle *handle, const struct md_op_spec *spec); int mdd_link_sanity_check(const struct lu_env *env, struct mdd_object *tgt_obj, const struct lu_name *lname, struct mdd_object *src_obj); /* mdd_lov.c */ @@ -293,11 +309,47 @@ int mdd_get_flags(const struct lu_env *env, struct mdd_object *obj); extern const struct md_dir_operations mdd_dir_ops; extern const struct md_object_operations mdd_obj_ops; +/* mdd_quota.c*/ +#ifdef HAVE_QUOTA_SUPPORT +int mdd_quota_notify(const struct lu_env *env, struct md_device *m); +int mdd_quota_setup(const struct lu_env *env, struct md_device *m, + void *data); +int mdd_quota_cleanup(const struct lu_env *env, struct md_device *m); +int mdd_quota_recovery(const struct lu_env *env, struct md_device *m); +int mdd_quota_check(const struct lu_env *env, struct md_device *m, + struct obd_export *exp, __u32 type); +int mdd_quota_on(const struct lu_env *env, struct md_device *m, + __u32 type); +int mdd_quota_off(const struct lu_env *env, struct md_device *m, + __u32 type); +int mdd_quota_setinfo(const struct lu_env *env, struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo); +int mdd_quota_getinfo(const struct lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo); +int mdd_quota_setquota(const struct lu_env *env, struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk); +int mdd_quota_getquota(const struct 
lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk); +int mdd_quota_getoinfo(const struct lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo); +int mdd_quota_getoquota(const struct lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk); +int mdd_quota_invalidate(const struct lu_env *env, struct md_device *m, + __u32 type); +int mdd_quota_finvalidate(const struct lu_env *env, struct md_device *m, + __u32 type); +#endif + /* mdd_trans.c */ void mdd_txn_param_build(const struct lu_env *env, struct mdd_device *mdd, enum mdd_txn_op); int mdd_log_txn_param_build(const struct lu_env *env, struct md_object *obj, struct md_attr *ma, enum mdd_txn_op); +int mdd_setattr_txn_param_build(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, enum mdd_txn_op); + +int mdd_lov_destroy(const struct lu_env *env, struct mdd_device *mdd, + struct mdd_object *obj, struct lu_attr *la); static inline void mdd_object_put(const struct lu_env *env, struct mdd_object *o) @@ -426,6 +478,15 @@ static inline const struct lu_fid *mdo2fid(const struct mdd_object *obj) return lu_object_fid(&obj->mod_obj.mo_lu); } +static inline const struct dt_rec *__mdd_fid_rec(const struct lu_env *env, + const struct lu_fid *fid) +{ + struct lu_fid_pack *pack = &mdd_env_info(env)->mti_pack; + + fid_pack(pack, fid, &mdd_env_info(env)->mti_fid2); + return (const struct dt_rec *)pack; +} + static inline umode_t mdd_object_type(const struct mdd_object *obj) { return lu_object_attr(&obj->mod_obj.mo_lu); @@ -609,10 +670,11 @@ static inline int mdo_create_obj(const struct lu_env *env, struct mdd_object *o, struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *handle) { struct dt_object *next = mdd_object_child(o); - return next->do_ops->do_create(env, next, attr, hint, handle); + return next->do_ops->do_create(env, next, attr, hint, dof, handle); } static inline struct obd_capa *mdo_capa_get(const struct lu_env *env, diff --git a/lustre/mdd/mdd_lov.c b/lustre/mdd/mdd_lov.c index 0cd36e1..6d62c7e 100644 --- a/lustre/mdd/mdd_lov.c +++ b/lustre/mdd/mdd_lov.c @@ -145,12 +145,11 @@ int mdd_init_obd(const struct lu_env *env, struct mdd_device *mdd, /* * Add here for obd notify mechanism, when adding a new ost, the mds - * will notify this mdd. + * will notify this mdd. The mds will be used for quota also. */ obd->obd_upcall.onu_upcall = mdd_notify; obd->obd_upcall.onu_owner = mdd; mdd->mdd_obd_dev = obd; - EXIT; class_detach: if (rc) @@ -185,7 +184,7 @@ int mdd_fini_obd(const struct lu_env *env, struct mdd_device *mdd, if (rc) GOTO(lcfg_cleanup, rc); mdd->mdd_obd_dev = NULL; - + EXIT; lcfg_cleanup: return rc; @@ -207,7 +206,7 @@ int mdd_get_md(const struct lu_env *env, struct mdd_object *obj, *md_size = 0; rc = 0; } else if (rc < 0) { - CERROR("Error %d reading eadata \n", rc); + CERROR("Error %d reading eadata - %d\n", rc, *md_size); } else { /* XXX: Convert lov EA but fixed after verification test. 
*/ *md_size = rc; @@ -357,49 +356,23 @@ static obd_id mdd_lov_create_id(const struct lu_fid *fid) return fid_flatten(fid); } -static void mdd_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm) +int mdd_lov_objid_prepare(struct mdd_device *mdd, struct lov_mds_md *lmm) { - struct mds_obd *mds = &obd->u.mds; - int j; - struct lov_ost_data_v1 *lmm_objects; - ENTRY; - - /* if we create file without objects - lmm is NULL */ - if (lmm == NULL) - return; - - if (le32_to_cpu(lmm->lmm_magic) == LOV_MAGIC_V3) - lmm_objects = ((struct lov_mds_md_v3 *)lmm)->lmm_objects; - else - lmm_objects = lmm->lmm_objects; - - for (j = 0; j < le32_to_cpu(lmm->lmm_stripe_count); j++) { - int i = le32_to_cpu(lmm_objects[j].l_ost_idx); - obd_id id = le64_to_cpu(lmm_objects[j].l_object_id); - int page = i / OBJID_PER_PAGE(); - int idx = i % OBJID_PER_PAGE(); - obd_id *data = mds->mds_lov_page_array[page]; - - CDEBUG(D_INODE,"update last object for ost %d - new %llu" - " old %llu\n", i, id, data[idx]); - if (id > data[idx]) { - data[idx] = id; - cfs_bitmap_set(mds->mds_lov_page_dirty, page); - } - } - EXIT; + /* copy mds_lov code is using wrong layer */ + return mds_lov_prepare_objids(mdd->mdd_obd_dev, lmm); } void mdd_lov_objid_update(struct mdd_device *mdd, struct lov_mds_md *lmm) { - mdd_lov_update_objids(mdd->mdd_obd_dev, lmm); + /* copy mds_lov code is using wrong layer */ + mds_lov_update_objids(mdd->mdd_obd_dev, lmm); } void mdd_lov_create_finish(const struct lu_env *env, struct mdd_device *mdd, struct lov_mds_md *lmm, int lmm_size, const struct md_op_spec *spec) { - if (lmm && !spec->u.sp_ea.no_lov_create) + if (lmm && !spec->no_create) OBD_FREE(lmm, lmm_size); } @@ -410,6 +383,7 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, { struct obd_device *obd = mdd2obd_dev(mdd); struct obd_export *lov_exp = obd->u.mds.mds_osc_exp; + struct lu_site *site = mdd2lu_dev(mdd)->ld_site; struct obdo *oa; struct lov_stripe_md *lsm = NULL; const void *eadata = spec->u.sp_ea.eadata; @@ -424,7 +398,7 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, oti_init(oti, NULL); /* replay case, has objects already, only get lov from eadata */ - if (spec->u.sp_ea.no_lov_create != 0) { + if (spec->no_create != 0) { *lmm = (struct lov_mds_md *)spec->u.sp_ea.eadata; *lmm_size = spec->u.sp_ea.eadatalen; RETURN(0); @@ -438,8 +412,7 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, oa->o_uid = 0; /* must have 0 uid / gid on OST */ oa->o_gid = 0; - oa->o_gr = FILTER_GROUP_MDS0 + - lu_site2md(mdd2lu_dev(mdd)->ld_site)->ms_node_id; + oa->o_gr = mdt_to_obd_objgrp(lu_site2md(site)->ms_node_id); oa->o_mode = S_IFREG | 0600; oa->o_id = mdd_lov_create_id(mdd_object_fid(child)); oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLFLAGS | @@ -485,7 +458,7 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, } GOTO(out_oti, rc); } - LASSERT(lsm->lsm_object_gr >= FILTER_GROUP_MDS0); + LASSERT_MDS_GROUP(lsm->lsm_object_gr); } else { LASSERT(eadata != NULL); rc = obd_iocontrol(OBD_IOC_LOV_SETEA, lov_exp, 0, &lsm, @@ -521,16 +494,11 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, oa->o_valid |= OBD_MD_FLFID | OBD_MD_FLGENER; oinfo->oi_oa = oa; oinfo->oi_md = lsm; - oinfo->oi_capa = mdo_capa_get(env, child, NULL, - CAPA_OPC_MDS_DEFAULT); + oinfo->oi_capa = NULL; oinfo->oi_policy.l_extent.start = la->la_size; oinfo->oi_policy.l_extent.end = OBD_OBJECT_EOF; - if (IS_ERR(oinfo->oi_capa)) - oinfo->oi_capa = NULL; - rc = 
obd_punch_rqset(lov_exp, oinfo, oti); - capa_put(oinfo->oi_capa); if (rc) { CERROR("Error setting attrs for "DFID": rc %d\n", PFID(mdo2fid(child)), rc); @@ -552,6 +520,12 @@ int mdd_lov_create(const struct lu_env *env, struct mdd_device *mdd, CERROR("Cannot pack lsm, err = %d\n", rc); GOTO(out_oti, rc); } + if (mdd_lov_objid_prepare(mdd, *lmm) != 0) { + CERROR("No memory to update objid\n"); + OBD_FREE(*lmm, rc); + *lmm = NULL; + GOTO(out_oti, rc = -ENOMEM); + } *lmm_size = rc; rc = 0; EXIT; @@ -564,6 +538,111 @@ out_ids: return rc; } +/* + * used when destroying orphans and from mds_reint_unlink() when MDS wants to + * destroy objects on OSS. + */ +static +int mdd_lovobj_unlink(const struct lu_env *env, struct mdd_device *mdd, + struct mdd_object *obj, struct lu_attr *la, + struct lov_mds_md *lmm, int lmm_size, + struct llog_cookie *logcookies, + int log_unlink) +{ + struct obd_device *obd = mdd2obd_dev(mdd); + struct obd_export *lov_exp = obd->u.mds.mds_osc_exp; + struct lov_stripe_md *lsm = NULL; + struct obd_trans_info *oti = &mdd_env_info(env)->mti_oti; + struct obdo *oa = &mdd_env_info(env)->mti_oa; + struct lu_site *site = mdd2lu_dev(mdd)->ld_site; + int rc; + ENTRY; + + if (lmm_size == 0) + RETURN(0); + + rc = obd_unpackmd(lov_exp, &lsm, lmm, lmm_size); + if (rc < 0) { + CERROR("Error unpacking md %p\n", lmm); + RETURN(rc); + } else { + LASSERT(rc >= sizeof(*lsm)); + rc = 0; + } + + oa->o_id = lsm->lsm_object_id; + oa->o_gr = mdt_to_obd_objgrp(lu_site2md(site)->ms_node_id); + oa->o_mode = la->la_mode & S_IFMT; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLGROUP; + + oti_init(oti, NULL); + if (log_unlink && logcookies) { + oa->o_valid |= OBD_MD_FLCOOKIE; + oti->oti_logcookies = logcookies; + } + + CDEBUG(D_INFO, "destroying OSS object %d/%d\n", + (int)oa->o_id, (int)oa->o_gr); + + rc = obd_destroy(lov_exp, oa, lsm, oti, NULL, NULL); + + obd_free_memmd(lov_exp, &lsm); + RETURN(rc); +} + +/* + * called with obj not locked.
+ */ + +int mdd_lov_destroy(const struct lu_env *env, struct mdd_device *mdd, + struct mdd_object *obj, struct lu_attr *la) +{ + struct md_attr *ma = &mdd_env_info(env)->mti_ma; + int rc; + ENTRY; + + if (unlikely(la->la_nlink != 0)) { + CWARN("Attempt to destroy OSS object when nlink == %d\n", + la->la_nlink); + RETURN(0); + } + + ma->ma_lmm_size = mdd_lov_mdsize(env, mdd); + ma->ma_lmm = mdd_max_lmm_get(env, mdd); + ma->ma_cookie_size = mdd_lov_cookiesize(env, mdd); + ma->ma_cookie = mdd_max_cookie_get(env, mdd); + if (ma->ma_lmm == NULL || ma->ma_cookie == NULL) + RETURN(rc = -ENOMEM); + + /* get lov ea */ + + rc = mdd_get_md_locked(env, obj, ma->ma_lmm, &ma->ma_lmm_size, + MDS_LOV_MD_NAME); + + if (rc <= 0) { + CWARN("Get lov ea failed for "DFID" rc = %d\n", + PFID(mdo2fid(obj)), rc); + if (rc == 0) + rc = -ENOENT; + RETURN(rc); + } + + ma->ma_valid = MA_LOV; + + rc = mdd_unlink_log(env, mdd, obj, ma); + if (rc) { + CWARN("mds unlink log for "DFID" failed: %d\n", + PFID(mdo2fid(obj)), rc); + RETURN(rc); + } + + if (ma->ma_valid & MA_COOKIE) + rc = mdd_lovobj_unlink(env, mdd, obj, la, + ma->ma_lmm, ma->ma_lmm_size, + ma->ma_cookie, 1); + RETURN(rc); +} + int mdd_log_op_unlink(struct obd_device *obd, struct lov_mds_md *lmm, int lmm_size, struct llog_cookie *logcookies, int cookies_size) @@ -624,7 +703,7 @@ int mdd_log_op_setattr(struct obd_device *obd, __u32 uid, __u32 gid, { struct mds_obd *mds = &obd->u.mds; struct lov_stripe_md *lsm = NULL; - struct llog_setattr_rec *lsr; + struct llog_setattr64_rec *lsr; struct llog_ctxt *ctxt; int rc; ENTRY; @@ -646,7 +725,7 @@ int mdd_log_op_setattr(struct obd_device *obd, __u32 uid, __u32 gid, /* prepare setattr log record */ lsr->lsr_hdr.lrh_len = lsr->lsr_tail.lrt_len = sizeof(*lsr); - lsr->lsr_hdr.lrh_type = MDS_SETATTR_REC; + lsr->lsr_hdr.lrh_type = MDS_SETATTR64_REC; lsr->lsr_uid = uid; lsr->lsr_gid = gid; @@ -673,10 +752,10 @@ int mdd_setattr_log(const struct lu_env *env, struct mdd_device *mdd, /* journal chown/chgrp in llog, just like unlink */ if (lmm_size > 0) { CDEBUG(D_INFO, "setattr llog for uid/gid=%lu/%lu\n", - (unsigned long)ma->ma_attr.la_uid, + (unsigned long)ma->ma_attr.la_uid, (unsigned long)ma->ma_attr.la_gid); return mdd_log_op_setattr(obd, ma->ma_attr.la_uid, - ma->ma_attr.la_gid, lmm, + ma->ma_attr.la_gid, lmm, lmm_size, logcookies, cookies_size); } else @@ -746,14 +825,13 @@ out: } int mdd_lov_setattr_async(const struct lu_env *env, struct mdd_object *obj, - struct lov_mds_md *lmm, int lmm_size, + struct lov_mds_md *lmm, int lmm_size, struct llog_cookie *logcookies) { struct mdd_device *mdd = mdo2mdd(&obj->mod_obj); struct obd_device *obd = mdd2obd_dev(mdd); struct lu_attr *tmp_la = &mdd_env_info(env)->mti_la; const struct lu_fid *fid = mdd_object_fid(obj); - struct obd_capa *oc; int rc = 0; ENTRY; @@ -763,15 +841,8 @@ int mdd_lov_setattr_async(const struct lu_env *env, struct mdd_object *obj, if (rc) RETURN(rc); - oc = mdo_capa_get(env, obj, NULL, CAPA_OPC_MDS_DEFAULT); - if (IS_ERR(oc)) - oc = NULL; - rc = mdd_osc_setattr_async(obd, tmp_la->la_uid, tmp_la->la_gid, lmm, lmm_size, logcookies, fid_seq(fid), - fid_oid(fid), oc); - - capa_put(oc); - + fid_oid(fid), NULL); RETURN(rc); } diff --git a/lustre/mdd/mdd_lproc.c b/lustre/mdd/mdd_lproc.c index 2d8bc67..9178114 100644 --- a/lustre/mdd/mdd_lproc.c +++ b/lustre/mdd/mdd_lproc.c @@ -150,8 +150,29 @@ static int lprocfs_rd_atime_diff(char *page, char **start, off_t off, return snprintf(page, count, "%lu\n", mdd->mdd_atime_diff); } +#ifdef HAVE_QUOTA_SUPPORT +static int 
mdd_lprocfs_quota_rd_type(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct mdd_device *mdd = data; + return lprocfs_quota_rd_type(page, start, off, count, eof, + mdd->mdd_obd_dev); +} + +static int mdd_lprocfs_quota_wr_type(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct mdd_device *mdd = data; + return lprocfs_quota_wr_type(file, buffer, count, mdd->mdd_obd_dev); +} +#endif + static struct lprocfs_vars lprocfs_mdd_obd_vars[] = { { "atime_diff", lprocfs_rd_atime_diff, lprocfs_wr_atime_diff, 0 }, +#ifdef HAVE_QUOTA_SUPPORT + { "quota_type", mdd_lprocfs_quota_rd_type, + mdd_lprocfs_quota_wr_type, 0 }, +#endif { 0 } }; diff --git a/lustre/mdd/mdd_object.c b/lustre/mdd/mdd_object.c index 96425cd..d7a9969 100644 --- a/lustre/mdd/mdd_object.c +++ b/lustre/mdd/mdd_object.c @@ -180,16 +180,16 @@ struct lu_object *mdd_object_alloc(const struct lu_env *env, static int mdd_object_init(const struct lu_env *env, struct lu_object *o, const struct lu_object_conf *_) { - struct mdd_device *d = lu2mdd_dev(o->lo_dev); - struct lu_object *below; + struct mdd_device *d = lu2mdd_dev(o->lo_dev); + struct lu_object *below; struct lu_device *under; ENTRY; - under = &d->mdd_child->dd_lu_dev; - below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); + under = &d->mdd_child->dd_lu_dev; + below = under->ld_ops->ldo_object_alloc(env, o->lo_header, under); mdd_pdlock_init(lu2mdd_obj(o)); if (below == NULL) - RETURN(-ENOMEM); + RETURN(-ENOMEM); lu_object_add(o, below); RETURN(0); @@ -206,43 +206,22 @@ static int mdd_object_start(const struct lu_env *env, struct lu_object *o) static void mdd_object_free(const struct lu_env *env, struct lu_object *o) { struct mdd_object *mdd = lu2mdd_obj(o); - + lu_object_fini(o); OBD_FREE_PTR(mdd); } -/* orphan handling is here */ -static void mdd_object_delete(const struct lu_env *env, struct lu_object *o) +static int mdd_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) { - struct mdd_object *mdd_obj = lu2mdd_obj(o); - struct thandle *handle = NULL; - ENTRY; - - if (lu2mdd_dev(o->lo_dev)->mdd_orphans == NULL) - return; - - if (mdd_obj->mod_flags & ORPHAN_OBJ) { - mdd_txn_param_build(env, lu2mdd_dev(o->lo_dev), - MDD_TXN_INDEX_DELETE_OP); - handle = mdd_trans_start(env, lu2mdd_dev(o->lo_dev)); - if (IS_ERR(handle)) - CERROR("Cannot get thandle\n"); - else { - mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD); - /* let's remove obj from the orphan list */ - __mdd_orphan_del(env, mdd_obj, handle); - mdd_write_unlock(env, mdd_obj); - mdd_trans_stop(env, lu2mdd_dev(o->lo_dev), - 0, handle); - } - } + return (*p)(env, cookie, LUSTRE_MDD_NAME"-object@%p", o); } static const struct lu_object_operations mdd_lu_obj_ops = { .loo_object_init = mdd_object_init, .loo_object_start = mdd_object_start, .loo_object_free = mdd_object_free, - .loo_object_delete = mdd_object_delete + .loo_object_print = mdd_object_print, }; struct mdd_object *mdd_object_find(const struct lu_env *env, @@ -486,10 +465,13 @@ static int mdd_xattr_list(const struct lu_env *env, struct md_object *obj, int mdd_object_create_internal(const struct lu_env *env, struct mdd_object *p, struct mdd_object *c, struct md_attr *ma, - struct thandle *handle) + struct thandle *handle, + const struct md_op_spec *spec) { struct lu_attr *attr = &ma->ma_attr; struct dt_allocation_hint *hint = &mdd_env_info(env)->mti_hint; + struct dt_object_format *dof = &mdd_env_info(env)->mti_dof; + const struct dt_index_features *feat = 
spec->sp_feat; int rc; ENTRY; @@ -497,11 +479,19 @@ int mdd_object_create_internal(const struct lu_env *env, struct mdd_object *p, struct dt_object *next = mdd_object_child(c); LASSERT(next); + if (feat != &dt_directory_features && feat != NULL) + dof->dof_type = DFT_INDEX; + else + dof->dof_type = dt_mode_to_dft(attr->la_mode); + + dof->u.dof_idx.di_feat = feat; + /* @hint will be initialized by underlying device. */ next->do_ops->do_ah_init(env, hint, p ? mdd_object_child(p) : NULL, attr->la_mode & S_IFMT); - rc = mdo_create_obj(env, c, attr, hint, handle); + + rc = mdo_create_obj(env, c, attr, hint, dof, handle); LASSERT(ergo(rc == 0, mdd_object_exists(c))); } else rc = -EEXIST; @@ -669,7 +659,7 @@ static int mdd_fix_attr(const struct lu_env *env, struct mdd_object *obj, la->la_valid &= ~LA_ATIME; RETURN(0); } - + /* Check if flags change. */ if (la->la_valid & LA_FLAGS) { unsigned int oldflags = 0; @@ -685,7 +675,7 @@ static int mdd_fix_attr(const struct lu_env *env, struct mdd_object *obj, if (mdd_is_immutable(obj)) oldflags |= LUSTRE_IMMUTABLE_FL; if (mdd_is_append(obj)) - oldflags |= LUSTRE_APPEND_FL; + oldflags |= LUSTRE_APPEND_FL; if ((oldflags ^ newflags) && !mdd_capable(uc, CFS_CAP_LINUX_IMMUTABLE)) RETURN(-EPERM); @@ -840,9 +830,18 @@ static int mdd_attr_set(const struct lu_env *env, struct md_object *obj, struct llog_cookie *logcookies = NULL; int rc, lmm_size = 0, cookie_size = 0; struct lu_attr *la_copy = &mdd_env_info(env)->mti_la_for_fix; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qnids[MAXQUOTAS] = { 0, 0 }; + unsigned int qoids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, block_count = 0; + int inode_pending = 0, block_pending = 0; +#endif ENTRY; - mdd_txn_param_build(env, mdd, MDD_TXN_ATTR_SET_OP); + mdd_setattr_txn_param_build(env, obj, (struct md_attr *)ma, + MDD_TXN_ATTR_SET_OP); handle = mdd_trans_start(env, mdd); if (IS_ERR(handle)) RETURN(PTR_ERR(handle)); @@ -871,6 +870,31 @@ static int mdd_attr_set(const struct lu_env *env, struct md_object *obj, if (rc) GOTO(cleanup, rc); +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota && la_copy->la_valid & (LA_UID | LA_GID)) { + struct lu_attr *la_tmp = &mdd_env_info(env)->mti_la; + + rc = mdd_la_get(env, mdd_obj, la_tmp, BYPASS_CAPA); + if (!rc) { + quota_opc = FSFILT_OP_SETATTR; + mdd_quota_wrapper(la_copy, qnids); + mdd_quota_wrapper(la_tmp, qoids); + /* get file quota for new owner */ + lquota_chkquota(mds_quota_interface_ref, obd, + qnids[USRQUOTA], qnids[GRPQUOTA], 1, + &inode_pending, NULL, 0); + block_count = (la_tmp->la_blocks + 7) >> 3; + if (block_count) + /* get block quota for new owner */ + lquota_chkquota(mds_quota_interface_ref, obd, + qnids[USRQUOTA], + qnids[GRPQUOTA], + block_count, &block_pending, + NULL, LQUOTA_FLAGS_BLK); + } + } +#endif + if (la_copy->la_valid & LA_FLAGS) { rc = mdd_attr_set_internal_locked(env, mdd_obj, la_copy, handle, 1); @@ -913,6 +937,23 @@ cleanup: rc = mdd_lov_setattr_async(env, mdd_obj, lmm, lmm_size, logcookies); } +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) { + if (inode_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qnids[USRQUOTA], qnids[GRPQUOTA], + 1, 0); + if (block_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qnids[USRQUOTA], qnids[GRPQUOTA], + block_count, 1); + /* Trigger dqrel/dqacq for original owner and new owner. + * If failed, the next call for lquota_chkquota will + * process it. 
*/ + lquota_adjust(mds_quota_interface_ref, obd, qnids, qoids, rc, + quota_opc); + } +#endif RETURN(rc); } @@ -1020,6 +1061,12 @@ static int mdd_ref_del(const struct lu_env *env, struct md_object *obj, struct mdd_object *mdd_obj = md2mdd_obj(obj); struct mdd_device *mdd = mdo2mdd(obj); struct thandle *handle; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0; +#endif int rc; ENTRY; @@ -1062,11 +1109,26 @@ static int mdd_ref_del(const struct lu_env *env, struct md_object *obj, GOTO(cleanup, rc); rc = mdd_finish_unlink(env, mdd_obj, ma, handle); +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota && ma->ma_valid & MA_INODE && + ma->ma_attr.la_nlink == 0 && mdd_obj->mod_count == 0) { + quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD; + mdd_quota_wrapper(&ma->ma_attr, qids); + } +#endif + EXIT; cleanup: mdd_write_unlock(env, mdd_obj); mdd_trans_stop(env, mdd, rc, handle); +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) + /* Trigger dqrel on the owner of child. If failed, + * the next call for lquota_chkquota will process it */ + lquota_adjust(mds_quota_interface_ref, obd, qids, 0, rc, + quota_opc); +#endif return rc; } @@ -1105,20 +1167,52 @@ static int mdd_object_create(const struct lu_env *env, struct mdd_object *mdd_obj = md2mdd_obj(obj); const struct lu_fid *pfid = spec->u.sp_pfid; struct thandle *handle; - int rc; +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdd->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0, block_count = 0; + int inode_pending = 0, block_pending = 0; +#endif + int rc = 0; ENTRY; +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + quota_opc = FSFILT_OP_CREATE_PARTIAL_CHILD; + mdd_quota_wrapper(&ma->ma_attr, qids); + /* get file quota for child */ + lquota_chkquota(mds_quota_interface_ref, obd, qids[USRQUOTA], + qids[GRPQUOTA], 1, &inode_pending, NULL, 0); + switch (ma->ma_attr.la_mode & S_IFMT) { + case S_IFLNK: + case S_IFDIR: + block_count = 2; + break; + case S_IFREG: + block_count = 1; + break; + } + /* get block quota for child */ + if (block_count) + lquota_chkquota(mds_quota_interface_ref, obd, + qids[USRQUOTA], qids[GRPQUOTA], + block_count, &block_pending, NULL, + LQUOTA_FLAGS_BLK); + } +#endif + mdd_txn_param_build(env, mdd, MDD_TXN_OBJECT_CREATE_OP); handle = mdd_trans_start(env, mdd); if (IS_ERR(handle)) - RETURN(PTR_ERR(handle)); + GOTO(out_pending, rc = PTR_ERR(handle)); mdd_write_lock(env, mdd_obj, MOR_TGT_CHILD); rc = mdd_oc_sanity_check(env, mdd_obj, ma); if (rc) GOTO(unlock, rc); - rc = mdd_object_create_internal(env, NULL, mdd_obj, ma, handle); + rc = mdd_object_create_internal(env, NULL, mdd_obj, ma, handle, spec); if (rc) GOTO(unlock, rc); @@ -1158,7 +1252,7 @@ static int mdd_object_create(const struct lu_env *env, pfid = spec->u.sp_ea.fid; } #endif - rc = mdd_object_initialize(env, pfid, mdd_obj, ma, handle); + rc = mdd_object_initialize(env, pfid, mdd_obj, ma, handle, spec); } EXIT; unlock: @@ -1167,6 +1261,23 @@ unlock: mdd_write_unlock(env, mdd_obj); mdd_trans_stop(env, mdd, rc, handle); +out_pending: +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) { + if (inode_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qids[USRQUOTA], qids[GRPQUOTA], + 1, 0); + if (block_pending) + lquota_pending_commit(mds_quota_interface_ref, obd, + qids[USRQUOTA], qids[GRPQUOTA], + block_count, 1); + /* Trigger dqacq on the owner of child. 
If failed, + * the next call for lquota_chkquota will process it. */ + lquota_adjust(mds_quota_interface_ref, obd, qids, 0, rc, + FSFILT_OP_CREATE_PARTIAL_CHILD); + } +#endif return rc; } @@ -1319,6 +1430,7 @@ int mdd_object_kill(const struct lu_env *env, struct mdd_object *obj, if (S_ISREG(mdd_object_type(obj))) { /* Return LOV & COOKIES unconditionally here. We clean evth up. * Caller must be ready for that. */ + rc = __mdd_lmm_get(env, obj, ma); if ((ma->ma_valid & MA_LOV)) rc = mdd_unlink_log(env, mdo2mdd(&obj->mod_obj), @@ -1333,9 +1445,17 @@ int mdd_object_kill(const struct lu_env *env, struct mdd_object *obj, static int mdd_close(const struct lu_env *env, struct md_object *obj, struct md_attr *ma) { - int rc; struct mdd_object *mdd_obj = md2mdd_obj(obj); struct thandle *handle; + int rc; + int reset = 1; + +#ifdef HAVE_QUOTA_SUPPORT + struct obd_device *obd = mdo2mdd(obj)->mdd_obd_dev; + struct mds_obd *mds = &obd->u.mds; + unsigned int qids[MAXQUOTAS] = { 0, 0 }; + int quota_opc = 0; +#endif ENTRY; rc = mdd_log_txn_param_build(env, obj, ma, MDD_TXN_UNLINK_OP); @@ -1349,14 +1469,39 @@ static int mdd_close(const struct lu_env *env, struct md_object *obj, /* release open count */ mdd_obj->mod_count --; + if (mdd_obj->mod_count == 0) { + /* remove link to object from orphan index */ + if (mdd_obj->mod_flags & ORPHAN_OBJ) + __mdd_orphan_del(env, mdd_obj, handle); + } + rc = mdd_iattr_get(env, mdd_obj, ma); - if (rc == 0 && mdd_obj->mod_count == 0 && ma->ma_attr.la_nlink == 0) - rc = mdd_object_kill(env, mdd_obj, ma); - else + if (rc == 0) { + if (mdd_obj->mod_count == 0 && ma->ma_attr.la_nlink == 0) { + rc = mdd_object_kill(env, mdd_obj, ma); +#ifdef HAVE_QUOTA_SUPPORT + if (mds->mds_quota) { + quota_opc = FSFILT_OP_UNLINK_PARTIAL_CHILD; + mdd_quota_wrapper(&ma->ma_attr, qids); + } +#endif + if (rc == 0) + reset = 0; + } + } + + if (reset) ma->ma_valid &= ~(MA_LOV | MA_COOKIE); - + mdd_write_unlock(env, mdd_obj); mdd_trans_stop(env, mdo2mdd(obj), rc, handle); +#ifdef HAVE_QUOTA_SUPPORT + if (quota_opc) + /* Trigger dqrel on the owner of child. 
If failed, + * the next call for lquota_chkquota will process it */ + lquota_adjust(mds_quota_interface_ref, obd, qids, 0, rc, + quota_opc); +#endif RETURN(rc); } @@ -1473,7 +1618,7 @@ static int __mdd_readpage(const struct lu_env *env, struct mdd_object *obj, * iterate through directory and fill pages from @rdpg */ iops = &next->do_index_ops->dio_it; - it = iops->init(env, next, 0, mdd_object_capa(env, obj)); + it = iops->init(env, next, mdd_object_capa(env, obj)); if (IS_ERR(it)) return PTR_ERR(it); diff --git a/lustre/mdd/mdd_orphans.c b/lustre/mdd/mdd_orphans.c index 24a134e..e587094 100644 --- a/lustre/mdd/mdd_orphans.c +++ b/lustre/mdd/mdd_orphans.c @@ -38,6 +38,7 @@ * Orphan handling code * * Author: Mike Pershin + * Pravin B Shelar */ #ifndef EXPORT_SYMTAB @@ -52,126 +53,368 @@ #include #include "mdd_internal.h" -const char orph_index_name[] = "orphans"; - -static const struct dt_index_features orph_index_features = { - .dif_flags = DT_IND_UPDATE, - .dif_keysize_min = sizeof(struct orph_key), - .dif_keysize_max = sizeof(struct orph_key), - .dif_recsize_min = sizeof(loff_t), - .dif_recsize_max = sizeof(loff_t) -}; +const char orph_index_name[] = "PENDING"; enum { ORPH_OP_UNLINK, ORPH_OP_TRUNCATE }; -static struct orph_key *orph_key_fill(const struct lu_env *env, - const struct lu_fid *lf, __u32 op) +#define ORPHAN_FILE_NAME_FORMAT "%016llx:%08x:%08x:%2x" +#define ORPHAN_FILE_NAME_FORMAT_18 "%llx:%08x" + +static struct dt_key* orph_key_fill(const struct lu_env *env, + const struct lu_fid *lf, __u32 op) { - struct orph_key *key = &mdd_env_info(env)->mti_orph_key; + char *key = mdd_env_info(env)->mti_orph_key; + int rc; + LASSERT(key); - fid_cpu_to_be(&key->ok_fid, lf); - key->ok_op = cpu_to_be32(op); - return key; + rc = snprintf(key, NAME_MAX + 1, ORPHAN_FILE_NAME_FORMAT, fid_seq(lf), + fid_oid(lf), fid_ver(lf), op); + if (rc > 0) + return (struct dt_key*) key; + else + return ERR_PTR(rc); +} + +static struct dt_key* orph_key_fill_18(const struct lu_env *env, + const struct lu_fid *lf) +{ + char *key = mdd_env_info(env)->mti_orph_key; + int rc; + + LASSERT(key); + rc = snprintf(key, NAME_MAX + 1, ORPHAN_FILE_NAME_FORMAT_18, fid_seq(lf), + fid_oid(lf)); + if (rc > 0) + return (struct dt_key*) key; + else + return ERR_PTR(rc); +} + +static int orphan_key_to_fid(char *key, struct lu_fid *lf) +{ + int rc = 0; + unsigned int op; + + rc = sscanf(key, ORPHAN_FILE_NAME_FORMAT, &lf->f_seq, &lf->f_oid, + &lf->f_ver, &op); + if (rc == 4) + return 0; + + /* build igif */ + rc = sscanf(key, ORPHAN_FILE_NAME_FORMAT_18, + &lf->f_seq, &lf->f_oid); + if (rc == 2) { + lf->f_ver = 0; + return 0; + } + + CERROR("cannot parse orphan file name %s\n", key); + return -EINVAL; +} + +static inline void mdd_orphan_write_lock(const struct lu_env *env, + struct mdd_device *mdd) +{ + + struct dt_object *dor = mdd->mdd_orphans; + dor->do_ops->do_write_lock(env, dor, MOR_TGT_CHILD); +} + +static inline void mdd_orphan_write_unlock(const struct lu_env *env, + struct mdd_device *mdd) +{ + + struct dt_object *dor = mdd->mdd_orphans; + dor->do_ops->do_write_unlock(env, dor); +} + +static inline int mdd_orphan_insert_obj(const struct lu_env *env, + struct mdd_device *mdd, + struct mdd_object *obj, + __u32 op, + struct thandle *th) +{ + struct dt_object *dor = mdd->mdd_orphans; + const struct lu_fid *lf = mdo2fid(obj); + struct dt_key *key = orph_key_fill(env, lf, op); + ENTRY; + + return dor->do_index_ops->dio_insert(env, dor, + __mdd_fid_rec(env, lf), + key, th, + BYPASS_CAPA, 1); +} + +static inline int
mdd_orphan_delete_obj(const struct lu_env *env, + struct mdd_device *mdd, + struct dt_key *key, + struct thandle *th) +{ + struct dt_object *dor = mdd->mdd_orphans; + + return dor->do_index_ops->dio_delete(env, dor, + key, th, + BYPASS_CAPA); } +static inline void mdd_orphan_ref_add(const struct lu_env *env, + struct mdd_device *mdd, + struct thandle *th) +{ + struct dt_object *dor = mdd->mdd_orphans; + dor->do_ops->do_ref_add(env, dor, th); +} + +static inline void mdd_orphan_ref_del(const struct lu_env *env, + struct mdd_device *mdd, + struct thandle *th) +{ + struct dt_object *dor = mdd->mdd_orphans; + dor->do_ops->do_ref_del(env, dor, th); +} + + static int orph_index_insert(const struct lu_env *env, - struct mdd_object *obj, __u32 op, - loff_t *offset, struct thandle *th) + struct mdd_object *obj, + __u32 op, + struct thandle *th) { - struct mdd_device *mdd = mdo2mdd(&obj->mod_obj); - struct dt_object *dor = mdd->mdd_orphans; - struct orph_key *key = orph_key_fill(env, mdo2fid(obj), op); + struct mdd_device *mdd = mdo2mdd(&obj->mod_obj); + struct dt_object *dor = mdd->mdd_orphans; + const struct lu_fid *lf_dor = lu_object_fid(&dor->do_lu); + struct dt_object *next = mdd_object_child(obj); + const struct dt_key *dotdot = (const struct dt_key *) ".."; int rc; ENTRY; - rc = dor->do_index_ops->dio_insert(env, dor, (struct dt_rec *)offset, - (struct dt_key *)key, th, - BYPASS_CAPA); + mdd_orphan_write_lock(env, mdd); + + rc = mdd_orphan_insert_obj(env, mdd, obj, op, th); + if (rc) + GOTO(out, rc); + + mdo_ref_add(env, obj, th); + if (!S_ISDIR(mdd_object_type(obj))) + goto out; + + mdo_ref_add(env, obj, th); + mdd_orphan_ref_add(env, mdd, th); + + /* try our best to fix up the directory; don't return errors + * from here */ + if (!dt_try_as_dir(env, next)) + goto out; + next->do_index_ops->dio_delete(env, next, + dotdot, th, BYPASS_CAPA); + + next->do_index_ops->dio_insert(env, next, + __mdd_fid_rec(env, lf_dor), + dotdot, th, BYPASS_CAPA, 1); + +out: + mdd_orphan_write_unlock(env, mdd); + RETURN(rc); } +/** + * Destroy the OSD object on the MDD and its associated OST objects. + * + * \param obj orphan object + * \param mdd used for sending llog messages to OSTs + * + * \retval 0 success + * \retval -ve error + */ +static int orphan_object_kill(const struct lu_env *env, + struct mdd_object *obj, + struct mdd_device *mdd, + struct thandle *th) +{ + struct lu_attr *la = &mdd_env_info(env)->mti_la; + int rc; + + /* No need to lock this object during the recovery phase, since + * no other thread can access it. But we lock it anyway, as that + * is a precondition of the OSD API we are using.
+ + mdd_write_lock(env, obj, MOR_TGT_CHILD); + mdo_ref_del(env, obj, th); + if (S_ISDIR(mdd_object_type(obj))) { + mdo_ref_del(env, obj, th); + mdd_orphan_ref_del(env, mdd, th); + mdd_write_unlock(env, obj); + } else { + /* regular file, clean up linked ost objects */ + rc = mdd_la_get(env, obj, la, BYPASS_CAPA); + mdd_write_unlock(env, obj); + if (rc) + RETURN(rc); + + mdd_lov_destroy(env, mdd, obj, la); + } + return 0; +} + static int orph_index_delete(const struct lu_env *env, - struct mdd_object *obj, __u32 op, + struct mdd_object *obj, + __u32 op, struct thandle *th) { struct mdd_device *mdd = mdo2mdd(&obj->mod_obj); struct dt_object *dor = mdd->mdd_orphans; - struct orph_key *key = orph_key_fill(env, mdo2fid(obj), op); + struct dt_key *key; int rc; + ENTRY; + LASSERT(dor); - rc = dor->do_index_ops->dio_delete(env, dor, - (struct dt_key *)key, th, - BYPASS_CAPA); - RETURN(rc); + key = orph_key_fill(env, mdo2fid(obj), op); + mdd_orphan_write_lock(env, mdd); + + rc = mdd_orphan_delete_obj(env, mdd, key, th); + + if (rc == -ENOENT) { + key = orph_key_fill_18(env, mdo2fid(obj)); + rc = mdd_orphan_delete_obj(env, mdd, key, th); + } + + if (!rc) { + /* lov objects will be destroyed by caller */ + mdo_ref_del(env, obj, th); + if (S_ISDIR(mdd_object_type(obj))) { + mdo_ref_del(env, obj, th); + mdd_orphan_ref_del(env, mdd, th); + } + } else + CERROR("could not delete object: rc = %d\n", rc); + + obj->mod_flags &= ~ORPHAN_OBJ; + mdd_orphan_write_unlock(env, mdd); + RETURN(rc); } -static inline struct orph_key *orph_key_empty(const struct lu_env *env, - __u32 op) + +static int orphan_object_destroy(const struct lu_env *env, + struct mdd_object *obj, + struct dt_key *key) { - struct orph_key *key = &mdd_env_info(env)->mti_orph_key; - LASSERT(key); - fid_zero(&key->ok_fid); - key->ok_op = cpu_to_be32(op); - return key; + struct thandle *th = NULL; + struct mdd_device *mdd = mdo2mdd(&obj->mod_obj); + int rc; + ENTRY; + + mdd_txn_param_build(env, mdd, MDD_TXN_UNLINK_OP); + th = mdd_trans_start(env, mdd); + if (IS_ERR(th)) { + CERROR("Cannot get thandle\n"); + RETURN(-ENOMEM); + } + + mdd_orphan_write_lock(env, mdd); + rc = mdd_orphan_delete_obj(env, mdd, key, th); + if (!rc) + orphan_object_kill(env, obj, mdd, th); + else + CERROR("could not delete object: rc = %d\n", rc); + + mdd_orphan_write_unlock(env, mdd); + mdd_trans_stop(env, mdd, 0, th); + + RETURN(rc); } -static void orph_key_test_and_del(const struct lu_env *env, - struct mdd_device *mdd, - const struct orph_key *key) +static int orph_key_test_and_del(const struct lu_env *env, + struct mdd_device *mdd, + struct lu_fid *lf, + struct dt_key *key) { struct mdd_object *mdo; + int rc; + + mdo = mdd_object_find(env, mdd, lf); - mdo = mdd_object_find(env, mdd, &key->ok_fid); if (IS_ERR(mdo)) - CERROR("Invalid orphan!\n"); - else { - mdd_write_lock(env, mdo, MOR_TGT_CHILD); - if (mdo->mod_count == 0) { - /* non-opened orphan, let's delete it */ - struct md_attr *ma = &mdd_env_info(env)->mti_ma; - CWARN("Found orphan!\n"); - mdd_object_kill(env, mdo, ma); - /* TODO: now handle OST objects */ - //mdd_ost_objects_destroy(env, ma); - /* TODO: destroy index entry */ - } - mdd_write_unlock(env, mdo); - mdd_object_put(env, mdo); + return PTR_ERR(mdo); + + rc = -EBUSY; + if (mdo->mod_count == 0) { + CWARN("Found orphan!\n"); + rc = orphan_object_destroy(env, mdo, key); + } else { + mdo->mod_flags |= ORPHAN_OBJ; } + + mdd_object_put(env, mdo); + return rc; }
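The iterator loop in orph_index_iterate() just below has to cope with the fact that deleting the entry under the cursor invalidates the OSD iterator: it remembers a cookie with iops->store(), and after a successful deletion re-gets the iterator from the start, otherwise reloads the saved position. A toy sketch of that save/restart discipline over a mutable table; every name here is invented for illustration and nothing below is the real dt iterator API:

```c
#include <stdio.h>

/* Toy index: deleting the entry under the cursor invalidates it, as an
 * OSD iterator would be; the scan restarts from the beginning after a
 * delete, or resumes from a saved position ("cookie") otherwise. */
static int entries[] = { 11, 22, 33, 44, 55 };
static int live[]    = {  1,  1,  1,  1,  1 };
#define N ((int)(sizeof(entries) / sizeof(entries[0])))

static int cursor_next(int pos)
{
        while (++pos < N && !live[pos])
                ;                               /* skip deleted slots */
        return pos;
}

int main(void)
{
        int pos = cursor_next(-1);

        while (pos < N) {
                int cookie = pos;               /* iops->store() analogue */

                if (entries[pos] % 2 == 0) {    /* "orphan": delete it */
                        live[pos] = 0;
                        pos = cursor_next(-1);  /* iterator reset after delete */
                        continue;
                }
                pos = cursor_next(cookie);      /* iops->load() + next analogue */
        }
        for (pos = 0; pos < N; pos++)
                if (live[pos])
                        printf("%d ", entries[pos]);
        printf("\n");                           /* prints: 11 33 55 */
        return 0;
}
```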
static int orph_index_iterate(const struct lu_env *env, struct mdd_device *mdd) { - struct dt_object *dt_obj = mdd->mdd_orphans; - struct dt_it *it; + struct dt_object *dor = mdd->mdd_orphans; + char *mti_key = mdd_env_info(env)->mti_orph_key; const struct dt_it_ops *iops; - struct orph_key *key = orph_key_empty(env, 0); - int result; + struct dt_it *it; + char *key; + struct lu_fid fid; + int result = 0; + int key_sz = 0; + int rc; + __u64 cookie; ENTRY; - iops = &dt_obj->do_index_ops->dio_it; - it = iops->init(env, dt_obj, 1, BYPASS_CAPA); + /* In the recovery phase there is no need for any locking here */ + + iops = &dor->do_index_ops->dio_it; + it = iops->init(env, dor, BYPASS_CAPA); if (it != NULL) { - result = iops->get(env, it, (const void *)key); + result = iops->get(env, it, (const void *)""); if (result > 0) { - int i; /* main cycle */ - for (result = 0, i = 0; result == +1; ++i) { + do { + key = (void *)iops->key(env, it); - fid_be_to_cpu(&key->ok_fid, &key->ok_fid); - orph_key_test_and_del(env, mdd, key); + if (IS_ERR(key)) + goto next; + key_sz = iops->key_size(env, it); + + /* filter out "." and ".." entries from + * the PENDING dir. */ + if (key_sz < 8) + goto next; + + memcpy(mti_key, key, key_sz); + mti_key[key_sz] = 0; + + if (orphan_key_to_fid(mti_key, &fid)) + goto next; + if (!fid_is_sane(&fid)) + goto next; + + /* kill orphan object */ + cookie = iops->store(env, it); + iops->put(env, it); + rc = orph_key_test_and_del(env, mdd, &fid, + (struct dt_key *)mti_key); + + /* after an index delete, reset the iterator */ + if (!rc) + result = iops->get(env, it, + (const void *)""); + else + result = iops->load(env, it, cookie); +next: result = iops->next(env, it); - } + } while (result == 0); + result = 0; } else if (result == 0) /* Index contains no zero key? */ result = -EIO; - iops->put(env, it); iops->fini(env, it); } else @@ -184,17 +427,17 @@ int orph_index_init(const struct lu_env *env, struct mdd_device *mdd) { struct lu_fid fid; struct dt_object *d; - int rc; + int rc = 0; ENTRY; - d = dt_store_open(env, mdd->mdd_child, orph_index_name, &fid); + d = dt_store_open(env, mdd->mdd_child, "", orph_index_name, &fid); if (!IS_ERR(d)) { mdd->mdd_orphans = d; - rc = d->do_ops->do_index_try(env, d, &orph_index_features); - if (rc == 0) - LASSERT(d->do_index_ops != NULL); - else - CERROR("\"%s\" is not an index!\n", orph_index_name); + if (!dt_try_as_dir(env, d)) { + rc = -ENOTDIR; + CERROR("\"%s\" is not an index: rc = %d\n", + orph_index_name, rc); + } } else { CERROR("cannot find \"%s\" obj %d\n", orph_index_name, (int)PTR_ERR(d)); @@ -214,18 +457,45 @@ void orph_index_fini(const struct lu_env *env, struct mdd_device *mdd) EXIT; } +/** + * Iterate the orphan index to clean up orphan objects during recovery. + * \param d mdd device in recovery. + * + */ + int __mdd_orphan_cleanup(const struct lu_env *env, struct mdd_device *d) { return orph_index_iterate(env, d); } +/** + * add an orphan \a obj to the orphan index. + * \param obj file or directory. + * \param th transaction for the index insert. + * + * \pre obj nlink == 0 && obj->mod_count != 0 + * + * \retval 0 success + * \retval -ve index operation error. + */ + int __mdd_orphan_add(const struct lu_env *env, struct mdd_object *obj, struct thandle *th) { - loff_t offset = 0; - return orph_index_insert(env, obj, ORPH_OP_UNLINK, &offset, th); + return orph_index_insert(env, obj, ORPH_OP_UNLINK, th); } +/** + * delete an orphan \a obj from the orphan index. + * \param obj file or directory. + * \param th transaction for index deletion and object destruction. + * + * \pre obj->mod_count == 0 && ORPHAN_OBJ is set for obj.
+ * + * \retval 0 success + * \retva -ve index operation error. + */ + int __mdd_orphan_del(const struct lu_env *env, struct mdd_object *obj, struct thandle *th) { diff --git a/lustre/mdd/mdd_permission.c b/lustre/mdd/mdd_permission.c index 7714e61..efbc52a 100644 --- a/lustre/mdd/mdd_permission.c +++ b/lustre/mdd/mdd_permission.c @@ -65,7 +65,7 @@ * Get default acl EA only. * Hold read_lock for mdd_obj. */ -int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj, +int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj, struct md_attr *ma) { struct lu_buf *buf; @@ -74,7 +74,7 @@ int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj, if (ma->ma_valid & MA_ACL_DEF) RETURN(0); - + buf = mdd_buf_get(env, ma->ma_acl, ma->ma_acl_size); rc = mdo_xattr_get(env, mdd_obj, buf, XATTR_NAME_ACL_DEFAULT, BYPASS_CAPA); @@ -91,7 +91,7 @@ int mdd_def_acl_get(const struct lu_env *env, struct mdd_object *mdd_obj, /* * Hold write_lock for o. */ -int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode, +int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode, struct thandle *handle) { struct lu_buf *buf; @@ -102,9 +102,9 @@ int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode, ENTRY; - buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf, + buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf, sizeof(mdd_env_info(env)->mti_xattr_buf)); - + rc = mdo_xattr_get(env, o, buf, XATTR_NAME_ACL_ACCESS, BYPASS_CAPA); if ((rc == -EOPNOTSUPP) || (rc == -ENODATA)) RETURN(0); @@ -118,7 +118,7 @@ int mdd_acl_chmod(const struct lu_env *env, struct mdd_object *o, __u32 mode, sizeof(posix_acl_xattr_entry); if (entry_count <= 0) RETURN(0); - + rc = lustre_posix_acl_chmod_masq(entry, mode, entry_count); if (rc) RETURN(rc); @@ -147,13 +147,13 @@ int __mdd_acl_init(const struct lu_env *env, struct mdd_object *obj, sizeof(posix_acl_xattr_entry); if (entry_count <= 0) RETURN(0); - - if (S_ISDIR(*mode)) { - rc = mdo_xattr_set(env, obj, buf, XATTR_NAME_ACL_DEFAULT, 0, + + if (S_ISDIR(*mode)) { + rc = mdo_xattr_set(env, obj, buf, XATTR_NAME_ACL_DEFAULT, 0, handle, BYPASS_CAPA); if (rc) RETURN(rc); - } + } rc = lustre_posix_acl_create_masq(entry, mode, entry_count); if (rc <= 0) @@ -180,7 +180,7 @@ static int mdd_check_acl(const struct lu_env *env, struct mdd_object *obj, int rc; ENTRY; - buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf, + buf = mdd_buf_get(env, mdd_env_info(env)->mti_xattr_buf, sizeof(mdd_env_info(env)->mti_xattr_buf)); rc = mdo_xattr_get(env, obj, buf, XATTR_NAME_ACL_ACCESS, mdd_object_capa(env, obj)); @@ -270,7 +270,7 @@ check_capabilities: RETURN(-EACCES); } -int mdd_permission(const struct lu_env *env, +int mdd_permission(const struct lu_env *env, struct md_object *pobj, struct md_object *cobj, struct md_attr *ma, int mask) { @@ -386,7 +386,7 @@ int mdd_capa_get(const struct lu_env *env, struct md_object *obj, capa->lc_opc); if (IS_ERR(oc)) { rc = PTR_ERR(oc); - } else { + } else if (likely(oc != NULL)) { capa_cpy(capa, oc); capa_put(oc); } diff --git a/lustre/mdd/mdd_quota.c b/lustre/mdd/mdd_quota.c new file mode 100644 index 0000000..d246a07 --- /dev/null +++ b/lustre/mdd/mdd_quota.c @@ -0,0 +1,274 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. 
+ * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * lustre/mdd/mdd_quota.c + * + * Lustre Metadata Server (mdd) routines + * + * Author: Fan Yong + */ + +#ifdef HAVE_QUOTA_SUPPORT + +#include "mdd_internal.h" + +int mdd_quota_notify(const struct lu_env *env, struct md_device *m) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + ENTRY; + + lquota_setinfo(mds_quota_interface_ref, obd, (void *)1); + RETURN(0); +} + +int mdd_quota_setup(const struct lu_env *env, struct md_device *m, + void *data) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct dt_device *dt = mdd->mdd_child; + int rc; + ENTRY; + + LASSERT(obd->obd_fsops != NULL); + dt->dd_ops->dt_init_quota_ctxt(env, dt, (void *)obd, data); + rc = lquota_setup(mds_quota_interface_ref, obd); + RETURN(rc); +} + +int mdd_quota_cleanup(const struct lu_env *env, struct md_device *m) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + int rc1, rc2; + ENTRY; + + rc1 = lquota_cleanup(mds_quota_interface_ref, obd); + rc2 = lquota_fs_cleanup(mds_quota_interface_ref, obd); + RETURN(rc1 ? 
: rc2); +} + +int mdd_quota_recovery(const struct lu_env *env, struct md_device *m) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + int rc; + ENTRY; + + rc = lquota_recovery(mds_quota_interface_ref, obd); + RETURN(rc); +} + +int mdd_quota_check(const struct lu_env *env, struct md_device *m, + struct obd_export *exp, __u32 type) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_type = type; + rc = lquota_check(mds_quota_interface_ref, obd, exp, oqctl); + RETURN(rc); +} + +int mdd_quota_on(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_QUOTAON; + oqctl->qc_type = type; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + RETURN(rc); +} + +int mdd_quota_off(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_QUOTAOFF; + oqctl->qc_type = type; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + RETURN(rc); +} + +int mdd_quota_setinfo(const struct lu_env *env, struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_SETINFO; + oqctl->qc_type = type; + oqctl->qc_id = id; + oqctl->qc_dqinfo = *dqinfo; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + RETURN(rc); +} + +int mdd_quota_getinfo(const struct lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqinfo *dqinfo) +{ + struct mdd_device *mdd = lu2mdd_dev( + &((struct md_device *)m)->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_GETINFO; + oqctl->qc_type = type; + oqctl->qc_id = id; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + *dqinfo = oqctl->qc_dqinfo; + RETURN(rc); +} + +int mdd_quota_setquota(const struct lu_env *env, struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_SETQUOTA; + oqctl->qc_type = type; + oqctl->qc_id = id; + oqctl->qc_dqblk = *dqblk; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + RETURN(rc); +} + +int mdd_quota_getquota(const struct lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk) +{ + struct mdd_device *mdd = lu2mdd_dev( + &((struct md_device *)m)->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_GETQUOTA; + oqctl->qc_type = type; + oqctl->qc_id = id; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + *dqblk = oqctl->qc_dqblk; + RETURN(rc); +} + +int mdd_quota_getoinfo(const struct lu_env *env, const struct md_device *m, + __u32 
type, __u32 id, struct obd_dqinfo *dqinfo) +{ + struct mdd_device *mdd = lu2mdd_dev( + &((struct md_device *)m)->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_GETOINFO; + oqctl->qc_type = type; + oqctl->qc_id = id; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + *dqinfo = oqctl->qc_dqinfo; + RETURN(rc); +} + +int mdd_quota_getoquota(const struct lu_env *env, const struct md_device *m, + __u32 type, __u32 id, struct obd_dqblk *dqblk) +{ + struct mdd_device *mdd = lu2mdd_dev( + &((struct md_device *)m)->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = Q_GETOQUOTA; + oqctl->qc_type = type; + oqctl->qc_id = id; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + *dqblk = oqctl->qc_dqblk; + RETURN(rc); +} + +int mdd_quota_invalidate(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = LUSTRE_Q_INVALIDATE; + oqctl->qc_type = type; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + RETURN(rc); +} + +int mdd_quota_finvalidate(const struct lu_env *env, struct md_device *m, + __u32 type) +{ + struct mdd_device *mdd = lu2mdd_dev(&m->md_lu_dev); + struct obd_device *obd = mdd->mdd_obd_dev; + struct obd_quotactl *oqctl = &mdd_env_info(env)->mti_oqctl; + int rc; + ENTRY; + + oqctl->qc_cmd = LUSTRE_Q_FINVALIDATE; + oqctl->qc_type = type; + rc = lquota_ctl(mds_quota_interface_ref, obd, oqctl); + RETURN(rc); +} +#endif diff --git a/lustre/mdd/mdd_trans.c b/lustre/mdd/mdd_trans.c index 01ab561..947ef75 100644 --- a/lustre/mdd/mdd_trans.c +++ b/lustre/mdd/mdd_trans.c @@ -135,6 +135,20 @@ int mdd_log_txn_param_build(const struct lu_env *env, struct md_object *obj, RETURN(rc); } +int mdd_setattr_txn_param_build(const struct lu_env *env, struct md_object *obj, + struct md_attr *ma, enum mdd_txn_op op) +{ + struct mdd_device *mdd = mdo2mdd(&md2mdd_obj(obj)->mod_obj); + ENTRY; + + mdd_txn_param_build(env, mdd, op); + if (ma->ma_attr.la_valid & (LA_UID | LA_GID)) + mdd_env_info(env)->mti_param.tp_credits = + dto_txn_credits[DTO_ATTR_SET_CHOWN]; + + RETURN(0); +} + static void mdd_txn_init_dto_credits(const struct lu_env *env, struct mdd_device *mdd, int *dto_credits) { @@ -161,16 +175,18 @@ int mdd_txn_init_credits(const struct lu_env *env, struct mdd_device *mdd) mdd->mdd_tod[op].mod_op = op; switch(op) { case MDD_TXN_OBJECT_DESTROY_OP: + /* Unused now */ *c = dt[DTO_OBJECT_DELETE]; break; case MDD_TXN_OBJECT_CREATE_OP: - /* OI_INSERT + CREATE OBJECT */ + /* OI INSERT + CREATE OBJECT */ *c = dt[DTO_INDEX_INSERT] + - dt[DTO_OBJECT_CREATE]; + dt[DTO_OBJECT_CREATE]; break; case MDD_TXN_ATTR_SET_OP: /* ATTR set + XATTR(lsm, lmv) set */ - *c = dt[DTO_ATTR_SET] + dt[DTO_XATTR_SET]; + *c = dt[DTO_ATTR_SET_BASE] + + dt[DTO_XATTR_SET]; break; case MDD_TXN_XATTR_SET_OP: *c = dt[DTO_XATTR_SET]; @@ -185,18 +201,28 @@ int mdd_txn_init_credits(const struct lu_env *env, struct mdd_device *mdd) *c = dt[DTO_INDEX_INSERT]; break; case MDD_TXN_UNLINK_OP: - /* delete index + Unlink log */ - *c = dt[DTO_INDEX_DELETE]; + /* delete index + Unlink log + + * mdd orphan handling */ + *c = dt[DTO_INDEX_DELETE] + + dt[DTO_INDEX_DELETE] + + dt[DTO_INDEX_INSERT] * 2 + + dt[DTO_XATTR_SET] * 3; 
break; case MDD_TXN_RENAME_OP: /* 2 delete index + 1 insert + Unlink log */ *c = 2 * dt[DTO_INDEX_DELETE] + - dt[DTO_INDEX_INSERT]; + dt[DTO_INDEX_INSERT] + + dt[DTO_INDEX_DELETE] + + dt[DTO_INDEX_INSERT] * 2 + + dt[DTO_XATTR_SET] * 3; break; case MDD_TXN_RENAME_TGT_OP: /* index insert + index delete */ *c = dt[DTO_INDEX_DELETE] + - dt[DTO_INDEX_INSERT]; + dt[DTO_INDEX_INSERT] + + dt[DTO_INDEX_DELETE] + + dt[DTO_INDEX_INSERT] * 2 + + dt[DTO_XATTR_SET] * 3; break; case MDD_TXN_CREATE_DATA_OP: /* same as set xattr(lsm) */ @@ -209,7 +235,7 @@ int mdd_txn_init_credits(const struct lu_env *env, struct mdd_device *mdd) * CREATE_OBJECT CREDITS */ *c = 2 * dt[DTO_INDEX_INSERT] + - dt[DTO_OBJECT_CREATE]; + dt[DTO_OBJECT_CREATE]; break; default: CERROR("Invalid op %d init its credit\n", op); diff --git a/lustre/mds/handler.c b/lustre/mds/handler.c index 5bf89a2..6e5c40a 100644 --- a/lustre/mds/handler.c +++ b/lustre/mds/handler.c @@ -60,7 +60,6 @@ #include #include #include -#include #include #include @@ -87,9 +86,6 @@ struct dentry *mds_fid2dentry(struct mds_obd *mds, struct ll_fid *fid, snprintf(fid_name, sizeof(fid_name), "0x%lx", ino); - CDEBUG(D_DENTRY, "--> mds_fid2dentry: ino/gen %lu/%u, sb %p\n", - ino, generation, mds->mds_obt.obt_sb); - /* under ext3 this is neither supposed to return bad inodes nor NULL inodes. */ result = ll_lookup_one_len(fid_name, mds->mds_fid_de, strlen(fid_name)); @@ -253,9 +249,6 @@ int mds_postrecov(struct obd_device *obd) obd->obd_async_recov ? OBD_NOTIFY_SYNC_NONBLOCK : OBD_NOTIFY_SYNC, NULL); - /* quota recovery */ - lquota_recovery(mds_quota_interface_ref, obd); - RETURN(rc); } @@ -311,9 +304,6 @@ struct lvfs_callback_ops mds_lvfs_ops = { l_fid2dentry: mds_lvfs_fid2dentry, }; -quota_interface_t *mds_quota_interface_ref; -extern quota_interface_t mds_quota_interface; - static void mds_init_ctxt(struct obd_device *obd, struct vfsmount *mnt) { struct mds_obd *mds = &obd->u.mds; @@ -414,9 +404,6 @@ static int mds_cmd_setup(struct obd_device *obd, struct lustre_cfg *lcfg) if (rc) GOTO(err_objects, rc); - mds->mds_max_mdsize = sizeof(struct lov_mds_md_v3); - mds->mds_max_cookiesize = sizeof(struct llog_cookie); - err_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); RETURN(rc); @@ -442,6 +429,9 @@ static int mds_cmd_cleanup(struct obd_device *obd) LCONSOLE_WARN("%s: shutting down for failover; client state " "will be preserved.\n", obd->obd_name); + if (strncmp(obd->obd_name, MDD_OBD_NAME, strlen(MDD_OBD_NAME))) + RETURN(0); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); mds_lov_destroy_objids(obd); @@ -480,9 +470,23 @@ static struct obd_ops mds_cmd_obd_ops = { // .o_health_check = mds_cmd_health_check, }; +quota_interface_t *mds_quota_interface_ref; +extern quota_interface_t mds_quota_interface; + static int __init mds_cmd_init(void) { struct lprocfs_static_vars lvars; + int rc; + + request_module("lquota"); + mds_quota_interface_ref = PORTAL_SYMBOL_GET(mds_quota_interface); + rc = lquota_init(mds_quota_interface_ref); + if (rc) { + if (mds_quota_interface_ref) + PORTAL_SYMBOL_PUT(mds_quota_interface); + return rc; + } + init_obd_quota_ops(mds_quota_interface_ref, &mds_cmd_obd_ops); lprocfs_mds_init_vars(&lvars); class_register_type(&mds_cmd_obd_ops, NULL, lvars.module_vars, @@ -493,9 +497,14 @@ static int __init mds_cmd_init(void) static void /*__exit*/ mds_cmd_exit(void) { + lquota_exit(mds_quota_interface_ref); + if (mds_quota_interface_ref) + PORTAL_SYMBOL_PUT(mds_quota_interface); + class_unregister_type(LUSTRE_MDS_NAME); } 
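mds_cmd_init()/mds_cmd_exit() above pull in the quota interface with request_module()/PORTAL_SYMBOL_GET() and drop the module reference again when initialization fails or the module unloads. A compact sketch of that get/init/put-on-failure pattern follows; the symbol machinery here is an invented stand-in, not the real Portals/Lustre API:

```c
#include <stdio.h>

/* Invented stand-ins; the real code uses request_module(),
 * PORTAL_SYMBOL_GET()/PORTAL_SYMBOL_PUT() and lquota_init(). */
struct quota_iface { int usable; };

static struct quota_iface iface = { 1 };

static struct quota_iface *symbol_get(void)   { return &iface; }
static void symbol_put(struct quota_iface *i) { (void)i; }
static int quota_init(struct quota_iface *i)  { return (i && i->usable) ? 0 : -22; }

/* Acquire the optional interface, initialize it, and undo the get on
 * failure so no dangling module reference is left behind. */
static int cmd_init(struct quota_iface **ref)
{
        int rc;

        *ref = symbol_get();            /* may legitimately be NULL */
        rc = quota_init(*ref);
        if (rc && *ref) {
                symbol_put(*ref);
                *ref = NULL;
        }
        return rc;
}

int main(void)
{
        struct quota_iface *ref = NULL;

        printf("cmd_init rc = %d, ref %s\n", cmd_init(&ref),
               ref ? "held" : "dropped");
        return 0;
}
```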
+EXPORT_SYMBOL(mds_quota_interface_ref); MODULE_AUTHOR("Sun Microsystems, Inc. "); MODULE_DESCRIPTION("Lustre Metadata Server (MDS)"); MODULE_LICENSE("GPL"); diff --git a/lustre/mds/lproc_mds.c b/lustre/mds/lproc_mds.c index 138eafa..58aac97 100644 --- a/lustre/mds/lproc_mds.c +++ b/lustre/mds/lproc_mds.c @@ -110,7 +110,7 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer, ptlrpc_check_set(NULL, set); } - /* See the comments in function lprocfs_wr_evict_client() + /* See the comments in function lprocfs_wr_evict_client() * in ptlrpc/lproc_ptlrpc.c for details. - jay */ class_incref(obd, __FUNCTION__, cfs_current()); LPROCFS_EXIT(); @@ -130,162 +130,6 @@ static int lprocfs_mds_wr_evict_client(struct file *file, const char *buffer, return count; } -#if 0 -static int lprocfs_wr_group_info(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = data; - struct mds_obd *mds = &obd->u.mds; - struct mds_grp_downcall_data sparam, *param = &sparam; - int size = 0, rc = count; - - if (count < sizeof(param)) { - CERROR("%s: invalid data size %lu\n", obd->obd_name, count); - return count; - } - - if (copy_from_user(param, buffer, sizeof(*param)) || - param->mgd_magic != MDS_GRP_DOWNCALL_MAGIC) { - CERROR("%s: MDS group downcall bad params\n", obd->obd_name); - return count; - } - - if (param->mgd_ngroups > NGROUPS_MAX) { - CWARN("%s: uid %u groups %d more than maximum %d\n", - obd->obd_name, param->mgd_uid, param->mgd_ngroups, - NGROUPS_MAX); - param->mgd_ngroups = NGROUPS_MAX; - } - - if (param->mgd_ngroups > 0) { - size = offsetof(struct mds_grp_downcall_data, - mgd_groups[param->mgd_ngroups]); - OBD_ALLOC(param, size); - if (!param) { - CERROR("%s: fail to alloc %d bytes for uid %u" - " with %d groups\n", obd->obd_name, size, - sparam.mgd_uid, sparam.mgd_ngroups); - param = &sparam; - param->mgd_ngroups = 0; - } else if (copy_from_user(param, buffer, size)) { - CERROR("%s: uid %u bad supplementary group data\n", - obd->obd_name, sparam.mgd_uid); - OBD_FREE(param, size); - param = &sparam; - param->mgd_ngroups = 0; - } - } - rc = upcall_cache_downcall(mds->mds_group_hash, param->mgd_err, - param->mgd_uid, param->mgd_gid, - param->mgd_ngroups, param->mgd_groups); - - if (param && param != &sparam) - OBD_FREE(param, size); - - return rc; -} - -static int lprocfs_rd_group_expire(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct obd_device *obd = data; - - *eof = 1; - return snprintf(page, count, "%lu\n", - obd->u.mds.mds_group_hash->uc_entry_expire / HZ); -} - -static int lprocfs_wr_group_expire(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = data; - int val, rc; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - if (val > 5) - obd->u.mds.mds_group_hash->uc_entry_expire = val * HZ; - else - CERROR("invalid expire time %u for group cache\n", val); - - return count; -} - -static int lprocfs_rd_group_acquire_expire(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct obd_device *obd = data; - - *eof = 1; - return snprintf(page, count, "%lu\n", - obd->u.mds.mds_group_hash->uc_acquire_expire / HZ); -} - -static int lprocfs_wr_group_acquire_expire(struct file *file,const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = data; - int val, rc = 0; - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - if (val > 2) - 
obd->u.mds.mds_group_hash->uc_acquire_expire = val * HZ; - - return count; -} - -static int lprocfs_rd_group_upcall(char *page, char **start, off_t off, - int count, int *eof, void *data) -{ - struct obd_device *obd = data; - - *eof = 1; - return snprintf(page, count, "%s\n", - obd->u.mds.mds_group_hash->uc_upcall); -} - -static int lprocfs_wr_group_upcall(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = data; - struct upcall_cache *hash = obd->u.mds.mds_group_hash; - char kernbuf[UC_CACHE_UPCALL_MAXPATH] = { '\0' }; - - if (count >= UC_CACHE_UPCALL_MAXPATH) { - CERROR("%s: group upcall too long\n", obd->obd_name); - return -EINVAL; - } - - if (copy_from_user(kernbuf, buffer, - min(count, UC_CACHE_UPCALL_MAXPATH - 1))) - return -EFAULT; - - /* Remove any extraneous bits from the upcall (e.g. linefeeds) */ - sscanf(kernbuf, "%s", hash->uc_upcall); - - if (strcmp(hash->uc_name, obd->obd_name) != 0) - CWARN("%s: write to upcall name %s for MDS %s\n", - obd->obd_name, hash->uc_upcall, obd->obd_name); - CWARN("%s: group upcall set to %s\n", obd->obd_name, hash->uc_upcall); - - return count; -} - -static int lprocfs_wr_group_flush(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = data; - - upcall_cache_flush_idle(obd->u.mds.mds_group_hash); - return count; -} -#endif - static int lprocfs_wr_atime_diff(struct file *file, const char *buffer, unsigned long count, void *data) { @@ -336,23 +180,6 @@ struct lprocfs_vars lprocfs_mds_obd_vars[] = { { "evict_ost_nids", lprocfs_mds_rd_evictostnids, lprocfs_mds_wr_evictostnids, 0 }, { "num_exports", lprocfs_rd_num_exports, 0, 0 }, -#ifdef HAVE_QUOTA_SUPPORT - { "quota_bunit_sz", lprocfs_rd_bunit, lprocfs_wr_bunit, 0 }, - { "quota_btune_sz", lprocfs_rd_btune, lprocfs_wr_btune, 0 }, - { "quota_iunit_sz", lprocfs_rd_iunit, lprocfs_wr_iunit, 0 }, - { "quota_itune_sz", lprocfs_rd_itune, lprocfs_wr_itune, 0 }, - { "quota_type", lprocfs_rd_type, lprocfs_wr_type, 0 }, -#endif -#if 0 - { "group_expire_interval", lprocfs_rd_group_expire, - lprocfs_wr_group_expire, 0}, - { "group_acquire_expire", lprocfs_rd_group_acquire_expire, - lprocfs_wr_group_acquire_expire, 0}, - { "group_upcall", lprocfs_rd_group_upcall, - lprocfs_wr_group_upcall, 0}, - { "group_flush", 0, lprocfs_wr_group_flush, 0}, - { "group_info", 0, lprocfs_wr_group_info, 0 }, -#endif { "atime_diff", lprocfs_rd_atime_diff, lprocfs_wr_atime_diff, 0 }, { 0 } }; diff --git a/lustre/mds/mds_fs.c b/lustre/mds/mds_fs.c index ec83e30..ce287c6 100644 --- a/lustre/mds/mds_fs.c +++ b/lustre/mds/mds_fs.c @@ -46,7 +46,6 @@ #include #include #include -#include #include #include #include @@ -84,7 +83,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, strlen(MDD_OBD_NAME))) { RETURN(0); } - + push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, &ucred); sprintf(fidname, "OBJECTS/%u.%u", tmpname, current->pid); @@ -126,7 +125,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, lock_kernel(); rc = ll_vfs_rename(mds->mds_objects_dir->d_inode, filp->f_dentry, - filp->f_vfsmnt, mds->mds_objects_dir->d_inode, + filp->f_vfsmnt, mds->mds_objects_dir->d_inode, new_child, filp->f_vfsmnt); unlock_kernel(); if (rc) @@ -136,7 +135,7 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, err = fsfilt_commit(exp->exp_obd, mds->mds_objects_dir->d_inode, handle, 0); if (!err) { - oa->o_gr = FILTER_GROUP_MDS0 + mds->mds_id; + oa->o_gr = mdt_to_obd_objgrp(mds->mds_id); oa->o_valid |= OBD_MD_FLID | 
OBD_MD_FLGENER | OBD_MD_FLGROUP; } else if (!rc) rc = err; @@ -157,7 +156,7 @@ out_pop: int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_exp) + struct obd_export *md_exp, void *capa) { struct mds_obd *mds = &exp->exp_obd->u.mds; struct inode *parent_inode = mds->mds_objects_dir->d_inode; diff --git a/lustre/mds/mds_internal.h b/lustre/mds/mds_internal.h index 9cf0e71..c98aefa 100644 --- a/lustre/mds/mds_internal.h +++ b/lustre/mds/mds_internal.h @@ -72,12 +72,10 @@ int mds_obd_create(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md **ea, struct obd_trans_info *oti); int mds_obd_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_exp); + struct obd_export *md_exp, void *capa); /* mds/handler.c */ extern struct lvfs_callback_ops mds_lvfs_ops; -/* quota stuff */ -extern quota_interface_t *mds_quota_interface_ref; /* mds/lproc_mds.c */ enum { diff --git a/lustre/mds/mds_lov.c b/lustre/mds/mds_lov.c index bd9295a..7886c1c 100644 --- a/lustre/mds/mds_lov.c +++ b/lustre/mds/mds_lov.c @@ -49,6 +49,7 @@ #include #include #include +#include #include "mds_internal.h" @@ -159,6 +160,174 @@ void mds_lov_destroy_objids(struct obd_device *obd) EXIT; } +/** + * There are currently two ways to learn the OST count and the maximum OST + * index: first, after an OST has connected to the MDS and the sync process + * has finished; second, from the lmm during recovery, for the case when the + * MDS does not have the configs and the OST is not registered in the MGS. + * + * \param mds pointer to mds structure + * \param index maximum ost index + * + * \retval -ENOMEM if there is no memory for a new page + * \retval 0 if the update succeeded + */ +static int mds_lov_update_max_ost(struct mds_obd *mds, obd_id index) +{ + __u32 page = index / OBJID_PER_PAGE(); + __u32 off = index % OBJID_PER_PAGE(); + obd_id *data = mds->mds_lov_page_array[page]; + + if (data == NULL) { + OBD_ALLOC(data, MDS_LOV_ALLOC_SIZE); + if (data == NULL) + RETURN(-ENOMEM); + + mds->mds_lov_page_array[page] = data; + } + + if (index > mds->mds_lov_objid_max_index) { + mds->mds_lov_objid_lastpage = page; + mds->mds_lov_objid_lastidx = off; + mds->mds_lov_objid_max_index = index; + } + + /* workaround - New target not in objids file; increase mdsize */ + /* ld_tgt_count is used as the max index everywhere, despite its name. */
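mds_lov_update_max_ost() above keeps the per-OST last-object ids in a two-level table: the flat OST index is split into a page number and an offset, and pages are allocated lazily on first use. A self-contained sketch of that lookup follows; the sizes and names are illustrative and do not reflect the real MDS_LOV_ALLOC_SIZE layout:

```c
#include <stdio.h>
#include <stdlib.h>

#define OBJID_PER_PAGE 512              /* illustrative page capacity */
#define MAX_PAGES      64

static unsigned long long *pages[MAX_PAGES];

/* Return the slot for a flat index, allocating its page on first use;
 * NULL corresponds to the -ENOMEM path in the hunk above. */
static unsigned long long *objid_slot(unsigned idx)
{
        unsigned page = idx / OBJID_PER_PAGE;
        unsigned off  = idx % OBJID_PER_PAGE;

        if (page >= MAX_PAGES)
                return NULL;
        if (pages[page] == NULL) {
                pages[page] = calloc(OBJID_PER_PAGE, sizeof(**pages));
                if (pages[page] == NULL)
                        return NULL;
        }
        return &pages[page][off];
}

int main(void)
{
        unsigned long long *slot = objid_slot(1000);

        if (slot != NULL) {
                *slot = 42;             /* last allocated object id */
                printf("objid[1000] = %llu (page %u, off %u)\n",
                       *slot, 1000u / OBJID_PER_PAGE, 1000u % OBJID_PER_PAGE);
        }
        return 0;
}
```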
+ if (data[off] == 0) { + __u32 stripes; + + data[off] = 1; + mds->mds_lov_objid_count++; + stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT, + mds->mds_lov_objid_count); + + mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3); + mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie); + + CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d" + " stripes: %d/%d\n", stripes, mds->mds_max_mdsize, + mds->mds_max_cookiesize); + } + + EXIT; + return 0; +} + +int mds_lov_prepare_objids(struct obd_device *obd, struct lov_mds_md *lmm) +{ + struct lov_ost_data_v1 *data; + __u32 count; + int rc = 0; + __u32 j; + + /* if we create a file without objects - lmm is NULL */ + if (lmm == NULL) + return 0; + + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + count = le32_to_cpu(((struct lov_mds_md_v1*)lmm)->lmm_stripe_count); + data = &(((struct lov_mds_md_v1*)lmm)->lmm_objects[0]); + break; + case LOV_MAGIC_V3: + count = le32_to_cpu(((struct lov_mds_md_v3*)lmm)->lmm_stripe_count); + data = &(((struct lov_mds_md_v3*)lmm)->lmm_objects[0]); + break; + default: + CERROR("Unknown lmm type %X!\n", le32_to_cpu(lmm->lmm_magic)); + RETURN(-EINVAL); + } + + + mutex_down(&obd->obd_dev_sem); + for (j = 0; j < count; j++) { + __u32 i = le32_to_cpu(data[j].l_ost_idx); + if (mds_lov_update_max_ost(&obd->u.mds, i)) { + rc = -ENOMEM; + break; + } + } + mutex_up(&obd->obd_dev_sem); + + RETURN(rc); +} +EXPORT_SYMBOL(mds_lov_prepare_objids); + +void mds_lov_update_objids(struct obd_device *obd, struct lov_mds_md *lmm) +{ + struct mds_obd *mds = &obd->u.mds; + int j; + struct lov_ost_data_v1 *obj; + int count; + ENTRY; + + /* if we create a file without objects - lmm is NULL */ + if (lmm == NULL) + return; + + switch (le32_to_cpu(lmm->lmm_magic)) { + case LOV_MAGIC_V1: + count = le32_to_cpu(((struct lov_mds_md_v1*)lmm)->lmm_stripe_count); + obj = &(((struct lov_mds_md_v1*)lmm)->lmm_objects[0]); + break; + case LOV_MAGIC_V3: + count = le32_to_cpu(((struct lov_mds_md_v3*)lmm)->lmm_stripe_count); + obj = &(((struct lov_mds_md_v3*)lmm)->lmm_objects[0]); + break; + default: + CERROR("Unknown lmm type %X!\n", le32_to_cpu(lmm->lmm_magic)); + return; + } + + for (j = 0; j < count; j++) { + __u32 i = le32_to_cpu(obj[j].l_ost_idx); + obd_id id = le64_to_cpu(obj[j].l_object_id); + __u32 page = i / OBJID_PER_PAGE(); + __u32 idx = i % OBJID_PER_PAGE(); + obd_id *data; + + data = mds->mds_lov_page_array[page]; + + CDEBUG(D_INODE, "update last object for ost %u" + " - new "LPU64" old "LPU64"\n", i, id, data[idx]); + if (id > data[idx]) { + data[idx] = id; + cfs_bitmap_set(mds->mds_lov_page_dirty, page); + } + } + EXIT; + return; +} +EXPORT_SYMBOL(mds_lov_update_objids); + + +static int mds_lov_update_from_read(struct mds_obd *mds, obd_id *data, + __u32 count) +{ + __u32 i; + __u32 stripes; + + for (i = 0; i < count; i++) { + if (data[i] == 0) + continue; + + mds->mds_lov_objid_count++; + } + + stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT, + mds->mds_lov_objid_count); + + mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3); + mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie); + + CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d stripes: " + "%d/%d\n", stripes, mds->mds_max_mdsize, mds->mds_max_cookiesize); + + EXIT; + return 0; +} + static int mds_lov_read_objids(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; @@ -178,34 +347,33 @@ static int mds_lov_read_objids(struct obd_device *obd) CDEBUG(D_INFO, "file size %lu pages %d\n", size, page); for (i = 0; i < page; i++) { -
obd_id *data = mds->mds_lov_page_array[i]; loff_t off_old = off; - LASSERT(data == NULL); - OBD_ALLOC(data, MDS_LOV_ALLOC_SIZE); - if (data == NULL) + LASSERT(mds->mds_lov_page_array[i] == NULL); + OBD_ALLOC(mds->mds_lov_page_array[i], MDS_LOV_ALLOC_SIZE); + if (mds->mds_lov_page_array[i] == NULL) GOTO(out, rc = -ENOMEM); - mds->mds_lov_page_array[i] = data; - - rc = fsfilt_read_record(obd, mds->mds_lov_objid_filp, data, + rc = fsfilt_read_record(obd, mds->mds_lov_objid_filp, mds->mds_lov_page_array[i], OBJID_PER_PAGE()*sizeof(obd_id), &off); if (rc < 0) { CERROR("Error reading objids %d\n", rc); GOTO(out, rc); } - if (off == off_old) - break; // eof count += (off - off_old)/sizeof(obd_id); + if (mds_lov_update_from_read(mds, mds->mds_lov_page_array[i], count)) { + CERROR("Can't update mds data\n"); + GOTO(out, rc = -EIO); + } + + if (off == off_old) + break; // eof } - mds->mds_lov_objid_count = count; - if (count) { - count --; - mds->mds_lov_objid_lastpage = count / OBJID_PER_PAGE(); - mds->mds_lov_objid_lastidx = count % OBJID_PER_PAGE(); - } - CDEBUG(D_INFO, "Read %u - %u %u objid\n", count, + mds->mds_lov_objid_lastpage = i; + mds->mds_lov_objid_lastidx = count % OBJID_PER_PAGE(); + + CDEBUG(D_INFO, "Read %u - %u %u objid\n", mds->mds_lov_objid_count, mds->mds_lov_objid_lastpage, mds->mds_lov_objid_lastidx); out: mds_lov_dump_objids("read",obd); @@ -249,7 +417,7 @@ int mds_lov_write_objids(struct obd_device *obd) EXPORT_SYMBOL(mds_lov_write_objids); static int mds_lov_get_objid(struct obd_device * obd, - __u32 idx) + obd_id idx) { struct mds_obd *mds = &obd->u.mds; unsigned int page; @@ -261,14 +429,6 @@ static int mds_lov_get_objid(struct obd_device * obd, page = idx / OBJID_PER_PAGE(); off = idx % OBJID_PER_PAGE(); data = mds->mds_lov_page_array[page]; - if (data == NULL) { - OBD_ALLOC(data, MDS_LOV_ALLOC_SIZE); - if (data == NULL) - GOTO(out, rc = -ENOMEM); - - mds->mds_lov_page_array[page] = data; - } - if (data[off] == 0) { /* We never read this lastid; ask the osc */ struct obd_id_info lastid; @@ -281,11 +441,6 @@ static int mds_lov_get_objid(struct obd_device * obd, if (rc) GOTO(out, rc); - if (idx > mds->mds_lov_objid_count) { - mds->mds_lov_objid_count = idx; - mds->mds_lov_objid_lastpage = page; - mds->mds_lov_objid_lastidx = off; - } cfs_bitmap_set(mds->mds_lov_page_dirty, page); } out: @@ -307,10 +462,10 @@ int mds_lov_clear_orphans(struct mds_obd *mds, struct obd_uuid *ost_uuid) * objects above this ID, they will be removed. 
*/ memset(&oa, 0, sizeof(oa)); oa.o_flags = OBD_FL_DELORPHAN; - oa.o_gr = FILTER_GROUP_MDS0 + mds->mds_id; + oa.o_gr = mdt_to_obd_objgrp(mds->mds_id); oa.o_valid = OBD_MD_FLFLAGS | OBD_MD_FLGROUP; if (ost_uuid != NULL) - oti.oti_ost_uuid = ost_uuid; + oti.oti_ost_uuid = ost_uuid; rc = obd_create(mds->mds_osc_exp, &oa, &empty_ea, &oti); RETURN(rc); @@ -326,9 +481,6 @@ static int mds_lov_set_one_nextid(struct obd_device *obd, __u32 idx, obd_id *id) LASSERT(!obd->obd_recovering); - /* obd->obd_dev_sem must be held so mds_lov_objids doesn't change */ - LASSERT_SEM_LOCKED(&obd->obd_dev_sem); - info.idx = idx; info.data = id; rc = obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_NEXT_ID), @@ -346,7 +498,7 @@ static int mds_lov_update_desc(struct obd_device *obd, int idx, { struct mds_obd *mds = &obd->u.mds; struct lov_desc *ld; - __u32 stripes, valsize = sizeof(mds->mds_lov_desc); + __u32 valsize = sizeof(mds->mds_lov_desc); int rc = 0; ENTRY; @@ -365,14 +517,12 @@ static int mds_lov_update_desc(struct obd_device *obd, int idx, CDEBUG(D_CONFIG, "updated lov_desc, tgt_count: %d - idx %d / uuid %s\n", mds->mds_lov_desc.ld_tgt_count, idx, uuid->uuid); - stripes = min_t(__u32, LOV_MAX_STRIPE_COUNT, - mds->mds_lov_desc.ld_tgt_count); + mutex_down(&obd->obd_dev_sem); + rc = mds_lov_update_max_ost(mds, idx); + mutex_up(&obd->obd_dev_sem); + if (rc != 0) + GOTO(out, rc ); - mds->mds_max_mdsize = lov_mds_md_size(stripes, LOV_MAGIC_V3); - mds->mds_max_cookiesize = stripes * sizeof(struct llog_cookie); - CDEBUG(D_CONFIG, "updated max_mdsize/max_cookiesize for %d stripes: " - "%d/%d\n", mds->mds_max_mdsize, mds->mds_max_cookiesize, - stripes); /* If we added a target we have to reconnect the llogs */ /* We only _need_ to do this at first add (idx), or the first time @@ -384,7 +534,7 @@ static int mds_lov_update_desc(struct obd_device *obd, int idx, /*XXX this notifies the MDD until lov handling use old mds code */ if (obd->obd_upcall.onu_owner) { LASSERT(obd->obd_upcall.onu_upcall != NULL); - rc = obd->obd_upcall.onu_upcall(NULL, NULL, 0, + rc = obd->obd_upcall.onu_upcall(obd, NULL, OBD_NOTIFY_ACTIVE, obd->obd_upcall.onu_owner); } out: @@ -406,8 +556,6 @@ static int mds_lov_update_mds(struct obd_device *obd, ENTRY; /* Don't let anyone else mess with mds_lov_objids now */ - mutex_down(&obd->obd_dev_sem); - rc = mds_lov_update_desc(obd, idx, &watched->u.cli.cl_target_uuid); if (rc) GOTO(out, rc); @@ -446,7 +594,6 @@ static int mds_lov_update_mds(struct obd_device *obd, data[off], idx, rc); } out: - mutex_up(&obd->obd_dev_sem); RETURN(rc); } @@ -472,18 +619,40 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) RETURN(-ENOTCONN); } + mutex_down(&obd->obd_dev_sem); + rc = mds_lov_read_objids(obd); + mutex_up(&obd->obd_dev_sem); + if (rc) { + CERROR("cannot read %s: rc = %d\n", "lov_objids", rc); + GOTO(err_exit, rc); + } + + rc = obd_register_observer(mds->mds_osc_obd, obd); + if (rc) { + CERROR("MDS cannot register as observer of LOV %s (%d)\n", + lov_name, rc); + GOTO(err_exit, rc); + } + + mds->mds_osc_obd->u.lov.lov_sp_me = LUSTRE_SP_MDT; + OBD_ALLOC(data, sizeof(*data)); if (data == NULL) RETURN(-ENOMEM); data->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_INDEX | OBD_CONNECT_REQPORTAL | OBD_CONNECT_QUOTA64 | OBD_CONNECT_OSS_CAPA | OBD_CONNECT_FID | - OBD_CONNECT_AT; + OBD_CONNECT_BRW_SIZE | OBD_CONNECT_CKSUM | + OBD_CONNECT_AT | OBD_CONNECT_CHANGE_QS; #ifdef HAVE_LRU_RESIZE_SUPPORT data->ocd_connect_flags |= OBD_CONNECT_LRU_RESIZE; #endif data->ocd_version = LUSTRE_VERSION_CODE; - 
data->ocd_group = mds->mds_id + FILTER_GROUP_MDS0; + data->ocd_group = mdt_to_obd_objgrp(mds->mds_id); + /* send max bytes per rpc */ + data->ocd_brw_size = PTLRPC_MAX_BRW_PAGES << CFS_PAGE_SHIFT; + /* send the list of supported checksum types */ + data->ocd_cksum_types = OBD_CKSUM_ALL; /* NB: lov_connect() needs to fill in .ocd_index for each OST */ rc = obd_connect(NULL, &conn, mds->mds_osc_obd, &obd->obd_uuid, data, NULL); OBD_FREE(data, sizeof(*data)); @@ -494,24 +663,6 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) } mds->mds_osc_exp = class_conn2export(&conn); - rc = obd_register_observer(mds->mds_osc_obd, obd); - if (rc) { - CERROR("MDS cannot register as observer of LOV %s (%d)\n", - lov_name, rc); - GOTO(err_discon, rc); - } - - /* Deny new client connections until we are sure we have some OSTs */ - obd->obd_no_conn = 1; - - mutex_down(&obd->obd_dev_sem); - rc = mds_lov_read_objids(obd); - if (rc) { - CERROR("cannot read %s: rc = %d\n", "lov_objids", rc); - GOTO(err_reg, rc); - } - mutex_up(&obd->obd_dev_sem); - /* I want to see a callback happen when the OBD moves to a * "For General Use" state, and that's when we'll call * set_nextid(). The class driver can help us here, because @@ -523,11 +674,7 @@ int mds_lov_connect(struct obd_device *obd, char * lov_name) */ RETURN(rc); -err_reg: - mutex_up(&obd->obd_dev_sem); - obd_register_observer(mds->mds_osc_obd, NULL); -err_discon: - obd_disconnect(mds->mds_osc_exp); +err_exit: mds->mds_osc_exp = NULL; mds->mds_osc_obd = ERR_PTR(rc); RETURN(rc); @@ -554,18 +701,6 @@ int mds_lov_disconnect(struct obd_device *obd) RETURN(rc); } -/* Collect the preconditions we need to allow client connects */ -static void mds_allow_cli(struct obd_device *obd, unsigned int flag) -{ - if (flag & CONFIG_LOG) - obd->u.mds.mds_fl_cfglog = 1; - if (flag & CONFIG_SYNC) - obd->u.mds.mds_fl_synced = 1; - if (obd->u.mds.mds_fl_cfglog /* bz11778: && obd->u.mds.mds_fl_synced */) - /* Open for clients */ - obd->obd_no_conn = 0; -} - struct mds_lov_sync_info { struct obd_device *mlsi_obd; /* the lov device to sync */ struct obd_device *mlsi_watched; /* target osc */ @@ -633,7 +768,7 @@ static int __mds_lov_synchronize(void *data) CERROR("%s failed at update_mds: %d\n", obd_uuid2str(uuid), rc); GOTO(out, rc); } - mgi.group = FILTER_GROUP_MDS0 + mds->mds_id; + mgi.group = mdt_to_obd_objgrp(mds->mds_id); mgi.uuid = uuid; rc = obd_set_info_async(mds->mds_osc_exp, sizeof(KEY_MDS_CONN), @@ -646,11 +781,11 @@ static int __mds_lov_synchronize(void *data) GOTO(out, rc); ctxt = llog_get_context(obd, LLOG_MDS_OST_ORIG_CTXT); - if (!ctxt) + if (!ctxt) GOTO(out, rc = -ENODEV); OBD_FAIL_TIMEOUT(OBD_FAIL_MDS_LLOG_SYNC_TIMEOUT, 60); - rc = llog_connect(ctxt, NULL, NULL, uuid); + rc = llog_connect(ctxt, NULL, NULL, uuid); llog_ctxt_put(ctxt); if (rc != 0) { CERROR("%s failed at llog_origin_connect: %d\n", @@ -673,7 +808,7 @@ static int __mds_lov_synchronize(void *data) * in mdd is removed, This hack should be removed. 
*/ LASSERT(obd->obd_upcall.onu_upcall != NULL); - rc = obd->obd_upcall.onu_upcall(NULL, NULL, 0, + rc = obd->obd_upcall.onu_upcall(obd, NULL, OBD_NOTIFY_ACTIVE, obd->obd_upcall.onu_owner); } EXIT; @@ -720,6 +855,7 @@ int mds_lov_start_synchronize(struct obd_device *obd, if (mlsi == NULL) RETURN(-ENOMEM); + LASSERT(data); mlsi->mlsi_obd = obd; mlsi->mlsi_watched = watched; mlsi->mlsi_index = *(__u32 *)data; @@ -761,19 +897,22 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, int rc = 0; ENTRY; + CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev); + switch (ev) { /* We only handle these: */ case OBD_NOTIFY_ACTIVE: + /* lov want one or more _active_ targets for work */ + /* activate event should be pass lov idx as argument */ case OBD_NOTIFY_SYNC: case OBD_NOTIFY_SYNC_NONBLOCK: + /* sync event should be pass lov idx as argument */ break; case OBD_NOTIFY_CONFIG: - mds_allow_cli(obd, (unsigned long)data); default: RETURN(0); } - CDEBUG(D_CONFIG, "notify %s ev=%d\n", watched->obd_name, ev); if (strcmp(watched->obd_type->typ_name, LUSTRE_OSC_NAME) != 0) { CERROR("unexpected notification of %s %s!\n", watched->obd_type->typ_name, watched->obd_name); @@ -787,19 +926,13 @@ int mds_notify(struct obd_device *obd, struct obd_device *watched, /* We still have to fix the lov descriptor for ost's added after the mdt in the config log. They didn't make it into mds_lov_connect. */ - mutex_down(&obd->obd_dev_sem); rc = mds_lov_update_desc(obd, *(__u32 *)data, &watched->u.cli.cl_target_uuid); - mutex_up(&obd->obd_dev_sem); - if (rc == 0) - mds_allow_cli(obd, CONFIG_SYNC); RETURN(rc); } rc = mds_lov_start_synchronize(obd, watched, data, !(ev == OBD_NOTIFY_SYNC)); - lquota_recovery(mds_quota_interface_ref, obd); - RETURN(rc); } diff --git a/lustre/mdt/mdt_capa.c b/lustre/mdt/mdt_capa.c index 9abc6df..1f03d81 100644 --- a/lustre/mdt/mdt_capa.c +++ b/lustre/mdt/mdt_capa.c @@ -60,10 +60,6 @@ static void make_capa_key(struct lustre_capa_key *key, ll_get_random_bytes(key->lk_key, sizeof(key->lk_key)); } -enum { - MDT_TXN_CAPA_KEYS_WRITE_CREDITS = 1 -}; - static inline void lck_cpu_to_le(struct lustre_capa_key *tgt, struct lustre_capa_key *src) { @@ -93,8 +89,8 @@ static int write_capa_keys(const struct lu_env *env, int i, rc; mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); - - th = mdt_trans_start(env, mdt, MDT_TXN_CAPA_KEYS_WRITE_CREDITS); + mdt_trans_credit_init(env, mdt, MDT_TXN_CAPA_KEYS_WRITE_OP); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -222,12 +218,13 @@ static int mdt_ck_thread_main(void *args) thread->t_flags = SVC_RUNNING; cfs_waitq_signal(&thread->t_ctl_waitq); - rc = lu_env_init(&env, NULL, LCT_MD_THREAD); + rc = lu_env_init(&env, LCT_MD_THREAD|LCT_REMEMBER|LCT_NOREF); if (rc) RETURN(rc); thread->t_env = &env; env.le_ctx.lc_thread = thread; + env.le_ctx.lc_cookie = 0x1; info = lu_context_key_get(&env.le_ctx, &mdt_thread_key); LASSERT(info != NULL); diff --git a/lustre/mdt/mdt_handler.c b/lustre/mdt/mdt_handler.c index 833de46..0aaea1b 100644 --- a/lustre/mdt/mdt_handler.c +++ b/lustre/mdt/mdt_handler.c @@ -67,8 +67,12 @@ #include #include #include "mdt_internal.h" +#ifdef HAVE_QUOTA_SUPPORT +# include +#endif #include #include +#include mdl_mode_t mdt_mdl_lock_modes[] = { [LCK_MINMODE] = MDL_MINMODE, @@ -309,7 +313,8 @@ static int mdt_getstatus(struct mdt_thread_info *info) repbody->valid |= OBD_MD_FLID; - if (mdt->mdt_opts.mo_mds_capa) { + if (mdt->mdt_opts.mo_mds_capa && + info->mti_exp->exp_connect_flags & OBD_CONNECT_MDS_CAPA) { 
struct mdt_object *root; struct lustre_capa *capa; @@ -320,7 +325,6 @@ static int mdt_getstatus(struct mdt_thread_info *info) capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1); LASSERT(capa); capa->lc_opc = CAPA_OPC_MDS_DEFAULT; - rc = mo_capa_get(info->mti_env, mdt_object_child(root), capa, 0); mdt_object_put(info->mti_env, root); @@ -432,7 +436,6 @@ static int mdt_getattr_internal(struct mdt_thread_info *info, struct md_object *next = mdt_object_child(o); const struct mdt_body *reqbody = info->mti_body; struct ptlrpc_request *req = mdt_info_req(info); - struct mdt_export_data *med = &req->rq_export->exp_mdt_data; struct md_attr *ma = &info->mti_attr; struct lu_attr *la = &ma->ma_attr; struct req_capsule *pill = info->mti_pill; @@ -537,7 +540,8 @@ static int mdt_getattr_internal(struct mdt_thread_info *info, repbody->max_cookiesize); } - if (med->med_rmtclient && (reqbody->valid & OBD_MD_FLRMTPERM)) { + if (exp_connect_rmtclient(info->mti_exp) && + reqbody->valid & OBD_MD_FLRMTPERM) { void *buf = req_capsule_server_get(pill, &RMF_ACL); /* mdt_getattr_lock only */ @@ -579,8 +583,9 @@ static int mdt_getattr_internal(struct mdt_thread_info *info, } #endif - if ((reqbody->valid & OBD_MD_FLMDSCAPA) && - info->mti_mdt->mdt_opts.mo_mds_capa) { + if (reqbody->valid & OBD_MD_FLMDSCAPA && + info->mti_mdt->mdt_opts.mo_mds_capa && + info->mti_exp->exp_connect_flags & OBD_CONNECT_MDS_CAPA) { struct lustre_capa *capa; capa = req_capsule_server_get(pill, &RMF_CAPA1); @@ -596,7 +601,6 @@ static int mdt_getattr_internal(struct mdt_thread_info *info, static int mdt_renew_capa(struct mdt_thread_info *info) { - struct mdt_device *mdt = info->mti_mdt; struct mdt_object *obj = info->mti_object; struct mdt_body *body; struct lustre_capa *capa, *c; @@ -607,7 +611,8 @@ static int mdt_renew_capa(struct mdt_thread_info *info) * return directly, client will find body->valid OBD_MD_FLOSSCAPA * flag not set. 
*/ - if (!obj || !mdt->mdt_opts.mo_mds_capa) + if (!obj || !info->mti_mdt->mdt_opts.mo_oss_capa || + !(info->mti_exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA)) RETURN(0); body = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); @@ -1116,12 +1121,14 @@ static int mdt_connect(struct mdt_thread_info *info) if (rc == 0) { LASSERT(req->rq_export != NULL); info->mti_mdt = mdt_dev(req->rq_export->exp_obd->obd_lu_dev); - rc = mdt_init_idmap(info); + rc = mdt_init_sec_level(info); + if (rc == 0) + rc = mdt_init_idmap(info); if (rc != 0) - /* if mdt_init_idmap failed, revocation for connect */ obd_disconnect(class_export_get(req->rq_export)); - } else + } else { rc = err_serious(rc); + } return rc; } @@ -1172,8 +1179,8 @@ static int mdt_sendpage(struct mdt_thread_info *info, if (timeout < 0) CERROR("Req deadline already passed %lu (now: %lu)\n", req->rq_deadline, cfs_time_current_sec()); - *lwi = LWI_TIMEOUT(max(timeout, 1) * HZ, NULL, NULL); - rc = l_wait_event(desc->bd_waitq, !ptlrpc_bulk_active(desc), lwi); + *lwi = LWI_TIMEOUT(cfs_time_seconds(max(timeout, 1)), NULL, NULL); + rc = l_wait_event(desc->bd_waitq, !ptlrpc_server_bulk_active(desc), lwi); LASSERT (rc == 0 || rc == -ETIMEDOUT); if (rc == 0) { @@ -1258,7 +1265,7 @@ static int mdt_write_dir_page(struct mdt_thread_info *info, struct page *page, memcpy(name, ent->lde_name, le16_to_cpu(ent->lde_namelen)); lname = mdt_name(info->mti_env, name, le16_to_cpu(ent->lde_namelen)); - ma->ma_attr_flags |= MDS_PERM_BYPASS; + ma->ma_attr_flags |= (MDS_PERM_BYPASS | MDS_QUOTA_IGNORE); rc = mdo_name_insert(info->mti_env, md_object_next(&object->mot_obj), lname, lf, ma); @@ -1474,6 +1481,16 @@ static int mdt_reint_internal(struct mdt_thread_info *info, GOTO(out_shrink, rc = err_serious(rc)); } + /* for replay no cookkie / lmm need, because client have this already */ + if (info->mti_spec.no_create == 1) { + if (req_capsule_has_field(pill, &RMF_MDT_MD, RCL_SERVER)) + req_capsule_set_size(pill, &RMF_MDT_MD, RCL_SERVER, 0); + + if (req_capsule_has_field(pill, &RMF_LOGCOOKIES, RCL_SERVER)) + req_capsule_set_size(pill, &RMF_LOGCOOKIES, RCL_SERVER, + 0); + } + rc = mdt_init_ucred_reint(info); if (rc) GOTO(out_shrink, rc); @@ -1629,15 +1646,134 @@ static int mdt_sync(struct mdt_thread_info *info) RETURN(rc); } +#ifdef HAVE_QUOTA_SUPPORT static int mdt_quotacheck_handle(struct mdt_thread_info *info) { - return err_serious(-EOPNOTSUPP); + struct obd_quotactl *oqctl; + struct req_capsule *pill = info->mti_pill; + struct obd_export *exp = info->mti_exp; + struct md_device *next = info->mti_mdt->mdt_child; + int rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_QUOTACHECK_NET)) + RETURN(0); + + oqctl = req_capsule_client_get(pill, &RMF_OBD_QUOTACTL); + if (oqctl == NULL) + RETURN(-EPROTO); + + /* remote client has no permission for quotacheck */ + if (unlikely(exp_connect_rmtclient(exp))) + RETURN(-EPERM); + + rc = req_capsule_server_pack(pill); + if (rc) + RETURN(rc); + + rc = next->md_ops->mdo_quota.mqo_check(info->mti_env, next, exp, + oqctl->qc_type); + RETURN(rc); } static int mdt_quotactl_handle(struct mdt_thread_info *info) { - return err_serious(-EOPNOTSUPP); + struct obd_quotactl *oqctl, *repoqc; + struct req_capsule *pill = info->mti_pill; + struct obd_export *exp = info->mti_exp; + struct md_device *next = info->mti_mdt->mdt_child; + const struct md_quota_operations *mqo = &next->md_ops->mdo_quota; + int id, rc; + ENTRY; + + if (OBD_FAIL_CHECK(OBD_FAIL_MDS_QUOTACTL_NET)) + RETURN(0); + + oqctl = req_capsule_client_get(pill, &RMF_OBD_QUOTACTL); + if 
(oqctl == NULL) + RETURN(-EPROTO); + + id = oqctl->qc_id; + if (exp_connect_rmtclient(exp)) { + struct ptlrpc_request *req = mdt_info_req(info); + struct mdt_export_data *med = mdt_req2med(req); + struct lustre_idmap_table *idmap = med->med_idmap; + + if (unlikely(oqctl->qc_cmd != Q_GETQUOTA && + oqctl->qc_cmd != Q_GETINFO)) + RETURN(-EPERM); + + + if (oqctl->qc_type == USRQUOTA) + id = lustre_idmap_lookup_uid(NULL, idmap, 0, + oqctl->qc_id); + else if (oqctl->qc_type == GRPQUOTA) + id = lustre_idmap_lookup_gid(NULL, idmap, 0, + oqctl->qc_id); + else + RETURN(-EINVAL); + + if (id == CFS_IDMAP_NOTFOUND) { + CDEBUG(D_QUOTA, "no mapping for id %u\n", + oqctl->qc_id); + RETURN(-EACCES); + } + } + + rc = req_capsule_server_pack(pill); + if (rc) + RETURN(rc); + + repoqc = req_capsule_server_get(pill, &RMF_OBD_QUOTACTL); + LASSERT(repoqc != NULL); + + switch (oqctl->qc_cmd) { + case Q_QUOTAON: + rc = mqo->mqo_on(info->mti_env, next, oqctl->qc_type); + break; + case Q_QUOTAOFF: + rc = mqo->mqo_off(info->mti_env, next, oqctl->qc_type); + break; + case Q_SETINFO: + rc = mqo->mqo_setinfo(info->mti_env, next, oqctl->qc_type, id, + &oqctl->qc_dqinfo); + break; + case Q_GETINFO: + rc = mqo->mqo_getinfo(info->mti_env, next, oqctl->qc_type, id, + &oqctl->qc_dqinfo); + break; + case Q_SETQUOTA: + rc = mqo->mqo_setquota(info->mti_env, next, oqctl->qc_type, id, + &oqctl->qc_dqblk); + break; + case Q_GETQUOTA: + rc = mqo->mqo_getquota(info->mti_env, next, oqctl->qc_type, id, + &oqctl->qc_dqblk); + break; + case Q_GETOINFO: + rc = mqo->mqo_getoinfo(info->mti_env, next, oqctl->qc_type, id, + &oqctl->qc_dqinfo); + break; + case Q_GETOQUOTA: + rc = mqo->mqo_getoquota(info->mti_env, next, oqctl->qc_type, id, + &oqctl->qc_dqblk); + break; + case LUSTRE_Q_INVALIDATE: + rc = mqo->mqo_invalidate(info->mti_env, next, oqctl->qc_type); + break; + case LUSTRE_Q_FINVALIDATE: + rc = mqo->mqo_finvalidate(info->mti_env, next, oqctl->qc_type); + break; + default: + CERROR("unsupported mdt_quotactl command: %d\n", + oqctl->qc_cmd); + RETURN(-EFAULT); + } + + *repoqc = *oqctl; + RETURN(rc); } +#endif /* * OBD PING and other handlers. */ @@ -1776,6 +1912,108 @@ struct mdt_object *mdt_object_find(const struct lu_env *env, RETURN(m); } +/** + * Asynchronous commit for mdt device. + * + * Pass an asynchronous commit call down the MDS stack. + * + * \param env environment + * \param mdt the mdt device + */ +static void mdt_device_commit_async(const struct lu_env *env, + struct mdt_device *mdt) +{ + struct dt_device *dt = mdt->mdt_bottom; + int rc; + + rc = dt->dd_ops->dt_commit_async(env, dt); + if (unlikely(rc != 0)) + CWARN("async commit start failed with rc = %d\n", rc); +} + +/** + * Mark the lock as "synchronous". + * + * Mark the lock to defer the transaction commit until unlock time. + * + * \param lock the lock to mark as "synchronous" + * + * \see mdt_is_lock_sync + * \see mdt_save_lock + */ +static inline void mdt_set_lock_sync(struct ldlm_lock *lock) +{ + lock->l_ast_data = (void*)1; +} + +/** + * Check whether the lock is "synchronous" or not. + * + * \param lock the lock to check + * \retval 1 the lock is "synchronous" + * \retval 0 the lock isn't "synchronous" + * + * \see mdt_set_lock_sync + * \see mdt_save_lock + */ +static inline int mdt_is_lock_sync(struct ldlm_lock *lock) +{ + return lock->l_ast_data != NULL; +}
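mdt_blocking_ast() just below only marks a lock "synchronous" when Commit-on-Share is enabled and a PW/EX lock held on behalf of one client blocks a lock from a different client, i.e. the client cookies differ. A standalone sketch of just that predicate; the types, mode bits and cookie values are stand-ins for the ldlm fields:

```c
#include <stdio.h>

/* Schematic of the Commit-on-Share test in mdt_blocking_ast(): a
 * conflict only needs synchronous treatment when an exclusive-ish lock
 * held for one client blocks a different client. */
#define LCK_EX 0x1
#define LCK_PW 0x2

struct lock { int mode; unsigned long long client_cookie; };

static int needs_cos(int cos_enabled, const struct lock *held,
                     const struct lock *blocker)
{
        return cos_enabled &&
               (held->mode & (LCK_PW | LCK_EX)) &&
               blocker != NULL &&
               held->client_cookie != blocker->client_cookie;
}

int main(void)
{
        struct lock a = { LCK_PW, 1 }, b = { LCK_PW, 2 };

        printf("cross-client PW conflict -> COS: %d\n", needs_cos(1, &a, &b));
        printf("same-client conflict    -> COS: %d\n", needs_cos(1, &a, &a));
        return 0;
}
```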
+ * + * \param lock the lock which blocks a request or a cancelling lock + * \param desc unused + * \param data unused + * \param flag indicates whether this is a cancelling or blocking callback + * \retval 0 + * \see ldlm_blocking_ast_nocheck */ +int mdt_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, + void *data, int flag) +{ + struct obd_device *obd = lock->l_resource->lr_namespace->ns_obd; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + int rc; + ENTRY; + + if (flag == LDLM_CB_CANCELING) + RETURN(0); + lock_res_and_lock(lock); + if (lock->l_blocking_ast != mdt_blocking_ast) { + unlock_res_and_lock(lock); + RETURN(0); + } + if (mdt_cos_is_enabled(mdt) && + lock->l_req_mode & (LCK_PW | LCK_EX) && + lock->l_blocking_lock != NULL && + lock->l_client_cookie != lock->l_blocking_lock->l_client_cookie) { + mdt_set_lock_sync(lock); + } + rc = ldlm_blocking_ast_nocheck(lock); + + /* There is no lock conflict if l_blocking_lock == NULL, + * it indicates a blocking ast sent from ldlm_lock_decref_internal + * when the last reference to a local lock was released */ + if (lock->l_req_mode == LCK_COS && lock->l_blocking_lock != NULL) { + struct lu_env env; + + rc = lu_env_init(&env, LCT_MD_THREAD); + if (unlikely(rc != 0)) + CWARN("lu_env initialization failed with rc = %d, " + "cannot start asynchronous commit\n", rc); + else + mdt_device_commit_async(&env, mdt); + lu_env_fini(&env); + } + RETURN(rc); +} + int mdt_object_lock(struct mdt_thread_info *info, struct mdt_object *o, struct mdt_lock_handle *lh, __u64 ibits, int locality) { @@ -1832,7 +2070,8 @@ int mdt_object_lock(struct mdt_thread_info *info, struct mdt_object *o, */ policy->l_inodebits.bits = MDS_INODELOCK_UPDATE; rc = mdt_fid_lock(ns, &lh->mlh_pdo_lh, lh->mlh_pdo_mode, - policy, res_id, LDLM_FL_ATOMIC_CB); + policy, res_id, LDLM_FL_ATOMIC_CB, + &info->mti_exp->exp_handle.h_cookie); if (unlikely(rc)) RETURN(rc); } @@ -1852,8 +2091,8 @@ int mdt_object_lock(struct mdt_thread_info *info, struct mdt_object *o, * fix it up and turn FL_LOCAL flag off. */ rc = mdt_fid_lock(ns, &lh->mlh_reg_lh, lh->mlh_reg_mode, policy, - res_id, LDLM_FL_LOCAL_ONLY | LDLM_FL_ATOMIC_CB); - + res_id, LDLM_FL_LOCAL_ONLY | LDLM_FL_ATOMIC_CB, + &info->mti_exp->exp_handle.h_cookie); if (rc) GOTO(out, rc); @@ -1865,36 +2104,79 @@ out: RETURN(rc); } -static inline -void mdt_save_lock(struct ptlrpc_request *req, struct lustre_handle *h, +/** + * Save a lock within request object. + * + * Keep the lock referenced until either client ACK or transaction + * commit happens, or release the lock immediately, depending on input + * parameters. If COS is ON, a write lock is converted to COS lock + * before saving. 
+ * + * \param info thread info object + * \param h lock handle + * \param mode lock mode + * \param decref force immediate lock releasing + */ +static +void mdt_save_lock(struct mdt_thread_info *info, struct lustre_handle *h, ldlm_mode_t mode, int decref) { ENTRY; if (lustre_handle_is_used(h)) { - if (decref) + if (decref || !info->mti_has_trans || + !(mode & (LCK_PW | LCK_EX))){ mdt_fid_unlock(h, mode); - else - ptlrpc_save_lock(req, h, mode); + } else { + struct mdt_device *mdt = info->mti_mdt; + struct ldlm_lock *lock = ldlm_handle2lock(h); + struct ptlrpc_request *req = mdt_info_req(info); + int no_ack = 0; + + LASSERTF(lock != NULL, "no lock for cookie "LPX64"\n", + h->cookie); + CDEBUG(D_HA, "request = %p reply state = %p" + " transno = "LPD64"\n", + req, req->rq_reply_state, req->rq_transno); + if (mdt_cos_is_enabled(mdt)) { + no_ack = 1; + ldlm_lock_downgrade(lock, LCK_COS); + mode = LCK_COS; + } + ptlrpc_save_lock(req, h, mode, no_ack); + if (mdt_is_lock_sync(lock)) { + CDEBUG(D_HA, "found sync-lock," + " async commit started\n"); + mdt_device_commit_async(info->mti_env, + mdt); + } + LDLM_LOCK_PUT(lock); + } h->cookie = 0ull; } EXIT; } -/* - * Just call ldlm_lock_decref() if decref, else we only call ptlrpc_save_lock() - * to save this lock in req. when transaction committed, req will be released, - * and lock will, too. +/** + * Unlock mdt object. + * + * Immediately release the regular lock and the PDO lock or save the + * lock in request and keep them referenced until client ACK or + * transaction commit. + * + * \param info thread info object + * \param o mdt object + * \param h mdt lock handle referencing regular and PDO locks + * \param decref force immediate lock releasing */ void mdt_object_unlock(struct mdt_thread_info *info, struct mdt_object *o, struct mdt_lock_handle *lh, int decref) { - struct ptlrpc_request *req = mdt_info_req(info); ENTRY; - mdt_save_lock(req, &lh->mlh_pdo_lh, lh->mlh_pdo_mode, decref); - mdt_save_lock(req, &lh->mlh_reg_lh, lh->mlh_reg_mode, decref); + mdt_save_lock(info, &lh->mlh_pdo_lh, lh->mlh_pdo_mode, decref); + mdt_save_lock(info, &lh->mlh_reg_lh, lh->mlh_reg_mode, decref); EXIT; } @@ -2231,6 +2513,15 @@ static void mdt_thread_info_init(struct ptlrpc_request *req, info->mti_env = req->rq_svc_thread->t_env; ci = md_capainfo(info->mti_env); memset(ci, 0, sizeof *ci); + if (req->rq_export) { + if (exp_connect_rmtclient(req->rq_export)) + ci->mc_auth = LC_ID_CONVERT; + else if (req->rq_export->exp_connect_flags & + OBD_CONNECT_MDS_CAPA) + ci->mc_auth = LC_ID_PLAIN; + else + ci->mc_auth = LC_ID_NONE; + } info->mti_fail_id = OBD_FAIL_MDS_ALL_REPLY_NET; info->mti_transno = lustre_msg_get_transno(req->rq_reqmsg); @@ -2739,11 +3030,12 @@ int mdt_intent_lock_replace(struct mdt_thread_info *info, new_lock->l_remote_handle = lock->l_remote_handle; new_lock->l_flags &= ~LDLM_FL_LOCAL; + unlock_res_and_lock(new_lock); + lustre_hash_add(new_lock->l_export->exp_lock_hash, &new_lock->l_remote_handle, &new_lock->l_exp_hash); - unlock_res_and_lock(new_lock); LDLM_LOCK_RELEASE(new_lock); lh->mlh_reg_lh.cookie = 0; @@ -3081,7 +3373,7 @@ static void mdt_seq_adjust(const struct lu_env *env, struct mdt_device *m, int lost) { struct md_site *ms = mdt_md_site(m); - struct lu_range out; + struct lu_seq_range out; ENTRY; LASSERT(ms && ms->ms_server_seq); @@ -3147,6 +3439,7 @@ static int mdt_seq_init(const struct lu_env *env, rc = seq_server_init(ms->ms_control_seq, m->mdt_bottom, uuid, LUSTRE_SEQ_CONTROLLER, + ms, env); if (rc) @@ -3188,6 +3481,7 @@ static int 
mdt_seq_init(const struct lu_env *env, rc = seq_server_init(ms->ms_server_seq, m->mdt_bottom, uuid, LUSTRE_SEQ_SERVER, + ms, env); if (rc) GOTO(out_seq_fini, rc = -ENOMEM); @@ -3342,7 +3636,8 @@ static int mdt_fld_init(const struct lu_env *env, RETURN(rc = -ENOMEM); rc = fld_server_init(ms->ms_server_fld, - m->mdt_bottom, uuid, env); + m->mdt_bottom, uuid, + env, ms->ms_node_id); if (rc) { OBD_FREE_PTR(ms->ms_server_fld); ms->ms_server_fld = NULL; @@ -3388,7 +3683,7 @@ static void mdt_stop_ptlrpc_service(struct mdt_device *m) ptlrpc_unregister_service(m->mdt_fld_service); m->mdt_fld_service = NULL; } - ENTRY; + EXIT; } static int mdt_start_ptlrpc_service(struct mdt_device *m) @@ -3652,7 +3947,7 @@ err_mdt_svc: static void mdt_stack_fini(const struct lu_env *env, struct mdt_device *m, struct lu_device *top) { - struct obd_device *obd = m->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(m); struct lustre_cfg_bufs *bufs; struct lustre_cfg *lcfg; struct mdt_thread_info *info; @@ -3704,20 +3999,12 @@ static struct lu_device *mdt_layer_setup(struct lu_env *env, GOTO(out, rc = -ENODEV); } - rc = lu_context_refill(&env->le_ctx); + rc = lu_env_refill((struct lu_env *)env); if (rc != 0) { - CERROR("Failure to refill context: '%d'\n", rc); + CERROR("Failure to refill session: '%d'\n", rc); GOTO(out_type, rc); } - if (env->le_ses != NULL) { - rc = lu_context_refill(env->le_ses); - if (rc != 0) { - CERROR("Failure to refill session: '%d'\n", rc); - GOTO(out_type, rc); - } - } - ldt = type->typ_lu; if (ldt == NULL) { CERROR("type: '%s'\n", typename); @@ -3755,11 +4042,14 @@ out: } static int mdt_stack_init(struct lu_env *env, - struct mdt_device *m, struct lustre_cfg *cfg) + struct mdt_device *m, + struct lustre_cfg *cfg, + struct lustre_mount_info *lmi) { struct lu_device *d = &m->mdt_md_dev.md_lu_dev; struct lu_device *tmp; struct md_device *md; + struct lu_device *child_lu_dev; int rc; ENTRY; @@ -3794,7 +4084,15 @@ static int mdt_stack_init(struct lu_env *env, /* process setup config */ tmp = &m->mdt_md_dev.md_lu_dev; rc = tmp->ld_ops->ldo_process_config(env, tmp, cfg); - GOTO(out, rc); + if (rc) + GOTO(out, rc); + + /* initialize local objects */ + child_lu_dev = &m->mdt_child->md_lu_dev; + + rc = child_lu_dev->ld_ops->ldo_prepare(env, + &m->mdt_md_dev.md_lu_dev, + child_lu_dev); out: /* fini from last known good lu_device */ if (rc) @@ -3803,21 +4101,94 @@ out: return rc; } +/** + * setup CONFIG_ORIG context, used to access local config log. + * this may need to be rewritten as part of the llog rewrite for lu-api. 
+ */ +static int mdt_obd_llog_setup(struct obd_device *obd, + struct lustre_sb_info *lsi) +{ + int rc; + + LASSERT(obd->obd_fsops == NULL); + + obd->obd_fsops = fsfilt_get_ops(MT_STR(lsi->lsi_ldd)); + if (IS_ERR(obd->obd_fsops)) + return (int) PTR_ERR(obd->obd_fsops); + + rc = fsfilt_setup(obd, lsi->lsi_srv_mnt->mnt_sb); + if (rc) { + fsfilt_put_ops(obd->obd_fsops); + return rc; + } + + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.pwdmnt = lsi->lsi_srv_mnt; + obd->obd_lvfs_ctxt.pwd = lsi->lsi_srv_mnt->mnt_root; + obd->obd_lvfs_ctxt.fs = get_ds(); + + rc = llog_setup(obd, &obd->obd_olg, LLOG_CONFIG_ORIG_CTXT, obd, + 0, NULL, &llog_lvfs_ops); + if (rc) { + CERROR("llog setup failed: %d\n", rc); + fsfilt_put_ops(obd->obd_fsops); + } + + return rc; +} + +static void mdt_obd_llog_cleanup(struct obd_device *obd) +{ + struct llog_ctxt *ctxt; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt) + llog_cleanup(ctxt); + + if (obd->obd_fsops) + fsfilt_put_ops(obd->obd_fsops); +} + static void mdt_fini(const struct lu_env *env, struct mdt_device *m) { struct md_device *next = m->mdt_child; struct lu_device *d = &m->mdt_md_dev.md_lu_dev; struct lu_site *ls = d->ld_site; - struct obd_device *obd = m->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(m); + int waited = 0; ENTRY; + /* At this point, obd exports might still be on the "obd_zombie_exports" + * list, and obd_zombie_impexp_thread() is trying to destroy them. + * We wait a little bit until all exports (except the self-export) + * have been destroyed, because the whole mdt stack might be accessed + * in mdt_destroy_export(). This will not be a long time, maybe one or + * two seconds are enough. This is not a problem while umounting. + * + * The three references that should be remaining are the + * obd_self_export and the attach and setup references. + */ + while (atomic_read(&obd->obd_refcount) > 3) { + cfs_schedule_timeout(CFS_TASK_UNINT, cfs_time_seconds(1)); + ++waited; + if (waited > 5 && IS_PO2(waited)) + LCONSOLE_WARN("Waiting for obd_zombie_impexp_thread " + "more than %d seconds to destroy all " + "the exports. The current obd refcount =" + " %d. 
Is it stuck there?\n", + waited, atomic_read(&obd->obd_refcount)); + } + + ping_evictor_stop(); target_recovery_fini(obd); mdt_stop_ptlrpc_service(m); - + mdt_obd_llog_cleanup(obd); + obd_zombie_barrier(); +#ifdef HAVE_QUOTA_SUPPORT + next->md_ops->mdo_quota.mqo_cleanup(env, next); +#endif mdt_fs_cleanup(env, m); - upcall_cache_cleanup(m->mdt_identity_cache); m->mdt_identity_cache = NULL; @@ -3826,6 +4197,13 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m) d->ld_obd->obd_namespace = m->mdt_namespace = NULL; } + cfs_free_nidlist(&m->mdt_nosquash_nids); + if (m->mdt_nosquash_str) { + OBD_FREE(m->mdt_nosquash_str, m->mdt_nosquash_strlen); + m->mdt_nosquash_str = NULL; + m->mdt_nosquash_strlen = 0; + } + mdt_seq_fini(env, m); mdt_seq_fini_cli(m); mdt_fld_fini(env, m); @@ -3854,15 +4232,40 @@ static void mdt_fini(const struct lu_env *env, struct mdt_device *m) d->ld_site = NULL; } LASSERT(atomic_read(&d->ld_ref) == 0); - md_device_fini(&m->mdt_md_dev); EXIT; } +static int mdt_adapt_sptlrpc_conf(struct obd_device *obd, int initial) +{ + struct mdt_device *m = mdt_dev(obd->obd_lu_dev); + struct sptlrpc_rule_set tmp_rset; + int rc; + + sptlrpc_rule_set_init(&tmp_rset); + rc = sptlrpc_conf_target_get_rules(obd, &tmp_rset, initial); + if (rc) { + CERROR("mdt %s: failed to get sptlrpc rules: %d\n", + obd->obd_name, rc); + return rc; + } + + sptlrpc_target_update_exp_flavor(obd, &tmp_rset); + + write_lock(&m->mdt_sptlrpc_lock); + sptlrpc_rule_set_free(&m->mdt_sptlrpc_rset); + m->mdt_sptlrpc_rset = tmp_rset; + write_unlock(&m->mdt_sptlrpc_lock); + + return 0; +} + static void fsoptions_to_mdt_flags(struct mdt_device *m, char *options) { char *p = options; + m->mdt_opts.mo_mds_capa = 1; + m->mdt_opts.mo_oss_capa = 1; #ifdef CONFIG_FS_POSIX_ACL /* ACLs should be enabled by default (b=13829) */ m->mdt_opts.mo_acl = 1; @@ -3910,14 +4313,32 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, struct obd_device *obd; const char *dev = lustre_cfg_string(cfg, 0); const char *num = lustre_cfg_string(cfg, 2); - struct lustre_mount_info *lmi; + struct lustre_mount_info *lmi = NULL; struct lustre_sb_info *lsi; + struct lustre_disk_data *ldd; struct lu_site *s; struct md_site *mite; const char *identity_upcall = "NONE"; +#ifdef HAVE_QUOTA_SUPPORT + struct md_device *next; +#endif int rc; + int node_id; ENTRY; + md_device_init(&m->mdt_md_dev, ldt); + /* + * Environment (env) might be missing mdt_thread_key values at that + * point, if device is allocated when mdt_thread_key is in QUIESCENT + * mode. + * + * Usually device allocation path doesn't use module key values, but + * mdt has to do a lot of work here, so allocate the key value. 
+ */ + rc = lu_env_refill((struct lu_env *)env); + if (rc != 0) + RETURN(rc); + info = lu_context_key_get(&env->le_ctx, &mdt_thread_key); LASSERT(info != NULL); @@ -3931,6 +4352,7 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, m->mdt_opts.mo_user_xattr = 0; m->mdt_opts.mo_acl = 0; + m->mdt_opts.mo_cos = MDT_COS_DEFAULT; lmi = server_get_mount_2(dev); if (lmi == NULL) { CERROR("Cannot get mount info for %s!\n", dev); @@ -3938,7 +4360,14 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, } else { lsi = s2lsi(lmi->lmi_sb); fsoptions_to_mdt_flags(m, lsi->lsi_lmd->lmd_opts); - server_put_mount_2(dev, lmi->lmi_mnt); + /* CMD is supported only in IAM mode */ + ldd = lsi->lsi_ldd; + LASSERT(num); + node_id = simple_strtol(num, NULL, 10); + if (!(ldd->ldd_flags & LDD_F_IAM_DIR) && node_id) { + CERROR("CMD Operation not allowed in IOP mode\n"); + GOTO(err_lmi, rc = -EINVAL); + } } rwlock_init(&m->mdt_sptlrpc_lock); @@ -3949,14 +4378,19 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, m->mdt_capa_timeout = CAPA_TIMEOUT; m->mdt_capa_alg = CAPA_HMAC_ALG_SHA1; m->mdt_ck_timeout = CAPA_KEY_TIMEOUT; + m->mdt_squash_uid = 0; + m->mdt_squash_gid = 0; + CFS_INIT_LIST_HEAD(&m->mdt_nosquash_nids); + m->mdt_nosquash_str = NULL; + m->mdt_nosquash_strlen = 0; + init_rwsem(&m->mdt_squash_sem); spin_lock_init(&m->mdt_client_bitmap_lock); OBD_ALLOC_PTR(mite); if (mite == NULL) - RETURN(-ENOMEM); + GOTO(err_lmi, rc = -ENOMEM); - md_device_init(&m->mdt_md_dev, ldt); s = &mite->ms_lu; m->mdt_md_dev.md_lu_dev.ld_ops = &mdt_lu_ops; @@ -3992,12 +4426,11 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, lprocfs_nid_stats_clear_write, obd, NULL); /* set server index */ - LASSERT(num); - lu_site2md(s)->ms_node_id = simple_strtol(num, NULL, 10); + lu_site2md(s)->ms_node_id = node_id; /* failover is the default * FIXME: we do not failout mds0/mgs, which may cause some problems. 
- * assumed whose ls_node_id == 0 XXX + * assumed whose ms_node_id == 0 XXX * */ obd->obd_replayable = 1; /* No connection accepted until configurations will finish */ @@ -4012,7 +4445,7 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, } /* init the stack */ - rc = mdt_stack_init((struct lu_env *)env, m, cfg); + rc = mdt_stack_init((struct lu_env *)env, m, cfg, lmi); if (rc) { CERROR("Can't init device stack, rc %d\n", rc); GOTO(err_fini_proc, rc); @@ -4057,15 +4490,31 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, if (rc) GOTO(err_free_ns, rc); - rc = mdt_fs_setup(env, m, obd); + rc = mdt_fs_setup(env, m, obd, lsi); if (rc) GOTO(err_capa, rc); + rc = mdt_obd_llog_setup(obd, lsi); + if (rc) + GOTO(err_fs_cleanup, rc); + + mdt_adapt_sptlrpc_conf(obd, 1); + +#ifdef HAVE_QUOTA_SUPPORT + next = m->mdt_child; + rc = next->md_ops->mdo_quota.mqo_setup(env, next, lmi->lmi_mnt); + if (rc) + GOTO(err_llog_cleanup, rc); +#endif + + server_put_mount_2(dev, lmi->lmi_mnt); + lmi = NULL; + target_recovery_init(obd, mdt_recovery_handle); rc = mdt_start_ptlrpc_service(m); if (rc) - GOTO(err_fs_cleanup, rc); + GOTO(err_recovery, rc); ping_evictor_start(); @@ -4089,8 +4538,14 @@ static int mdt_init0(const struct lu_env *env, struct mdt_device *m, err_stop_service: ping_evictor_stop(); mdt_stop_ptlrpc_service(m); -err_fs_cleanup: +err_recovery: target_recovery_fini(obd); +#ifdef HAVE_QUOTA_SUPPORT + next->md_ops->mdo_quota.mqo_cleanup(env, next); +err_llog_cleanup: +#endif + mdt_obd_llog_cleanup(obd); +err_fs_cleanup: mdt_fs_cleanup(env, m); err_capa: cfs_timer_disarm(&m->mdt_ck_timer); @@ -4116,8 +4571,9 @@ err_fini_site: lu_site_fini(s); err_free_site: OBD_FREE_PTR(mite); - - md_device_fini(&m->mdt_md_dev); +err_lmi: + if (lmi) + server_put_mount_2(dev, lmi->lmi_mnt); return (rc); } @@ -4132,42 +4588,27 @@ static int mdt_process_config(const struct lu_env *env, ENTRY; switch (cfg->lcfg_command) { - case LCFG_SPTLRPC_CONF: { - struct sptlrpc_conf_log *log; - struct sptlrpc_rule_set tmp_rset; - - log = sptlrpc_conf_log_extract(cfg); - if (IS_ERR(log)) { - rc = PTR_ERR(log); - break; - } - - sptlrpc_rule_set_init(&tmp_rset); - - rc = sptlrpc_rule_set_from_log(&tmp_rset, log); - if (rc) { - CERROR("mdt %p: failed get sptlrpc rules: %d\n", m, rc); - break; - } - - write_lock(&m->mdt_sptlrpc_lock); - sptlrpc_rule_set_free(&m->mdt_sptlrpc_rset); - m->mdt_sptlrpc_rset = tmp_rset; - write_unlock(&m->mdt_sptlrpc_lock); - - sptlrpc_target_update_exp_flavor( - md2lu_dev(&m->mdt_md_dev)->ld_obd, &tmp_rset); - - break; - } case LCFG_PARAM: { struct lprocfs_static_vars lvars; struct obd_device *obd = d->ld_obd; + /* + * For interoperability between 1.8 and 2.0, + * skip old "mdt.group_upcall" param. + */ + { + char *param = lustre_cfg_string(cfg, 1); + if (param && !strncmp("mdt.group_upcall", param, 16)) { + CWARN("For 1.8 interoperability, skip this" + " mdt.group_upcall. 
It is obsolete\n"); + break; + } + } + lprocfs_mdt_init_vars(&lvars); rc = class_process_proc_param(PARAM_MDT, lvars.obd_vars, cfg, obd); - if (rc == -ENOSYS) + if (rc > 0 || rc == -ENOSYS) /* we don't understand; pass it on */ rc = next->ld_ops->ldo_process_config(env, next, cfg); break; @@ -4255,7 +4696,7 @@ static void mdt_object_free(const struct lu_env *env, struct lu_object *o) static const struct lu_device_operations mdt_lu_ops = { .ldo_object_alloc = mdt_object_alloc, - .ldo_process_config = mdt_process_config + .ldo_process_config = mdt_process_config, }; static const struct lu_object_operations mdt_obj_ops = { @@ -4263,13 +4704,30 @@ static const struct lu_object_operations mdt_obj_ops = { .loo_object_free = mdt_object_free }; +static int mdt_obd_set_info_async(struct obd_export *exp, + __u32 keylen, void *key, + __u32 vallen, void *val, + struct ptlrpc_request_set *set) +{ + struct obd_device *obd = exp->exp_obd; + int rc; + ENTRY; + + LASSERT(obd); + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + rc = mdt_adapt_sptlrpc_conf(obd, 0); + RETURN(rc); + } + + RETURN(0); +} + /* mds_connect_internal */ static int mdt_connect_internal(struct obd_export *exp, struct mdt_device *mdt, struct obd_connect_data *data) { - __u64 flags; - if (data != NULL) { data->ocd_connect_flags &= MDT_CONNECT_SUPPORTED; data->ocd_ibits_known &= MDS_INODELOCK_FULL; @@ -4287,12 +4745,6 @@ static int mdt_connect_internal(struct obd_export *exp, if (!mdt->mdt_opts.mo_user_xattr) data->ocd_connect_flags &= ~OBD_CONNECT_XATTR; - if (!mdt->mdt_opts.mo_mds_capa) - data->ocd_connect_flags &= ~OBD_CONNECT_MDS_CAPA; - - if (!mdt->mdt_opts.mo_oss_capa) - data->ocd_connect_flags &= ~OBD_CONNECT_OSS_CAPA; - spin_lock(&exp->exp_lock); exp->exp_connect_flags = data->ocd_connect_flags; spin_unlock(&exp->exp_lock); @@ -4309,35 +4761,57 @@ static int mdt_connect_internal(struct obd_export *exp, } #endif - flags = OBD_CONNECT_LCL_CLIENT | OBD_CONNECT_RMT_CLIENT; - if ((exp->exp_connect_flags & flags) == flags) { - CWARN("%s: both local and remote client flags are set\n", + if ((exp->exp_connect_flags & OBD_CONNECT_FID) == 0) { + CWARN("%s: MDS requires FID support, but client not\n", mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name); return -EBADE; } - if (mdt->mdt_opts.mo_mds_capa && - ((exp->exp_connect_flags & OBD_CONNECT_MDS_CAPA) == 0)) { - CWARN("%s: MDS requires capability support, but client not\n", - mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name); - return -EBADE; - } + return 0; +} - if (mdt->mdt_opts.mo_oss_capa && - ((exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA) == 0)) { - CWARN("%s: MDS requires OSS capability support, " - "but client not\n", - mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name); - return -EBADE; - } +static int mdt_connect_check_sptlrpc(struct mdt_device *mdt, + struct obd_export *exp, + struct ptlrpc_request *req) +{ + struct sptlrpc_flavor flvr; + int rc = 0; - if ((exp->exp_connect_flags & OBD_CONNECT_FID) == 0) { - CWARN("%s: MDS requires FID support, but client not\n", - mdt->mdt_md_dev.md_lu_dev.ld_obd->obd_name); - return -EBADE; + if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + read_lock(&mdt->mdt_sptlrpc_lock); + sptlrpc_target_choose_flavor(&mdt->mdt_sptlrpc_rset, + req->rq_sp_from, + req->rq_peer.nid, + &flvr); + read_unlock(&mdt->mdt_sptlrpc_lock); + + spin_lock(&exp->exp_lock); + + exp->exp_sp_peer = req->rq_sp_from; + exp->exp_flvr = flvr; + + if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && + exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CERROR("unauthorized rpc flavor %x from %s, " + "expect 
%x\n", req->rq_flvr.sf_rpc, + libcfs_nid2str(req->rq_peer.nid), + exp->exp_flvr.sf_rpc); + rc = -EACCES; + } + + spin_unlock(&exp->exp_lock); + } else { + if (exp->exp_sp_peer != req->rq_sp_from) { + CERROR("RPC source %s doesn't match %s\n", + sptlrpc_part2name(req->rq_sp_from), + sptlrpc_part2name(exp->exp_sp_peer)); + rc = -EACCES; + } else { + rc = sptlrpc_target_export_check(exp, req); + } } - return 0; + return rc; } /* mds_connect copy */ @@ -4370,25 +4844,9 @@ static int mdt_obd_connect(const struct lu_env *env, exp = class_conn2export(conn); LASSERT(exp != NULL); - CDEBUG(D_SEC, "from %s\n", sptlrpc_part2name(req->rq_sp_from)); - - spin_lock(&exp->exp_lock); - exp->exp_sp_peer = req->rq_sp_from; - - read_lock(&mdt->mdt_sptlrpc_lock); - sptlrpc_rule_set_choose(&mdt->mdt_sptlrpc_rset, exp->exp_sp_peer, - req->rq_peer.nid, &exp->exp_flvr); - read_unlock(&mdt->mdt_sptlrpc_lock); - - if (exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { - CERROR("invalid rpc flavor %x, expect %x, from %s\n", - req->rq_flvr.sf_rpc, exp->exp_flvr.sf_rpc, - libcfs_nid2str(req->rq_peer.nid)); - exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; - spin_unlock(&exp->exp_lock); - RETURN(-EACCES); - } - spin_unlock(&exp->exp_lock); + rc = mdt_connect_check_sptlrpc(mdt, exp, req); + if (rc) + GOTO(out, rc); rc = mdt_connect_internal(exp, mdt, data); if (rc == 0) { @@ -4412,6 +4870,7 @@ static int mdt_obd_connect(const struct lu_env *env, rc = -ENOMEM; } +out: if (rc != 0) class_disconnect(exp); else @@ -4439,28 +4898,9 @@ static int mdt_obd_reconnect(const struct lu_env *env, req = info->mti_pill->rc_req; mdt = mdt_dev(obd->obd_lu_dev); - CDEBUG(D_SEC, "from %s\n", sptlrpc_part2name(req->rq_sp_from)); - - spin_lock(&exp->exp_lock); - if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { - exp->exp_sp_peer = req->rq_sp_from; - - read_lock(&mdt->mdt_sptlrpc_lock); - sptlrpc_rule_set_choose(&mdt->mdt_sptlrpc_rset, - exp->exp_sp_peer, - req->rq_peer.nid, &exp->exp_flvr); - read_unlock(&mdt->mdt_sptlrpc_lock); - - if (exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { - CERROR("invalid rpc flavor %x, expect %x, from %s\n", - req->rq_flvr.sf_rpc, exp->exp_flvr.sf_rpc, - libcfs_nid2str(req->rq_peer.nid)); - exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; - spin_unlock(&exp->exp_lock); - RETURN(-EACCES); - } - } - spin_unlock(&exp->exp_lock); + rc = mdt_connect_check_sptlrpc(mdt, exp, req); + if (rc) + RETURN(rc); rc = mdt_connect_internal(exp, mdt_dev(obd->obd_lu_dev), data); if (rc == 0) @@ -4541,7 +4981,7 @@ static int mdt_destroy_export(struct obd_export *export) ENTRY; med = &export->exp_mdt_data; - if (med->med_rmtclient) + if (exp_connect_rmtclient(export)) mdt_cleanup_idmap(med); target_destroy_export(export); @@ -4553,7 +4993,7 @@ static int mdt_destroy_export(struct obd_export *export) mdt = mdt_dev(obd->obd_lu_dev); LASSERT(mdt != NULL); - rc = lu_env_init(&env, NULL, LCT_MD_THREAD); + rc = lu_env_init(&env, LCT_MD_THREAD); if (rc) RETURN(rc); @@ -4588,9 +5028,10 @@ static int mdt_destroy_export(struct obd_export *export) spin_unlock(&med->med_open_lock); list_for_each_entry_safe(mfd, n, &closing_list, mfd_list) { + list_del_init(&mfd->mfd_list); mdt_mfd_close(info, mfd); /* TODO: if we close the unlinked file, - * we need to remove it's objects from OST */ + * we need to remove its objects from OST */ memset(&ma->ma_attr, 0, sizeof(ma->ma_attr)); spin_lock(&med->med_open_lock); ma->ma_lmm_size = lmm_size; @@ -4622,10 +5063,12 @@ static void mdt_allow_cli(struct mdt_device *m, unsigned int flag) { if (flag & CONFIG_LOG) 
m->mdt_fl_cfglog = 1; + + /* also notify active event */ if (flag & CONFIG_SYNC) m->mdt_fl_synced = 1; - if (m->mdt_fl_cfglog /* bz11778: && m->mdt_fl_synced */) + if (m->mdt_fl_cfglog && m->mdt_fl_synced) /* Open for clients */ m->mdt_md_dev.md_lu_dev.ld_obd->obd_no_conn = 0; } @@ -4647,6 +5090,10 @@ static int mdt_upcall(const struct lu_env *env, struct md_device *md, CDEBUG(D_INFO, "get max mdsize %d max cookiesize %d\n", m->mdt_max_mdsize, m->mdt_max_cookiesize); mdt_allow_cli(m, CONFIG_SYNC); +#ifdef HAVE_QUOTA_SUPPORT + if (md->md_lu_dev.ld_obd->obd_recovering == 0) + next->md_ops->mdo_quota.mqo_recovery(env, next); +#endif break; case MD_NO_TRANS: mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); @@ -4669,11 +5116,21 @@ static int mdt_obd_notify(struct obd_device *host, struct obd_device *watched, enum obd_notify_event ev, void *data) { + struct mdt_device *mdt = mdt_dev(host->obd_lu_dev); +#ifdef HAVE_QUOTA_SUPPORT + struct md_device *next = mdt->mdt_child; +#endif ENTRY; switch (ev) { case OBD_NOTIFY_CONFIG: - mdt_allow_cli(mdt_dev(host->obd_lu_dev), (unsigned long)data); + mdt_allow_cli(mdt, (unsigned long)data); + +#ifdef HAVE_QUOTA_SUPPORT + /* quota_type has been processed, we can now handle + * incoming quota requests */ + next->md_ops->mdo_quota.mqo_notify(NULL, next); +#endif break; default: CDEBUG(D_INFO, "Unhandled notification %#x\n", ev); @@ -4692,7 +5149,7 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len, ENTRY; CDEBUG(D_IOCTL, "handling ioctl cmd %#x\n", cmd); - rc = lu_env_init(&env, NULL, LCT_MD_THREAD); + rc = lu_env_init(&env, LCT_MD_THREAD); if (rc) RETURN(rc); @@ -4701,7 +5158,6 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len, rc = mdt_device_sync(&env, mdt); break; case OBD_IOC_SET_READONLY: - rc = dt->dd_ops->dt_sync(&env, dt); dt->dd_ops->dt_ro(&env, dt); break; case OBD_IOC_ABORT_RECOVERY: @@ -4722,7 +5178,10 @@ static int mdt_iocontrol(unsigned int cmd, struct obd_export *exp, int len, int mdt_postrecov(const struct lu_env *env, struct mdt_device *mdt) { struct lu_device *ld = md2lu_dev(mdt->mdt_child); - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); +#ifdef HAVE_QUOTA_SUPPORT + struct md_device *next = mdt->mdt_child; +#endif int rc, lost; ENTRY; /* if some clients didn't participate in recovery then we can possibly @@ -4731,6 +5190,9 @@ int mdt_postrecov(const struct lu_env *env, struct mdt_device *mdt) mdt_seq_adjust(env, mdt, lost); rc = ld->ld_ops->ldo_recovery_complete(env, ld); +#ifdef HAVE_QUOTA_SUPPORT + next->md_ops->mdo_quota.mqo_recovery(env, next); +#endif RETURN(rc); } @@ -4739,7 +5201,7 @@ int mdt_obd_postrecov(struct obd_device *obd) struct lu_env env; int rc; - rc = lu_env_init(&env, NULL, LCT_MD_THREAD); + rc = lu_env_init(&env, LCT_MD_THREAD); if (rc) RETURN(rc); rc = mdt_postrecov(&env, mdt_dev(obd->obd_lu_dev)); @@ -4749,6 +5211,7 @@ int mdt_obd_postrecov(struct obd_device *obd) static struct obd_ops mdt_obd_device_ops = { .o_owner = THIS_MODULE, + .o_set_info_async = mdt_obd_set_info_async, .o_connect = mdt_obd_connect, .o_reconnect = mdt_obd_reconnect, .o_disconnect = mdt_obd_disconnect, @@ -4775,6 +5238,7 @@ static struct lu_device *mdt_device_free(const struct lu_env *env, struct mdt_device *m = mdt_dev(d); ENTRY; + md_device_fini(&m->mdt_md_dev); OBD_FREE_PTR(m); RETURN(NULL); } @@ -4793,7 +5257,7 @@ static struct lu_device *mdt_device_alloc(const struct lu_env *env, l = &m->mdt_md_dev.md_lu_dev; rc = 
mdt_init0(env, m, t, cfg); if (rc != 0) { - OBD_FREE_PTR(m); + mdt_device_free(env, l); l = ERR_PTR(rc); return l; } @@ -4823,6 +5287,42 @@ struct md_ucred *mdt_ucred(const struct mdt_thread_info *info) return md_ucred(info->mti_env); } +/** + * Enable/disable COS. + * + * Set/Clear the COS flag in mdt options. + * + * \param mdt mdt device + * \param val 0 disables COS, other values enable COS + */ +void mdt_enable_cos(struct mdt_device *mdt, int val) +{ + struct lu_env env; + int rc; + + mdt->mdt_opts.mo_cos = !!val; + rc = lu_env_init(&env, LCT_MD_THREAD); + if (unlikely(rc != 0)) { + CWARN("lu_env initialization failed with rc = %d, " + "cannot sync\n", rc); + return; + } + mdt_device_sync(&env, mdt); + lu_env_fini(&env); +} + +/** + * Check COS status. + * + * Return the COS flag status. + * + * \param mdt mdt device + */ +int mdt_cos_is_enabled(struct mdt_device *mdt) +{ + return mdt->mdt_opts.mo_cos != 0; +} + /* type constructor/destructor: mdt_type_init, mdt_type_fini */ LU_TYPE_INIT_FINI(mdt, &mdt_thread_key, &mdt_txn_key); @@ -4845,11 +5345,19 @@ static struct lu_device_type mdt_device_type = { .ldt_ctx_tags = LCT_MD_THREAD }; +static struct lu_local_obj_desc mdt_last_recv = { + .llod_name = LAST_RCVD, + .llod_oid = MDT_LAST_RECV_OID, + .llod_is_index = 0, +}; + static int __init mdt_mod_init(void) { struct lprocfs_static_vars lvars; int rc; + llo_local_obj_register(&mdt_last_recv); + mdt_num_threads = MDT_NUM_THREADS; lprocfs_mdt_init_vars(&lvars); rc = class_register_type(&mdt_obd_device_ops, NULL, @@ -4861,6 +5369,7 @@ static int __init mdt_mod_init(void) static void __exit mdt_mod_exit(void) { + llo_local_obj_unregister(&mdt_last_recv); class_unregister_type(LUSTRE_MDT_NAME); } @@ -4915,8 +5424,10 @@ DEF_MDT_HNDL_F(HABEO_CORPUS, DONE_WRITING, mdt_done_writing), DEF_MDT_HNDL_F(0 |HABEO_REFERO, PIN, mdt_pin), DEF_MDT_HNDL_0(0, SYNC, mdt_sync), DEF_MDT_HNDL_F(HABEO_CORPUS|HABEO_REFERO, IS_SUBDIR, mdt_is_subdir), +#ifdef HAVE_QUOTA_SUPPORT DEF_MDT_HNDL_F(0, QUOTACHECK, mdt_quotacheck_handle), DEF_MDT_HNDL_F(0, QUOTACTL, mdt_quotactl_handle) +#endif }; #define DEF_OBD_HNDL(flags, name, fn) \ diff --git a/lustre/mdt/mdt_identity.c b/lustre/mdt/mdt_identity.c index 3243e65..21cd41b 100644 --- a/lustre/mdt/mdt_identity.c +++ b/lustre/mdt/mdt_identity.c @@ -285,10 +285,8 @@ __u32 mdt_identity_get_perm(struct md_identity *identity, int mdt_pack_remote_perm(struct mdt_thread_info *info, struct mdt_object *o, void *buf) { - struct ptlrpc_request *req = mdt_info_req(info); struct md_ucred *uc = mdt_ucred(info); struct md_object *next = mdt_object_child(o); - struct mdt_export_data *med = mdt_req2med(req); struct mdt_remote_perm *perm = buf; ENTRY; @@ -296,7 +294,7 @@ int mdt_pack_remote_perm(struct mdt_thread_info *info, struct mdt_object *o, /* remote client requests always pack ptlrpc_user_desc! 
*/ LASSERT(perm); - if (!med->med_rmtclient) + if (!exp_connect_rmtclient(info->mti_exp)) RETURN(-EBADE); if ((uc->mu_valid != UCRED_OLD) && (uc->mu_valid != UCRED_NEW)) diff --git a/lustre/mdt/mdt_idmap.c b/lustre/mdt/mdt_idmap.c index 08f38c0..162a02f 100644 --- a/lustre/mdt/mdt_idmap.c +++ b/lustre/mdt/mdt_idmap.c @@ -76,12 +76,24 @@ #include "mdt_internal.h" -int mdt_init_idmap(struct mdt_thread_info *info) +#define mdt_init_sec_none(reply, exp) \ +do { \ + reply->ocd_connect_flags &= ~(OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_MDS_CAPA | \ + OBD_CONNECT_OSS_CAPA); \ + spin_lock(&exp->exp_lock); \ + exp->exp_connect_flags = reply->ocd_connect_flags; \ + spin_unlock(&exp->exp_lock); \ +} while (0) + +int mdt_init_sec_level(struct mdt_thread_info *info) { + struct mdt_device *mdt = info->mti_mdt; struct ptlrpc_request *req = mdt_info_req(info); char *client = libcfs_nid2str(req->rq_peer.nid); - struct mdt_export_data *med = mdt_req2med(req); - struct obd_device *obd = req->rq_export->exp_obd; + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; struct obd_connect_data *data, *reply; int rc = 0, remote; ENTRY; @@ -91,26 +103,116 @@ int mdt_init_idmap(struct mdt_thread_info *info) if (data == NULL || reply == NULL) RETURN(-EFAULT); - if (!req->rq_auth_gss || req->rq_auth_usr_mdt) { - med->med_rmtclient = 0; - reply->ocd_connect_flags &= ~OBD_CONNECT_RMT_CLIENT; + /* connection from MDT is always trusted */ + if (req->rq_auth_usr_mdt) { + mdt_init_sec_none(reply, exp); RETURN(0); } - remote = data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT; + /* no GSS support case */ + if (!req->rq_auth_gss) { + if (mdt->mdt_sec_level > LUSTRE_SEC_NONE) { + CWARN("client %s -> target %s does not use GSS, " + "can not run under security level %d.\n", + client, obd->obd_name, mdt->mdt_sec_level); + RETURN(-EACCES); + } else { + mdt_init_sec_none(reply, exp); + RETURN(0); + } + } + + /* old version case */ + if (unlikely(!(data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) || + !(data->ocd_connect_flags & OBD_CONNECT_MDS_CAPA) || + !(data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA))) { + if (mdt->mdt_sec_level > LUSTRE_SEC_NONE) { + CWARN("client %s -> target %s uses old version, " + "can not run under security level %d.\n", + client, obd->obd_name, mdt->mdt_sec_level); + RETURN(-EACCES); + } else { + CWARN("client %s -> target %s uses old version, " + "run under security level %d.\n", + client, obd->obd_name, mdt->mdt_sec_level); + mdt_init_sec_none(reply, exp); + RETURN(0); + } + } + remote = data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT_FORCE; if (remote) { - med->med_rmtclient = 1; if (!req->rq_auth_remote) CDEBUG(D_SEC, "client (local realm) %s -> target %s " - "asked to be remote!\n", client, obd->obd_name); + "asked to be remote.\n", client, obd->obd_name); } else if (req->rq_auth_remote) { - med->med_rmtclient = 1; - CDEBUG(D_SEC, "client (remote realm) %s -> target %s forced " - "to be remote!\n", client, obd->obd_name); + remote = 1; + CDEBUG(D_SEC, "client (remote realm) %s -> target %s is set " + "as remote by default.\n", client, obd->obd_name); + } + + if (remote) { + if (!mdt->mdt_opts.mo_oss_capa) { + CDEBUG(D_SEC, "client %s -> target %s is set as remote," + " but OSS capabilities are not enabled: %d.\n", + client, obd->obd_name, mdt->mdt_opts.mo_oss_capa); + RETURN(-EACCES); + } + } else { + if (req->rq_auth_uid == INVALID_UID) { + CDEBUG(D_SEC, "client %s -> target %s: user is not " + "authenticated!\n", client, 
obd->obd_name); + RETURN(-EACCES); + } } - if (med->med_rmtclient) { + switch (mdt->mdt_sec_level) { + case LUSTRE_SEC_NONE: + if (!remote) { + mdt_init_sec_none(reply, exp); + break; + } else { + CDEBUG(D_SEC, "client %s -> target %s is set as remote, " + "can not run under security level %d.\n", + client, obd->obd_name, mdt->mdt_sec_level); + RETURN(-EACCES); + } + case LUSTRE_SEC_REMOTE: + if (!remote) + mdt_init_sec_none(reply, exp); + break; + case LUSTRE_SEC_ALL: + if (!remote) { + reply->ocd_connect_flags &= ~(OBD_CONNECT_RMT_CLIENT | + OBD_CONNECT_RMT_CLIENT_FORCE); + if (!mdt->mdt_opts.mo_mds_capa) + reply->ocd_connect_flags &= ~OBD_CONNECT_MDS_CAPA; + if (!mdt->mdt_opts.mo_oss_capa) + reply->ocd_connect_flags &= ~OBD_CONNECT_OSS_CAPA; + + spin_lock(&exp->exp_lock); + exp->exp_connect_flags = reply->ocd_connect_flags; + spin_unlock(&exp->exp_lock); + } + break; + default: + RETURN(-EINVAL); + } + + RETURN(rc); +} + +int mdt_init_idmap(struct mdt_thread_info *info) +{ + struct ptlrpc_request *req = mdt_info_req(info); + struct mdt_export_data *med = mdt_req2med(req); + struct obd_export *exp = req->rq_export; + char *client = libcfs_nid2str(req->rq_peer.nid); + struct obd_device *obd = exp->exp_obd; + int rc = 0; + ENTRY; + + if (exp_connect_rmtclient(exp)) { down(&med->med_idmap_sem); if (!med->med_idmap) med->med_idmap = lustre_idmap_init(); @@ -131,28 +233,16 @@ int mdt_init_idmap(struct mdt_thread_info *info) RETURN(-ENOMEM); } - reply->ocd_connect_flags &= ~OBD_CONNECT_LCL_CLIENT; CDEBUG(D_SEC, "client %s -> target %s is remote.\n", client, obd->obd_name); - /* NB, MDS_CONNECT establish root idmap too! */ rc = mdt_handle_idmap(info); - } else { - if (req->rq_auth_uid == INVALID_UID) { - CDEBUG(D_SEC, "client %s -> target %s: user is not " - "authenticated!\n", client, obd->obd_name); - RETURN(-EACCES); - } - reply->ocd_connect_flags &= ~OBD_CONNECT_RMT_CLIENT; } - RETURN(rc); } void mdt_cleanup_idmap(struct mdt_export_data *med) { - LASSERT(med->med_rmtclient); - down(&med->med_idmap_sem); if (med->med_idmap != NULL) { lustre_idmap_fini(med->med_idmap); @@ -185,7 +275,7 @@ int mdt_handle_idmap(struct mdt_thread_info *info) RETURN(0); med = mdt_req2med(req); - if (!med->med_rmtclient) + if (!exp_connect_rmtclient(info->mti_exp)) RETURN(0); opc = lustre_msg_get_opc(req->rq_reqmsg); @@ -262,7 +352,7 @@ int ptlrpc_user_desc_do_idmap(struct ptlrpc_request *req, gid_t gid, fsgid; /* Only remote client need desc_to_idmap. */ - if (!med->med_rmtclient) + if (!exp_connect_rmtclient(req->rq_export)) return 0; uid = lustre_idmap_lookup_uid(NULL, idmap, 0, pud->pud_uid); @@ -317,7 +407,7 @@ void mdt_body_reverse_idmap(struct mdt_thread_info *info, struct mdt_body *body) struct mdt_export_data *med = mdt_req2med(req); struct lustre_idmap_table *idmap = med->med_idmap; - if (!med->med_rmtclient) + if (!exp_connect_rmtclient(info->mti_exp)) return; if (body->valid & OBD_MD_FLUID) { @@ -366,7 +456,7 @@ int mdt_fix_attr_ucred(struct mdt_thread_info *info, __u32 op) * done in cmm/mdd layer, here set all cases as uc->mu_fsgid. */ if ((attr->la_valid & LA_GID) && (attr->la_gid != -1)) attr->la_gid = uc->mu_fsgid; - } else if (med->med_rmtclient) { + } else if (exp_connect_rmtclient(info->mti_exp)) { /* NB: -1 case will be handled by mdt_fix_attr() later. 
*/ if ((attr->la_valid & LA_UID) && (attr->la_uid != -1)) { uid_t uid = lustre_idmap_lookup_uid(uc, idmap, 0, diff --git a/lustre/mdt/mdt_internal.h b/lustre/mdt/mdt_internal.h index 5457a05..c8f215c 100644 --- a/lustre/mdt/mdt_internal.h +++ b/lustre/mdt/mdt_internal.h @@ -71,6 +71,7 @@ #include #include #include +#include static inline __u64 lcd_last_transno(struct lsd_client_data *lcd) { @@ -127,7 +128,8 @@ struct mdt_device { mo_acl :1, mo_compat_resname:1, mo_mds_capa :1, - mo_oss_capa :1; + mo_oss_capa :1, + mo_cos :1; } mdt_opts; /* mdt state flags */ __u32 mdt_fl_cfglog:1, @@ -173,13 +175,23 @@ struct mdt_device { struct lustre_capa_key mdt_capa_keys[2]; unsigned int mdt_capa_conf:1; + /* root squash */ + uid_t mdt_squash_uid; + gid_t mdt_squash_gid; + struct list_head mdt_nosquash_nids; + char *mdt_nosquash_str; + int mdt_nosquash_strlen; + struct rw_semaphore mdt_squash_sem; + cfs_proc_dir_entry_t *mdt_proc_entry; struct lprocfs_stats *mdt_stats; + int mdt_sec_level; }; #define MDT_SERVICE_WATCHDOG_FACTOR (2000) #define MDT_ROCOMPAT_SUPP (OBD_ROCOMPAT_LOVOBJID) #define MDT_INCOMPAT_SUPP (OBD_INCOMPAT_MDT | OBD_INCOMPAT_COMMON_LR) +#define MDT_COS_DEFAULT (1) struct mdt_object { struct lu_object_header mot_header; @@ -310,7 +322,7 @@ struct mdt_thread_info { /* * XXX: Part Three: - * The following members will be filled explictly + * The following members will be filled explicitly * with zero in mdt_reint_unpack(), because they are only used * by reint requests (including mdt_reint_open()). */ @@ -367,6 +379,7 @@ struct mdt_thread_info { /* Ops object filename */ struct lu_name mti_name; + struct md_attr mti_tmp_attr; }; typedef void (*mdt_cb_t)(const struct mdt_device *mdt, __u64 transno, @@ -376,6 +389,12 @@ struct mdt_commit_cb { void *mdt_cb_data; }; +enum mdt_txn_op { + MDT_TXN_CAPA_KEYS_WRITE_OP, + MDT_TXN_LAST_RCVD_WRITE_OP, +}; + + /* * Info allocated per-transaction. 
*/ @@ -530,7 +549,7 @@ extern void target_recovery_fini(struct obd_device *obd); extern void target_recovery_init(struct obd_device *obd, svc_handler_t handler); int mdt_fs_setup(const struct lu_env *, struct mdt_device *, - struct obd_device *); + struct obd_device *, struct lustre_sb_info *lsi); void mdt_fs_cleanup(const struct lu_env *, struct mdt_device *); int mdt_client_del(const struct lu_env *env, @@ -575,8 +594,12 @@ void mdt_shrink_reply(struct mdt_thread_info *info); int mdt_handle_last_unlink(struct mdt_thread_info *, struct mdt_object *, const struct md_attr *); void mdt_reconstruct_open(struct mdt_thread_info *, struct mdt_lock_handle *); + +void mdt_trans_credit_init(const struct lu_env *env, + struct mdt_device *mdt, + enum mdt_txn_op op); struct thandle* mdt_trans_start(const struct lu_env *env, - struct mdt_device *mdt, int credits); + struct mdt_device *mdt); void mdt_trans_stop(const struct lu_env *env, struct mdt_device *mdt, struct thandle *th); int mdt_record_write(const struct lu_env *env, @@ -597,21 +620,16 @@ int mdt_init_ucred_reint(struct mdt_thread_info *); void mdt_exit_ucred(struct mdt_thread_info *); /* mdt_idmap.c */ +int mdt_init_sec_level(struct mdt_thread_info *); int mdt_init_idmap(struct mdt_thread_info *); - void mdt_cleanup_idmap(struct mdt_export_data *); - int mdt_handle_idmap(struct mdt_thread_info *); - int ptlrpc_user_desc_do_idmap(struct ptlrpc_request *, struct ptlrpc_user_desc *); - void mdt_body_reverse_idmap(struct mdt_thread_info *, struct mdt_body *); - int mdt_remote_perm_reverse_idmap(struct ptlrpc_request *, struct mdt_remote_perm *); - int mdt_fix_attr_ucred(struct mdt_thread_info *, __u32); static inline struct mdt_device *mdt_dev(struct lu_device *d) @@ -680,13 +698,15 @@ static inline int is_identity_get_disabled(struct upcall_cache *cache) return cache ? (strcmp(cache->uc_upcall, "NONE") == 0) : 1; } +int mdt_blocking_ast(struct ldlm_lock*, struct ldlm_lock_desc*, void*, int); + /* Issues dlm lock on passed @ns, @f stores it lock handle into @lh. */ static inline int mdt_fid_lock(struct ldlm_namespace *ns, struct lustre_handle *lh, ldlm_mode_t mode, ldlm_policy_data_t *policy, const struct ldlm_res_id *res_id, - int flags) + int flags, const __u64 *client_cookie) { int rc; @@ -694,9 +714,9 @@ static inline int mdt_fid_lock(struct ldlm_namespace *ns, LASSERT(lh != NULL); rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, policy, - mode, &flags, ldlm_blocking_ast, - ldlm_completion_ast, NULL, NULL, - 0, NULL, lh); + mode, &flags, mdt_blocking_ast, + ldlm_completion_ast, + NULL, NULL, 0, NULL, client_cookie, lh); return rc == ELDLM_OK ? 
0 : -EIO; } @@ -749,6 +769,9 @@ static inline struct lu_name *mdt_name_copy(struct lu_name *tlname, return tlname; } +void mdt_enable_cos(struct mdt_device *, int); +int mdt_cos_is_enabled(struct mdt_device *); + /* lprocfs stuff */ void lprocfs_mdt_init_vars(struct lprocfs_static_vars *lvars); int mdt_procfs_init(struct mdt_device *mdt, const char *name); @@ -771,11 +794,11 @@ static inline void mdt_set_capainfo(struct mdt_thread_info *info, int offset, const struct lu_fid *fid, struct lustre_capa *capa) { - struct mdt_device *dev = info->mti_mdt; struct md_capainfo *ci; LASSERT(offset >= 0 && offset <= MD_CAPAINFO_MAX); - if (!dev->mdt_opts.mo_mds_capa) + if (!info->mti_mdt->mdt_opts.mo_mds_capa || + !(info->mti_exp->exp_connect_flags & OBD_CONNECT_MDS_CAPA)) return; ci = md_capainfo(info->mti_env); @@ -808,5 +831,9 @@ static inline void mdt_dump_capainfo(struct mdt_thread_info *info) } } +static inline struct obd_device *mdt2obd_dev(const struct mdt_device *mdt) +{ + return mdt->mdt_md_dev.md_lu_dev.ld_obd; +} #endif /* __KERNEL__ */ #endif /* _MDT_H */ diff --git a/lustre/mdt/mdt_lib.c b/lustre/mdt/mdt_lib.c index c2abdf1..973571e 100644 --- a/lustre/mdt/mdt_lib.c +++ b/lustre/mdt/mdt_lib.c @@ -53,6 +53,7 @@ #define DEBUG_SUBSYSTEM S_MDS #include "mdt_internal.h" +#include typedef enum ucred_init_type { @@ -81,23 +82,60 @@ void mdt_exit_ucred(struct mdt_thread_info *info) } } -/* XXX: root_squash will be redesigned in Lustre 1.7. - * Do not root_squash for inter-MDS operations */ -static int mdt_root_squash(struct mdt_thread_info *info) +static int match_nosquash_list(struct rw_semaphore *sem, + struct list_head *nidlist, + lnet_nid_t peernid) { - return 0; + int rc; + ENTRY; + down_read(sem); + rc = cfs_match_nid(peernid, nidlist); + up_read(sem); + RETURN(rc); +} + +/* root_squash for inter-MDS operations */ +static int mdt_root_squash(struct mdt_thread_info *info, lnet_nid_t peernid) +{ + struct md_ucred *ucred = mdt_ucred(info); + ENTRY; + + if (!info->mti_mdt->mdt_squash_uid || ucred->mu_fsuid) + RETURN(0); + + if (match_nosquash_list(&info->mti_mdt->mdt_squash_sem, + &info->mti_mdt->mdt_nosquash_nids, + peernid)) { + CDEBUG(D_OTHER, "%s is in nosquash_nids list\n", + libcfs_nid2str(peernid)); + RETURN(0); + } + + CDEBUG(D_OTHER, "squash req from %s, (%d:%d/%x)=>(%d:%d/%x)\n", + libcfs_nid2str(peernid), + ucred->mu_fsuid, ucred->mu_fsgid, ucred->mu_cap, + info->mti_mdt->mdt_squash_uid, info->mti_mdt->mdt_squash_gid, + 0); + + ucred->mu_fsuid = info->mti_mdt->mdt_squash_uid; + ucred->mu_fsgid = info->mti_mdt->mdt_squash_gid; + ucred->mu_cap = 0; + ucred->mu_suppgids[0] = -1; + ucred->mu_suppgids[1] = -1; + + RETURN(0); } static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type, void *buf) { struct ptlrpc_request *req = mdt_info_req(info); - struct mdt_export_data *med = mdt_req2med(req); struct mdt_device *mdt = info->mti_mdt; struct ptlrpc_user_desc *pud = req->rq_user_desc; struct md_ucred *ucred = mdt_ucred(info); lnet_nid_t peernid = req->rq_peer.nid; __u32 perm = 0; + __u32 remote = exp_connect_rmtclient(info->mti_exp); int setuid; int setgid; int rc = 0; @@ -123,7 +161,7 @@ static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type, } /* sanity check: we expect the uid which client claimed is true */ - if (med->med_rmtclient) { + if (remote) { if (req->rq_auth_mapped_uid == INVALID_UID) { CDEBUG(D_SEC, "remote user not mapped, deny access!\n"); RETURN(-EACCES); @@ -153,7 +191,7 @@ static int new_init_ucred(struct mdt_thread_info 
*info, ucred_init_type_t type, } if (is_identity_get_disabled(mdt->mdt_identity_cache)) { - if (med->med_rmtclient) { + if (remote) { CDEBUG(D_SEC, "remote client must run with identity_get " "enabled!\n"); RETURN(-EACCES); @@ -169,7 +207,7 @@ static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type, pud->pud_uid); if (IS_ERR(identity)) { if (unlikely(PTR_ERR(identity) == -EREMCHG && - !med->med_rmtclient)) { + !remote)) { ucred->mu_identity = NULL; perm = CFS_SETUID_PERM | CFS_SETGID_PERM | CFS_SETGRP_PERM; @@ -181,8 +219,7 @@ static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type, } else { ucred->mu_identity = identity; perm = mdt_identity_get_perm(ucred->mu_identity, - med->med_rmtclient, - peernid); + remote, peernid); } } @@ -211,7 +248,7 @@ static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type, /* * NB: remote client not allowed to setgroups anyway. */ - if (!med->med_rmtclient && perm & CFS_SETGRP_PERM) { + if (!remote && perm & CFS_SETGRP_PERM) { if (pud->pud_ngroups) { /* setgroups for local client */ ucred->mu_ginfo = groups_alloc(pud->pud_ngroups); @@ -238,14 +275,17 @@ static int new_init_ucred(struct mdt_thread_info *info, ucred_init_type_t type, ucred->mu_fsuid = pud->pud_fsuid; ucred->mu_fsgid = pud->pud_fsgid; - /* XXX: need to process root_squash here. */ - mdt_root_squash(info); + /* process root_squash here. */ + mdt_root_squash(info, peernid); - /* remove fs privilege for non-root user */ + /* remove fs privilege for non-root user. */ if (ucred->mu_fsuid) ucred->mu_cap = pud->pud_cap & ~CFS_CAP_FS_MASK; else ucred->mu_cap = pud->pud_cap; + if (remote && !(perm & CFS_RMTOWN_PERM)) + ucred->mu_cap &= ~(CFS_CAP_SYS_RESOURCE_MASK | + CFS_CAP_CHOWN_MASK); ucred->mu_valid = UCRED_NEW; EXIT; @@ -269,13 +309,13 @@ out: int mdt_check_ucred(struct mdt_thread_info *info) { struct ptlrpc_request *req = mdt_info_req(info); - struct mdt_export_data *med = mdt_req2med(req); struct mdt_device *mdt = info->mti_mdt; struct ptlrpc_user_desc *pud = req->rq_user_desc; struct md_ucred *ucred = mdt_ucred(info); struct md_identity *identity = NULL; lnet_nid_t peernid = req->rq_peer.nid; __u32 perm = 0; + __u32 remote = exp_connect_rmtclient(info->mti_exp); int setuid; int setgid; int rc = 0; @@ -290,7 +330,7 @@ int mdt_check_ucred(struct mdt_thread_info *info) /* sanity check: if we use strong authentication, we expect the * uid which client claimed is true */ - if (med->med_rmtclient) { + if (remote) { if (req->rq_auth_mapped_uid == INVALID_UID) { CDEBUG(D_SEC, "remote user not mapped, deny access!\n"); RETURN(-EACCES); @@ -320,7 +360,7 @@ int mdt_check_ucred(struct mdt_thread_info *info) } if (is_identity_get_disabled(mdt->mdt_identity_cache)) { - if (med->med_rmtclient) { + if (remote) { CDEBUG(D_SEC, "remote client must run with identity_get " "enabled!\n"); RETURN(-EACCES); @@ -331,7 +371,7 @@ int mdt_check_ucred(struct mdt_thread_info *info) identity = mdt_identity_get(mdt->mdt_identity_cache, pud->pud_uid); if (IS_ERR(identity)) { if (unlikely(PTR_ERR(identity) == -EREMCHG && - !med->med_rmtclient)) { + !remote)) { RETURN(0); } else { CDEBUG(D_SEC, "Deny access without identity: uid %u\n", @@ -340,7 +380,7 @@ int mdt_check_ucred(struct mdt_thread_info *info) } } - perm = mdt_identity_get_perm(identity, med->med_rmtclient, peernid); + perm = mdt_identity_get_perm(identity, remote, peernid); /* find out the setuid/setgid attempt */ setuid = (pud->pud_uid != pud->pud_fsuid); setgid = (pud->pud_gid != pud->pud_fsgid || 
@@ -401,10 +441,10 @@ static int old_init_ucred(struct mdt_thread_info *info, } uc->mu_identity = identity; - /* XXX: need to process root_squash here. */ - mdt_root_squash(info); + /* process root_squash here. */ + mdt_root_squash(info, mdt_info_req(info)->rq_peer.nid); - /* remove fs privilege for non-root user */ + /* remove fs privilege for non-root user. */ if (uc->mu_fsuid) uc->mu_cap = body->capability & ~CFS_CAP_FS_MASK; else @@ -441,10 +481,10 @@ static int old_init_ucred_reint(struct mdt_thread_info *info) } uc->mu_identity = identity; - /* XXX: need to process root_squash here. */ - mdt_root_squash(info); + /* process root_squash here. */ + mdt_root_squash(info, mdt_info_req(info)->rq_peer.nid); - /* remove fs privilege for non-root user */ + /* remove fs privilege for non-root user. */ if (uc->mu_fsuid) uc->mu_cap &= ~CFS_CAP_FS_MASK; uc->mu_valid = UCRED_OLD; @@ -525,6 +565,12 @@ void mdt_shrink_reply(struct mdt_thread_info *info) acl_size = body->aclsize; + /* this is a replay - do not send info to the client */ + if (info->mti_spec.no_create == 1) { + md_size = 0; + acl_size = 0; + } + CDEBUG(D_INFO, "Shrink to md_size = %d cookie/acl_size = %d" " MDSCAPA = "LPX64", OSSCAPA = "LPX64"\n", md_size, acl_size, @@ -571,6 +617,7 @@ int mdt_handle_last_unlink(struct mdt_thread_info *info, struct mdt_object *mo, { struct mdt_body *repbody; const struct lu_attr *la = &ma->ma_attr; + int rc; ENTRY; repbody = req_capsule_server_get(info->mti_pill, &RMF_MDT_BODY); @@ -605,6 +652,21 @@ int mdt_handle_last_unlink(struct mdt_thread_info *info, struct mdt_object *mo, repbody->valid |= OBD_MD_FLCOOKIE; } + if (info->mti_mdt->mdt_opts.mo_oss_capa && + info->mti_exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA && + repbody->valid & OBD_MD_FLEASIZE) { + struct lustre_capa *capa; + + capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA2); + LASSERT(capa); + capa->lc_opc = CAPA_OPC_OSS_DESTROY; + rc = mo_capa_get(info->mti_env, mdt_object_child(mo), capa, 0); + if (rc) + RETURN(rc); + + repbody->valid |= OBD_MD_FLOSSCAPA; + } + RETURN(0); } @@ -1001,7 +1063,6 @@ static int mdt_unlink_unpack(struct mdt_thread_info *info) } else { rr->rr_name = NULL; rr->rr_namelen = 0; - } info->mti_spec.sp_ck_split = !!(rec->ul_bias & MDS_CHECK_SPLIT); if (rec->ul_bias & MDS_VTX_BYPASS) ma->ma_attr_flags |= MDS_VTX_BYPASS; else ma->ma_attr_flags &= ~MDS_VTX_BYPASS; + if (lustre_msg_get_flags(mdt_info_req(info)->rq_reqmsg) & MSG_REPLAY) + info->mti_spec.no_create = 1; + rc = mdt_dlmreq_unpack(info); RETURN(rc); } @@ -1140,7 +1204,7 @@ static int mdt_open_unpack(struct mdt_thread_info *info) if (sp->u.sp_ea.eadatalen) { sp->u.sp_ea.eadata = req_capsule_client_get(pill, &RMF_EADATA); if (lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) - sp->u.sp_ea.no_lov_create = 1; + sp->no_create = 1; RETURN(0); } diff --git a/lustre/mdt/mdt_lproc.c b/lustre/mdt/mdt_lproc.c index 171c77a..3ac423a 100644 --- a/lustre/mdt/mdt_lproc.c +++ b/lustre/mdt/mdt_lproc.c @@ -66,6 +66,7 @@ #include #include #include "mdt_internal.h" +#include static const char *mdt_proc_names[LPROC_MDT_NR] = { }; @@ -425,6 +426,227 @@ static int lprocfs_mdt_wr_evict_client(struct file *file, const char *buffer, return count; } +static int lprocfs_rd_sec_level(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + return snprintf(page, count, "%d\n", mdt->mdt_sec_level); +} + +static int lprocfs_wr_sec_level(struct 
file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val > LUSTRE_SEC_ALL || val < LUSTRE_SEC_NONE) + return -EINVAL; + + if (val == LUSTRE_SEC_SPECIFY) { + CWARN("security level %d will be supported in the future.\n", + LUSTRE_SEC_SPECIFY); + return -EINVAL; + } + + mdt->mdt_sec_level = val; + return count; +} + +static int lprocfs_rd_cos(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + return snprintf(page, count, "%u\n", mdt_cos_is_enabled(mdt)); +} + +static int lprocfs_wr_cos(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + mdt_enable_cos(mdt, val); + return count; +} + +static int lprocfs_rd_root_squash(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + ENTRY; + + return snprintf(page, count, "%u:%u\n", mdt->mdt_squash_uid, + mdt->mdt_squash_gid); +} + +static int safe_strtoul(const char *str, char **endp, unsigned long *res) +{ + char n[24]; + + *res = simple_strtoul(str, endp, 0); + if (str == *endp) + return 1; + + sprintf(n, "%lu", *res); + if (strncmp(n, str, *endp - str)) + /* overflow */ + return 1; + return 0; +} + +static int lprocfs_wr_root_squash(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + char kernbuf[50], *tmp, *end; + unsigned long uid, gid; + int nouid, nogid; + ENTRY; + + if (count > (sizeof(kernbuf) - 1) || + copy_from_user(kernbuf, buffer, count)) { + CWARN("%s: can't copy string to kernel space, " + "uid:gid is expected, " + "continue with %u:%u, " + "there will be 0:0 on MDS restart\n", + obd->obd_name, mdt->mdt_squash_uid, + mdt->mdt_squash_gid); + RETURN(count); + } + + if (copy_from_user(kernbuf, buffer, count)) + RETURN(-EFAULT); + + kernbuf[count] = '\0'; + + nouid = nogid = 0; + if (safe_strtoul(buffer, &tmp, &uid)) { + uid = mdt->mdt_squash_uid; + nouid = 1; + } + + /* skip ':' */ + if (*tmp == ':') { + tmp++; + if (safe_strtoul(tmp, &end, &gid)) { + gid = mdt->mdt_squash_gid; + nogid = 1; + } + } else { + gid = mdt->mdt_squash_gid; + nogid = 1; + } + + mdt->mdt_squash_uid = uid; + mdt->mdt_squash_gid = gid; + + if (nouid || nogid) + CWARN("%s: can't parse \"\%s\", uid:gid is expected, " + "continue with %u:%u, " + "there will be %u:%u on MDS restart\n", + obd->obd_name, + buffer, mdt->mdt_squash_uid, mdt->mdt_squash_gid, + nouid ? 0 : mdt->mdt_squash_uid, + nogid ? 
+ +static int lprocfs_rd_nosquash_nids(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + + if (mdt->mdt_nosquash_str) + return snprintf(page, count, "%s\n", mdt->mdt_nosquash_str); + return snprintf(page, count, "NONE\n"); +} + +static int lprocfs_wr_nosquash_nids(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct mdt_device *mdt = mdt_dev(obd->obd_lu_dev); + int rc; + char *new; + struct list_head tmp; + ENTRY; + + /* copy to kernel space */ + OBD_ALLOC(new, count + 1); + if (new == NULL) + GOTO(failed, rc = -ENOMEM); + + if (copy_from_user(new, buffer, count)) + GOTO(failed, rc = -EFAULT); + + new[count] = 0; + if (strlen(new) != count) + GOTO(failed, rc = -EINVAL); + + if (!strcmp(new, "NONE") || !strcmp(new, "clear")) { + /* empty string is special case */ + down_write(&mdt->mdt_squash_sem); + if (!list_empty(&mdt->mdt_nosquash_nids)) { + cfs_free_nidlist(&mdt->mdt_nosquash_nids); + OBD_FREE(mdt->mdt_nosquash_str, + mdt->mdt_nosquash_strlen); + mdt->mdt_nosquash_str = NULL; + mdt->mdt_nosquash_strlen = 0; + } + up_write(&mdt->mdt_squash_sem); + LCONSOLE_INFO("%s: nosquash_nids is cleared\n", + obd->obd_name); + OBD_FREE(new, count + 1); + RETURN(0); + } + + CFS_INIT_LIST_HEAD(&tmp); + if (cfs_parse_nidlist(new, count, &tmp) <= 0) + GOTO(failed, rc = -EINVAL); + + down_write(&mdt->mdt_squash_sem); + if (!list_empty(&mdt->mdt_nosquash_nids)) { + cfs_free_nidlist(&mdt->mdt_nosquash_nids); + OBD_FREE(mdt->mdt_nosquash_str, mdt->mdt_nosquash_strlen); + } + mdt->mdt_nosquash_str = new; + mdt->mdt_nosquash_strlen = count + 1; + list_splice(&tmp, &mdt->mdt_nosquash_nids); + + LCONSOLE_INFO("%s: nosquash_nids is set to %s\n", obd->obd_name, new); + up_write(&mdt->mdt_squash_sem); + RETURN(count); + + failed: + CWARN("%s: failed to set nosquash_nids (rc %d), " + "on MDS restart we will try to set it again, " + "continue with current nosquash_nids\n", + obd->obd_name, rc); + if (new) + OBD_FREE(new, count + 1); + RETURN(count); +} + static struct lprocfs_vars lprocfs_mdt_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "recovery_status", lprocfs_obd_rd_recovery_status, 0, 0 }, @@ -447,6 +669,13 @@ static struct lprocfs_vars lprocfs_mdt_obd_vars[] = { { "site_stats", lprocfs_rd_site_stats, 0, 0 }, { "evict_client", 0, lprocfs_mdt_wr_evict_client, 0 }, { "hash_stats", lprocfs_obd_rd_hash, 0, 0 }, + { "sec_level", lprocfs_rd_sec_level, + lprocfs_wr_sec_level, 0 }, + { "commit_on_sharing", lprocfs_rd_cos, lprocfs_wr_cos, 0 }, + { "root_squash", lprocfs_rd_root_squash, + lprocfs_wr_root_squash, 0 }, + { "nosquash_nids", lprocfs_rd_nosquash_nids, + lprocfs_wr_nosquash_nids, 0 }, { 0 } }; diff --git a/lustre/mdt/mdt_open.c b/lustre/mdt/mdt_open.c index 4c4690d..e2e8802 100644 --- a/lustre/mdt/mdt_open.c +++ b/lustre/mdt/mdt_open.c @@ -501,8 +501,8 @@ static int mdt_finish_open(struct mdt_thread_info *info, int flags, int created, struct ldlm_reply *rep) { struct ptlrpc_request *req = mdt_info_req(info); + struct obd_export *exp = req->rq_export; struct mdt_export_data *med = &req->rq_export->exp_mdt_data; - struct mdt_device *mdt = info->mti_mdt; struct md_attr *ma = &info->mti_attr; struct lu_attr *la = &ma->ma_attr; struct mdt_file_data *mfd; @@ -521,7 +521,7
@@ static int mdt_finish_open(struct mdt_thread_info *info, islnk = S_ISLNK(la->la_mode); mdt_pack_attr2body(info, repbody, la, mdt_object_fid(o)); - if (med->med_rmtclient) { + if (exp_connect_rmtclient(exp)) { void *buf = req_capsule_server_get(info->mti_pill, &RMF_ACL); rc = mdt_pack_remote_perm(info, o, buf); @@ -534,7 +534,7 @@ static int mdt_finish_open(struct mdt_thread_info *info, } } #ifdef CONFIG_FS_POSIX_ACL - else if (req->rq_export->exp_connect_flags & OBD_CONNECT_ACL) { + else if (exp->exp_connect_flags & OBD_CONNECT_ACL) { const struct lu_env *env = info->mti_env; struct md_object *next = mdt_object_child(o); struct lu_buf *buf = &info->mti_buf; @@ -564,26 +564,26 @@ static int mdt_finish_open(struct mdt_thread_info *info, } #endif - if (mdt->mdt_opts.mo_mds_capa) { + if (info->mti_mdt->mdt_opts.mo_mds_capa && + exp->exp_connect_flags & OBD_CONNECT_MDS_CAPA) { struct lustre_capa *capa; capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1); LASSERT(capa); capa->lc_opc = CAPA_OPC_MDS_DEFAULT; - capa->lc_uid = 0; rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0); if (rc) RETURN(rc); repbody->valid |= OBD_MD_FLMDSCAPA; } - if (mdt->mdt_opts.mo_oss_capa && + if (info->mti_mdt->mdt_opts.mo_oss_capa && + exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA && S_ISREG(lu_object_attr(&o->mot_obj.mo_lu))) { struct lustre_capa *capa; capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA2); LASSERT(capa); capa->lc_opc = CAPA_OPC_OSS_DEFAULT | capa_open_opc(flags); - capa->lc_uid = 0; rc = mo_capa_get(info->mti_env, mdt_object_child(o), capa, 0); if (rc) RETURN(rc); @@ -683,7 +683,10 @@ void mdt_reconstruct_open(struct mdt_thread_info *info, ma->ma_lmm = req_capsule_server_get(pill, &RMF_MDT_MD); ma->ma_lmm_size = req_capsule_get_size(pill, &RMF_MDT_MD, RCL_SERVER); - ma->ma_need = MA_INODE | MA_LOV; + ma->ma_need = MA_INODE; + if (ma->ma_lmm_size > 0) + ma->ma_need |= MA_LOV; + ma->ma_valid = 0; mdt_req_from_lcd(req, med->med_lcd); @@ -884,7 +887,10 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) ma->ma_lmm = req_capsule_server_get(info->mti_pill, &RMF_MDT_MD); ma->ma_lmm_size = req_capsule_get_size(info->mti_pill, &RMF_MDT_MD, RCL_SERVER); - ma->ma_need = MA_INODE | MA_LOV; + ma->ma_need = MA_INODE; + if (ma->ma_lmm_size > 0) + ma->ma_need |= MA_LOV; + ma->ma_valid = 0; LASSERT(info->mti_pill->rc_fmt == &RQF_LDLM_INTENT_OPEN); @@ -1003,6 +1009,7 @@ int mdt_reint_open(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) * not exist. 
*/ info->mti_spec.sp_cr_lookup = 0; + info->mti_spec.sp_feat = &dt_directory_features; result = mdo_create(info->mti_env, mdt_object_child(parent), diff --git a/lustre/mdt/mdt_recovery.c b/lustre/mdt/mdt_recovery.c index 32bdeb5..81c9dfa 100644 --- a/lustre/mdt/mdt_recovery.c +++ b/lustre/mdt/mdt_recovery.c @@ -101,28 +101,56 @@ int mdt_record_write(const struct lu_env *env, LASSERTF(dt != NULL, "dt is NULL when we want to write record\n"); LASSERT(th != NULL); - rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA); + rc = dt->do_body_ops->dbo_write(env, dt, buf, pos, th, BYPASS_CAPA, 1); if (rc == buf->lb_len) rc = 0; else if (rc >= 0) rc = -EFAULT; return rc; } -/* only one record write */ -enum { - MDT_TXN_LAST_RCVD_WRITE_CREDITS = 3 -}; +static inline int mdt_trans_credit_get(const struct lu_env *env, + struct mdt_device *mdt, + enum mdt_txn_op op) +{ + struct dt_device *dev = mdt->mdt_bottom; + int cr; + switch (op) { + case MDT_TXN_CAPA_KEYS_WRITE_OP: + case MDT_TXN_LAST_RCVD_WRITE_OP: + cr = dev->dd_ops->dt_credit_get(env, + dev, + DTO_WRITE_BLOCK); + break; + default: + LBUG(); + } + return cr; +} + +void mdt_trans_credit_init(const struct lu_env *env, + struct mdt_device *mdt, + enum mdt_txn_op op) +{ + struct mdt_thread_info *mti; + struct txn_param *p; + int cr; + + mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); + p = &mti->mti_txn_param; + + cr = mdt_trans_credit_get(env, mdt, op); + txn_param_init(p, cr); +} struct thandle* mdt_trans_start(const struct lu_env *env, - struct mdt_device *mdt, int credits) + struct mdt_device *mdt) { struct mdt_thread_info *mti; struct txn_param *p; mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); p = &mti->mti_txn_param; - txn_param_init(p, credits); /* export can require sync operations */ if (mti->mti_exp != NULL) @@ -225,7 +253,8 @@ static inline int mdt_last_rcvd_header_write(const struct lu_env *env, mti = lu_context_key_get(&env->le_ctx, &mdt_thread_key); - th = mdt_trans_start(env, mdt, MDT_TXN_LAST_RCVD_WRITE_CREDITS); + mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -329,7 +358,7 @@ static int mdt_clients_data_init(const struct lu_env *env, { struct lr_server_data *lsd = &mdt->mdt_lsd; struct lsd_client_data *lcd = NULL; - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); loff_t off; int cl_idx; int rc = 0; @@ -419,14 +448,16 @@ err_client: } static int mdt_server_data_init(const struct lu_env *env, - struct mdt_device *mdt) + struct mdt_device *mdt, + struct lustre_sb_info *lsi) { struct lr_server_data *lsd = &mdt->mdt_lsd; struct lsd_client_data *lcd = NULL; - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); struct mdt_thread_info *mti; struct dt_object *obj; struct lu_attr *la; + struct lustre_disk_data *ldd; unsigned long last_rcvd_size; __u64 mount_count; int rc; @@ -479,7 +510,13 @@ static int mdt_server_data_init(const struct lu_env *env, } mount_count = lsd->lsd_mount_count; + ldd = lsi->lsi_ldd; + + if (ldd->ldd_flags & LDD_F_IAM_DIR) + lsd->lsd_feature_incompat |= OBD_INCOMPAT_IAM_DIR; + lsd->lsd_feature_compat = OBD_COMPAT_MDT; + lsd->lsd_feature_incompat |= OBD_INCOMPAT_FID; spin_lock(&mdt->mdt_transno_lock); mdt->mdt_last_transno = lsd->lsd_last_transno; @@ -561,7 +598,7 @@ static int mdt_server_data_update(const struct lu_env *env, void mdt_cb_new_client(const struct mdt_device *mdt, __u64 
transno, void *data, int err) { - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); target_client_add_cb(obd, transno, data, err); } @@ -573,7 +610,7 @@ int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt) struct mdt_export_data *med; struct lsd_client_data *lcd; struct lr_server_data *lsd = &mdt->mdt_lsd; - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); struct thandle *th; loff_t off; int rc; @@ -616,7 +653,8 @@ int mdt_client_new(const struct lu_env *env, struct mdt_device *mdt) LASSERTF(med->med_lr_off > 0, "med_lr_off = %llu\n", med->med_lr_off); /* write new client data */ off = med->med_lr_off; - th = mdt_trans_start(env, mdt, MDT_TXN_LAST_RCVD_WRITE_CREDITS); + mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) RETURN(PTR_ERR(th)); @@ -649,7 +687,7 @@ int mdt_client_add(const struct lu_env *env, struct mdt_thread_info *mti; struct mdt_export_data *med; unsigned long *bitmap = mdt->mdt_client_bitmap; - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); struct lr_server_data *lsd = &mdt->mdt_lsd; int rc = 0; ENTRY; @@ -691,7 +729,7 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) struct mdt_thread_info *mti; struct mdt_export_data *med; struct lsd_client_data *lcd; - struct obd_device *obd = mdt->mdt_md_dev.md_lu_dev.ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); struct thandle *th; loff_t off; int rc = 0; @@ -739,7 +777,8 @@ int mdt_client_del(const struct lu_env *env, struct mdt_device *mdt) * mdt->mdt_last_rcvd may be NULL that time. */ if (mdt->mdt_last_rcvd != NULL) { - th = mdt_trans_start(env, mdt, MDT_TXN_LAST_RCVD_WRITE_CREDITS); + mdt_trans_credit_init(env, mdt, MDT_TXN_LAST_RCVD_WRITE_OP); + th = mdt_trans_start(env, mdt); if (IS_ERR(th)) GOTO(free, rc = PTR_ERR(th)); @@ -847,7 +886,10 @@ extern struct lu_context_key mdt_thread_key; static int mdt_txn_start_cb(const struct lu_env *env, struct txn_param *param, void *cookie) { - param->tp_credits += MDT_TXN_LAST_RCVD_WRITE_CREDITS; + struct mdt_device *mdt = cookie; + + param->tp_credits += mdt_trans_credit_get(env, mdt, + MDT_TXN_LAST_RCVD_WRITE_OP); return 0; } @@ -913,12 +955,12 @@ static int mdt_txn_stop_cb(const struct lu_env *env, return mdt_last_rcvd_update(mti, txn); } -/* commit callback, need to update last_commited value */ +/* commit callback, need to update last_committed value */ static int mdt_txn_commit_cb(const struct lu_env *env, struct thandle *txn, void *cookie) { struct mdt_device *mdt = cookie; - struct obd_device *obd = md2lu_dev(&mdt->mdt_md_dev)->ld_obd; + struct obd_device *obd = mdt2obd_dev(mdt); struct mdt_txn_info *txi; int i; @@ -946,7 +988,8 @@ static int mdt_txn_commit_cb(const struct lu_env *env, } int mdt_fs_setup(const struct lu_env *env, struct mdt_device *mdt, - struct obd_device *obd) + struct obd_device *obd, + struct lustre_sb_info *lsi) { struct lu_fid fid; struct dt_object *o; @@ -965,10 +1008,10 @@ int mdt_fs_setup(const struct lu_env *env, struct mdt_device *mdt, dt_txn_callback_add(mdt->mdt_bottom, &mdt->mdt_txn_cb); - o = dt_store_open(env, mdt->mdt_bottom, LAST_RCVD, &fid); + o = dt_store_open(env, mdt->mdt_bottom, "", LAST_RCVD, &fid); if (!IS_ERR(o)) { mdt->mdt_last_rcvd = o; - rc = mdt_server_data_init(env, mdt); + rc = mdt_server_data_init(env, mdt, lsi); if (rc) GOTO(put_last_rcvd, rc); } else { @@ -977,7 
+1020,7 @@ int mdt_fs_setup(const struct lu_env *env, struct mdt_device *mdt, RETURN(rc); } - o = dt_store_open(env, mdt->mdt_bottom, CAPA_KEYS, &fid); + o = dt_store_open(env, mdt->mdt_bottom, "", CAPA_KEYS, &fid); if (!IS_ERR(o)) { mdt->mdt_ck_obj = o; rc = mdt_capa_keys_init(env, mdt); @@ -1051,9 +1094,8 @@ static void mdt_steal_ack_locks(struct ptlrpc_request *req) libcfs_nid2str(exp->exp_connection->c_peer.nid)); for (i = 0; i < oldrep->rs_nlocks; i++) - ptlrpc_save_lock(req, - &oldrep->rs_locks[i], - oldrep->rs_modes[i]); + ptlrpc_save_lock(req, &oldrep->rs_locks[i], + oldrep->rs_modes[i], 0); oldrep->rs_nlocks = 0; DEBUG_REQ(D_HA, req, "stole locks for"); diff --git a/lustre/mdt/mdt_reint.c b/lustre/mdt/mdt_reint.c index bfdff5f..4de1f39 100644 --- a/lustre/mdt/mdt_reint.c +++ b/lustre/mdt/mdt_reint.c @@ -80,7 +80,8 @@ static int mdt_create_pack_capa(struct mdt_thread_info *info, int rc, if (repbody->valid & OBD_MD_FLMDSCAPA) RETURN(rc); - if (rc == 0 && info->mti_mdt->mdt_opts.mo_mds_capa) { + if (rc == 0 && info->mti_mdt->mdt_opts.mo_mds_capa && + info->mti_exp->exp_connect_flags & OBD_CONNECT_MDS_CAPA) { struct lustre_capa *capa; capa = req_capsule_server_get(info->mti_pill, &RMF_CAPA1); @@ -144,6 +145,7 @@ static int mdt_md_create(struct mdt_thread_info *info) * or not. */ info->mti_spec.sp_cr_lookup = 1; + info->mti_spec.sp_feat = &dt_directory_features; lname = mdt_name(info->mti_env, (char *)rr->rr_name, rr->rr_namelen); @@ -291,7 +293,6 @@ out_unlock: static int mdt_reint_setattr(struct mdt_thread_info *info, struct mdt_lock_handle *lhc) { - struct mdt_device *mdt = info->mti_mdt; struct md_attr *ma = &info->mti_attr; struct mdt_reint_record *rr = &info->mti_rr; struct ptlrpc_request *req = mdt_info_req(info); @@ -387,7 +388,8 @@ static int mdt_reint_setattr(struct mdt_thread_info *info, mdt_pack_attr2body(info, repbody, &ma->ma_attr, mdt_object_fid(mo)); - if (mdt->mdt_opts.mo_oss_capa && + if (info->mti_mdt->mdt_opts.mo_oss_capa && + info->mti_exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA && S_ISREG(lu_object_attr(&mo->mot_obj.mo_lu)) && (ma->ma_attr.la_valid & LA_SIZE)) { struct lustre_capa *capa; @@ -470,9 +472,9 @@ static int mdt_reint_unlink(struct mdt_thread_info *info, if (OBD_FAIL_CHECK(OBD_FAIL_MDS_REINT_UNLINK)) RETURN(err_serious(-ENOENT)); - /* + /* * step 1: lock the parent. Note, this may be child in case of - * remote operation denoted by ->mti_cross_ref flag. + * remote operation denoted by ->mti_cross_ref flag. 
*/ parent_lh = &info->mti_lh[MDT_LH_PARENT]; if (info->mti_cross_ref) { @@ -751,7 +753,9 @@ static int mdt_rename_lock(struct mdt_thread_info *info, rc = ldlm_cli_enqueue_local(ns, res_id, LDLM_IBITS, policy, LCK_EX, &flags, ldlm_blocking_ast, ldlm_completion_ast, NULL, NULL, 0, - NULL, lh); + NULL, + &info->mti_exp->exp_handle.h_cookie, + lh); } else { struct ldlm_enqueue_info einfo = { LDLM_IBITS, LCK_EX, ldlm_blocking_ast, ldlm_completion_ast, NULL, NULL, NULL }; diff --git a/lustre/mdt/mdt_xattr.c b/lustre/mdt/mdt_xattr.c index 47cce00..5a93bfe 100644 --- a/lustre/mdt/mdt_xattr.c +++ b/lustre/mdt/mdt_xattr.c @@ -128,6 +128,8 @@ int mdt_getxattr(struct mdt_thread_info *info) struct mdt_body *repbody = NULL; struct md_object *next; struct lu_buf *buf; + __u32 remote = exp_connect_rmtclient(info->mti_exp); + __u32 perm; int easize, rc; ENTRY; @@ -147,11 +149,11 @@ int mdt_getxattr(struct mdt_thread_info *info) next = mdt_object_child(info->mti_object); if (info->mti_body->valid & OBD_MD_FLRMTRGETFACL) { - __u32 perm = mdt_identity_get_perm(uc->mu_identity, - med->med_rmtclient, - req->rq_peer.nid); + if (unlikely(!remote)) + GOTO(out, rc = err_serious(-EINVAL)); - LASSERT(med->med_rmtclient); + perm = mdt_identity_get_perm(uc->mu_identity, remote, + req->rq_peer.nid); if (!(perm & CFS_RMTACL_PERM)) GOTO(out, rc = err_serious(-EPERM)); @@ -197,7 +199,9 @@ int mdt_getxattr(struct mdt_thread_info *info) if (rc > 0 && flags != CFS_IC_NOTHING) { int rc1; - LASSERT(med->med_rmtclient); + if (unlikely(!remote)) + GOTO(out, rc = -EINVAL); + rc1 = lustre_posix_acl_xattr_id2client(uc, med->med_idmap, (posix_acl_xattr_header *)(buf->lb_buf), @@ -275,7 +279,6 @@ int mdt_reint_setxattr(struct mdt_thread_info *info, struct mdt_lock_handle *unused) { struct ptlrpc_request *req = mdt_info_req(info); - struct mdt_export_data *med = mdt_req2med(req); struct md_ucred *uc = mdt_ucred(info); const char user_string[] = "user."; const char trust_string[] = "trusted."; @@ -294,6 +297,8 @@ int mdt_reint_setxattr(struct mdt_thread_info *info, __u64 lockpart; int rc; posix_acl_xattr_header *new_xattr = NULL; + __u32 remote = exp_connect_rmtclient(info->mti_exp); + __u32 perm; ENTRY; CDEBUG(D_INODE, "setxattr for "DFID"\n", PFID(rr->rr_fid1)); @@ -311,11 +316,11 @@ int mdt_reint_setxattr(struct mdt_thread_info *info, RETURN(rc); if (valid & OBD_MD_FLRMTRSETFACL) { - __u32 perm = mdt_identity_get_perm(uc->mu_identity, - med->med_rmtclient, - req->rq_peer.nid); + if (unlikely(!remote)) + GOTO(out, rc = err_serious(-EINVAL)); - LASSERT(med->med_rmtclient); + perm = mdt_identity_get_perm(uc->mu_identity, remote, + req->rq_peer.nid); if (!(perm & CFS_RMTACL_PERM)) GOTO(out, rc = err_serious(-EPERM)); } @@ -368,7 +373,9 @@ int mdt_reint_setxattr(struct mdt_thread_info *info, xattr = req_capsule_client_get(pill, &RMF_EADATA); if (valid & OBD_MD_FLRMTLSETFACL) { - LASSERT(med->med_rmtclient); + if (unlikely(!remote)) + GOTO(out_unlock, rc = -EINVAL); + xattr_len = mdt_rmtlsetfacl(info, child, xattr_name, (ext_acl_xattr_header *)xattr, diff --git a/lustre/mgc/mgc_internal.h b/lustre/mgc/mgc_internal.h index d76bc67..cb74025 100644 --- a/lustre/mgc/mgc_internal.h +++ b/lustre/mgc/mgc_internal.h @@ -53,4 +53,6 @@ static void lprocfs_mgc_init_vars(struct lprocfs_static_vars *lvars) } #endif /* LPROCFS */ +int mgc_process_log(struct obd_device *mgc, struct config_llog_data *cld); + #endif /* _MGC_INTERNAL_H */ diff --git a/lustre/mgc/mgc_request.c b/lustre/mgc/mgc_request.c index d995ccb..bf696c1 100644 --- 
a/lustre/mgc/mgc_request.c +++ b/lustre/mgc/mgc_request.c @@ -126,28 +126,42 @@ static int config_log_get(struct config_llog_data *cld) static void config_log_put(struct config_llog_data *cld) { ENTRY; + CDEBUG(D_INFO, "log %s refs %d\n", cld->cld_logname, atomic_read(&cld->cld_refcount)); - if (atomic_dec_and_test(&cld->cld_refcount)) { - CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); - class_export_put(cld->cld_mgcexp); - spin_lock(&config_list_lock); + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + /* spinlock to make sure no item with 0 refcount in the list */ + spin_lock(&config_list_lock); + if (unlikely(atomic_dec_and_test(&cld->cld_refcount))) { list_del(&cld->cld_list_chain); spin_unlock(&config_list_lock); + + CDEBUG(D_MGC, "dropping config log %s\n", cld->cld_logname); + + if (cld->cld_sptlrpc) + config_log_put(cld->cld_sptlrpc); + if (cld->cld_is_sptlrpc) + sptlrpc_conf_log_stop(cld->cld_logname); + + class_export_put(cld->cld_mgcexp); OBD_FREE(cld->cld_logname, strlen(cld->cld_logname) + 1); if (cld->cld_cfg.cfg_instance != NULL) OBD_FREE(cld->cld_cfg.cfg_instance, strlen(cld->cld_cfg.cfg_instance) + 1); OBD_FREE(cld, sizeof(*cld)); + } else { + spin_unlock(&config_list_lock); } + EXIT; } /* Find a config log by name */ -static struct config_llog_data *config_log_find(char *logname, - struct config_llog_instance *cfg) +static +struct config_llog_data *config_log_find(char *logname, + struct config_llog_instance *cfg) { - struct list_head *tmp; struct config_llog_data *cld; char *logid = logname; int match_instance = 0; @@ -163,8 +177,7 @@ static struct config_llog_data *config_log_find(char *logname, } spin_lock(&config_list_lock); - list_for_each(tmp, &config_llog_list) { - cld = list_entry(tmp, struct config_llog_data, cld_list_chain); + list_for_each_entry(cld, &config_llog_list, cld_list_chain) { if (match_instance && cld->cld_cfg.cfg_instance && strcmp(logid, cld->cld_cfg.cfg_instance) == 0) goto out_found; @@ -179,74 +192,169 @@ static struct config_llog_data *config_log_find(char *logname, out_found: atomic_inc(&cld->cld_refcount); spin_unlock(&config_list_lock); + LASSERT(cld->cld_stopping == 0 || cld->cld_is_sptlrpc == 0); RETURN(cld); } -/* Add this log to our list of active logs. - We have one active log per "mount" - client instance or servername. - Each instance may be at a different point in the log. */ -static int config_log_add(char *logname, struct config_llog_instance *cfg, - struct super_block *sb) +static +struct config_llog_data *do_config_log_add(struct obd_device *obd, + char *logname, + unsigned int is_sptlrpc, + struct config_llog_instance *cfg, + struct super_block *sb) { struct config_llog_data *cld; - struct lustre_sb_info *lsi = s2lsi(sb); - int rc; + int rc; ENTRY; - CDEBUG(D_MGC, "adding config log %s:%s\n", logname, cfg->cfg_instance); + CDEBUG(D_MGC, "do adding config log %s:%s\n", logname, + cfg ? 
cfg->cfg_instance : "NULL"); OBD_ALLOC(cld, sizeof(*cld)); if (!cld) - RETURN(-ENOMEM); + RETURN(ERR_PTR(-ENOMEM)); OBD_ALLOC(cld->cld_logname, strlen(logname) + 1); if (!cld->cld_logname) { OBD_FREE(cld, sizeof(*cld)); - RETURN(-ENOMEM); + RETURN(ERR_PTR(-ENOMEM)); } strcpy(cld->cld_logname, logname); - cld->cld_cfg = *cfg; + if (cfg) + cld->cld_cfg = *cfg; cld->cld_cfg.cfg_last_idx = 0; cld->cld_cfg.cfg_flags = 0; cld->cld_cfg.cfg_sb = sb; + cld->cld_is_sptlrpc = is_sptlrpc; atomic_set(&cld->cld_refcount, 1); /* Keep the mgc around until we are done */ - cld->cld_mgcexp = class_export_get(lsi->lsi_mgc->obd_self_export); + cld->cld_mgcexp = class_export_get(obd->obd_self_export); - if (cfg->cfg_instance != NULL) { + if (cfg && cfg->cfg_instance != NULL) { OBD_ALLOC(cld->cld_cfg.cfg_instance, strlen(cfg->cfg_instance) + 1); strcpy(cld->cld_cfg.cfg_instance, cfg->cfg_instance); } + + if (is_sptlrpc) { + sptlrpc_conf_log_start(logname); + cld->cld_cfg.cfg_obdname = obd->obd_name; + } + rc = mgc_logname2resid(logname, &cld->cld_resid); + spin_lock(&config_list_lock); list_add(&cld->cld_list_chain, &config_llog_list); spin_unlock(&config_list_lock); if (rc) { config_log_put(cld); - RETURN(rc); + RETURN(ERR_PTR(rc)); } - RETURN(rc); + if (is_sptlrpc) { + rc = mgc_process_log(obd, cld); + if (rc) + CERROR("failed processing sptlrpc log: %d\n", rc); + } + + RETURN(cld); } +/** + * Add this log to our list of active logs. + * We have one active log per "mount" - client instance or servername. + * Each instance may be at a different point in the log. + */ +static int config_log_add(struct obd_device *obd, char *logname, + struct config_llog_instance *cfg, + struct super_block *sb) +{ + struct config_llog_data *cld, *sptlrpc_cld; + char seclogname[20]; + char *ptr; + ENTRY; + + CDEBUG(D_MGC, "adding config log %s:%s\n", logname, cfg->cfg_instance); + + /* + * for each regular log, the dependent sptlrpc log name is + * <fsname>-sptlrpc. multiple regular logs may share one sptlrpc log. + */ + ptr = strrchr(logname, '-'); + if (ptr == NULL || ptr - logname > 8) { + CERROR("logname %s is too long\n", logname); + RETURN(-EINVAL); + } + + memcpy(seclogname, logname, ptr - logname); + strcpy(seclogname + (ptr - logname), "-sptlrpc"); + + sptlrpc_cld = config_log_find(seclogname, NULL); + if (IS_ERR(sptlrpc_cld)) { + sptlrpc_cld = do_config_log_add(obd, seclogname, 1, NULL, NULL); + if (IS_ERR(sptlrpc_cld)) { + CERROR("can't create sptlrpc log: %s\n", seclogname); + RETURN(PTR_ERR(sptlrpc_cld)); + } + } + + cld = do_config_log_add(obd, logname, 0, cfg, sb); + if (IS_ERR(cld)) { + CERROR("can't create log: %s\n", logname); + config_log_put(sptlrpc_cld); + RETURN(PTR_ERR(cld)); + } + + cld->cld_sptlrpc = sptlrpc_cld; + + RETURN(0); +}
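config_log_add() above derives the sptlrpc log name by cutting logname at its last '-' and appending "-sptlrpc". The same derivation as a stand-alone sketch (the logname value is only an example):

#include <stdio.h>
#include <string.h>

int main(void)
{
        const char *logname = "lustre-client";  /* example client log */
        char seclogname[20];                    /* fsname <= 8 chars */
        const char *ptr = strrchr(logname, '-');

        if (ptr == NULL || ptr - logname > 8)
                return 1;               /* no '-', or fsname too long */

        memcpy(seclogname, logname, ptr - logname);
        strcpy(seclogname + (ptr - logname), "-sptlrpc");
        printf("%s -> %s\n", logname, seclogname); /* lustre-sptlrpc */
        return 0;
}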
+ +DECLARE_MUTEX(llog_process_lock); + /* Stop watching for updates on this log. */ static int config_log_end(char *logname, struct config_llog_instance *cfg) { - struct config_llog_data *cld; + struct config_llog_data *cld, *cld_sptlrpc = NULL; int rc = 0; ENTRY; cld = config_log_find(logname, cfg); if (IS_ERR(cld)) RETURN(PTR_ERR(cld)); - /* drop the ref from the find */ - config_log_put(cld); + + down(&llog_process_lock); + /* + * if cld_stopping is set, it means we didn't start the log and thus + * do not own the start ref. this can happen after a previous umount: + * the cld is still hanging there waiting for lock cancel, then we + * remount again but fail in the middle and call log_end without + * calling start_log. + */ + if (unlikely(cld->cld_stopping)) { + up(&llog_process_lock); + /* drop the ref from the find */ + config_log_put(cld); + RETURN(rc); + } cld->cld_stopping = 1; + up(&llog_process_lock); + + spin_lock(&config_list_lock); + cld_sptlrpc = cld->cld_sptlrpc; + cld->cld_sptlrpc = NULL; + spin_unlock(&config_list_lock); + + if (cld_sptlrpc) + config_log_put(cld_sptlrpc); + + /* drop the ref from the find */ + config_log_put(cld); /* drop the start ref */ config_log_put(cld); + CDEBUG(D_MGC, "end config log %s (%d)\n", logname ? logname : "client", rc); RETURN(rc); @@ -260,14 +368,29 @@ static int config_log_end(char *logname, struct config_llog_instance *cfg) static int rq_state = 0; static cfs_waitq_t rq_waitq; -static int mgc_process_log(struct obd_device *mgc, - struct config_llog_data *cld); static int mgc_requeue_add(struct config_llog_data *cld, int later); +static void do_requeue(struct config_llog_data *cld) +{ + LASSERT(atomic_read(&cld->cld_refcount) > 0); + + if (cld->cld_mgcexp->exp_obd->u.cli.cl_conn_count != 0) { + CDEBUG(D_MGC, "updating log %s\n", cld->cld_logname); + mgc_process_log(cld->cld_mgcexp->exp_obd, cld); + } else { + CDEBUG(D_MGC, "disconnecting, won't update log %s\n", + cld->cld_logname); + } + + /* Whether we enqueued again or not in mgc_process_log, we're done + * with the ref from the old enqueue */ + config_log_put(cld); +} + static int mgc_requeue_thread(void *data) { struct l_wait_info lwi_now, lwi_later; - struct config_llog_data *cld, *n; + struct config_llog_data *cld, *cld_next, *cld_prev; char name[] = "ll_cfg_requeue"; int rc = 0; ENTRY; @@ -293,22 +416,52 @@ static int mgc_requeue_thread(void *data) NULL, NULL); l_wait_event(rq_waitq, rq_state & RQ_STOP, &lwi_now); + /* + * iterate through the list; for each cld, process + * its dependent sptlrpc cld first (if any) and then the cld itself. + * + * it's guaranteed that any item in the list has + * reference > 0; and if cld_lostlock is set, at + * least one reference is held by the previous enqueue. + * + * Note: releasing a cld might cause it and its dependent + * sptlrpc cld to be unlinked from the list. to iterate safely + * we need to take a reference on the next cld before processing. + */ + cld_prev = NULL; + spin_lock(&config_list_lock); - list_for_each_entry_safe(cld, n, &config_llog_list, + list_for_each_entry_safe(cld, cld_next, &config_llog_list, cld_list_chain) { - spin_unlock(&config_list_lock); + if (cld->cld_list_chain.next != &config_llog_list) + atomic_inc(&cld_next->cld_refcount); + if (cld->cld_lostlock) { - CDEBUG(D_MGC, "updating log %s\n", - cld->cld_logname); + if (cld->cld_sptlrpc && + cld->cld_sptlrpc->cld_lostlock) { + cld->cld_sptlrpc->cld_lostlock = 0; + + spin_unlock(&config_list_lock); + do_requeue(cld->cld_sptlrpc); + spin_lock(&config_list_lock); + LASSERT(cld->cld_lostlock); + } + cld->cld_lostlock = 0; - rc = mgc_process_log(cld->cld_mgcexp->exp_obd, - cld); - /* Whether we enqueued again or not in - mgc_process_log, we're done with the ref - from the old enqueue */ - config_log_put(cld); + + spin_unlock(&config_list_lock); + do_requeue(cld); + spin_lock(&config_list_lock); } - spin_lock(&config_list_lock); + + + if (cld_prev) { + spin_unlock(&config_list_lock); + config_log_put(cld_prev); + spin_lock(&config_list_lock); + } + + cld_prev = cld_next; } spin_unlock(&config_list_lock);
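The requeue loop above stays safe against concurrent unlinks by pinning the successor before the list lock is dropped. The same idiom reduced to a toy singly linked list, with a pthread mutex standing in for config_list_lock and a simplified put (a sketch only, not Lustre code):

#include <pthread.h>
#include <stddef.h>

struct node {
        struct node *next;
        int          refs;       /* both fields guarded by list_lock */
        int          lost_lock;
};

static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;

/* stands in for config_log_put(): takes the lock itself, and the real
 * code also frees and unlinks the node when refs reaches zero */
static void node_put(struct node *n)
{
        pthread_mutex_lock(&list_lock);
        n->refs--;
        pthread_mutex_unlock(&list_lock);
}

static void walk(struct node *head, void (*process)(struct node *))
{
        struct node *cur, *next, *pinned = NULL;

        pthread_mutex_lock(&list_lock);
        for (cur = head; cur != NULL; cur = next) {
                next = cur->next;
                if (next != NULL)
                        next->refs++;           /* pin the successor */

                if (cur->lost_lock) {
                        cur->lost_lock = 0;
                        pthread_mutex_unlock(&list_lock);
                        process(cur);           /* may drop cur's own ref */
                        pthread_mutex_lock(&list_lock);
                }

                if (pinned != NULL) {           /* the pin that kept cur alive */
                        pthread_mutex_unlock(&list_lock);
                        node_put(pinned);
                        pthread_mutex_lock(&list_lock);
                }
                pinned = next;
        }
        pthread_mutex_unlock(&list_lock);
}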
@@ -334,11 +487,13 @@ static int mgc_requeue_add(struct config_llog_data *cld, int later) CDEBUG(D_INFO, "log %s: requeue (l=%d r=%d sp=%d st=%x)\n", cld->cld_logname, later, atomic_read(&cld->cld_refcount), cld->cld_stopping, rq_state); + LASSERT(atomic_read(&cld->cld_refcount) > 0); /* Hold lock for rq_state */ spin_lock(&config_list_lock); if (cld->cld_stopping || (rq_state & RQ_STOP)) { + cld->cld_lostlock = 0; spin_unlock(&config_list_lock); config_log_put(cld); RETURN(0); @@ -531,6 +686,7 @@ static int mgc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) lprocfs_mgc_init_vars(&lvars); lprocfs_obd_setup(obd, lvars.obd_vars); + sptlrpc_lprocfs_cliobd_attach(obd); spin_lock(&config_list_lock); atomic_inc(&mgc_count); @@ -594,7 +750,8 @@ static int mgc_blocking_ast(struct ldlm_lock *lock, struct ldlm_lock_desc *desc, break; } /* Did we fail to get the lock? */ - if (lock->l_req_mode != lock->l_granted_mode) { + if (lock->l_req_mode != lock->l_granted_mode && + !cld->cld_is_sptlrpc) { CDEBUG(D_MGC, "log %s: original grant failed, will " "requeue later\n", cld->cld_logname); /* Try to re-enqueue later */ @@ -863,6 +1020,49 @@ int mgc_set_info_async(struct obd_export *exp, obd_count keylen, rc = mgc_set_mgs_param(exp, msp); RETURN(rc); } + if (KEY_IS(KEY_MGSSEC)) { + struct client_obd *cli = &exp->exp_obd->u.cli; + struct sptlrpc_flavor flvr; + + /* + * an empty string means using the current flavor; if none has + * been set yet, set it to null. + * + * if a flavor has been set previously, the requested flavor + * must match the existing one. + */ + if (vallen == 0) { + if (cli->cl_flvr_mgc.sf_rpc != SPTLRPC_FLVR_INVALID) + RETURN(0); + val = "null"; + vallen = 4; + } + + rc = sptlrpc_parse_flavor(val, &flvr); + if (rc) { + CERROR("invalid sptlrpc flavor %s to MGS\n", + (char *) val); + RETURN(rc); + } + + /* + * the caller already holds a mutex + */ + if (cli->cl_flvr_mgc.sf_rpc == SPTLRPC_FLVR_INVALID) { + cli->cl_flvr_mgc = flvr; + } else if (memcmp(&cli->cl_flvr_mgc, &flvr, + sizeof(flvr)) != 0) { + char str[20]; + + sptlrpc_flavor2name(&cli->cl_flvr_mgc, + str, sizeof(str)); + LCONSOLE_ERROR("requesting sptlrpc flavor %s to MGS but " + "currently %s is in use\n", + (char *) val, str); + rc = -EPERM; + } + RETURN(rc); + } RETURN(rc); }
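The KEY_MGSSEC branch above in miniature: the first flavor set on the MGC sticks, an empty request keeps whatever is already in use (or falls back to "null" when nothing is set), and a conflicting later request is refused. Stand-in types, assuming cur starts zeroed; this is not the sptlrpc API:

#include <string.h>

struct flavor_sketch {
        char name[20];           /* "" plays SPTLRPC_FLVR_INVALID here */
};

static int mgc_set_flavor_sketch(struct flavor_sketch *cur, const char *req)
{
        if (req == NULL || req[0] == '\0') {
                if (cur->name[0] != '\0')
                        return 0;               /* keep the current flavor */
                req = "null";                   /* nothing set yet */
        }

        if (cur->name[0] == '\0') {
                strncpy(cur->name, req, sizeof(cur->name) - 1);
                return 0;                       /* first setting wins */
        }

        return strcmp(cur->name, req) ? -1 : 0; /* -EPERM on a mismatch */
}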
@@ -1074,32 +1274,36 @@ out: RETURN(rc); } -DECLARE_MUTEX(llog_process_lock); - /* Get a config log from the MGS and process it. This func is called for both clients and servers. */ -static int mgc_process_log(struct obd_device *mgc, - struct config_llog_data *cld) +int mgc_process_log(struct obd_device *mgc, + struct config_llog_data *cld) { struct llog_ctxt *ctxt, *lctxt; struct lustre_handle lockh; struct client_obd *cli = &mgc->u.cli; struct lvfs_run_ctxt saved; - struct lustre_sb_info *lsi; + struct lustre_sb_info *lsi = NULL; int rc = 0, rcl, flags = 0, must_pop = 0; ENTRY; - if (!cld || !cld->cld_cfg.cfg_sb) { - /* This should never happen */ - CERROR("Missing cld, aborting log update\n"); - RETURN(-EINVAL); - } - if (cld->cld_stopping) + LASSERT(cld); + + /* I don't want multiple processes running process_log at once -- + sounds like badness. It actually might be fine, as long as + we're not trying to update from the same log + simultaneously (in which case we should use a per-log sem.) */ + down(&llog_process_lock); + + if (cld->cld_stopping) { + up(&llog_process_lock); RETURN(0); + } OBD_FAIL_TIMEOUT(OBD_FAIL_MGC_PAUSE_PROCESS_LOG, 20); - lsi = s2lsi(cld->cld_cfg.cfg_sb); + if (cld->cld_cfg.cfg_sb) + lsi = s2lsi(cld->cld_cfg.cfg_sb); CDEBUG(D_MGC, "Process log %s:%s from %d\n", cld->cld_logname, cld->cld_cfg.cfg_instance, cld->cld_cfg.cfg_last_idx + 1); @@ -1107,15 +1311,10 @@ static int mgc_process_log, ctxt = llog_get_context(mgc, LLOG_CONFIG_REPL_CTXT); if (!ctxt) { CERROR("missing llog context\n"); + up(&llog_process_lock); RETURN(-EINVAL); } - /* I don't want mutliple processes running process_log at once -- sounds like badness. It actually might be fine, as long as we're not trying to update from the same log simultaneously (in which case we should use a per-log sem.) */ - down(&llog_process_lock); - /* Get the cfg lock on the llog */ rcl = mgc_enqueue(mgc->u.cli.cl_mgc_mgsexp, NULL, LDLM_PLAIN, NULL, LCK_CR, &flags, NULL, NULL, NULL, @@ -1125,6 +1324,13 @@ lctxt = llog_get_context(mgc, LLOG_CONFIG_ORIG_CTXT); + /* + * the local copy of the sptlrpc log is controlled elsewhere, don't try + * to read it here. + */ + if (rcl && cld->cld_is_sptlrpc) + goto out_pop; + /* Copy the setup log locally if we can. Don't mess around if we're running an MGS though (logs are already local). */ if (lctxt && lsi && (lsi->lsi_flags & LSI_SERVER) && @@ -1153,6 +1359,9 @@ ctxt = lctxt; } + if (cld->cld_is_sptlrpc) + sptlrpc_conf_log_update_begin(cld->cld_logname); + /* logname and instance info should be the same, so use our copy of the instance for the update. The cfg_last_idx will be updated here. */ @@ -1164,6 +1373,19 @@ out_pop: if (must_pop) pop_ctxt(&saved, &mgc->obd_lvfs_ctxt, NULL); + /* + * update settings on existing OBDs. doing it inside + * of llog_process_lock so no device is attaching/detaching + * in parallel.
+ * the logname must be <fsname>-sptlrpc + */ + if (cld->cld_is_sptlrpc && rcl == 0) { + sptlrpc_conf_log_update_end(cld->cld_logname); + class_notify_sptlrpc_conf(cld->cld_logname, + strlen(cld->cld_logname) - + strlen("-sptlrpc")); + } + /* Now drop the lock so MGS can revoke it */ if (!rcl) { rcl = mgc_cancel(mgc->u.cli.cl_mgc_mgsexp, NULL, @@ -1208,6 +1430,10 @@ static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf) CERROR("lov_del_obd unimplemented\n"); rc = -ENOSYS; break; + case LCFG_SPTLRPC_CONF: { + rc = sptlrpc_process_config(lcfg); + break; + } case LCFG_LOG_START: { struct config_llog_data *cld; struct config_llog_instance *cfg; @@ -1220,7 +1446,7 @@ static int mgc_process_config(struct obd_device *obd, obd_count len, void *buf) cfg->cfg_last_idx); /* We're only called through here on the initial mount */ - rc = config_log_add(logname, cfg, sb); + rc = config_log_add(obd, logname, cfg, sb); if (rc) break; cld = config_log_find(logname, cfg);
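For reference, seq_show_srpc_rules() in the lproc_mgs.c hunk below emits one line per rule of the form <tgtname>.srpc.flavor.<net>[.<from>2<to>]=<flavor>; for example (the flavor name is only an illustration):

#include <stdio.h>

int main(void)
{
        /* <tgtname>.srpc.flavor.<net>[.<from>2<to>]=<flavor> */
        printf("%s.srpc.flavor.%s%s=%s\n",
               "lustre", "tcp", ".cli2mdt", "krb5p");
        /* prints: lustre.srpc.flavor.tcp.cli2mdt=krb5p */
        return 0;
}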
diff --git a/lustre/mgs/lproc_mgs.c b/lustre/mgs/lproc_mgs.c index d3434bf..b673280 100644 --- a/lustre/mgs/lproc_mgs.c +++ b/lustre/mgs/lproc_mgs.c @@ -88,6 +88,54 @@ static int mgs_fs_seq_show(struct seq_file *seq, void *v) LPROC_SEQ_FOPS_RO(mgs_fs); +static void seq_show_srpc_rules(struct seq_file *seq, const char *tgtname, + struct sptlrpc_rule_set *rset) +{ + struct sptlrpc_rule *r; + char dirbuf[10]; + char flvrbuf[40]; + char *net; + int i; + + for (i = 0; i < rset->srs_nrule; i++) { + r = &rset->srs_rules[i]; + + if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY)) + net = "default"; + else + net = libcfs_net2str(r->sr_netid); + + if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY) + dirbuf[0] = '\0'; + else + snprintf(dirbuf, sizeof(dirbuf), ".%s2%s", + sptlrpc_part2name(r->sr_from), + sptlrpc_part2name(r->sr_to)); + + sptlrpc_flavor2name(&r->sr_flvr, flvrbuf, sizeof(flvrbuf)); + seq_printf(seq, "%s.srpc.flavor.%s%s=%s\n", tgtname, + net, dirbuf, flvrbuf); + } +} + +static int mgsself_srpc_seq_show(struct seq_file *seq, void *v) +{ + struct obd_device *obd = seq->private; + struct fs_db *fsdb; + int rc; + + rc = mgs_find_or_make_fsdb(obd, MGSSELF_NAME, &fsdb); + if (rc) + return rc; + + down(&fsdb->fsdb_sem); + seq_show_srpc_rules(seq, fsdb->fsdb_name, &fsdb->fsdb_srpc_gen); + up(&fsdb->fsdb_sem); + return 0; +} + +LPROC_SEQ_FOPS_RO(mgsself_srpc); + int lproc_mgs_setup(struct obd_device *obd) { struct mgs_obd *mgs = &obd->u.mgs; @@ -95,6 +143,8 @@ int lproc_mgs_setup(struct obd_device *obd) rc = lprocfs_obd_seq_create(obd, "filesystems", 0444, &mgs_fs_fops, obd); + rc = lprocfs_obd_seq_create(obd, "srpc_rules", 0600, + &mgsself_srpc_fops, obd); mgs->mgs_proc_live = lprocfs_register("live", obd->obd_proc_entry, NULL, NULL); @@ -136,36 +186,6 @@ int lproc_mgs_cleanup(struct obd_device *obd) return lprocfs_obd_cleanup(obd); } -static void seq_show_srpc_rule(struct seq_file *seq, const char *tgtname, - struct sptlrpc_rule_set *rset) -{ - struct sptlrpc_rule *r; - char dirbuf[10]; - char flvrbuf[40]; - char *net; - int i; - - for (i = 0; i < rset->srs_nrule; i++) { - r = &rset->srs_rules[i]; - - if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY)) - net = "default"; - else - net = libcfs_net2str(r->sr_netid); - - if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY) - dirbuf[0] = '\0'; - else - snprintf(dirbuf, sizeof(dirbuf), ".%s2%s", - sptlrpc_part2name(r->sr_from), - sptlrpc_part2name(r->sr_to)); - - sptlrpc_flavor2name(&r->sr_flvr, flvrbuf, sizeof(flvrbuf)); - seq_printf(seq, "%s.srpc.flavor.%s%s=%s\n", tgtname, - net, dirbuf, flvrbuf); - } -} - static int mgs_live_seq_show(struct seq_file *seq, void *v) { struct fs_db *fsdb = seq->private; @@ -191,10 +211,10 @@ static int mgs_live_seq_show(struct seq_file *seq, void *v) #endif for (srpc_tgt = fsdb->fsdb_srpc_tgt; srpc_tgt; srpc_tgt = srpc_tgt->mtsc_next) { - seq_show_srpc_rule(seq, srpc_tgt->mtsc_tgt, - &srpc_tgt->mtsc_rset); + seq_show_srpc_rules(seq, srpc_tgt->mtsc_tgt, + &srpc_tgt->mtsc_rset); } - seq_show_srpc_rule(seq, fsdb->fsdb_name, &fsdb->fsdb_srpc_gen); + seq_show_srpc_rules(seq, fsdb->fsdb_name, &fsdb->fsdb_srpc_gen); up(&fsdb->fsdb_sem); return 0; diff --git a/lustre/mgs/mgs_handler.c b/lustre/mgs/mgs_handler.c index ca41d1a..d9b8272 100644 --- a/lustre/mgs/mgs_handler.c +++ b/lustre/mgs/mgs_handler.c @@ -80,8 +80,6 @@ static int mgs_connect(const struct lu_env *env, exp = class_conn2export(conn); LASSERT(exp); - exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_NULL; - mgs_counter_incr(exp, LPROC_MGS_CONNECT); if (data != NULL) { @@ -248,7 +246,7 @@ static int mgs_setup(struct obd_device *obd, struct lustre_cfg *lcfg) mgs_handle, LUSTRE_MGS_NAME, obd->obd_proc_entry, target_print_req, MGS_THREADS_AUTO_MIN, MGS_THREADS_AUTO_MAX, - "ll_mgs", LCT_MD_THREAD); + "ll_mgs", LCT_MD_THREAD, NULL); if (!mgs->mgs_service) { CERROR("failed to start service\n"); @@ -348,7 +346,7 @@ static int mgs_get_cfg_lock(struct obd_device *obd, char *fsname, LDLM_PLAIN, NULL, LCK_EX, &flags, ldlm_blocking_ast, ldlm_completion_ast, NULL, - fsname, 0, NULL, lockh); + fsname, 0, NULL, NULL, lockh); if (rc) CERROR("can't take cfg lock for %s (%d)\n", fsname, rc); @@ -560,6 +558,60 @@ static int mgs_set_info_rpc(struct ptlrpc_request *req) RETURN(rc); } +/* + * similar to ost_connect_check_sptlrpc() + */ +static int mgs_connect_check_sptlrpc(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct obd_device *obd = exp->exp_obd; + struct fs_db *fsdb; + struct sptlrpc_flavor flvr; + int rc = 0; + + if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + rc = mgs_find_or_make_fsdb(obd, MGSSELF_NAME, &fsdb); + if (rc) + return rc; + + down(&fsdb->fsdb_sem); + if (sptlrpc_rule_set_choose(&fsdb->fsdb_srpc_gen, + LUSTRE_SP_MGC, LUSTRE_SP_MGS, + req->rq_peer.nid, + &flvr) == 0) { + /* by default allow any flavors */ + flvr.sf_rpc = SPTLRPC_FLVR_ANY; + } + up(&fsdb->fsdb_sem); + + spin_lock(&exp->exp_lock); + + exp->exp_sp_peer = req->rq_sp_from; + exp->exp_flvr = flvr; + + if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && + exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CERROR("invalid rpc flavor %x, expect %x, from %s\n", + req->rq_flvr.sf_rpc, exp->exp_flvr.sf_rpc, + libcfs_nid2str(req->rq_peer.nid)); + rc = -EACCES; + } + + spin_unlock(&exp->exp_lock); + } else { + if (exp->exp_sp_peer != req->rq_sp_from) { + CERROR("RPC source %s doesn't match %s\n", + sptlrpc_part2name(req->rq_sp_from), + sptlrpc_part2name(exp->exp_sp_peer)); + rc = -EACCES; + } else { + rc = sptlrpc_target_export_check(exp, req); + } + } + + return rc; +} + /* Called whenever a target cleans up.
*/ /* XXX - Currently unused */ static int mgs_handle_target_del(struct ptlrpc_request *req) @@ -591,6 +643,12 @@ int mgs_handle(struct ptlrpc_request *req) LASSERT(current->journal_info == NULL); opc = lustre_msg_get_opc(req->rq_reqmsg); + + if (opc == SEC_CTX_INIT || + opc == SEC_CTX_INIT_CONT || + opc == SEC_CTX_FINI) + GOTO(out, rc = 0); + if (opc != MGS_CONNECT) { if (req->rq_export == NULL) { CERROR("lustre_mgs: operation %d on unconnected MGS\n", @@ -606,6 +664,9 @@ int mgs_handle(struct ptlrpc_request *req) /* MGS and MDS have same request format for connect */ req_capsule_set(&req->rq_pill, &RQF_MDS_CONNECT); rc = target_handle_connect(req); + if (rc == 0) + rc = mgs_connect_check_sptlrpc(req); + if (!rc && (lustre_msg_get_conn_cnt(req->rq_reqmsg) > 1)) /* Make clients trying to reconnect after a MGS restart happy; also requires obd_replayable */ diff --git a/lustre/mgs/mgs_internal.h b/lustre/mgs/mgs_internal.h index 99e698f..8ae2d68 100644 --- a/lustre/mgs/mgs_internal.h +++ b/lustre/mgs/mgs_internal.h @@ -52,6 +52,8 @@ int class_dentry_readdir(struct obd_device *obd, struct dentry *dir, struct vfsmount *inmnt, struct list_head *dentry_list); +#define MGSSELF_NAME "_mgs" + struct mgs_tgt_srpc_conf { struct mgs_tgt_srpc_conf *mtsc_next; char *mtsc_tgt; @@ -82,11 +84,15 @@ struct fs_db { /* in-memory copy of the srpc rules, guarded by fsdb_sem */ struct sptlrpc_rule_set fsdb_srpc_gen; struct mgs_tgt_srpc_conf *fsdb_srpc_tgt; - unsigned int fsdb_srpc_fl_udesc:1; + unsigned int fsdb_fl_udesc:1, + fsdb_fl_mgsself:1; }; int mgs_init_fsdb_list(struct obd_device *obd); int mgs_cleanup_fsdb_list(struct obd_device *obd); +int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, + struct fs_db **dbh); +int mgs_get_fsdb_srpc_from_llog(struct obd_device *obd, struct fs_db *fsdb); int mgs_check_index(struct obd_device *obd, struct mgs_target_info *mti); int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti); int mgs_write_log_target(struct obd_device *obd, struct mgs_target_info *mti); diff --git a/lustre/mgs/mgs_llog.c b/lustre/mgs/mgs_llog.c index bb4ce90..89dcf26 100644 --- a/lustre/mgs/mgs_llog.c +++ b/lustre/mgs/mgs_llog.c @@ -65,13 +65,6 @@ #include #include "mgs_internal.h" -static int mgs_get_fsdb_srpc_from_llog(struct obd_device *obd, - struct fs_db *fsdb); -static int mgs_get_srpc_conf_log(struct fs_db *fsdb, const char *tgt, - enum lustre_sec_part from, - enum lustre_sec_part to, - struct sptlrpc_conf_log *log); - /********************** Class functions ********************/ /* Caller must list_del and OBD_FREE each dentry from the list */ @@ -117,14 +110,14 @@ static inline int name_create(char **newname, char *prefix, char *suffix) { LASSERT(newname); OBD_ALLOC(*newname, strlen(prefix) + strlen(suffix) + 1); - if (!*newname) + if (!*newname) return -ENOMEM; sprintf(*newname, "%s%s", prefix, suffix); return 0; } static inline void name_destroy(char **name) -{ +{ if (*name) OBD_FREE(*name, strlen(*name) + 1); *name = NULL; @@ -135,11 +128,11 @@ static inline void name_destroy(char **name) 2. what the last config step is 3. COMPAT_146 lov name 4. COMPAT_146 mdt lov name - 5. COMPAT_146 mdc name + 5. COMPAT_146 mdc name */ /* It might be better to have a separate db file, instead of parsing the info out of the client log. This is slow and potentially error-prone. 
*/ -static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, +static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, void *data) { struct fs_db *fsdb = (struct fs_db *)data; @@ -201,9 +194,9 @@ static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, (strcmp(lustre_cfg_string(lcfg, 1), LUSTRE_LOV_NAME) == 0)) { fsdb->fsdb_flags |= FSDB_OLDLOG14; name_destroy(&fsdb->fsdb_clilov); - rc = name_create(&fsdb->fsdb_clilov, + rc = name_create(&fsdb->fsdb_clilov, lustre_cfg_string(lcfg, 0), ""); - if (rc) + if (rc) RETURN(rc); CDEBUG(D_MGS, "client lov name is %s\n", fsdb->fsdb_clilov); } @@ -215,20 +208,20 @@ static int mgs_fsdb_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, fsdb->fsdb_flags |= FSDB_OLDLOG14; ptr = strstr(lustre_cfg_string(lcfg, 1), "_UUID"); if (!ptr) { - CERROR("Can't parse MDT uuid %s\n", + CERROR("Can't parse MDT uuid %s\n", lustre_cfg_string(lcfg, 1)); RETURN(-EINVAL); } *ptr = '\0'; name_destroy(&fsdb->fsdb_mdtlov); - rc = name_create(&fsdb->fsdb_mdtlov, + rc = name_create(&fsdb->fsdb_mdtlov, "lov_", lustre_cfg_string(lcfg, 1)); - if (rc) + if (rc) RETURN(rc); name_destroy(&fsdb->fsdb_mdc); - rc = name_create(&fsdb->fsdb_mdc, + rc = name_create(&fsdb->fsdb_mdc, lustre_cfg_string(lcfg, 0), ""); - if (rc) + if (rc) RETURN(rc); CDEBUG(D_MGS, "MDT lov name is %s\n", fsdb->fsdb_mdtlov); } @@ -327,37 +320,46 @@ static struct fs_db *mgs_new_fsdb(struct obd_device *obd, char *fsname) int rc; ENTRY; + if (strlen(fsname) >= sizeof(fsdb->fsdb_name)) { + CERROR("fsname %s is too long\n", fsname); + RETURN(NULL); + } + OBD_ALLOC_PTR(fsdb); if (!fsdb) RETURN(NULL); - OBD_ALLOC(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); - OBD_ALLOC(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); - if (!fsdb->fsdb_ost_index_map || !fsdb->fsdb_mdt_index_map) { - CERROR("No memory for index maps\n"); - GOTO(err, 0); - } - - strncpy(fsdb->fsdb_name, fsname, sizeof(fsdb->fsdb_name)); - fsdb->fsdb_name[sizeof(fsdb->fsdb_name) - 1] = 0; - rc = name_create(&fsdb->fsdb_mdtlov, fsname, "-mdtlov"); - if (rc) - GOTO(err, rc); - rc = name_create(&fsdb->fsdb_mdtlmv, fsname, "-mdtlmv"); - if (rc) - GOTO(err, rc); - rc = name_create(&fsdb->fsdb_clilov, fsname, "-clilov"); - if (rc) - GOTO(err, rc); - - rc = name_create(&fsdb->fsdb_clilmv, fsname, "-clilmv"); - if (rc) - GOTO(err, rc); - - fsdb->fsdb_srpc_fl_udesc = 1; + strcpy(fsdb->fsdb_name, fsname); sema_init(&fsdb->fsdb_sem, 1); + fsdb->fsdb_fl_udesc = 1; + + if (strcmp(fsname, MGSSELF_NAME) == 0) { + fsdb->fsdb_fl_mgsself = 1; + } else { + OBD_ALLOC(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); + OBD_ALLOC(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); + if (!fsdb->fsdb_ost_index_map || !fsdb->fsdb_mdt_index_map) { + CERROR("No memory for index maps\n"); + GOTO(err, 0); + } + + rc = name_create(&fsdb->fsdb_mdtlov, fsname, "-mdtlov"); + if (rc) + GOTO(err, rc); + rc = name_create(&fsdb->fsdb_mdtlmv, fsname, "-mdtlmv"); + if (rc) + GOTO(err, rc); + rc = name_create(&fsdb->fsdb_clilov, fsname, "-clilov"); + if (rc) + GOTO(err, rc); + rc = name_create(&fsdb->fsdb_clilmv, fsname, "-clilmv"); + if (rc) + GOTO(err, rc); + + lproc_mgs_add_live(obd, fsdb); + } + list_add(&fsdb->fsdb_list, &mgs->mgs_fs_db_list); - lproc_mgs_add_live(obd, fsdb); RETURN(fsdb); err: @@ -367,8 +369,8 @@ err: OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); name_destroy(&fsdb->fsdb_clilov); name_destroy(&fsdb->fsdb_clilmv); - name_destroy(&fsdb->fsdb_mdtlov); - name_destroy(&fsdb->fsdb_mdtlmv); + 
name_destroy(&fsdb->fsdb_mdtlov); + name_destroy(&fsdb->fsdb_mdtlmv); OBD_FREE_PTR(fsdb); RETURN(NULL); } @@ -379,13 +381,15 @@ static void mgs_free_fsdb(struct obd_device *obd, struct fs_db *fsdb) down(&fsdb->fsdb_sem); lproc_mgs_del_live(obd, fsdb); list_del(&fsdb->fsdb_list); - OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); - OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); - name_destroy(&fsdb->fsdb_clilov); - name_destroy(&fsdb->fsdb_clilmv); - name_destroy(&fsdb->fsdb_mdtlov); - name_destroy(&fsdb->fsdb_mdtlmv); - name_destroy(&fsdb->fsdb_mdc); + if (fsdb->fsdb_ost_index_map) + OBD_FREE(fsdb->fsdb_ost_index_map, INDEX_MAP_SIZE); + if (fsdb->fsdb_mdt_index_map) + OBD_FREE(fsdb->fsdb_mdt_index_map, INDEX_MAP_SIZE); + name_destroy(&fsdb->fsdb_clilov); + name_destroy(&fsdb->fsdb_clilmv); + name_destroy(&fsdb->fsdb_mdtlov); + name_destroy(&fsdb->fsdb_mdtlmv); + name_destroy(&fsdb->fsdb_mdc); mgs_free_fsdb_srpc(fsdb); OBD_FREE_PTR(fsdb); } @@ -411,8 +415,8 @@ int mgs_cleanup_fsdb_list(struct obd_device *obd) return 0; } -static int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, - struct fs_db **dbh) +int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, + struct fs_db **dbh) { struct mgs_obd *mgs = &obd->u.mgs; struct fs_db *fsdb; @@ -432,12 +436,14 @@ static int mgs_find_or_make_fsdb(struct obd_device *obd, char *name, if (!fsdb) return -ENOMEM; - /* populate the db from the client llog */ - rc = mgs_get_fsdb_from_llog(obd, fsdb); - if (rc) { - CERROR("Can't get db from client log %d\n", rc); - mgs_free_fsdb(obd, fsdb); - return rc; + if (!fsdb->fsdb_fl_mgsself) { + /* populate the db from the client llog */ + rc = mgs_get_fsdb_from_llog(obd, fsdb); + if (rc) { + CERROR("Can't get db from client log %d\n", rc); + mgs_free_fsdb(obd, fsdb); + return rc; + } } /* populate srpc rules from params llog */ @@ -568,13 +574,13 @@ struct mgs_modify_lookup { int mml_modified; }; -static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, +static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, void *data) { struct mgs_modify_lookup *mml = (struct mgs_modify_lookup *)data; struct cfg_marker *marker; struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); - int cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - + int cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - sizeof(struct llog_rec_tail); int rc; ENTRY; @@ -592,27 +598,27 @@ static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, /* We only care about markers */ if (lcfg->lcfg_command != LCFG_MARKER) - RETURN(0); - + RETURN(0); + marker = lustre_cfg_buf(lcfg, 1); - if ((strcmp(mml->mml_marker.cm_comment, marker->cm_comment) == 0) && + if ((strcmp(mml->mml_marker.cm_comment, marker->cm_comment) == 0) && (strcmp(mml->mml_marker.cm_tgtname, marker->cm_tgtname) == 0) && !(marker->cm_flags & CM_SKIP)) { /* Found a non-skipped marker match */ CDEBUG(D_MGS, "Changing rec %u marker %d %x->%x: %s %s\n", - rec->lrh_index, marker->cm_step, + rec->lrh_index, marker->cm_step, marker->cm_flags, mml->mml_marker.cm_flags, marker->cm_tgtname, marker->cm_comment); /* Overwrite the old marker llog entry */ marker->cm_flags &= ~CM_EXCLUDE; /* in case we're unexcluding */ marker->cm_flags |= mml->mml_marker.cm_flags; marker->cm_canceltime = mml->mml_marker.cm_canceltime; - /* Header and tail are added back to lrh_len in + /* Header and tail are added back to lrh_len in llog_lvfs_write_rec */ - rec->lrh_len = cfg_len; - rc = llog_write_rec(llh, rec, NULL, 0, 
(void *)lcfg, + rec->lrh_len = cfg_len; + rc = llog_write_rec(llh, rec, NULL, 0, (void *)lcfg, rec->lrh_index); - if (!rc) + if (!rc) mml->mml_modified++; } @@ -621,7 +627,7 @@ static int mgs_modify_handler(struct llog_handle *llh, struct llog_rec_hdr *rec, /* Modify an existing config log record (for CM_SKIP or CM_EXCLUDE) */ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb, - struct mgs_target_info *mti, char *logname, + struct mgs_target_info *mti, char *logname, char *devname, char *comment, int flags) { struct llog_handle *loghandle; @@ -634,7 +640,7 @@ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb, CDEBUG(D_MGS, "modify %s/%s/%s\n", logname, devname, comment); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); LASSERT(ctxt != NULL); rc = llog_create(ctxt, &loghandle, NULL, logname); @@ -649,7 +655,7 @@ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb, GOTO(out_close, rc = 0); OBD_ALLOC_PTR(mml); - if (!mml) + if (!mml) GOTO(out_close, rc = -ENOMEM); strcpy(mml->mml_marker.cm_comment, comment); strcpy(mml->mml_marker.cm_tgtname, devname); @@ -658,7 +664,7 @@ static int mgs_modify(struct obd_device *obd, struct fs_db *fsdb, mml->mml_marker.cm_canceltime = flags ? cfs_time_current_sec() : 0; mml->mml_modified = 0; rc = llog_process(loghandle, mgs_modify_handler, (void *)mml, NULL); - if (!rc && !mml->mml_modified) + if (!rc && !mml->mml_modified) rc = -ENODEV; OBD_FREE_PTR(mml); @@ -668,7 +674,7 @@ out_close: rc = rc2; out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - if (rc && rc != -ENODEV) + if (rc && rc != -ENODEV) CERROR("modify %s/%s failed %d\n", mti->mti_svname, comment, rc); llog_ctxt_put(ctxt); @@ -684,10 +690,10 @@ static int record_lcfg(struct obd_device *obd, struct llog_handle *llh, struct llog_rec_hdr rec; int buflen, rc; - if (!lcfg || !llh) + if (!lcfg || !llh) return -ENOMEM; - LASSERT(llh->lgh_ctxt); + LASSERT(llh->lgh_ctxt); buflen = lustre_cfg_len(lcfg->lcfg_bufcount, lcfg->lcfg_buflens); @@ -698,7 +704,7 @@ static int record_lcfg(struct obd_device *obd, struct llog_handle *llh, /* idx = -1 means append */ rc = llog_write_rec(llh, &rec, NULL, 0, (void *)lcfg, -1); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - if (rc) + if (rc) CERROR("failed %d\n", rc); return rc; } @@ -725,7 +731,7 @@ static int record_base(struct obd_device *obd, struct llog_handle *llh, lustre_cfg_bufs_set_string(&bufs, 4, s4); lcfg = lustre_cfg_new(cmd, &bufs); - if (!lcfg) + if (!lcfg) return -ENOMEM; lcfg->lcfg_nid = nid; @@ -770,25 +776,6 @@ static inline int record_setup(struct obd_device *obd, struct llog_handle *llh, return record_base(obd,llh,devname,0,LCFG_SETUP,s1,s2,s3,s4); } -static inline int record_sptlrpc_conf(struct obd_device *obd, - struct llog_handle *llh, - char *devname, - struct sptlrpc_conf_log *srpc_log) -{ - struct lustre_cfg_bufs bufs; - struct lustre_cfg *lcfg; - int rc; - - lustre_cfg_bufs_reset(&bufs, devname); - lustre_cfg_bufs_set(&bufs, 1, srpc_log, sizeof(*srpc_log)); - lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs); - - rc = record_lcfg(obd, llh, lcfg); - - lustre_cfg_free(lcfg); - return rc; -} - static int record_lov_setup(struct obd_device *obd, struct llog_handle *llh, char *devname, struct lov_desc *desc) { @@ -799,7 +786,7 @@ static int record_lov_setup(struct obd_device *obd, struct llog_handle *llh, lustre_cfg_bufs_reset(&bufs, devname); lustre_cfg_bufs_set(&bufs, 1, desc, sizeof(*desc)); lcfg = lustre_cfg_new(LCFG_SETUP, &bufs); - if (!lcfg) + 
if (!lcfg) return -ENOMEM; rc = record_lcfg(obd, llh, lcfg); @@ -866,14 +853,14 @@ static int record_marker(struct obd_device *obd, struct llog_handle *llh, marker.cm_step = fsdb->fsdb_gen; marker.cm_flags = flags; marker.cm_vers = LUSTRE_VERSION_CODE; - strncpy(marker.cm_tgtname, tgtname, sizeof(marker.cm_tgtname)); - strncpy(marker.cm_comment, comment, sizeof(marker.cm_comment)); + strncpy(marker.cm_tgtname, tgtname, sizeof(marker.cm_tgtname)); + strncpy(marker.cm_comment, comment, sizeof(marker.cm_comment)); marker.cm_createtime = cfs_time_current_sec(); marker.cm_canceltime = 0; lustre_cfg_bufs_reset(&bufs, NULL); lustre_cfg_bufs_set(&bufs, 1, &marker, sizeof(marker)); lcfg = lustre_cfg_new(LCFG_MARKER, &bufs); - if (!lcfg) + if (!lcfg) return -ENOMEM; rc = record_lcfg(obd, llh, lcfg); @@ -889,7 +876,7 @@ static int record_start_log(struct obd_device *obd, struct llog_ctxt *ctxt; int rc = 0; - if (*llh) + if (*llh) GOTO(out, rc = -EBUSY); ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); @@ -953,23 +940,23 @@ static int mgs_log_is_empty(struct obd_device *obd, char *name) /* write an lcfg directly into a log (with markers) */ static int mgs_write_log_direct(struct obd_device *obd, struct fs_db *fsdb, - char *logname, struct lustre_cfg *lcfg, + char *logname, struct lustre_cfg *lcfg, char *devname, char *comment) { struct llog_handle *llh = NULL; int rc; ENTRY; - if (!lcfg) + if (!lcfg) RETURN(-ENOMEM); rc = record_start_log(obd, &llh, logname); - if (rc) + if (rc) RETURN(rc); /* FIXME These should be a single journal transaction */ - rc = record_marker(obd, llh, fsdb, CM_START, devname, comment); - + rc = record_marker(obd, llh, fsdb, CM_START, devname, comment); + rc = record_lcfg(obd, llh, lcfg); rc = record_marker(obd, llh, fsdb, CM_END, devname, comment); @@ -980,7 +967,7 @@ static int mgs_write_log_direct(struct obd_device *obd, struct fs_db *fsdb, /* write the lcfg in all logs for the given fs */ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb, - struct mgs_target_info *mti, + struct mgs_target_info *mti, struct lustre_cfg *lcfg, char *devname, char *comment) { @@ -991,9 +978,9 @@ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb, char *logname; int rc = 0, len = strlen(fsname); ENTRY; - - /* We need to set params for any future logs - as well. FIXME Append this file to every new log. + + /* We need to set params for any future logs + as well. FIXME Append this file to every new log. Actually, we should store as params (text), not llogs. Or in a database. 
*/ name_create(&logname, fsname, "-params"); @@ -1003,7 +990,7 @@ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb, record_end_log(obd, &llh); } name_destroy(&logname); - if (rc) + if (rc) RETURN(rc); /* Find all the logs in the CONFIGS directory */ @@ -1022,13 +1009,13 @@ int mgs_write_log_direct_all(struct obd_device *obd, struct fs_db *fsdb, strstr(dirent->lld_name, "-sptlrpc") == NULL) { CDEBUG(D_MGS, "Changing log %s\n", dirent->lld_name); /* Erase any old settings of this same parameter */ - mgs_modify(obd, fsdb, mti, dirent->lld_name, devname, + mgs_modify(obd, fsdb, mti, dirent->lld_name, devname, comment, CM_SKIP); /* Write the new one */ rc = mgs_write_log_direct(obd, fsdb, dirent->lld_name, lcfg, devname, comment); if (rc) - CERROR("err %d writing log %s\n", rc, + CERROR("err %d writing log %s\n", rc, dirent->lld_name); } OBD_FREE(dirent, sizeof(*dirent)); @@ -1131,11 +1118,11 @@ static int mgs_steal_llog_handler(struct llog_handle *llh, if (got_an_osc_or_mdc == 0 || last_step < 0) RETURN(rc); - + if (lcfg->lcfg_command == LCFG_ADD_UUID) { uint64_t nodenid; nodenid = lcfg->lcfg_nid; - + tmti->mti_nids[tmti->mti_nid_count] = nodenid; tmti->mti_nid_count++; @@ -1288,10 +1275,10 @@ static int mgs_write_log_lov(struct obd_device *obd, struct fs_db *fsdb, /* This should always be the first entry in a log. rc = mgs_clear_log(obd, logname); */ rc = record_start_log(obd, &llh, logname); - if (rc) + if (rc) GOTO(out, rc); /* FIXME these should be a single journal transaction */ - rc = record_marker(obd, llh, fsdb, CM_START, lovname, "lov setup"); + rc = record_marker(obd, llh, fsdb, CM_START, lovname, "lov setup"); rc = record_attach(obd, llh, lovname, "lov", uuid); rc = record_lov_setup(obd, llh, lovname, lovdesc); rc = record_marker(obd, llh, fsdb, CM_END, lovname, "lov setup"); @@ -1331,7 +1318,7 @@ static int mgs_write_log_failnids(struct obd_device *obd, so just use the first nid as the uuid */ rc = name_create(&failnodeuuid, libcfs_nid2str(nid), ""); - if (rc) + if (rc) return rc; } CDEBUG(D_MGS, "add nid %s for failover uuid %s, " @@ -1354,12 +1341,11 @@ static int mgs_write_log_mdc_to_lmv(struct obd_device *obd, struct fs_db *fsdb, char *logname, char *lmvname) { struct llog_handle *llh = NULL; - struct sptlrpc_conf_log *srpc_log; char *mdcname, *nodeuuid, *mdcuuid, *lmvuuid; char index[5]; int i, rc; ENTRY; - + if (mgs_log_is_empty(obd, logname)) { CERROR("log is empty! 
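/*
 * mgs_write_log_direct_all() above writes the -params log first and then
 * patches every log in CONFIGS/ that belongs to this filesystem, skipping
 * the -sptlrpc bookkeeping log.  Sketch of that name filter (the
 * fsname-prefix test is assumed from the surrounding code; only the
 * -sptlrpc exclusion is visible in the hunk):
 */
#include <string.h>

static int is_fs_target_log_sk(const char *name, const char *fsname)
{
        size_t len = strlen(fsname);

        if (strncmp(name, fsname, len) != 0 || name[len] != '-')
                return 0;                       /* other filesystem */
        if (strstr(name, "-sptlrpc") != NULL)
                return 0;                       /* sptlrpc log: skip */
        return 1;
}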
Logical error\n"); RETURN(-EINVAL); @@ -1368,16 +1354,6 @@ static int mgs_write_log_mdc_to_lmv(struct obd_device *obd, struct fs_db *fsdb, CDEBUG(D_MGS, "adding mdc for %s to log %s:lmv(%s)\n", mti->mti_svname, logname, lmvname); - srpc_log = sptlrpc_conf_log_alloc(); - if (IS_ERR(srpc_log)) - RETURN(PTR_ERR(srpc_log)); - srpc_log->scl_part = LUSTRE_SP_CLI; - - rc = mgs_get_srpc_conf_log(fsdb, mti->mti_svname, - LUSTRE_SP_CLI, LUSTRE_SP_MDT, srpc_log); - if (rc) - goto out_srpc; - name_create(&nodeuuid, libcfs_nid2str(mti->mti_nids[0]), ""); name_create(&mdcname, mti->mti_svname, "-mdc"); name_create(&mdcuuid, mdcname, "_UUID"); @@ -1388,29 +1364,26 @@ static int mgs_write_log_mdc_to_lmv(struct obd_device *obd, struct fs_db *fsdb, "add mdc"); for (i = 0; i < mti->mti_nid_count; i++) { - CDEBUG(D_MGS, "add nid %s for mdt\n", + CDEBUG(D_MGS, "add nid %s for mdt\n", libcfs_nid2str(mti->mti_nids[i])); - + rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid); } rc = record_attach(obd, llh, mdcname, LUSTRE_MDC_NAME, lmvuuid); rc = record_setup(obd, llh, mdcname, mti->mti_uuid, nodeuuid, 0, 0); - rc = record_sptlrpc_conf(obd, llh, mdcname, srpc_log); rc = mgs_write_log_failnids(obd, mti, llh, mdcname); snprintf(index, sizeof(index), "%d", mti->mti_stripe_index); rc = record_mdc_add(obd, llh, lmvname, mdcuuid, mti->mti_uuid, index, "1"); rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, - "add mdc"); + "add mdc"); rc = record_end_log(obd, &llh); name_destroy(&lmvuuid); name_destroy(&mdcuuid); name_destroy(&mdcname); name_destroy(&nodeuuid); -out_srpc: - sptlrpc_conf_log_free(srpc_log); RETURN(rc); } @@ -1419,7 +1392,6 @@ static int mgs_write_log_mdc_to_mdt(struct obd_device *obd, struct fs_db *fsdb, struct mgs_target_info *mti, char *logname) { struct llog_handle *llh = NULL; - struct sptlrpc_conf_log *srpc_log; char *nodeuuid, *mdcname, *mdcuuid, *mdtuuid; int idx = mti->mti_stripe_index; char index[9]; @@ -1433,16 +1405,6 @@ static int mgs_write_log_mdc_to_mdt(struct obd_device *obd, struct fs_db *fsdb, CDEBUG(D_MGS, "adding mdc index %d to %s\n", idx, logname); - srpc_log = sptlrpc_conf_log_alloc(); - if (IS_ERR(srpc_log)) - RETURN(PTR_ERR(srpc_log)); - srpc_log->scl_part = LUSTRE_SP_MDT; - - rc = mgs_get_srpc_conf_log(fsdb, mti->mti_svname, - LUSTRE_SP_MDT, LUSTRE_SP_MDT, srpc_log); - if (rc) - goto out_srpc; - name_create(&nodeuuid, libcfs_nid2str(mti->mti_nids[0]), ""); snprintf(index, sizeof(index), "-mdc%04x", idx); name_create(&mdcname, logname, index); @@ -1458,21 +1420,18 @@ static int mgs_write_log_mdc_to_mdt(struct obd_device *obd, struct fs_db *fsdb, } rc = record_attach(obd, llh, mdcname, LUSTRE_MDC_NAME, mdcuuid); rc = record_setup(obd, llh, mdcname, mti->mti_uuid, nodeuuid, 0, 0); - rc = record_sptlrpc_conf(obd, llh, mdcname, srpc_log); rc = mgs_write_log_failnids(obd, mti, llh, mdcname); snprintf(index, sizeof(index), "%d", idx); rc = record_mdc_add(obd, llh, logname, mdcuuid, mti->mti_uuid, index, "1"); - rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc"); + rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc"); rc = record_end_log(obd, &llh); name_destroy(&mdcuuid); name_destroy(&mdcname); name_destroy(&nodeuuid); name_destroy(&mdtuuid); -out_srpc: - sptlrpc_conf_log_free(srpc_log); RETURN(rc); } @@ -1483,26 +1442,15 @@ static int mgs_write_log_mdt0(struct obd_device *obd, struct fs_db *fsdb, struct llog_handle *llh = NULL; char *uuid, *lovname; char mdt_index[5]; - struct sptlrpc_conf_log *srpc_log; char *ptr = 
mti->mti_params; int rc = 0, failout = 0; ENTRY; - srpc_log = sptlrpc_conf_log_alloc(); - if (IS_ERR(srpc_log)) - RETURN(PTR_ERR(srpc_log)); - srpc_log->scl_part = LUSTRE_SP_MDT; - - rc = mgs_get_srpc_conf_log(fsdb, mti->mti_svname, - LUSTRE_SP_ANY, LUSTRE_SP_MDT, srpc_log); - if (rc) - GOTO(out_srpc, rc); - OBD_ALLOC(uuid, sizeof(struct obd_uuid)); if (uuid == NULL) - GOTO(out_srpc, rc = -ENOMEM); + RETURN(-ENOMEM); - if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0) + if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0) failout = (strncmp(ptr, "failout", 7) == 0); name_create(&lovname, log, "-mdtlov"); @@ -1510,27 +1458,24 @@ static int mgs_write_log_mdt0(struct obd_device *obd, struct fs_db *fsdb, rc = mgs_write_log_lov(obd, fsdb, mti, log, lovname); sprintf(uuid, "%s_UUID", log); - sprintf(mdt_index,"%d",mti->mti_stripe_index); + sprintf(mdt_index,"%d",mti->mti_stripe_index); /* add MDT itself */ rc = record_start_log(obd, &llh, log); - if (rc) + if (rc) GOTO(out, rc); - + /* FIXME this whole fn should be a single journal transaction */ rc = record_marker(obd, llh, fsdb, CM_START, log, "add mdt"); rc = record_attach(obd, llh, log, LUSTRE_MDT_NAME, uuid); rc = record_mount_opt(obd, llh, log, lovname, NULL); - rc = record_setup(obd, llh, log, uuid, mdt_index, lovname, + rc = record_setup(obd, llh, log, uuid, mdt_index, lovname, failout ? "n" : "f"); - rc = record_sptlrpc_conf(obd, llh, log, srpc_log); rc = record_marker(obd, llh, fsdb, CM_END, log, "add mdt"); rc = record_end_log(obd, &llh); out: name_destroy(&lovname); OBD_FREE(uuid, sizeof(struct obd_uuid)); -out_srpc: - sptlrpc_conf_log_free(srpc_log); RETURN(rc); } @@ -1553,10 +1498,10 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb, /* We're starting with an old uuid. Assume old name for lov as well since the lov entry already exists in the log. 
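/*
 * mgs_write_log_mdt0() above (and mgs_write_log_ost() further down) derive
 * the "f"/"n" failover argument of the setup record from PARAM_FAILMODE by
 * a prefix match on "failout".  The mapping in isolation:
 */
#include <string.h>

static const char *failmode_flag_sk(const char *failmode)
{
        /* "failout..." => "n" (fail outright); anything else => "f"
         * (failover), the default */
        if (failmode != NULL && strncmp(failmode, "failout", 7) == 0)
                return "n";
        return "f";
}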
*/ CDEBUG(D_MGS, "old mds uuid %s\n", mti->mti_uuid); - if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4, + if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4, strlen(fsdb->fsdb_mdtlov) - 4) != 0) { CERROR("old mds uuid %s doesn't match log %s (%s)\n", - mti->mti_uuid, fsdb->fsdb_mdtlov, + mti->mti_uuid, fsdb->fsdb_mdtlov, fsdb->fsdb_mdtlov + 4); RETURN(-EINVAL); } @@ -1571,19 +1516,19 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb, /* add mdt */ rc = mgs_write_log_mdt0(obd, fsdb, mti); - + /* Append the mdt info to the client log */ name_create(&cliname, mti->mti_fsname, "-client"); - - if (mgs_log_is_empty(obd, cliname)) { + + if (mgs_log_is_empty(obd, cliname)) { /* Start client log */ - rc = mgs_write_log_lov(obd, fsdb, mti, cliname, + rc = mgs_write_log_lov(obd, fsdb, mti, cliname, fsdb->fsdb_clilov); - rc = mgs_write_log_lmv(obd, fsdb, mti, cliname, + rc = mgs_write_log_lmv(obd, fsdb, mti, cliname, fsdb->fsdb_clilmv); } - /* + /* #09 L add_uuid nid=uml1@tcp(0x20000c0a80201) 0: 1:uml1_UUID #10 L attach 0:MDC_uml1_mdsA_MNT_client 1:mdc 2:1d834_MNT_client_03f #11 L setup 0:MDC_uml1_mdsA_MNT_client 1:mdsA_UUID 2:uml1_UUID @@ -1591,27 +1536,27 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb, #13 L add_conn 0:MDC_uml1_mdsA_MNT_client 1:uml2_UUID #14 L mount_option 0: 1:client 2:lov1 3:MDC_uml1_mdsA_MNT_client */ - + #if 0 /* COMPAT_146 */ - if (mti->mti_flags & LDD_F_UPGRADE14) { + if (mti->mti_flags & LDD_F_UPGRADE14) { rc = record_start_log(obd, &llh, cliname); - if (rc) + if (rc) GOTO(out, rc); - - rc = record_marker(obd, llh, fsdb, CM_START, + + rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add mdc"); - - /* Old client log already has MDC entry, but needs mount opt + + /* Old client log already has MDC entry, but needs mount opt for new client name (lustre-client) */ - /* FIXME Old MDT log already has an old mount opt + /* FIXME Old MDT log already has an old mount opt which we should remove (currently handled by class_del_profiles()) */ rc = record_mount_opt(obd, llh, cliname, fsdb->fsdb_clilov, fsdb->fsdb_mdc); /* end COMPAT_146 */ - - rc = record_marker(obd, llh, fsdb, CM_END, + + rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add mdc"); } else #endif @@ -1619,42 +1564,42 @@ static int mgs_write_log_mdt(struct obd_device *obd, struct fs_db *fsdb, /* copy client info about lov/lmv */ comp.comp_mti = mti; comp.comp_fsdb = fsdb; - - rc = mgs_steal_llog_for_mdt_from_client(obd, cliname, + + rc = mgs_steal_llog_for_mdt_from_client(obd, cliname, &comp); rc = mgs_write_log_mdc_to_lmv(obd, fsdb, mti, cliname, fsdb->fsdb_clilmv); /* add mountopts */ rc = record_start_log(obd, &llh, cliname); - if (rc) + if (rc) GOTO(out, rc); - rc = record_marker(obd, llh, fsdb, CM_START, cliname, + rc = record_marker(obd, llh, fsdb, CM_START, cliname, "mount opts"); rc = record_mount_opt(obd, llh, cliname, fsdb->fsdb_clilov, fsdb->fsdb_clilmv); - rc = record_marker(obd, llh, fsdb, CM_END, cliname, - "mount opts"); + rc = record_marker(obd, llh, fsdb, CM_END, cliname, + "mount opts"); } - + rc = record_end_log(obd, &llh); out: name_destroy(&cliname); - + // for_all_existing_mdt except current one for (i = 0; i < INDEX_MAP_SIZE * 8; i++){ char *mdtname; if (i != mti->mti_stripe_index && test_bit(i, fsdb->fsdb_mdt_index_map)) { sprintf(mdt_index,"-MDT%04x",i); - + name_create(&mdtname, mti->mti_fsname, mdt_index); rc = mgs_write_log_mdc_to_mdt(obd, fsdb, mti, mdtname); name_destroy(&mdtname); } } - + RETURN(rc); } @@ -1665,7 
+1610,6 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb, enum lustre_sec_part sec_part, int flags) { struct llog_handle *llh = NULL; - struct sptlrpc_conf_log *srpc_log; char *nodeuuid, *oscname, *oscuuid, *lovuuid, *svname; char index[5]; int i, rc; @@ -1673,23 +1617,13 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb, ENTRY; CDEBUG(D_INFO, "adding osc for %s to log %s\n", mti->mti_svname, logname); - - srpc_log = sptlrpc_conf_log_alloc(); - if (IS_ERR(srpc_log)) - RETURN(PTR_ERR(srpc_log)); - srpc_log->scl_part = sec_part; - - rc = mgs_get_srpc_conf_log(fsdb, mti->mti_svname, - sec_part, LUSTRE_SP_OST, srpc_log); - if (rc) - goto out_srpc; if (mgs_log_is_empty(obd, logname)) { /* The first item in the log must be the lov, so we have somewhere to add our osc. */ rc = mgs_write_log_lov(obd, fsdb, mti, logname, lovname); } - + name_create(&nodeuuid, libcfs_nid2str(mti->mti_nids[0]), ""); name_create(&svname, mti->mti_svname, "-osc"); name_create(&oscname, svname, suffix); @@ -1707,34 +1641,31 @@ static int mgs_write_log_osc_to_lov(struct obd_device *obd, struct fs_db *fsdb, #07 L add_conn 0:OSC_uml1_ost1_MNT_client 1:uml2_UUID #08 L lov_modify_tgts add 0:lov1 1:ost1_UUID 2(index):0 3(gen):1 */ - + rc = record_start_log(obd, &llh, logname); - if (rc) + if (rc) GOTO(out, rc); /* FIXME these should be a single journal transaction */ rc = record_marker(obd, llh, fsdb, CM_START | flags, mti->mti_svname, - "add osc"); + "add osc"); for (i = 0; i < mti->mti_nid_count; i++) { CDEBUG(D_MGS, "add nid %s\n", libcfs_nid2str(mti->mti_nids[i])); rc = record_add_uuid(obd, llh, mti->mti_nids[i], nodeuuid); } rc = record_attach(obd, llh, oscname, LUSTRE_OSC_NAME, lovuuid); rc = record_setup(obd, llh, oscname, mti->mti_uuid, nodeuuid, 0, 0); - rc = record_sptlrpc_conf(obd, llh, oscname, srpc_log); rc = mgs_write_log_failnids(obd, mti, llh, oscname); snprintf(index, sizeof(index), "%d", mti->mti_stripe_index); rc = record_lov_add(obd, llh, lovname, mti->mti_uuid, index, "1"); rc = record_marker(obd, llh, fsdb, CM_END | flags, mti->mti_svname, - "add osc"); + "add osc"); rc = record_end_log(obd, &llh); -out: +out: name_destroy(&lovuuid); name_destroy(&oscuuid); name_destroy(&oscname); name_destroy(&svname); name_destroy(&nodeuuid); -out_srpc: - sptlrpc_conf_log_free(srpc_log); RETURN(rc); } @@ -1742,13 +1673,12 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb, struct mgs_target_info *mti) { struct llog_handle *llh = NULL; - struct sptlrpc_conf_log *srpc_log; char *logname, *lovname; char mdt_index[9]; char *ptr = mti->mti_params; int rc, flags = 0, failout = 0, i; ENTRY; - + CDEBUG(D_MGS, "writing new ost %s\n", mti->mti_svname); /* The ost startup log */ @@ -1764,28 +1694,18 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb, RETURN(-EALREADY); } - srpc_log = sptlrpc_conf_log_alloc(); - if (IS_ERR(srpc_log)) - RETURN(PTR_ERR(srpc_log)); - srpc_log->scl_part = LUSTRE_SP_OST; - - rc = mgs_get_srpc_conf_log(fsdb, mti->mti_svname, - LUSTRE_SP_ANY, LUSTRE_SP_OST, srpc_log); - if (rc) - goto out_srpc; - /* attach obdfilter ost1 ost1_UUID setup /dev/loop2 ldiskfs f|n errors=remount-ro,user_xattr */ - if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0) + if (class_find_param(ptr, PARAM_FAILMODE, &ptr) == 0) failout = (strncmp(ptr, "failout", 7) == 0); rc = record_start_log(obd, &llh, mti->mti_svname); - if (rc) + if (rc) RETURN(rc); /* FIXME these should be a single journal transaction */ - rc = 
record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add ost"); - if (*mti->mti_uuid == '\0') + rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname,"add ost"); + if (*mti->mti_uuid == '\0') snprintf(mti->mti_uuid, sizeof(mti->mti_uuid), "%s_UUID", mti->mti_svname); rc = record_attach(obd, llh, mti->mti_svname, @@ -1793,11 +1713,10 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb, rc = record_setup(obd, llh, mti->mti_svname, "dev"/*ignored*/, "type"/*ignored*/, failout ? "n" : "f", 0/*options*/); - rc = record_sptlrpc_conf(obd, llh, mti->mti_svname, srpc_log); - rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add ost"); + rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, "add ost"); rc = record_end_log(obd, &llh); - /* We also have to update the other logs where this osc is part of + /* We also have to update the other logs where this osc is part of the lov */ if (fsdb->fsdb_flags & FSDB_OLDLOG14) { @@ -1806,7 +1725,7 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb, /* Note that we can't add any new failnids, since we don't know the old osc names. */ flags = CM_SKIP | CM_UPGRADE146; - + } else if ((mti->mti_flags & LDD_F_UPDATE) != LDD_F_UPDATE) { /* If the update flag isn't set, don't update client/mdt logs. */ @@ -1829,18 +1748,16 @@ static int mgs_write_log_ost(struct obd_device *obd, struct fs_db *fsdb, name_destroy(&lovname); } } - + /* Append ost info to the client log */ name_create(&logname, mti->mti_fsname, "-client"); mgs_write_log_osc_to_lov(obd, fsdb, mti, logname, "", fsdb->fsdb_clilov, LUSTRE_SP_CLI, 0); name_destroy(&logname); -out_srpc: - sptlrpc_conf_log_free(srpc_log); RETURN(rc); } -/* Add additional failnids to an existing log. +/* Add additional failnids to an existing log. The mdc/osc must have been added to logs first */ /* tcp nids must be in dotted-quad ascii - we can't resolve hostnames from the kernel. */ @@ -1853,7 +1770,7 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb, ENTRY; /* FIXME how do we delete a failnid? Currently --writeconf is the - only way. Maybe make --erase-params pass a flag to really + only way. Maybe make --erase-params pass a flag to really erase all params from logs - except it can't erase the failnids given when a target first registers, since they aren't processed as params... 
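/*
 * Just above, a target that registers without a uuid gets one synthesized
 * as "<svname>_UUID"; the sptlrpc scanner removed later in this patch
 * inverted that by stripping the "_UUID" tail.  Round-trip sketch (buffer
 * sizes hypothetical):
 */
#include <errno.h>
#include <stdio.h>
#include <string.h>

static void uuid_from_svname_sk(char *uuid, size_t n, const char *svname)
{
        snprintf(uuid, n, "%s_UUID", svname);
}

static int svname_from_uuid_sk(char *svname, size_t n, const char *uuid)
{
        const char *tail = strstr(uuid, "_UUID");
        size_t len;

        if (tail == NULL)
                return -EINVAL;         /* unrecognized uuid */
        len = (size_t)(tail - uuid);
        if (len >= n)
                return -ENAMETOOLONG;
        memcpy(svname, uuid, len);
        svname[len] = '\0';
        return 0;
}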
*/ @@ -1874,17 +1791,17 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb, } else { RETURN(-EINVAL); } - + /* Add failover nids to client log */ name_create(&logname, mti->mti_fsname, "-client"); rc = record_start_log(obd, &llh, logname); - if (!rc) { + if (!rc) { /* FIXME this fn should be a single journal transaction */ rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname, "add failnid"); rc = mgs_write_log_failnids(obd, mti, llh, cliname); rc = record_marker(obd, llh, fsdb, CM_END, mti->mti_svname, - "add failnid"); + "add failnid"); rc = record_end_log(obd, &llh); } name_destroy(&logname); @@ -1894,11 +1811,11 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb, name_create(&logname, mti->mti_fsname, "-MDT0000"); rc = record_start_log(obd, &llh, logname); if (!rc) { - rc = record_marker(obd, llh, fsdb, CM_START, + rc = record_marker(obd, llh, fsdb, CM_START, mti->mti_svname, "add failnid"); rc = mgs_write_log_failnids(obd, mti, llh, cliname); - rc = record_marker(obd, llh, fsdb, CM_END, - mti->mti_svname, "add failnid"); + rc = record_marker(obd, llh, fsdb, CM_END, + mti->mti_svname, "add failnid"); rc = record_end_log(obd, &llh); } name_destroy(&logname); @@ -1908,7 +1825,7 @@ static int mgs_write_log_add_failnid(struct obd_device *obd, struct fs_db *fsdb, RETURN(rc); } -static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb, +static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb, struct mgs_target_info *mti, char *logname, struct lustre_cfg_bufs *bufs, char *tgtname, char *ptr) @@ -1917,7 +1834,7 @@ static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb, char *tmp; struct lustre_cfg *lcfg; int rc; - + /* Erase any old settings of this same parameter */ memcpy(comment, ptr, MTI_NAME_MAXLEN); comment[MTI_NAME_MAXLEN - 1] = 0; @@ -1932,340 +1849,13 @@ static int mgs_wlp_lcfg(struct obd_device *obd, struct fs_db *fsdb, lustre_cfg_bufs_reset(bufs, tgtname); lustre_cfg_bufs_set_string(bufs, 1, ptr); lcfg = lustre_cfg_new(LCFG_PARAM, bufs); - if (!lcfg) + if (!lcfg) return -ENOMEM; rc = mgs_write_log_direct(obd, fsdb, logname, lcfg, tgtname, comment); lustre_cfg_free(lcfg); return rc; } -/* - * populate rules which applied to a target device - */ -static int mgs_get_srpc_conf_log(struct fs_db *fsdb, const char *tgt, - enum lustre_sec_part from, - enum lustre_sec_part to, - struct sptlrpc_conf_log *log) -{ - struct mgs_tgt_srpc_conf *tgtconf; - struct sptlrpc_rule_set *tgt_rset; - int found_tgt = 0, rc; - - for (tgtconf = fsdb->fsdb_srpc_tgt; tgtconf; - tgtconf = tgtconf->mtsc_next) { - if (!strcmp(tgt, tgtconf->mtsc_tgt)) { - found_tgt = 1; - break; - } - } - - if (found_tgt) - tgt_rset = &tgtconf->mtsc_rset; - else - tgt_rset = NULL; - - rc = sptlrpc_conf_log_populate(&fsdb->fsdb_srpc_gen, tgt_rset, - from, to, fsdb->fsdb_srpc_fl_udesc, log); - if (rc) - CERROR("failed to populate srpc log for %s: %d\n", tgt, rc); - - return rc; -} - -struct mgs_msl_data { - struct obd_device *mmd_obd; - struct fs_db *mmd_fsdb; - struct mgs_target_info *mmd_mti; - int mmd_skip; - int mmd_attached; - int mmd_server; - enum lustre_sec_part mmd_tgtpart; - char mmd_tgtname[MTI_NAME_MAXLEN]; -}; - -static void mgs_msl_data_cleanup(struct mgs_msl_data *mmd) -{ - mmd->mmd_attached = 0; - mmd->mmd_tgtname[0] = '\0'; -} - -static int mgs_msl_tgt_uuid2name(char *tgtname, char *tgtuuid) -{ - char *ptr; - - if (tgtuuid == NULL) { - CERROR("missing target UUID???\n"); - return -EINVAL; - } - - ptr = 
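/*
 * mgs_wlp_lcfg() above derives the marker comment with a fixed-length
 * memcpy from the parameter string, which also copies whatever happens to
 * follow the terminating NUL when the parameter is shorter than
 * MTI_NAME_MAXLEN.  A bounds-respecting sketch of the same derivation (the
 * length constant stands in for MTI_NAME_MAXLEN):
 */
#include <string.h>

#define NAME_MAXLEN_SK 64

static void param_comment_sk(char comment[NAME_MAXLEN_SK], const char *param)
{
        strncpy(comment, param, NAME_MAXLEN_SK - 1);
        comment[NAME_MAXLEN_SK - 1] = '\0';
}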
strstr(tgtuuid, "_UUID"); - if (ptr == NULL) { - CERROR("unrecognized UUID: %s\n", tgtuuid); - return -EINVAL; - } - - *ptr = '\0';; - strncpy(tgtname, tgtuuid, MTI_NAME_MAXLEN); - tgtname[MTI_NAME_MAXLEN - 1] = '\0'; - - return 0; -} - -static int mgs_modify_srpc_log_handler(struct llog_handle *llh, - struct llog_rec_hdr *rec, - void *data) -{ - struct mgs_msl_data *mmd = (struct mgs_msl_data *)data; - struct cfg_marker *marker; - struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); - int cfg_len, rc; - ENTRY; - - if (rec->lrh_type != OBD_CFG_REC) { - CERROR("unhandled lrh_type: %#x\n", rec->lrh_type); - RETURN(-EINVAL); - } - - cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - - sizeof(struct llog_rec_tail); - - rc = lustre_cfg_sanity_check(lcfg, cfg_len); - if (rc) { - CERROR("Insane cfg\n"); - RETURN(rc); - } - - if (lcfg->lcfg_command == LCFG_MARKER) { - marker = lustre_cfg_buf(lcfg, 1); - - if (marker->cm_flags & CM_START && - marker->cm_flags & CM_SKIP) - mmd->mmd_skip = 1; - if (marker->cm_flags & CM_END) - mmd->mmd_skip = 0; - - RETURN(0); - } - - if (mmd->mmd_skip) - RETURN(0); - - switch (lcfg->lcfg_command) { - case LCFG_ATTACH: - mmd->mmd_attached = 1; - - if (!strcmp(lustre_cfg_string(lcfg, 1), LUSTRE_OST_NAME)) { - mmd->mmd_server = 1; - mmd->mmd_tgtpart = LUSTRE_SP_OST; - } else if (!strcmp(lustre_cfg_string(lcfg, 1), - LUSTRE_MDT_NAME)) { - mmd->mmd_server = 1; - mmd->mmd_tgtpart = LUSTRE_SP_MDT; - } else if (!strcmp(lustre_cfg_string(lcfg, 1), - LUSTRE_OSC_NAME)) { - mmd->mmd_server = 0; - mmd->mmd_tgtpart = LUSTRE_SP_OST; - } else if (!strcmp(lustre_cfg_string(lcfg, 1), - LUSTRE_MDC_NAME)) { - mmd->mmd_server = 0; - mmd->mmd_tgtpart = LUSTRE_SP_MDT; - } else { - mmd->mmd_attached = 0; - } - - if (mmd->mmd_attached && mmd->mmd_server) { - rc = mgs_msl_tgt_uuid2name(mmd->mmd_tgtname, - lustre_cfg_string(lcfg, 2)); - if (rc) { - mgs_msl_data_cleanup(mmd); - break; - } - } - - break; - case LCFG_SETUP: - if (!mmd->mmd_attached) - break; - - /* already got tgtname at LCFG_ATTACH */ - if (mmd->mmd_server) - break; - - rc = mgs_msl_tgt_uuid2name(mmd->mmd_tgtname, - lustre_cfg_string(lcfg, 1)); - if (rc) { - mgs_msl_data_cleanup(mmd); - break; - } - - break; - case LCFG_SPTLRPC_CONF: { - struct sptlrpc_conf_log *log; - enum lustre_sec_part from; - - if (!mmd->mmd_attached) - break; - - log = sptlrpc_conf_log_extract(lcfg); - if (log == NULL) { - CERROR("missing sptlrpc config log???\n"); - mgs_msl_data_cleanup(mmd); - break; - } - - if (mmd->mmd_server) - from = LUSTRE_SP_ANY; - else - from = log->scl_part; - - /* cleanup the old log */ - sptlrpc_conf_log_cleanup(log); - - /* populate new log */ - rc = mgs_get_srpc_conf_log(mmd->mmd_fsdb, mmd->mmd_tgtname, - from, mmd->mmd_tgtpart, log); - if (rc) { - mgs_msl_data_cleanup(mmd); - break; - } - - /* Overwrite the log */ - rec->lrh_len = cfg_len; - rc = llog_write_rec(llh, rec, NULL, 0, (void *)lcfg, - rec->lrh_index); - if (rc) - CERROR("overwrite sptlrpc conf log failed: %d\n", rc); - - /* append new one */ - rc = record_marker(mmd->mmd_obd, llh, mmd->mmd_fsdb, CM_START, - mmd->mmd_mti->mti_svname, "sptlrpc config"); - rc = record_sptlrpc_conf(mmd->mmd_obd, llh, - lustre_cfg_string(lcfg, 0), log); - rc = record_marker(mmd->mmd_obd, llh, mmd->mmd_fsdb, CM_END, - mmd->mmd_mti->mti_svname, "sptlrpc config"); - - mgs_msl_data_cleanup(mmd); - break; - } - default: - /* ignore all others */ - break; - } - - RETURN(rc); -} - -static int mgs_modify_srpc_log(struct obd_device *obd, - struct fs_db *fsdb, - struct mgs_target_info 
*mti, - char *logname) -{ - struct llog_handle *llh; - struct lvfs_run_ctxt saved; - struct llog_ctxt *ctxt; - struct mgs_msl_data *mmd; - int rc, rc2; - ENTRY; - - CDEBUG(D_MGS, "modify sptlrpc log for %s\n", logname); - - push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - - ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); - LASSERT(ctxt != NULL); - rc = llog_create(ctxt, &llh, NULL, logname); - if (rc) - GOTO(out_pop, rc); - - rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL); - if (rc) - GOTO(out_close, rc); - - if (llog_get_size(llh) <= 1) - GOTO(out_close, rc = 0); - - OBD_ALLOC_PTR(mmd); - if (!mmd) - GOTO(out_close, rc = -ENOMEM); - - mmd->mmd_obd = obd; - mmd->mmd_fsdb = fsdb; - mmd->mmd_mti = mti; - - rc = llog_process(llh, mgs_modify_srpc_log_handler, (void *) mmd, NULL); - - OBD_FREE_PTR(mmd); - -out_close: - rc2 = llog_close(llh); - if (!rc) - rc = rc2; - -out_pop: - pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - llog_ctxt_put(ctxt); - - if (rc) - CERROR("modify sptlrpc log %s failed %d\n", logname, rc); - RETURN(rc); -} - -/* - * for each of log, remove old conf at first - */ -static int mgs_modify_srpc_log_all(struct obd_device *obd, - struct fs_db *fsdb, - struct mgs_target_info *mti) -{ - char tgt_index[9]; - char *logname; - int i, rc = 0, rc2; - ENTRY; - - for (i = 0; i < INDEX_MAP_SIZE * 8; i++){ - if (test_bit(i, fsdb->fsdb_mdt_index_map)) { - sprintf(tgt_index,"-MDT%04x",i); - - name_create(&logname, mti->mti_fsname, tgt_index); - rc2 = mgs_modify(obd, fsdb, mti, logname, - mti->mti_fsname, "sptlrpc config", - CM_SKIP); - rc2 = mgs_modify_srpc_log(obd, fsdb, mti, logname); - name_destroy(&logname); - - if (rc2 && rc == 0) - rc = rc2; - } - } - - for (i = 0; i < INDEX_MAP_SIZE * 8; i++){ - if (test_bit(i, fsdb->fsdb_ost_index_map)) { - sprintf(tgt_index,"-OST%04x",i); - - name_create(&logname, mti->mti_fsname, tgt_index); - rc2 = mgs_modify(obd, fsdb, mti, logname, - mti->mti_fsname, "sptlrpc config", - CM_SKIP); - rc2 = mgs_modify_srpc_log(obd, fsdb, mti, logname); - name_destroy(&logname); - - if (rc2 && rc == 0) - rc = rc2; - } - } - - name_create(&logname, mti->mti_fsname, "-client"); - rc2 = mgs_modify(obd, fsdb, mti, logname, - mti->mti_fsname, "sptlrpc config", CM_SKIP); - rc2 = mgs_modify_srpc_log(obd, fsdb, mti, logname); - name_destroy(&logname); - - if (rc2 && rc == 0) - rc = rc2; - - RETURN(rc); -} - static int mgs_srpc_set_param_disk(struct obd_device *obd, struct fs_db *fsdb, struct mgs_target_info *mti, @@ -2293,7 +1883,7 @@ static int mgs_srpc_set_param_disk(struct obd_device *obd, /* prepare lcfg */ lustre_cfg_bufs_reset(&bufs, mti->mti_svname); lustre_cfg_bufs_set_string(&bufs, 1, param); - lcfg = lustre_cfg_new(0, &bufs); + lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs); if (lcfg == NULL) GOTO(out_comment, rc = -ENOMEM); @@ -2305,7 +1895,7 @@ static int mgs_srpc_set_param_disk(struct obd_device *obd, if (mgs_log_is_empty(obd, logname)) { rc = record_start_log(obd, &llh, logname); record_end_log(obd, &llh); - if (rc) + if (rc) GOTO(out, rc); } @@ -2346,10 +1936,10 @@ static int mgs_srpc_set_param_udesc_mem(struct fs_db *fsdb, goto error_out; if (strcmp(ptr, "yes") == 0) { - fsdb->fsdb_srpc_fl_udesc = 1; + fsdb->fsdb_fl_udesc = 1; CWARN("Enable user descriptor shipping from client to MDT\n"); } else if (strcmp(ptr, "no") == 0) { - fsdb->fsdb_srpc_fl_udesc = 0; + fsdb->fsdb_fl_udesc = 0; CWARN("Disable user descriptor shipping from client to MDT\n"); } else { *(ptr - 1) = '='; @@ -2392,6 +1982,15 @@ static int mgs_srpc_set_param_mem(struct fs_db *fsdb, 
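/*
 * mgs_srpc_set_param_disk() above now tags on-disk sptlrpc records with
 * LCFG_SPTLRPC_CONF instead of command 0, and mgs_srpc_read_handler() below
 * rejects anything else by command.  The shape of that producer/consumer
 * agreement (names and values illustrative, not the real lcfg layout):
 */
#include <errno.h>

enum cfg_cmd_sk {
        CFG_CMD_NONE_SK    = 0,
        CFG_CMD_SPTLRPC_SK = 1,         /* stands in for LCFG_SPTLRPC_CONF */
};

struct cfg_rec_sk {
        enum cfg_cmd_sk cmd;
        const char     *param;          /* "<tgt>.srpc.flavor...=<flavor>" */
};

static int read_sptlrpc_rec_sk(const struct cfg_rec_sk *rec)
{
        if (rec->cmd != CFG_CMD_SPTLRPC_SK)
                return -EINVAL;         /* not an sptlrpc record */
        /* parse rec->param into an sptlrpc rule here */
        return 0;
}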
if (rc) RETURN(rc); + /* rules for the MGS itself must be mgc->mgs */ + if (fsdb->fsdb_fl_mgsself) { + if ((rule.sr_from != LUSTRE_SP_MGC && + rule.sr_from != LUSTRE_SP_ANY) || + (rule.sr_to != LUSTRE_SP_MGS && + rule.sr_to != LUSTRE_SP_ANY)) + RETURN(-EINVAL); + } + /* prepare room for this coming rule. svcname format should be: * - fsname: general rule * - fsname-tgtname: target-specific rule @@ -2433,14 +2032,6 @@ static int mgs_srpc_set_param_mem(struct fs_db *fsdb, rset = &fsdb->fsdb_srpc_gen; } - /* limit the maximum number of rules, but allow deletion in any case */ - if (rset->srs_nrule >= SPTLRPC_CONF_LOG_MAX / 2 && - rule.sr_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) { - CERROR("too many (%d) rules already for %s\n", - rset->srs_nrule, svname); - RETURN(-E2BIG); - } - rc = sptlrpc_rule_set_merge(rset, &rule, 1); RETURN(rc); @@ -2451,8 +2042,8 @@ static int mgs_srpc_set_param(struct obd_device *obd, struct mgs_target_info *mti, char *param) { - char *copy; - int rc, copy_size; + char *copy; + int rc, copy_size; ENTRY; /* keep a copy of original param, which could be destroyed @@ -2472,8 +2063,13 @@ static int mgs_srpc_set_param(struct obd_device *obd, if (rc) goto out_free; - /* now apply the new rules to all existing config logs */ - rc = mgs_modify_srpc_log_all(obd, fsdb, mti); + if (fsdb->fsdb_fl_mgsself) { + /* + * for mgs rules, make them effective immediately. + */ + LASSERT(fsdb->fsdb_srpc_tgt == NULL); + sptlrpc_target_update_exp_flavor(obd, &fsdb->fsdb_srpc_gen); + } out_free: OBD_FREE(copy, copy_size); @@ -2486,7 +2082,7 @@ struct mgs_srpc_read_data { }; static int mgs_srpc_read_handler(struct llog_handle *llh, - struct llog_rec_hdr *rec, + struct llog_rec_hdr *rec, void *data) { struct mgs_srpc_read_data *msrd = (struct mgs_srpc_read_data *) data; @@ -2501,7 +2097,7 @@ static int mgs_srpc_read_handler(struct llog_handle *llh, RETURN(-EINVAL); } - cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - + cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - sizeof(struct llog_rec_tail); rc = lustre_cfg_sanity_check(lcfg, cfg_len); @@ -2525,7 +2121,7 @@ static int mgs_srpc_read_handler(struct llog_handle *llh, if (msrd->msrd_skip) RETURN(0); - if (lcfg->lcfg_command != 0) { + if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) { CERROR("invalid command (%x)\n", lcfg->lcfg_command); RETURN(0); } @@ -2549,8 +2145,8 @@ static int mgs_srpc_read_handler(struct llog_handle *llh, RETURN(0); } -static int mgs_get_fsdb_srpc_from_llog(struct obd_device *obd, - struct fs_db *fsdb) +int mgs_get_fsdb_srpc_from_llog(struct obd_device *obd, + struct fs_db *fsdb) { struct llog_handle *llh = NULL; struct lvfs_run_ctxt saved; @@ -2560,14 +2156,14 @@ static int mgs_get_fsdb_srpc_from_llog(struct obd_device *obd, int rc; ENTRY; - ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); - LASSERT(ctxt != NULL); - /* construct log name */ rc = name_create(&logname, fsdb->fsdb_name, "-sptlrpc"); if (rc) RETURN(rc); + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + LASSERT(ctxt != NULL); + if (mgs_log_is_empty(obd, logname)) GOTO(out, rc = 0); @@ -2594,289 +2190,303 @@ out_close: out_pop: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); out: - name_destroy(&logname); llog_ctxt_put(ctxt); + name_destroy(&logname); if (rc) CERROR("failed to read sptlrpc config database: %d\n", rc); RETURN(rc); } -static int mgs_write_log_params(struct obd_device *obd, struct fs_db *fsdb, - struct mgs_target_info *mti) +static int mgs_write_log_param(struct obd_device *obd, struct fs_db *fsdb, + struct mgs_target_info *mti, char *ptr)
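/*
 * The fsdb_fl_mgsself check added above only admits rules whose endpoints
 * resolve to mgc->mgs.  The same predicate in isolation (enum values
 * illustrative; the real code uses the lustre_sec_part constants):
 */
enum sp_sk { SP_ANY_SK, SP_MGC_SK, SP_MGS_SK };

static int mgs_self_rule_ok_sk(enum sp_sk from, enum sp_sk to)
{
        return (from == SP_MGC_SK || from == SP_ANY_SK) &&
               (to == SP_MGS_SK || to == SP_ANY_SK);
}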
{ struct lustre_cfg_bufs bufs; struct lustre_cfg *lcfg; char *logname; - char *ptr = mti->mti_params; - char *endptr, *tmp; + char *tmp; int rc = 0; ENTRY; - if (!mti->mti_params) - RETURN(0); - /* For various parameter settings, we have to figure out which logs care about them (e.g. both mdt and client for lov settings) */ - while (ptr) { - while (*ptr == ' ') - ptr++; - if (*ptr == '\0') - break; - endptr = strchr(ptr, ' '); - if (endptr) - *endptr = '\0'; - CDEBUG(D_MGS, "next param '%s'\n", ptr); - - /* The params are stored in MOUNT_DATA_FILE and modified - via tunefs.lustre, or set using lctl conf_param */ - - /* Processed in lustre_start_mgc */ - if (class_match_param(ptr, PARAM_MGSNODE, NULL) == 0) - GOTO(end_while, rc); - - /* Processed in mgs_write_log_ost */ - if (class_match_param(ptr, PARAM_FAILMODE, NULL) == 0) { - if (mti->mti_flags & LDD_F_PARAM) { - LCONSOLE_ERROR_MSG(0x169, "%s can only be " - "changed with tunefs.lustre" - "and --writeconf\n", ptr); - rc = -EPERM; - } - GOTO(end_while, rc); + CDEBUG(D_MGS, "next param '%s'\n", ptr); + + /* The params are stored in MOUNT_DATA_FILE and modified via + tunefs.lustre, or set using lctl conf_param */ + + /* Processed in lustre_start_mgc */ + if (class_match_param(ptr, PARAM_MGSNODE, NULL) == 0) + GOTO(end, rc); + + /* Processed in mgs_write_log_ost */ + if (class_match_param(ptr, PARAM_FAILMODE, NULL) == 0) { + if (mti->mti_flags & LDD_F_PARAM) { + LCONSOLE_ERROR_MSG(0x169, "%s can only be " + "changed with tunefs.lustre " + "and --writeconf\n", ptr); + rc = -EPERM; } + GOTO(end, rc); + } + + if (class_match_param(ptr, PARAM_SRPC, NULL) == 0) { + rc = mgs_srpc_set_param(obd, fsdb, mti, ptr); + GOTO(end, rc); + } - if (class_match_param(ptr, PARAM_SRPC, NULL) == 0) { - rc = mgs_srpc_set_param(obd, fsdb, mti, ptr); - GOTO(end_while, rc); + if (class_match_param(ptr, PARAM_FAILNODE, NULL) == 0) { + /* Add a failover nidlist */ + rc = 0; + /* We already processed failovers params for new + targets in mgs_write_log_target */ + if (mti->mti_flags & LDD_F_PARAM) { + CDEBUG(D_MGS, "Adding failnode\n"); + rc = mgs_write_log_add_failnid(obd, fsdb, mti); + } + GOTO(end, rc); + } + + if (class_match_param(ptr, PARAM_SYS_TIMEOUT, &tmp) == 0) { + /* Change obd timeout */ + int timeout; + timeout = simple_strtoul(tmp, NULL, 0); + + CDEBUG(D_MGS, "obd timeout %d\n", timeout); + lustre_cfg_bufs_reset(&bufs, NULL); + lcfg = lustre_cfg_new(LCFG_SET_TIMEOUT, &bufs); + lcfg->lcfg_num = timeout; + /* modify all servers and clients */ + rc = mgs_write_log_direct_all(obd, fsdb, mti, lcfg, + mti->mti_fsname, + "timeout"); + lustre_cfg_free(lcfg); + GOTO(end, rc); + } + + if (class_match_param(ptr, PARAM_OSC""PARAM_ACTIVE, &tmp) == 0) { + /* active=0 means off, anything else means on */ + char mdt_index[16]; + int flag = (*tmp == '0') ? CM_EXCLUDE : 0; + int i; + + if (!(mti->mti_flags & LDD_F_SV_TYPE_OST)) { + LCONSOLE_ERROR_MSG(0x144, "%s: Only OSCs can " + "be (de)activated.\n", + mti->mti_svname); + GOTO(end, rc = -EINVAL); + } + LCONSOLE_WARN("Permanently %sactivating %s\n", + flag ?
"de": "re", mti->mti_svname); + /* Modify clilov */ + name_create(&logname, mti->mti_fsname, "-client"); + rc = mgs_modify(obd, fsdb, mti, logname, + mti->mti_svname, "add osc", flag); + name_destroy(&logname); + if (rc) + goto active_err; + /* Modify mdtlov */ + /* FIXME add to all MDT logs for CMD */ + for (i = 0; i < INDEX_MAP_SIZE * 8; i++) { + if (!test_bit(i, fsdb->fsdb_mdt_index_map)) + continue; + sprintf(mdt_index,"-MDT%04x", i); + name_create(&logname, mti->mti_fsname, mdt_index); + rc = mgs_modify(obd, fsdb, mti, logname, + mti->mti_svname, "add osc", flag); + name_destroy(&logname); + if (rc) + goto active_err; + } + active_err: + if (rc) { + LCONSOLE_ERROR_MSG(0x145, "Couldn't find %s in" + "log (%d). No permanent " + "changes were made to the " + "config log.\n", + mti->mti_svname, rc); + if (fsdb->fsdb_flags & FSDB_OLDLOG14) + LCONSOLE_ERROR_MSG(0x146, "This may be" + " because the log" + "is in the old 1.4" + "style. Consider " + " --writeconf to " + "update the logs.\n"); + GOTO(end, rc); } + /* Fall through to osc proc for deactivating live OSC + on running MDT / clients. */ + } + /* Below here, let obd's XXX_process_config methods handle it */ - if (class_match_param(ptr, PARAM_FAILNODE, NULL) == 0) { - /* Add a failover nidlist */ - rc = 0; - /* We already processed failovers params for new - targets in mgs_write_log_target */ - if (mti->mti_flags & LDD_F_PARAM) { - CDEBUG(D_MGS, "Adding failnode\n"); - rc = mgs_write_log_add_failnid(obd, fsdb, mti); - } - GOTO(end_while, rc); + /* All lov. in proc */ + if (class_match_param(ptr, PARAM_LOV, NULL) == 0) { + char mdt_index[16]; + char *mdtlovname; + + CDEBUG(D_MGS, "lov param %s\n", ptr); + if (!(mti->mti_flags & LDD_F_SV_TYPE_MDT)) { + LCONSOLE_ERROR_MSG(0x147, "LOV params must be " + "set on the MDT, not %s. " + "Ignoring.\n", + mti->mti_svname); + GOTO(end, rc = 0); } - if (class_match_param(ptr, PARAM_SYS_TIMEOUT, &tmp) == 0) { - /* Change obd timeout */ - int timeout; - timeout = simple_strtoul(tmp, NULL, 0); - - CDEBUG(D_MGS, "obd timeout %d\n", timeout); - lustre_cfg_bufs_reset(&bufs, NULL); - lcfg = lustre_cfg_new(LCFG_SET_TIMEOUT, &bufs); - lcfg->lcfg_num = timeout; - /* modify all servers and clients */ - rc = mgs_write_log_direct_all(obd, fsdb, mti, lcfg, - mti->mti_fsname, - "timeout"); - lustre_cfg_free(lcfg); - GOTO(end_while, rc); + /* Modify mdtlov */ + if (mgs_log_is_empty(obd, mti->mti_svname)) + GOTO(end, rc = -ENODEV); + + sprintf(mdt_index,"-MDT%04x", mti->mti_stripe_index); + name_create(&logname, mti->mti_fsname, mdt_index); + name_create(&mdtlovname, logname, "-mdtlov"); + rc = mgs_wlp_lcfg(obd, fsdb, mti, mti->mti_svname, + &bufs, mdtlovname, ptr); + name_destroy(&logname); + name_destroy(&mdtlovname); + if (rc) + GOTO(end, rc); + + /* Modify clilov */ + name_create(&logname, mti->mti_fsname, "-client"); + rc = mgs_wlp_lcfg(obd, fsdb, mti, logname, &bufs, + fsdb->fsdb_clilov, ptr); + name_destroy(&logname); + GOTO(end, rc); + } + + /* All osc., mdc., llite. 
params in proc */ + if ((class_match_param(ptr, PARAM_OSC, NULL) == 0) || + (class_match_param(ptr, PARAM_MDC, NULL) == 0) || + (class_match_param(ptr, PARAM_LLITE, NULL) == 0)) { + char *cname; + if (memcmp(ptr, PARAM_LLITE, strlen(PARAM_LLITE)) == 0) { + name_create(&cname, mti->mti_fsname, "-client"); + /* Add the client type to match the obdname in + class_config_llog_handler */ + } else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) { + /* COMPAT_146 */ + if (fsdb->fsdb_mdc) + name_create(&cname, fsdb->fsdb_mdc, ""); + else + name_create(&cname, mti->mti_svname, + "-mdc"); + } else if (mti->mti_flags & LDD_F_SV_TYPE_OST) { + /* COMPAT_146 */ + if (fsdb->fsdb_flags & FSDB_OLDLOG14) { + LCONSOLE_ERROR_MSG(0x148, "Upgraded " + "client logs for %s" + " cannot be " + "modified. Consider" + " updating the " + "configuration with" + " --writeconf\n", + mti->mti_svname); + /* We don't know the names of all the + old oscs*/ + GOTO(end, rc = -EINVAL); + } + name_create(&cname, mti->mti_svname, "-osc"); + } else { + GOTO(end, rc = -EINVAL); } - if (class_match_param(ptr, PARAM_OSC""PARAM_ACTIVE, &tmp) == 0) { - /* active=0 means off, anything else means on */ + CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4); + + /* Modify client */ + name_create(&logname, mti->mti_fsname, "-client"); + rc = mgs_wlp_lcfg(obd, fsdb, mti, logname, &bufs, + cname, ptr); + + /* osc params affect the MDT as well */ + if (!rc && (mti->mti_flags & LDD_F_SV_TYPE_OST)) { char mdt_index[16]; - int flag = (*tmp == '0') ? CM_EXCLUDE : 0; int i; - if (!(mti->mti_flags & LDD_F_SV_TYPE_OST)) { - LCONSOLE_ERROR_MSG(0x144, "%s: Only OSCs can " - "be (de)activated.\n", - mti->mti_svname); - rc = -EINVAL; - goto end_while; - } - LCONSOLE_WARN("Permanently %sactivating %s\n", - flag ? "de": "re", mti->mti_svname); - /* Modify clilov */ - name_create(&logname, mti->mti_fsname, "-client"); - rc = mgs_modify(obd, fsdb, mti, logname, - mti->mti_svname, "add osc", flag); - name_destroy(&logname); - if (rc) - goto active_err; - /* Modify mdtlov */ - /* FIXME add to all MDT logs for CMD */ - for (i = 0; i < INDEX_MAP_SIZE * 8; i++) { + for (i = 0; i < INDEX_MAP_SIZE * 8; i++){ if (!test_bit(i, fsdb->fsdb_mdt_index_map)) continue; - sprintf(mdt_index,"-MDT%04x", i); - name_create(&logname, mti->mti_fsname, mdt_index); - rc = mgs_modify(obd, fsdb, mti, logname, - mti->mti_svname, "add osc", flag); + name_destroy(&cname); + sprintf(mdt_index, "-osc-MDT%04x", i); + name_create(&cname, mti->mti_svname, + mdt_index); name_destroy(&logname); + sprintf(mdt_index, "-MDT%04x", i); + name_create(&logname, mti->mti_fsname, + mdt_index); + if (!mgs_log_is_empty(obd, logname)) + rc = mgs_wlp_lcfg(obd, fsdb, + mti, logname, + &bufs, cname, + ptr); if (rc) - goto active_err; - } -active_err: - if (rc) { - LCONSOLE_ERROR_MSG(0x145, "Couldn't find %s in" - "log (%d). No permanent " - "changes were made to the " - "config log.\n", - mti->mti_svname, rc); - if (fsdb->fsdb_flags & FSDB_OLDLOG14) - LCONSOLE_ERROR_MSG(0x146, "This may be" - " because the log " - "is in the old 1.4" - "style. Consider " - " --writeconf to " - "update the logs.\n"); - goto end_while; + break; } - /* Fall through to osc proc for deactivating - live OSC on running MDT / clients. */ } - /* Below here, let obd's XXX_process_config methods handle it */ - - /* All lov. 
in proc */ - if (class_match_param(ptr, PARAM_LOV, NULL) == 0) { - char mdt_index[16]; - char *mdtlovname; - - CDEBUG(D_MGS, "lov param %s\n", ptr); - if (!(mti->mti_flags & LDD_F_SV_TYPE_MDT)) { - LCONSOLE_ERROR_MSG(0x147, "LOV params must be " - "set on the MDT, not %s. " - "Ignoring.\n", - mti->mti_svname); - rc = 0; - goto end_while; - } + name_destroy(&logname); + name_destroy(&cname); + GOTO(end, rc); + } - /* Modify mdtlov */ - if (mgs_log_is_empty(obd, mti->mti_svname)) - GOTO(end_while, rc = -ENODEV); + /* All mdt. params in proc */ + if (class_match_param(ptr, PARAM_MDT, NULL) == 0) { + char mdt_index[16]; + int i; + __u32 idx; - sprintf(mdt_index,"-MDT%04x", mti->mti_stripe_index); - name_create(&logname, mti->mti_fsname, mdt_index); - name_create(&mdtlovname, logname, "-mdtlov"); - rc = mgs_wlp_lcfg(obd, fsdb, mti, mti->mti_svname, - &bufs, mdtlovname, ptr); - name_destroy(&logname); - name_destroy(&mdtlovname); + CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4); + if (strncmp(mti->mti_svname, mti->mti_fsname, + MTI_NAME_MAXLEN) == 0) + /* device is unspecified completely? */ + rc = LDD_F_SV_TYPE_MDT | LDD_F_SV_ALL; + else + rc = server_name2index(mti->mti_svname, &idx, NULL); + if (rc < 0) + goto active_err; + if ((rc & LDD_F_SV_TYPE_MDT) == 0) + goto active_err; + if (rc & LDD_F_SV_ALL) { + for (i = 0; i < INDEX_MAP_SIZE * 8; i++) { + if (!test_bit(i, + fsdb->fsdb_mdt_index_map)) + continue; + sprintf(mdt_index,"-MDT%04x", i); + name_create(&logname, mti->mti_fsname, + mdt_index); + rc = mgs_wlp_lcfg(obd, fsdb, mti, + logname, &bufs, + logname, ptr); + name_destroy(&logname); + if (rc) + goto active_err; + } + } else { + rc = mgs_wlp_lcfg(obd, fsdb, mti, + mti->mti_svname, &bufs, + mti->mti_svname, ptr); if (rc) - GOTO(end_while, rc); - - /* Modify clilov */ - name_create(&logname, mti->mti_fsname, "-client"); - rc = mgs_wlp_lcfg(obd, fsdb, mti, logname, &bufs, - fsdb->fsdb_clilov, ptr); - name_destroy(&logname); - GOTO(end_while, rc); + goto active_err; } + GOTO(end, rc); + } - /* All osc., mdc., llite. params in proc */ - if ((class_match_param(ptr, PARAM_OSC, NULL) == 0) || - (class_match_param(ptr, PARAM_MDC, NULL) == 0) || - (class_match_param(ptr, PARAM_LLITE, NULL) == 0)) { - char *cname; - if (memcmp(ptr, PARAM_LLITE, strlen(PARAM_LLITE)) == 0) { - name_create(&cname, mti->mti_fsname, "-client"); - /* Add the client type to match the obdname - in class_config_llog_handler */ - } else if (mti->mti_flags & LDD_F_SV_TYPE_MDT) { - /* COMPAT_146 */ - if (fsdb->fsdb_mdc) - name_create(&cname, fsdb->fsdb_mdc, ""); - else - name_create(&cname, mti->mti_svname, - "-mdc"); - } else if (mti->mti_flags & LDD_F_SV_TYPE_OST) { - /* COMPAT_146 */ - if (fsdb->fsdb_flags & FSDB_OLDLOG14) { - LCONSOLE_ERROR_MSG(0x148, "Upgraded " - "client logs for %s" - " cannot be " - "modified. 
Consider" - " updating the " - "configuration with" - " --writeconf\n", - mti->mti_svname); - /* We don't know the names of all the - old oscs*/ - rc = -EINVAL; - goto end_while; - } - name_create(&cname, mti->mti_svname, "-osc"); - } else { - rc = -EINVAL; - goto end_while; - } - - CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4); - - /* Modify client */ - name_create(&logname, mti->mti_fsname, "-client"); - rc = mgs_wlp_lcfg(obd, fsdb, mti, logname, &bufs, - cname, ptr); - - /* osc params affect the MDT as well */ - if (!rc && (mti->mti_flags & LDD_F_SV_TYPE_OST)) { - char mdt_index[16]; - int i; - - for (i = 0; i < INDEX_MAP_SIZE * 8; i++){ - if (!test_bit(i, fsdb->fsdb_mdt_index_map)) - continue; - name_destroy(&cname); - sprintf(mdt_index, "-osc-MDT%04x", i); - name_create(&cname, mti->mti_svname, - mdt_index); - name_destroy(&logname); - sprintf(mdt_index, "-MDT%04x", i); - name_create(&logname, mti->mti_fsname, - mdt_index); - if (!mgs_log_is_empty(obd, logname)) - rc = mgs_wlp_lcfg(obd, fsdb, - mti, logname, - &bufs, cname, - ptr); - if (rc) - break; - } - } - name_destroy(&logname); - name_destroy(&cname); - GOTO(end_while, rc); - } + /* All mdd., ost. params in proc */ + if ((class_match_param(ptr, PARAM_MDD, NULL) == 0) || + (class_match_param(ptr, PARAM_OST, NULL) == 0)) { + CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4); + if (mgs_log_is_empty(obd, mti->mti_svname)) + GOTO(end, rc = -ENODEV); - /* All mdt., ost. params in proc */ - if ((class_match_param(ptr, PARAM_MDT, NULL) == 0) || - (class_match_param(ptr, PARAM_MDD, NULL) == 0) || - (class_match_param(ptr, PARAM_OST, NULL) == 0)) { - CDEBUG(D_MGS, "%.3s param %s\n", ptr, ptr + 4); - if (mgs_log_is_empty(obd, mti->mti_svname)) { - rc = -ENODEV; - goto end_while; - } - rc = mgs_wlp_lcfg(obd, fsdb, mti, mti->mti_svname, - &bufs, mti->mti_svname, ptr); - GOTO(end_while, rc); - } + rc = mgs_wlp_lcfg(obd, fsdb, mti, mti->mti_svname, + &bufs, mti->mti_svname, ptr); + GOTO(end, rc); + } - LCONSOLE_WARN("Ignoring unrecognized param '%s'\n", ptr); + LCONSOLE_WARN("Ignoring unrecognized param '%s'\n", ptr); -end_while: - if (rc) { - CERROR("err %d on param '%s\n", rc, ptr); - break; - } - - if (!endptr) - /* last param */ - break; - - *endptr = ' '; - ptr = endptr + 1; - } +end: + if (rc) + CERROR("err %d on param '%s'\n", rc, ptr); RETURN(rc); } @@ -2889,20 +2499,20 @@ int mgs_check_failnid(struct obd_device *obd, struct mgs_target_info *mti) int rc; ENTRY; - rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); - if (rc) + rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); + if (rc) RETURN(rc); - if (mgs_log_is_empty(obd, mti->mti_svname)) + if (mgs_log_is_empty(obd, mti->mti_svname)) /* should never happen */ RETURN(-ENOENT); CDEBUG(D_MGS, "Checking for new failnids for %s\n", mti->mti_svname); /* FIXME We can just check mti->params to see if we're already in - the failover list. Modify mti->params for rewriting back at + the failover list. Modify mti->params for rewriting back at server_register_target(). 
*/ - + down(&fsdb->fsdb_sem); rc = mgs_write_log_add_failnid(obd, fsdb, mti); up(&fsdb->fsdb_sem); @@ -2917,6 +2527,7 @@ int mgs_write_log_target(struct obd_device *obd, { struct fs_db *fsdb; int rc = -EINVAL; + char *buf, *params; ENTRY; /* set/check the new target index */ @@ -2930,7 +2541,7 @@ int mgs_write_log_target(struct obd_device *obd, if (mti->mti_flags & LDD_F_UPGRADE14) { if (rc == EALREADY) { LCONSOLE_INFO("Found index %d for %s 1.4 log, " - "upgrading\n", mti->mti_stripe_index, + "upgrading\n", mti->mti_stripe_index, mti->mti_svname); } else { LCONSOLE_ERROR_MSG(0x149, "Failed to find %s in the old" @@ -2948,9 +2559,9 @@ int mgs_write_log_target(struct obd_device *obd, /* end COMPAT_146 */ } else { if (rc == EALREADY) { - LCONSOLE_WARN("Found index %d for %s, updating log\n", + LCONSOLE_WARN("Found index %d for %s, updating log\n", mti->mti_stripe_index, mti->mti_svname); - /* We would like to mark old log sections as invalid + /* We would like to mark old log sections as invalid and add new log sections in the client and mdt logs. But if we add new sections, then live clients will get repeat setup instructions for already running @@ -2959,7 +2570,7 @@ int mgs_write_log_target(struct obd_device *obd, } } - rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); + rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); if (rc) { CERROR("Can't get db for %s\n", mti->mti_fsname); RETURN(rc); @@ -2967,7 +2578,7 @@ int mgs_write_log_target(struct obd_device *obd, down(&fsdb->fsdb_sem); - if (mti->mti_flags & + if (mti->mti_flags & (LDD_F_VIRGIN | LDD_F_UPGRADE14 | LDD_F_WRITECONF)) { /* Generate a log from scratch */ if (mti->mti_flags & LDD_F_SV_TYPE_MDT) { @@ -2988,8 +2599,30 @@ int mgs_write_log_target(struct obd_device *obd, CDEBUG(D_MGS, "Update params for %s\n", mti->mti_svname); mti->mti_flags |= LDD_F_PARAM; } - - rc = mgs_write_log_params(obd, fsdb, mti); + + /* allocate temporary buffer, where class_get_next_param will + make copy of a current parameter */ + OBD_ALLOC(buf, strlen(mti->mti_params) + 1); + if (buf == NULL) + GOTO(out_up, rc = -ENOMEM); + params = mti->mti_params; + while (params != NULL) { + rc = class_get_next_param(¶ms, buf); + if (rc) { + if (rc == 1) + /* there is no next parameter, that is + not an error */ + rc = 0; + break; + } + CDEBUG(D_MGS, "remaining string: '%s', param: '%s'\n", + params, buf); + rc = mgs_write_log_param(obd, fsdb, mti, buf); + if (rc) + break; + } + + OBD_FREE(buf, strlen(mti->mti_params) + 1); out_up: up(&fsdb->fsdb_sem); @@ -2997,30 +2630,30 @@ out_up: } /* COMPAT_146 */ -/* verify that we can handle the old config logs */ +/* verify that we can handle the old config logs */ int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti) { struct fs_db *fsdb; int rc = 0; ENTRY; - /* Create ost log normally, as servers register. Servers + /* Create ost log normally, as servers register. Servers register with their old uuids (from last_rcvd), so old (MDT and client) logs should work. - - new MDT won't know about old OSTs, only the ones that have - registered, so we need the old MDT log to get the LOV right - in order for old clients to work. - - Old clients connect to the MDT, not the MGS, for their logs, and - will therefore receive the old client log from the MDT /LOGS dir. + - new MDT won't know about old OSTs, only the ones that have + registered, so we need the old MDT log to get the LOV right + in order for old clients to work. 
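/*
 * The new loop above walks mti_params one parameter at a time through
 * class_get_next_param() into a caller-sized buffer (strlen(mti_params) + 1
 * is always large enough, since each param is a substring), instead of
 * splitting the string in place as the removed mgs_write_log_params() did.
 * A userspace sketch of a tokenizer with the same contract (0 = one param
 * copied, 1 = no next param):
 */
#include <string.h>

static int next_param_sk(const char **params, char *copy)
{
        const char *p = *params;
        const char *end;
        size_t len;

        while (*p == ' ')
                p++;
        if (*p == '\0') {
                *params = NULL;
                return 1;               /* exhausted: not an error */
        }
        end = strchr(p, ' ');
        len = (end != NULL) ? (size_t)(end - p) : strlen(p);
        memcpy(copy, p, len);
        copy[len] = '\0';
        *params = (end != NULL) ? end + 1 : p + len;
        return 0;
}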
+ - Old clients connect to the MDT, not the MGS, for their logs, and + will therefore receive the old client log from the MDT /LOGS dir. - Old clients can continue to use and connect to old or new OSTs - - New clients will contact the MGS for their log + - New clients will contact the MGS for their log */ - LCONSOLE_INFO("upgrading server %s from pre-1.6\n", mti->mti_svname); + LCONSOLE_INFO("upgrading server %s from pre-1.6\n", mti->mti_svname); server_mti_print("upgrade", mti); - + rc = mgs_find_or_make_fsdb(obd, mti->mti_fsname, &fsdb); - if (rc) + if (rc) RETURN(rc); if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) { @@ -3031,7 +2664,7 @@ int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti) } if (fsdb->fsdb_gen == 0) { - /* There were no markers in the client log, meaning we have + /* There were no markers in the client log, meaning we have not updated the logs for this fs */ CDEBUG(D_MGS, "found old, unupdated client log\n"); } @@ -3047,10 +2680,10 @@ int mgs_upgrade_sv_14(struct obd_device *obd, struct mgs_target_info *mti) /* We're starting with an old uuid. Assume old name for lov as well since the lov entry already exists in the log. */ CDEBUG(D_MGS, "old mds uuid %s\n", mti->mti_uuid); - if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4, + if (strncmp(mti->mti_uuid, fsdb->fsdb_mdtlov + 4, strlen(fsdb->fsdb_mdtlov) - 4) != 0) { CERROR("old mds uuid %s doesn't match log %s (%s)\n", - mti->mti_uuid, fsdb->fsdb_mdtlov, + mti->mti_uuid, fsdb->fsdb_mdtlov, fsdb->fsdb_mdtlov + 4); RETURN(-EINVAL); } @@ -3102,7 +2735,7 @@ int mgs_erase_logs(struct obd_device *obd, char *fsname) struct l_linux_dirent *dirent, *n; int rc, len = strlen(fsname); ENTRY; - + /* Find all the logs in the CONFIGS directory */ rc = class_dentry_readdir(obd, mgs->mgs_configs_dir, mgs->mgs_vfsmnt, &dentry_list); @@ -3110,12 +2743,12 @@ int mgs_erase_logs(struct obd_device *obd, char *fsname) CERROR("Can't read %s dir\n", MOUNT_CONFIGS_DIR); RETURN(rc); } - + down(&mgs->mgs_sem); - + /* Delete the fs db */ fsdb = mgs_find_fsdb(obd, fsname); - if (fsdb) + if (fsdb) mgs_free_fsdb(obd, fsdb); list_for_each_entry_safe(dirent, n, &dentry_list, lld_list) { @@ -3126,7 +2759,7 @@ int mgs_erase_logs(struct obd_device *obd, char *fsname) } OBD_FREE(dirent, sizeof(*dirent)); } - + up(&mgs->mgs_sem); RETURN(rc); @@ -3150,7 +2783,7 @@ static void print_lustre_cfg(struct lustre_cfg *lcfg) if (lcfg->lcfg_bufcount < LUSTRE_CFG_MAX_BUFCOUNT) for (i = 0; i < lcfg->lcfg_bufcount; i++) { CDEBUG(D_MGS, "\tlcfg->lcfg_buflens[%d]: %d %s\n", - i, lcfg->lcfg_buflens[i], + i, lcfg->lcfg_buflens[i], lustre_cfg_string(lcfg, i)); } EXIT; @@ -3168,7 +2801,7 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname) ENTRY; print_lustre_cfg(lcfg); - + /* lustre, lustre-mdtlov, lustre-client, lustre-MDT0000 */ devname = lustre_cfg_string(lcfg, 0); param = lustre_cfg_string(lcfg, 1); @@ -3199,10 +2832,10 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname) fsname[MTI_NAME_MAXLEN - 1] = 0; CDEBUG(D_MGS, "setparam on fs %s device %s\n", fsname, devname); - rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); - if (rc) + rc = mgs_find_or_make_fsdb(obd, fsname, &fsdb); + if (rc) RETURN(rc); - if (fsdb->fsdb_flags & FSDB_LOG_EMPTY) { + if (!fsdb->fsdb_fl_mgsself && fsdb->fsdb_flags & FSDB_LOG_EMPTY) { CERROR("No filesystem targets for %s. 
cfg_device from lctl " "is '%s'\n", fsname, devname); mgs_free_fsdb(obd, fsdb); @@ -3211,25 +2844,27 @@ int mgs_setparam(struct obd_device *obd, struct lustre_cfg *lcfg, char *fsname) /* Create a fake mti to hold everything */ OBD_ALLOC_PTR(mti); - if (!mti) + if (!mti) GOTO(out, rc = -ENOMEM); strncpy(mti->mti_fsname, fsname, MTI_NAME_MAXLEN); strncpy(mti->mti_svname, devname, MTI_NAME_MAXLEN); strncpy(mti->mti_params, param, sizeof(mti->mti_params)); rc = server_name2index(mti->mti_svname, &mti->mti_stripe_index, &tmp); - if (rc < 0) + if (rc < 0) /* Not a valid server; may be only fsname */ rc = 0; else /* Strip -osc or -mdc suffix from svname */ - if (server_make_name(rc, mti->mti_stripe_index, mti->mti_fsname, - mti->mti_svname)) + if (server_make_name(rc, mti->mti_stripe_index, mti->mti_fsname, + mti->mti_svname)) GOTO(out, rc = -EINVAL); mti->mti_flags = rc | LDD_F_PARAM; down(&fsdb->fsdb_sem); - rc = mgs_write_log_params(obd, fsdb, mti); + /* this is lctl conf_param's single param path, there is not + need to loop through parameters */ + rc = mgs_write_log_param(obd, fsdb, mti, mti->mti_params); up(&fsdb->fsdb_sem); out: @@ -3412,10 +3047,10 @@ static int mgs_backup_llog(struct obd_device *obd, char* fsname) if (len >= PATH_MAX - 1) { GOTO(out, -ENAMETOOLONG); - } + } push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - + bak_filp = l_filp_open(logname, O_RDWR|O_CREAT|O_TRUNC, 0660); if (IS_ERR(bak_filp)) { rc = PTR_ERR(bak_filp); diff --git a/lustre/obdclass/Makefile.in b/lustre/obdclass/Makefile.in index 50da9e8..2c7f0d2 100644 --- a/lustre/obdclass/Makefile.in +++ b/lustre/obdclass/Makefile.in @@ -25,8 +25,9 @@ obdclass-all-objs += debug.o genops.o uuid.o llog_ioctl.o obdclass-all-objs += lprocfs_status.o lustre_handles.o lustre_peer.o obdclass-all-objs += statfs_pack.o obdo.o obd_config.o obd_mount.o mea.o obdclass-all-objs += lu_object.o dt_object.o hash.o capa.o lu_time.o -obdclass-all-objs += lu_ref.o +obdclass-all-objs += cl_object.o cl_page.o cl_lock.o cl_io.o lu_ref.o obdclass-all-objs += acl.o idmap.o +obdclass-all-objs += md_local_object.o obdclass-objs := $(obdclass-linux-objs) $(obdclass-all-objs) diff --git a/lustre/obdclass/autoMakefile.am b/lustre/obdclass/autoMakefile.am index 778dba0..b7fb43e 100644 --- a/lustre/obdclass/autoMakefile.am +++ b/lustre/obdclass/autoMakefile.am @@ -11,7 +11,8 @@ liblustreclass_a_SOURCES = class_obd.c debug.c genops.c statfs_pack.c mea.c uuid liblustreclass_a_SOURCES += lustre_handles.c lustre_peer.c lprocfs_status.c class_hash.c liblustreclass_a_SOURCES += obdo.c obd_config.c llog.c llog_obd.c llog_cat.c liblustreclass_a_SOURCES += llog_lvfs.c llog_swab.c capa.c -liblustreclass_a_SOURCES += lu_object.c lu_ref.c lu_time.c +liblustreclass_a_SOURCES += lu_object.c cl_object.c lu_time.c lu_ref.c +liblustreclass_a_SOURCES += cl_page.c cl_lock.c cl_io.c liblustreclass_a_SOURCES += #llog_ioctl.c rbtree.c liblustreclass_a_CPPFLAGS = $(LLCPPFLAGS) liblustreclass_a_CFLAGS = $(LLCFLAGS) @@ -53,4 +54,4 @@ install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ llog-test.c MOSTLYCLEANFILES += linux/*.o darwin/*.o -DIST_SOURCES = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h +DIST_SOURCES = $(filter-out llog-test.c,$(obdclass-all-objs:.o=.c)) $(llog-test-objs:.o=.c) llog_test.c llog_internal.h cl_internal.h diff --git a/lustre/obdclass/capa.c b/lustre/obdclass/capa.c index 421df58..b73386f 100644 --- a/lustre/obdclass/capa.c +++ b/lustre/obdclass/capa.c @@ -113,10 +113,11 
@@ static inline int capa_on_server(struct obd_capa *ocapa) static inline void capa_delete(struct obd_capa *ocapa) { LASSERT(capa_on_server(ocapa)); - hlist_del(&ocapa->u.tgt.c_hash); - list_del(&ocapa->c_list); + hlist_del_init(&ocapa->u.tgt.c_hash); + list_del_init(&ocapa->c_list); capa_count[ocapa->c_site]--; - free_capa(ocapa); + /* release the reference taken at allocation time */ + capa_put(ocapa); } void cleanup_capa_hash(struct hlist_head *hash) @@ -200,7 +201,7 @@ struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) struct list_head *list = &capa_list[CAPA_SITE_SERVER]; ocapa = alloc_capa(CAPA_SITE_SERVER); - if (!ocapa) + if (IS_ERR(ocapa)) return NULL; spin_lock(&capa_lock); @@ -210,25 +211,18 @@ struct obd_capa *capa_add(struct hlist_head *hash, struct lustre_capa *capa) set_capa_expiry(ocapa); hlist_add_head(&ocapa->u.tgt.c_hash, head); list_add_tail(&ocapa->c_list, list); - capa_count[CAPA_SITE_SERVER]++; capa_get(ocapa); - + capa_count[CAPA_SITE_SERVER]++; if (capa_count[CAPA_SITE_SERVER] > CAPA_HASH_SIZE) capa_delete_lru(list); - - DEBUG_CAPA(D_SEC, &ocapa->c_capa, "new"); - spin_unlock(&capa_lock); return ocapa; + } else { + capa_get(old); + spin_unlock(&capa_lock); + capa_put(ocapa); + return old; } - - capa_get(old); - spin_unlock(&capa_lock); - - DEBUG_CAPA(D_SEC, &old->c_capa, "update"); - - free_capa(ocapa); - return old; } struct obd_capa *capa_lookup(struct hlist_head *hash, struct lustre_capa *capa, @@ -278,6 +272,110 @@ int capa_hmac(__u8 *hmac, struct lustre_capa *capa, __u8 *key) return 0; } + +int capa_encrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd = { + .page = virt_to_page(d), + .offset = (unsigned long)(d) % CFS_PAGE_SIZE, + .length = 16, + }; + struct scatterlist ss = { + .page = virt_to_page(s), + .offset = (unsigned long)(s) % CFS_PAGE_SIZE, + .length = 16, + }; + struct blkcipher_desc desc; + unsigned int min; + int rc; + ENTRY; + + tfm = ll_crypto_alloc_blkcipher("aes", 0, 0); + if (tfm == NULL) { + CERROR("failed to load transform for aes\n"); + RETURN(-EFAULT); + } + + min = crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("key length must be at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to set key for aes\n"); + GOTO(out, rc); + } + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc = ll_crypto_blkcipher_encrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to encrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} + +int capa_decrypt_id(__u32 *d, __u32 *s, __u8 *key, int keylen) +{ + struct ll_crypto_cipher *tfm; + struct scatterlist sd = { + .page = virt_to_page(d), + .offset = (unsigned long)(d) % CFS_PAGE_SIZE, + .length = 16, + }; + struct scatterlist ss = { + .page = virt_to_page(s), + .offset = (unsigned long)(s) % CFS_PAGE_SIZE, + .length = 16, + }; + struct blkcipher_desc desc; + unsigned int min; + int rc; + ENTRY; + + tfm = ll_crypto_alloc_blkcipher("aes", 0, 0); + if (tfm == NULL) { + CERROR("failed to load transform for aes\n"); + RETURN(-EFAULT); + } + + min = crypto_tfm_alg_min_keysize(tfm); + if (keylen < min) { + CERROR("key length must be at least %d bits for aes\n", min * 8); + GOTO(out, rc = -EINVAL); + } + + rc = ll_crypto_blkcipher_setkey(tfm, key, min); + if (rc) { + CERROR("failed to set key for aes\n"); + GOTO(out, rc); + } + + desc.tfm = tfm; + desc.info = NULL; + desc.flags = 0; + rc
= ll_crypto_blkcipher_decrypt(&desc, &sd, &ss, 16); + if (rc) { + CERROR("failed to decrypt for aes\n"); + GOTO(out, rc); + } + + EXIT; + +out: + ll_crypto_free_blkcipher(tfm); + return rc; +} #endif void capa_cpy(void *capa, struct obd_capa *ocapa) @@ -287,22 +385,11 @@ void capa_cpy(void *capa, struct obd_capa *ocapa) spin_unlock(&ocapa->c_lock); } -char *dump_capa_content(char *buf, char *key, int len) -{ - int i, n = 0; - - for (i = 0; i < len; i++) - n += sprintf(buf + n, "%02x", (unsigned char) key[i]); - return buf; -} - EXPORT_SYMBOL(init_capa_hash); EXPORT_SYMBOL(cleanup_capa_hash); - EXPORT_SYMBOL(capa_add); EXPORT_SYMBOL(capa_lookup); - EXPORT_SYMBOL(capa_hmac); +EXPORT_SYMBOL(capa_encrypt_id); +EXPORT_SYMBOL(capa_decrypt_id); EXPORT_SYMBOL(capa_cpy); - -EXPORT_SYMBOL(dump_capa_content); diff --git a/lustre/obdclass/cl_internal.h b/lustre/obdclass/cl_internal.h new file mode 100644 index 0000000..578fdc7 --- /dev/null +++ b/lustre/obdclass/cl_internal.h @@ -0,0 +1,97 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal cl interfaces. + * + * Author: Nikita Danilov + */ +#ifndef _CL_INTERNAL_H +#define _CL_INTERNAL_H + +#define CLT_PVEC_SIZE (14) + +/** + * Thread local state internal for generic cl-code. + */ +struct cl_thread_info { + /* + * Common fields. + */ + struct cl_io clt_io; + struct cl_2queue clt_queue; + + /* + * Fields used by cl_lock.c + */ + struct cl_lock_descr clt_descr; + struct cl_page_list clt_list; + /** + * \name debugging. + * + * Counters used to check correctness of cl_lock interface usage. + * @{ + */ + /** + * Number of outstanding calls to cl_lock_mutex_get() made by the + * current thread. For debugging. + */ + int clt_nr_locks_locked; + /** List of locked locks. */ + struct lu_ref clt_locks_locked; + /** Number of outstanding holds on the top-level locks. */ + int clt_nr_held; + /** Number of outstanding uses on the top-level locks. */ + int clt_nr_used; + /** Number of held top-level extent locks. */ + int clt_nr_locks_acquired; + /** @} debugging */ + + /* + * Fields used by cl_page.c + */ + struct cl_page *clt_pvec[CLT_PVEC_SIZE]; + + /* + * Fields used by cl_io.c + */ + /** + * Pointer to the topmost ongoing IO in this thread. 
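The capa_encrypt_id()/capa_decrypt_id() pair added in capa.c above is symmetric: each pushes one 16-byte block through an in-kernel AES transform and they differ only in the final encrypt/decrypt call, with keylen counted in bytes against the cipher's minimum key size. A minimal round-trip sketch of the intended usage; the caller, key, and buffers below are hypothetical, only the two exported helpers come from this patch:

/* Hedged sketch: round-trip a 128-bit identifier through the two
 * capability helpers. demo_capa_roundtrip() and demo_key are
 * hypothetical; on success the decrypted block equals the input. */
static int demo_capa_roundtrip(void)
{
        __u8  demo_key[16] = { 0 };        /* 128-bit AES key (assumed) */
        __u32 id[4] = { 1, 2, 3, 4 };      /* one 16-byte block */
        __u32 enc[4], dec[4];
        int rc;

        rc = capa_encrypt_id(enc, id, demo_key, sizeof(demo_key));
        if (rc == 0)
                rc = capa_decrypt_id(dec, enc, demo_key, sizeof(demo_key));
        return rc ?: memcmp(id, dec, sizeof(id));
}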
+ */ + struct cl_io *clt_current_io; +}; + +struct cl_thread_info *cl_env_info(const struct lu_env *env); + +#endif /* _CL_INTERNAL_H */ diff --git a/lustre/obdclass/cl_io.c b/lustre/obdclass/cl_io.c new file mode 100644 index 0000000..62357e7 --- /dev/null +++ b/lustre/obdclass/cl_io.c @@ -0,0 +1,1625 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client IO. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include +/* lu_time_global_{init,fini}() */ +#include + +#include +#include "cl_internal.h" + +/***************************************************************************** + * + * cl_io interface. + * + */ + +#define cl_io_for_each(slice, io) \ + list_for_each_entry((slice), &io->ci_layers, cis_linkage) +#define cl_io_for_each_reverse(slice, io) \ + list_for_each_entry_reverse((slice), &io->ci_layers, cis_linkage) + +static inline int cl_io_type_is_valid(enum cl_io_type type) +{ + return CIT_READ <= type && type < CIT_OP_NR; +} + +static inline int cl_io_is_loopable(const struct cl_io *io) +{ + return cl_io_type_is_valid(io->ci_type) && io->ci_type != CIT_MISC; +} + +/** + * True, iff \a io is a sendfile(). + */ +int cl_io_is_sendfile(const struct cl_io *io) +{ + return io->ci_type == CIT_READ && io->u.ci_rd.rd_is_sendfile; +} +EXPORT_SYMBOL(cl_io_is_sendfile); + +/** + * Returns true iff there is an IO ongoing in the given environment. + */ +int cl_io_is_going(const struct lu_env *env) +{ + return cl_env_info(env)->clt_current_io != NULL; +} +EXPORT_SYMBOL(cl_io_is_going); + +/** + * cl_io invariant that holds at all times when exported cl_io_*() functions + * are entered and left. + */ +static int cl_io_invariant(const struct cl_io *io) +{ + struct cl_io *up; + + up = io->ci_parent; + return + /* + * io can own pages only when it is ongoing. Sub-io might + * still be in CIS_LOCKED state when top-io is in + * CIS_IO_GOING. + */ + ergo(io->ci_owned_nr > 0, io->ci_state == CIS_IO_GOING || + (io->ci_state == CIS_LOCKED && up != NULL)); +} + +/** + * Finalize \a io, by calling cl_io_operations::cio_fini() bottom-to-top. 
+ */ +void cl_io_fini(const struct lu_env *env, struct cl_io *io) +{ + struct cl_io_slice *slice; + struct cl_thread_info *info; + + LINVRNT(cl_io_type_is_valid(io->ci_type)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + while (!list_empty(&io->ci_layers)) { + slice = container_of(io->ci_layers.next, struct cl_io_slice, + cis_linkage); + list_del_init(&slice->cis_linkage); + if (slice->cis_iop->op[io->ci_type].cio_fini != NULL) + slice->cis_iop->op[io->ci_type].cio_fini(env, slice); + /* + * Invalidate slice to catch use after free. This assumes that + * slices are allocated within session and can be touched + * after ->cio_fini() returns. + */ + slice->cis_io = NULL; + } + io->ci_state = CIS_FINI; + info = cl_env_info(env); + if (info->clt_current_io == io) + info->clt_current_io = NULL; + EXIT; +} +EXPORT_SYMBOL(cl_io_fini); + +static int cl_io_init0(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_object *scan; + int result; + + LINVRNT(io->ci_state == CIS_ZERO || io->ci_state == CIS_FINI); + LINVRNT(cl_io_type_is_valid(iot)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_type = iot; + CFS_INIT_LIST_HEAD(&io->ci_lockset.cls_todo); + CFS_INIT_LIST_HEAD(&io->ci_lockset.cls_curr); + CFS_INIT_LIST_HEAD(&io->ci_lockset.cls_done); + CFS_INIT_LIST_HEAD(&io->ci_layers); + + result = 0; + cl_object_for_each(scan, obj) { + if (scan->co_ops->coo_io_init != NULL) { + result = scan->co_ops->coo_io_init(env, scan, io); + if (result != 0) + break; + } + } + if (result == 0) + io->ci_state = CIS_INIT; + RETURN(result); +} + +/** + * Initialize sub-io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * \pre obj != cl_object_top(obj) + */ +int cl_io_sub_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj != cl_object_top(obj)); + if (info->clt_current_io == NULL) + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_sub_init); + +/** + * Initialize \a io, by calling cl_io_operations::cio_init() top-to-bottom. + * + * Caller has to call cl_io_fini() after a call to cl_io_init(), no matter + * what the latter returned. + * + * \pre obj == cl_object_top(obj) + * \pre cl_io_type_is_valid(iot) + * \post cl_io_type_is_valid(io->ci_type) && io->ci_type == iot + */ +int cl_io_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, struct cl_object *obj) +{ + struct cl_thread_info *info = cl_env_info(env); + + LASSERT(obj == cl_object_top(obj)); + LASSERT(info->clt_current_io == NULL); + + info->clt_current_io = io; + return cl_io_init0(env, io, iot, obj); +} +EXPORT_SYMBOL(cl_io_init); + +/** + * Initialize read or write io. 
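cl_io_init() and cl_io_fini() bracket every io, and the rule stated above is load-bearing: cl_io_fini() must run even when cl_io_init() failed, because initialization may already have attached per-layer slices that need tearing down. A condensed sketch of the calling convention, assuming env and obj are supplied by the caller and using the per-thread cl_io that cl_env_info() exposes:

/* Hedged usage sketch of the io lifecycle; demo_read() is
 * hypothetical, the functions it calls are the ones defined above. */
static int demo_read(const struct lu_env *env, struct cl_object *obj,
                     loff_t pos, size_t count)
{
        struct cl_io *io = &cl_env_info(env)->clt_io;
        int rc;

        io->ci_obj = obj;
        rc = cl_io_rw_init(env, io, CIT_READ, pos, count);
        if (rc == 0)
                rc = cl_io_loop(env, io);   /* iterate until done */
        cl_io_fini(env, io);                /* mandatory even on error */
        return rc;
}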
+ * + * \pre iot == CIT_READ || iot == CIT_WRITE + */ +int cl_io_rw_init(const struct lu_env *env, struct cl_io *io, + enum cl_io_type iot, loff_t pos, size_t count) +{ + LINVRNT(iot == CIT_READ || iot == CIT_WRITE); + LINVRNT(io->ci_obj != NULL); + ENTRY; + + LU_OBJECT_HEADER(D_VFSTRACE, env, &io->ci_obj->co_lu, + "io range: %i [%llu, %llu) %i %i\n", + iot, (__u64)pos, (__u64)pos + count, + io->u.ci_rw.crw_nonblock, io->u.ci_wr.wr_append); + io->u.ci_rw.crw_pos = pos; + io->u.ci_rw.crw_count = count; + RETURN(cl_io_init(env, io, iot, io->ci_obj)); +} +EXPORT_SYMBOL(cl_io_rw_init); + +static inline const struct lu_fid * +cl_lock_descr_fid(const struct cl_lock_descr *descr) +{ + return lu_object_fid(&descr->cld_obj->co_lu); +} + +static int cl_lock_descr_cmp(const struct cl_lock_descr *d0, + const struct cl_lock_descr *d1) +{ + return lu_fid_cmp(cl_lock_descr_fid(d0), cl_lock_descr_fid(d1)) ?: + __diff_normalize(d0->cld_start, d1->cld_start); +} + +/* + * Sort locks in lexicographical order of their (fid, start-offset) pairs. + */ +static void cl_io_locks_sort(struct cl_io *io) +{ + int done = 0; + + ENTRY; + /* hidden treasure: bubble sort for now. */ + do { + struct cl_io_lock_link *curr; + struct cl_io_lock_link *prev; + struct cl_io_lock_link *temp; + + done = 1; + prev = NULL; + + list_for_each_entry_safe(curr, temp, &io->ci_lockset.cls_todo, + cill_linkage) { + if (prev != NULL) { + switch (cl_lock_descr_cmp(&prev->cill_descr, + &curr->cill_descr)) { + case 0: + /* + * IMPOSSIBLE: Identical locks are + * already removed at + * this point. + */ + default: + LBUG(); + case +1: + list_move_tail(&curr->cill_linkage, + &prev->cill_linkage); + done = 0; + continue; /* don't change prev: it's + * still "previous" */ + case -1: /* already in order */ + break; + } + } + prev = curr; + } + } while (!done); + EXIT; +} + +/** + * Check whether \a queue contains locks matching \a need. + * + * \retval +ve there is a matching lock in the \a queue + * \retval 0 there are no matching locks in the \a queue + */ +int cl_queue_match(const struct list_head *queue, + const struct cl_lock_descr *need) +{ + struct cl_io_lock_link *scan; + + ENTRY; + list_for_each_entry(scan, queue, cill_linkage) { + if (cl_lock_descr_match(&scan->cill_descr, need)) + RETURN(+1); + } + return 0; +} +EXPORT_SYMBOL(cl_queue_match); + +static int cl_lockset_match(const struct cl_lockset *set, + const struct cl_lock_descr *need, int all_queues) +{ + return (all_queues ? 
cl_queue_match(&set->cls_todo, need) : 0) || + cl_queue_match(&set->cls_curr, need) || + cl_queue_match(&set->cls_done, need); +} + +static int cl_lockset_lock_one(const struct lu_env *env, + struct cl_io *io, struct cl_lockset *set, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock; + int result; + + ENTRY; + + lock = cl_lock_request(env, io, &link->cill_descr, link->cill_enq_flags, + "io", io); + if (!IS_ERR(lock)) { + link->cill_lock = lock; + list_move(&link->cill_linkage, &set->cls_curr); + if (!(link->cill_enq_flags & CEF_ASYNC)) { + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, &set->cls_done); + } else + result = 0; + } else + result = PTR_ERR(lock); + RETURN(result); +} + +static void cl_lock_link_fini(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + struct cl_lock *lock = link->cill_lock; + + ENTRY; + list_del_init(&link->cill_linkage); + if (lock != NULL) { + cl_lock_release(env, lock, "io", io); + link->cill_lock = NULL; + } + if (link->cill_fini != NULL) + link->cill_fini(env, link); + EXIT; +} + +static int cl_lockset_lock(const struct lu_env *env, struct cl_io *io, + struct cl_lockset *set) +{ + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + struct cl_lock *lock; + int result; + + ENTRY; + result = 0; + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) { + if (!cl_lockset_match(set, &link->cill_descr, 0)) { + /* XXX some locking to guarantee that locks aren't + * expanded in between. */ + result = cl_lockset_lock_one(env, io, set, link); + if (result != 0) + break; + } else + cl_lock_link_fini(env, io, link); + } + if (result == 0) { + list_for_each_entry_safe(link, temp, + &set->cls_curr, cill_linkage) { + lock = link->cill_lock; + result = cl_wait(env, lock); + if (result == 0) + list_move(&link->cill_linkage, &set->cls_done); + else + break; + } + } + RETURN(result); +} + +/** + * Takes locks necessary for the current iteration of io. + * + * Calls cl_io_operations::cio_lock() top-to-bottom to collect locks required + * by layers for the current iteration. Then sorts locks (to avoid deadlocks) + * and acquires them. + */ +int cl_io_lock(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IT_STARTED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_lock == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_lock(env, scan); + if (result != 0) + break; + } + if (result == 0) { + cl_io_locks_sort(io); + result = cl_lockset_lock(env, io, &io->ci_lockset); + } + if (result != 0) + cl_io_unlock(env, io); + else + io->ci_state = CIS_LOCKED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock); + +/** + * Releases locks taken by io.
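The scheme above is the classical deadlock-avoidance protocol: collect every needed lock on cls_todo first, sort the set into a global (fid, start-offset) order with cl_io_locks_sort(), then acquire in that order, so two threads needing overlapping sets always lock them in the same sequence. A self-contained restatement of the ordering rule, with a simplified descriptor standing in for cl_lock_descr:

#include <stdio.h>

/* Simplified stand-in for cl_lock_descr; illustration only. */
struct demo_descr { unsigned long long fid; unsigned long long start; };

/* Same rule as cl_lock_descr_cmp(): fid first, then start offset. */
static int demo_descr_cmp(const struct demo_descr *d0,
                          const struct demo_descr *d1)
{
        if (d0->fid != d1->fid)
                return d0->fid < d1->fid ? -1 : +1;
        if (d0->start != d1->start)
                return d0->start < d1->start ? -1 : +1;
        return 0;
}

int main(void)
{
        struct demo_descr a = { .fid = 7, .start = 4096 };
        struct demo_descr b = { .fid = 7, .start = 0 };

        /* b orders before a: every thread must lock offset 0 before
         * offset 4096, so no two threads can wait on each other. */
        printf("%d\n", demo_descr_cmp(&a, &b));   /* prints 1 */
        return 0;
}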
+ */ +void cl_io_unlock(const struct lu_env *env, struct cl_io *io) +{ + struct cl_lockset *set; + struct cl_io_lock_link *link; + struct cl_io_lock_link *temp; + const struct cl_io_slice *scan; + + LASSERT(cl_io_is_loopable(io)); + LASSERT(CIS_IT_STARTED <= io->ci_state && io->ci_state < CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + set = &io->ci_lockset; + + list_for_each_entry_safe(link, temp, &set->cls_todo, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_curr, cill_linkage) + cl_lock_link_fini(env, io, link); + + list_for_each_entry_safe(link, temp, &set->cls_done, cill_linkage) { + cl_unuse(env, link->cill_lock); + cl_lock_link_fini(env, io, link); + } + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_unlock != NULL) + scan->cis_iop->op[io->ci_type].cio_unlock(env, scan); + } + io->ci_state = CIS_UNLOCKED; + LASSERT(cl_env_info(env)->clt_nr_locks_acquired == 0); + EXIT; +} +EXPORT_SYMBOL(cl_io_unlock); + +/** + * Prepares next iteration of io. + * + * Calls cl_io_operations::cio_iter_init() top-to-bottom. This exists to give + * layers a chance to modify io parameters, e.g., so that lov can restrict io + * to a single stripe. + */ +int cl_io_iter_init(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_INIT || io->ci_state == CIS_IT_ENDED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + result = 0; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_init == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_iter_init(env, + scan); + if (result != 0) + break; + } + if (result == 0) + io->ci_state = CIS_IT_STARTED; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_iter_init); + +/** + * Finalizes io iteration. + * + * Calls cl_io_operations::cio_iter_fini() bottom-to-top. + */ +void cl_io_iter_fini(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_UNLOCKED); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_iter_fini != NULL) + scan->cis_iop->op[io->ci_type].cio_iter_fini(env, scan); + } + io->ci_state = CIS_IT_ENDED; + EXIT; +} +EXPORT_SYMBOL(cl_io_iter_fini); + +/** + * Records that read or write io progressed \a nob bytes forward. + */ +void cl_io_rw_advance(const struct lu_env *env, struct cl_io *io, size_t nob) +{ + const struct cl_io_slice *scan; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_WRITE || + nob == 0); + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(cl_io_invariant(io)); + + ENTRY; + + io->u.ci_rw.crw_pos += nob; + io->u.ci_rw.crw_count -= nob; + + /* layers have to be notified. */ + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_advance != NULL) + scan->cis_iop->op[io->ci_type].cio_advance(env, scan, + nob); + } + EXIT; +} +EXPORT_SYMBOL(cl_io_rw_advance); + +/** + * Adds a lock to a lockset. 
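cl_io_lock_add() (just below) is the primitive a layer's cio_lock() method uses to declare the extents the current iteration needs; cl_lockset_match() filters duplicates so an extent already queued or held is not enqueued twice. A hedged sketch of a layer method requesting a whole-file read lock; the demo layer, and CL_PAGE_EOF as a whole-file end marker, are assumptions rather than part of this patch:

/* Hedged sketch of a cl_io_operations::cio_lock() method. */
static int demo_io_lock(const struct lu_env *env,
                        const struct cl_io_slice *ios)
{
        struct cl_lock_descr descr = {
                .cld_obj   = ios->cis_obj,
                .cld_mode  = CLM_READ,
                .cld_start = 0,
                .cld_end   = CL_PAGE_EOF,  /* assumed whole-file constant */
        };
        int rc;

        rc = cl_io_lock_alloc_add(env, ios->cis_io, &descr);
        /* +1 means an equivalent lock was already queued: not an error */
        return rc > 0 ? 0 : rc;
}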
+ */ +int cl_io_lock_add(const struct lu_env *env, struct cl_io *io, + struct cl_io_lock_link *link) +{ + int result; + + ENTRY; + if (cl_lockset_match(&io->ci_lockset, &link->cill_descr, 1)) + result = +1; + else { + list_add(&link->cill_linkage, &io->ci_lockset.cls_todo); + result = 0; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_add); + +static void cl_free_io_lock_link(const struct lu_env *env, + struct cl_io_lock_link *link) +{ + OBD_FREE_PTR(link); +} + +/** + * Allocates new lock link, and uses it to add a lock to a lockset. + */ +int cl_io_lock_alloc_add(const struct lu_env *env, struct cl_io *io, + struct cl_lock_descr *descr) +{ + struct cl_io_lock_link *link; + int result; + + ENTRY; + OBD_ALLOC_PTR(link); + if (link != NULL) { + link->cill_descr = *descr; + link->cill_fini = cl_free_io_lock_link; + result = cl_io_lock_add(env, io, link); + if (result) /* lock match */ + link->cill_fini(env, link); + } else + result = -ENOMEM; + + RETURN(result); +} +EXPORT_SYMBOL(cl_io_lock_alloc_add); + +/** + * Starts io by calling cl_io_operations::cio_start() top-to-bottom. + */ +int cl_io_start(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + io->ci_state = CIS_IO_GOING; + cl_io_for_each(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_start == NULL) + continue; + result = scan->cis_iop->op[io->ci_type].cio_start(env, scan); + if (result != 0) + break; + } + if (result >= 0) + result = 0; + RETURN(result); +} +EXPORT_SYMBOL(cl_io_start); + +/** + * Wait until current io iteration is finished by calling + * cl_io_operations::cio_end() bottom-to-top. + */ +void cl_io_end(const struct lu_env *env, struct cl_io *io) +{ + const struct cl_io_slice *scan; + + LINVRNT(cl_io_is_loopable(io)); + LINVRNT(io->ci_state == CIS_IO_GOING); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->op[io->ci_type].cio_end != NULL) + scan->cis_iop->op[io->ci_type].cio_end(env, scan); + /* TODO: error handling. */ + } + io->ci_state = CIS_IO_FINISHED; + EXIT; +} +EXPORT_SYMBOL(cl_io_end); + +static const struct cl_page_slice * +cl_io_slice_page(const struct cl_io_slice *ios, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, ios->cis_obj->co_lu.lo_dev->ld_type); + LINVRNT(slice != NULL); + return slice; +} + +/** + * True iff \a page is within \a io range. + */ +static int cl_page_in_io(const struct cl_page *page, const struct cl_io *io) +{ + int result; + loff_t start; + loff_t end; + pgoff_t idx; + + idx = page->cp_index; + switch (io->ci_type) { + case CIT_READ: + case CIT_WRITE: + /* + * check that [start, end) and [pos, pos + count) extents + * overlap. + */ + start = cl_offset(page->cp_obj, idx); + end = cl_offset(page->cp_obj, idx + 1); + result = io->u.ci_rw.crw_pos < end && + start < io->u.ci_rw.crw_pos + io->u.ci_rw.crw_count; + break; + case CIT_FAULT: + result = io->u.ci_fault.ft_index == idx; + break; + default: + LBUG(); + } + return result; +} + +/** + * Called by read io, when page has to be read from the server. 
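For CIT_READ/CIT_WRITE, cl_page_in_io() above reduces to a half-open interval intersection test: page [start, end) overlaps io [pos, pos + count) iff each range starts before the other ends. A runnable restatement with sample values:

#include <assert.h>

/* Same predicate cl_page_in_io() applies to read/write io: two
 * half-open byte ranges intersect iff each starts before the other
 * ends. */
static int demo_overlap(long long s0, long long e0,
                        long long s1, long long e1)
{
        return s1 < e0 && s0 < e1;
}

int main(void)
{
        /* a 4096-byte page at offset 8192 vs. io over [8000, 8200) */
        assert(demo_overlap(8192, 12288, 8000, 8200));
        /* the same page vs. io ending exactly where the page starts */
        assert(!demo_overlap(8192, 12288, 4096, 8192));
        return 0;
}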
+ * + * \see cl_io_operations::cio_read_page() + */ +int cl_io_read_page(const struct lu_env *env, struct cl_io *io, + struct cl_page *page) +{ + const struct cl_io_slice *scan; + struct cl_2queue *queue; + int result = 0; + + LINVRNT(io->ci_type == CIT_READ || io->ci_type == CIT_FAULT); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_page_in_io(page, io)); + LINVRNT(cl_io_invariant(io)); + ENTRY; + + queue = &io->ci_queue; + + cl_2queue_init(queue); + /* + * ->cio_read_page() methods called in the loop below are supposed to + * never block waiting for network (the only subtle point is the + * creation of new pages for read-ahead that might result in cache + * shrinking, but currently only clean pages are shrunk and this + * requires no network io). + * + * Should this ever starts blocking, retry loop would be needed for + * "parallel io" (see CLO_REPEAT loops in cl_lock.c). + */ + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_read_page != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + LINVRNT(slice != NULL); + result = scan->cis_iop->cio_read_page(env, scan, slice); + if (result != 0) + break; + } + } + if (result == 0) + result = cl_io_submit_rw(env, io, CRT_READ, queue); + /* + * Unlock unsent pages in case of error. + */ + cl_page_list_disown(env, io, &queue->c2_qin); + cl_2queue_fini(env, queue); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_read_page); + +/** + * Called by write io to prepare page to receive data from user buffer. + * + * \see cl_io_operations::cio_prepare_write() + */ +int cl_io_prepare_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(cl_page_is_owned(page, io)); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each_reverse(scan, io) { + if (scan->cis_iop->cio_prepare_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_prepare_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_io_prepare_write); + +/** + * Called by write io after user data were copied into a page. + * + * \see cl_io_operations::cio_commit_write() + */ +int cl_io_commit_write(const struct lu_env *env, struct cl_io *io, + struct cl_page *page, unsigned from, unsigned to) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(io->ci_type == CIT_WRITE); + LINVRNT(io->ci_state == CIS_IO_GOING || io->ci_state == CIS_LOCKED); + LINVRNT(cl_io_invariant(io)); + /* + * XXX Uh... not nice. Top level cl_io_commit_write() call (vvp->lov) + * already called cl_page_cache_add(), moving page into CPS_CACHED + * state. Better (and more general) way of dealing with such situation + * is needed. 
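cl_io_prepare_write() runs its layer methods bottom-to-top to ready a page for a partial [from, to) update, while cl_io_commit_write() (continued below) runs top-to-bottom once the data is in place, mirroring the kernel's ->prepare_write()/->commit_write() pairing. A hedged sketch of how a write path might drive the pair, with the user-copy step elided:

/* Hedged sketch of the prepare/commit pairing for one page. */
static int demo_write_page(const struct lu_env *env, struct cl_io *io,
                           struct cl_page *page, unsigned from, unsigned to)
{
        int rc;

        rc = cl_io_prepare_write(env, io, page, from, to);  /* bottom-up */
        if (rc == 0) {
                /* ... copy user data into the page here ... */
                rc = cl_io_commit_write(env, io, page, from, to); /* top-down */
        }
        return rc;
}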
+ */ + LASSERT(cl_page_is_owned(page, io) || page->cp_parent != NULL); + LASSERT(cl_page_in_io(page, io)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->cio_commit_write != NULL) { + const struct cl_page_slice *slice; + + slice = cl_io_slice_page(scan, page); + result = scan->cis_iop->cio_commit_write(env, scan, + slice, + from, to); + if (result != 0) + break; + } + } + LINVRNT(result <= 0); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_commit_write); + +/** + * Submits a list of pages for immediate io. + * + * After the function returns, the submitted pages are moved to the + * queue->c2_qout queue, and queue->c2_qin contains both the pages that did + * not need to be submitted and the pages that failed to submit. + * + * \returns 0 if at least one page was submitted, error code otherwise. + * \see cl_io_operations::cio_submit() + */ +int cl_io_submit_rw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type crt, struct cl_2queue *queue) +{ + const struct cl_io_slice *scan; + int result = 0; + + LINVRNT(crt < ARRAY_SIZE(scan->cis_iop->req_op)); + ENTRY; + + cl_io_for_each(scan, io) { + if (scan->cis_iop->req_op[crt].cio_submit == NULL) + continue; + result = scan->cis_iop->req_op[crt].cio_submit(env, scan, crt, + queue); + if (result != 0) + break; + } + /* + * If ->cio_submit() failed, no pages were sent. + */ + LASSERT(ergo(result != 0, list_empty(&queue->c2_qout.pl_pages))); + RETURN(result); +} +EXPORT_SYMBOL(cl_io_submit_rw); + +/** + * Cancel an IO which has been submitted by cl_io_submit_rw. + */ +int cl_io_cancel(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue) +{ + struct cl_page *page; + int result = 0; + + CERROR("Canceling ongoing page transmission\n"); + cl_page_list_for_each(page, queue) { + int rc; + + LINVRNT(cl_page_in_io(page, io)); + rc = cl_page_cancel(env, page); + result = result ?: rc; + } + return result; +} +EXPORT_SYMBOL(cl_io_cancel); + +/** + * Main io loop. + * + * Pumps io through iterations calling + * + * - cl_io_iter_init() + * + * - cl_io_lock() + * + * - cl_io_start() + * + * - cl_io_end() + * + * - cl_io_unlock() + * + * - cl_io_iter_fini() + * + * repeatedly until there is no more io to do. + */ +int cl_io_loop(const struct lu_env *env, struct cl_io *io) +{ + int result = 0; + + LINVRNT(cl_io_is_loopable(io)); + ENTRY; + + do { + size_t nob; + + io->ci_continue = 0; + result = cl_io_iter_init(env, io); + if (result == 0) { + nob = io->ci_nob; + result = cl_io_lock(env, io); + if (result == 0) { + /* + * Notify layers that locks have been taken, + * and do actual i/o. + * + * - llite: kms, short read; + * - llite: generic_file_read(); + */ + result = cl_io_start(env, io); + /* + * Send any remaining pending + * io, etc. + * + * - llite: ll_rw_stats_tally. + */ + cl_io_end(env, io); + cl_io_unlock(env, io); + cl_io_rw_advance(env, io, io->ci_nob - nob); + } + } + cl_io_iter_fini(env, io); + } while (result == 0 && io->ci_continue); + RETURN(result < 0 ? result : 0); +} +EXPORT_SYMBOL(cl_io_loop); + +/** + * Adds io slice to the cl_io. + * + * This is called by cl_object_operations::coo_io_init() methods to add a + * per-layer state to the io. New state is added at the end of + * cl_io::ci_layers list, that is, it is at the bottom of the stack.
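cl_io_slice_add() (whose body follows) is what each layer calls from its coo_io_init() method to hang per-layer io state off the compound cl_io; because slices are appended at the tail of ci_layers, iteration order matches device-stack order, which is what gives the top-to-bottom and bottom-to-top semantics used throughout this file. Sketch of a layer's coo_io_init(); the demo structures and the per-thread lookup helper are hypothetical:

/* Hypothetical per-layer io state embedding the generic slice. */
struct demo_io {
        struct cl_io_slice di_cl;     /* linked into cl_io::ci_layers */
        int                di_private;
};

static const struct cl_io_operations demo_io_ops; /* assumed filled in */

static struct demo_io *demo_env_io(const struct lu_env *env);
                                   /* hypothetical per-thread storage */

/* Hedged sketch of a cl_object_operations::coo_io_init() method. */
static int demo_io_init(const struct lu_env *env,
                        struct cl_object *obj, struct cl_io *io)
{
        struct demo_io *dio = demo_env_io(env);

        dio->di_private = 0;
        cl_io_slice_add(io, &dio->di_cl, obj, &demo_io_ops);
        return 0;
}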
+ * + * \see cl_lock_slice_add(), cl_req_slice_add(), cl_page_slice_add() + */ +void cl_io_slice_add(struct cl_io *io, struct cl_io_slice *slice, + struct cl_object *obj, + const struct cl_io_operations *ops) +{ + struct list_head *linkage = &slice->cis_linkage; + + LASSERT((linkage->prev == NULL && linkage->next == NULL) || + list_empty(linkage)); + ENTRY; + + list_add_tail(linkage, &io->ci_layers); + slice->cis_io = io; + slice->cis_obj = obj; + slice->cis_iop = ops; + EXIT; +} +EXPORT_SYMBOL(cl_io_slice_add); + + +/** + * Initializes page list. + */ +void cl_page_list_init(struct cl_page_list *plist) +{ + ENTRY; + plist->pl_nr = 0; + CFS_INIT_LIST_HEAD(&plist->pl_pages); + plist->pl_owner = cfs_current(); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_init); + +/** + * Adds a page to a page list. + */ +void cl_page_list_add(struct cl_page_list *plist, struct cl_page *page) +{ + ENTRY; + /* it would be better to check that page is owned by "current" io, but + * it is not passed here. */ + LASSERT(page->cp_owner != NULL); + LINVRNT(plist->pl_owner == cfs_current()); + + lockdep_off(); + mutex_lock(&page->cp_mutex); + lockdep_on(); + LASSERT(list_empty(&page->cp_batch)); + list_add_tail(&page->cp_batch, &plist->pl_pages); + ++plist->pl_nr; + page->cp_queue_ref = lu_ref_add(&page->cp_reference, "queue", plist); + cl_page_get(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_add); + +/** + * Removes a page from a page list. + */ +void cl_page_list_del(const struct lu_env *env, + struct cl_page_list *plist, struct cl_page *page) +{ + LASSERT(plist->pl_nr > 0); + LINVRNT(plist->pl_owner == cfs_current()); + + ENTRY; + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + lu_ref_del_at(&page->cp_reference, page->cp_queue_ref, "queue", plist); + cl_page_put(env, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_del); + +/** + * Moves a page from one page list to another. + */ +void cl_page_list_move(struct cl_page_list *dst, struct cl_page_list *src, + struct cl_page *page) +{ + LASSERT(src->pl_nr > 0); + LINVRNT(dst->pl_owner == cfs_current()); + LINVRNT(src->pl_owner == cfs_current()); + + ENTRY; + list_move_tail(&page->cp_batch, &dst->pl_pages); + --src->pl_nr; + ++dst->pl_nr; + lu_ref_set_at(&page->cp_reference, + page->cp_queue_ref, "queue", src, dst); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_move); + +/** + * splice the cl_page_list, just as list head does + */ +void cl_page_list_splice(struct cl_page_list *list, struct cl_page_list *head) +{ + struct cl_page *page; + struct cl_page *tmp; + + LINVRNT(list->pl_owner == cfs_current()); + LINVRNT(head->pl_owner == cfs_current()); + + ENTRY; + cl_page_list_for_each_safe(page, tmp, list) + cl_page_list_move(head, list, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_splice); + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg); + +/** + * Disowns pages in a queue. + */ +void cl_page_list_disown(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == cfs_current()); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) { + LASSERT(plist->pl_nr > 0); + + list_del_init(&page->cp_batch); + lockdep_off(); + mutex_unlock(&page->cp_mutex); + lockdep_on(); + --plist->pl_nr; + /* + * cl_page_disown0 rather than usual cl_page_disown() is used, + * because pages are possibly in CPS_FREEING state already due + * to the call to cl_page_list_discard(). 
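The cl_page_list primitives above implement reference-counted page batching: every cl_page_list_add() pins the page (a cl_page_get() plus a lu_ref record, with the page's cp_mutex held while batched), every del/disown path releases it, and pl_owner asserts that only the creating thread touches the list. Minimal usage sketch, assuming p0 and p1 are pages the calling io already owns:

/* Hedged sketch: batch two owned pages, then release the batch. */
static void demo_batch(const struct lu_env *env,
                       struct cl_page *p0, struct cl_page *p1)
{
        struct cl_page_list plist;

        cl_page_list_init(&plist);       /* pl_owner = current thread */
        cl_page_list_add(&plist, p0);    /* takes a reference on p0 */
        cl_page_list_add(&plist, p1);
        /* ... operate on the batch ... */
        cl_page_list_fini(env, &plist);  /* drops the refs, pl_nr -> 0 */
}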
+ */ + /* + * XXX cl_page_disown0() will fail if page is not locked. + */ + cl_page_disown0(env, io, page); + lu_ref_del(&page->cp_reference, "queue", plist); + cl_page_put(env, page); + } + EXIT; +} +EXPORT_SYMBOL(cl_page_list_disown); + +/** + * Releases pages from queue. + */ +void cl_page_list_fini(const struct lu_env *env, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + + LINVRNT(plist->pl_owner == cfs_current()); + + ENTRY; + cl_page_list_for_each_safe(page, temp, plist) + cl_page_list_del(env, plist, page); + LASSERT(plist->pl_nr == 0); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_fini); + +/** + * Owns all pages in a queue. + */ +int cl_page_list_own(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + struct cl_page *temp; + int result; + + LINVRNT(plist->pl_owner == cfs_current()); + + ENTRY; + result = 0; + cl_page_list_for_each_safe(page, temp, plist) { + if (cl_page_own(env, io, page) == 0) + result = result ?: page->cp_error; + else + cl_page_list_del(env, plist, page); + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_own); + +/** + * Assumes all pages in a queue. + */ +void cl_page_list_assume(const struct lu_env *env, + struct cl_io *io, struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == cfs_current()); + + cl_page_list_for_each(page, plist) + cl_page_assume(env, io, page); +} +EXPORT_SYMBOL(cl_page_list_assume); + +/** + * Discards all pages in a queue. + */ +void cl_page_list_discard(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + + LINVRNT(plist->pl_owner == cfs_current()); + ENTRY; + cl_page_list_for_each(page, plist) + cl_page_discard(env, io, page); + EXIT; +} +EXPORT_SYMBOL(cl_page_list_discard); + +/** + * Unmaps all pages in a queue from user virtual memory. + */ +int cl_page_list_unmap(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *plist) +{ + struct cl_page *page; + int result; + + LINVRNT(plist->pl_owner == cfs_current()); + ENTRY; + result = 0; + cl_page_list_for_each(page, plist) { + result = cl_page_unmap(env, io, page); + if (result != 0) + break; + } + RETURN(result); +} +EXPORT_SYMBOL(cl_page_list_unmap); + +/** + * Initialize dual page queue. + */ +void cl_2queue_init(struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_init(&queue->c2_qin); + cl_page_list_init(&queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init); + +/** + * Add a page to the incoming page list of 2-queue. + */ +void cl_2queue_add(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_page_list_add(&queue->c2_qin, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_add); + +/** + * Disown pages in both lists of a 2-queue. + */ +void cl_2queue_disown(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_disown(env, io, &queue->c2_qin); + cl_page_list_disown(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_disown); + +/** + * Discard (truncate) pages in both lists of a 2-queue. 
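A cl_2queue makes transfer submission transactional: the caller loads c2_qin, cl_io_submit_rw() migrates whatever was actually sent to c2_qout, and anything left on c2_qin must be disowned by the caller, exactly as cl_io_read_page() above does. Condensed sketch of that pattern for a single page, using cl_2queue_init_page() defined just below:

/* Hedged sketch of the qin/qout discipline around cl_io_submit_rw(). */
static int demo_submit_one(const struct lu_env *env, struct cl_io *io,
                           struct cl_page *page)
{
        struct cl_2queue *queue = &io->ci_queue;
        int rc;

        cl_2queue_init_page(queue, page);      /* page -> c2_qin */
        rc = cl_io_submit_rw(env, io, CRT_READ, queue);
        /* sent pages moved to c2_qout; disown whatever remains queued */
        cl_page_list_disown(env, io, &queue->c2_qin);
        cl_2queue_fini(env, queue);
        return rc;
}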
+ */ +void cl_2queue_discard(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_discard(env, io, &queue->c2_qin); + cl_page_list_discard(env, io, &queue->c2_qout); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_discard); + +/** + * Assume to own the pages in cl_2queue + */ +void cl_2queue_assume(const struct lu_env *env, + struct cl_io *io, struct cl_2queue *queue) +{ + cl_page_list_assume(env, io, &queue->c2_qin); + cl_page_list_assume(env, io, &queue->c2_qout); +} +EXPORT_SYMBOL(cl_2queue_assume); + +/** + * Finalize both page lists of a 2-queue. + */ +void cl_2queue_fini(const struct lu_env *env, struct cl_2queue *queue) +{ + ENTRY; + cl_page_list_fini(env, &queue->c2_qout); + cl_page_list_fini(env, &queue->c2_qin); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_fini); + +/** + * Initialize a 2-queue to contain \a page in its incoming page list. + */ +void cl_2queue_init_page(struct cl_2queue *queue, struct cl_page *page) +{ + ENTRY; + cl_2queue_init(queue); + cl_2queue_add(queue, page); + EXIT; +} +EXPORT_SYMBOL(cl_2queue_init_page); + +/** + * Returns top-level io. + * + * \see cl_object_top(), cl_page_top(). + */ +struct cl_io *cl_io_top(struct cl_io *io) +{ + ENTRY; + while (io->ci_parent != NULL) + io = io->ci_parent; + RETURN(io); +} +EXPORT_SYMBOL(cl_io_top); + +/** + * Prints human readable representation of \a io to the \a f. + */ +void cl_io_print(const struct lu_env *env, void *cookie, + lu_printer_t printer, const struct cl_io *io) +{ +} + +/** + * Adds request slice to the compound request. + * + * This is called by cl_device_operations::cdo_req_init() methods to add a + * per-layer state to the request. New state is added at the end of + * cl_req::crq_layers list, that is, it is at the bottom of the stack. + * + * \see cl_lock_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_req_slice_add(struct cl_req *req, struct cl_req_slice *slice, + struct cl_device *dev, + const struct cl_req_operations *ops) +{ + ENTRY; + list_add_tail(&slice->crs_linkage, &req->crq_layers); + slice->crs_dev = dev; + slice->crs_ops = ops; + slice->crs_req = req; + EXIT; +} +EXPORT_SYMBOL(cl_req_slice_add); + +static void cl_req_free(const struct lu_env *env, struct cl_req *req) +{ + unsigned i; + + LASSERT(list_empty(&req->crq_pages)); + LASSERT(req->crq_nrpages == 0); + LINVRNT(list_empty(&req->crq_layers)); + LINVRNT(equi(req->crq_nrobjs > 0, req->crq_o != NULL)); + ENTRY; + + if (req->crq_o != NULL) { + for (i = 0; i < req->crq_nrobjs; ++i) { + struct cl_object *obj = req->crq_o[i].ro_obj; + if (obj != NULL) { + lu_object_ref_del_at(&obj->co_lu, + req->crq_o[i].ro_obj_ref, + "cl_req", req); + cl_object_put(env, obj); + } + } + OBD_FREE(req->crq_o, req->crq_nrobjs * sizeof req->crq_o[0]); + } + OBD_FREE_PTR(req); + EXIT; +} + +static int cl_req_init(const struct lu_env *env, struct cl_req *req, + struct cl_page *page) +{ + struct cl_device *dev; + struct cl_page_slice *slice; + int result; + + ENTRY; + result = 0; + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + dev = lu2cl_dev(slice->cpl_obj->co_lu.lo_dev); + if (dev->cd_ops->cdo_req_init != NULL) { + result = dev->cd_ops->cdo_req_init(env, + dev, req); + if (result != 0) + break; + } + } + page = page->cp_child; + } while (page != NULL && result == 0); + RETURN(result); +} + +/** + * Invokes per-request transfer completion call-backs + * (cl_req_operations::cro_completion()) bottom-to-top. 
+ */ +void cl_req_completion(const struct lu_env *env, struct cl_req *req, int rc) +{ + struct cl_req_slice *slice; + + ENTRY; + /* + * for the lack of list_for_each_entry_reverse_safe()... + */ + while (!list_empty(&req->crq_layers)) { + slice = list_entry(req->crq_layers.prev, + struct cl_req_slice, crs_linkage); + list_del_init(&slice->crs_linkage); + if (slice->crs_ops->cro_completion != NULL) + slice->crs_ops->cro_completion(env, slice, rc); + } + cl_req_free(env, req); + EXIT; +} +EXPORT_SYMBOL(cl_req_completion); + +/** + * Allocates new transfer request. + */ +struct cl_req *cl_req_alloc(const struct lu_env *env, struct cl_page *page, + enum cl_req_type crt, int nr_objects) +{ + struct cl_req *req; + + LINVRNT(nr_objects > 0); + ENTRY; + + OBD_ALLOC_PTR(req); + if (req != NULL) { + int result; + + OBD_ALLOC(req->crq_o, nr_objects * sizeof req->crq_o[0]); + if (req->crq_o != NULL) { + req->crq_nrobjs = nr_objects; + req->crq_type = crt; + CFS_INIT_LIST_HEAD(&req->crq_pages); + CFS_INIT_LIST_HEAD(&req->crq_layers); + result = cl_req_init(env, req, page); + } else + result = -ENOMEM; + if (result != 0) { + cl_req_completion(env, req, result); + req = ERR_PTR(result); + } + } else + req = ERR_PTR(-ENOMEM); + RETURN(req); +} +EXPORT_SYMBOL(cl_req_alloc); + +/** + * Adds a page to a request. + */ +void cl_req_page_add(const struct lu_env *env, + struct cl_req *req, struct cl_page *page) +{ + struct cl_object *obj; + struct cl_req_obj *rqo; + int i; + + ENTRY; + page = cl_page_top(page); + + LINVRNT(cl_page_is_vmlocked(env, page)); + LASSERT(list_empty(&page->cp_flight)); + LASSERT(page->cp_req == NULL); + + list_add_tail(&page->cp_flight, &req->crq_pages); + ++req->crq_nrpages; + page->cp_req = req; + obj = cl_object_top(page->cp_obj); + for (i = 0, rqo = req->crq_o; obj != rqo->ro_obj; ++i, ++rqo) { + if (rqo->ro_obj == NULL) { + rqo->ro_obj = obj; + cl_object_get(obj); + rqo->ro_obj_ref = lu_object_ref_add(&obj->co_lu, + "cl_req", req); + break; + } + } + LASSERT(i < req->crq_nrobjs); + EXIT; +} +EXPORT_SYMBOL(cl_req_page_add); + +/** + * Removes a page from a request. + */ +void cl_req_page_done(const struct lu_env *env, struct cl_page *page) +{ + struct cl_req *req = page->cp_req; + + ENTRY; + page = cl_page_top(page); + + LINVRNT(cl_page_is_vmlocked(env, page)); + LASSERT(!list_empty(&page->cp_flight)); + LASSERT(req->crq_nrpages > 0); + + list_del_init(&page->cp_flight); + --req->crq_nrpages; + page->cp_req = NULL; + EXIT; +} +EXPORT_SYMBOL(cl_req_page_done); + +/** + * Notifies layers that request is about to depart by calling + * cl_req_operations::cro_prep() top-to-bottom. + */ +int cl_req_prep(const struct lu_env *env, struct cl_req *req) +{ + int i; + int result; + const struct cl_req_slice *slice; + + ENTRY; + /* + * Check that the caller of cl_req_alloc() didn't lie about the number + * of objects. + */ + for (i = 0; i < req->crq_nrobjs; ++i) + LASSERT(req->crq_o[i].ro_obj != NULL); + + result = 0; + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + if (slice->crs_ops->cro_prep != NULL) { + result = slice->crs_ops->cro_prep(env, slice); + if (result != 0) + break; + } + } + RETURN(result); +} +EXPORT_SYMBOL(cl_req_prep); + +/** + * Fills in attributes that are passed to server together with transfer. Only + * attributes from \a flags may be touched. This can be called multiple times + * for the same request. 
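The transfer-request machinery above has a fixed call sequence: cl_req_alloc() must be told the object count up front (cl_req_prep() later asserts the caller did not lie), cl_req_page_add() attaches each in-flight page, and cl_req_completion() runs exactly once, on success or failure, freeing the request. Hedged sketch of a transfer engine's usage, with the wire interaction elided and completion driven inline for brevity:

/* Hedged sketch of the cl_req sequence for one object, one page;
 * "page" is assumed vmlocked and not yet part of another request. */
static int demo_transfer(const struct lu_env *env, struct cl_page *page)
{
        struct cl_req *req;
        int rc;

        req = cl_req_alloc(env, page, CRT_WRITE, 1 /* nr_objects */);
        if (IS_ERR(req))
                return PTR_ERR(req);

        cl_req_page_add(env, req, page);  /* page -> crq_pages */
        rc = cl_req_prep(env, req);       /* notify layers before departure */
        /* ... hand the request to the transport here ... */
        cl_req_page_done(env, page);      /* normally called on completion */
        cl_req_completion(env, req, rc);  /* bottom-up callbacks, frees req */
        return rc;
}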
+ */ +void cl_req_attr_set(const struct lu_env *env, struct cl_req *req, + struct cl_req_attr *attr, obd_valid flags) +{ + const struct cl_req_slice *slice; + struct cl_page *page; + int i; + + LASSERT(!list_empty(&req->crq_pages)); + ENTRY; + + /* Take any page to use as a model. */ + page = list_entry(req->crq_pages.next, struct cl_page, cp_flight); + + for (i = 0; i < req->crq_nrobjs; ++i) { + list_for_each_entry(slice, &req->crq_layers, crs_linkage) { + const struct cl_page_slice *scan; + const struct cl_object *obj; + + scan = cl_page_at(page, + slice->crs_dev->cd_lu_dev.ld_type); + LASSERT(scan != NULL); + obj = scan->cpl_obj; + if (slice->crs_ops->cro_attr_set != NULL) + slice->crs_ops->cro_attr_set(env, slice, obj, + attr + i, flags); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_req_attr_set); + +/* XXX complete(), init_completion(), and wait_for_completion(), until they are + * implemented in libcfs. */ +#ifdef __KERNEL__ +# include +#else /* __KERNEL__ */ +# include +#endif + +/** + * Initialize synchronous io wait anchor, for transfer of \a nrpages pages. + */ +void cl_sync_io_init(struct cl_sync_io *anchor, int nrpages) +{ + ENTRY; + init_completion(&anchor->csi_sync_completion); + atomic_set(&anchor->csi_sync_nr, nrpages); + anchor->csi_sync_rc = 0; + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_init); + +/** + * Wait until all transfer completes. Transfer completion routine has to call + * cl_sync_io_note() for every page. + */ +int cl_sync_io_wait(const struct lu_env *env, struct cl_io *io, + struct cl_page_list *queue, struct cl_sync_io *anchor) +{ + int rc; + ENTRY; + + rc = wait_for_completion_interruptible(&anchor->csi_sync_completion); + if (rc < 0) { + int rc2; + rc2 = cl_io_cancel(env, io, queue); + if (rc2 < 0) { + /* Too bad, some pages are still in IO. */ + CDEBUG(D_VFSTRACE, "Failed to cancel transfer (%i). " + "Waiting for %i pages\n", + rc2, atomic_read(&anchor->csi_sync_nr)); + wait_for_completion(&anchor->csi_sync_completion); + } + } else + rc = anchor->csi_sync_rc; + LASSERT(atomic_read(&anchor->csi_sync_nr) == 0); + cl_page_list_assume(env, io, queue); + POISON(anchor, 0x5a, sizeof *anchor); + RETURN(rc); +} +EXPORT_SYMBOL(cl_sync_io_wait); + +/** + * Indicate that transfer of a single page completed. + */ +void cl_sync_io_note(struct cl_sync_io *anchor, int ioret) +{ + ENTRY; + if (anchor->csi_sync_rc == 0 && ioret < 0) + anchor->csi_sync_rc = ioret; + /* + * Synchronous IO done without releasing page lock (e.g., as a part of + * ->{prepare,commit}_write(). Completion is used to signal the end of + * IO. + */ + if (atomic_dec_and_test(&anchor->csi_sync_nr)) + complete(&anchor->csi_sync_completion); + EXIT; +} +EXPORT_SYMBOL(cl_sync_io_note); diff --git a/lustre/obdclass/cl_lock.c b/lustre/obdclass/cl_lock.c new file mode 100644 index 0000000..fca3f16 --- /dev/null +++ b/lustre/obdclass/cl_lock.c @@ -0,0 +1,2076 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
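The cl_sync_io anchor above converts a batch of asynchronous page transfers into a synchronous wait: the submitter initializes csi_sync_nr to the page count, every per-page completion calls cl_sync_io_note(), and the note that drops the counter to zero fires the completion the waiter is blocked on. A condensed sketch, assuming the queue has been populated and the transfer code arranges the per-page cl_sync_io_note() calls:

/* Hedged sketch of a synchronous wait over nrpages transfers. */
static int demo_sync_wait(const struct lu_env *env, struct cl_io *io,
                          struct cl_page_list *queue, int nrpages)
{
        struct cl_sync_io anchor;

        cl_sync_io_init(&anchor, nrpages);  /* csi_sync_nr = nrpages */
        /* ... submit the pages here ... */
        /* sleeps until the last cl_sync_io_note() calls complete() */
        return cl_sync_io_wait(env, io, queue, &anchor);
}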
See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Extent Lock. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include +/* lu_time_global_{init,fini}() */ +#include + +#include +#include "cl_internal.h" + +/** Lock class of cl_lock::cll_guard */ +static struct lock_class_key cl_lock_guard_class; +static cfs_mem_cache_t *cl_lock_kmem; + +static struct lu_kmem_descr cl_lock_caches[] = { + { + .ckd_cache = &cl_lock_kmem, + .ckd_name = "cl_lock_kmem", + .ckd_size = sizeof (struct cl_lock) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Basic lock invariant that is maintained at all times. Caller either has a + * reference to \a lock, or somehow assures that \a lock cannot be freed. + * + * \see cl_lock_invariant() + */ +static int cl_lock_invariant_trusted(const struct lu_env *env, + const struct cl_lock *lock) +{ + return + cl_is_lock(lock) && + ergo(lock->cll_state == CLS_FREEING, lock->cll_holds == 0) && + atomic_read(&lock->cll_ref) >= lock->cll_holds && + lock->cll_holds >= lock->cll_users && + lock->cll_holds >= 0 && + lock->cll_users >= 0 && + lock->cll_depth >= 0; +} + +/** + * Stronger lock invariant, checking that caller has a reference on a lock. + * + * \see cl_lock_invariant_trusted() + */ +static int cl_lock_invariant(const struct lu_env *env, + const struct cl_lock *lock) +{ + int result; + + result = atomic_read(&lock->cll_ref) > 0 && + cl_lock_invariant_trusted(env, lock); + if (!result && env != NULL) + CL_LOCK_DEBUG(D_ERROR, env, lock, "invariant broken"); + return result; +} + +#define RETIP ((unsigned long)__builtin_return_address(0)) + +#ifdef CONFIG_LOCKDEP +static struct lock_class_key cl_lock_key; + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{ + lockdep_set_class_and_name(lock, &cl_lock_key, "EXT"); +} + +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{ + cl_env_info(env)->clt_nr_locks_acquired++; + lock_acquire(&lock->dep_map, !!(enqflags & CEF_ASYNC), + /* try: */ 0, lock->cll_descr.cld_mode <= CLM_READ, + /* check: */ 2, RETIP); +} + +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{ + cl_env_info(env)->clt_nr_locks_acquired--; + lock_release(&lock->dep_map, 0, RETIP); +} + +#else /* !CONFIG_LOCKDEP */ + +static void cl_lock_lockdep_init(struct cl_lock *lock) +{} +static void cl_lock_lockdep_acquire(const struct lu_env *env, + struct cl_lock *lock, __u32 enqflags) +{} +static void cl_lock_lockdep_release(const struct lu_env *env, + struct cl_lock *lock) +{} + +#endif /* !CONFIG_LOCKDEP */ + +/** + * Adds lock slice to the compound lock. 
+ * + * This is called by cl_object_operations::coo_lock_init() methods to add a + * per-layer state to the lock. New state is added at the end of + * cl_lock::cll_layers list, that is, it is at the bottom of the stack. + * + * \see cl_req_slice_add(), cl_page_slice_add(), cl_io_slice_add() + */ +void cl_lock_slice_add(struct cl_lock *lock, struct cl_lock_slice *slice, + struct cl_object *obj, + const struct cl_lock_operations *ops) +{ + ENTRY; + slice->cls_lock = lock; + list_add_tail(&slice->cls_linkage, &lock->cll_layers); + slice->cls_obj = obj; + slice->cls_ops = ops; + EXIT; +} +EXPORT_SYMBOL(cl_lock_slice_add); + +/** + * Returns true iff a lock with the mode \a has provides at least the same + * guarantees as a lock with the mode \a need. + */ +int cl_lock_mode_match(enum cl_lock_mode has, enum cl_lock_mode need) +{ + LINVRNT(need == CLM_READ || need == CLM_WRITE || need == CLM_PHANTOM); + LINVRNT(has == CLM_READ || has == CLM_WRITE || has == CLM_PHANTOM); + CLASSERT(CLM_PHANTOM < CLM_READ); + CLASSERT(CLM_READ < CLM_WRITE); + + return need <= has; +} +EXPORT_SYMBOL(cl_lock_mode_match); + +/** + * Returns true iff extent portions of lock descriptions match. + */ +int cl_lock_ext_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + has->cld_start <= need->cld_start && + has->cld_end >= need->cld_end && + cl_lock_mode_match(has->cld_mode, need->cld_mode); +} +EXPORT_SYMBOL(cl_lock_ext_match); + +/** + * Returns true iff a lock with the description \a has provides at least the + * same guarantees as a lock with the description \a need. + */ +int cl_lock_descr_match(const struct cl_lock_descr *has, + const struct cl_lock_descr *need) +{ + return + cl_object_same(has->cld_obj, need->cld_obj) && + cl_lock_ext_match(has, need); +} +EXPORT_SYMBOL(cl_lock_descr_match); + +static void cl_lock_free(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj = lock->cll_descr.cld_obj; + + LASSERT(cl_is_lock(lock)); + LINVRNT(!cl_lock_is_mutexed(lock)); + + ENTRY; + might_sleep(); + while (!list_empty(&lock->cll_layers)) { + struct cl_lock_slice *slice; + + slice = list_entry(lock->cll_layers.next, struct cl_lock_slice, + cls_linkage); + list_del_init(lock->cll_layers.next); + slice->cls_ops->clo_fini(env, slice); + } + atomic_dec(&cl_object_site(obj)->cs_locks.cs_total); + atomic_dec(&cl_object_site(obj)->cs_locks_state[lock->cll_state]); + lu_object_ref_del_at(&obj->co_lu, lock->cll_obj_ref, "cl_lock", lock); + cl_object_put(env, obj); + lu_ref_fini(&lock->cll_reference); + lu_ref_fini(&lock->cll_holders); + mutex_destroy(&lock->cll_guard); + OBD_SLAB_FREE_PTR(lock, cl_lock_kmem); + EXIT; +} + +/** + * Releases a reference on a lock. + * + * When last reference is released, lock is returned to the cache, unless it + * is in cl_lock_state::CLS_FREEING state, in which case it is destroyed + * immediately. 
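The three matching predicates above reduce lock compatibility to two orderings: modes are ranked CLM_PHANTOM < CLM_READ < CLM_WRITE, so "has satisfies need" is simply need <= has, and extents match by enclosure. A runnable restatement with sample values:

#include <assert.h>

enum demo_mode { DEMO_PHANTOM, DEMO_READ, DEMO_WRITE }; /* same ranking */

struct demo_ext { unsigned long start, end; enum demo_mode mode; };

/* Same rule as cl_lock_ext_match(): enclosure plus mode dominance. */
static int demo_ext_match(const struct demo_ext *has,
                          const struct demo_ext *need)
{
        return has->start <= need->start && has->end >= need->end &&
               need->mode <= has->mode;
}

int main(void)
{
        struct demo_ext has  = { 0, 1024, DEMO_WRITE };
        struct demo_ext need = { 256, 512, DEMO_READ };

        assert(demo_ext_match(&has, &need));   /* write covers the read */
        assert(!demo_ext_match(&need, &has));  /* but not the converse */
        return 0;
}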
+ * + * \see cl_object_put(), cl_page_put() + */ +void cl_lock_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object *obj; + struct cl_object_header *head; + struct cl_site *site; + + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + obj = lock->cll_descr.cld_obj; + LINVRNT(obj != NULL); + head = cl_object_header(obj); + site = cl_object_site(obj); + + CDEBUG(D_DLMTRACE, "releasing reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + + if (atomic_dec_and_test(&lock->cll_ref)) { + if (lock->cll_state == CLS_FREEING) { + LASSERT(list_empty(&lock->cll_linkage)); + cl_lock_free(env, lock); + } + atomic_dec(&site->cs_locks.cs_busy); + } + EXIT; +} +EXPORT_SYMBOL(cl_lock_put); + +/** + * Acquires an additional reference to a lock. + * + * This can be called only by caller already possessing a reference to \a + * lock. + * + * \see cl_object_get(), cl_page_get() + */ +void cl_lock_get(struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(NULL, lock)); + CDEBUG(D_DLMTRACE|D_TRACE, "acquiring reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + atomic_inc(&lock->cll_ref); +} +EXPORT_SYMBOL(cl_lock_get); + +/** + * Acquires a reference to a lock. + * + * This is much like cl_lock_get(), except that this function can be used to + * acquire initial reference to the cached lock. Caller has to deal with all + * possible races. Use with care! + * + * \see cl_page_get_trust() + */ +void cl_lock_get_trust(struct cl_lock *lock) +{ + struct cl_site *site = cl_object_site(lock->cll_descr.cld_obj); + + LASSERT(cl_is_lock(lock)); + CDEBUG(D_DLMTRACE|D_TRACE, "acquiring trusted reference: %d %p %lu\n", + atomic_read(&lock->cll_ref), lock, RETIP); + if (atomic_inc_return(&lock->cll_ref) == 1) + atomic_inc(&site->cs_locks.cs_busy); +} +EXPORT_SYMBOL(cl_lock_get_trust); + +/** + * Helper function destroying the lock that wasn't completely initialized. + * + * Other threads can acquire references to the top-lock through its + * sub-locks. Hence, it cannot be cl_lock_free()-ed immediately. 
+ */ +static void cl_lock_finish(const struct lu_env *env, struct cl_lock *lock) +{ + cl_lock_mutex_get(env, lock); + cl_lock_delete(env, lock); + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); +} + +static struct cl_lock *cl_lock_alloc(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *descr) +{ + struct cl_lock *lock; + struct lu_object_header *head; + struct cl_site *site = cl_object_site(obj); + + ENTRY; + OBD_SLAB_ALLOC_PTR(lock, cl_lock_kmem); + if (lock != NULL) { + atomic_set(&lock->cll_ref, 1); + lock->cll_descr = *descr; + lock->cll_state = CLS_NEW; + cl_object_get(obj); + lock->cll_obj_ref = lu_object_ref_add(&obj->co_lu, + "cl_lock", lock); + CFS_INIT_LIST_HEAD(&lock->cll_layers); + CFS_INIT_LIST_HEAD(&lock->cll_linkage); + CFS_INIT_LIST_HEAD(&lock->cll_inclosure); + lu_ref_init(&lock->cll_reference); + lu_ref_init(&lock->cll_holders); + mutex_init(&lock->cll_guard); + lockdep_set_class(&lock->cll_guard, &cl_lock_guard_class); + cfs_waitq_init(&lock->cll_wq); + head = obj->co_lu.lo_header; + atomic_inc(&site->cs_locks_state[CLS_NEW]); + atomic_inc(&site->cs_locks.cs_total); + atomic_inc(&site->cs_locks.cs_created); + cl_lock_lockdep_init(lock); + list_for_each_entry(obj, &head->loh_layers, co_lu.lo_linkage) { + int err; + + err = obj->co_ops->coo_lock_init(env, obj, lock, io); + if (err != 0) { + cl_lock_finish(env, lock); + lock = ERR_PTR(err); + break; + } + } + } else + lock = ERR_PTR(-ENOMEM); + RETURN(lock); +} + +/** + * Returns true iff lock is "suitable" for given io. E.g., locks acquired by + * truncate and O_APPEND cannot be reused for read/non-append-write, as they + * cover multiple stripes and can trigger cascading timeouts. + */ +static int cl_lock_fits_into(const struct lu_env *env, + const struct cl_lock *lock, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + ENTRY; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_fits_into != NULL && + !slice->cls_ops->clo_fits_into(env, slice, need, io)) + RETURN(0); + } + RETURN(1); +} + +static struct cl_lock *cl_lock_lookup(const struct lu_env *env, + struct cl_object *obj, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_lock *lock; + struct cl_object_header *head; + struct cl_site *site; + + ENTRY; + + head = cl_object_header(obj); + site = cl_object_site(obj); + LINVRNT_SPIN_LOCKED(&head->coh_lock_guard); + atomic_inc(&site->cs_locks.cs_lookup); + list_for_each_entry(lock, &head->coh_locks, cll_linkage) { + int matched; + + LASSERT(cl_is_lock(lock)); + matched = cl_lock_ext_match(&lock->cll_descr, need) && + lock->cll_state < CLS_FREEING && + !(lock->cll_flags & CLF_CANCELLED) && + cl_lock_fits_into(env, lock, need, io); + CDEBUG(D_DLMTRACE, "has: "DDESCR"(%i) need: "DDESCR": %d\n", + PDESCR(&lock->cll_descr), lock->cll_state, PDESCR(need), + matched); + if (matched) { + cl_lock_get_trust(lock); + /* move the lock to the LRU head */ + list_move(&lock->cll_linkage, &head->coh_locks); + atomic_inc(&cl_object_site(obj)->cs_locks.cs_hit); + RETURN(lock); + } + } + RETURN(NULL); +} + +/** + * Returns a lock matching description \a need. + * + * This is the main entry point into the cl_lock caching interface. First, a + * cache (implemented as a per-object linked list) is consulted. If lock is + * found there, it is returned immediately. Otherwise new lock is allocated + * and returned. 
In any case, additional reference to lock is acquired. + * + * \see cl_object_find(), cl_page_find() + */ +static struct cl_lock *cl_lock_find(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + struct cl_site *site; + + ENTRY; + + obj = need->cld_obj; + head = cl_object_header(obj); + site = cl_object_site(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock == NULL) { + lock = cl_lock_alloc(env, obj, io, need); + if (!IS_ERR(lock)) { + struct cl_lock *ghost; + + spin_lock(&head->coh_lock_guard); + ghost = cl_lock_lookup(env, obj, io, need); + if (ghost == NULL) { + list_add(&lock->cll_linkage, &head->coh_locks); + spin_unlock(&head->coh_lock_guard); + atomic_inc(&site->cs_locks.cs_busy); + } else { + spin_unlock(&head->coh_lock_guard); + /* + * Other threads can acquire references to the + * top-lock through its sub-locks. Hence, it + * cannot be cl_lock_free()-ed immediately. + */ + cl_lock_finish(env, lock); + lock = ghost; + } + } + } + RETURN(lock); +} + +/** + * Returns existing lock matching given description. This is similar to + * cl_lock_find() except that no new lock is created, and returned lock is + * guaranteed to be in enum cl_lock_state::CLS_HELD state. + */ +struct cl_lock *cl_lock_peek(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_object_header *head; + struct cl_object *obj; + struct cl_lock *lock; + + obj = need->cld_obj; + head = cl_object_header(obj); + + spin_lock(&head->coh_lock_guard); + lock = cl_lock_lookup(env, obj, io, need); + spin_unlock(&head->coh_lock_guard); + + if (lock != NULL) { + int ok; + + cl_lock_mutex_get(env, lock); + if (lock->cll_state == CLS_CACHED) + cl_use_try(env, lock); + ok = lock->cll_state == CLS_HELD; + if (ok) { + cl_lock_hold_add(env, lock, scope, source); + cl_lock_user_add(env, lock); + } + cl_lock_mutex_put(env, lock); + if (!ok) { + cl_lock_put(env, lock); + lock = NULL; + } + } + return lock; +} +EXPORT_SYMBOL(cl_lock_peek); + +/** + * Returns a slice within a lock, corresponding to the given layer in the + * device stack. + * + * \see cl_page_at() + */ +const struct cl_lock_slice *cl_lock_at(const struct cl_lock *lock, + const struct lu_device_type *dtype) +{ + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_invariant_trusted(NULL, lock)); + ENTRY; + + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_obj->co_lu.lo_dev->ld_type == dtype) + RETURN(slice); + } + RETURN(NULL); +} +EXPORT_SYMBOL(cl_lock_at); + +static void cl_lock_trace(struct cl_thread_info *info, + const char *prefix, const struct cl_lock *lock) +{ + CDEBUG(D_DLMTRACE|D_TRACE, "%s: %i@%p %p %i %i\n", prefix, + atomic_read(&lock->cll_ref), lock, lock->cll_guarder, + lock->cll_depth, info->clt_nr_locks_locked); +} + +static void cl_lock_mutex_tail(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info; + + info = cl_env_info(env); + lock->cll_depth++; + info->clt_nr_locks_locked++; + lu_ref_add(&info->clt_locks_locked, "cll_guard", lock); + cl_lock_trace(info, "got mutex", lock); +} + +/** + * Locks cl_lock object. + * + * This is used to manipulate cl_lock fields, and to serialize state + * transitions in the lock state machine. 
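+ *
+ * The mutex is recursive for the owning thread: cl_lock_mutex_get() can be
+ * called again by the thread that already owns it (see the cll_depth
+ * handling below). A minimal usage sketch, assuming valid \a env and
+ * \a lock supplied by the caller:
+ *
+ * \code
+ *      cl_lock_mutex_get(env, lock);
+ *      ... inspect or update lock state under the mutex ...
+ *      cl_lock_mutex_put(env, lock);
+ * \endcode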
+ * + * \post cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_put() + */ +void cl_lock_mutex_get(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_invariant(env, lock)); + + if (lock->cll_guarder == cfs_current()) { + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_depth > 0); + } else { + struct cl_object_header *hdr; + + LINVRNT(lock->cll_guarder != cfs_current()); + hdr = cl_object_header(lock->cll_descr.cld_obj); + mutex_lock_nested(&lock->cll_guard, hdr->coh_nesting); + lock->cll_guarder = cfs_current(); + LINVRNT(lock->cll_depth == 0); + } + cl_lock_mutex_tail(env, lock); +} +EXPORT_SYMBOL(cl_lock_mutex_get); + +/** + * Try-locks cl_lock object. + * + * \retval 0 \a lock was successfully locked + * + * \retval -EBUSY \a lock cannot be locked right now + * + * \post ergo(result == 0, cl_lock_is_mutexed(lock)) + * + * \see cl_lock_mutex_get() + */ +int cl_lock_mutex_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + + LINVRNT(cl_lock_invariant_trusted(env, lock)); + ENTRY; + + result = 0; + if (lock->cll_guarder == cfs_current()) { + LINVRNT(lock->cll_depth > 0); + cl_lock_mutex_tail(env, lock); + } else if (mutex_trylock(&lock->cll_guard)) { + LINVRNT(lock->cll_depth == 0); + lock->cll_guarder = cfs_current(); + cl_lock_mutex_tail(env, lock); + } else + result = -EBUSY; + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_mutex_try); + +/** + * Unlocks cl_lock object. + * + * \pre cl_lock_is_mutexed(lock) + * + * \see cl_lock_mutex_get() + */ +void cl_lock_mutex_put(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_thread_info *info; + + LINVRNT(cl_lock_invariant(env, lock)); + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(lock->cll_guarder == cfs_current()); + LINVRNT(lock->cll_depth > 0); + + info = cl_env_info(env); + LINVRNT(info->clt_nr_locks_locked > 0); + + cl_lock_trace(info, "put mutex", lock); + lu_ref_del(&info->clt_locks_locked, "cll_guard", lock); + info->clt_nr_locks_locked--; + if (--lock->cll_depth == 0) { + lock->cll_guarder = NULL; + mutex_unlock(&lock->cll_guard); + } +} +EXPORT_SYMBOL(cl_lock_mutex_put); + +/** + * Returns true iff lock's mutex is owned by the current thread. + */ +int cl_lock_is_mutexed(struct cl_lock *lock) +{ + return lock->cll_guarder == cfs_current(); +} +EXPORT_SYMBOL(cl_lock_is_mutexed); + +/** + * Returns number of cl_lock mutices held by the current thread (environment). + */ +int cl_lock_nr_mutexed(const struct lu_env *env) +{ + return cl_env_info(env)->clt_nr_locks_locked; +} +EXPORT_SYMBOL(cl_lock_nr_mutexed); + +static void cl_lock_cancel0(const struct lu_env *env, struct cl_lock *lock) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + if (!(lock->cll_flags & CLF_CANCELLED)) { + const struct cl_lock_slice *slice; + + lock->cll_flags |= CLF_CANCELLED; + list_for_each_entry_reverse(slice, &lock->cll_layers, + cls_linkage) { + if (slice->cls_ops->clo_cancel != NULL) + slice->cls_ops->clo_cancel(env, slice); + } + } + EXIT; +} + +static void cl_lock_delete0(const struct lu_env *env, struct cl_lock *lock) +{ + struct cl_object_header *head; + const struct cl_lock_slice *slice; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + + ENTRY; + if (lock->cll_state < CLS_FREEING) { + cl_lock_state_set(env, lock, CLS_FREEING); + + head = cl_object_header(lock->cll_descr.cld_obj); + + spin_lock(&head->coh_lock_guard); + list_del_init(&lock->cll_linkage); + /* + * No locks, no pages. 
+                 * This is only valid for bottom sub-locks, and the
+                 * head->coh_nesting == 1 check assumes a two-level
+                 * top-lock/sub-lock hierarchy.
+                 */
+                LASSERT(ergo(head->coh_nesting == 1 &&
+                             list_empty(&head->coh_locks), !head->coh_pages));
+                spin_unlock(&head->coh_lock_guard);
+                /*
+                 * From now on, no new references to this lock can be acquired
+                 * by cl_lock_lookup().
+                 */
+                list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                            cls_linkage) {
+                        if (slice->cls_ops->clo_delete != NULL)
+                                slice->cls_ops->clo_delete(env, slice);
+                }
+                /*
+                 * From now on, no new references to this lock can be acquired
+                 * by layer-specific means (like a pointer from struct
+                 * ldlm_lock in osc, or a pointer from top-lock to sub-lock in
+                 * lov).
+                 *
+                 * Lock will be finally freed in cl_lock_put() when last of
+                 * existing references goes away.
+                 */
+        }
+        EXIT;
+}
+
+static void cl_lock_hold_mod(const struct lu_env *env, struct cl_lock *lock,
+                             int delta)
+{
+        struct cl_thread_info *cti;
+        struct cl_object_header *hdr;
+
+        cti = cl_env_info(env);
+        hdr = cl_object_header(lock->cll_descr.cld_obj);
+        lock->cll_holds += delta;
+        if (hdr->coh_nesting == 0) {
+                cti->clt_nr_held += delta;
+                LASSERT(cti->clt_nr_held >= 0);
+        }
+}
+
+static void cl_lock_used_mod(const struct lu_env *env, struct cl_lock *lock,
+                             int delta)
+{
+        struct cl_thread_info *cti;
+        struct cl_object_header *hdr;
+
+        cti = cl_env_info(env);
+        hdr = cl_object_header(lock->cll_descr.cld_obj);
+        lock->cll_users += delta;
+        if (hdr->coh_nesting == 0) {
+                cti->clt_nr_used += delta;
+                LASSERT(cti->clt_nr_used >= 0);
+        }
+}
+
+static void cl_lock_hold_release(const struct lu_env *env, struct cl_lock *lock,
+                                 const char *scope, const void *source)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_holds > 0);
+
+        ENTRY;
+        lu_ref_del(&lock->cll_holders, scope, source);
+        cl_lock_hold_mod(env, lock, -1);
+        if (lock->cll_holds == 0) {
+                if (lock->cll_descr.cld_mode == CLM_PHANTOM)
+                        /*
+                         * If lock is still phantom when user is done with
+                         * it, destroy the lock.
+                         */
+                        lock->cll_flags |= CLF_CANCELPEND|CLF_DOOMED;
+                if (lock->cll_flags & CLF_CANCELPEND) {
+                        lock->cll_flags &= ~CLF_CANCELPEND;
+                        cl_lock_cancel0(env, lock);
+                }
+                if (lock->cll_flags & CLF_DOOMED) {
+                        /* no longer doomed: it's dead... Jim. */
+                        lock->cll_flags &= ~CLF_DOOMED;
+                        cl_lock_delete0(env, lock);
+                }
+        }
+        EXIT;
+}
+
+
+/**
+ * Waits until lock state is changed.
+ *
+ * This function is called with cl_lock mutex locked, atomically releases
+ * mutex and goes to sleep, waiting for a lock state change (signaled by
+ * cl_lock_signal()), and re-acquires the mutex before return.
+ *
+ * This function is used to wait until lock state machine makes some progress
+ * and to emulate synchronous operations on top of asynchronous lock
+ * interface.
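+ *
+ * The usual calling pattern is the retry loop used throughout this file
+ * (e.g., in cl_enqueue_locked() and cl_wait() below). A sketch, where
+ * cl_foo_try() stands for any of the *_try() functions:
+ *
+ * \code
+ *      do {
+ *              result = cl_foo_try(env, lock);
+ *              if (result == CLO_WAIT) {
+ *                      result = cl_lock_state_wait(env, lock);
+ *                      if (result == 0)
+ *                              continue;
+ *              }
+ *              break;
+ *      } while (1);
+ * \endcode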
+ *
+ * \retval -EINTR wait was interrupted
+ *
+ * \retval 0 wait wasn't interrupted
+ *
+ * \pre cl_lock_is_mutexed(lock)
+ *
+ * \see cl_lock_signal()
+ */
+int cl_lock_state_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+        cfs_waitlink_t waiter;
+        int result;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_depth == 1);
+        LASSERT(lock->cll_state != CLS_FREEING); /* too late to wait */
+
+        result = lock->cll_error;
+        if (result == 0 && !(lock->cll_flags & CLF_STATE)) {
+                cfs_waitlink_init(&waiter);
+                cfs_waitq_add(&lock->cll_wq, &waiter);
+                set_current_state(CFS_TASK_INTERRUPTIBLE);
+                cl_lock_mutex_put(env, lock);
+
+                LASSERT(cl_lock_nr_mutexed(env) == 0);
+                cfs_waitq_wait(&waiter, CFS_TASK_INTERRUPTIBLE);
+
+                cl_lock_mutex_get(env, lock);
+                set_current_state(CFS_TASK_RUNNING);
+                cfs_waitq_del(&lock->cll_wq, &waiter);
+                result = cfs_signal_pending() ? -EINTR : 0;
+        }
+        lock->cll_flags &= ~CLF_STATE;
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_state_wait);
+
+static void cl_lock_state_signal(const struct lu_env *env, struct cl_lock *lock,
+                                 enum cl_lock_state state)
+{
+        const struct cl_lock_slice *slice;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        list_for_each_entry(slice, &lock->cll_layers, cls_linkage)
+                if (slice->cls_ops->clo_state != NULL)
+                        slice->cls_ops->clo_state(env, slice, state);
+        lock->cll_flags |= CLF_STATE;
+        cfs_waitq_broadcast(&lock->cll_wq);
+        EXIT;
+}
+
+/**
+ * Notifies waiters that lock state changed.
+ *
+ * Wakes up all waiters sleeping in cl_lock_state_wait(), also notifies all
+ * layers about state change by calling cl_lock_operations::clo_state()
+ * top-to-bottom.
+ */
+void cl_lock_signal(const struct lu_env *env, struct cl_lock *lock)
+{
+        ENTRY;
+        cl_lock_state_signal(env, lock, lock->cll_state);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_signal);
+
+/**
+ * Changes lock state.
+ *
+ * This function is invoked to notify layers that lock state changed,
+ * possibly as a result of an asynchronous event, such as call-back
+ * reception.
+ *
+ * \post lock->cll_state == state
+ *
+ * \see cl_lock_operations::clo_state()
+ */
+void cl_lock_state_set(const struct lu_env *env, struct cl_lock *lock,
+                       enum cl_lock_state state)
+{
+        struct cl_site *site = cl_object_site(lock->cll_descr.cld_obj);
+
+        ENTRY;
+        LASSERT(lock->cll_state <= state ||
+                (lock->cll_state == CLS_CACHED &&
+                 (state == CLS_HELD || /* lock found in cache */
+                  state == CLS_NEW  /* sub-lock canceled */)) ||
+                /* sub-lock canceled during unlocking */
+                (lock->cll_state == CLS_UNLOCKING && state == CLS_NEW));
+
+        if (lock->cll_state != state) {
+                atomic_dec(&site->cs_locks_state[lock->cll_state]);
+                atomic_inc(&site->cs_locks_state[state]);
+
+                cl_lock_state_signal(env, lock, state);
+                lock->cll_state = state;
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_state_set);
+
+/**
+ * Yanks lock from the cache (cl_lock_state::CLS_CACHED state) by calling
+ * cl_lock_operations::clo_use() top-to-bottom to notify layers.
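+ *
+ * In this file it is called from cl_enqueue_try(), when an enqueue finds
+ * the lock in CLS_CACHED state, and from cl_lock_peek().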
+ */ +int cl_use_try(const struct lu_env *env, struct cl_lock *lock) +{ + int result; + const struct cl_lock_slice *slice; + + ENTRY; + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_use != NULL) { + result = slice->cls_ops->clo_use(env, slice); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + if (result == 0) + cl_lock_state_set(env, lock, CLS_HELD); + RETURN(result); +} +EXPORT_SYMBOL(cl_use_try); + +/** + * Helper for cl_enqueue_try() that calls ->clo_enqueue() across all layers + * top-to-bottom. + */ +static int cl_enqueue_kick(const struct lu_env *env, + struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + const struct cl_lock_slice *slice; + + ENTRY; + result = -ENOSYS; + list_for_each_entry(slice, &lock->cll_layers, cls_linkage) { + if (slice->cls_ops->clo_enqueue != NULL) { + result = slice->cls_ops->clo_enqueue(env, + slice, io, flags); + if (result != 0) + break; + } + } + LASSERT(result != -ENOSYS); + RETURN(result); +} + +/** + * Tries to enqueue a lock. + * + * This function is called repeatedly by cl_enqueue() until either lock is + * enqueued, or error occurs. This function does not block waiting for + * networking communication to complete. + * + * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + * lock->cll_state == CLS_HELD) + * + * \see cl_enqueue() cl_lock_operations::clo_enqueue() + * \see cl_lock_state::CLS_ENQUEUED + */ +int cl_enqueue_try(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 flags) +{ + int result; + + ENTRY; + do { + result = 0; + + LINVRNT(cl_lock_is_mutexed(lock)); + + if (lock->cll_error != 0) + break; + switch (lock->cll_state) { + case CLS_NEW: + cl_lock_state_set(env, lock, CLS_QUEUING); + /* fall-through */ + case CLS_QUEUING: + /* kick layers. */ + result = cl_enqueue_kick(env, lock, io, flags); + if (result == 0) + cl_lock_state_set(env, lock, CLS_ENQUEUED); + break; + case CLS_UNLOCKING: + /* wait until unlocking finishes, and enqueue lock + * afresh. */ + result = CLO_WAIT; + break; + case CLS_CACHED: + /* yank lock from the cache. */ + result = cl_use_try(env, lock); + break; + case CLS_ENQUEUED: + case CLS_HELD: + result = 0; + break; + default: + case CLS_FREEING: + /* + * impossible, only held locks with increased + * ->cll_holds can be enqueued, and they cannot be + * freed. + */ + LBUG(); + } + } while (result == CLO_REPEAT); + if (result < 0) + cl_lock_error(env, lock, result); + RETURN(result ?: lock->cll_error); +} +EXPORT_SYMBOL(cl_enqueue_try); + +static int cl_enqueue_locked(const struct lu_env *env, struct cl_lock *lock, + struct cl_io *io, __u32 enqflags) +{ + int result; + + ENTRY; + + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_holds > 0); + + cl_lock_user_add(env, lock); + do { + result = cl_enqueue_try(env, lock, io, enqflags); + if (result == CLO_WAIT) { + result = cl_lock_state_wait(env, lock); + if (result == 0) + continue; + } + break; + } while (1); + if (result != 0) { + cl_lock_user_del(env, lock); + if (result != -EINTR) + cl_lock_error(env, lock, result); + } + LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED || + lock->cll_state == CLS_HELD)); + RETURN(result); +} + +/** + * Enqueues a lock. + * + * \pre current thread or io owns a hold on lock. 
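+ *
+ * A typical top-level sequence around cl_enqueue(), as a sketch with error
+ * handling elided (cl_lock_request() below combines the lookup, hold and
+ * enqueue steps):
+ *
+ * \code
+ *      lock = cl_lock_hold(env, io, need, scope, source);
+ *      rc = cl_enqueue(env, lock, io, enqflags);
+ *      if (rc == 0)
+ *              rc = cl_wait(env, lock);
+ *      ...
+ *      cl_unuse(env, lock);
+ *      cl_lock_release(env, lock, scope, source);
+ * \endcode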
+ *
+ * \post ergo(result == 0, lock->users increased)
+ * \post ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                         lock->cll_state == CLS_HELD)
+ */
+int cl_enqueue(const struct lu_env *env, struct cl_lock *lock,
+               struct cl_io *io, __u32 enqflags)
+{
+        int result;
+
+        ENTRY;
+
+        cl_lock_lockdep_acquire(env, lock, enqflags);
+        cl_lock_mutex_get(env, lock);
+        result = cl_enqueue_locked(env, lock, io, enqflags);
+        cl_lock_mutex_put(env, lock);
+        if (result != 0)
+                cl_lock_lockdep_release(env, lock);
+        LASSERT(ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+                     lock->cll_state == CLS_HELD));
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_enqueue);
+
+/**
+ * Tries to unlock a lock.
+ *
+ * This function is called repeatedly by cl_unuse() until either the lock is
+ * unlocked, or an error occurs.
+ *
+ * \pre lock->cll_state <= CLS_HELD || lock->cll_state == CLS_UNLOCKING
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_CACHED)
+ *
+ * \see cl_unuse() cl_lock_operations::clo_unuse()
+ * \see cl_lock_state::CLS_CACHED
+ */
+int cl_unuse_try(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        int result;
+
+        ENTRY;
+        if (lock->cll_state != CLS_UNLOCKING) {
+                if (lock->cll_users > 1) {
+                        cl_lock_user_del(env, lock);
+                        RETURN(0);
+                }
+                /*
+                 * New lock users (->cll_users) do not prevent unlocking from
+                 * proceeding. From this point on, the lock eventually reaches
+                 * CLS_CACHED, is reinitialized to CLS_NEW, or falls into
+                 * CLS_FREEING.
+                 */
+                cl_lock_state_set(env, lock, CLS_UNLOCKING);
+        }
+        do {
+                result = 0;
+
+                if (lock->cll_error != 0)
+                        break;
+
+                LINVRNT(cl_lock_is_mutexed(lock));
+                LINVRNT(cl_lock_invariant(env, lock));
+                LASSERT(lock->cll_state == CLS_UNLOCKING);
+                LASSERT(lock->cll_users > 0);
+                LASSERT(lock->cll_holds > 0);
+
+                result = -ENOSYS;
+                list_for_each_entry_reverse(slice, &lock->cll_layers,
+                                            cls_linkage) {
+                        if (slice->cls_ops->clo_unuse != NULL) {
+                                result = slice->cls_ops->clo_unuse(env, slice);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+                LASSERT(result != -ENOSYS);
+        } while (result == CLO_REPEAT);
+        if (result != CLO_WAIT)
+                /*
+                 * Once there is no more need to iterate ->clo_unuse() calls,
+                 * remove lock user. This is done even if unrecoverable error
+                 * happened during unlocking, because nothing else can be
+                 * done.
+                 */
+                cl_lock_user_del(env, lock);
+        if (result == 0 || result == -ESTALE) {
+                enum cl_lock_state state;
+
+                /*
+                 * Return lock back to the cache. This is the only
+                 * place where lock is moved into CLS_CACHED state.
+                 *
+                 * If one of ->clo_unuse() methods returned -ESTALE, lock
+                 * cannot be placed into cache and has to be
+                 * re-initialized. This happens e.g., when a sub-lock was
+                 * canceled while unlocking was in progress.
+                 */
+                state = result == 0 ? CLS_CACHED : CLS_NEW;
+                cl_lock_state_set(env, lock, state);
+
+                /*
+                 * Hide the -ESTALE error.
+                 * Suppose the lock is a glimpse lock with multiple stripes,
+                 * one of its sub-locks returned -ENAVAIL, and the other
+                 * sub-locks matched existing write locks. In that case we
+                 * cannot mark the whole lock as failed, because otherwise
+                 * some of its sub-locks might never be canceled, and some
+                 * dirty pages would never be written out to the OSTs.
+                 *   -jay
+                 */
+                result = 0;
+        }
+        result = result ?: lock->cll_error;
+        if (result < 0)
+                cl_lock_error(env, lock, result);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_unuse_try);
+
+static void cl_unuse_locked(const struct lu_env *env, struct cl_lock *lock)
+{
+        ENTRY;
+        LASSERT(lock->cll_state <= CLS_HELD);
+        do {
+                int result;
+
+                result = cl_unuse_try(env, lock);
+                if (result == CLO_WAIT) {
+                        result = cl_lock_state_wait(env, lock);
+                        if (result == 0)
+                                continue;
+                }
+                break;
+        } while (1);
+        EXIT;
+}
+
+/**
+ * Unlocks a lock.
+ */
+void cl_unuse(const struct lu_env *env, struct cl_lock *lock)
+{
+        ENTRY;
+        cl_lock_mutex_get(env, lock);
+        cl_unuse_locked(env, lock);
+        cl_lock_mutex_put(env, lock);
+        cl_lock_lockdep_release(env, lock);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_unuse);
+
+/**
+ * Tries to wait for a lock.
+ *
+ * This function is called repeatedly by cl_wait() until either the lock is
+ * granted, or an error occurs. This function does not block waiting for
+ * network communication to complete.
+ *
+ * \see cl_wait() cl_lock_operations::clo_wait()
+ * \see cl_lock_state::CLS_HELD
+ */
+int cl_wait_try(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        int result;
+
+        ENTRY;
+        do {
+                LINVRNT(cl_lock_is_mutexed(lock));
+                LINVRNT(cl_lock_invariant(env, lock));
+                LASSERT(lock->cll_state == CLS_ENQUEUED ||
+                        lock->cll_state == CLS_HELD);
+                LASSERT(lock->cll_users > 0);
+                LASSERT(lock->cll_holds > 0);
+
+                result = 0;
+                if (lock->cll_error != 0)
+                        break;
+                if (lock->cll_state == CLS_HELD)
+                        /* nothing to do */
+                        break;
+
+                result = -ENOSYS;
+                list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                        if (slice->cls_ops->clo_wait != NULL) {
+                                result = slice->cls_ops->clo_wait(env, slice);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+                LASSERT(result != -ENOSYS);
+                if (result == 0)
+                        cl_lock_state_set(env, lock, CLS_HELD);
+        } while (result == CLO_REPEAT);
+        RETURN(result ?: lock->cll_error);
+}
+EXPORT_SYMBOL(cl_wait_try);
+
+/**
+ * Waits until enqueued lock is granted.
+ *
+ * \pre current thread or io owns a hold on the lock
+ * \pre ergo(result == 0, lock->cll_state == CLS_ENQUEUED ||
+ *                        lock->cll_state == CLS_HELD)
+ *
+ * \post ergo(result == 0, lock->cll_state == CLS_HELD)
+ */
+int cl_wait(const struct lu_env *env, struct cl_lock *lock)
+{
+        int result;
+
+        ENTRY;
+        cl_lock_mutex_get(env, lock);
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_state == CLS_ENQUEUED || lock->cll_state == CLS_HELD);
+        LASSERT(lock->cll_holds > 0);
+
+        do {
+                result = cl_wait_try(env, lock);
+                if (result == CLO_WAIT) {
+                        result = cl_lock_state_wait(env, lock);
+                        if (result == 0)
+                                continue;
+                }
+                break;
+        } while (1);
+        if (result < 0) {
+                cl_lock_user_del(env, lock);
+                if (result != -EINTR)
+                        cl_lock_error(env, lock, result);
+                cl_lock_lockdep_release(env, lock);
+        }
+        cl_lock_mutex_put(env, lock);
+        LASSERT(ergo(result == 0, lock->cll_state == CLS_HELD));
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_wait);
+
+/**
+ * Executes cl_lock_operations::clo_weigh() of all layers and sums the
+ * results to estimate lock value.
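+ *
+ * The sum saturates at ~0UL instead of wrapping around on overflow (see
+ * the over-weight check in the loop below).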
+ */
+unsigned long cl_lock_weigh(const struct lu_env *env, struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        unsigned long pound;
+        unsigned long ounce;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        pound = 0;
+        list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+                if (slice->cls_ops->clo_weigh != NULL) {
+                        ounce = slice->cls_ops->clo_weigh(env, slice);
+                        pound += ounce;
+                        if (pound < ounce) /* over-weight^Wflow */
+                                pound = ~0UL;
+                }
+        }
+        RETURN(pound);
+}
+EXPORT_SYMBOL(cl_lock_weigh);
+
+/**
+ * Notifies layers that lock description changed.
+ *
+ * The server can grant the client a lock different from the one that was
+ * requested (e.g., larger in extent). This method is called when the
+ * actually granted lock description becomes known, to let layers
+ * accommodate the changed lock description.
+ *
+ * \see cl_lock_operations::clo_modify()
+ */
+int cl_lock_modify(const struct lu_env *env, struct cl_lock *lock,
+                   const struct cl_lock_descr *desc)
+{
+        const struct cl_lock_slice *slice;
+        struct cl_object *obj = lock->cll_descr.cld_obj;
+        struct cl_object_header *hdr = cl_object_header(obj);
+        int result;
+
+        ENTRY;
+        /* don't allow object to change */
+        LASSERT(obj == desc->cld_obj);
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        list_for_each_entry_reverse(slice, &lock->cll_layers, cls_linkage) {
+                if (slice->cls_ops->clo_modify != NULL) {
+                        result = slice->cls_ops->clo_modify(env, slice, desc);
+                        if (result != 0)
+                                RETURN(result);
+                }
+        }
+        CL_LOCK_DEBUG(D_DLMTRACE, env, lock, " -> "DDESCR"@"DFID"\n",
+                      PDESCR(desc), PFID(lu_object_fid(&desc->cld_obj->co_lu)));
+        /*
+         * Just replace description in place. Nothing more is needed for
+         * now. If locks were indexed according to their extent and/or mode,
+         * that index would have to be updated here.
+         */
+        spin_lock(&hdr->coh_lock_guard);
+        lock->cll_descr = *desc;
+        spin_unlock(&hdr->coh_lock_guard);
+        RETURN(0);
+}
+EXPORT_SYMBOL(cl_lock_modify);
+
+/**
+ * Initializes lock closure with a given origin.
+ *
+ * \see cl_lock_closure
+ */
+void cl_lock_closure_init(const struct lu_env *env,
+                          struct cl_lock_closure *closure,
+                          struct cl_lock *origin, int wait)
+{
+        LINVRNT(cl_lock_is_mutexed(origin));
+        LINVRNT(cl_lock_invariant(env, origin));
+
+        CFS_INIT_LIST_HEAD(&closure->clc_list);
+        closure->clc_origin = origin;
+        closure->clc_wait = wait;
+        closure->clc_nr = 0;
+}
+EXPORT_SYMBOL(cl_lock_closure_init);
+
+/**
+ * Builds a closure of \a lock.
+ *
+ * Building of a closure consists of adding initial lock (\a lock) into it,
+ * and calling cl_lock_operations::clo_closure() methods of \a lock. These
+ * methods might call cl_lock_closure_build() recursively again, adding more
+ * locks to the closure, etc.
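+ *
+ * A sketch of the intended usage, with error handling elided (on failure,
+ * cl_lock_closure_build() itself calls cl_lock_disclosure()):
+ *
+ * \code
+ *      cl_lock_closure_init(env, closure, origin, wait);
+ *      if (cl_lock_closure_build(env, lock, closure) == 0) {
+ *              ... operate on the locks collected in the closure ...
+ *              cl_lock_disclosure(env, closure);
+ *      }
+ *      cl_lock_closure_fini(closure);
+ * \endcode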
+ *
+ * \see cl_lock_closure
+ */
+int cl_lock_closure_build(const struct lu_env *env, struct cl_lock *lock,
+                          struct cl_lock_closure *closure)
+{
+        const struct cl_lock_slice *slice;
+        int result;
+
+        ENTRY;
+        LINVRNT(cl_lock_is_mutexed(closure->clc_origin));
+        LINVRNT(cl_lock_invariant(env, closure->clc_origin));
+
+        result = cl_lock_enclosure(env, lock, closure);
+        if (result == 0) {
+                list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                        if (slice->cls_ops->clo_closure != NULL) {
+                                result = slice->cls_ops->clo_closure(env, slice,
+                                                                     closure);
+                                if (result != 0)
+                                        break;
+                        }
+                }
+        }
+        if (result != 0)
+                cl_lock_disclosure(env, closure);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_closure_build);
+
+/**
+ * Adds new lock to a closure.
+ *
+ * Try-locks \a lock and, if that succeeds, adds it to the closure (never
+ * more than once). If the try-lock fails, returns CLO_REPEAT, after
+ * optionally waiting until the next try-lock is likely to succeed.
+ */
+int cl_lock_enclosure(const struct lu_env *env, struct cl_lock *lock,
+                      struct cl_lock_closure *closure)
+{
+        int result;
+        ENTRY;
+        if (!cl_lock_mutex_try(env, lock)) {
+                /*
+                 * If lock->cll_inclosure is not empty, lock is already in
+                 * this closure.
+                 */
+                if (list_empty(&lock->cll_inclosure)) {
+                        cl_lock_get_trust(lock);
+                        lu_ref_add(&lock->cll_reference, "closure", closure);
+                        list_add(&lock->cll_inclosure, &closure->clc_list);
+                        closure->clc_nr++;
+                } else
+                        cl_lock_mutex_put(env, lock);
+                result = 0;
+        } else {
+                cl_lock_disclosure(env, closure);
+                if (closure->clc_wait) {
+                        cl_lock_get_trust(lock);
+                        lu_ref_add(&lock->cll_reference, "closure-w", closure);
+                        cl_lock_mutex_put(env, closure->clc_origin);
+
+                        LASSERT(cl_lock_nr_mutexed(env) == 0);
+                        cl_lock_mutex_get(env, lock);
+                        cl_lock_mutex_put(env, lock);
+
+                        cl_lock_mutex_get(env, closure->clc_origin);
+                        lu_ref_del(&lock->cll_reference, "closure-w", closure);
+                        cl_lock_put(env, lock);
+                }
+                result = CLO_REPEAT;
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_lock_enclosure);
+
+/** Releases mutices of enclosed locks. */
+void cl_lock_disclosure(const struct lu_env *env,
+                        struct cl_lock_closure *closure)
+{
+        struct cl_lock *scan;
+        struct cl_lock *temp;
+
+        list_for_each_entry_safe(scan, temp, &closure->clc_list, cll_inclosure){
+                list_del_init(&scan->cll_inclosure);
+                cl_lock_mutex_put(env, scan);
+                lu_ref_del(&scan->cll_reference, "closure", closure);
+                cl_lock_put(env, scan);
+                closure->clc_nr--;
+        }
+        LASSERT(closure->clc_nr == 0);
+}
+EXPORT_SYMBOL(cl_lock_disclosure);
+
+/** Finalizes a closure. */
+void cl_lock_closure_fini(struct cl_lock_closure *closure)
+{
+        LASSERT(closure->clc_nr == 0);
+        LASSERT(list_empty(&closure->clc_list));
+}
+EXPORT_SYMBOL(cl_lock_closure_fini);
+
+/**
+ * Destroys this lock. Notifies layers (bottom-to-top) that the lock is
+ * being destroyed, then destroys it. If there are holds on the lock,
+ * destruction is postponed until all holds are released. This is called
+ * when a decision is made to destroy the lock in the future, e.g., when a
+ * blocking AST is received on it, or a fatal communication error happens.
+ *
+ * Caller must have a reference on this lock to prevent a situation where
+ * the deleted lock lingers in memory indefinitely, because nobody calls
+ * cl_lock_put() to finish it.
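+ *
+ * A safe calling pattern, as used by cl_locks_prune() below (sketch only;
+ * lu_ref tracking elided):
+ *
+ * \code
+ *      cl_lock_get_trust(lock);
+ *      cl_lock_mutex_get(env, lock);
+ *      cl_lock_delete(env, lock);
+ *      cl_lock_mutex_put(env, lock);
+ *      cl_lock_put(env, lock);
+ * \endcode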
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see cl_lock_operations::clo_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_delete(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        if (lock->cll_holds == 0)
+                cl_lock_delete0(env, lock);
+        else
+                lock->cll_flags |= CLF_DOOMED;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_delete);
+
+/**
+ * Marks the lock as irrecoverably failed, and marks it for destruction.
+ * This happens when, e.g., the server fails to grant a lock to us, or a
+ * networking time-out happens.
+ *
+ * \pre atomic_read(&lock->cll_ref) > 0
+ *
+ * \see cl_lock_delete()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_error(const struct lu_env *env, struct cl_lock *lock, int error)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        if (lock->cll_error == 0 && error != 0) {
+                lock->cll_error = error;
+                cl_lock_signal(env, lock);
+                cl_lock_cancel(env, lock);
+                cl_lock_delete(env, lock);
+        }
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_error);
+
+/**
+ * Cancels this lock. Notifies layers (bottom-to-top) that the lock is
+ * being cancelled, then destroys it. If there are holds on the lock,
+ * cancellation is postponed until all holds are released.
+ *
+ * Cancellation notification is delivered to layers at most once.
+ *
+ * \see cl_lock_operations::clo_cancel()
+ * \see cl_lock::cll_holds
+ */
+void cl_lock_cancel(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        if (lock->cll_holds == 0)
+                cl_lock_cancel0(env, lock);
+        else
+                lock->cll_flags |= CLF_CANCELPEND;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_cancel);
+
+/**
+ * Finds an existing lock covering given page and optionally different from a
+ * given \a except lock.
+ */
+struct cl_lock *cl_lock_at_page(const struct lu_env *env, struct cl_object *obj,
+                                struct cl_page *page, struct cl_lock *except,
+                                int pending, int canceld)
+{
+        struct cl_object_header *head;
+        struct cl_lock *scan;
+        struct cl_lock *lock;
+        struct cl_lock_descr *need;
+
+        ENTRY;
+
+        head = cl_object_header(obj);
+        need = &cl_env_info(env)->clt_descr;
+        lock = NULL;
+
+        need->cld_mode = CLM_READ; /* CLM_READ matches both READ & WRITE, but
+                                    * not PHANTOM */
+        need->cld_start = need->cld_end = page->cp_index;
+
+        spin_lock(&head->coh_lock_guard);
+        list_for_each_entry(scan, &head->coh_locks, cll_linkage) {
+                if (scan != except &&
+                    cl_lock_ext_match(&scan->cll_descr, need) &&
+                    scan->cll_state < CLS_FREEING &&
+                    /*
+                     * This check is racy as the lock can be canceled right
+                     * after it is done, but this is fine, because page exists
+                     * already.
+                     */
+                    (canceld || !(scan->cll_flags & CLF_CANCELLED)) &&
+                    (pending || !(scan->cll_flags & CLF_CANCELPEND))) {
+                        /* Don't increase cs_hit here since this
+                         * is just a helper function. */
+                        cl_lock_get_trust(scan);
+                        lock = scan;
+                        break;
+                }
+        }
+        spin_unlock(&head->coh_lock_guard);
+        RETURN(lock);
+}
+EXPORT_SYMBOL(cl_lock_at_page);
+
+/**
+ * Returns a list of pages protected (only) by a given lock.
+ *
+ * Scans the extent of the page radix tree corresponding to \a lock, and
+ * queues all pages that are not protected by locks other than \a lock into
+ * \a queue.
+ */
+void cl_lock_page_list_fixup(const struct lu_env *env,
+                             struct cl_io *io, struct cl_lock *lock,
+                             struct cl_page_list *queue)
+{
+        struct cl_page *page;
+        struct cl_page *temp;
+        struct cl_page_list *plist = &cl_env_info(env)->clt_list;
+
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+
+        /* Now that we have a list of the cl_pages under \a lock, we need
+         * to check whether some of the pages are covered by another ldlm
+         * lock. If so, they do not need to be written out this time.
+         *
+         * For example, we have A:[0,200] & B:[100,300] PW locks on the
+         * client, and the latter is to be canceled. This means another
+         * client is reading/writing [200,300], since A won't be canceled.
+         * Actually we just need to write out the pages covered by
+         * [200,300]. This is safe, since [100,200] is also protected by
+         * lock A.
+         */
+
+        cl_page_list_init(plist);
+        cl_page_list_for_each_safe(page, temp, queue) {
+                pgoff_t idx = page->cp_index;
+                struct cl_lock *found;
+                struct cl_lock_descr *descr;
+
+                /* The algorithm relies on the pages being sorted by
+                 * ascending index. */
+                LASSERT(ergo(&temp->cp_batch != &queue->pl_pages,
+                             page->cp_index < temp->cp_index));
+
+                found = cl_lock_at_page(env, lock->cll_descr.cld_obj,
+                                        page, lock, 0, 0);
+                if (found == NULL)
+                        continue;
+
+                descr = &found->cll_descr;
+                list_for_each_entry_safe_from(page, temp, &queue->pl_pages,
+                                              cp_batch) {
+                        idx = page->cp_index;
+                        if (descr->cld_start > idx || descr->cld_end < idx)
+                                break;
+                        cl_page_list_move(plist, queue, page);
+                }
+                cl_lock_put(env, found);
+        }
+
+        /* The pages in plist are covered by other locks, don't handle them
+         * this time.
+         */
+        if (io != NULL)
+                cl_page_list_disown(env, io, plist);
+        cl_page_list_fini(env, plist);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_page_list_fixup);
+
+/**
+ * Invalidate pages protected by the given lock, sending them out to the
+ * server first, if necessary.
+ *
+ * This function does the following:
+ *
+ *     - collects a list of pages to be invalidated,
+ *
+ *     - unmaps them from the user virtual memory,
+ *
+ *     - sends dirty pages to the server,
+ *
+ *     - waits for transfer completion,
+ *
+ *     - discards pages, and throws them out of memory.
+ *
+ * If \a discard is set, pages are discarded without sending them to the
+ * server.
+ *
+ * If an error happens at any step, the process continues anyway (the
+ * reasoning behind this being that lock cancellation cannot be delayed
+ * indefinitely).
+ */ +int cl_lock_page_out(const struct lu_env *env, struct cl_lock *lock, + int discard) +{ + struct cl_thread_info *info = cl_env_info(env); + struct cl_io *io = &info->clt_io; + struct cl_2queue *queue = &info->clt_queue; + struct cl_lock_descr *descr = &lock->cll_descr; + int result; + int rc0; + int rc1; + + LINVRNT(cl_lock_invariant(env, lock)); + ENTRY; + + io->ci_obj = cl_object_top(descr->cld_obj); + result = cl_io_init(env, io, CIT_MISC, io->ci_obj); + if (result == 0) { + + cl_2queue_init(queue); + cl_page_gang_lookup(env, descr->cld_obj, io, descr->cld_start, + descr->cld_end, &queue->c2_qin); + if (queue->c2_qin.pl_nr > 0) { + result = cl_page_list_unmap(env, io, &queue->c2_qin); + if (!discard) { + rc0 = cl_io_submit_rw(env, io, + CRT_WRITE, queue); + rc1 = cl_page_list_own(env, io, + &queue->c2_qout); + result = result ?: rc0 ?: rc1; + } + cl_lock_page_list_fixup(env, io, lock, &queue->c2_qout); + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + } + cl_2queue_fini(env, queue); + } + cl_io_fini(env, io); + RETURN(result); +} +EXPORT_SYMBOL(cl_lock_page_out); + +/** + * Eliminate all locks for a given object. + * + * Caller has to guarantee that no lock is in active use. + * + * \param cancel when this is set, cl_locks_prune() cancels locks before + * destroying. + */ +void cl_locks_prune(const struct lu_env *env, struct cl_object *obj, int cancel) +{ + struct cl_object_header *head; + struct cl_lock *lock; + + ENTRY; + head = cl_object_header(obj); + /* + * If locks are destroyed without cancellation, all pages must be + * already destroyed (as otherwise they will be left unprotected). + */ + LASSERT(ergo(!cancel, + head->coh_tree.rnode == NULL && head->coh_pages == 0)); + + spin_lock(&head->coh_lock_guard); + while (!list_empty(&head->coh_locks)) { + lock = container_of(head->coh_locks.next, + struct cl_lock, cll_linkage); + cl_lock_get_trust(lock); + spin_unlock(&head->coh_lock_guard); + lu_ref_add(&lock->cll_reference, "prune", cfs_current()); + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + LASSERT(lock->cll_holds == 0); + LASSERT(lock->cll_users == 0); + if (cancel) + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + } + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, "prune", cfs_current()); + cl_lock_put(env, lock); + spin_lock(&head->coh_lock_guard); + } + spin_unlock(&head->coh_lock_guard); + EXIT; +} +EXPORT_SYMBOL(cl_locks_prune); + +/** + * Returns true if \a addr is an address of an allocated cl_lock. Used in + * assertions. This check is optimistically imprecise, i.e., it occasionally + * returns true for the incorrect addresses, but if it returns false, then the + * address is guaranteed to be incorrect. (Should be named cl_lockp().) 
+ * + * \see cl_is_page() + */ +int cl_is_lock(const void *addr) +{ + return cfs_mem_is_in_cache(addr, cl_lock_kmem); +} +EXPORT_SYMBOL(cl_is_lock); + +static struct cl_lock *cl_lock_hold_mutex(const struct lu_env *env, + const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + while (1) { + lock = cl_lock_find(env, io, need); + if (IS_ERR(lock)) + break; + cl_lock_mutex_get(env, lock); + if (lock->cll_state < CLS_FREEING) { + cl_lock_hold_mod(env, lock, +1); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + break; + } + cl_lock_mutex_put(env, lock); + cl_lock_put(env, lock); + } + RETURN(lock); +} + +/** + * Returns a lock matching \a need description with a reference and a hold on + * it. + * + * This is much like cl_lock_find(), except that cl_lock_hold() additionally + * guarantees that lock is not in the CLS_FREEING state on return. + */ +struct cl_lock *cl_lock_hold(const struct lu_env *env, const struct cl_io *io, + const struct cl_lock_descr *need, + const char *scope, const void *source) +{ + struct cl_lock *lock; + + ENTRY; + + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) + cl_lock_mutex_put(env, lock); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_hold); + +/** + * Main high-level entry point of cl_lock interface that finds existing or + * enqueues new lock matching given description. + */ +struct cl_lock *cl_lock_request(const struct lu_env *env, struct cl_io *io, + const struct cl_lock_descr *need, + __u32 enqflags, + const char *scope, const void *source) +{ + struct cl_lock *lock; + const struct lu_fid *fid; + int rc; + int iter; + int warn; + + ENTRY; + fid = lu_object_fid(&io->ci_obj->co_lu); + iter = 0; + do { + warn = iter >= 16 && IS_PO2(iter); + CDEBUG(warn ? D_WARNING : D_DLMTRACE, + DDESCR"@"DFID" %i %08x `%s'\n", + PDESCR(need), PFID(fid), iter, enqflags, scope); + lock = cl_lock_hold_mutex(env, io, need, scope, source); + if (!IS_ERR(lock)) { + rc = cl_enqueue_locked(env, lock, io, enqflags); + if (rc == 0) { + if (cl_lock_fits_into(env, lock, need, io)) { + cl_lock_mutex_put(env, lock); + cl_lock_lockdep_acquire(env, + lock, enqflags); + break; + } else if (warn) + CL_LOCK_DEBUG(D_WARNING, env, lock, + "got\n"); + cl_unuse_locked(env, lock); + } + cl_lock_hold_release(env, lock, scope, source); + cl_lock_mutex_put(env, lock); + lu_ref_del(&lock->cll_reference, scope, source); + cl_lock_put(env, lock); + lock = ERR_PTR(rc); + } else + rc = PTR_ERR(lock); + iter++; + } while (rc == 0); + RETURN(lock); +} +EXPORT_SYMBOL(cl_lock_request); + +/** + * Adds a hold to a known lock. + */ +void cl_lock_hold_add(const struct lu_env *env, struct cl_lock *lock, + const char *scope, const void *source) +{ + LINVRNT(cl_lock_is_mutexed(lock)); + LINVRNT(cl_lock_invariant(env, lock)); + LASSERT(lock->cll_state != CLS_FREEING); + + ENTRY; + cl_lock_hold_mod(env, lock, +1); + cl_lock_get(lock); + lu_ref_add(&lock->cll_holders, scope, source); + lu_ref_add(&lock->cll_reference, scope, source); + EXIT; +} +EXPORT_SYMBOL(cl_lock_hold_add); + +/** + * Releases a hold and a reference on a lock, on which caller acquired a + * mutex. 
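+ *
+ * Note that, unlike cl_lock_release(), this function does not take the
+ * lock mutex internally: the caller must already own it, and
+ * cl_lock_release() below is the variant that acquires the mutex itself.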
+ */
+void cl_lock_unhold(const struct lu_env *env, struct cl_lock *lock,
+                    const char *scope, const void *source)
+{
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        cl_lock_hold_release(env, lock, scope, source);
+        lu_ref_del(&lock->cll_reference, scope, source);
+        cl_lock_put(env, lock);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_unhold);
+
+/**
+ * Releases a hold and a reference on a lock, obtained by cl_lock_hold().
+ */
+void cl_lock_release(const struct lu_env *env, struct cl_lock *lock,
+                     const char *scope, const void *source)
+{
+        LINVRNT(cl_lock_invariant(env, lock));
+        ENTRY;
+        cl_lock_mutex_get(env, lock);
+        cl_lock_hold_release(env, lock, scope, source);
+        cl_lock_mutex_put(env, lock);
+        lu_ref_del(&lock->cll_reference, scope, source);
+        cl_lock_put(env, lock);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_release);
+
+void cl_lock_user_add(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+
+        ENTRY;
+        cl_lock_used_mod(env, lock, +1);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lock_user_add);
+
+int cl_lock_user_del(const struct lu_env *env, struct cl_lock *lock)
+{
+        LINVRNT(cl_lock_is_mutexed(lock));
+        LINVRNT(cl_lock_invariant(env, lock));
+        LASSERT(lock->cll_users > 0);
+
+        ENTRY;
+        cl_lock_used_mod(env, lock, -1);
+        RETURN(lock->cll_users == 0);
+}
+EXPORT_SYMBOL(cl_lock_user_del);
+
+/**
+ * Checks whether the modes of two locks are compatible.
+ *
+ * This returns true iff en-queuing \a lock2 won't cause cancellation of \a
+ * lock1 even when these locks overlap.
+ */
+int cl_lock_compatible(const struct cl_lock *lock1, const struct cl_lock *lock2)
+{
+        enum cl_lock_mode mode1;
+        enum cl_lock_mode mode2;
+
+        ENTRY;
+        mode1 = lock1->cll_descr.cld_mode;
+        mode2 = lock2->cll_descr.cld_mode;
+        RETURN(mode2 == CLM_PHANTOM ||
+               (mode1 == CLM_READ && mode2 == CLM_READ));
+}
+EXPORT_SYMBOL(cl_lock_compatible);
+
+const char *cl_lock_mode_name(const enum cl_lock_mode mode)
+{
+        static const char *names[] = {
+                [CLM_PHANTOM] = "PHANTOM",
+                [CLM_READ]    = "READ",
+                [CLM_WRITE]   = "WRITE"
+        };
+        if (0 <= mode && mode < ARRAY_SIZE(names))
+                return names[mode];
+        else
+                return "UNKNW";
+}
+EXPORT_SYMBOL(cl_lock_mode_name);
+
+/**
+ * Prints human readable representation of a lock description.
+ */
+void cl_lock_descr_print(const struct lu_env *env, void *cookie,
+                         lu_printer_t printer,
+                         const struct cl_lock_descr *descr)
+{
+        const struct lu_fid *fid;
+
+        fid = lu_object_fid(&descr->cld_obj->co_lu);
+        (*printer)(env, cookie, DDESCR"@"DFID, PDESCR(descr), PFID(fid));
+}
+EXPORT_SYMBOL(cl_lock_descr_print);
+
+/**
+ * Prints human readable representation of \a lock via \a printer.
+ */
+void cl_lock_print(const struct lu_env *env, void *cookie,
+                   lu_printer_t printer, const struct cl_lock *lock)
+{
+        const struct cl_lock_slice *slice;
+        (*printer)(env, cookie, "lock@%p[%d %d %d %d %d %08lx] ",
+                   lock, atomic_read(&lock->cll_ref),
+                   lock->cll_state, lock->cll_error, lock->cll_holds,
+                   lock->cll_users, lock->cll_flags);
+        cl_lock_descr_print(env, cookie, printer, &lock->cll_descr);
+        (*printer)(env, cookie, " {\n");
+
+        list_for_each_entry(slice, &lock->cll_layers, cls_linkage) {
+                (*printer)(env, cookie, "    %s@%p: ",
+                           slice->cls_obj->co_lu.lo_dev->ld_type->ldt_name,
+                           slice);
+                if (slice->cls_ops->clo_print != NULL)
+                        slice->cls_ops->clo_print(env, cookie, printer, slice);
+                (*printer)(env, cookie, "\n");
+        }
+        (*printer)(env, cookie, "} lock@%p\n", lock);
+}
+EXPORT_SYMBOL(cl_lock_print);
+
+int cl_lock_init(void)
+{
+        return lu_kmem_init(cl_lock_caches);
+}
+
+void cl_lock_fini(void)
+{
+        lu_kmem_fini(cl_lock_caches);
+}
diff --git a/lustre/obdclass/cl_object.c b/lustre/obdclass/cl_object.c
new file mode 100644
index 0000000..0310b75
--- /dev/null
+++ b/lustre/obdclass/cl_object.c
@@ -0,0 +1,1077 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * Client Lustre Object.
+ *
+ * Author: Nikita Danilov
+ */
+
+/*
+ * Locking.
+ *
+ *  i_mutex
+ *      PG_locked
+ *          ->coh_page_guard
+ *          ->coh_lock_guard
+ *          ->coh_attr_guard
+ *          ->ls_guard
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include <libcfs/libcfs.h>
+/* class_put_type() */
+#include <obd_class.h>
+#include <obd_support.h>
+#include <lustre_fid.h>
+#include <linux/list.h>
+/* lu_time_global_{init,fini}() */
+#include <lu_time.h>
+
+#include <cl_object.h>
+#include "cl_internal.h"
+
+static cfs_mem_cache_t *cl_env_kmem;
+
+/** Lock class of cl_object_header::coh_page_guard */
+static struct lock_class_key cl_page_guard_class;
+/** Lock class of cl_object_header::coh_lock_guard */
+static struct lock_class_key cl_lock_guard_class;
+/** Lock class of cl_object_header::coh_attr_guard */
+static struct lock_class_key cl_attr_guard_class;
+
+/**
+ * Initialize cl_object_header.
+ */
+int cl_object_header_init(struct cl_object_header *h)
+{
+        int result;
+
+        ENTRY;
+        result = lu_object_header_init(&h->coh_lu);
+        if (result == 0) {
+                spin_lock_init(&h->coh_page_guard);
+                spin_lock_init(&h->coh_lock_guard);
+                spin_lock_init(&h->coh_attr_guard);
+                lockdep_set_class(&h->coh_page_guard, &cl_page_guard_class);
+                lockdep_set_class(&h->coh_lock_guard, &cl_lock_guard_class);
+                lockdep_set_class(&h->coh_attr_guard, &cl_attr_guard_class);
+                h->coh_pages = 0;
+                /* XXX hard coded GFP_* mask. */
+                INIT_RADIX_TREE(&h->coh_tree, GFP_ATOMIC);
+                CFS_INIT_LIST_HEAD(&h->coh_locks);
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_header_init);
+
+/**
+ * Finalize cl_object_header.
+ */
+void cl_object_header_fini(struct cl_object_header *h)
+{
+        LASSERT(list_empty(&h->coh_locks));
+        lu_object_header_fini(&h->coh_lu);
+}
+EXPORT_SYMBOL(cl_object_header_fini);
+
+/**
+ * Returns a cl_object with a given \a fid.
+ *
+ * Returns either cached or newly created object. Additional reference on the
+ * returned object is acquired.
+ *
+ * \see lu_object_find(), cl_page_find(), cl_lock_find()
+ */
+struct cl_object *cl_object_find(const struct lu_env *env,
+                                 struct cl_device *cd, const struct lu_fid *fid,
+                                 const struct cl_object_conf *c)
+{
+        might_sleep();
+        return lu2cl(lu_object_find_slice(env, cl2lu_dev(cd), fid, &c->coc_lu));
+}
+EXPORT_SYMBOL(cl_object_find);
+
+/**
+ * Releases a reference on \a o.
+ *
+ * When last reference is released object is returned to the cache, unless
+ * lu_object_header_flags::LU_OBJECT_HEARD_BANSHEE bit is set in its header.
+ *
+ * \see cl_page_put(), cl_lock_put().
+ */
+void cl_object_put(const struct lu_env *env, struct cl_object *o)
+{
+        lu_object_put(env, &o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_put);
+
+/**
+ * Acquire an additional reference to the object \a o.
+ *
+ * This can only be used to acquire _additional_ reference, i.e., caller
+ * already has to possess at least one reference to \a o before calling this.
+ *
+ * \see cl_page_get(), cl_lock_get().
+ */
+void cl_object_get(struct cl_object *o)
+{
+        lu_object_get(&o->co_lu);
+}
+EXPORT_SYMBOL(cl_object_get);
+
+/**
+ * Returns the top-object for a given \a o.
+ *
+ * \see cl_page_top(), cl_io_top()
+ */
+struct cl_object *cl_object_top(struct cl_object *o)
+{
+        struct cl_object_header *hdr = cl_object_header(o);
+        struct cl_object *top;
+
+        while (hdr->coh_parent != NULL)
+                hdr = hdr->coh_parent;
+
+        top = lu2cl(lu_object_top(&hdr->coh_lu));
+        CDEBUG(D_TRACE, "%p -> %p\n", o, top);
+        return top;
+}
+EXPORT_SYMBOL(cl_object_top);
+
+/**
+ * Returns pointer to the lock protecting data-attributes for the given object
+ * \a o.
+ *
+ * Data-attributes are protected by the cl_object_header::coh_attr_guard
+ * spin-lock in the top-object.
+ *
+ * \see cl_attr, cl_object_attr_lock(), cl_object_operations::coo_attr_get().
+ */
+static spinlock_t *cl_object_attr_guard(struct cl_object *o)
+{
+        return &cl_object_header(cl_object_top(o))->coh_attr_guard;
+}
+
+/**
+ * Locks data-attributes.
+ *
+ * Prevents data-attributes from changing, until lock is released by
+ * cl_object_attr_unlock(). This has to be called before calls to
+ * cl_object_attr_get(), cl_object_attr_set().
+ */
+void cl_object_attr_lock(struct cl_object *o)
+{
+        spin_lock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_lock);
+
+/**
+ * Releases data-attributes lock, acquired by cl_object_attr_lock().
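+ *
+ * A typical attribute access sequence, as a sketch (\a obj, \a attr and
+ * error handling are assumed to come from the caller):
+ *
+ * \code
+ *      cl_object_attr_lock(obj);
+ *      rc = cl_object_attr_get(env, obj, attr);
+ *      cl_object_attr_unlock(obj);
+ * \endcode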
+ */
+void cl_object_attr_unlock(struct cl_object *o)
+{
+        spin_unlock(cl_object_attr_guard(o));
+}
+EXPORT_SYMBOL(cl_object_attr_unlock);
+
+/**
+ * Returns data-attributes of an object \a obj.
+ *
+ * Every layer is asked (by calling cl_object_operations::coo_attr_get())
+ * top-to-bottom to fill in parts of \a attr that this layer is responsible
+ * for.
+ */
+int cl_object_attr_get(const struct lu_env *env, struct cl_object *obj,
+                       struct cl_attr *attr)
+{
+        struct lu_object_header *top;
+        int result;
+
+        LASSERT_SPIN_LOCKED(cl_object_attr_guard(obj));
+        ENTRY;
+
+        top = obj->co_lu.lo_header;
+        result = 0;
+        list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+                if (obj->co_ops->coo_attr_get != NULL) {
+                        result = obj->co_ops->coo_attr_get(env, obj, attr);
+                        if (result != 0) {
+                                if (result > 0)
+                                        result = 0;
+                                break;
+                        }
+                }
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_get);
+
+/**
+ * Updates data-attributes of an object \a obj.
+ *
+ * Only attributes mentioned in the validness bit-mask \a v are
+ * updated. Calls cl_object_operations::coo_attr_set() on every layer,
+ * bottom to top.
+ */
+int cl_object_attr_set(const struct lu_env *env, struct cl_object *obj,
+                       const struct cl_attr *attr, unsigned v)
+{
+        struct lu_object_header *top;
+        int result;
+
+        LASSERT_SPIN_LOCKED(cl_object_attr_guard(obj));
+        ENTRY;
+
+        top = obj->co_lu.lo_header;
+        result = 0;
+        list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) {
+                if (obj->co_ops->coo_attr_set != NULL) {
+                        result = obj->co_ops->coo_attr_set(env, obj, attr, v);
+                        if (result != 0) {
+                                if (result > 0)
+                                        result = 0;
+                                break;
+                        }
+                }
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_attr_set);
+
+/**
+ * Notifies layers (bottom-to-top) that glimpse AST was received.
+ *
+ * Layers have to fill \a lvb fields with information that will be shipped
+ * back to glimpse issuer.
+ *
+ * \see cl_lock_operations::clo_glimpse()
+ */
+int cl_object_glimpse(const struct lu_env *env, struct cl_object *obj,
+                      struct ost_lvb *lvb)
+{
+        struct lu_object_header *top;
+        int result;
+
+        ENTRY;
+        top = obj->co_lu.lo_header;
+        result = 0;
+        list_for_each_entry_reverse(obj, &top->loh_layers, co_lu.lo_linkage) {
+                if (obj->co_ops->coo_glimpse != NULL) {
+                        result = obj->co_ops->coo_glimpse(env, obj, lvb);
+                        if (result != 0)
+                                break;
+                }
+        }
+        LU_OBJECT_HEADER(D_DLMTRACE, env, lu_object_top(top),
+                         "size: "LPU64" mtime: "LPU64" atime: "LPU64" "
+                         "ctime: "LPU64" blocks: "LPU64"\n",
+                         lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime,
+                         lvb->lvb_ctime, lvb->lvb_blocks);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_object_glimpse);
+
+/**
+ * Updates a configuration of an object \a obj.
+ */
+int cl_conf_set(const struct lu_env *env, struct cl_object *obj,
+                const struct cl_object_conf *conf)
+{
+        struct lu_object_header *top;
+        int result;
+
+        ENTRY;
+        top = obj->co_lu.lo_header;
+        result = 0;
+        list_for_each_entry(obj, &top->loh_layers, co_lu.lo_linkage) {
+                if (obj->co_ops->coo_conf_set != NULL) {
+                        result = obj->co_ops->coo_conf_set(env, obj, conf);
+                        if (result != 0)
+                                break;
+                }
+        }
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_conf_set);
+
+/**
+ * Helper function removing all object locks, and marking object for
+ * deletion. All object pages must have been deleted at this point.
+ *
+ * This is called by cl_inode_fini() and lov_object_delete() to destroy top-
+ * and sub- objects respectively.
+ */
+void cl_object_kill(const struct lu_env *env, struct cl_object *obj)
+{
+        struct cl_object_header *hdr;
+
+        hdr = cl_object_header(obj);
+        LASSERT(hdr->coh_tree.rnode == NULL);
+        LASSERT(hdr->coh_pages == 0);
+
+        set_bit(LU_OBJECT_HEARD_BANSHEE, &hdr->coh_lu.loh_flags);
+        /*
+         * Destroy all locks. Object destruction (including cl_inode_fini())
+         * cannot cancel the locks, because in the case of a local client,
+         * where client and server share the same thread running
+         * prune_icache(), this can dead-lock with ldlm_cancel_handler()
+         * waiting on __wait_on_freeing_inode().
+         */
+        cl_locks_prune(env, obj, 0);
+}
+EXPORT_SYMBOL(cl_object_kill);
+
+/**
+ * Prunes caches of pages and locks for this object.
+ */
+void cl_object_prune(const struct lu_env *env, struct cl_object *obj)
+{
+        ENTRY;
+        cl_pages_prune(env, obj);
+        cl_locks_prune(env, obj, 1);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_object_prune);
+
+void cache_stats_init(struct cache_stats *cs, const char *name)
+{
+        cs->cs_name = name;
+        atomic_set(&cs->cs_lookup, 0);
+        atomic_set(&cs->cs_hit, 0);
+        atomic_set(&cs->cs_total, 0);
+        atomic_set(&cs->cs_busy, 0);
+        atomic_set(&cs->cs_created, 0);
+}
+
+int cache_stats_print(const struct cache_stats *cs,
+                      char *page, int count, int h)
+{
+        int nob = 0;
+/*
+       lookup    hit  total   busy create
+env: ...... ...... ...... ...... ......
+*/
+        if (h)
+                nob += snprintf(page, count,
+                                "       lookup    hit  total   busy create\n");
+
+        nob += snprintf(page + nob, count - nob,
+                        "%5.5s: %6u %6u %6u %6u %6u",
+                        cs->cs_name,
+                        atomic_read(&cs->cs_lookup),
+                        atomic_read(&cs->cs_hit),
+                        atomic_read(&cs->cs_total),
+                        atomic_read(&cs->cs_busy),
+                        atomic_read(&cs->cs_created));
+        return nob;
+}
+
+/**
+ * Initialize client site.
+ *
+ * Perform common initialization (lu_site_init()), and initialize statistical
+ * counters. Also perform global initializations on the first call.
+ */
+int cl_site_init(struct cl_site *s, struct cl_device *d)
+{
+        int i;
+        int result;
+
+        result = lu_site_init(&s->cs_lu, &d->cd_lu_dev);
+        if (result == 0) {
+                cache_stats_init(&s->cs_pages, "pages");
+                cache_stats_init(&s->cs_locks, "locks");
+                for (i = 0; i < ARRAY_SIZE(s->cs_pages_state); ++i)
+                        atomic_set(&s->cs_pages_state[i], 0);
+                for (i = 0; i < ARRAY_SIZE(s->cs_locks_state); ++i)
+                        atomic_set(&s->cs_locks_state[i], 0);
+        }
+        return result;
+}
+EXPORT_SYMBOL(cl_site_init);
+
+/**
+ * Finalize client site. Dual to cl_site_init().
+ */
+void cl_site_fini(struct cl_site *s)
+{
+        lu_site_fini(&s->cs_lu);
+}
+EXPORT_SYMBOL(cl_site_fini);
+
+static struct cache_stats cl_env_stats = {
+        .cs_name    = "envs",
+        .cs_created = ATOMIC_INIT(0),
+        .cs_lookup  = ATOMIC_INIT(0),
+        .cs_hit     = ATOMIC_INIT(0),
+        .cs_total   = ATOMIC_INIT(0),
+        .cs_busy    = ATOMIC_INIT(0)
+};
+
+/**
+ * Outputs client site statistical counters into a buffer. Suitable for
+ * ll_rd_*()-style functions.
+ */
+int cl_site_stats_print(const struct cl_site *site, char *page, int count)
+{
+        int nob;
+        int i;
+        static const char *pstate[] = {
+                [CPS_CACHED]  = "c",
+                [CPS_OWNED]   = "o",
+                [CPS_PAGEOUT] = "w",
+                [CPS_PAGEIN]  = "r",
+                [CPS_FREEING] = "f"
+        };
+        static const char *lstate[] = {
+                [CLS_NEW]       = "n",
+                [CLS_QUEUING]   = "q",
+                [CLS_ENQUEUED]  = "e",
+                [CLS_HELD]      = "h",
+                [CLS_UNLOCKING] = "u",
+                [CLS_CACHED]    = "c",
+                [CLS_FREEING]   = "f"
+        };
+/*
+       lookup    hit  total   busy create
+pages: ...... ...... ...... ...... ...... [...... ...... ...... ......]
+locks: ...... ...... ...... ...... ...... [...... ...... ...... ...... ......]
+  env: ...... ...... ...... ...... ......
+ */ + nob = lu_site_stats_print(&site->cs_lu, page, count); + nob += cache_stats_print(&site->cs_pages, page + nob, count - nob, 1); + nob += snprintf(page + nob, count - nob, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_pages_state); ++i) + nob += snprintf(page + nob, count - nob, "%s: %u ", + pstate[i], + atomic_read(&site->cs_pages_state[i])); + nob += snprintf(page + nob, count - nob, "]\n"); + nob += cache_stats_print(&site->cs_locks, page + nob, count - nob, 0); + nob += snprintf(page + nob, count - nob, " ["); + for (i = 0; i < ARRAY_SIZE(site->cs_locks_state); ++i) + nob += snprintf(page + nob, count - nob, "%s: %u ", + lstate[i], + atomic_read(&site->cs_locks_state[i])); + nob += snprintf(page + nob, count - nob, "]\n"); + nob += cache_stats_print(&cl_env_stats, page + nob, count - nob, 0); + nob += snprintf(page + nob, count - nob, "\n"); + return nob; +} +EXPORT_SYMBOL(cl_site_stats_print); + +/***************************************************************************** + * + * lu_env handling on client. + * + */ + +/* + * TBD: Description. + * + * XXX: this assumes that re-entrant file system calls (e.g., ->writepage()) + * do not modify already existing current->journal_info. + */ + +static CFS_LIST_HEAD(cl_envs); +static unsigned cl_envs_cached_nr = 0; +static unsigned cl_envs_cached_max = 128; /* XXX: prototype: arbitrary limit + * for now. */ +static spinlock_t cl_envs_guard = SPIN_LOCK_UNLOCKED; + +struct cl_env { + void *ce_magic; + struct lu_env ce_lu; + struct lu_context ce_ses; + /* + * Linkage into global list of all client environments. Used for + * garbage collection. + */ + struct list_head ce_linkage; + /* + * + */ + int ce_ref; + void *ce_prev; + /* + * Debugging field: address of the caller who made original + * allocation. 
+ */ + void *ce_debug; + void *ce_owner; +}; + +#define CL_ENV_INC(counter) atomic_inc(&cl_env_stats.counter) + +#define CL_ENV_DEC(counter) \ + do { \ + LASSERT(atomic_read(&cl_env_stats.counter) > 0); \ + atomic_dec(&cl_env_stats.counter); \ + } while (0) + +static void cl_env_init0(struct cl_env *cle, void *debug) +{ + LASSERT(cle->ce_ref == 0); + LASSERT(cle->ce_magic == &cl_env_init0); + LASSERT(cle->ce_debug == NULL && cle->ce_owner == NULL); + + cle->ce_ref = 1; + cle->ce_prev = current->journal_info; + cle->ce_debug = debug; + cle->ce_owner = current; + current->journal_info = cle; + CL_ENV_INC(cs_busy); +} + +static struct lu_env *cl_env_new(__u32 tags, void *debug) +{ + struct lu_env *env; + struct cl_env *cle; + + OBD_SLAB_ALLOC_PTR(cle, cl_env_kmem); + if (cle != NULL) { + int rc; + + CFS_INIT_LIST_HEAD(&cle->ce_linkage); + cle->ce_magic = &cl_env_init0; + env = &cle->ce_lu; + rc = lu_env_init(env, LCT_CL_THREAD|tags); + if (rc == 0) { + rc = lu_context_init(&cle->ce_ses, LCT_SESSION|tags); + if (rc == 0) { + lu_context_enter(&cle->ce_ses); + env->le_ses = &cle->ce_ses; + cl_env_init0(cle, debug); + } else + lu_env_fini(env); + } + if (rc != 0) { + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); + env = ERR_PTR(rc); + } else { + CL_ENV_INC(cs_created); + CL_ENV_INC(cs_total); + } + } else + env = ERR_PTR(-ENOMEM); + return env; +} + +static void cl_env_fini(struct cl_env *cle) +{ + CL_ENV_DEC(cs_total); + lu_context_fini(&cle->ce_lu.le_ctx); + lu_context_fini(&cle->ce_ses); + OBD_SLAB_FREE_PTR(cle, cl_env_kmem); +} + +static struct lu_env *cl_env_obtain(void *debug) +{ + struct cl_env *cle; + struct lu_env *env; + + ENTRY; + spin_lock(&cl_envs_guard); + LASSERT(equi(cl_envs_cached_nr == 0, list_empty(&cl_envs))); + if (cl_envs_cached_nr > 0) { + int rc; + + cle = container_of(cl_envs.next, struct cl_env, ce_linkage); + list_del_init(&cle->ce_linkage); + cl_envs_cached_nr--; + spin_unlock(&cl_envs_guard); + + env = &cle->ce_lu; + rc = lu_env_refill(env); + if (rc == 0) { + cl_env_init0(cle, debug); + lu_context_enter(&env->le_ctx); + lu_context_enter(&cle->ce_ses); + } else { + cl_env_fini(cle); + env = ERR_PTR(rc); + } + } else { + spin_unlock(&cl_envs_guard); + env = cl_env_new(0, debug); + } + RETURN(env); +} + +static inline struct cl_env *cl_env_container(struct lu_env *env) +{ + return container_of(env, struct cl_env, ce_lu); +} + +struct lu_env *cl_env_peek(int *refcheck) +{ + struct lu_env *env; + struct cl_env *cle; + + CL_ENV_INC(cs_lookup); + + /* check that we don't go far from untrusted pointer */ + CLASSERT(offsetof(struct cl_env, ce_magic) == 0); + + env = NULL; + cle = current->journal_info; + if (cle != NULL && cle->ce_magic == &cl_env_init0) { + CL_ENV_INC(cs_hit); + env = &cle->ce_lu; + *refcheck = ++cle->ce_ref; + } + CDEBUG(D_OTHER, "%i@%p\n", cle ? cle->ce_ref : 0, cle); + return env; +} +EXPORT_SYMBOL(cl_env_peek); + +/** + * Returns lu_env: if there already is an environment associated with the + * current thread, it is returned, otherwise, new environment is allocated. + * + * Allocations are amortized through the global cache of environments. + * + * \param refcheck pointer to a counter used to detect environment leaks. In + * the usual case cl_env_get() and cl_env_put() are called in the same lexical + * scope and pointer to the same integer is passed as \a refcheck. This is + * used to detect missed cl_env_put(). 
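+ *
+ * A minimal usage sketch (illustrative only; the surrounding function and
+ * its error handling are assumed, not prescribed by this interface):
+ *
+ * \code
+ * struct lu_env *env;
+ * int            refcheck;
+ *
+ * env = cl_env_get(&refcheck);
+ * if (IS_ERR(env))
+ *         RETURN(PTR_ERR(env));
+ * ... use env ...
+ * cl_env_put(env, &refcheck);
+ * \endcode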
+ *
+ * \see cl_env_put()
+ */
+struct lu_env *cl_env_get(int *refcheck)
+{
+        struct lu_env *env;
+
+        env = cl_env_peek(refcheck);
+        if (env == NULL) {
+                env = cl_env_obtain(__builtin_return_address(0));
+                if (!IS_ERR(env)) {
+                        struct cl_env *cle;
+
+                        cle = cl_env_container(env);
+                        *refcheck = cle->ce_ref;
+                        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+                }
+        }
+        return env;
+}
+EXPORT_SYMBOL(cl_env_get);
+
+/**
+ * Forces an allocation of a fresh environment with given tags.
+ *
+ * \see cl_env_get()
+ */
+struct lu_env *cl_env_alloc(int *refcheck, __u32 tags)
+{
+        struct lu_env *env;
+
+        LASSERT(cl_env_peek(refcheck) == NULL);
+        env = cl_env_new(tags, __builtin_return_address(0));
+        if (!IS_ERR(env)) {
+                struct cl_env *cle;
+
+                cle = cl_env_container(env);
+                *refcheck = cle->ce_ref;
+                CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+        }
+        return env;
+}
+EXPORT_SYMBOL(cl_env_alloc);
+
+static void cl_env_exit(struct cl_env *cle)
+{
+        lu_context_exit(&cle->ce_lu.le_ctx);
+        lu_context_exit(&cle->ce_ses);
+}
+
+/**
+ * Finalizes and frees a given number of cached environments. This is done to
+ * (1) free some memory (not currently hooked into VM), or (2) release
+ * references to modules.
+ */
+unsigned cl_env_cache_purge(unsigned nr)
+{
+        struct cl_env *cle;
+
+        ENTRY;
+        spin_lock(&cl_envs_guard);
+        for (; !list_empty(&cl_envs) && nr > 0; --nr) {
+                cle = container_of(cl_envs.next, struct cl_env, ce_linkage);
+                list_del_init(&cle->ce_linkage);
+                LASSERT(cl_envs_cached_nr > 0);
+                cl_envs_cached_nr--;
+                spin_unlock(&cl_envs_guard);
+
+                cl_env_fini(cle);
+                spin_lock(&cl_envs_guard);
+        }
+        LASSERT(equi(cl_envs_cached_nr == 0, list_empty(&cl_envs)));
+        spin_unlock(&cl_envs_guard);
+        RETURN(nr);
+}
+EXPORT_SYMBOL(cl_env_cache_purge);
+
+/**
+ * Release an environment.
+ *
+ * Decrements the \a env reference counter. When the counter drops to 0,
+ * nothing in this thread is using the environment any longer, and it is
+ * either returned to the allocation cache or, if the cache is already full,
+ * freed immediately.
+ */
+void cl_env_put(struct lu_env *env, int *refcheck)
+{
+        struct cl_env *cle;
+
+        cle = cl_env_container(env);
+
+        LASSERT(cle->ce_ref > 0);
+        LASSERT(ergo(refcheck != NULL, cle->ce_ref == *refcheck));
+
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+        if (--cle->ce_ref == 0) {
+                CL_ENV_DEC(cs_busy);
+                current->journal_info = cle->ce_prev;
+                LASSERT(cle->ce_prev == NULL ||
+                        cl_env_container(cle->ce_prev)->ce_magic !=
+                        &cl_env_init0);
+                cle->ce_debug = NULL;
+                cle->ce_owner = NULL;
+                cl_env_exit(cle);
+                /*
+                 * Don't bother to take a lock here.
+                 *
+                 * Return environment to the cache only when it was allocated
+                 * with the standard tags.
+                 */
+                if (cl_envs_cached_nr < cl_envs_cached_max &&
+                    (env->le_ctx.lc_tags & ~LCT_HAS_EXIT) == LCT_CL_THREAD &&
+                    (env->le_ses->lc_tags & ~LCT_HAS_EXIT) == LCT_SESSION) {
+                        spin_lock(&cl_envs_guard);
+                        list_add(&cle->ce_linkage, &cl_envs);
+                        cl_envs_cached_nr++;
+                        spin_unlock(&cl_envs_guard);
+                } else
+                        cl_env_fini(cle);
+        }
+}
+EXPORT_SYMBOL(cl_env_put);
+
+/**
+ * Declares a point of re-entrancy.
+ *
+ * In the Linux kernel, environments are attached to the thread through the
+ * current->journal_info pointer, which is also used by other sub-systems.
+ * When Lustre code is invoked in a situation where current->journal_info is
+ * potentially already set, cl_env_reenter() is called to save the
+ * current->journal_info value, so that the field can be used to store a
+ * pointer to the environment.
+ *
+ * \see cl_env_reexit()
+ */
+void *cl_env_reenter(void)
+{
+        void *cookie;
+
+        cookie = current->journal_info;
+        current->journal_info = NULL;
+        CDEBUG(D_OTHER, "cookie: %p\n", cookie);
+        return cookie;
+}
+EXPORT_SYMBOL(cl_env_reenter);
+
+/**
+ * Exits re-entrancy.
+ *
+ * This restores the old value of current->journal_info that was saved by
+ * cl_env_reenter().
+ */
+void cl_env_reexit(void *cookie)
+{
+        current->journal_info = cookie;
+        CDEBUG(D_OTHER, "cookie: %p\n", cookie);
+}
+EXPORT_SYMBOL(cl_env_reexit);
+
+/**
+ * Sets up the user-supplied \a env as the current environment. This is used
+ * to guarantee that an environment exists even when cl_env_get() fails. It
+ * is up to the user to ensure proper concurrency control.
+ *
+ * \see cl_env_unplant()
+ */
+void cl_env_implant(struct lu_env *env, int *refcheck)
+{
+        struct cl_env *cle = cl_env_container(env);
+
+        LASSERT(current->journal_info == NULL);
+        LASSERT(cle->ce_ref > 0);
+
+        current->journal_info = cle;
+        cl_env_get(refcheck);
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+}
+EXPORT_SYMBOL(cl_env_implant);
+
+/**
+ * Detach environment installed earlier by cl_env_implant().
+ */
+void cl_env_unplant(struct lu_env *env, int *refcheck)
+{
+        struct cl_env *cle = cl_env_container(env);
+
+        LASSERT(cle == current->journal_info);
+        LASSERT(cle->ce_ref > 1);
+
+        CDEBUG(D_OTHER, "%i@%p\n", cle->ce_ref, cle);
+
+        cl_env_put(env, refcheck);
+        current->journal_info = NULL;
+}
+EXPORT_SYMBOL(cl_env_unplant);
+
+struct lu_env *cl_env_nested_get(struct cl_env_nest *nest)
+{
+        struct lu_env *env;
+
+        nest->cen_cookie = NULL;
+        env = cl_env_peek(&nest->cen_refcheck);
+        if (env != NULL) {
+                if (!cl_io_is_going(env))
+                        return env;
+                else {
+                        cl_env_put(env, &nest->cen_refcheck);
+                        nest->cen_cookie = cl_env_reenter();
+                }
+        }
+        env = cl_env_get(&nest->cen_refcheck);
+        LASSERT(ergo(!IS_ERR(env), !cl_io_is_going(env)));
+        return env;
+}
+EXPORT_SYMBOL(cl_env_nested_get);
+
+void cl_env_nested_put(struct cl_env_nest *nest, struct lu_env *env)
+{
+        cl_env_put(env, &nest->cen_refcheck);
+        cl_env_reexit(nest->cen_cookie);
+}
+EXPORT_SYMBOL(cl_env_nested_put);
+
+/**
+ * Converts struct cl_attr to struct ost_lvb.
+ *
+ * \see cl_lvb2attr
+ */
+void cl_attr2lvb(struct ost_lvb *lvb, const struct cl_attr *attr)
+{
+        ENTRY;
+        lvb->lvb_size   = attr->cat_size;
+        lvb->lvb_mtime  = attr->cat_mtime;
+        lvb->lvb_atime  = attr->cat_atime;
+        lvb->lvb_ctime  = attr->cat_ctime;
+        lvb->lvb_blocks = attr->cat_blocks;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_attr2lvb);
+
+/**
+ * Converts struct ost_lvb to struct cl_attr.
+ *
+ * \see cl_attr2lvb
+ */
+void cl_lvb2attr(struct cl_attr *attr, const struct ost_lvb *lvb)
+{
+        ENTRY;
+        attr->cat_size   = lvb->lvb_size;
+        attr->cat_mtime  = lvb->lvb_mtime;
+        attr->cat_atime  = lvb->lvb_atime;
+        attr->cat_ctime  = lvb->lvb_ctime;
+        attr->cat_blocks = lvb->lvb_blocks;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_lvb2attr);
+
+
+/*****************************************************************************
+ *
+ * Temporary prototype thing: mirror obd-devices into cl devices.
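+ *
+ * An illustrative call to cl_type_setup() (a minimal sketch; the concrete
+ * device type, e.g. vvp_device_type, and the next device are supplied by
+ * the caller, not mandated here):
+ *
+ *      cl_dev = cl_type_setup(env, site, &vvp_device_type, next);
+ *      if (IS_ERR(cl_dev))
+ *              ... bail out ...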
+ * + */ + +struct cl_device *cl_type_setup(const struct lu_env *env, struct lu_site *site, + struct lu_device_type *ldt, + struct lu_device *next) +{ + const char *typename; + struct lu_device *d; + + LASSERT(ldt != NULL); + + typename = ldt->ldt_name; + d = ldt->ldt_ops->ldto_device_alloc(env, ldt, NULL); + if (!IS_ERR(d)) { + int rc; + + if (site != NULL) + d->ld_site = site; + rc = ldt->ldt_ops->ldto_device_init(env, d, typename, next); + if (rc == 0) { + lu_device_get(d); + lu_ref_add(&d->ld_reference, + "lu-stack", &lu_site_init); + } else { + ldt->ldt_ops->ldto_device_free(env, d); + CERROR("can't init device '%s', %d\n", typename, rc); + d = ERR_PTR(rc); + } + } else + CERROR("Cannot allocate device: '%s'\n", typename); + return lu2cl_dev(d); +} +EXPORT_SYMBOL(cl_type_setup); + +/** + * Finalize device stack by calling lu_stack_fini(). + */ +void cl_stack_fini(const struct lu_env *env, struct cl_device *cl) +{ + lu_stack_fini(env, cl2lu_dev(cl)); +} +EXPORT_SYMBOL(cl_stack_fini); + +int cl_lock_init(void); +void cl_lock_fini(void); + +int cl_page_init(void); +void cl_page_fini(void); + +static struct lu_context_key cl_key; + +struct cl_thread_info *cl_env_info(const struct lu_env *env) +{ + return lu_context_key_get(&env->le_ctx, &cl_key); +} + +/* defines cl0_key_{init,fini}() */ +LU_KEY_INIT_FINI(cl0, struct cl_thread_info); + +static void *cl_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct cl_thread_info *info; + + info = cl0_key_init(ctx, key); + if (!IS_ERR(info)) + lu_ref_init(&info->clt_locks_locked); + return info; +} + +static void cl_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info; + + info = data; + lu_ref_fini(&info->clt_locks_locked); + cl0_key_fini(ctx, key, data); +} + +static void cl_key_exit(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct cl_thread_info *info = data; + + LASSERT(info->clt_nr_locks_locked == 0); + LASSERT(info->clt_nr_held == 0); + LASSERT(info->clt_nr_used == 0); + LASSERT(info->clt_nr_locks_acquired == 0); + + lu_ref_fini(&info->clt_locks_locked); + lu_ref_init(&info->clt_locks_locked); +} + +static struct lu_context_key cl_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = cl_key_init, + .lct_fini = cl_key_fini, + .lct_exit = cl_key_exit +}; + +static struct lu_kmem_descr cl_object_caches[] = { + { + .ckd_cache = &cl_env_kmem, + .ckd_name = "cl_env_kmem", + .ckd_size = sizeof (struct cl_env) + }, + { + .ckd_cache = NULL + } +}; + +/** + * Global initialization of cl-data. Create kmem caches, register + * lu_context_key's, etc. + * + * \see cl_global_fini() + */ +int cl_global_init(void) +{ + int result; + + result = lu_kmem_init(cl_object_caches); + if (result == 0) { + LU_CONTEXT_KEY_INIT(&cl_key); + result = lu_context_key_register(&cl_key); + if (result == 0) { + result = cl_lock_init(); + if (result == 0) + result = cl_page_init(); + } + } + return result; +} + +/** + * Finalization of global cl-data. Dual to cl_global_init(). 
+ */ +void cl_global_fini(void) +{ + cl_lock_fini(); + cl_page_fini(); + lu_context_key_degister(&cl_key); + lu_kmem_fini(cl_object_caches); +} diff --git a/lustre/obdclass/cl_page.c b/lustre/obdclass/cl_page.c new file mode 100644 index 0000000..feac1ff --- /dev/null +++ b/lustre/obdclass/cl_page.c @@ -0,0 +1,1519 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Client Lustre Page. + * + * Author: Nikita Danilov + */ + +#define DEBUG_SUBSYSTEM S_CLASS +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif + +#include +#include +#include +#include + +#include +#include "cl_internal.h" + +static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg, + int radix); + +static cfs_mem_cache_t *cl_page_kmem = NULL; + +static struct lu_kmem_descr cl_page_caches[] = { + { + .ckd_cache = &cl_page_kmem, + .ckd_name = "cl_page_kmem", + .ckd_size = sizeof (struct cl_page) + }, + { + .ckd_cache = NULL + } +}; + +#ifdef LIBCFS_DEBUG +# define PASSERT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LASSERT(0); \ + } \ + } while (0) +#else /* !LIBCFS_DEBUG */ +# define PASSERT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !LIBCFS_DEBUG */ + +#ifdef INVARIANT_CHECK +# define PINVRNT(env, page, expr) \ + do { \ + if (unlikely(!(expr))) { \ + CL_PAGE_DEBUG(D_ERROR, (env), (page), #expr "\n"); \ + LINVRNT(0); \ + } \ + } while (0) +#else /* !INVARIANT_CHECK */ +# define PINVRNT(env, page, exp) \ + ((void)sizeof(env), (void)sizeof(page), (void)sizeof !!(exp)) +#endif /* !INVARIANT_CHECK */ + +/** + * Internal version of cl_page_top, it should be called with page referenced, + * or coh_page_guard held. + */ +static struct cl_page *cl_page_top_trusted(struct cl_page *page) +{ + LASSERT(cl_is_page(page)); + while (page->cp_parent != NULL) + page = page->cp_parent; + return page; +} + +/** + * Internal version of cl_page_get(). + * + * This function can be used to obtain initial reference to previously + * unreferenced cached object. 
It can be called only if concurrent page
+ * reclamation is somehow prevented, e.g., by locking the page radix-tree
+ * (cl_object_header::coh_page_guard), or by keeping a lock on the VM page
+ * associated with \a page.
+ *
+ * Use with care! Not exported.
+ */
+static void cl_page_get_trust(struct cl_page *page)
+{
+        LASSERT(cl_is_page(page));
+        /*
+         * Checkless version for trusted users.
+         */
+        if (atomic_inc_return(&page->cp_ref) == 1)
+                atomic_inc(&cl_object_site(page->cp_obj)->cs_pages.cs_busy);
+}
+
+/**
+ * Returns a slice within a page, corresponding to the given layer in the
+ * device stack.
+ *
+ * \see cl_lock_at()
+ */
+static const struct cl_page_slice *
+cl_page_at_trusted(const struct cl_page *page,
+                   const struct lu_device_type *dtype)
+{
+        const struct cl_page_slice *slice;
+
+#ifdef INVARIANT_CHECK
+        struct cl_object_header *ch = cl_object_header(page->cp_obj);
+
+        if (!atomic_read(&page->cp_ref))
+                LASSERT_SPIN_LOCKED(&ch->coh_page_guard);
+#endif
+        ENTRY;
+
+        page = cl_page_top_trusted((struct cl_page *)page);
+        do {
+                list_for_each_entry(slice, &page->cp_layers, cpl_linkage) {
+                        if (slice->cpl_obj->co_lu.lo_dev->ld_type == dtype)
+                                RETURN(slice);
+                }
+                page = page->cp_child;
+        } while (page != NULL);
+        RETURN(NULL);
+}
+
+/**
+ * Returns a page with given index in the given object, or NULL if no page is
+ * found. Acquires a reference on \a page.
+ *
+ * Locking: called under cl_object_header::coh_page_guard spin-lock.
+ */
+struct cl_page *cl_page_lookup(struct cl_object_header *hdr, pgoff_t index)
+{
+        struct cl_page *page;
+
+        LASSERT_SPIN_LOCKED(&hdr->coh_page_guard);
+
+        page = radix_tree_lookup(&hdr->coh_tree, index);
+        if (page != NULL) {
+                LASSERT(cl_is_page(page));
+                cl_page_get_trust(page);
+        }
+        return page;
+}
+EXPORT_SYMBOL(cl_page_lookup);
+
+/**
+ * Returns the list of pages covering a given [start, end] range of @obj.
+ *
+ * The gang tree lookup (radix_tree_gang_lookup()) optimization is absolutely
+ * crucial in the face of [offset, EOF] locks.
+ */
+void cl_page_gang_lookup(const struct lu_env *env, struct cl_object *obj,
+                         struct cl_io *io, pgoff_t start, pgoff_t end,
+                         struct cl_page_list *queue)
+{
+        struct cl_object_header *hdr;
+        struct cl_page          *page;
+        struct cl_page         **pvec;
+        const struct cl_page_slice  *slice;
+        const struct lu_device_type *dtype;
+        pgoff_t      idx;
+        unsigned int nr;
+        unsigned int i;
+        unsigned int j;
+        ENTRY;
+
+        idx = start;
+        hdr = cl_object_header(obj);
+        pvec = cl_env_info(env)->clt_pvec;
+        dtype = cl_object_top(obj)->co_lu.lo_dev->ld_type;
+        spin_lock(&hdr->coh_page_guard);
+        while ((nr = radix_tree_gang_lookup(&hdr->coh_tree, (void **)pvec,
+                                            idx, CLT_PVEC_SIZE)) > 0) {
+                idx = pvec[nr - 1]->cp_index + 1;
+                for (i = 0, j = 0; i < nr; ++i) {
+                        page = pvec[i];
+                        PASSERT(env, page, cl_is_page(page));
+                        pvec[i] = NULL;
+                        if (page->cp_index > end)
+                                break;
+                        if (page->cp_state == CPS_FREEING)
+                                continue;
+                        if (page->cp_type == CPT_TRANSIENT) {
+                                /* God, we found a transient page! */
+                                continue;
+                        }
+
+                        slice = cl_page_at_trusted(page, dtype);
+                        /*
+                         * A page for an lsm-less file has no sub-page
+                         * underneath it for osc, in case of ...
+                         */
+                        PASSERT(env, page, slice != NULL);
+                        page = slice->cpl_page;
+                        /*
+                         * Can safely call cl_page_get_trust() under
+                         * radix-tree spin-lock.
+                         *
+                         * XXX not true, because @page belongs to an object
+                         * other than @hdr's and is protected by a different
+                         * tree lock.
+ */ + cl_page_get_trust(page); + lu_ref_add_atomic(&page->cp_reference, + "page_list", cfs_current()); + pvec[j++] = page; + } + + /* + * Here a delicate locking dance is performed. Current thread + * holds a reference to a page, but has to own it before it + * can be placed into queue. Owning implies waiting, so + * radix-tree lock is to be released. After a wait one has to + * check that pages weren't truncated (cl_page_own() returns + * error in the latter case). + */ + spin_unlock(&hdr->coh_page_guard); + for (i = 0; i < j; ++i) { + page = pvec[i]; + if (cl_page_own(env, io, page) == 0) + cl_page_list_add(queue, page); + lu_ref_del(&page->cp_reference, + "page_list", cfs_current()); + cl_page_put(env, page); + } + spin_lock(&hdr->coh_page_guard); + if (nr < CLT_PVEC_SIZE) + break; + } + spin_unlock(&hdr->coh_page_guard); + EXIT; +} +EXPORT_SYMBOL(cl_page_gang_lookup); + +static void cl_page_free(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object *obj = page->cp_obj; + struct cl_site *site = cl_object_site(obj); + + PASSERT(env, page, cl_is_page(page)); + PASSERT(env, page, list_empty(&page->cp_batch)); + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, page->cp_req == NULL); + PASSERT(env, page, page->cp_parent == NULL); + PASSERT(env, page, page->cp_state == CPS_FREEING); + + ENTRY; + might_sleep(); + while (!list_empty(&page->cp_layers)) { + struct cl_page_slice *slice; + + slice = list_entry(page->cp_layers.next, struct cl_page_slice, + cpl_linkage); + list_del_init(page->cp_layers.next); + slice->cpl_ops->cpo_fini(env, slice); + } + atomic_dec(&site->cs_pages.cs_total); + atomic_dec(&site->cs_pages_state[page->cp_state]); + lu_object_ref_del_at(&obj->co_lu, page->cp_obj_ref, "cl_page", page); + cl_object_put(env, obj); + lu_ref_fini(&page->cp_reference); + OBD_SLAB_FREE_PTR(page, cl_page_kmem); + EXIT; +} + +/** + * Helper function updating page state. This is the only place in the code + * where cl_page::cp_state field is mutated. + */ +static inline void cl_page_state_set_trust(struct cl_page *page, + enum cl_page_state state) +{ + /* bypass const. 
*/ + *(enum cl_page_state *)&page->cp_state = state; +} + +static int cl_page_alloc(const struct lu_env *env, struct cl_object *o, + pgoff_t ind, struct page *vmpage, + enum cl_page_type type, struct cl_page **out) +{ + struct cl_page *page; + struct cl_page *err = NULL; + struct lu_object_header *head; + struct cl_site *site = cl_object_site(o); + int result; + + ENTRY; + result = +1; + OBD_SLAB_ALLOC_PTR(page, cl_page_kmem); + if (page != NULL) { + atomic_set(&page->cp_ref, 1); + page->cp_obj = o; + cl_object_get(o); + page->cp_obj_ref = lu_object_ref_add(&o->co_lu, + "cl_page", page); + page->cp_index = ind; + cl_page_state_set_trust(page, CPS_CACHED); + page->cp_type = type; + CFS_INIT_LIST_HEAD(&page->cp_layers); + CFS_INIT_LIST_HEAD(&page->cp_batch); + CFS_INIT_LIST_HEAD(&page->cp_flight); + mutex_init(&page->cp_mutex); + lu_ref_init(&page->cp_reference); + head = o->co_lu.lo_header; + list_for_each_entry(o, &head->loh_layers, co_lu.lo_linkage) { + if (o->co_ops->coo_page_init != NULL) { + err = o->co_ops->coo_page_init(env, o, + page, vmpage); + if (err != NULL) { + cl_page_state_set_trust(page, + CPS_FREEING); + cl_page_free(env, page); + page = err; + break; + } + } + } + if (err == NULL) { + atomic_inc(&site->cs_pages.cs_busy); + atomic_inc(&site->cs_pages.cs_total); + atomic_inc(&site->cs_pages_state[CPS_CACHED]); + atomic_inc(&site->cs_pages.cs_created); + result = 0; + } + } else + page = ERR_PTR(-ENOMEM); + *out = page; + RETURN(result); +} + +/** + * Returns a cl_page with index \a idx at the object \a o, and associated with + * the VM page \a vmpage. + * + * This is the main entry point into the cl_page caching interface. First, a + * cache (implemented as a per-object radix tree) is consulted. If page is + * found there, it is returned immediately. Otherwise new page is allocated + * and returned. In any case, additional reference to page is acquired. + * + * \see cl_object_find(), cl_lock_find() + */ +struct cl_page *cl_page_find(const struct lu_env *env, struct cl_object *o, + pgoff_t idx, struct page *vmpage, + enum cl_page_type type) +{ + struct cl_page *page; + struct cl_page *ghost = NULL; + struct cl_object_header *hdr; + struct cl_site *site = cl_object_site(o); + int err; + + LINVRNT(type == CPT_CACHEABLE || type == CPT_TRANSIENT); + might_sleep(); + + ENTRY; + + hdr = cl_object_header(o); + atomic_inc(&site->cs_pages.cs_lookup); + + CDEBUG(D_PAGE, "%lu@"DFID" %p %lu %i\n", + idx, PFID(&hdr->coh_lu.loh_fid), vmpage, vmpage->private, type); + /* fast path. */ + if (type == CPT_CACHEABLE) { + /* + * cl_vmpage_page() can be called here without any locks as + * + * - "vmpage" is locked (which prevents ->private from + * concurrent updates), and + * + * - "o" cannot be destroyed while current thread holds a + * reference on it. + */ + page = cl_vmpage_page(vmpage, o); + PINVRNT(env, page, + ergo(page != NULL, + cl_page_vmpage(env, page) == vmpage && + (void *)radix_tree_lookup(&hdr->coh_tree, + idx) == page)); + } else { + spin_lock(&hdr->coh_page_guard); + page = cl_page_lookup(hdr, idx); + spin_unlock(&hdr->coh_page_guard); + } + if (page != NULL) { + atomic_inc(&site->cs_pages.cs_hit); + RETURN(page); + } + + /* allocate and initialize cl_page */ + err = cl_page_alloc(env, o, idx, vmpage, type, &page); + if (err != 0) + RETURN(page); + /* + * XXX optimization: use radix_tree_preload() here, and change tree + * gfp mask to GFP_KERNEL in cl_object_header_init(). 
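+         *
+         * A sketch of that optimization (untested; assumes the stock kernel
+         * radix_tree_preload()/radix_tree_preload_end() API):
+         *
+         *      if (radix_tree_preload(GFP_KERNEL) == 0) {
+         *              spin_lock(&hdr->coh_page_guard);
+         *              err = radix_tree_insert(&hdr->coh_tree, idx, page);
+         *              spin_unlock(&hdr->coh_page_guard);
+         *              radix_tree_preload_end();
+         *      }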
+ */ + spin_lock(&hdr->coh_page_guard); + err = radix_tree_insert(&hdr->coh_tree, idx, page); + if (err != 0) { + ghost = page; + /* + * Noted by Jay: a lock on \a vmpage protects cl_page_find() + * from this race, but + * + * 0. it's better to have cl_page interface "locally + * consistent" so that its correctness can be reasoned + * about without appealing to the (obscure world of) VM + * locking. + * + * 1. handling this race allows ->coh_tree to remain + * consistent even when VM locking is somehow busted, + * which is very useful during diagnosing and debugging. + */ + if (err == -EEXIST) { + /* + * XXX in case of a lookup for CPT_TRANSIENT page, + * nothing protects a CPT_CACHEABLE page from being + * concurrently moved into CPS_FREEING state. + */ + page = cl_page_lookup(hdr, idx); + PASSERT(env, page, page != NULL); + if (page->cp_type == CPT_TRANSIENT && + type == CPT_CACHEABLE) { + /* XXX: We should make sure that inode sem + * keeps being held in the lifetime of + * transient pages, so it is impossible to + * have conflicting transient pages. + */ + spin_unlock(&hdr->coh_page_guard); + cl_page_put(env, page); + spin_lock(&hdr->coh_page_guard); + page = ERR_PTR(-EBUSY); + } + } else + page = ERR_PTR(err); + } else + hdr->coh_pages++; + spin_unlock(&hdr->coh_page_guard); + + if (unlikely(ghost != NULL)) { + atomic_dec(&site->cs_pages.cs_busy); + cl_page_delete0(env, ghost, 0); + cl_page_free(env, ghost); + } + RETURN(page); +} +EXPORT_SYMBOL(cl_page_find); + +static inline int cl_page_invariant(const struct cl_page *pg) +{ + struct cl_object_header *header; + struct cl_page *parent; + struct cl_page *child; + struct cl_io *owner; + + LASSERT(cl_is_page(pg)); + /* + * Page invariant is protected by a VM lock. + */ + LINVRNT(cl_page_is_vmlocked(NULL, pg)); + + header = cl_object_header(pg->cp_obj); + parent = pg->cp_parent; + child = pg->cp_child; + owner = pg->cp_owner; + + return atomic_read(&pg->cp_ref) > 0 && + ergo(parent != NULL, parent->cp_child == pg) && + ergo(child != NULL, child->cp_parent == pg) && + ergo(child != NULL, pg->cp_obj != child->cp_obj) && + ergo(parent != NULL, pg->cp_obj != parent->cp_obj) && + ergo(owner != NULL && parent != NULL, + parent->cp_owner == pg->cp_owner->ci_parent) && + ergo(owner != NULL && child != NULL, + child->cp_owner->ci_parent == owner) && + /* + * Either page is early in initialization (has neither child + * nor parent yet), or it is in the object radix tree. + */ + ergo(pg->cp_state < CPS_FREEING, + (void *)radix_tree_lookup(&header->coh_tree, + pg->cp_index) == pg || + (child == NULL && parent == NULL)); +} + +static void cl_page_state_set0(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + enum cl_page_state old; + struct cl_site *site = cl_object_site(page->cp_obj); + + /* + * Matrix of allowed state transitions [old][new], for sanity + * checking. 
+ */ + static const int allowed_transitions[CPS_NR][CPS_NR] = { + [CPS_CACHED] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 1, /* io finds existing cached page */ + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 1, /* write-out from the cache */ + [CPS_FREEING] = 1, /* eviction on the memory pressure */ + }, + [CPS_OWNED] = { + [CPS_CACHED] = 1, /* release to the cache */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 1, /* start read immediately */ + [CPS_PAGEOUT] = 1, /* start write immediately */ + [CPS_FREEING] = 1, /* lock invalidation or truncate */ + }, + [CPS_PAGEIN] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_PAGEOUT] = { + [CPS_CACHED] = 1, /* io completion */ + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + }, + [CPS_FREEING] = { + [CPS_CACHED] = 0, + [CPS_OWNED] = 0, + [CPS_PAGEIN] = 0, + [CPS_PAGEOUT] = 0, + [CPS_FREEING] = 0, + } + }; + + ENTRY; + old = page->cp_state; + PASSERT(env, page, allowed_transitions[old][state]); + CL_PAGE_HEADER(D_TRACE, env, page, "%i -> %i\n", old, state); + for (; page != NULL; page = page->cp_child) { + PASSERT(env, page, page->cp_state == old); + PASSERT(env, page, + equi(state == CPS_OWNED, page->cp_owner != NULL)); + + atomic_dec(&site->cs_pages_state[page->cp_state]); + atomic_inc(&site->cs_pages_state[state]); + cl_page_state_set_trust(page, state); + } + EXIT; +} + +static void cl_page_state_set(const struct lu_env *env, + struct cl_page *page, enum cl_page_state state) +{ + PINVRNT(env, page, cl_page_invariant(page)); + cl_page_state_set0(env, page, state); +} + +/** + * Acquires an additional reference to a page. + * + * This can be called only by caller already possessing a reference to \a + * page. + * + * \see cl_object_get(), cl_lock_get(). + */ +void cl_page_get(struct cl_page *page) +{ + ENTRY; + LASSERT(page->cp_state != CPS_FREEING); + cl_page_get_trust(page); + EXIT; +} +EXPORT_SYMBOL(cl_page_get); + +/** + * Releases a reference to a page. + * + * When last reference is released, page is returned to the cache, unless it + * is in cl_page_state::CPS_FREEING state, in which case it is immediately + * destroyed. + * + * \see cl_object_put(), cl_lock_put(). + */ +void cl_page_put(const struct lu_env *env, struct cl_page *page) +{ + struct cl_object_header *hdr; + struct cl_site *site = cl_object_site(page->cp_obj); + + PASSERT(env, page, atomic_read(&page->cp_ref) > !!page->cp_parent); + + ENTRY; + CL_PAGE_HEADER(D_TRACE, env, page, "%i\n", atomic_read(&page->cp_ref)); + hdr = cl_object_header(page->cp_obj); + if (atomic_dec_and_test(&page->cp_ref)) { + atomic_dec(&site->cs_pages.cs_busy); + if (page->cp_state == CPS_FREEING) { + PASSERT(env, page, page->cp_owner == NULL); + PASSERT(env, page, list_empty(&page->cp_batch)); + /* + * Page is no longer reachable by other threads. Tear + * it down. + */ + cl_page_free(env, page); + } + } + EXIT; +} +EXPORT_SYMBOL(cl_page_put); + +/** + * Returns a VM page associated with a given cl_page. + */ +cfs_page_t *cl_page_vmpage(const struct lu_env *env, struct cl_page *page) +{ + const struct cl_page_slice *slice; + + /* + * Find uppermost layer with ->cpo_vmpage() method, and return its + * result. 
+ */ + page = cl_page_top(page); + do { + list_for_each_entry(slice, &page->cp_layers, cpl_linkage) { + if (slice->cpl_ops->cpo_vmpage != NULL) + RETURN(slice->cpl_ops->cpo_vmpage(env, slice)); + } + page = page->cp_child; + } while (page != NULL); + LBUG(); /* ->cpo_vmpage() has to be defined somewhere in the stack */ +} +EXPORT_SYMBOL(cl_page_vmpage); + +/** + * Returns a cl_page associated with a VM page, and given cl_object. + */ +struct cl_page *cl_vmpage_page(cfs_page_t *vmpage, struct cl_object *obj) +{ + struct cl_page *page; + + ENTRY; + KLASSERT(PageLocked(vmpage)); + + /* + * NOTE: absence of races and liveness of data are guaranteed by page + * lock on a "vmpage". That works because object destruction has + * bottom-to-top pass. + */ + + /* + * This loop assumes that ->private points to the top-most page. This + * can be rectified easily. + */ + for (page = (void *)vmpage->private; + page != NULL; page = page->cp_child) { + if (cl_object_same(page->cp_obj, obj)) { + cl_page_get_trust(page); + break; + } + } + LASSERT(ergo(page, cl_is_page(page) && page->cp_type == CPT_CACHEABLE)); + RETURN(page); +} +EXPORT_SYMBOL(cl_vmpage_page); + +/** + * Returns the top-page for a given page. + * + * \see cl_object_top(), cl_io_top() + */ +struct cl_page *cl_page_top(struct cl_page *page) +{ + return cl_page_top_trusted(page); +} +EXPORT_SYMBOL(cl_page_top); + +/** + * Returns true if \a addr is an address of an allocated cl_page. Used in + * assertions. This check is optimistically imprecise, i.e., it occasionally + * returns true for the incorrect addresses, but if it returns false, then the + * address is guaranteed to be incorrect. (Should be named cl_pagep().) + * + * \see cl_is_lock() + */ +int cl_is_page(const void *addr) +{ + return cfs_mem_is_in_cache(addr, cl_page_kmem); +} +EXPORT_SYMBOL(cl_is_page); + +const struct cl_page_slice *cl_page_at(const struct cl_page *page, + const struct lu_device_type *dtype) +{ + return cl_page_at_trusted(page, dtype); +} +EXPORT_SYMBOL(cl_page_at); + +#define CL_PAGE_OP(opname) offsetof(struct cl_page_operations, opname) + +#define CL_PAGE_INVOKE(_env, _page, _op, _proto, ...) \ +({ \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + int __result; \ + ptrdiff_t __op = (_op); \ + int (*__method)_proto; \ + \ + __result = 0; \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) { \ + __result = (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + if (__result != 0) \ + break; \ + } \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL && __result == 0); \ + if (__result > 0) \ + __result = 0; \ + __result; \ +}) + +#define CL_PAGE_INVOID(_env, _page, _op, _proto, ...) \ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + __page = cl_page_top(__page); \ + do { \ + list_for_each_entry(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_child; \ + } while (__page != NULL); \ +} while (0) + +#define CL_PAGE_INVOID_REVERSE(_env, _page, _op, _proto, ...) 
\ +do { \ + const struct lu_env *__env = (_env); \ + struct cl_page *__page = (_page); \ + const struct cl_page_slice *__scan; \ + ptrdiff_t __op = (_op); \ + void (*__method)_proto; \ + \ + /* get to the bottom page. */ \ + while (__page->cp_child != NULL) \ + __page = __page->cp_child; \ + do { \ + list_for_each_entry_reverse(__scan, &__page->cp_layers, \ + cpl_linkage) { \ + __method = *(void **)((char *)__scan->cpl_ops + \ + __op); \ + if (__method != NULL) \ + (*__method)(__env, __scan, \ + ## __VA_ARGS__); \ + } \ + __page = __page->cp_parent; \ + } while (__page != NULL); \ +} while (0) + +static int cl_page_invoke(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + RETURN(CL_PAGE_INVOKE(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io)); +} + +static void cl_page_invoid(const struct lu_env *env, + struct cl_io *io, struct cl_page *page, ptrdiff_t op) + +{ + PINVRNT(env, page, cl_object_same(page->cp_obj, io->ci_obj)); + ENTRY; + CL_PAGE_INVOID(env, page, op, + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), io); + EXIT; +} + +static void cl_page_owner_clear(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + if (page->cp_owner != NULL) { + LASSERT(page->cp_owner->ci_owned_nr > 0); + page->cp_owner->ci_owned_nr--; + page->cp_owner = NULL; + } + } + EXIT; +} + +static void cl_page_owner_set(struct cl_page *page) +{ + ENTRY; + for (page = cl_page_top(page); page != NULL; page = page->cp_child) { + LASSERT(page->cp_owner != NULL); + page->cp_owner->ci_owned_nr++; + } + EXIT; +} + +void cl_page_disown0(const struct lu_env *env, + struct cl_io *io, struct cl_page *pg) +{ + enum cl_page_state state; + + ENTRY; + state = pg->cp_state; + PINVRNT(env, pg, state == CPS_OWNED || state == CPS_FREEING); + PINVRNT(env, pg, cl_page_invariant(pg)); + cl_page_owner_clear(pg); + + if (state == CPS_OWNED) + cl_page_state_set(env, pg, CPS_CACHED); + /* + * Completion call-backs are executed in the bottom-up order, so that + * uppermost layer (llite), responsible for VFS/VM interaction runs + * last and can release locks safely. + */ + CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_disown), + (const struct lu_env *, + const struct cl_page_slice *, struct cl_io *), + io); + EXIT; +} + +/** + * returns true, iff page is owned by the given io. + */ +int cl_page_is_owned(const struct cl_page *pg, const struct cl_io *io) +{ + LINVRNT(cl_object_same(pg->cp_obj, io->ci_obj)); + ENTRY; + RETURN(pg->cp_state == CPS_OWNED && pg->cp_owner == io); +} +EXPORT_SYMBOL(cl_page_is_owned); + +/** + * Owns a page by IO. + * + * Waits until page is in cl_page_state::CPS_CACHED state, and then switch it + * into cl_page_state::CPS_OWNED state. + * + * \pre !cl_page_is_owned(pg, io) + * \post result == 0 iff cl_page_is_owned(pg, io) + * + * \retval 0 success + * + * \retval -ve failure, e.g., page was destroyed (and landed in + * cl_page_state::CPS_FREEING instead of cl_page_state::CPS_CACHED). 
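+ *
+ * A minimal own/disown sketch (illustrative; \a env, \a io and \a pg are
+ * assumed to be set up by the caller):
+ *
+ * \code
+ * if (cl_page_own(env, io, pg) == 0) {
+ *         ... the page is exclusively owned by io ...
+ *         cl_page_disown(env, io, pg);
+ * }
+ * \endcode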
+ *
+ * \see cl_page_disown()
+ * \see cl_page_operations::cpo_own()
+ */
+int cl_page_own(const struct lu_env *env, struct cl_io *io, struct cl_page *pg)
+{
+        int result;
+
+        PINVRNT(env, pg, !cl_page_is_owned(pg, io));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+
+        cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_own));
+        PASSERT(env, pg, pg->cp_owner == NULL);
+        PASSERT(env, pg, pg->cp_req == NULL);
+        pg->cp_owner = io;
+        cl_page_owner_set(pg);
+        if (pg->cp_state != CPS_FREEING) {
+                cl_page_state_set(env, pg, CPS_OWNED);
+                result = 0;
+        } else {
+                cl_page_disown0(env, io, pg);
+                result = -EAGAIN;
+        }
+        PINVRNT(env, pg, ergo(result == 0, cl_page_invariant(pg)));
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_own);
+
+/**
+ * Assume page ownership.
+ *
+ * Called when page is already locked by the hosting VM.
+ *
+ * \pre !cl_page_is_owned(pg, io)
+ * \post cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_assume()
+ */
+void cl_page_assume(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+        PASSERT(env, pg, pg->cp_state < CPS_OWNED);
+        PASSERT(env, pg, pg->cp_owner == NULL);
+        PINVRNT(env, pg, cl_object_same(pg->cp_obj, io->ci_obj));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+
+        cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_assume));
+        pg->cp_owner = io;
+        cl_page_owner_set(pg);
+        cl_page_state_set(env, pg, CPS_OWNED);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_assume);
+
+/**
+ * Releases page ownership without unlocking the page.
+ *
+ * Moves page into cl_page_state::CPS_CACHED without releasing a lock on the
+ * underlying VM page (as VM is supposed to do this itself).
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_assume()
+ */
+void cl_page_unassume(const struct lu_env *env,
+                      struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+        cl_page_owner_clear(pg);
+        cl_page_state_set(env, pg, CPS_CACHED);
+        CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(cpo_unassume),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *, struct cl_io *),
+                               io);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_unassume);
+
+/**
+ * Releases page ownership.
+ *
+ * Moves page into cl_page_state::CPS_CACHED.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post !cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_own()
+ * \see cl_page_operations::cpo_disown()
+ */
+void cl_page_disown(const struct lu_env *env,
+                    struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+
+        ENTRY;
+        pg = cl_page_top(pg);
+        io = cl_io_top(io);
+        cl_page_disown0(env, io, pg);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_disown);
+
+/**
+ * Called when page is to be removed from the object, e.g., as a result of
+ * truncate.
+ *
+ * Calls cl_page_operations::cpo_discard() top-to-bottom.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ *
+ * \see cl_page_operations::cpo_discard()
+ */
+void cl_page_discard(const struct lu_env *env,
+                     struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        cl_page_invoid(env, io, pg, CL_PAGE_OP(cpo_discard));
+}
+EXPORT_SYMBOL(cl_page_discard);
+
+/**
+ * Version of cl_page_delete() that can be called for not-fully-constructed
+ * pages, e.g., in an error-handling cl_page_find()->cl_page_delete0()
+ * path. Doesn't check the page invariant.
+ */
+static void cl_page_delete0(const struct lu_env *env, struct cl_page *pg,
+                            int radix)
+{
+        PASSERT(env, pg, pg == cl_page_top(pg));
+        PASSERT(env, pg, pg->cp_state != CPS_FREEING);
+
+        ENTRY;
+        /*
+         * Sever all ways to obtain new pointers to @pg.
+         */
+        cl_page_owner_clear(pg);
+        cl_page_state_set0(env, pg, CPS_FREEING);
+        CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_delete),
+                       (const struct lu_env *, const struct cl_page_slice *));
+        if (!radix)
+                /*
+                 * !radix means that @pg is not yet in the radix tree, skip
+                 * removing it.
+                 */
+                pg = pg->cp_child;
+        for (; pg != NULL; pg = pg->cp_child) {
+                void                    *value;
+                struct cl_object_header *hdr;
+
+                hdr = cl_object_header(pg->cp_obj);
+                spin_lock(&hdr->coh_page_guard);
+                value = radix_tree_delete(&hdr->coh_tree, pg->cp_index);
+                PASSERT(env, pg, value == pg);
+                PASSERT(env, pg, hdr->coh_pages > 0);
+                hdr->coh_pages--;
+                spin_unlock(&hdr->coh_page_guard);
+        }
+        EXIT;
+}
+
+/**
+ * Called when a decision is made to throw page out of memory.
+ *
+ * Notifies all layers about page destruction by calling
+ * cl_page_operations::cpo_delete() method top-to-bottom.
+ *
+ * Moves page into cl_page_state::CPS_FREEING state (this is the only place
+ * where transition to this state happens).
+ *
+ * Eliminates all venues through which new references to the page can be
+ * obtained:
+ *
+ *     - removes page from the radix trees,
+ *
+ *     - breaks linkage from VM page to cl_page.
+ *
+ * Once page reaches cl_page_state::CPS_FREEING, all remaining references will
+ * drain after some time, at which point page will be recycled.
+ *
+ * \pre pg == cl_page_top(pg)
+ * \pre VM page is locked
+ * \post pg->cp_state == CPS_FREEING
+ *
+ * \see cl_page_operations::cpo_delete()
+ */
+void cl_page_delete(const struct lu_env *env, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        ENTRY;
+        cl_page_delete0(env, pg, 1);
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_delete);
+
+/**
+ * Unmaps page from user virtual memory.
+ *
+ * Calls cl_page_operations::cpo_unmap() through all layers top-to-bottom. The
+ * layer responsible for VM interaction has to unmap page from user space
+ * virtual memory.
+ *
+ * \see cl_page_operations::cpo_unmap()
+ */
+int cl_page_unmap(const struct lu_env *env,
+                  struct cl_io *io, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        return cl_page_invoke(env, io, pg, CL_PAGE_OP(cpo_unmap));
+}
+EXPORT_SYMBOL(cl_page_unmap);
+
+/**
+ * Marks page up-to-date.
+ *
+ * Calls cl_page_operations::cpo_export() through all layers top-to-bottom.
+ * The layer responsible for VM interaction has to mark the page as
+ * up-to-date. From this moment on, the page can be shown to the user space
+ * without Lustre being notified, hence the name.
+ *
+ * \see cl_page_operations::cpo_export()
+ */
+void cl_page_export(const struct lu_env *env, struct cl_page *pg)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_export),
+                       (const struct lu_env *, const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_export);
+
+/**
+ * Returns true, iff \a pg is VM locked in a suitable sense by the calling
+ * thread.
+ */
+int cl_page_is_vmlocked(const struct lu_env *env, const struct cl_page *pg)
+{
+        int result;
+        const struct cl_page_slice *slice;
+
+        ENTRY;
+        pg = cl_page_top_trusted((struct cl_page *)pg);
+        slice = container_of(pg->cp_layers.next,
+                             const struct cl_page_slice, cpl_linkage);
+        PASSERT(env, pg, slice->cpl_ops->cpo_is_vmlocked != NULL);
+        /*
+         * Call ->cpo_is_vmlocked() directly instead of going through
+         * CL_PAGE_INVOKE(), because cl_page_is_vmlocked() is used by
+         * cl_page_invariant().
+         */
+        result = slice->cpl_ops->cpo_is_vmlocked(env, slice);
+        PASSERT(env, pg, result == -EBUSY || result == -ENODATA);
+        RETURN(result == -EBUSY);
+}
+EXPORT_SYMBOL(cl_page_is_vmlocked);
+
+static enum cl_page_state cl_req_type_state(enum cl_req_type crt)
+{
+        ENTRY;
+        RETURN(crt == CRT_WRITE ? CPS_PAGEOUT : CPS_PAGEIN);
+}
+
+static void cl_page_io_start(const struct lu_env *env,
+                             struct cl_page *pg, enum cl_req_type crt)
+{
+        /*
+         * Page is queued for IO, change its state.
+         */
+        ENTRY;
+        cl_page_owner_clear(pg);
+        cl_page_state_set(env, pg, cl_req_type_state(crt));
+        EXIT;
+}
+
+/**
+ * Prepares page for immediate transfer. cl_page_operations::cpo_prep() is
+ * called top-to-bottom. Every layer either agrees to submit this page (by
+ * returning 0), or requests to omit this page (by returning -EALREADY). The
+ * layer handling interactions with the VM also has to inform the VM that the
+ * page is under transfer now.
+ */
+int cl_page_prep(const struct lu_env *env, struct cl_io *io,
+                 struct cl_page *pg, enum cl_req_type crt)
+{
+        int result;
+
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+        PINVRNT(env, pg, crt < CRT_NR);
+
+        /*
+         * XXX this has to be called bottom-to-top, so that llite can set up
+         * PG_writeback without risking other layers deciding to skip this
+         * page.
+         */
+        result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_prep));
+        if (result == 0)
+                cl_page_io_start(env, pg, crt);
+
+        KLASSERT(ergo(crt == CRT_WRITE && pg->cp_type == CPT_CACHEABLE,
+                      equi(result == 0,
+                           PageWriteback(cl_page_vmpage(env, pg)))));
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+        return result;
+}
+EXPORT_SYMBOL(cl_page_prep);
+
+/**
+ * Notify layers about transfer completion.
+ *
+ * Invoked by the transfer sub-system (which is a part of osc) to notify
+ * layers that a transfer, of which this page is a part, has completed.
+ *
+ * Completion call-backs are executed in the bottom-up order, so that the
+ * uppermost layer (llite), responsible for the VFS/VM interaction, runs last
+ * and can release locks safely.
+ *
+ * \pre pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ * \post pg->cp_state == CPS_CACHED
+ *
+ * \see cl_page_operations::cpo_completion()
+ */
+void cl_page_completion(const struct lu_env *env,
+                        struct cl_page *pg, enum cl_req_type crt, int ioret)
+{
+        PASSERT(env, pg, crt < CRT_NR);
+        /* cl_page::cp_req already cleared by the caller (osc_completion()) */
+        PASSERT(env, pg, pg->cp_req == NULL);
+        PASSERT(env, pg, pg->cp_state == cl_req_type_state(crt));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, ioret);
+        if (crt == CRT_READ) {
+                PASSERT(env, pg, !(pg->cp_flags & CPF_READ_COMPLETED));
+                pg->cp_flags |= CPF_READ_COMPLETED;
+        }
+
+        cl_page_state_set(env, pg, CPS_CACHED);
+        CL_PAGE_INVOID_REVERSE(env, pg, CL_PAGE_OP(io[crt].cpo_completion),
+                               (const struct lu_env *,
+                                const struct cl_page_slice *, int), ioret);
+
+        KLASSERT(!PageWriteback(cl_page_vmpage(env, pg)));
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_completion);
+
+/**
+ * Notify layers that the transfer formation engine decided to yank this page
+ * from the cache and to make it a part of a transfer.
+ *
+ * \pre pg->cp_state == CPS_CACHED
+ * \post pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT
+ *
+ * \see cl_page_operations::cpo_make_ready()
+ */
+int cl_page_make_ready(const struct lu_env *env, struct cl_page *pg,
+                       enum cl_req_type crt)
+{
+        int result;
+
+        PINVRNT(env, pg, crt < CRT_NR);
+
+        ENTRY;
+        result = CL_PAGE_INVOKE(env, pg, CL_PAGE_OP(io[crt].cpo_make_ready),
+                                (const struct lu_env *,
+                                 const struct cl_page_slice *));
+        if (result == 0) {
+                PASSERT(env, pg, pg->cp_state == CPS_CACHED);
+                cl_page_io_start(env, pg, crt);
+        }
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_make_ready);
+
+/**
+ * Notify layers that high level io decided to place this page into a cache
+ * for future transfer.
+ *
+ * The layer implementing the transfer engine (osc) has to register this page
+ * in its queues.
+ *
+ * \pre cl_page_is_owned(pg, io)
+ * \post ergo(result == 0,
+ *            pg->cp_state == CPS_PAGEIN || pg->cp_state == CPS_PAGEOUT)
+ *
+ * \see cl_page_operations::cpo_cache_add()
+ */
+int cl_page_cache_add(const struct lu_env *env, struct cl_io *io,
+                      struct cl_page *pg, enum cl_req_type crt)
+{
+        int result;
+
+        PINVRNT(env, pg, crt < CRT_NR);
+        PINVRNT(env, pg, cl_page_is_owned(pg, io));
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        ENTRY;
+        result = cl_page_invoke(env, io, pg, CL_PAGE_OP(io[crt].cpo_cache_add));
+        if (result == 0) {
+                cl_page_owner_clear(pg);
+                cl_page_state_set(env, pg, CPS_CACHED);
+        }
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", crt, result);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_page_cache_add);
+
+/**
+ * Checks whether the page is protected by any extent lock in at least the
+ * required mode.
+ *
+ * \return the same as in cl_page_operations::cpo_is_under_lock() method.
+ * \see cl_page_operations::cpo_is_under_lock()
+ */
+int cl_page_is_under_lock(const struct lu_env *env, struct cl_io *io,
+                          struct cl_page *page)
+{
+        int rc;
+
+        PINVRNT(env, page, cl_page_invariant(page));
+
+        ENTRY;
+        rc = CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_is_under_lock),
+                            (const struct lu_env *,
+                             const struct cl_page_slice *, struct cl_io *),
+                            io);
+        PASSERT(env, page, rc != 0);
+        RETURN(rc);
+}
+EXPORT_SYMBOL(cl_page_is_under_lock);
+
+/**
+ * Purges all cached pages belonging to the object \a obj.
+ */
+int cl_pages_prune(const struct lu_env *env, struct cl_object *clobj)
+{
+        struct cl_thread_info *info;
+        struct cl_object      *obj = cl_object_top(clobj);
+        struct cl_io          *io;
+        struct cl_page_list   *plist;
+        int result;
+
+        ENTRY;
+        info  = cl_env_info(env);
+        plist = &info->clt_list;
+        io    = &info->clt_io;
+
+        /*
+         * Initialize the io. This is ugly since we never do IO in this
+         * function, we just make cl_page_list functions happy. -jay
+         */
+        io->ci_obj = obj;
+        result = cl_io_init(env, io, CIT_MISC, obj);
+        if (result != 0) {
+                cl_io_fini(env, io);
+                RETURN(io->ci_result);
+        }
+
+        cl_page_list_init(plist);
+        cl_page_gang_lookup(env, obj, io, 0, CL_PAGE_EOF, plist);
+        /*
+         * Since we're purging the pages of an object, we don't care about
+         * the possible outcomes of the following functions.
+         */
+        cl_page_list_unmap(env, io, plist);
+        cl_page_list_discard(env, io, plist);
+        cl_page_list_disown(env, io, plist);
+        cl_page_list_fini(env, plist);
+
+        cl_io_fini(env, io);
+        RETURN(result);
+}
+EXPORT_SYMBOL(cl_pages_prune);
+
+/**
+ * Tells transfer engine that only part of a page is to be transmitted.
+ *
+ * \see cl_page_operations::cpo_clip()
+ */
+void cl_page_clip(const struct lu_env *env, struct cl_page *pg,
+                  int from, int to)
+{
+        PINVRNT(env, pg, cl_page_invariant(pg));
+
+        CL_PAGE_HEADER(D_TRACE, env, pg, "%i %i\n", from, to);
+        CL_PAGE_INVOID(env, pg, CL_PAGE_OP(cpo_clip),
+                       (const struct lu_env *,
+                        const struct cl_page_slice *, int, int),
+                       from, to);
+}
+EXPORT_SYMBOL(cl_page_clip);
+
+/**
+ * Prints a human-readable representation of \a pg through \a printer.
+ */
+void cl_page_header_print(const struct lu_env *env, void *cookie,
+                          lu_printer_t printer, const struct cl_page *pg)
+{
+        (*printer)(env, cookie,
+                   "page@%p[%d %p:%lu ^%p_%p %d %d %d %p %p %#x]\n",
+                   pg, atomic_read(&pg->cp_ref), pg->cp_obj,
+                   pg->cp_index, pg->cp_parent, pg->cp_child,
+                   pg->cp_state, pg->cp_error, pg->cp_type,
+                   pg->cp_owner, pg->cp_req, pg->cp_flags);
+}
+EXPORT_SYMBOL(cl_page_header_print);
+
+/**
+ * Prints a human-readable representation of \a pg and all its layers through
+ * \a printer.
+ */
+void cl_page_print(const struct lu_env *env, void *cookie,
+                   lu_printer_t printer, const struct cl_page *pg)
+{
+        struct cl_page *scan;
+
+        for (scan = cl_page_top((struct cl_page *)pg);
+             scan != NULL; scan = scan->cp_child)
+                cl_page_header_print(env, cookie, printer, scan);
+        CL_PAGE_INVOKE(env, (struct cl_page *)pg, CL_PAGE_OP(cpo_print),
+                       (const struct lu_env *env,
+                        const struct cl_page_slice *slice,
+                        void *cookie, lu_printer_t p), cookie, printer);
+        (*printer)(env, cookie, "end page@%p\n", pg);
+}
+EXPORT_SYMBOL(cl_page_print);
+
+/**
+ * Cancel a page which is still in a transfer.
+ */
+int cl_page_cancel(const struct lu_env *env, struct cl_page *page)
+{
+        return CL_PAGE_INVOKE(env, page, CL_PAGE_OP(cpo_cancel),
+                              (const struct lu_env *,
+                               const struct cl_page_slice *));
+}
+EXPORT_SYMBOL(cl_page_cancel);
+
+/**
+ * Converts a page index into a byte offset within object \a obj.
+ */
+loff_t cl_offset(const struct cl_object *obj, pgoff_t idx)
+{
+        /*
+         * XXX for now.
+         */
+        return (loff_t)idx << CFS_PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_offset);
+
+/**
+ * Converts a byte offset within object \a obj into a page index.
+ */
+pgoff_t cl_index(const struct cl_object *obj, loff_t offset)
+{
+        /*
+         * XXX for now.
+         */
+        return offset >> CFS_PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_index);
+
+int cl_page_size(const struct cl_object *obj)
+{
+        return 1 << CFS_PAGE_SHIFT;
+}
+EXPORT_SYMBOL(cl_page_size);
+
+/**
+ * Adds page slice to the compound page.
+ *
+ * This is called by cl_object_operations::coo_page_init() methods to add a
+ * per-layer state to the page. New state is added at the end of the
+ * cl_page::cp_layers list, that is, it is at the bottom of the stack.
+ *
+ * \see cl_lock_slice_add(), cl_req_slice_add(), cl_io_slice_add()
+ */
+void cl_page_slice_add(struct cl_page *page, struct cl_page_slice *slice,
+                       struct cl_object *obj,
+                       const struct cl_page_operations *ops)
+{
+        ENTRY;
+        list_add_tail(&slice->cpl_linkage, &page->cp_layers);
+        slice->cpl_obj  = obj;
+        slice->cpl_ops  = ops;
+        slice->cpl_page = page;
+        EXIT;
+}
+EXPORT_SYMBOL(cl_page_slice_add);
+
+int cl_page_init(void)
+{
+        return lu_kmem_init(cl_page_caches);
+}
+
+void cl_page_fini(void)
+{
+        lu_kmem_fini(cl_page_caches);
+}
diff --git a/lustre/obdclass/class_hash.c b/lustre/obdclass/class_hash.c
index 0dd18f1..0befb37 100644
--- a/lustre/obdclass/class_hash.c
+++ b/lustre/obdclass/class_hash.c
@@ -58,14 +58,14 @@
 /**
  * Initialize new lustre hash, where:
  * @name     - Descriptive hash name
- * @cur_size - Initial hash table size
- * @max_size - Maximum allowed hash table resize
+ * @cur_bits - Initial hash table size, in bits
+ * @max_bits - Maximum allowed hash table resize, in bits
  * @ops      - Registered hash table operations
  * @flags    - LH_REHASH enable dynamic hash resizing
  *           - LH_SORT enable chained hash sort
  */
 lustre_hash_t *
-lustre_hash_init(char *name, unsigned int cur_size, unsigned int max_size,
+lustre_hash_init(char *name, unsigned int cur_bits, unsigned int max_bits,
                  lustre_hash_ops_t *ops, int flags)
 {
         lustre_hash_t *lh;
@@ -75,14 +75,9 @@ lustre_hash_init(char *name, unsigned int cur_size, unsigned int max_size,
         LASSERT(name != NULL);
         LASSERT(ops != NULL);
 
-        /*
-         * Ensure hash is a power of two to allow the use of a bitmask
-         * in the hash function instead of a more expensive modulus.
- */ - LASSERTF(cur_size && (cur_size & (cur_size - 1)) == 0, - "Size (%u) is not power of 2\n", cur_size); - LASSERTF(max_size && (max_size & (max_size - 1)) == 0, - "Size (%u) is not power of 2\n", max_size); + LASSERT(cur_bits > 0); + LASSERT(max_bits >= cur_bits); + LASSERT(max_bits < 31); OBD_ALLOC_PTR(lh); if (!lh) @@ -92,21 +87,27 @@ lustre_hash_init(char *name, unsigned int cur_size, unsigned int max_size, atomic_set(&lh->lh_rehash_count, 0); atomic_set(&lh->lh_count, 0); rwlock_init(&lh->lh_rwlock); - lh->lh_cur_size = cur_size; - lh->lh_min_size = cur_size; - lh->lh_max_size = max_size; - lh->lh_min_theta = 500; /* theta * 1000 */ - lh->lh_max_theta = 2000; /* theta * 1000 */ + lh->lh_cur_bits = cur_bits; + lh->lh_cur_mask = (1 << cur_bits) - 1; + lh->lh_min_bits = cur_bits; + lh->lh_max_bits = max_bits; + /* XXX: need to fixup lustre_hash_rehash_bits() before this can be + * anything other than 0.5 and 2.0 */ + lh->lh_min_theta = 1 << (LH_THETA_BITS - 1); + lh->lh_max_theta = 1 << (LH_THETA_BITS + 1); lh->lh_ops = ops; lh->lh_flags = flags; - OBD_VMALLOC(lh->lh_buckets, sizeof(*lh->lh_buckets) * lh->lh_cur_size); + /* theta * 1000 */ + __lustre_hash_set_theta(lh, 500, 2000); + + OBD_VMALLOC(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits); if (!lh->lh_buckets) { OBD_FREE_PTR(lh); RETURN(NULL); } - for (i = 0; i < lh->lh_cur_size; i++) { + for (i = 0; i <= lh->lh_cur_mask; i++) { INIT_HLIST_HEAD(&lh->lh_buckets[i].lhb_head); rwlock_init(&lh->lh_buckets[i].lhb_rwlock); atomic_set(&lh->lh_buckets[i].lhb_count, 0); @@ -145,7 +146,7 @@ lustre_hash_exit(lustre_hash_t *lh) write_unlock(&lhb->lhb_rwlock); } - OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) * lh->lh_cur_size); + OBD_VFREE(lh->lh_buckets, sizeof(*lh->lh_buckets) << lh->lh_cur_bits); LASSERT(atomic_read(&lh->lh_count) == 0); write_unlock(&lh->lh_rwlock); @@ -154,18 +155,20 @@ lustre_hash_exit(lustre_hash_t *lh) } EXPORT_SYMBOL(lustre_hash_exit); -static inline unsigned int lustre_hash_rehash_size(lustre_hash_t *lh) +static inline unsigned int lustre_hash_rehash_bits(lustre_hash_t *lh) { if (!(lh->lh_flags & LH_REHASH)) return 0; - if ((lh->lh_cur_size < lh->lh_max_size) && + /* XXX: need to handle case with max_theta != 2.0 + * and the case with min_theta != 0.5 */ + if ((lh->lh_cur_bits < lh->lh_max_bits) && (__lustre_hash_theta(lh) > lh->lh_max_theta)) - return MIN(lh->lh_cur_size * 2, lh->lh_max_size); + return lh->lh_cur_bits + 1; - if ((lh->lh_cur_size > lh->lh_min_size) && + if ((lh->lh_cur_bits > lh->lh_min_bits) && (__lustre_hash_theta(lh) < lh->lh_min_theta)) - return MAX(lh->lh_cur_size / 2, lh->lh_min_size); + return lh->lh_cur_bits - 1; return 0; } @@ -178,26 +181,26 @@ void lustre_hash_add(lustre_hash_t *lh, void *key, struct hlist_node *hnode) { lustre_hash_bucket_t *lhb; - int size; + int bits; unsigned i; ENTRY; __lustre_hash_key_validate(lh, key, hnode); read_lock(&lh->lh_rwlock); - i = lh_hash(lh, key, lh->lh_cur_size - 1); + i = lh_hash(lh, key, lh->lh_cur_mask); lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); LASSERT(hlist_unhashed(hnode)); write_lock(&lhb->lhb_rwlock); __lustre_hash_bucket_add(lh, lhb, hnode); write_unlock(&lhb->lhb_rwlock); - size = lustre_hash_rehash_size(lh); + bits = lustre_hash_rehash_bits(lh); read_unlock(&lh->lh_rwlock); - if (size) - lustre_hash_rehash(lh, size); + if (bits) + lustre_hash_rehash(lh, bits); EXIT; } @@ -207,18 +210,18 @@ static struct hlist_node * lustre_hash_findadd_unique_hnode(lustre_hash_t *lh, void 
*key, struct hlist_node *hnode) { + int bits = 0; struct hlist_node *ehnode; lustre_hash_bucket_t *lhb; - int size; unsigned i; ENTRY; __lustre_hash_key_validate(lh, key, hnode); read_lock(&lh->lh_rwlock); - i = lh_hash(lh, key, lh->lh_cur_size - 1); + i = lh_hash(lh, key, lh->lh_cur_mask); lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); LASSERT(hlist_unhashed(hnode)); write_lock(&lhb->lhb_rwlock); @@ -228,13 +231,12 @@ lustre_hash_findadd_unique_hnode(lustre_hash_t *lh, void *key, } else { __lustre_hash_bucket_add(lh, lhb, hnode); ehnode = hnode; + bits = lustre_hash_rehash_bits(lh); } write_unlock(&lhb->lhb_rwlock); - - size = lustre_hash_rehash_size(lh); read_unlock(&lh->lh_rwlock); - if (size) - lustre_hash_rehash(lh, size); + if (bits) + lustre_hash_rehash(lh, bits); RETURN(ehnode); } @@ -251,9 +253,10 @@ lustre_hash_add_unique(lustre_hash_t *lh, void *key, struct hlist_node *hnode) ENTRY; ehnode = lustre_hash_findadd_unique_hnode(lh, key, hnode); - if (ehnode != hnode) + if (ehnode != hnode) { + lh_put(lh, ehnode); RETURN(-EALREADY); - + } RETURN(0); } EXPORT_SYMBOL(lustre_hash_add_unique); @@ -290,7 +293,6 @@ void * lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode) { lustre_hash_bucket_t *lhb; - int size; unsigned i; void *obj; ENTRY; @@ -298,19 +300,15 @@ lustre_hash_del(lustre_hash_t *lh, void *key, struct hlist_node *hnode) __lustre_hash_key_validate(lh, key, hnode); read_lock(&lh->lh_rwlock); - i = lh_hash(lh, key, lh->lh_cur_size - 1); + i = lh_hash(lh, key, lh->lh_cur_mask); lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); LASSERT(!hlist_unhashed(hnode)); write_lock(&lhb->lhb_rwlock); obj = __lustre_hash_bucket_del(lh, lhb, hnode); write_unlock(&lhb->lhb_rwlock); - - size = lustre_hash_rehash_size(lh); read_unlock(&lh->lh_rwlock); - if (size) - lustre_hash_rehash(lh, size); RETURN(obj); } @@ -327,15 +325,14 @@ lustre_hash_del_key(lustre_hash_t *lh, void *key) { struct hlist_node *hnode; lustre_hash_bucket_t *lhb; - int size; unsigned i; void *obj = NULL; ENTRY; read_lock(&lh->lh_rwlock); - i = lh_hash(lh, key, lh->lh_cur_size - 1); + i = lh_hash(lh, key, lh->lh_cur_mask); lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); write_lock(&lhb->lhb_rwlock); hnode = __lustre_hash_bucket_lookup(lh, lhb, key); @@ -343,11 +340,7 @@ lustre_hash_del_key(lustre_hash_t *lh, void *key) obj = __lustre_hash_bucket_del(lh, lhb, hnode); write_unlock(&lhb->lhb_rwlock); - - size = lustre_hash_rehash_size(lh); read_unlock(&lh->lh_rwlock); - if (size) - lustre_hash_rehash(lh, size); RETURN(obj); } @@ -371,9 +364,9 @@ lustre_hash_lookup(lustre_hash_t *lh, void *key) ENTRY; read_lock(&lh->lh_rwlock); - i = lh_hash(lh, key, lh->lh_cur_size - 1); + i = lh_hash(lh, key, lh->lh_cur_mask); lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); read_lock(&lhb->lhb_rwlock); hnode = __lustre_hash_bucket_lookup(lh, lhb, key); @@ -499,7 +492,7 @@ restart: } EXPORT_SYMBOL(lustre_hash_for_each_empty); - /* +/* * For each item in the lustre hash @lh which matches the @key call * the passed callback @func and pass to it as an argument each hash * item and the private @data. 
Before each callback ops->lh_get will @@ -517,9 +510,9 @@ lustre_hash_for_each_key(lustre_hash_t *lh, void *key, ENTRY; read_lock(&lh->lh_rwlock); - i = lh_hash(lh, key, lh->lh_cur_size - 1); + i = lh_hash(lh, key, lh->lh_cur_mask); lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); read_lock(&lhb->lhb_rwlock); hlist_for_each(hnode, &(lhb->lhb_head)) { @@ -540,7 +533,7 @@ lustre_hash_for_each_key(lustre_hash_t *lh, void *key, EXPORT_SYMBOL(lustre_hash_for_each_key); /** - * Rehash the lustre hash @lh to the given @size. This can be used + * Rehash the lustre hash @lh to the given @bits. This can be used * to grow the hash size when excessive chaining is detected, or to * shrink the hash when it is larger than needed. When the LH_REHASH * flag is set in @lh the lustre hash may be dynamically rehashed @@ -551,7 +544,7 @@ EXPORT_SYMBOL(lustre_hash_for_each_key); * theta thresholds for @lh are tunable via lustre_hash_set_theta(). */ int -lustre_hash_rehash(lustre_hash_t *lh, int size) +lustre_hash_rehash(lustre_hash_t *lh, int bits) { struct hlist_node *hnode; struct hlist_node *pos; @@ -560,18 +553,21 @@ lustre_hash_rehash(lustre_hash_t *lh, int size) lustre_hash_bucket_t *lh_lhb; lustre_hash_bucket_t *rehash_lhb; int i; - int lh_size; int theta; + int lh_mask; + int lh_bits; + int mask = (1 << bits) - 1; void *key; ENTRY; - LASSERT(size > 0); + LASSERT(!in_interrupt()); + LASSERT(mask > 0); - OBD_VMALLOC(rehash_buckets, sizeof(*rehash_buckets) * size); + OBD_VMALLOC(rehash_buckets, sizeof(*rehash_buckets) << bits); if (!rehash_buckets) RETURN(-ENOMEM); - for (i = 0; i < size; i++) { + for (i = 0; i <= mask; i++) { INIT_HLIST_HEAD(&rehash_buckets[i].lhb_head); rwlock_init(&rehash_buckets[i].lhb_rwlock); atomic_set(&rehash_buckets[i].lhb_count, 0); @@ -585,19 +581,21 @@ lustre_hash_rehash(lustre_hash_t *lh, int size) */ theta = __lustre_hash_theta(lh); if ((theta >= lh->lh_min_theta) && (theta <= lh->lh_max_theta)) { - OBD_VFREE(rehash_buckets, sizeof(*rehash_buckets) * size); + OBD_VFREE(rehash_buckets, sizeof(*rehash_buckets) << bits); write_unlock(&lh->lh_rwlock); RETURN(-EALREADY); } - lh_size = lh->lh_cur_size; + lh_bits = lh->lh_cur_bits; lh_buckets = lh->lh_buckets; + lh_mask = (1 << lh_bits) - 1; - lh->lh_cur_size = size; + lh->lh_cur_bits = bits; + lh->lh_cur_mask = (1 << bits) - 1; lh->lh_buckets = rehash_buckets; atomic_inc(&lh->lh_rehash_count); - for (i = 0; i < lh_size; i++) { + for (i = 0; i <= lh_mask; i++) { lh_lhb = &lh_buckets[i]; write_lock(&lh_lhb->lhb_rwlock); @@ -609,7 +607,7 @@ lustre_hash_rehash(lustre_hash_t *lh, int size) * Validate hnode is in the correct bucket. */ if (unlikely(lh->lh_flags & LH_DEBUG)) - LASSERT(lh_hash(lh, key, lh_size - 1) == i); + LASSERT(lh_hash(lh, key, lh_mask) == i); /* * Delete from old hash bucket. @@ -621,7 +619,7 @@ lustre_hash_rehash(lustre_hash_t *lh, int size) /* * Add to rehash bucket, ops->lh_key must be defined. 
*/ - rehash_lhb = &rehash_buckets[lh_hash(lh, key, size-1)]; + rehash_lhb = &rehash_buckets[lh_hash(lh, key, mask)]; hlist_add_head(hnode, &(rehash_lhb->lhb_head)); atomic_inc(&rehash_lhb->lhb_count); } @@ -631,7 +629,7 @@ lustre_hash_rehash(lustre_hash_t *lh, int size) write_unlock(&lh_lhb->lhb_rwlock); } - OBD_VFREE(lh_buckets, sizeof(*lh_buckets) * lh_size); + OBD_VFREE(lh_buckets, sizeof(*lh_buckets) << lh_bits); write_unlock(&lh->lh_rwlock); RETURN(0); @@ -661,13 +659,13 @@ void lustre_hash_rehash_key(lustre_hash_t *lh, void *old_key, void *new_key, read_lock(&lh->lh_rwlock); - i = lh_hash(lh, old_key, lh->lh_cur_size - 1); + i = lh_hash(lh, old_key, lh->lh_cur_mask); old_lhb = &lh->lh_buckets[i]; - LASSERT(i < lh->lh_cur_size); + LASSERT(i <= lh->lh_cur_mask); - j = lh_hash(lh, new_key, lh->lh_cur_size - 1); + j = lh_hash(lh, new_key, lh->lh_cur_mask); new_lhb = &lh->lh_buckets[j]; - LASSERT(j < lh->lh_cur_size); + LASSERT(j <= lh->lh_cur_mask); write_lock(&old_lhb->lhb_rwlock); write_lock(&new_lhb->lhb_rwlock); @@ -713,16 +711,19 @@ int lustre_hash_debug_str(lustre_hash_t *lh, char *str, int size) read_lock(&lh->lh_rwlock); theta = __lustre_hash_theta(lh); - c += snprintf(str + c, size - c, "%-36s ",lh->lh_name); - c += snprintf(str + c, size - c, "%5d ", lh->lh_cur_size); - c += snprintf(str + c, size - c, "%5d ", lh->lh_min_size); - c += snprintf(str + c, size - c, "%5d ", lh->lh_max_size); + c += snprintf(str + c, size - c, "%-36s ", lh->lh_name); + c += snprintf(str + c, size - c, "%5d ", 1 << lh->lh_cur_bits); + c += snprintf(str + c, size - c, "%5d ", 1 << lh->lh_min_bits); + c += snprintf(str + c, size - c, "%5d ", 1 << lh->lh_max_bits); c += snprintf(str + c, size - c, "%d.%03d ", - theta / 1000, theta % 1000); + __lustre_hash_theta_int(theta), + __lustre_hash_theta_frac(theta)); c += snprintf(str + c, size - c, "%d.%03d ", - lh->lh_min_theta / 1000, lh->lh_min_theta % 1000); + __lustre_hash_theta_int(lh->lh_min_theta), + __lustre_hash_theta_frac(lh->lh_min_theta)); c += snprintf(str + c, size - c, "%d.%03d ", - lh->lh_max_theta / 1000, lh->lh_max_theta % 1000); + __lustre_hash_theta_int(lh->lh_max_theta), + __lustre_hash_theta_frac(lh->lh_max_theta)); c += snprintf(str + c, size - c, " 0x%02x ", lh->lh_flags); c += snprintf(str + c, size - c, "%6d ", atomic_read(&lh->lh_rehash_count)); @@ -743,7 +744,7 @@ int lustre_hash_debug_str(lustre_hash_t *lh, char *str, int size) * Non-Uniform hash distribution: 128/125/0/0/0/0/2/1 */ lh_for_each_bucket(lh, lhb, i) - dist[MIN(__fls(atomic_read(&lhb->lhb_count)/MAX(theta,1)),7)]++; + dist[min(__fls(atomic_read(&lhb->lhb_count)/max(theta,1)),7)]++; for (i = 0; i < 8; i++) c += snprintf(str + c, size - c, "%d%c", dist[i], diff --git a/lustre/obdclass/class_obd.c b/lustre/obdclass/class_obd.c index 2911586..d5bbe44 100644 --- a/lustre/obdclass/class_obd.c +++ b/lustre/obdclass/class_obd.c @@ -76,6 +76,7 @@ unsigned int obd_timeout = OBD_TIMEOUT_DEFAULT; /* seconds */ unsigned int ldlm_timeout = LDLM_TIMEOUT_DEFAULT; /* seconds */ unsigned int obd_max_dirty_pages = 256; atomic_t obd_dirty_pages; +atomic_t obd_dirty_transit_pages; cfs_waitq_t obd_race_waitq; int obd_race_state; @@ -390,6 +391,7 @@ EXPORT_SYMBOL(obd_timeout); EXPORT_SYMBOL(ldlm_timeout); EXPORT_SYMBOL(obd_max_dirty_pages); EXPORT_SYMBOL(obd_dirty_pages); +EXPORT_SYMBOL(obd_dirty_transit_pages); EXPORT_SYMBOL(ptlrpc_put_connection_superhack); EXPORT_SYMBOL(proc_lustre_root); @@ -403,7 +405,6 @@ EXPORT_SYMBOL(class_name2obd); EXPORT_SYMBOL(class_uuid2dev); 
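+/*
+ * Illustrative note on the class_hash.c conversion above (a sketch,
+ * not part of this patch): table sizes are now expressed as bit
+ * counts, so every table is a power of two by construction and the
+ * cheap bitmask still replaces a modulus.  A table that used to be
+ * created with 128 buckets is now requested with 7 bits; example_ops
+ * below is a hypothetical lustre_hash_ops_t:
+ *
+ *         lustre_hash_t *lh;
+ *
+ *         lh = lustre_hash_init("EXAMPLE", 7, 10, &example_ops,
+ *                               LH_REHASH);
+ *
+ * With LH_REHASH set, lustre_hash_rehash_bits() grows or shrinks the
+ * table one bit at a time whenever the load factor theta leaves the
+ * [lh_min_theta, lh_max_theta] window; theta itself is now kept as an
+ * LH_THETA_BITS fixed-point value rather than as theta * 1000.
+ */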
EXPORT_SYMBOL(class_uuid2obd); EXPORT_SYMBOL(class_find_client_obd); -EXPORT_SYMBOL(class_find_client_notype); EXPORT_SYMBOL(class_devices_in_group); EXPORT_SYMBOL(class_conn2export); EXPORT_SYMBOL(class_exp2obd); @@ -590,10 +591,10 @@ int init_obdclass(void) err = obd_init_caches(); if (err) return err; -#ifdef __KERNEL__ err = lu_global_init(); if (err) return err; +#ifdef __KERNEL__ err = class_procfs_init(); if (err) return err; diff --git a/lustre/obdclass/dt_object.c b/lustre/obdclass/dt_object.c index 2a99005..79c160e 100644 --- a/lustre/obdclass/dt_object.c +++ b/lustre/obdclass/dt_object.c @@ -52,6 +52,28 @@ /* fid_be_to_cpu() */ #include +struct dt_find_hint { + struct lu_fid *dfh_fid; + struct dt_device *dfh_dt; + struct dt_object *dfh_o; +}; + +struct dt_thread_info { + char dti_buf[DT_MAX_PATH]; + struct lu_fid_pack dti_pack; + struct dt_find_hint dti_dfh; +}; + +/* context key constructor/destructor: dt_global_key_init, dt_global_key_fini */ +LU_KEY_INIT(dt_global, struct dt_thread_info); +LU_KEY_FINI(dt_global, struct dt_thread_info); + +static struct lu_context_key dt_key = { + .lct_tags = LCT_MD_THREAD|LCT_DT_THREAD, + .lct_init = dt_global_key_init, + .lct_fini = dt_global_key_fini +}; + /* no lock is necessary to protect the list, because call-backs * are added during system startup. Please refer to "struct dt_device". */ @@ -157,13 +179,44 @@ int dt_try_as_dir(const struct lu_env *env, struct dt_object *obj) } EXPORT_SYMBOL(dt_try_as_dir); -extern struct lu_context_key lu_global_key; +enum dt_format_type dt_mode_to_dft(__u32 mode) +{ + enum dt_format_type result; + + switch (mode & S_IFMT) { + case S_IFDIR: + result = DFT_DIR; + break; + case S_IFREG: + result = DFT_REGULAR; + break; + case S_IFLNK: + result = DFT_SYM; + break; + case S_IFCHR: + case S_IFBLK: + case S_IFIFO: + case S_IFSOCK: + result = DFT_NODE; + break; + default: + LBUG(); + break; + } + return result; +} + +EXPORT_SYMBOL(dt_mode_to_dft); +/** + * lookup fid for object named \a name in directory \a dir. + */ static int dt_lookup(const struct lu_env *env, struct dt_object *dir, const char *name, struct lu_fid *fid) { - struct lu_fid_pack *pack = lu_context_key_get(&env->le_ctx, - &lu_global_key); + struct dt_thread_info *info = lu_context_key_get(&env->le_ctx, + &dt_key); + struct lu_fid_pack *pack = &info->dti_pack; struct dt_rec *rec = (struct dt_rec *)pack; const struct dt_key *key = (const struct dt_key *)name; int result; @@ -171,16 +224,21 @@ static int dt_lookup(const struct lu_env *env, struct dt_object *dir, if (dt_try_as_dir(env, dir)) { result = dir->do_index_ops->dio_lookup(env, dir, rec, key, BYPASS_CAPA); - if (result == 0) + if (result > 0) result = fid_unpack(pack, fid); + else if (result == 0) + result = -ENOENT; } else result = -ENOTDIR; return result; } -static struct dt_object *dt_locate(const struct lu_env *env, - struct dt_device *dev, - const struct lu_fid *fid) +/** + * get object for given \a fid. 
+ */
+struct dt_object *dt_locate(const struct lu_env *env,
+                            struct dt_device *dev,
+                            const struct lu_fid *fid)
 {
         struct lu_object *obj;
         struct dt_object *dt;
@@ -191,38 +249,156 @@ static struct dt_object *dt_locate(const struct lu_env *env,
                 LASSERT(obj != NULL);
                 dt = container_of(obj, struct dt_object, do_lu);
         } else
-                dt = (void *)obj;
+                dt = (struct dt_object *)obj;
         return dt;
 }
+EXPORT_SYMBOL(dt_locate);
 
-struct dt_object *dt_store_open(const struct lu_env *env,
-                                struct dt_device *dt, const char *name,
-                                struct lu_fid *fid)
+/**
+ * Find an object named \a entry in the given \a dfh->dfh_o directory.
+ */
+static int dt_find_entry(const struct lu_env *env, const char *entry, void *data)
 {
+        struct dt_find_hint *dfh = data;
+        struct dt_device *dt = dfh->dfh_dt;
+        struct lu_fid *fid = dfh->dfh_fid;
+        struct dt_object *obj = dfh->dfh_o;
+        int result;
+
+        result = dt_lookup(env, obj, entry, fid);
+        lu_object_put(env, &obj->do_lu);
+        if (result == 0) {
+                obj = dt_locate(env, dt, fid);
+                if (IS_ERR(obj))
+                        result = PTR_ERR(obj);
+        }
+        dfh->dfh_o = obj;
+        return result;
+}
+
+/**
+ * Abstract function which parses a path name, feeding each path
+ * component to \a entry_func.
+ */
+int dt_path_parser(const struct lu_env *env,
+                   char *path, dt_entry_func_t entry_func,
+                   void *data)
+{
+        char *e;
+        int rc = 0;
+
+        while (1) {
+                e = strsep(&path, "/");
+                if (e == NULL)
+                        break;
+
+                if (e[0] == 0) {
+                        if (!path || path[0] == '\0')
+                                break;
+                        continue;
+                }
+                rc = entry_func(env, e, data);
+                if (rc)
+                        break;
+        }
+
+        return rc;
+}
+
+static struct dt_object *dt_store_resolve(const struct lu_env *env,
+                                          struct dt_device *dt,
+                                          const char *path,
+                                          struct lu_fid *fid)
+{
+        struct dt_thread_info *info = lu_context_key_get(&env->le_ctx,
+                                                         &dt_key);
+        struct dt_find_hint *dfh = &info->dti_dfh;
+        struct dt_object *obj;
+        char *local = info->dti_buf;
         int result;
 
-        struct dt_object *root;
-        struct dt_object *child;
+        dfh->dfh_dt = dt;
+        dfh->dfh_fid = fid;
+
+        strncpy(local, path, DT_MAX_PATH);
+        local[DT_MAX_PATH - 1] = '\0';
 
         result = dt->dd_ops->dt_root_get(env, dt, fid);
         if (result == 0) {
-                root = dt_locate(env, dt, fid);
-                if (!IS_ERR(root)) {
-                        result = dt_lookup(env, root, name, fid);
-                        if (result == 0)
-                                child = dt_locate(env, dt, fid);
+                obj = dt_locate(env, dt, fid);
+                if (!IS_ERR(obj)) {
+                        dfh->dfh_o = obj;
+                        result = dt_path_parser(env, local, dt_find_entry, dfh);
+                        if (result != 0)
+                                obj = ERR_PTR(result);
                         else
-                                child = ERR_PTR(result);
-                        lu_object_put(env, &root->do_lu);
-                } else {
-                        CERROR("No root\n");
-                        child = (void *)root;
+                                obj = dfh->dfh_o;
                 }
-        } else
-                child = ERR_PTR(result);
-        return child;
+        } else {
+                obj = ERR_PTR(result);
+        }
+        return obj;
+}
+
+static struct dt_object *dt_reg_open(const struct lu_env *env,
+                                     struct dt_device *dt,
+                                     struct dt_object *p,
+                                     const char *name,
+                                     struct lu_fid *fid)
+{
+        struct dt_object *o;
+        int result;
+
+        result = dt_lookup(env, p, name, fid);
+        if (result == 0)
+                o = dt_locate(env, dt, fid);
+        else
+                o = ERR_PTR(result);
+
+        return o;
+}
+
+/**
+ * Open dt object named \a filename from \a dirname directory.
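+ *
+ * Illustrative usage (a sketch; "EXAMPLE_FILE" and the surrounding
+ * code are hypothetical, not part of this patch):
+ *
+ *         struct lu_fid fid;
+ *         struct dt_object *o;
+ *
+ *         o = dt_store_open(env, dt, "", "EXAMPLE_FILE", &fid);
+ *         if (!IS_ERR(o)) {
+ *                 ... use the object ...
+ *                 lu_object_put(env, &o->do_lu);
+ *         }
+ *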
+ * \param dt  dt device
+ * \param fid on success, object fid is stored in *fid
+ */
+struct dt_object *dt_store_open(const struct lu_env *env,
+                                struct dt_device *dt,
+                                const char *dirname,
+                                const char *filename,
+                                struct lu_fid *fid)
+{
+        struct dt_object *file;
+        struct dt_object *dir;
+
+        dir = dt_store_resolve(env, dt, dirname, fid);
+        if (!IS_ERR(dir)) {
+                file = dt_reg_open(env, dt, dir,
+                                   filename, fid);
+                lu_object_put(env, &dir->do_lu);
+        } else {
+                file = dir;
+        }
+        return file;
 }
 EXPORT_SYMBOL(dt_store_open);
 
+/* dt class init function. */
+int dt_global_init(void)
+{
+        int result;
+
+        LU_CONTEXT_KEY_INIT(&dt_key);
+        result = lu_context_key_register(&dt_key);
+        return result;
+}
+
+void dt_global_fini(void)
+{
+        lu_context_key_degister(&dt_key);
+}
+
 const struct dt_index_features dt_directory_features;
 EXPORT_SYMBOL(dt_directory_features);
diff --git a/lustre/obdclass/genops.c b/lustre/obdclass/genops.c
index 3051655..2de3465 100644
--- a/lustre/obdclass/genops.c
+++ b/lustre/obdclass/genops.c
@@ -506,18 +506,6 @@ struct obd_device * class_find_client_obd(struct obd_uuid *tgt_uuid,
         return NULL;
 }
 
-struct obd_device *class_find_client_notype(struct obd_uuid *tgt_uuid,
-                                            struct obd_uuid *grp_uuid)
-{
-        struct obd_device *obd;
-
-        obd = class_find_client_obd(tgt_uuid, LUSTRE_MDC_NAME, NULL);
-        if (!obd)
-                obd = class_find_client_obd(tgt_uuid, LUSTRE_OSC_NAME,
-                                            grp_uuid);
-        return obd;
-}
-
 /* Iterate the obd_device list looking for devices that have grp_uuid. Start
    searching at *next, and if a device is found, the next index to look
    at is saved in *next. If next is NULL, then the first matching device
@@ -550,6 +538,49 @@ struct obd_device * class_devices_in_group(struct obd_uuid *grp_uuid, int *next)
         return NULL;
 }
 
+/**
+ * Notify every relevant OBD that the sptlrpc config log for @fsname has
+ * changed, so that each can adjust its sptlrpc settings accordingly.
+ */
+int class_notify_sptlrpc_conf(const char *fsname, int namelen)
+{
+        struct obd_device *obd;
+        const char *type;
+        int i, rc = 0, rc2;
+
+        LASSERT(namelen > 0);
+
+        spin_lock(&obd_dev_lock);
+        for (i = 0; i < class_devno_max(); i++) {
+                obd = class_num2obd(i);
+
+                if (obd == NULL || obd->obd_set_up == 0 || obd->obd_stopping)
+                        continue;
+
+                /* only notify mdc, osc, mdt, ost */
+                type = obd->obd_type->typ_name;
+                if (strcmp(type, LUSTRE_MDC_NAME) != 0 &&
+                    strcmp(type, LUSTRE_OSC_NAME) != 0 &&
+                    strcmp(type, LUSTRE_MDT_NAME) != 0 &&
+                    strcmp(type, LUSTRE_OST_NAME) != 0)
+                        continue;
+
+                if (strncmp(obd->obd_name, fsname, namelen))
+                        continue;
+
+                class_incref(obd, __FUNCTION__, obd);
+                spin_unlock(&obd_dev_lock);
+                rc2 = obd_set_info_async(obd->obd_self_export,
+                                         sizeof(KEY_SPTLRPC_CONF),
+                                         KEY_SPTLRPC_CONF, 0, NULL, NULL);
+                rc = rc ?
rc : rc2; + class_decref(obd, __FUNCTION__, obd); + spin_lock(&obd_dev_lock); + } + spin_unlock(&obd_dev_lock); + return rc; +} +EXPORT_SYMBOL(class_notify_sptlrpc_conf); void obd_cleanup_caches(void) { @@ -729,6 +760,7 @@ static void class_export_destroy(struct obd_export *exp) LASSERT(list_empty(&exp->exp_outstanding_replies)); LASSERT(list_empty(&exp->exp_req_replay_queue)); + LASSERT(list_empty(&exp->exp_queued_rpc)); obd_destroy_export(exp); class_decref(obd, "export", exp); @@ -757,6 +789,7 @@ struct obd_export *class_new_export(struct obd_device *obd, CFS_INIT_LIST_HEAD(&export->exp_outstanding_replies); CFS_INIT_LIST_HEAD(&export->exp_req_replay_queue); CFS_INIT_LIST_HEAD(&export->exp_handle.h_link); + CFS_INIT_LIST_HEAD(&export->exp_queued_rpc); class_handle_hash(&export->exp_handle, export_handle_addref); export->exp_last_request_time = cfs_time_current_sec(); spin_lock_init(&export->exp_lock); @@ -825,8 +858,9 @@ struct obd_import *class_import_get(struct obd_import *import) LASSERT(atomic_read(&import->imp_refcount) >= 0); LASSERT(atomic_read(&import->imp_refcount) < 0x5a5a5a); atomic_inc(&import->imp_refcount); - CDEBUG(D_INFO, "import %p refcount=%d\n", import, - atomic_read(&import->imp_refcount)); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount), + import->imp_obd->obd_name); return import; } EXPORT_SYMBOL(class_import_get); @@ -839,13 +873,12 @@ void class_import_put(struct obd_import *import) LASSERT(atomic_read(&import->imp_refcount) < 0x5a5a5a); LASSERT(list_empty(&import->imp_zombie_chain)); - CDEBUG(D_INFO, "import %p refcount=%d\n", import, - atomic_read(&import->imp_refcount) - 1); + CDEBUG(D_INFO, "import %p refcount=%d obd=%s\n", import, + atomic_read(&import->imp_refcount) - 1, + import->imp_obd->obd_name); if (atomic_dec_and_test(&import->imp_refcount)) { - CDEBUG(D_INFO, "final put import %p\n", import); - spin_lock(&obd_zombie_impexp_lock); list_add(&import->imp_zombie_chain, &obd_zombie_imports); spin_unlock(&obd_zombie_impexp_lock); @@ -917,6 +950,7 @@ struct obd_import *class_new_import(struct obd_device *obd) cfs_waitq_init(&imp->imp_recovery_waitq); atomic_set(&imp->imp_refcount, 2); + atomic_set(&imp->imp_unregistering, 0); atomic_set(&imp->imp_inflight, 0); atomic_set(&imp->imp_replay_inflight, 0); atomic_set(&imp->imp_inval_count, 0); @@ -1164,146 +1198,6 @@ int class_disconnect_stale_exports(struct obd_device *obd, } EXPORT_SYMBOL(class_disconnect_stale_exports); -int oig_init(struct obd_io_group **oig_out) -{ - struct obd_io_group *oig; - ENTRY; - - OBD_ALLOC(oig, sizeof(*oig)); - if (oig == NULL) - RETURN(-ENOMEM); - - spin_lock_init(&oig->oig_lock); - oig->oig_rc = 0; - oig->oig_pending = 0; - atomic_set(&oig->oig_refcount, 1); - cfs_waitq_init(&oig->oig_waitq); - CFS_INIT_LIST_HEAD(&oig->oig_occ_list); - - *oig_out = oig; - RETURN(0); -}; -EXPORT_SYMBOL(oig_init); - -static inline void oig_grab(struct obd_io_group *oig) -{ - atomic_inc(&oig->oig_refcount); -} - -void oig_release(struct obd_io_group *oig) -{ - if (atomic_dec_and_test(&oig->oig_refcount)) - OBD_FREE(oig, sizeof(*oig)); -} -EXPORT_SYMBOL(oig_release); - -int oig_add_one(struct obd_io_group *oig, struct oig_callback_context *occ) -{ - int rc = 0; - CDEBUG(D_CACHE, "oig %p ready to roll\n", oig); - spin_lock(&oig->oig_lock); - if (oig->oig_rc) { - rc = oig->oig_rc; - } else { - oig->oig_pending++; - if (occ != NULL) - list_add_tail(&occ->occ_oig_item, &oig->oig_occ_list); - } - spin_unlock(&oig->oig_lock); - oig_grab(oig); - - return rc; 
-} -EXPORT_SYMBOL(oig_add_one); - -void oig_complete_one(struct obd_io_group *oig, - struct oig_callback_context *occ, int rc) -{ - cfs_waitq_t *wake = NULL; - int old_rc; - - spin_lock(&oig->oig_lock); - - if (occ != NULL) - list_del_init(&occ->occ_oig_item); - - old_rc = oig->oig_rc; - if (oig->oig_rc == 0 && rc != 0) - oig->oig_rc = rc; - - if (--oig->oig_pending <= 0) - wake = &oig->oig_waitq; - - spin_unlock(&oig->oig_lock); - - CDEBUG(D_CACHE, "oig %p completed, rc %d -> %d via %d, %d now " - "pending (racey)\n", oig, old_rc, oig->oig_rc, rc, - oig->oig_pending); - if (wake) - cfs_waitq_signal(wake); - oig_release(oig); -} -EXPORT_SYMBOL(oig_complete_one); - -static int oig_done(struct obd_io_group *oig) -{ - int rc = 0; - spin_lock(&oig->oig_lock); - if (oig->oig_pending <= 0) - rc = 1; - spin_unlock(&oig->oig_lock); - return rc; -} - -static void interrupted_oig(void *data) -{ - struct obd_io_group *oig = data; - struct oig_callback_context *occ; - - spin_lock(&oig->oig_lock); - /* We need to restart the processing each time we drop the lock, as - * it is possible other threads called oig_complete_one() to remove - * an entry elsewhere in the list while we dropped lock. We need to - * drop the lock because osc_ap_completion() calls oig_complete_one() - * which re-gets this lock ;-) as well as a lock ordering issue. */ -restart: - list_for_each_entry(occ, &oig->oig_occ_list, occ_oig_item) { - if (occ->interrupted) - continue; - occ->interrupted = 1; - spin_unlock(&oig->oig_lock); - occ->occ_interrupted(occ); - spin_lock(&oig->oig_lock); - goto restart; - } - spin_unlock(&oig->oig_lock); -} - -int oig_wait(struct obd_io_group *oig) -{ - struct l_wait_info lwi = LWI_INTR(interrupted_oig, oig); - int rc; - - CDEBUG(D_CACHE, "waiting for oig %p\n", oig); - - do { - rc = l_wait_event(oig->oig_waitq, oig_done(oig), &lwi); - LASSERTF(rc == 0 || rc == -EINTR, "rc: %d\n", rc); - /* we can't continue until the oig has emptied and stopped - * referencing state that the caller will free upon return */ - if (rc == -EINTR) - lwi = (struct l_wait_info){ 0, }; - } while (rc == -EINTR); - - LASSERTF(oig->oig_pending == 0, - "exiting oig_wait(oig = %p) with %d pending\n", oig, - oig->oig_pending); - - CDEBUG(D_CACHE, "done waiting on oig %p rc %d\n", oig, oig->oig_rc); - return oig->oig_rc; -} -EXPORT_SYMBOL(oig_wait); - void class_fail_export(struct obd_export *exp) { int rc, already_failed; @@ -1461,7 +1355,7 @@ enum { /** * check for work for kill zombie import/export thread. 
 */
-int obd_zombie_impexp_check(void *arg)
+static int obd_zombie_impexp_check(void *arg)
 {
         int rc;
@@ -1483,6 +1377,32 @@ static void obd_zombie_impexp_notify(void)
         cfs_waitq_signal(&obd_zombie_waitq);
 }
 
+/**
+ * check whether obd_zombie is idle
+ */
+static int obd_zombie_is_idle(void)
+{
+        int rc;
+
+        LASSERT(!test_bit(OBD_ZOMBIE_STOP, &obd_zombie_flags));
+        spin_lock(&obd_zombie_impexp_lock);
+        rc = list_empty(&obd_zombie_imports) &&
+             list_empty(&obd_zombie_exports);
+        spin_unlock(&obd_zombie_impexp_lock);
+        return rc;
+}
+
+/**
+ * wait until the obd_zombie import/export queues are empty
+ */
+void obd_zombie_barrier(void)
+{
+        struct l_wait_info lwi = { 0 };
+
+        l_wait_event(obd_zombie_waitq, obd_zombie_is_idle(), &lwi);
+}
+EXPORT_SYMBOL(obd_zombie_barrier);
+
 #ifdef __KERNEL__
 
 /**
@@ -1505,6 +1425,8 @@ static int obd_zombie_impexp_thread(void *unused)
                 l_wait_event(obd_zombie_waitq,
                              !obd_zombie_impexp_check(NULL), &lwi);
                 obd_zombie_impexp_cull();
+                /* Notify obd_zombie_barrier callers that queues may be empty */
+                cfs_waitq_signal(&obd_zombie_waitq);
         }
 
         complete(&obd_zombie_stop);
diff --git a/lustre/obdclass/linux/linux-module.c b/lustre/obdclass/linux/linux-module.c
index a7feee4..a801349 100644
--- a/lustre/obdclass/linux/linux-module.c
+++ b/lustre/obdclass/linux/linux-module.c
@@ -345,7 +345,7 @@ static void obd_device_list_seq_stop(struct seq_file *p, void *v)
 }
 
 static void *obd_device_list_seq_next(struct seq_file *p, void *v, loff_t *pos)
-{ 
+{
         ++*pos;
         if (*pos >= class_devno_max())
                 return NULL;
diff --git a/lustre/obdclass/linux/linux-obdo.c b/lustre/obdclass/linux/linux-obdo.c
index 775f6c0..e85b5ab 100644
--- a/lustre/obdclass/linux/linux-obdo.c
+++ b/lustre/obdclass/linux/linux-obdo.c
@@ -65,7 +65,7 @@ void obdo_from_inode(struct obdo *dst, struct inode *src, obd_flag valid)
 
         if (valid & (OBD_MD_FLCTIME | OBD_MD_FLMTIME))
                 CDEBUG(D_INODE, "valid %x, new time %lu/%lu\n",
-                       valid, LTIME_S(src->i_mtime), 
+                       valid, LTIME_S(src->i_mtime),
                        LTIME_S(src->i_ctime));
 
         if (valid & OBD_MD_FLATIME) {
@@ -185,7 +185,7 @@ void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
         /* mtime is always updated with ctime, but can be set in past.
            As write and utime(2) may happen within 1 second, and utime's
-           mtime has a priority over write's one, leave mtime from mds 
+           mtime has a priority over write's one, leave mtime from mds
            for the same ctimes. */
         if (valid & OBD_MD_FLCTIME && src->o_ctime > LTIME_S(dst->i_ctime)) {
                 LTIME_S(dst->i_ctime) = src->o_ctime;
@@ -211,6 +211,10 @@ void obdo_refresh_inode(struct inode *dst, struct obdo *src, obd_flag valid)
         /* allocation of space */
         if (valid & OBD_MD_FLBLOCKS && src->o_blocks > dst->i_blocks)
+                /*
+                 * XXX shouldn't overflow be checked here, like in
+                 * obdo_to_inode()?
+ */ dst->i_blocks = src->o_blocks; } EXPORT_SYMBOL(obdo_refresh_inode); diff --git a/lustre/obdclass/llog_cat.c b/lustre/obdclass/llog_cat.c index 73ee3c5..a5c568b 100644 --- a/lustre/obdclass/llog_cat.c +++ b/lustre/obdclass/llog_cat.c @@ -181,7 +181,7 @@ int llog_cat_id2handle(struct llog_handle *cathandle, struct llog_handle **res, if (!rc) { loghandle->u.phd.phd_cat_handle = cathandle; loghandle->u.phd.phd_cookie.lgc_lgl = cathandle->lgh_id; - loghandle->u.phd.phd_cookie.lgc_index = + loghandle->u.phd.phd_cookie.lgc_index = loghandle->lgh_hdr->llh_cat_idx; } @@ -446,14 +446,14 @@ int llog_cat_process_thread(void *data) if (cb) { rc = llog_cat_process(llh, (llog_cb_t)cb, NULL); - if (rc != LLOG_PROC_BREAK) + if (rc != LLOG_PROC_BREAK && rc != 0) CERROR("llog_cat_process() failed %d\n", rc); } else { CWARN("No callback function for recovery\n"); } - /* - * Make sure that all cached data is sent. + /* + * Make sure that all cached data is sent. */ llog_sync(ctxt, NULL); GOTO(release_llh, rc); diff --git a/lustre/obdclass/llog_lvfs.c b/lustre/obdclass/llog_lvfs.c index 47c1e53..60ee61c 100644 --- a/lustre/obdclass/llog_lvfs.c +++ b/lustre/obdclass/llog_lvfs.c @@ -108,7 +108,7 @@ static int llog_lvfs_write_blob(struct obd_device *obd, struct l_file *file, file->f_pos = off; - if (buflen == 0) + if (buflen == 0) CWARN("0-length record\n"); if (!buf) { @@ -244,8 +244,8 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle, RETURN(rc); if (buf) - /* write_blob adds header and tail to lrh_len. */ - reclen = sizeof(*rec) + rec->lrh_len + + /* write_blob adds header and tail to lrh_len. */ + reclen = sizeof(*rec) + rec->lrh_len + sizeof(struct llog_rec_tail); if (idx != -1) { @@ -260,7 +260,7 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle, if (idx && llh->llh_size && llh->llh_size != rec->lrh_len) RETURN(-EINVAL); - if (!ext2_test_bit(idx, llh->llh_bitmap)) + if (!ext2_test_bit(idx, llh->llh_bitmap)) CERROR("Modify unset record %u\n", idx); if (idx != rec->lrh_index) CERROR("Index mismatch %d %u\n", idx, rec->lrh_index); @@ -290,13 +290,13 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle, RETURN(-EFAULT); } #if 1 /* FIXME remove this safety check at some point */ - /* Verify that the record we're modifying is the + /* Verify that the record we're modifying is the right one. 
*/ rc = llog_lvfs_read_blob(obd, file, &check, sizeof(check), saved_offset); if (check.lrh_index != idx || check.lrh_len != reclen) { CERROR("Bad modify idx %u/%u size %u/%u (%d)\n", - idx, check.lrh_index, reclen, + idx, check.lrh_index, reclen, check.lrh_len, rc); RETURN(-EFAULT); } @@ -366,7 +366,7 @@ static int llog_lvfs_write_rec(struct llog_handle *loghandle, if (rc == 0 && reccookie) { reccookie->lgc_lgl = loghandle->lgh_id; reccookie->lgc_index = index; - if ((rec->lrh_type == MDS_UNLINK_REC) || + if ((rec->lrh_type == MDS_UNLINK_REC) || (rec->lrh_type == MDS_SETATTR_REC)) reccookie->lgc_subsys = LLOG_MDS_OST_ORIG_CTXT; else if (rec->lrh_type == OST_SZ_REC) @@ -639,12 +639,12 @@ static int llog_lvfs_create(struct llog_ctxt *ctxt, struct llog_handle **res, } else if (name) { /* COMPAT_146 */ if (strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME) == 0) { - handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name, + handle->lgh_file = llog_filp_open(MDT_LOGS_DIR, name, open_flags, 0644); } else { /* end COMPAT_146 */ handle->lgh_file = llog_filp_open(MOUNT_CONFIGS_DIR, - name, open_flags, + name, open_flags, 0644); } if (IS_ERR(handle->lgh_file)) @@ -760,7 +760,7 @@ static int llog_lvfs_destroy(struct llog_handle *handle) if (rc) GOTO(out, rc); - rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL, NULL); + rc = obd_destroy(handle->lgh_ctxt->loc_exp, oa, NULL, NULL, NULL, NULL); out: OBDO_FREE(oa); RETURN(rc); @@ -777,7 +777,7 @@ int llog_get_cat_list(struct obd_device *obd, struct obd_device *disk_obd, loff_t off = idx * sizeof(*idarray); ENTRY; - if (!count) + if (!count) RETURN(0); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); diff --git a/lustre/obdclass/llog_obd.c b/lustre/obdclass/llog_obd.c index 6d24d60..13a1e9a 100644 --- a/lustre/obdclass/llog_obd.c +++ b/lustre/obdclass/llog_obd.c @@ -72,7 +72,7 @@ static void llog_ctxt_destroy(struct llog_ctxt *ctxt) class_import_put(ctxt->loc_imp); ctxt->loc_imp = NULL; } - + LASSERT(ctxt->loc_llcd == NULL); OBD_FREE_PTR(ctxt); return; } @@ -134,7 +134,7 @@ int llog_cleanup(struct llog_ctxt *ctxt) /* try to free the ctxt */ rc = __llog_ctxt_put(ctxt); if (rc) - CERROR("Error %d while cleaning up ctxt %p\n", + CERROR("Error %d while cleaning up ctxt %p\n", rc, ctxt); l_wait_event(olg->olg_waitq, @@ -227,7 +227,7 @@ int llog_add(struct llog_ctxt *ctxt, struct llog_rec_hdr *rec, CERROR("No ctxt\n"); RETURN(-ENODEV); } - + CTXT_CHECK_OP(ctxt, add, -EOPNOTSUPP); raised = cfs_cap_raised(CFS_CAP_SYS_RESOURCE); if (!raised) @@ -249,7 +249,7 @@ int llog_cancel(struct llog_ctxt *ctxt, struct lov_stripe_md *lsm, CERROR("No ctxt\n"); RETURN(-ENODEV); } - + CTXT_CHECK_OP(ctxt, cancel, -EOPNOTSUPP); rc = CTXTP(ctxt, cancel)(ctxt, lsm, count, cookies, flags); RETURN(rc); diff --git a/lustre/obdclass/llog_swab.c b/lustre/obdclass/llog_swab.c index d91c1df..b76cca8 100644 --- a/lustre/obdclass/llog_swab.c +++ b/lustre/obdclass/llog_swab.c @@ -107,12 +107,13 @@ void lustre_swab_lu_fid(struct lu_fid *fid) } EXPORT_SYMBOL(lustre_swab_lu_fid); -void lustre_swab_lu_range(struct lu_range *range) +void lustre_swab_lu_seq_range(struct lu_seq_range *range) { - __swab64s (&range->lr_start); - __swab64s (&range->lr_end); + __swab64s (&range->lsr_start); + __swab64s (&range->lsr_end); + __swab32s (&range->lsr_mdt); } -EXPORT_SYMBOL(lustre_swab_lu_range); +EXPORT_SYMBOL(lustre_swab_lu_seq_range); void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct llog_rec_tail *tail) { @@ -154,6 +155,17 @@ void lustre_swab_llog_rec(struct llog_rec_hdr *rec, struct 
llog_rec_tail *tail) break; } + case MDS_SETATTR64_REC: { + struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec; + + __swab64s(&lsr->lsr_oid); + __swab32s(&lsr->lsr_ogen); + __swab32s(&lsr->lsr_uid); + __swab32s(&lsr->lsr_gid); + + break; + } + case OBD_CFG_REC: case PTL_CFG_REC: /* obsolete */ /* these are swabbed as they are consumed */ diff --git a/lustre/obdclass/lprocfs_status.c b/lustre/obdclass/lprocfs_status.c index 393c008..a347c0d 100644 --- a/lustre/obdclass/lprocfs_status.c +++ b/lustre/obdclass/lprocfs_status.c @@ -686,6 +686,7 @@ int lprocfs_rd_import(char *page, char **start, off_t off, int count, " target: %s@%s\n" " state: %s\n" " inflight: %u\n" + " unregistering: %u\n" " conn_cnt: %u\n" " generation: %u\n" " inval_cnt: %u\n" @@ -697,6 +698,7 @@ int lprocfs_rd_import(char *page, char **start, off_t off, int count, obd2cli_tgt(obd), imp->imp_connection->c_remote_uuid.uuid, imp_state_name, atomic_read(&imp->imp_inflight), + atomic_read(&imp->imp_unregistering), imp->imp_conn_cnt, imp->imp_generation, atomic_read(&imp->imp_inval_count), @@ -1246,15 +1248,6 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr); LPROCFS_OBD_OP_INIT(num_private_stats, stats, getattr_async); LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, brw_async); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, prep_async_page); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, reget_short_lock); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, release_short_lock); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_async_io); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, queue_group_io); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, trigger_group_io); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, set_async_flags); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, teardown_async_page); LPROCFS_OBD_OP_INIT(num_private_stats, stats, merge_lvb); LPROCFS_OBD_OP_INIT(num_private_stats, stats, adjust_kms); LPROCFS_OBD_OP_INIT(num_private_stats, stats, punch); @@ -1265,7 +1258,6 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, preprw); LPROCFS_OBD_OP_INIT(num_private_stats, stats, commitrw); LPROCFS_OBD_OP_INIT(num_private_stats, stats, enqueue); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, match); LPROCFS_OBD_OP_INIT(num_private_stats, stats, change_cbdata); LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel); LPROCFS_OBD_OP_INIT(num_private_stats, stats, cancel_unused); @@ -1283,11 +1275,8 @@ void lprocfs_init_ops_stats(int num_private_stats, struct lprocfs_stats *stats) LPROCFS_OBD_OP_INIT(num_private_stats, stats, get_uuid); LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotacheck); LPROCFS_OBD_OP_INIT(num_private_stats, stats, quotactl); + LPROCFS_OBD_OP_INIT(num_private_stats, stats, quota_adjust_qunit); LPROCFS_OBD_OP_INIT(num_private_stats, stats, ping); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_page_removal_cb); - LPROCFS_OBD_OP_INIT(num_private_stats,stats,unregister_page_removal_cb); - LPROCFS_OBD_OP_INIT(num_private_stats, stats, register_lock_cancel_cb); - LPROCFS_OBD_OP_INIT(num_private_stats, stats,unregister_lock_cancel_cb); LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_new); LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_rem); LPROCFS_OBD_OP_INIT(num_private_stats, stats, pool_add); @@ -1389,6 +1378,7 @@ int lprocfs_alloc_md_stats(struct 
obd_device *obd, LPROCFS_MD_OP_INIT(num_private_stats, stats, lock_match); LPROCFS_MD_OP_INIT(num_private_stats, stats, cancel_unused); LPROCFS_MD_OP_INIT(num_private_stats, stats, renew_capa); + LPROCFS_MD_OP_INIT(num_private_stats, stats, unpack_capa); LPROCFS_MD_OP_INIT(num_private_stats, stats, get_remote_perm); LPROCFS_MD_OP_INIT(num_private_stats, stats, intent_getattr_async); LPROCFS_MD_OP_INIT(num_private_stats, stats, revalidate_lock); @@ -1868,7 +1858,7 @@ int lprocfs_write_frac_u64_helper(const char *buffer, unsigned long count, __u64 whole, frac = 0, units; unsigned frac_d = 1; - if (count > (sizeof(kernbuf) - 1) ) + if (count > (sizeof(kernbuf) - 1)) return -EINVAL; if (copy_from_user(kernbuf, buffer, count)) diff --git a/lustre/obdclass/lu_object.c b/lustre/obdclass/lu_object.c index c4e3f2b..01b2d3e 100644 --- a/lustre/obdclass/lu_object.c +++ b/lustre/obdclass/lu_object.c @@ -194,10 +194,10 @@ static struct lu_object *lu_object_alloc(const struct lu_env *env, */ static void lu_object_free(const struct lu_env *env, struct lu_object *o) { - struct list_head splice; + struct list_head splice; struct lu_object *scan; - struct lu_site *site; - struct list_head *layers; + struct lu_site *site; + struct list_head *layers; site = o->lo_dev->ld_site; layers = &o->lo_header->loh_layers; @@ -336,7 +336,7 @@ int lu_cdebug_printer(const struct lu_env *env, struct lu_cdebug_data *key; int used; int complete; - va_list args; + va_list args; va_start(args, format); @@ -352,9 +352,9 @@ int lu_cdebug_printer(const struct lu_env *env, ARRAY_SIZE(key->lck_area) - used, format, args); if (complete) { if (cdebug_show(info->lpi_mask, info->lpi_subsys)) - libcfs_debug_msg(NULL, info->lpi_subsys, info->lpi_mask, - (char *)info->lpi_file, info->lpi_fn, - info->lpi_line, "%s", key->lck_area); + libcfs_debug_msg(NULL, info->lpi_subsys, info->lpi_mask, + (char *)info->lpi_file, info->lpi_fn, + info->lpi_line, "%s", key->lck_area); key->lck_area[0] = 0; } va_end(args); @@ -367,7 +367,7 @@ EXPORT_SYMBOL(lu_cdebug_printer); */ void lu_object_header_print(const struct lu_env *env, void *cookie, lu_printer_t printer, - const struct lu_object_header *hdr) + const struct lu_object_header *hdr) { (*printer)(env, cookie, "header@%p[%#lx, %d, "DFID"%s%s%s]", hdr, hdr->loh_flags, atomic_read(&hdr->loh_ref), @@ -400,7 +400,7 @@ void lu_object_print(const struct lu_env *env, void *cookie, (*printer)(env, cookie, "%*.*s%s@%p", depth, depth, ruler, o->lo_dev->ld_type->ldt_name, o); if (o->lo_ops->loo_object_print != NULL) - o->lo_ops->loo_object_print(env, cookie, printer, o); + o->lo_ops->loo_object_print(env, cookie, printer, o); (*printer)(env, cookie, "\n"); } (*printer)(env, cookie, "} header@%p\n", top); @@ -496,8 +496,8 @@ static struct lu_object *lu_object_find_try(const struct lu_env *env, cfs_waitlink_t *waiter) { struct lu_site *s; - struct lu_object *o; - struct lu_object *shadow; + struct lu_object *o; + struct lu_object *shadow; struct hlist_head *bucket; /* @@ -844,9 +844,12 @@ void lu_device_fini(struct lu_device *d) struct lu_device_type *t; t = d->ld_type; - if (d->ld_obd != NULL) + if (d->ld_obd != NULL) { /* finish lprocfs */ lprocfs_obd_cleanup(d->ld_obd); + d->ld_obd->obd_lu_dev = NULL; + d->ld_obd = NULL; + } lu_ref_fini(&d->ld_reference); LASSERTF(atomic_read(&d->ld_ref) == 0, @@ -1001,9 +1004,9 @@ void lu_stack_fini(const struct lu_env *env, struct lu_device *top) next = ldt->ldt_ops->ldto_device_free(env, scan); type = ldt->ldt_obd_type; if (type != NULL) { - type->typ_refcnt--; - 
class_put_type(type); - } + type->typ_refcnt--; + class_put_type(type); + } } } EXPORT_SYMBOL(lu_stack_fini); @@ -1060,7 +1063,7 @@ EXPORT_SYMBOL(lu_context_key_register); static void key_fini(struct lu_context *ctx, int index) { - if (ctx->lc_value[index] != NULL) { + if (ctx->lc_value != NULL && ctx->lc_value[index] != NULL) { struct lu_context_key *key; key = lu_keys[index]; @@ -1088,6 +1091,8 @@ void lu_context_key_degister(struct lu_context_key *key) LASSERT(atomic_read(&key->lct_used) >= 1); LINVRNT(0 <= key->lct_index && key->lct_index < ARRAY_SIZE(lu_keys)); + lu_context_key_quiesce(key); + ++key_set_version; key_fini(&lu_shrink_env.le_ctx, key->lct_index); @@ -1205,8 +1210,13 @@ static CFS_LIST_HEAD(lu_context_remembered); void lu_context_key_quiesce(struct lu_context_key *key) { struct lu_context *ctx; + extern unsigned cl_env_cache_purge(unsigned nr); if (!(key->lct_tags & LCT_QUIESCENT)) { + /* + * XXX layering violation. + */ + cl_env_cache_purge(~0); key->lct_tags |= LCT_QUIESCENT; /* * XXX memory barrier has to go here. @@ -1263,6 +1273,7 @@ static int keys_fill(struct lu_context *ctx) value = key->lct_init(ctx, key); if (unlikely(IS_ERR(value))) return PTR_ERR(value); + LASSERT(key->lct_owner != NULL); if (!(ctx->lc_tags & LCT_NOREF)) try_module_get(key->lct_owner); @@ -1375,30 +1386,16 @@ int lu_context_refill(struct lu_context *ctx) } EXPORT_SYMBOL(lu_context_refill); -static int lu_env_setup(struct lu_env *env, struct lu_context *ses, - __u32 tags, int noref) +int lu_env_init(struct lu_env *env, __u32 tags) { int result; - LINVRNT(ergo(!noref, !(tags & LCT_NOREF))); - - env->le_ses = ses; + env->le_ses = NULL; result = lu_context_init(&env->le_ctx, tags); if (likely(result == 0)) lu_context_enter(&env->le_ctx); return result; } - -static int lu_env_init_noref(struct lu_env *env, struct lu_context *ses, - __u32 tags) -{ - return lu_env_setup(env, ses, tags, 1); -} - -int lu_env_init(struct lu_env *env, struct lu_context *ses, __u32 tags) -{ - return lu_env_setup(env, ses, tags, 0); -} EXPORT_SYMBOL(lu_env_init); void lu_env_fini(struct lu_env *env) @@ -1455,6 +1452,54 @@ static int lu_cache_shrink(int nr, unsigned int gfp_mask) return cached; } +/* + * Debugging stuff. + */ + +/** + * Environment to be used in debugger, contains all tags. + */ +struct lu_env lu_debugging_env; + +/** + * Debugging printer function using printk(). + */ +int lu_printk_printer(const struct lu_env *env, + void *_, const char *format, ...) +{ + va_list args; + + va_start(args, format); + vprintk(format, args); + va_end(args); + return 0; +} + +void lu_debugging_setup(void) +{ + lu_env_init(&lu_debugging_env, ~0); +} + +void lu_context_keys_dump(void) +{ + int i; + + for (i = 0; i < ARRAY_SIZE(lu_keys); ++i) { + struct lu_context_key *key; + + key = lu_keys[i]; + if (key != NULL) { + CERROR("[%i]: %p %x (%p,%p,%p) %i %i \"%s\"@%p\n", + i, key, key->lct_tags, + key->lct_init, key->lct_fini, key->lct_exit, + key->lct_index, atomic_read(&key->lct_used), + key->lct_owner ? 
key->lct_owner->name : "", + key->lct_owner); + lu_ref_print(&key->lct_reference); + } + } +} +EXPORT_SYMBOL(lu_context_keys_dump); #else /* !__KERNEL__ */ static int lu_cache_shrink(int nr, unsigned int gfp_mask) { @@ -1462,9 +1507,17 @@ static int lu_cache_shrink(int nr, unsigned int gfp_mask) } #endif /* __KERNEL__ */ +int cl_global_init(void); +void cl_global_fini(void); int lu_ref_global_init(void); void lu_ref_global_fini(void); +int dt_global_init(void); +void dt_global_fini(void); + +int llo_global_init(void); +void llo_global_fini(void); + /** * Initialization of global lu_* data. */ @@ -1478,21 +1531,21 @@ int lu_global_init(void) result = lu_context_key_register(&lu_global_key); if (result != 0) return result; - /* + /* * At this level, we don't know what tags are needed, so allocate them * conservatively. This should not be too bad, because this * environment is global. - */ - down(&lu_sites_guard); - result = lu_env_init_noref(&lu_shrink_env, NULL, LCT_SHRINKER); - up(&lu_sites_guard); + */ + down(&lu_sites_guard); + result = lu_env_init(&lu_shrink_env, LCT_SHRINKER); + up(&lu_sites_guard); if (result != 0) return result; result = lu_ref_global_init(); if (result != 0) return result; - /* + /* * seeks estimation: 3 seeks to read a record from oi, one to read * inode, one for ea. Unfortunately setting this high value results in * lu_object/inode cache consuming all the memory. @@ -1501,7 +1554,22 @@ int lu_global_init(void) if (lu_site_shrinker == NULL) return -ENOMEM; - result = lu_time_global_init(); + result = lu_time_global_init(); + if (result) + GOTO(out, result); + +#ifdef __KERNEL__ + result = dt_global_init(); + if (result) + GOTO(out, result); + + result = llo_global_init(); + if (result) + GOTO(out, result); +#endif + result = cl_global_init(); +out: + return result; } @@ -1510,6 +1578,11 @@ int lu_global_init(void) */ void lu_global_fini(void) { + cl_global_fini(); +#ifdef __KERNEL__ + llo_global_fini(); + dt_global_fini(); +#endif lu_time_global_fini(); if (lu_site_shrinker != NULL) { remove_shrinker(lu_site_shrinker); @@ -1566,6 +1639,7 @@ int lu_site_stats_print(const struct lu_site *s, char *page, int count) } EXPORT_SYMBOL(lu_site_stats_print); +#ifdef __KERNEL__ /* * XXX: Functions below logically belong to the fid module, but they are used * by dt_store_open(). Put them here until better place is found. @@ -1640,6 +1714,7 @@ int fid_unpack(const struct lu_fid_pack *pack, struct lu_fid *fid) return result; } EXPORT_SYMBOL(fid_unpack); +#endif /* #ifdef __KERNEL__ */ const char *lu_time_names[LU_TIME_NR] = { [LU_TIME_FIND_LOOKUP] = "find_lookup", @@ -1686,4 +1761,3 @@ void lu_kmem_fini(struct lu_kmem_descr *caches) } } EXPORT_SYMBOL(lu_kmem_fini); - diff --git a/lustre/obdclass/lu_time.c b/lustre/obdclass/lu_time.c index 66a8687..26513cf 100644 --- a/lustre/obdclass/lu_time.c +++ b/lustre/obdclass/lu_time.c @@ -161,7 +161,7 @@ unsigned long long lu_time_stamp_get(void) /* * Return timestamp with microsecond precision. This has to be cheap. */ -//#ifdef CONFIG_X86 +//#ifdef CONFIG_X86 #if defined(CONFIG_X86) && !defined(CONFIG_X86_64) /* * do_gettimeofday() goes backwards sometimes :(. 
Use the TSC
diff --git a/lustre/obdclass/md_local_object.c b/lustre/obdclass/md_local_object.c
new file mode 100644
index 0000000..7446cd8
--- /dev/null
+++ b/lustre/obdclass/md_local_object.c
@@ -0,0 +1,464 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ *
+ * You should have received a copy of the GNU General Public License
+ * version 2 along with this program; If not, see
+ * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf
+ *
+ * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara,
+ * CA 95054 USA or visit www.sun.com if you need additional information or
+ * have any questions.
+ *
+ * GPL HEADER END
+ */
+/*
+ * Copyright 2008 Sun Microsystems, Inc. All rights reserved
+ * Use is subject to license terms.
+ */
+/*
+ * This file is part of Lustre, http://www.lustre.org/
+ * Lustre is a trademark of Sun Microsystems, Inc.
+ *
+ * lustre/obdclass/md_local_object.c
+ *
+ * Lustre Local Object create APIs
+ * 'create on first mount' facility. Files registered under the llo module
+ * will be created on first mount.
+ *
+ * Author: Pravin Shelar
+ */
+
+#define DEBUG_SUBSYSTEM S_CLASS
+#ifndef EXPORT_SYMTAB
+# define EXPORT_SYMTAB
+#endif
+
+#include
+#include
+#include
+#include
+#include
+#include
+
+
+/** List head to hold list of objects to be created. */
+static struct list_head llo_lobj_list;
+
+/** Lock to protect list manipulations */
+static struct mutex llo_lock;
+
+/**
+ * Structure used to maintain state of path parsing.
+ * \see llo_find_entry, llo_store_resolve
+ */
+struct llo_find_hint {
+        struct lu_fid    *lfh_cfid;
+        struct md_device *lfh_md;
+        struct md_object *lfh_pobj;
+};
+
+/**
+ * Thread local storage for this module.
+ */
+struct llo_thread_info {
+        /** buffer to resolve path */
+        char                 lti_buf[DT_MAX_PATH];
+        /** used for path resolve */
+        struct lu_fid        lti_fid;
+        /** used to pass child object fid */
+        struct lu_fid        lti_cfid;
+        struct llo_find_hint lti_lfh;
+        struct md_op_spec    lti_spc;
+        struct md_attr       lti_ma;
+        struct lu_name       lti_lname;
+};
+
+LU_KEY_INIT(llod_global, struct llo_thread_info);
+LU_KEY_FINI(llod_global, struct llo_thread_info);
+
+static struct lu_context_key llod_key = {
+        .lct_tags = LCT_MD_THREAD | LCT_DT_THREAD,
+        .lct_init = llod_global_key_init,
+        .lct_fini = llod_global_key_fini
+};
+
+static inline struct llo_thread_info *llo_env_info(const struct lu_env *env)
+{
+        return lu_context_key_get(&env->le_ctx, &llod_key);
+}
+
+/**
+ * Search md object for given fid.
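+ *
+ * The object is found through lu_object_find() and the md-layer slice
+ * is returned; on failure the lu_object error is propagated back as an
+ * ERR_PTR().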
+ */
+static struct md_object *llo_locate(const struct lu_env *env,
+                                    struct md_device *md,
+                                    const struct lu_fid *fid)
+{
+        struct lu_object *obj;
+        struct md_object *mdo;
+
+        obj = lu_object_find(env, &md->md_lu_dev, fid, NULL);
+        if (!IS_ERR(obj)) {
+                obj = lu_object_locate(obj->lo_header, md->md_lu_dev.ld_type);
+                LASSERT(obj != NULL);
+                mdo = (struct md_object *) obj;
+        } else
+                mdo = (struct md_object *)obj;
+        return mdo;
+}
+
+/**
+ * Lookup FID for object named \a name in directory \a pobj.
+ */
+static int llo_lookup(const struct lu_env *env,
+                      struct md_object *pobj,
+                      const char *name,
+                      struct lu_fid *fid)
+{
+        struct llo_thread_info *info = llo_env_info(env);
+        struct lu_name *lname = &info->lti_lname;
+        struct md_op_spec *spec = &info->lti_spc;
+
+        spec->sp_feat = NULL;
+        spec->sp_cr_flags = 0;
+        spec->sp_cr_lookup = 0;
+        spec->sp_cr_mode = 0;
+        spec->sp_ck_split = 0;
+
+        lname->ln_name = name;
+        lname->ln_namelen = strlen(name);
+
+        return mdo_lookup(env, pobj, lname, fid, spec);
+}
+
+/**
+ * Function to look up a path component; this is passed to the parsing
+ * function. \see llo_store_resolve
+ *
+ * \retval rc returns error code for lookup or locate operation
+ *
+ * pointer to object is returned in data (lfh->lfh_pobj)
+ */
+static int llo_find_entry(const struct lu_env *env,
+                          const char *name, void *data)
+{
+        struct llo_find_hint *lfh = data;
+        struct md_device *md = lfh->lfh_md;
+        struct lu_fid *fid = lfh->lfh_cfid;
+        struct md_object *obj = lfh->lfh_pobj;
+        int result;
+
+        /* lookup fid for object */
+        result = llo_lookup(env, obj, name, fid);
+        lu_object_put(env, &obj->mo_lu);
+
+        if (result == 0) {
+                /* get md object for fid that we got in lookup */
+                obj = llo_locate(env, md, fid);
+                if (IS_ERR(obj))
+                        result = PTR_ERR(obj);
+        }
+
+        lfh->lfh_pobj = obj;
+        return result;
+}
+
+static struct md_object *llo_reg_open(const struct lu_env *env,
+                                      struct md_device *md,
+                                      struct md_object *p,
+                                      const char *name,
+                                      struct lu_fid *fid)
+{
+        struct md_object *o;
+        int result;
+
+        result = llo_lookup(env, p, name, fid);
+        if (result == 0)
+                o = llo_locate(env, md, fid);
+        else
+                o = ERR_PTR(result);
+
+        return o;
+}
+
+/**
+ * Resolve given \a path; on success the function returns the md
+ * object for the last directory, and \a fid points to its fid.
+ */
+struct md_object *llo_store_resolve(const struct lu_env *env,
+                                    struct md_device *md,
+                                    struct dt_device *dt,
+                                    const char *path,
+                                    struct lu_fid *fid)
+{
+        struct llo_thread_info *info = llo_env_info(env);
+        struct llo_find_hint *lfh = &info->lti_lfh;
+        char *local = info->lti_buf;
+        struct md_object *obj;
+        int result;
+
+        strncpy(local, path, DT_MAX_PATH);
+        local[DT_MAX_PATH - 1] = '\0';
+
+        lfh->lfh_md = md;
+        lfh->lfh_cfid = fid;
+        /* start path resolution from backend fs root. */
+        result = dt->dd_ops->dt_root_get(env, dt, fid);
+        if (result == 0) {
+                /* get md object for root */
+                obj = llo_locate(env, md, fid);
+                if (!IS_ERR(obj)) {
+                        /* start path parser from root md */
+                        lfh->lfh_pobj = obj;
+                        result = dt_path_parser(env, local, llo_find_entry, lfh);
+                        if (result != 0)
+                                obj = ERR_PTR(result);
+                        else
+                                obj = lfh->lfh_pobj;
+                }
+        } else {
+                obj = ERR_PTR(result);
+        }
+        return obj;
+}
+EXPORT_SYMBOL(llo_store_resolve);
+
+/**
+ * Returns md object for \a objname in given \a dirname.
+ */
+struct md_object *llo_store_open(const struct lu_env *env,
+                                 struct md_device *md,
+                                 struct dt_device *dt,
+                                 const char *dirname,
+                                 const char *objname,
+                                 struct lu_fid *fid)
+{
+        struct md_object *obj;
+        struct md_object *dir;
+
+        /* search md object for parent dir */
+        dir = llo_store_resolve(env, md, dt, dirname, fid);
+        if (!IS_ERR(dir)) {
+                obj = llo_reg_open(env, md, dir, objname, fid);
+                lu_object_put(env, &dir->mo_lu);
+        } else
+                obj = dir;
+
+        return obj;
+}
+EXPORT_SYMBOL(llo_store_open);
+
+static struct md_object *llo_create_obj(const struct lu_env *env,
+                                        struct md_device *md,
+                                        struct md_object *dir,
+                                        const char *objname,
+                                        const struct lu_fid *fid,
+                                        const struct dt_index_features *feat)
+{
+        struct llo_thread_info *info = llo_env_info(env);
+        struct md_object *mdo;
+        struct md_attr *ma = &info->lti_ma;
+        struct md_op_spec *spec = &info->lti_spc;
+        struct lu_name *lname = &info->lti_lname;
+        struct lu_attr *la = &ma->ma_attr;
+        int rc;
+
+        mdo = llo_locate(env, md, fid);
+        if (IS_ERR(mdo))
+                return mdo;
+
+        lname->ln_name = objname;
+        lname->ln_namelen = strlen(objname);
+
+        spec->sp_feat = feat;
+        spec->sp_cr_flags = 0;
+        spec->sp_cr_lookup = 1;
+        spec->sp_cr_mode = 0;
+        spec->sp_ck_split = 0;
+
+        if (feat == &dt_directory_features)
+                la->la_mode = S_IFDIR;
+        else
+                la->la_mode = S_IFREG;
+
+        la->la_mode |= S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH;
+        la->la_uid = la->la_gid = 0;
+        la->la_valid = LA_MODE | LA_UID | LA_GID;
+
+        ma->ma_valid = 0;
+        ma->ma_need = 0;
+
+        rc = mdo_create(env, dir, lname, mdo, spec, ma);
+
+        if (rc) {
+                lu_object_put(env, &mdo->mo_lu);
+                mdo = ERR_PTR(rc);
+        }
+
+        return mdo;
+}
+
+/**
+ * Create an md object; the object can be a directory or a special
+ * index defined by \a feat, created in \a directory.
+ *
+ * \param md device
+ * \param dir parent directory
+ * \param objname file name
+ * \param fid object fid
+ * \param feat index features required for directory create
+ */
+
+struct md_object *llo_store_create_index(const struct lu_env *env,
+                                         struct md_device *md,
+                                         struct dt_device *dt,
+                                         const char *dirname,
+                                         const char *objname,
+                                         const struct lu_fid *fid,
+                                         const struct dt_index_features *feat)
+{
+        struct llo_thread_info *info = llo_env_info(env);
+        struct md_object *obj;
+        struct md_object *dir;
+        struct lu_fid *ignore = &info->lti_fid;
+
+        dir = llo_store_resolve(env, md, dt, dirname, ignore);
+        if (!IS_ERR(dir)) {
+                obj = llo_create_obj(env, md, dir, objname, fid, feat);
+                lu_object_put(env, &dir->mo_lu);
+        } else {
+                obj = dir;
+        }
+        return obj;
+}
+
+EXPORT_SYMBOL(llo_store_create_index);
+
+/**
+ * Create md object for regular file in \a directory.
+ *
+ * \param md device
+ * \param dir parent directory
+ * \param objname file name
+ * \param fid object fid.
+ */
+
+struct md_object *llo_store_create(const struct lu_env *env,
+                                   struct md_device *md,
+                                   struct dt_device *dt,
+                                   const char *dirname,
+                                   const char *objname,
+                                   const struct lu_fid *fid)
+{
+        return llo_store_create_index(env, md, dt, dirname,
+                                      objname, fid, NULL);
+}
+
+EXPORT_SYMBOL(llo_store_create);
+
+/**
+ * Register object for 'create on first mount' facility.
+ * Objects are created in order of registration.
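+ *
+ * Illustrative registration (a sketch; the field values and the
+ * EXAMPLE_OID constant are hypothetical, not part of this patch):
+ *
+ *         static struct lu_local_obj_desc example_llod = {
+ *                 .llod_name     = "EXAMPLE_OBJ",
+ *                 .llod_oid      = EXAMPLE_OID,
+ *                 .llod_is_index = 1,
+ *                 .llod_feat     = &dt_directory_features,
+ *         };
+ *
+ *         llo_local_obj_register(&example_llod);
+ *
+ * The object itself is then created by llo_local_objects_setup() on
+ * first mount.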
+ */
+
+void llo_local_obj_register(struct lu_local_obj_desc *llod)
+{
+        mutex_lock(&llo_lock);
+        list_add_tail(&llod->llod_linkage, &llo_lobj_list);
+        mutex_unlock(&llo_lock);
+}
+
+EXPORT_SYMBOL(llo_local_obj_register);
+
+void llo_local_obj_unregister(struct lu_local_obj_desc *llod)
+{
+        mutex_lock(&llo_lock);
+        list_del(&llod->llod_linkage);
+        mutex_unlock(&llo_lock);
+}
+
+EXPORT_SYMBOL(llo_local_obj_unregister);
+
+/**
+ * Create registered objects.
+ */
+
+int llo_local_objects_setup(const struct lu_env *env,
+                            struct md_device *md,
+                            struct dt_device *dt)
+{
+        struct llo_thread_info *info = llo_env_info(env);
+        struct lu_fid *fid;
+        struct lu_local_obj_desc *scan;
+        struct md_object *mdo;
+        const char *dir;
+        int rc = 0;
+
+        fid = &info->lti_cfid;
+        mutex_lock(&llo_lock);
+
+        list_for_each_entry(scan, &llo_lobj_list, llod_linkage) {
+                lu_local_obj_fid(fid, scan->llod_oid);
+                dir = "";
+                if (scan->llod_dir)
+                        dir = scan->llod_dir;
+
+                if (scan->llod_is_index)
+                        mdo = llo_store_create_index(env, md, dt,
+                                                     dir, scan->llod_name,
+                                                     fid,
+                                                     scan->llod_feat);
+                else
+                        mdo = llo_store_create(env, md, dt,
+                                               dir, scan->llod_name,
+                                               fid);
+                if (IS_ERR(mdo) && PTR_ERR(mdo) != -EEXIST) {
+                        rc = PTR_ERR(mdo);
+                        CERROR("creating obj [%s] fid = "DFID" rc = %d\n",
+                               scan->llod_name, PFID(fid), rc);
+                        goto out;
+                }
+
+                if (!IS_ERR(mdo))
+                        lu_object_put(env, &mdo->mo_lu);
+        }
+
+out:
+        mutex_unlock(&llo_lock);
+        return rc;
+}
+
+EXPORT_SYMBOL(llo_local_objects_setup);
+
+int llo_global_init(void)
+{
+        int result;
+
+        CFS_INIT_LIST_HEAD(&llo_lobj_list);
+        mutex_init(&llo_lock);
+
+        LU_CONTEXT_KEY_INIT(&llod_key);
+        result = lu_context_key_register(&llod_key);
+        return result;
+}
+
+void llo_global_fini(void)
+{
+        lu_context_key_degister(&llod_key);
+        LASSERT(list_empty(&llo_lobj_list));
+}
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c
index 8e3a854..eaadb09 100644
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -76,6 +76,72 @@ int class_find_param(char *buf, char *key, char **valp)
         return 0;
 }
 
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped. The next space or the end of the string is
+ * the parameter terminator, with the exception that spaces inside single
+ * or double quotes get included in a parameter. The parameter is copied
+ * into \a copy, which has to be allocated big enough by the caller;
+ * quotes are stripped in the copy and the copy is NUL-terminated.
+ *
+ * On return \a params is set to next parameter or to NULL if last
+ * parameter is returned.
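+ *
+ * For example (illustrative): if *params points at
+ *         a=1 b="x y" c
+ * then successive calls copy out "a=1", then "b=x y" (quotes
+ * stripped), and finally "c", after which *params is set to NULL.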
diff --git a/lustre/obdclass/obd_config.c b/lustre/obdclass/obd_config.c
index 8e3a854..eaadb09 100644
--- a/lustre/obdclass/obd_config.c
+++ b/lustre/obdclass/obd_config.c
@@ -76,6 +76,72 @@ int class_find_param(char *buf, char *key, char **valp)
         return 0;
 }
 
+/**
+ * Finds a parameter in \a params and copies it to \a copy.
+ *
+ * Leading spaces are skipped.  The next space or the end of the string
+ * terminates a parameter, except that spaces inside single or double
+ * quotes are included in the parameter.  The parameter is copied into
+ * \a copy, which must be allocated large enough by the caller; quotes
+ * are stripped in the copy and the copy is NUL-terminated.
+ *
+ * On return \a params is set to the next parameter, or to NULL if the
+ * last parameter is returned.
+ *
+ * \retval 0 if a parameter is returned in \a copy
+ * \retval 1 otherwise
+ * \retval -EINVAL if an unbalanced quote is found
+ */
+int class_get_next_param(char **params, char *copy)
+{
+        char *q1, *q2, *str;
+        int len;
+
+        str = *params;
+        while (*str == ' ')
+                str++;
+
+        if (*str == '\0') {
+                *params = NULL;
+                return 1;
+        }
+
+        while (1) {
+                q1 = strpbrk(str, " '\"");
+                if (q1 == NULL) {
+                        len = strlen(str);
+                        memcpy(copy, str, len);
+                        copy[len] = '\0';
+                        *params = NULL;
+                        return 0;
+                }
+                len = q1 - str;
+                if (*q1 == ' ') {
+                        memcpy(copy, str, len);
+                        copy[len] = '\0';
+                        *params = str + len;
+                        return 0;
+                }
+
+                memcpy(copy, str, len);
+                copy += len;
+
+                /* search for the matching closing quote */
+                str = q1 + 1;
+                q2 = strchr(str, *q1);
+                if (q2 == NULL) {
+                        CERROR("Unbalanced quote in parameters: \"%s\"\n",
+                               *params);
+                        return -EINVAL;
+                }
+                len = q2 - str;
+                memcpy(copy, str, len);
+                copy += len;
+                str = q2 + 1;
+        }
+        return 1;
+}
+
 /* returns 0 if this is the first key in the buffer, else 1.
    valp points to first char after key. */
 int class_match_param(char *buf, char *key, char **valp)
@@ -129,6 +195,7 @@ int class_parse_nid(char *buf, lnet_nid_t *nid, char **endh)
 }
 EXPORT_SYMBOL(class_find_param);
+EXPORT_SYMBOL(class_get_next_param);
 EXPORT_SYMBOL(class_match_param);
 EXPORT_SYMBOL(class_parse_nid);
@@ -289,19 +356,19 @@ int class_setup(struct obd_device *obd, struct lustre_cfg *lcfg)
         spin_unlock(&obd->obd_dev_lock);
 
         /* create an uuid-export lustre hash */
-        obd->obd_uuid_hash = lustre_hash_init("UUID_HASH", 128, 128,
+        obd->obd_uuid_hash = lustre_hash_init("UUID_HASH", 7, 7,
                                               &uuid_hash_ops, 0);
         if (!obd->obd_uuid_hash)
                 GOTO(err_hash, err = -ENOMEM);
 
         /* create a nid-export lustre hash */
-        obd->obd_nid_hash = lustre_hash_init("NID_HASH", 128, 128,
+        obd->obd_nid_hash = lustre_hash_init("NID_HASH", 7, 7,
                                              &nid_hash_ops, 0);
         if (!obd->obd_nid_hash)
                 GOTO(err_hash, err = -ENOMEM);
 
         /* create a nid-stats lustre hash */
-        obd->obd_nid_stats_hash = lustre_hash_init("NID_STATS", 128, 128,
+        obd->obd_nid_stats_hash = lustre_hash_init("NID_STATS", 7, 7,
                                                    &nid_stat_hash_ops, 0);
         if (!obd->obd_nid_stats_hash)
                 GOTO(err_hash, err = -ENOMEM);
@@ -439,11 +506,15 @@ int class_cleanup(struct obd_device *obd, struct lustre_cfg *lcfg)
                         obd->obd_fail = 1;
                         obd->obd_no_transno = 1;
                         obd->obd_no_recov = 1;
-                        /* Set the obd readonly if we can */
-                        if (OBP(obd, iocontrol))
+                        if (OBP(obd, iocontrol)) {
+                                obd_iocontrol(OBD_IOC_SYNC,
+                                              obd->obd_self_export,
+                                              0, NULL, NULL);
+                                /* Set the obd readonly if we can */
                                 obd_iocontrol(OBD_IOC_SET_READONLY,
                                               obd->obd_self_export,
                                               0, NULL, NULL);
+                        }
                         break;
                 default:
                         CERROR("unrecognised flag '%c'\n",
@@ -884,6 +955,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
         int i, keylen, vallen;
         int matched = 0, j = 0;
         int rc = 0;
+        int skip = 0;
         ENTRY;
 
         if (lcfg->lcfg_command != LCFG_PARAM) {
@@ -939,6 +1011,7 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
                         CERROR("%s: unknown param %s\n",
                                (char *)lustre_cfg_string(lcfg, 0), key);
                         /* rc = -EINVAL;        continue parsing other params */
+                        skip++;
                 } else {
                         LCONSOLE_INFO("%s.%.*s: set parameter %.*s=%s\n",
                                       lustre_cfg_string(lcfg, 0),
@@ -949,6 +1022,8 @@ int class_process_proc_param(char *prefix, struct lprocfs_vars *lvars,
         if (rc > 0)
                 rc = 0;
+        if (!rc && skip)
+                rc = skip;
         RETURN(rc);
 #else
         CDEBUG(D_CONFIG, "liblustre can't process params.\n");
@@ -1010,8 +1085,9 @@ static int class_config_llog_handler(struct llog_handle * handle,
                         CDEBUG(D_CONFIG, "SKIP #%d\n", marker->cm_step);
                 } else
if ((marker->cm_flags & CM_EXCLUDE) || - lustre_check_exclusion(clli->cfg_sb, - marker->cm_tgtname)) { + (clli->cfg_sb && + lustre_check_exclusion(clli->cfg_sb, + marker->cm_tgtname))) { clli->cfg_flags |= CFG_F_EXCLUDE; CDEBUG(D_CONFIG, "EXCLUDE %d\n", marker->cm_step); @@ -1039,6 +1115,29 @@ static int class_config_llog_handler(struct llog_handle * handle, break; } + /* + * For interoperability between 1.8 and 2.0, + * rename "mds" obd device type to "mdt". + */ + { + char *typename = lustre_cfg_string(lcfg, 1); + char *index = lustre_cfg_string(lcfg, 2); + + if ((lcfg->lcfg_command == LCFG_ATTACH && typename && + strcmp(typename, "mds") == 0)) { + CWARN("For 1.8 interoperability, rename obd " + "type from mds to mdt\n"); + typename[2] = 't'; + } + if ((lcfg->lcfg_command == LCFG_SETUP && index && + strcmp(index, "type") == 0)) { + CWARN("For 1.8 interoperability, set this" + " index to '0'\n"); + index[0] = '0'; + index[1] = 0; + } + } + if ((clli->cfg_flags & CFG_F_EXCLUDE) && (lcfg->lcfg_command == LCFG_LOV_ADD_OBD)) /* Add inactive instead */ @@ -1069,6 +1168,22 @@ static int class_config_llog_handler(struct llog_handle * handle, lustre_cfg_bufs_set_string(&bufs, 2, clli->cfg_uuid.uuid); } + /* + * sptlrpc config record, we expect 2 data segments: + * [0]: fs_name/target_name, + * [1]: rule string + * moving them to index [1] and [2], and insert MGC's + * obdname at index [0]. + */ + if (clli && clli->cfg_instance == NULL && + lcfg->lcfg_command == LCFG_SPTLRPC_CONF) { + lustre_cfg_bufs_set(&bufs, 2, bufs.lcfg_buf[1], + bufs.lcfg_buflen[1]); + lustre_cfg_bufs_set(&bufs, 1, bufs.lcfg_buf[0], + bufs.lcfg_buflen[0]); + lustre_cfg_bufs_set_string(&bufs, 0, + clli->cfg_obdname); + } lcfg_new = lustre_cfg_new(lcfg->lcfg_command, &bufs); diff --git a/lustre/obdclass/obd_mount.c b/lustre/obdclass/obd_mount.c index f55604b..d6e7222 100644 --- a/lustre/obdclass/obd_mount.c +++ b/lustre/obdclass/obd_mount.c @@ -167,7 +167,7 @@ struct lustre_mount_info *server_get_mount(const char *name) lsi = s2lsi(lmi->lmi_sb); mntget(lmi->lmi_mnt); atomic_inc(&lsi->lsi_mounts); - + CDEBUG(D_MOUNT, "get_mnt %p from %s, refs=%d, vfscount=%d\n", lmi->lmi_mnt, name, atomic_read(&lsi->lsi_mounts), atomic_read(&lmi->lmi_mnt->mnt_count)); @@ -572,7 +572,7 @@ static int lustre_start_mgc(struct super_block *sb) struct obd_uuid *uuid; class_uuid_t uuidc; lnet_nid_t nid; - char *mgcname, *niduuid; + char *mgcname, *niduuid, *mgssec; char *ptr; int recov_bk; int rc = 0, i = 0, j, len; @@ -615,10 +615,18 @@ static int lustre_start_mgc(struct super_block *sb) GOTO(out_free, rc = -ENOMEM); sprintf(mgcname, "%s%s", LUSTRE_MGC_OBDNAME, libcfs_nid2str(nid)); + mgssec = lsi->lsi_lmd->lmd_mgssec ? lsi->lsi_lmd->lmd_mgssec : ""; + mutex_down(&mgc_start_lock); obd = class_name2obd(mgcname); if (obd && !obd->obd_stopping) { + rc = obd_set_info_async(obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + /* Re-using an existing MGC */ atomic_inc(&obd->u.cli.cl_mgc_refcount); @@ -731,6 +739,12 @@ static int lustre_start_mgc(struct super_block *sb) GOTO(out_free, rc = -ENOTCONN); } + rc = obd_set_info_async(obd->obd_self_export, + strlen(KEY_MGSSEC), KEY_MGSSEC, + strlen(mgssec), mgssec, NULL); + if (rc) + GOTO(out_free, rc); + /* Keep a refcount of servers/clients who started with "mount", so we know when we can get rid of the mgc. 
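The LCFG_SPTLRPC_CONF fix-up above shifts the record's two data segments up one slot and plants the MGC's obd name at index [0]. A toy model of that reshuffle, illustration only: plain string slots stand in for lustre_cfg buffers, and the MGC name is made up.

#include <stdio.h>

int main(void)
{
        /* record arrives as { target_name, rule }, third slot free */
        const char *bufs[3] = { "lustre-MDT0000", "srpc.flavor=krb5p", NULL };
        int i;

        bufs[2] = bufs[1];              /* rule string  -> index 2 */
        bufs[1] = bufs[0];              /* target name  -> index 1 */
        bufs[0] = "MGC10.0.0.1@tcp";    /* MGC obd name -> index 0 (made up) */

        for (i = 0; i < 3; i++)
                printf("[%d] %s\n", i, bufs[i]);
        return 0;
}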
*/ atomic_set(&obd->u.cli.cl_mgc_refcount, 1); @@ -791,6 +805,7 @@ static int lustre_stop_mgc(struct super_block *sb) lsi->lsi_mgc = NULL; mutex_down(&mgc_start_lock); + LASSERT(atomic_read(&obd->u.cli.cl_mgc_refcount) > 0); if (!atomic_dec_and_test(&obd->u.cli.cl_mgc_refcount)) { /* This is not fatal, every client that stops will call in here. */ @@ -826,7 +841,8 @@ static int lustre_stop_mgc(struct super_block *sb) /* Clean the nid uuids */ if (!niduuid) - RETURN(-ENOMEM); + GOTO(out, rc = -ENOMEM); + for (i = 0; i < lsi->lsi_lmd->lmd_mgs_failnodes; i++) { sprintf(ptr, "_%x", i); rc = do_lcfg(LUSTRE_MGC_OBDNAME, 0, LCFG_DEL_UUID, @@ -835,10 +851,11 @@ static int lustre_stop_mgc(struct super_block *sb) CERROR("del MDC UUID %s failed: rc = %d\n", niduuid, rc); } - OBD_FREE(niduuid, len); - /* class_import_put will get rid of the additional connections */ - out: + if (niduuid) + OBD_FREE(niduuid, len); + + /* class_import_put will get rid of the additional connections */ mutex_up(&mgc_start_lock); RETURN(rc); } @@ -1132,6 +1149,7 @@ static int server_start_targets(struct super_block *sb, struct vfsmount *mnt) if (rc) { CERROR("failed to start server %s: %d\n", lsi->lsi_ldd->ldd_svname, rc); + server_deregister_mount(lsi->lsi_ldd->ldd_svname); GOTO(out_mgc, rc); } @@ -1164,15 +1182,15 @@ out_mgc: struct lustre_sb_info *lustre_init_lsi(struct super_block *sb) { - struct lustre_sb_info *lsi = NULL; + struct lustre_sb_info *lsi; ENTRY; - OBD_ALLOC(lsi, sizeof(*lsi)); + OBD_ALLOC_PTR(lsi); if (!lsi) RETURN(NULL); - OBD_ALLOC(lsi->lsi_lmd, sizeof(*lsi->lsi_lmd)); + OBD_ALLOC_PTR(lsi->lsi_lmd); if (!lsi->lsi_lmd) { - OBD_FREE(lsi, sizeof(*lsi)); + OBD_FREE_PTR(lsi); RETURN(NULL); } @@ -1208,6 +1226,9 @@ static int lustre_free_lsi(struct super_block *sb) if (lsi->lsi_lmd->lmd_profile != NULL) OBD_FREE(lsi->lsi_lmd->lmd_profile, strlen(lsi->lsi_lmd->lmd_profile) + 1); + if (lsi->lsi_lmd->lmd_mgssec != NULL) + OBD_FREE(lsi->lsi_lmd->lmd_mgssec, + strlen(lsi->lsi_lmd->lmd_mgssec) + 1); if (lsi->lsi_lmd->lmd_opts != NULL) OBD_FREE(lsi->lsi_lmd->lmd_opts, strlen(lsi->lsi_lmd->lmd_opts) + 1); @@ -1378,7 +1399,6 @@ static void server_put_super(struct super_block *sb) int tmpname_sz; int lddflags = lsi->lsi_ldd->ldd_flags; int lsiflags = lsi->lsi_flags; - int rc; ENTRY; LASSERT(lsiflags & LSI_SERVER); @@ -1423,17 +1443,13 @@ static void server_put_super(struct super_block *sb) /* If they wanted the mgs to stop separately from the mdt, they should have put it on a different device. */ if (IS_MGS(lsi->lsi_ldd)) { - /* stop the mgc before the mgs so the connection gets cleaned - up */ - lustre_stop_mgc(sb); /* if MDS start with --nomgs, don't stop MGS then */ if (!(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) server_stop_mgs(sb); } /* Clean the mgc and sb */ - rc = lustre_common_put_super(sb); - /* FIXME how can I report a failure to umount? 
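lmd_parse_mgssec(), added below, copies the option value between "mgssec=" and the next comma into a freshly allocated, NUL-terminated buffer. A userspace model of that extraction, illustration only, with malloc standing in for OBD_ALLOC:

#include <stdio.h>
#include <stdlib.h>
#include <string.h>

static char *parse_mgssec(const char *ptr)
{
        const char *tail = strchr(ptr, ',');   /* value ends at next comma */
        size_t length = tail ? (size_t)(tail - ptr) : strlen(ptr);
        char *sec = malloc(length + 1);

        if (sec == NULL)
                return NULL;
        memcpy(sec, ptr, length);
        sec[length] = '\0';
        return sec;
}

int main(void)
{
        const char *opts = "flock,mgssec=krb5p,user_xattr";
        const char *s = strstr(opts, "mgssec=");
        char *sec = parse_mgssec(s + 7);       /* skip "mgssec=" */

        printf("mgssec = %s\n", sec);          /* prints: mgssec = krb5p */
        free(sec);
        return 0;
}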
*/ + lustre_common_put_super(sb); /* Wait for the targets to really clean up - can't exit (and let the sb get destroyed) while the mount is still in use */ @@ -1604,7 +1620,7 @@ static int server_fill_super(struct super_block *sb) } /* Start MGS before MGC */ - if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)) { + if (IS_MGS(lsi->lsi_ldd) && !(lsi->lsi_lmd->lmd_flags & LMD_FLG_NOMGS)){ rc = server_start_mgs(sb); if (rc) GOTO(out_mnt, rc); @@ -1667,6 +1683,8 @@ int server_name2index(char *svname, __u32 *idx, char **endptr) rc = LDD_F_SV_TYPE_OST; else return(-EINVAL); + if (strcmp(dash + 4, "all") == 0) + return rc | LDD_F_SV_ALL; index = simple_strtoul(dash + 4, endptr, 16); *idx = index; @@ -1696,6 +1714,7 @@ int lustre_common_put_super(struct super_block *sb) } /* Drop a ref to the mounted disk */ lustre_put_lsi(sb); + lu_types_stop(); RETURN(rc); } @@ -1799,6 +1818,31 @@ static int lmd_make_exclusion(struct lustre_mount_data *lmd, char *ptr) RETURN(rc); } +static int lmd_parse_mgssec(struct lustre_mount_data *lmd, char *ptr) +{ + char *tail; + int length; + + if (lmd->lmd_mgssec != NULL) { + OBD_FREE(lmd->lmd_mgssec, strlen(lmd->lmd_mgssec) + 1); + lmd->lmd_mgssec = NULL; + } + + tail = strchr(ptr, ','); + if (tail == NULL) + length = strlen(ptr); + else + length = tail - ptr; + + OBD_ALLOC(lmd->lmd_mgssec, length + 1); + if (lmd->lmd_mgssec == NULL) + return -ENOMEM; + + memcpy(lmd->lmd_mgssec, ptr, length); + lmd->lmd_mgssec[length] = '\0'; + return 0; +} + /* mount -v -t lustre uml1:uml2:/lustre-client /mnt/lustre */ static int lmd_parse(char *options, struct lustre_mount_data *lmd) { @@ -1846,6 +1890,11 @@ static int lmd_parse(char *options, struct lustre_mount_data *lmd) } else if (strncmp(s1, "nomgs", 5) == 0) { lmd->lmd_flags |= LMD_FLG_NOMGS; clear++; + } else if (strncmp(s1, "mgssec=", 7) == 0) { + rc = lmd_parse_mgssec(lmd, s1 + 7); + if (rc) + goto invalid; + clear++; /* ost exclusion list */ } else if (strncmp(s1, "exclude=", 8) == 0) { rc = lmd_make_exclusion(lmd, s1 + 7); @@ -1939,6 +1988,12 @@ int lustre_fill_super(struct super_block *sb, void *data, int silent) RETURN(-ENOMEM); lmd = lsi->lsi_lmd; + /* + * Disable lockdep during mount, because mount locking patterns are + * `special'. + */ + lockdep_off(); + /* Figure out the lmd from the mount options */ if (lmd_parse((char *)data, lmd)) { lustre_put_lsi(sb); @@ -1982,9 +2037,10 @@ out: CERROR("Unable to mount %s (%d)\n", s2lsi(sb) ? 
lmd->lmd_dev : "", rc); } else { - CDEBUG(D_SUPER, "Mount %s complete\n", + CDEBUG(D_SUPER, "Mount %s complete\n", lmd->lmd_dev); } + lockdep_on(); return rc; } diff --git a/lustre/obdecho/autoMakefile.am b/lustre/obdecho/autoMakefile.am index bd83a99..c8b7df3 100644 --- a/lustre/obdecho/autoMakefile.am +++ b/lustre/obdecho/autoMakefile.am @@ -68,4 +68,4 @@ endif # MODULES install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(obdecho-objs:%.o=%.c) +DIST_SOURCES = $(obdecho-objs:%.o=%.c) echo_internal.h diff --git a/lustre/obdecho/echo.c b/lustre/obdecho/echo.c index 5c77003..ebc3bba3 100644 --- a/lustre/obdecho/echo.c +++ b/lustre/obdecho/echo.c @@ -47,11 +47,12 @@ #include #include -#include #include #include #include +#include "echo_internal.h" + #define ECHO_INIT_OBJID 0x1000000000000000ULL #define ECHO_HANDLE_MAGIC 0xabcd0123fedc9876ULL @@ -152,7 +153,7 @@ int echo_create(struct obd_export *exp, struct obdo *oa, int echo_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_exp) + struct obd_export *md_exp, void *capa) { struct obd_device *obd = class_exp2obd(exp); @@ -536,7 +537,7 @@ static int echo_setup(struct obd_device *obd, struct lustre_cfg *lcfg) rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_PLAIN, NULL, LCK_NL, &lock_flags, NULL, ldlm_completion_ast, NULL, NULL, - 0, NULL, &obd->u.echo.eo_nl_lock); + 0, NULL, NULL, &obd->u.echo.eo_nl_lock); LASSERT (rc == ELDLM_OK); lprocfs_echo_init_vars(&lvars); diff --git a/lustre/obdecho/echo_client.c b/lustre/obdecho/echo_client.c index 1be3ce0..01b9572 100644 --- a/lustre/obdecho/echo_client.c +++ b/lustre/obdecho/echo_client.c @@ -44,47 +44,1129 @@ #include #include #include -#include #include #include +#include -static obd_id last_object_id; +#include "echo_internal.h" + +struct echo_device { + struct cl_device ed_cl; + struct echo_client_obd *ed_ec; + + struct cl_site ed_site_myself; + struct cl_site *ed_site; + struct lu_device *ed_next; + int ed_next_islov; +}; + +struct echo_object { + struct cl_object eo_cl; + struct cl_object_header eo_hdr; + + struct echo_device *eo_dev; + struct list_head eo_obj_chain; + struct lov_stripe_md *eo_lsm; + atomic_t eo_npages; + int eo_deleted; +}; + +struct echo_object_conf { + struct cl_object_conf eoc_cl; + struct lov_stripe_md **eoc_md; +}; + +struct echo_page { + struct cl_page_slice ep_cl; + struct cl_sync_io *ep_sync_io; + cfs_page_t *ep_vmpage; +}; + +struct echo_lock { + struct cl_lock_slice el_cl; + struct list_head el_chain; + struct echo_object *el_object; + __u64 el_cookie; +}; + +struct echo_io { + struct cl_io_slice ei_cl; +}; #if 0 -static void -echo_printk_object (char *msg, struct ec_object *eco) +struct echo_req { + struct cl_req_slice er_cl; +}; +#endif + +static int echo_client_setup(struct obd_device *obddev, + struct lustre_cfg *lcfg); +static int echo_client_cleanup(struct obd_device *obddev); + + +/** \defgroup echo_helpers + * @{ + */ +static inline struct echo_device *cl2echo_dev(const struct cl_device *dev) { - struct lov_stripe_md *lsm = eco->eco_lsm; - int i; + return container_of0(dev, struct echo_device, ed_cl); +} - CDEBUG(D_INFO, "%s: object %p: "LPX64", refs %d%s: "LPX64"=%u!%u\n", - msg, eco, eco->eco_id, eco->eco_refcount, - eco->eco_deleted ? 
"(deleted) " : "", - lsm->lsm_object_id, lsm->lsm_stripe_size, - lsm->lsm_stripe_count); +static inline struct cl_device *echo_dev2cl(struct echo_device *d) +{ + return &d->ed_cl; +} - for (i = 0; i < lsm->lsm_stripe_count; i++) - CDEBUG(D_INFO, "@%2u:"LPX64"\n", - lsm->lsm_oinfo[i].loi_ost_idx, - lsm->lsm_oinfo[i].loi_id); +static inline struct echo_device *obd2echo_dev(const struct obd_device *obd) +{ + return cl2echo_dev(lu2cl_dev(obd->obd_lu_dev)); } + +static inline struct cl_object *echo_obj2cl(struct echo_object *eco) +{ + return &eco->eo_cl; +} + +static inline struct echo_object *cl2echo_obj(const struct cl_object *o) +{ + return container_of(o, struct echo_object, eo_cl); +} + +static inline struct echo_page *cl2echo_page(const struct cl_page_slice *s) +{ + return container_of(s, struct echo_page, ep_cl); +} + +static inline struct echo_lock *cl2echo_lock(const struct cl_lock_slice *s) +{ + return container_of(s, struct echo_lock, el_cl); +} + +static inline struct cl_lock *echo_lock2cl(const struct echo_lock *ecl) +{ + return ecl->el_cl.cls_lock; +} + +static struct lu_context_key echo_thread_key; +static inline struct echo_thread_info *echo_env_info(const struct lu_env *env) +{ + struct echo_thread_info *info; + info = lu_context_key_get(&env->le_ctx, &echo_thread_key); + LASSERT(info != NULL); + return info; +} + +static inline +struct echo_object_conf *cl2echo_conf(const struct cl_object_conf *c) +{ + return container_of(c, struct echo_object_conf, eoc_cl); +} + +static inline void lsm2fid(struct lov_stripe_md *lsm, struct lu_fid *fid) +{ + fid_zero(fid); + fid->f_seq = lsm->lsm_object_gr << 16 | lsm->lsm_object_id >> 32; + fid->f_oid = lsm->lsm_object_id; +} +/** @} echo_helpers */ + +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsm); +static int cl_echo_object_put(struct echo_object *eco); +static int cl_echo_enqueue (struct echo_object *eco, obd_off start, + obd_off end, int mode, __u64 *cookie); +static int cl_echo_cancel (struct echo_device *d, __u64 cookie); +static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset, + cfs_page_t **pages, int npages, int async); + +static struct echo_thread_info *echo_env_info(const struct lu_env *env); + +struct echo_thread_info { + struct echo_object_conf eti_conf; + struct lustre_md eti_md; + + struct cl_2queue eti_queue; + struct cl_io eti_io; + struct cl_sync_io eti_anchor; + struct cl_lock_descr eti_descr; + struct lu_fid eti_fid; +}; + +/* No session used right now */ +struct echo_session_info { + unsigned long dummy; +}; + +static cfs_mem_cache_t *echo_page_kmem; +static cfs_mem_cache_t *echo_lock_kmem; +static cfs_mem_cache_t *echo_object_kmem; +static cfs_mem_cache_t *echo_thread_kmem; +static cfs_mem_cache_t *echo_session_kmem; +//static cfs_mem_cache_t *echo_req_kmem; + +static struct lu_kmem_descr echo_caches[] = { + { + .ckd_cache = &echo_page_kmem, + .ckd_name = "echo_page_kmem", + .ckd_size = sizeof (struct echo_page) + }, + { + .ckd_cache = &echo_lock_kmem, + .ckd_name = "echo_lock_kmem", + .ckd_size = sizeof (struct echo_lock) + }, + { + .ckd_cache = &echo_object_kmem, + .ckd_name = "echo_object_kmem", + .ckd_size = sizeof (struct echo_object) + }, + { + .ckd_cache = &echo_thread_kmem, + .ckd_name = "echo_thread_kmem", + .ckd_size = sizeof (struct echo_thread_info) + }, + { + .ckd_cache = &echo_session_kmem, + .ckd_name = "echo_session_kmem", + .ckd_size = sizeof (struct echo_session_info) + }, +#if 0 + { + .ckd_cache = &echo_req_kmem, + .ckd_name = 
"echo_req_kmem", + .ckd_size = sizeof (struct echo_req) + }, #endif + { + .ckd_cache = NULL + } +}; + +/** defgroup echo_page echo_page + * + * Echo page operations. + * + * @{ + */ +cfs_page_t *echo_page_vmpage(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return cl2echo_page(slice)->ep_vmpage; +} + +static void echo_page_discard(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + cl_page_delete(env, slice->cpl_page); +} + +static int echo_page_is_vmlocked(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + return 1; +} + +static void echo_page_completion(const struct lu_env *env, + const struct cl_page_slice *slice, + int ioret) +{ + struct echo_page *ecp = cl2echo_page(slice); + struct cl_sync_io *anchor = ecp->ep_sync_io; + ENTRY; + + LASSERT(anchor != NULL); + ecp->ep_sync_io = NULL; + cl_sync_io_note(anchor, ioret); + EXIT; +} + +static void echo_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct echo_page *ep = cl2echo_page(slice); + struct echo_object *eco = cl2echo_obj(slice->cpl_obj); + cfs_page_t *vmpage = ep->ep_vmpage; + ENTRY; + + atomic_dec(&eco->eo_npages); + page_cache_release(vmpage); + OBD_SLAB_FREE_PTR(ep, echo_page_kmem); + EXIT; +} + +static int echo_page_prep(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + return 0; +} + +static int echo_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct echo_page *ep = cl2echo_page(slice); + + (*printer)(env, cookie, LUSTRE_ECHO_CLIENT_NAME"-page@%p vm@%p\n", + ep, ep->ep_vmpage); + return 0; +} + +static const struct cl_page_operations echo_page_ops = { + .cpo_discard = echo_page_discard, + .cpo_vmpage = echo_page_vmpage, + .cpo_fini = echo_page_fini, + .cpo_print = echo_page_print, + .cpo_is_vmlocked = echo_page_is_vmlocked, + .io = { + [CRT_READ] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + }, + [CRT_WRITE] = { + .cpo_prep = echo_page_prep, + .cpo_completion = echo_page_completion, + } + } +}; +/** @} echo_page */ + +/** \defgroup echo_lock echo_lock + * + * echo lock operations + * + * @{ + */ +static void echo_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); + OBD_SLAB_FREE_PTR(ecl, echo_lock_kmem); +} + +static void echo_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct echo_lock *ecl = cl2echo_lock(slice); + + LASSERT(list_empty(&ecl->el_chain)); +} + +static int echo_lock_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *_) +{ + return 1; +} + +static struct cl_lock_operations echo_lock_ops = { + .clo_fini = echo_lock_fini, + .clo_delete = echo_lock_delete, + .clo_fits_into = echo_lock_fits_into +}; + +/** @} echo_lock */ + +/** \defgroup echo_cl_ops echo_cl_ops + * + * operations for cl_object + * + * @{ + */ +static struct cl_page *echo_page_init(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage) +{ + struct echo_page *ep; + ENTRY; + + OBD_SLAB_ALLOC_PTR(ep, echo_page_kmem); + if (ep != NULL) { + struct echo_object *eco = cl2echo_obj(obj); + ep->ep_vmpage = vmpage; + page_cache_get(vmpage); + cl_page_slice_add(page, &ep->ep_cl, obj, &echo_page_ops); + atomic_inc(&eco->eo_npages); + } + RETURN(ERR_PTR(ep ? 
0 : -ENOMEM)); +} + +static int echo_io_init(const struct lu_env *env, struct cl_object *obj, + struct cl_io *io) +{ + return 0; +} + +static int echo_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *_) +{ + struct echo_lock *el; + ENTRY; + + OBD_SLAB_ALLOC_PTR(el, echo_lock_kmem); + if (el != NULL) { + cl_lock_slice_add(lock, &el->el_cl, obj, &echo_lock_ops); + el->el_object = cl2echo_obj(obj); + CFS_INIT_LIST_HEAD(&el->el_chain); + } + RETURN(el == NULL ? -ENOMEM : 0); +} + +static int echo_conf_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_object_conf *conf) +{ + return 0; +} + +static const struct cl_object_operations echo_cl_obj_ops = { + .coo_page_init = echo_page_init, + .coo_lock_init = echo_lock_init, + .coo_io_init = echo_io_init, + .coo_conf_set = echo_conf_set +}; +/** @} echo_cl_ops */ + +/** \defgroup echo_lu_ops echo_lu_ops + * + * operations for echo lu object. + * + * @{ + */ +static int echo_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + const struct cl_object_conf *cconf = lu2cl_conf(conf); + struct echo_object_conf *econf = cl2echo_conf(cconf); + struct echo_device *ed = cl2echo_dev(lu2cl_dev(obj->lo_dev)); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + ENTRY; + + if (ed->ed_next) { + struct lu_object *below; + struct lu_device *under; + + under = ed->ed_next; + below = under->ld_ops->ldo_object_alloc(env, obj->lo_header, + under); + if (below == NULL) + RETURN(-ENOMEM); + lu_object_add(obj, below); + } + + LASSERT(econf->eoc_md); + eco->eo_lsm = *econf->eoc_md; + eco->eo_dev = ed; + atomic_set(&eco->eo_npages, 0); + + /* clear the lsm pointer so that it won't get freed. */ + *econf->eoc_md = NULL; + + spin_lock(&ec->ec_lock); + list_add_tail(&eco->eo_obj_chain, &ec->ec_objects); + spin_unlock(&ec->ec_lock); + + RETURN(0); +} + +static void echo_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct echo_object *eco = cl2echo_obj(lu2cl(obj)); + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct lov_stripe_md *lsm = eco->eo_lsm; + ENTRY; + + LASSERT(atomic_read(&eco->eo_npages) == 0); + + spin_lock(&ec->ec_lock); + list_del_init(&eco->eo_obj_chain); + spin_unlock(&ec->ec_lock); + + lu_object_fini(obj); + lu_object_header_fini(obj->lo_header); + + if (lsm) + obd_free_memmd(ec->ec_exp, &lsm); + OBD_SLAB_FREE_PTR(eco, echo_object_kmem); + EXIT; +} + +static int echo_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *o) +{ + struct echo_object *obj = cl2echo_obj(lu2cl(o)); + + return (*p)(env, cookie, "echoclient-object@%p", obj); +} + + +static const struct lu_object_operations echo_lu_obj_ops = { + .loo_object_init = echo_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = echo_object_free, + .loo_object_print = echo_object_print, + .loo_object_invariant = NULL +}; +/** @} echo_lu_ops */ + +/** \defgroup echo_lu_dev_ops + * + * Operations for echo lu device. + * + * @{ + */ +static struct lu_object *echo_object_alloc(const struct lu_env *env, + const struct lu_object_header *hdr, + struct lu_device *dev) +{ + struct echo_object *eco; + struct lu_object *obj = NULL; + ENTRY; + + /* we're the top dev. 
 */
+        LASSERT(hdr == NULL);
+        OBD_SLAB_ALLOC_PTR(eco, echo_object_kmem);
+        if (eco != NULL) {
+                struct cl_object_header *hdr = &eco->eo_hdr;
+
+                obj = &echo_obj2cl(eco)->co_lu;
+                cl_object_header_init(hdr);
+                lu_object_init(obj, &hdr->coh_lu, dev);
+                lu_object_add_top(&hdr->coh_lu, obj);
+
+                eco->eo_cl.co_ops = &echo_cl_obj_ops;
+                obj->lo_ops = &echo_lu_obj_ops;
+        }
+        RETURN(obj);
+}
+
+static struct lu_device_operations echo_device_lu_ops = {
+        .ldo_object_alloc = echo_object_alloc,
+};
+/** @} echo_lu_dev_ops */
+
+static struct cl_device_operations echo_device_cl_ops = {
+};
+
+/** \defgroup echo_init echo_init
+ *
+ * Init and fini functions for echo client.
+ *
+ * @{
+ */
+static int echo_site_init(const struct lu_env *env, struct echo_device *ed)
+{
+        struct cl_site *site = &ed->ed_site_myself;
+        int rc;
+
+        /* initialize site */
+        rc = cl_site_init(site, &ed->ed_cl);
+        if (rc) {
+                CERROR("Cannot initialize site for echo client (%d)\n", rc);
+                return rc;
+        }
+
+        rc = lu_site_init_finish(&site->cs_lu);
+        if (rc)
+                return rc;
+
+        ed->ed_site = site;
+        return 0;
+}
-static struct ec_object *
-echo_find_object_locked (struct obd_device *obd, obd_id id)
+static void echo_site_fini(const struct lu_env *env, struct echo_device *ed)
 {
-        struct echo_client_obd *ec = &obd->u.echo_client;
-        struct ec_object *eco = NULL;
+        if (ed->ed_site) {
+                cl_site_fini(ed->ed_site);
+                ed->ed_site = NULL;
+        }
+}
+
+static void *echo_thread_key_init(const struct lu_context *ctx,
+                                  struct lu_context_key *key)
+{
+        struct echo_thread_info *info;
+
+        OBD_SLAB_ALLOC_PTR(info, echo_thread_kmem);
+        if (info == NULL)
+                info = ERR_PTR(-ENOMEM);
+        return info;
+}
+
+static void echo_thread_key_fini(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+        struct echo_thread_info *info = data;
+        OBD_SLAB_FREE_PTR(info, echo_thread_kmem);
+}
+
+static void echo_thread_key_exit(const struct lu_context *ctx,
+                                 struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_thread_key = {
+        .lct_tags = LCT_CL_THREAD,
+        .lct_init = echo_thread_key_init,
+        .lct_fini = echo_thread_key_fini,
+        .lct_exit = echo_thread_key_exit
+};
+
+static void *echo_session_key_init(const struct lu_context *ctx,
+                                   struct lu_context_key *key)
+{
+        struct echo_session_info *session;
+
+        OBD_SLAB_ALLOC_PTR(session, echo_session_kmem);
+        if (session == NULL)
+                session = ERR_PTR(-ENOMEM);
+        return session;
+}
+
+static void echo_session_key_fini(const struct lu_context *ctx,
+                                  struct lu_context_key *key, void *data)
+{
+        struct echo_session_info *session = data;
+        OBD_SLAB_FREE_PTR(session, echo_session_kmem);
+}
+
+static void echo_session_key_exit(const struct lu_context *ctx,
+                                  struct lu_context_key *key, void *data)
+{
+}
+
+static struct lu_context_key echo_session_key = {
+        .lct_tags = LCT_SESSION,
+        .lct_init = echo_session_key_init,
+        .lct_fini = echo_session_key_fini,
+        .lct_exit = echo_session_key_exit
+};
+
+LU_TYPE_INIT_FINI(echo, &echo_thread_key, &echo_session_key);
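echo_device_alloc(), which follows, unwinds a partially completed setup with a fall-through switch on a cleanup stage counter. A standalone sketch of that idiom, illustration only, with printf stand-ins for the real teardown calls:

#include <stdio.h>

/* Each successful step bumps 'cleanup'; the error path falls through
 * the switch so only completed steps are undone, in reverse order. */
static int setup_example(int fail_at)
{
        int cleanup = 0;
        int rc = 0;

        if (fail_at == 1) { rc = -1; goto out; }
        cleanup = 1;                    /* step 1: memory allocated   */

        if (fail_at == 2) { rc = -2; goto out; }
        cleanup = 2;                    /* step 2: device initialized */

        if (fail_at == 3) { rc = -3; goto out; }
        return 0;                       /* full success: keep it all  */

out:
        switch (cleanup) {              /* deliberate fall-through */
        case 2:
                printf("undo step 2\n");
        case 1:
                printf("undo step 1\n");
        case 0:
        default:
                break;
        }
        return rc;
}

int main(void)
{
        printf("rc = %d\n", setup_example(3));   /* undoes 2, then 1 */
        return 0;
}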
+
+static struct lu_device *echo_device_alloc(const struct lu_env *env,
+                                           struct lu_device_type *t,
+                                           struct lustre_cfg *cfg)
+{
+        struct lu_device *next;
+        struct echo_device *ed;
+        struct cl_device *cd;
+        struct obd_device *obd = NULL; /* to keep compiler happy */
+        struct obd_device *tgt;
+        const char *tgt_type_name;
+        int rc;
+        int cleanup = 0;
+        ENTRY;
+
+        OBD_ALLOC_PTR(ed);
+        if (ed == NULL)
+                GOTO(out, rc = -ENOMEM);
+
+        cleanup = 1;
+        cd = &ed->ed_cl;
+        rc = cl_device_init(cd, t);
+        if (rc)
+                GOTO(out, rc);
+
+        cd->cd_lu_dev.ld_ops = &echo_device_lu_ops;
+        cd->cd_ops = &echo_device_cl_ops;
+
+        cleanup = 2;
+        rc = echo_site_init(env, ed);
+        if (rc)
+                GOTO(out, rc);
+
+        cleanup = 3;
+        obd = class_name2obd(lustre_cfg_string(cfg, 0));
+        LASSERT(obd != NULL);
+        rc = echo_client_setup(obd, cfg);
+        if (rc)
+                GOTO(out, rc);
+        ed->ed_ec = &obd->u.echo_client;
+
+        cleanup = 4;
+        tgt = class_name2obd(lustre_cfg_string(cfg, 1));
+        LASSERT(tgt != NULL);
+        next = tgt->obd_lu_dev;
+        if (!lu_device_is_cl(next))
+                next = NULL;
+
+        /*
+         * if echo client is to be stacked upon ost device, the next is
+         * NULL since ost is not a clio device so far
+         */
+        tgt_type_name = tgt->obd_type->typ_name;
+        if (next != NULL) {
+                LASSERT(next != NULL);
+                if (next->ld_site != NULL)
+                        GOTO(out, rc = -EBUSY);
+
+                next->ld_site = &ed->ed_site->cs_lu;
+                rc = next->ld_type->ldt_ops->ldto_device_init(env, next,
+                                             next->ld_type->ldt_name, NULL);
+                if (rc)
+                        GOTO(out, rc);
+
+                /* Tricky case: I have to determine the obd type since clio
+                 * uses different parameters to initialize objects for
+                 * lov & osc.
+                 */
+                if (strcmp(tgt_type_name, LUSTRE_LOV_NAME) == 0)
+                        ed->ed_next_islov = 1;
+                else
+                        LASSERT(strcmp(tgt_type_name, LUSTRE_OSC_NAME) == 0);
+        } else
+                LASSERT(strcmp(tgt_type_name, LUSTRE_OST_NAME) == 0);
+
+        ed->ed_next = next;
+        RETURN(&cd->cd_lu_dev);
+
+out:
+        switch(cleanup) {
+        case 4: {
+                int rc2;
+                rc2 = echo_client_cleanup(obd);
+                if (rc2)
+                        CERROR("Cleanup obd device %s error(%d)\n",
+                               obd->obd_name, rc2);
+        }
+
+        case 3:
+                echo_site_fini(env, ed);
+        case 2:
+                cl_device_fini(&ed->ed_cl);
+        case 1:
+                OBD_FREE_PTR(ed);
+        case 0:
+        default:
+                break;
+        }
+        return(ERR_PTR(rc));
+}
+
+static int echo_device_init(const struct lu_env *env, struct lu_device *d,
+                            const char *name, struct lu_device *next)
+{
+        LBUG();
+        return 0;
+}
+
+static struct lu_device *echo_device_fini(const struct lu_env *env,
+                                          struct lu_device *d)
+{
+        struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+        struct lu_device *next = ed->ed_next;
+
+        while (next)
+                next = next->ld_type->ldt_ops->ldto_device_fini(env, next);
+        return NULL;
+}
+
+static struct lu_device *echo_device_free(const struct lu_env *env,
+                                          struct lu_device *d)
+{
+        struct echo_device *ed = cl2echo_dev(lu2cl_dev(d));
+        struct echo_client_obd *ec = ed->ed_ec;
+        struct lu_device *next = ed->ed_next;
+
+        CDEBUG(D_INFO, "ed = %p, ec = %p, next = %p\n", ed, ec, next);
+
+        /* destroy locks */
+        spin_lock(&ec->ec_lock);
+        while (!list_empty(&ec->ec_locks)) {
+                struct echo_lock *ecl = list_entry(ec->ec_locks.next,
+                                                   struct echo_lock, el_chain);
+                struct cl_lock *lock = echo_lock2cl(ecl);
+
+                list_del_init(&ecl->el_chain);
+                spin_unlock(&ec->ec_lock);
+
+                CERROR("echo client: pending lock %p\n", ecl);
+
+                cl_lock_get(lock);
+                cl_unuse(env, lock);
+                cl_lock_release(env, lock, "ec enqueue", ecl->el_object);
+
+                cl_lock_mutex_get(env, lock);
+                cl_lock_cancel(env, lock);
+                cl_lock_delete(env, lock);
+                cl_lock_mutex_put(env, lock);
+                cl_lock_put(env, lock);
+
+                spin_lock(&ec->ec_lock);
+        }
+        spin_unlock(&ec->ec_lock);
+
+        LASSERT(ed->ed_site);
+        lu_site_purge(env, &ed->ed_site->cs_lu, -1);
+
+        /* check if there are objects still alive, assume only one reference */
+        spin_lock(&ec->ec_lock);
+        while (!list_empty(&ec->ec_objects)) {
+                struct echo_object *eco;
+                eco = list_entry(ec->ec_objects.next, struct echo_object,
+                                 eo_obj_chain);
+                spin_unlock(&ec->ec_lock);
+
+                eco->eo_deleted = 1;
+                cl_echo_object_put(eco);
+
+                spin_lock(&ec->ec_lock);
+        }
+        spin_unlock(&ec->ec_lock);
+
echo_client_cleanup(d->ld_obd); + + while (next) + next = next->ld_type->ldt_ops->ldto_device_free(env, next); + + LASSERT(ed->ed_site == lu2cl_site(d->ld_site)); + echo_site_fini(env, ed); + cl_device_fini(&ed->ed_cl); + OBD_FREE_PTR(ed); + + return NULL; +} + +static const struct lu_device_type_operations echo_device_type_ops = { + .ldto_init = echo_type_init, + .ldto_fini = echo_type_fini, + + .ldto_start = echo_type_start, + .ldto_stop = echo_type_stop, + + .ldto_device_alloc = echo_device_alloc, + .ldto_device_free = echo_device_free, + .ldto_device_init = echo_device_init, + .ldto_device_fini = echo_device_fini +}; + +static struct lu_device_type echo_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_ECHO_CLIENT_NAME, + .ldt_ops = &echo_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; +/** @} echo_init */ + +/** \defgroup echo_exports + * + * exporting functions to echo client + * + * @{ + */ + +/* Interfaces to echo client obd device */ +static struct echo_object *cl_echo_object_find(struct echo_device *d, + struct lov_stripe_md **lsmp) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct echo_object_conf *conf; + struct lov_stripe_md *lsm; + struct echo_object *eco; + struct cl_object *obj; + struct lu_fid *fid; + int refcheck; + ENTRY; + + LASSERT(lsmp); + lsm = *lsmp; + LASSERT(lsm); + LASSERT(lsm->lsm_object_id); + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN((void *)env); + + info = echo_env_info(env); + conf = &info->eti_conf; + if (d->ed_next) { + if (!d->ed_next_islov) { + struct lov_oinfo *oinfo = lsm->lsm_oinfo[0]; + LASSERT(oinfo != NULL); + oinfo->loi_id = lsm->lsm_object_id; + oinfo->loi_gr = lsm->lsm_object_gr; + conf->eoc_cl.u.coc_oinfo = oinfo; + } else { + struct lustre_md *md; + md = &info->eti_md; + memset(md, 0, sizeof *md); + md->lsm = lsm; + conf->eoc_cl.u.coc_md = md; + } + } + conf->eoc_md = lsmp; + + fid = &info->eti_fid; + lsm2fid(lsm, fid); + + obj = cl_object_find(env, echo_dev2cl(d), fid, &conf->eoc_cl); + if (IS_ERR(obj)) + GOTO(out, eco = (void*)obj); + + eco = cl2echo_obj(obj); + if (eco->eo_deleted) { + cl_object_put(env, obj); + eco = ERR_PTR(-EAGAIN); + } + +out: + cl_env_put(env, &refcheck); + RETURN(eco); +} + +static int cl_echo_object_put(struct echo_object *eco) +{ + struct lu_env *env; + struct cl_object *obj = echo_obj2cl(eco); + int refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + /* an external function to kill an object? */ + if (eco->eo_deleted) { + struct lu_object_header *loh = obj->co_lu.lo_header; + LASSERT(&eco->eo_hdr == luh2coh(loh)); + set_bit(LU_OBJECT_HEARD_BANSHEE, &loh->loh_flags); + cl_object_prune(env, obj); + } + + cl_object_put(env, obj); + cl_env_put(env, &refcheck); + RETURN(0); +} + +static int cl_echo_enqueue(struct echo_object *eco, obd_off start, obd_off end, + int mode, __u64 *cookie) +{ + struct lu_env *env; + struct cl_lock *lck; + struct echo_thread_info *info; + struct cl_io *io; + struct cl_lock_descr *descr; + struct cl_object *obj = echo_obj2cl(eco); + int refcheck; + int result; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + descr = &info->eti_descr; + descr->cld_obj = obj; + descr->cld_start = cl_index(obj, start); + descr->cld_end = cl_index(obj, end); + descr->cld_mode = mode == LCK_PW ? 
CLM_WRITE : CLM_READ; + + io = &info->eti_io; + io->ci_obj = obj; + result = cl_io_init(env, io, CIT_MISC, obj); + if (result < 0) + GOTO(out, result); + LASSERT(result == 0); + + result = -ENOMEM; + lck = cl_lock_request(env, io, descr, CEF_ASYNC, "ec enqueue", eco); + if (lck) { + struct echo_client_obd *ec = eco->eo_dev->ed_ec; + struct echo_lock *el; + + result = cl_wait(env, lck); + if (result == 0) { + el = cl2echo_lock(cl_lock_at(lck, &echo_device_type)); + spin_lock(&ec->ec_lock); + list_add(&el->el_chain, &ec->ec_locks); + *cookie = el->el_cookie = ++ec->ec_unique; + spin_unlock(&ec->ec_lock); + } else + cl_lock_release(env, lck, "ec enqueue", cfs_current()); + } + cl_io_fini(env, io); + + EXIT; +out: + cl_env_put(env, &refcheck); + return result; +} + +static int cl_echo_cancel(struct echo_device *ed, __u64 cookie) +{ + struct echo_client_obd *ec = ed->ed_ec; + struct echo_lock *ecl = NULL; struct list_head *el; + int found = 0; + int result; + + struct lu_env *env; + int refcheck; + ENTRY; + + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + spin_lock (&ec->ec_lock); + list_for_each (el, &ec->ec_locks) { + ecl = list_entry (el, struct echo_lock, el_chain); + CDEBUG(D_INFO, "ecl: %p, cookie: %llx\n", ecl, ecl->el_cookie); + found = (ecl->el_cookie == cookie); + if (found) { + list_del_init(&ecl->el_chain); + break; + } + } + spin_unlock (&ec->ec_lock); + + result = -ENOENT; + if (found) { + struct cl_lock *clk = echo_lock2cl(ecl); + + cl_lock_get(clk); + cl_unuse(env, clk); + cl_lock_release(env, clk, "ec enqueue", ecl->el_object); + + cl_lock_mutex_get(env, clk); + cl_lock_cancel(env, clk); + cl_lock_delete(env, clk); + cl_lock_mutex_put(env, clk); + cl_lock_put(env, clk); + result = 0; + } + cl_env_put(env, &refcheck); + RETURN(result); +} + +static int cl_echo_async_brw(const struct lu_env *env, struct cl_io *io, + enum cl_req_type _, struct cl_2queue *queue) +{ + struct cl_page *clp; + struct cl_page *temp; + int result = 0; + ENTRY; + + cl_page_list_splice(&queue->c2_qin, &queue->c2_qout); + cl_page_list_for_each_safe(clp, temp, &queue->c2_qout) { + int rc; + rc = cl_page_cache_add(env, io, clp, CRT_WRITE); + if (rc == 0) + continue; + cl_page_list_move(&queue->c2_qin, &queue->c2_qout, clp); + result = result ?: rc; + } + RETURN(list_empty(&queue->c2_qout.pl_pages) ? 
result : 0); +} + +static int cl_echo_object_brw(struct echo_object *eco, int rw, obd_off offset, + cfs_page_t **pages, int npages, int async) +{ + struct lu_env *env; + struct echo_thread_info *info; + struct cl_object *obj = echo_obj2cl(eco); + struct echo_device *ed = eco->eo_dev; + struct cl_sync_io *anchor; + struct cl_2queue *queue; + struct cl_io *io; + struct cl_page *clp; + struct echo_page *ep; + + int page_size = cl_page_size(obj); + int refcheck; + int rc; + int i; + ENTRY; + + LASSERT(ed->ed_next != NULL); + env = cl_env_get(&refcheck); + if (IS_ERR(env)) + RETURN(PTR_ERR(env)); + + info = echo_env_info(env); + io = &info->eti_io; + anchor = &info->eti_anchor; + queue = &info->eti_queue; + + cl_sync_io_init(anchor, npages); + cl_2queue_init(queue); + rc = cl_io_init(env, io, CIT_MISC, obj); + if (rc < 0) + GOTO(out, rc); + LASSERT(rc == 0); + + for (i = 0; i < npages; i++) { + LASSERT(pages[i]); + clp = cl_page_find(env, obj, cl_index(obj, offset), + pages[i], CPT_TRANSIENT); + if (IS_ERR(clp)) { + rc = PTR_ERR(clp); + break; + } + LASSERT(clp->cp_type == CPT_TRANSIENT); + + rc = cl_page_own(env, io, clp); + if (rc) { + LASSERT(clp->cp_state == CPS_FREEING); + cl_page_put(env, clp); + break; + } + + ep = cl2echo_page(cl_page_at(clp, &echo_device_type)); + ep->ep_sync_io = anchor; + cl_2queue_add(queue, clp); - list_for_each (el, &ec->ec_objects) { - eco = list_entry (el, struct ec_object, eco_obj_chain); + /* drop the reference count for cl_page_find, so that the page + * will be freed in cl_2queue_fini. */ + cl_page_put(env, clp); + offset += page_size; + } - if (eco->eco_id == id) - return (eco); + if (rc == 0) { + enum cl_req_type typ = rw == READ ? CRT_READ : CRT_WRITE; + + async = async && (typ == CRT_WRITE); + rc = (async ? cl_echo_async_brw : cl_io_submit_rw)(env, io, + typ, queue); + CDEBUG(D_INFO, "echo_client %s write returns %d\n", + async ? "async" : "sync", rc); + if (rc == 0) { + /* + * If some pages weren't sent for any reason (e.g., + * direct-io read found up-to-date pages in the + * cache), count them as completed to avoid infinite + * wait. + */ + cl_page_list_for_each(clp, &queue->c2_qin) + cl_sync_io_note(anchor, +1); + /* wait for the IO to be finished. 
*/ + rc = cl_sync_io_wait(env, io, &queue->c2_qout, anchor); + } } - return (NULL); + + cl_2queue_discard(env, io, queue); + cl_2queue_disown(env, io, queue); + cl_2queue_fini(env, queue); + cl_io_fini(env, io); + + EXIT; +out: + cl_env_put(env, &refcheck); + return rc; } +/** @} echo_exports */ + + +static obd_id last_object_id; static int echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) @@ -108,10 +1190,10 @@ echo_copyout_lsm (struct lov_stripe_md *lsm, void *_ulsm, int ulsm_nob) } static int -echo_copyin_lsm (struct obd_device *obd, struct lov_stripe_md *lsm, +echo_copyin_lsm (struct echo_device *ed, struct lov_stripe_md *lsm, void *ulsm, int ulsm_nob) { - struct echo_client_obd *ec = &obd->u.echo_client; + struct echo_client_obd *ec = ed->ed_ec; int i; if (ulsm_nob < sizeof (*lsm)) @@ -121,93 +1203,51 @@ echo_copyin_lsm (struct obd_device *obd, struct lov_stripe_md *lsm, return (-EFAULT); if (lsm->lsm_stripe_count > ec->ec_nstripes || - lsm->lsm_magic != LOV_MAGIC || - (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 || - ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) - return (-EINVAL); - - - for (i = 0; i < lsm->lsm_stripe_count; i++) { - if (copy_from_user(lsm->lsm_oinfo[i], - ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i], - sizeof(lsm->lsm_oinfo[0]))) - return (-EFAULT); - } - return (0); -} - -static struct ec_object * -echo_allocate_object (struct obd_device *obd) -{ - struct echo_client_obd *ec = &obd->u.echo_client; - struct ec_object *eco; - int rc; - - OBD_ALLOC(eco, sizeof (*eco)); - if (eco == NULL) - return NULL; - - rc = obd_alloc_memmd(ec->ec_exp, &eco->eco_lsm); - if (rc < 0) { - OBD_FREE(eco, sizeof (*eco)); - return NULL; - } - - eco->eco_device = obd; - eco->eco_deleted = 0; - eco->eco_refcount = 0; - eco->eco_lsm->lsm_magic = LOV_MAGIC; - /* leave stripe count 0 by default */ - - return (eco); -} + lsm->lsm_magic != LOV_MAGIC || + (lsm->lsm_stripe_size & (~CFS_PAGE_MASK)) != 0 || + ((__u64)lsm->lsm_stripe_size * lsm->lsm_stripe_count > ~0UL)) + return (-EINVAL); -static void -echo_free_object (struct ec_object *eco) -{ - struct obd_device *obd = eco->eco_device; - struct echo_client_obd *ec = &obd->u.echo_client; - LASSERT (eco->eco_refcount == 0); - if (!eco->eco_lsm) - CERROR("No object %s\n", obd->obd_name); - else - obd_free_memmd(ec->ec_exp, &eco->eco_lsm); - OBD_FREE (eco, sizeof (*eco)); + for (i = 0; i < lsm->lsm_stripe_count; i++) { + if (copy_from_user(lsm->lsm_oinfo[i], + ((struct lov_stripe_md *)ulsm)->lsm_oinfo[i], + sizeof(lsm->lsm_oinfo[0]))) + return (-EFAULT); + } + return (0); } -static int echo_create_object(struct obd_device *obd, int on_target, +static int echo_create_object(struct echo_device *ed, int on_target, struct obdo *oa, void *ulsm, int ulsm_nob, struct obd_trans_info *oti) { - struct echo_client_obd *ec = &obd->u.echo_client; - struct ec_object *eco2; - struct ec_object *eco; - struct lov_stripe_md *lsm; + struct echo_object *eco; + struct echo_client_obd *ec = ed->ed_ec; + struct lov_stripe_md *lsm = NULL; int rc; - int i, idx; + int created = 0; + ENTRY; if ((oa->o_valid & OBD_MD_FLID) == 0 && /* no obj id */ (on_target || /* set_stripe */ ec->ec_nstripes != 0)) { /* LOV */ CERROR ("No valid oid\n"); - return (-EINVAL); + RETURN(-EINVAL); } - if (ulsm != NULL) { - eco = echo_allocate_object (obd); - if (eco == NULL) - return (-ENOMEM); + rc = obd_alloc_memmd(ec->ec_exp, &lsm); + if (rc < 0) { + CERROR("Cannot allocate md, rc = %d\n", rc); + GOTO(failed, rc); + } - lsm = eco->eco_lsm; + if (ulsm != 
NULL) { + int i, idx; - rc = echo_copyin_lsm (obd, lsm, ulsm, ulsm_nob); + rc = echo_copyin_lsm (ed, lsm, ulsm, ulsm_nob); if (rc != 0) - goto failed; - - /* setup object ID here for !on_target and LOV hint */ - if ((oa->o_valid & OBD_MD_FLID) != 0) - eco->eco_id = lsm->lsm_object_id = oa->o_id; + GOTO(failed, rc); if (lsm->lsm_stripe_count == 0) lsm->lsm_stripe_count = ec->ec_nstripes; @@ -225,197 +1265,91 @@ static int echo_create_object(struct obd_device *obd, int on_target, lsm->lsm_oinfo[i]->loi_ost_idx = (idx + i) % ec->ec_nstripes; } - } else { - OBD_ALLOC(eco, sizeof(*eco)); - if (!eco) - return (-ENOMEM); - eco->eco_device = obd; - lsm = NULL; } - if (oa->o_id == 0) - oa->o_id = ++last_object_id; + /* setup object ID here for !on_target and LOV hint */ + if (oa->o_valid & OBD_MD_FLID) + lsm->lsm_object_id = oa->o_id; + if (lsm->lsm_object_id == 0) + lsm->lsm_object_id = ++last_object_id; + + rc = 0; if (on_target) { oa->o_gr = FILTER_GROUP_ECHO; oa->o_valid |= OBD_MD_FLGROUP; rc = obd_create(ec->ec_exp, oa, &lsm, oti); - if (rc != 0) - goto failed; - - /* See what object ID we were given */ - eco->eco_id = oa->o_id = lsm->lsm_object_id; - oa->o_valid |= OBD_MD_FLID; - - LASSERT(eco->eco_lsm == NULL || eco->eco_lsm == lsm); - eco->eco_lsm = lsm; + if (rc != 0) { + CERROR("Cannot create objects, rc = %d\n", rc); + GOTO(failed, rc); + } + created = 1; } - spin_lock (&ec->ec_lock); - - eco2 = echo_find_object_locked (obd, oa->o_id); - if (eco2 != NULL) { /* conflict */ - spin_unlock (&ec->ec_lock); - - CERROR ("Can't create object id "LPX64": id already exists%s\n", - oa->o_id, on_target ? " (undoing create)" : ""); + /* See what object ID we were given */ + oa->o_id = lsm->lsm_object_id; + oa->o_valid |= OBD_MD_FLID; - if (on_target) - obd_destroy(ec->ec_exp, oa, lsm, oti, NULL); - - rc = -EEXIST; - goto failed; - } + eco = cl_echo_object_find(ed, &lsm); + if (IS_ERR(eco)) + GOTO(failed, rc = PTR_ERR(eco)); + cl_echo_object_put(eco); - list_add (&eco->eco_obj_chain, &ec->ec_objects); - spin_unlock (&ec->ec_lock); - CDEBUG (D_INFO, - "created %p: "LPX64"=%u#%u@%u refs %d del %d\n", - eco, eco->eco_id, - eco->eco_lsm->lsm_stripe_size, - eco->eco_lsm->lsm_stripe_count, - eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx, - eco->eco_refcount, eco->eco_deleted); - return (0); + CDEBUG(D_INFO, "oa->o_id = %lx\n", (long)oa->o_id); + EXIT; failed: - echo_free_object (eco); - if (rc) - CERROR("%s: err %d on create\n", obd->obd_name, rc); + if (created && rc) + obd_destroy(ec->ec_exp, oa, lsm, oti, NULL, NULL); + if (lsm) + obd_free_memmd(ec->ec_exp, &lsm); + if (rc) + CERROR("create object failed with rc = %d\n", rc); return (rc); } -static int -echo_get_object (struct ec_object **ecop, struct obd_device *obd, - struct obdo *oa) +static int echo_get_object(struct echo_object **ecop, struct echo_device *ed, + struct obdo *oa) { - struct echo_client_obd *ec = &obd->u.echo_client; - struct ec_object *eco; - struct ec_object *eco2; + struct echo_client_obd *ec = ed->ed_ec; + struct lov_stripe_md *lsm = NULL; + struct echo_object *eco; int rc; + ENTRY; if ((oa->o_valid & OBD_MD_FLID) == 0 || - oa->o_id == 0) /* disallow use of object id 0 */ + oa->o_id == 0) /* disallow use of object id 0 */ { CERROR ("No valid oid\n"); - return (-EINVAL); - } - - spin_lock (&ec->ec_lock); - eco = echo_find_object_locked (obd, oa->o_id); - if (eco != NULL) { - if (eco->eco_deleted) { /* being deleted */ - spin_unlock(&ec->ec_lock);/* (see comment in cleanup) */ - return (-EAGAIN); - } - - eco->eco_refcount++; - spin_unlock 
(&ec->ec_lock); - *ecop = eco; - CDEBUG (D_INFO, - "found %p: "LPX64"=%u#%u@%u refs %d del %d\n", - eco, eco->eco_id, - eco->eco_lsm->lsm_stripe_size, - eco->eco_lsm->lsm_stripe_count, - eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx, - eco->eco_refcount, eco->eco_deleted); - return (0); + RETURN(-EINVAL); } - spin_unlock (&ec->ec_lock); - if (ec->ec_nstripes != 0) /* striping required */ - return (-ENOENT); - - eco = echo_allocate_object (obd); - if (eco == NULL) - return (-ENOMEM); - - eco->eco_id = eco->eco_lsm->lsm_object_id = oa->o_id; + rc = obd_alloc_memmd(ec->ec_exp, &lsm); + if (rc < 0) + RETURN(rc); - spin_lock (&ec->ec_lock); + lsm->lsm_object_id = oa->o_id; + if (oa->o_valid & OBD_MD_FLGROUP) + lsm->lsm_object_gr = oa->o_gr; + else + lsm->lsm_object_gr = FILTER_GROUP_ECHO; - eco2 = echo_find_object_locked (obd, oa->o_id); - if (eco2 == NULL) { /* didn't race */ - list_add (&eco->eco_obj_chain, &ec->ec_objects); - spin_unlock (&ec->ec_lock); - eco->eco_refcount = 1; + rc = 0; + eco = cl_echo_object_find(ed, &lsm); + if (!IS_ERR(eco)) *ecop = eco; - CDEBUG (D_INFO, - "created %p: "LPX64"=%u#%u@%d refs %d del %d\n", - eco, eco->eco_id, - eco->eco_lsm->lsm_stripe_size, - eco->eco_lsm->lsm_stripe_count, - eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx, - eco->eco_refcount, eco->eco_deleted); - return (0); - } - - if (eco2->eco_deleted) - rc = -EAGAIN; /* lose race */ - else { - eco2->eco_refcount++; /* take existing */ - *ecop = eco2; - rc = 0; - LASSERT (eco2->eco_id == eco2->eco_lsm->lsm_object_id); - CDEBUG (D_INFO, - "found(2) %p: "LPX64"=%u#%u@%d refs %d del %d\n", - eco2, eco2->eco_id, - eco2->eco_lsm->lsm_stripe_size, - eco2->eco_lsm->lsm_stripe_count, - eco2->eco_lsm->lsm_oinfo[0]->loi_ost_idx, - eco2->eco_refcount, eco2->eco_deleted); - } - - spin_unlock (&ec->ec_lock); - - echo_free_object (eco); - return (rc); + else + rc = PTR_ERR(eco); + if (lsm) + obd_free_memmd(ec->ec_exp, &lsm); + RETURN(rc); } -static void -echo_put_object (struct ec_object *eco) +static void echo_put_object(struct echo_object *eco) { - struct obd_device *obd = eco->eco_device; - struct echo_client_obd *ec = &obd->u.echo_client; - - /* Release caller's ref on the object. - * delete => mark for deletion when last ref goes - */ - - spin_lock (&ec->ec_lock); - - eco->eco_refcount--; - LASSERT (eco->eco_refcount >= 0); - - CDEBUG(D_INFO, "put %p: "LPX64"=%u#%u@%d refs %d del %d\n", - eco, eco->eco_id, - eco->eco_lsm->lsm_stripe_size, - eco->eco_lsm->lsm_stripe_count, - eco->eco_lsm->lsm_oinfo[0]->loi_ost_idx, - eco->eco_refcount, eco->eco_deleted); - - if (eco->eco_refcount != 0 || !eco->eco_deleted) { - spin_unlock (&ec->ec_lock); - return; - } - - spin_unlock (&ec->ec_lock); - - /* NB leave obj in the object list. We must prevent anyone from - * attempting to enqueue on this object number until we can be - * sure there will be no more lock callbacks. 
- */ - obd_cancel_unused(ec->ec_exp, eco->eco_lsm, 0, NULL); - - /* now we can let it go */ - spin_lock (&ec->ec_lock); - list_del (&eco->eco_obj_chain); - spin_unlock (&ec->ec_lock); - - LASSERT (eco->eco_refcount == 0); - - echo_free_object (eco); + if (cl_echo_object_put(eco)) + CERROR("echo client: drop an object failed"); } static void @@ -512,20 +1446,23 @@ static int echo_client_page_debug_check(struct lov_stripe_md *lsm, return rc; } -static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, - struct lov_stripe_md *lsm, obd_off offset, - obd_size count, struct obd_trans_info *oti) +static int echo_client_kbrw(struct echo_device *ed, int rw, struct obdo *oa, + struct echo_object *eco, obd_off offset, + obd_size count, int async, + struct obd_trans_info *oti) { - struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_info oinfo = { { { 0 } } }; + struct echo_client_obd *ec = ed->ed_ec; + struct lov_stripe_md *lsm = eco->eo_lsm; obd_count npages; struct brw_page *pga; struct brw_page *pgp; + cfs_page_t **pages; obd_off off; int i; int rc; int verify; int gfp_mask; + ENTRY; verify = ((oa->o_id) != ECHO_PERSISTENT_OBJID && (oa->o_valid & OBD_MD_FLFLAGS) != 0 && @@ -539,14 +1476,20 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, if (count <= 0 || (count & (~CFS_PAGE_MASK)) != 0) - return (-EINVAL); + RETURN(-EINVAL); /* XXX think again with misaligned I/O */ npages = count >> CFS_PAGE_SHIFT; OBD_ALLOC(pga, npages * sizeof(*pga)); if (pga == NULL) - return (-ENOMEM); + RETURN(-ENOMEM); + + OBD_ALLOC(pages, npages * sizeof(*pages)); + if (pages == NULL) { + OBD_FREE(pga, npages * sizeof(*pga)); + RETURN(-ENOMEM); + } for (i = 0, pgp = pga, off = offset; i < npages; @@ -559,6 +1502,7 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, if (pgp->pg == NULL) goto out; + pages[i] = pgp->pg; pgp->count = CFS_PAGE_SIZE; pgp->off = off; pgp->flag = 0; @@ -568,9 +1512,13 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, oa->o_id, off, pgp->count); } - oinfo.oi_oa = oa; - oinfo.oi_md = lsm; - rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti); + if (ed->ed_next == NULL) { + struct obd_info oinfo = { { { 0 } } }; + oinfo.oi_oa = oa; + oinfo.oi_md = lsm; + rc = obd_brw(rw, ec->ec_exp, &oinfo, npages, pga, oti); + } else + rc = cl_echo_object_brw(eco, rw, offset, pages, npages, async); out: if (rc != 0 || rw != OBD_BRW_READ) @@ -590,268 +1538,16 @@ static int echo_client_kbrw(struct obd_device *obd, int rw, struct obdo *oa, OBD_PAGE_FREE(pgp->pg); } OBD_FREE(pga, npages * sizeof(*pga)); - return (rc); -} - -struct echo_async_state; - -#define EAP_MAGIC 79277927 -struct echo_async_page { - int eap_magic; - cfs_page_t *eap_page; - void *eap_cookie; - obd_off eap_off; - struct echo_async_state *eap_eas; - struct list_head eap_item; -}; - -static inline struct echo_async_page *eap_from_cookie(void *ptr) -{ - struct echo_async_page *ap = ptr; - LASSERT(ap->eap_magic == EAP_MAGIC); - return ap; -} - -struct echo_async_state { - spinlock_t eas_lock; - obd_off eas_next_offset; - obd_off eas_end_offset; - int eas_in_flight; - int eas_rc; - cfs_waitq_t eas_waitq; - struct list_head eas_avail; - struct obdo eas_oa; - struct lov_stripe_md *eas_lsm; -}; - -static int eas_should_wake(struct echo_async_state *eas) -{ - int rc = 0; - - spin_lock(&eas->eas_lock); - if (eas->eas_rc == 0 && !list_empty(&eas->eas_avail)) - rc = 1; - spin_unlock(&eas->eas_lock); - return rc; -}; - -static int 
ec_ap_make_ready(void *data, int cmd) -{ - /* our pages are issued ready */ - LBUG(); - return 0; -} -static int ec_ap_refresh_count(void *data, int cmd) -{ - /* our pages are issued with a stable count */ - LBUG(); - return CFS_PAGE_SIZE; -} -static void ec_ap_fill_obdo(void *data, int cmd, struct obdo *oa) -{ - struct echo_async_page *eap = eap_from_cookie(data); - - memcpy(oa, &eap->eap_eas->eas_oa, sizeof(*oa)); -} - -static int ec_ap_completion(void *data, int cmd, struct obdo *oa, int rc) -{ - struct echo_async_page *eap = eap_from_cookie(data); - struct echo_async_state *eas; - - eas = eap->eap_eas; - - if (cmd == OBD_BRW_READ && - eas->eas_oa.o_id != ECHO_PERSISTENT_OBJID && - (eas->eas_oa.o_valid & OBD_MD_FLFLAGS) != 0 && - (eas->eas_oa.o_flags & OBD_FL_DEBUG_CHECK) != 0) - echo_client_page_debug_check(eas->eas_lsm, eap->eap_page, - eas->eas_oa.o_id, eap->eap_off, - CFS_PAGE_SIZE); - - spin_lock(&eas->eas_lock); - if (rc && !eas->eas_rc) - eas->eas_rc = rc; - eas->eas_in_flight--; - list_add(&eap->eap_item, &eas->eas_avail); - cfs_waitq_signal(&eas->eas_waitq); - spin_unlock(&eas->eas_lock); - return 0; -} - -static struct obd_async_page_ops ec_async_page_ops = { - .ap_make_ready = ec_ap_make_ready, - .ap_refresh_count = ec_ap_refresh_count, - .ap_fill_obdo = ec_ap_fill_obdo, - .ap_completion = ec_ap_completion, -}; - -static int echo_client_async_page(struct obd_export *exp, int rw, - struct obdo *oa, struct lov_stripe_md *lsm, - obd_off offset, obd_size count, - obd_size batching) -{ - obd_count npages, i; - struct echo_async_page *eap; - struct echo_async_state eas; - int rc = 0; - struct echo_async_page **aps = NULL; - - ENTRY; -#if 0 - int verify; - int gfp_mask; - - verify = ((oa->o_id) != ECHO_PERSISTENT_OBJID && - (oa->o_valid & OBD_MD_FLFLAGS) != 0 && - (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0); - - gfp_mask = ((oa->o_id & 2) == 0) ? 
GFP_KERNEL : GFP_HIGHUSER; -#endif - - LASSERT(rw == OBD_BRW_WRITE || rw == OBD_BRW_READ); - - if (count <= 0 || - (count & (~CFS_PAGE_MASK)) != 0 || - (lsm != NULL && - lsm->lsm_object_id != oa->o_id)) - return (-EINVAL); - - /* XXX think again with misaligned I/O */ - npages = batching >> CFS_PAGE_SHIFT; - - memcpy(&eas.eas_oa, oa, sizeof(*oa)); - eas.eas_next_offset = offset; - eas.eas_end_offset = offset + count; - spin_lock_init(&eas.eas_lock); - cfs_waitq_init(&eas.eas_waitq); - eas.eas_in_flight = 0; - eas.eas_rc = 0; - eas.eas_lsm = lsm; - CFS_INIT_LIST_HEAD(&eas.eas_avail); - - OBD_ALLOC(aps, npages * sizeof aps[0]); - if (aps == NULL) - return (-ENOMEM); - - /* prepare the group of pages that we're going to be keeping - * in flight */ - for (i = 0; i < npages; i++) { - cfs_page_t *page; - OBD_PAGE_ALLOC(page, CFS_ALLOC_STD); - if (page == NULL) - GOTO(out, rc = -ENOMEM); - - OBD_ALLOC(eap, sizeof(*eap)); - if (eap == NULL) { - OBD_PAGE_FREE(page); - GOTO(out, rc = -ENOMEM); - } - - eap->eap_magic = EAP_MAGIC; - eap->eap_page = page; - eap->eap_eas = &eas; - list_add_tail(&eap->eap_item, &eas.eas_avail); - aps[i] = eap; - } - - /* first we spin queueing io and being woken by its completion */ - spin_lock(&eas.eas_lock); - for(;;) { - int rc; - - /* sleep until we have a page to send */ - spin_unlock(&eas.eas_lock); - rc = wait_event_interruptible(eas.eas_waitq, - eas_should_wake(&eas)); - spin_lock(&eas.eas_lock); - if (rc && !eas.eas_rc) - eas.eas_rc = rc; - if (eas.eas_rc) - break; - if (list_empty(&eas.eas_avail)) - continue; - eap = list_entry(eas.eas_avail.next, struct echo_async_page, - eap_item); - list_del(&eap->eap_item); - spin_unlock(&eas.eas_lock); - - /* unbind the eap from its old page offset */ - if (eap->eap_cookie != NULL) { - obd_teardown_async_page(exp, lsm, NULL, - eap->eap_cookie); - eap->eap_cookie = NULL; - } - - eas.eas_next_offset += CFS_PAGE_SIZE; - eap->eap_off = eas.eas_next_offset; - - rc = obd_prep_async_page(exp, lsm, NULL, eap->eap_page, - eap->eap_off, &ec_async_page_ops, - eap, &eap->eap_cookie, 1, NULL); - if (rc) { - spin_lock(&eas.eas_lock); - eas.eas_rc = rc; - break; - } - - if (oa->o_id != ECHO_PERSISTENT_OBJID && - (oa->o_valid & OBD_MD_FLFLAGS) != 0 && - (oa->o_flags & OBD_FL_DEBUG_CHECK) != 0) - echo_client_page_debug_setup(lsm, eap->eap_page, rw, - oa->o_id, - eap->eap_off, CFS_PAGE_SIZE); - - /* always asserts urgent, which isn't quite right */ - rc = obd_queue_async_io(exp, lsm, NULL, eap->eap_cookie, - rw, 0, CFS_PAGE_SIZE, 0, - ASYNC_READY | ASYNC_URGENT | - ASYNC_COUNT_STABLE); - spin_lock(&eas.eas_lock); - if (rc && !eas.eas_rc) { - eas.eas_rc = rc; - break; - } - eas.eas_in_flight++; - if (eas.eas_next_offset == eas.eas_end_offset) - break; - } - - /* still hold the eas_lock here.. 
*/ - - /* now we just spin waiting for all the rpcs to complete */ - while(eas.eas_in_flight) { - spin_unlock(&eas.eas_lock); - wait_event_interruptible(eas.eas_waitq, - eas.eas_in_flight == 0); - spin_lock(&eas.eas_lock); - } - spin_unlock(&eas.eas_lock); - -out: - if (aps != NULL) { - for (i = 0; i < npages; ++ i) { - cfs_page_t *page; - - eap = aps[i]; - page = eap->eap_page; - if (eap->eap_cookie != NULL) - obd_teardown_async_page(exp, lsm, NULL, - eap->eap_cookie); - OBD_FREE(eap, sizeof(*eap)); - OBD_PAGE_FREE(page); - } - OBD_FREE(aps, npages * sizeof aps[0]); - } - + OBD_FREE(pages, npages * sizeof(*pages)); RETURN(rc); } static int echo_client_prep_commit(struct obd_export *exp, int rw, - struct obdo *oa, struct lov_stripe_md *lsm, + struct obdo *oa, struct echo_object *eco, obd_off offset, obd_size count, obd_size batch, struct obd_trans_info *oti) { + struct lov_stripe_md *lsm = eco->eo_lsm; struct obd_ioobj ioo; struct niobuf_local *lnb; struct niobuf_remote *rnb; @@ -935,17 +1631,19 @@ out: RETURN(ret); } -int echo_client_brw_ioctl(int rw, struct obd_export *exp, - struct obd_ioctl_data *data) +static int echo_client_brw_ioctl(int rw, struct obd_export *exp, + struct obd_ioctl_data *data) { struct obd_device *obd = class_exp2obd(exp); - struct echo_client_obd *ec = &obd->u.echo_client; - struct obd_trans_info dummy_oti = { .oti_thread_id = -1 }; - struct ec_object *eco; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct obd_trans_info dummy_oti = { .oti_thread = NULL }; + struct echo_object *eco; int rc; + int async = 1; ENTRY; - rc = echo_get_object(&eco, obd, &data->ioc_obdo1); + rc = echo_get_object(&eco, ed, &data->ioc_obdo1); if (rc) RETURN(rc); @@ -955,18 +1653,16 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp, switch((long)data->ioc_pbuf1) { case 1: - rc = echo_client_kbrw(obd, rw, &data->ioc_obdo1, - eco->eco_lsm, data->ioc_offset, - data->ioc_count, &dummy_oti); - break; + async = 0; + /* fall through */ case 2: - rc = echo_client_async_page(ec->ec_exp, rw, &data->ioc_obdo1, - eco->eco_lsm, data->ioc_offset, - data->ioc_count, data->ioc_plen1); + rc = echo_client_kbrw(ed, rw, &data->ioc_obdo1, + eco, data->ioc_offset, + data->ioc_count, async, &dummy_oti); break; case 3: rc = echo_client_prep_commit(ec->ec_exp, rw, &data->ioc_obdo1, - eco->eco_lsm, data->ioc_offset, + eco, data->ioc_offset, data->ioc_count, data->ioc_plen1, &dummy_oti); break; @@ -978,165 +1674,61 @@ int echo_client_brw_ioctl(int rw, struct obd_export *exp, } static int -echo_ldlm_callback (struct ldlm_lock *lock, struct ldlm_lock_desc *new, - void *data, int flag) -{ - struct ec_object *eco = (struct ec_object *)data; - struct echo_client_obd *ec = &(eco->eco_device->u.echo_client); - struct lustre_handle lockh; - struct list_head *el; - int found = 0; - int rc; - - ldlm_lock2handle (lock, &lockh); - - /* #ifdef this out if we're not feeling paranoid */ - spin_lock (&ec->ec_lock); - list_for_each (el, &ec->ec_objects) { - found = (eco == list_entry(el, struct ec_object, - eco_obj_chain)); - if (found) - break; - } - spin_unlock (&ec->ec_lock); - LASSERT (found); - - switch (flag) { - case LDLM_CB_BLOCKING: - CDEBUG(D_INFO, "blocking callback on "LPX64", handle "LPX64"\n", - eco->eco_id, lockh.cookie); - rc = ldlm_cli_cancel (&lockh); - if (rc != ELDLM_OK) - CERROR ("ldlm_cli_cancel failed: %d\n", rc); - break; - - case LDLM_CB_CANCELING: - CDEBUG(D_INFO, "cancel callback on "LPX64", handle "LPX64"\n", - eco->eco_id, lockh.cookie); - break; - 
- default: - LBUG (); - } - - return (0); -} - -static int echo_client_enqueue(struct obd_export *exp, struct obdo *oa, int mode, obd_off offset, obd_size nob) { - struct obd_device *obd = exp->exp_obd; - struct echo_client_obd *ec = &obd->u.echo_client; + struct echo_device *ed = obd2echo_dev(exp->exp_obd); struct lustre_handle *ulh = &oa->o_handle; - struct ldlm_enqueue_info einfo = { 0 }; - struct obd_info oinfo = { { { 0 } } }; - struct ec_object *eco; - struct ec_lock *ecl; + struct echo_object *eco; + obd_off end; int rc; + ENTRY; + + if (ed->ed_next == NULL) + RETURN(-EOPNOTSUPP); if (!(mode == LCK_PR || mode == LCK_PW)) - return -EINVAL; + RETURN(-EINVAL); if ((offset & (~CFS_PAGE_MASK)) != 0 || (nob & (~CFS_PAGE_MASK)) != 0) - return -EINVAL; - - rc = echo_get_object (&eco, obd, oa); - if (rc != 0) - return rc; + RETURN(-EINVAL); - rc = -ENOMEM; - OBD_ALLOC (ecl, sizeof (*ecl)); - if (ecl == NULL) - goto failed_0; - - ecl->ecl_mode = mode; - ecl->ecl_object = eco; - ecl->ecl_policy.l_extent.start = offset; - ecl->ecl_policy.l_extent.end = - (nob == 0) ? ((obd_off) -1) : (offset + nob - 1); - - einfo.ei_type = LDLM_EXTENT; - einfo.ei_mode = mode; - einfo.ei_cb_bl = echo_ldlm_callback; - einfo.ei_cb_cp = ldlm_completion_ast; - einfo.ei_cb_gl = NULL; - einfo.ei_cbdata = eco; - - oinfo.oi_policy = ecl->ecl_policy; - oinfo.oi_lockh = &ecl->ecl_lock_handle; - oinfo.oi_md = eco->eco_lsm; - rc = obd_enqueue(ec->ec_exp, &oinfo, &einfo, NULL); + rc = echo_get_object (&eco, ed, oa); if (rc != 0) - goto failed_1; - - CDEBUG(D_INFO, "enqueue handle "LPX64"\n", ecl->ecl_lock_handle.cookie); - - /* NB ecl takes object ref from echo_get_object() above */ - spin_lock(&ec->ec_lock); - - list_add(&ecl->ecl_exp_chain, &exp->exp_ec_data.eced_locks); - ulh->cookie = ecl->ecl_cookie = ec->ec_unique++; - - spin_unlock(&ec->ec_lock); - - oa->o_valid |= OBD_MD_FLHANDLE; - return 0; + RETURN(rc); - failed_1: - OBD_FREE (ecl, sizeof (*ecl)); - failed_0: - echo_put_object (eco); - return (rc); + end = (nob == 0) ? 
((obd_off) -1) : (offset + nob - 1); + rc = cl_echo_enqueue(eco, offset, end, mode, &ulh->cookie); + if (rc == 0) { + oa->o_valid |= OBD_MD_FLHANDLE; + CDEBUG(D_INFO, "Cookie is %llx\n", ulh->cookie); + } + echo_put_object(eco); + RETURN(rc); } static int echo_client_cancel(struct obd_export *exp, struct obdo *oa) { - struct obd_device *obd = exp->exp_obd; - struct echo_client_obd *ec = &obd->u.echo_client; - struct lustre_handle *ulh = &oa->o_handle; - struct ec_lock *ecl = NULL; - int found = 0; - struct list_head *el; - int rc; + struct echo_device *ed = obd2echo_dev(exp->exp_obd); + __u64 cookie = oa->o_handle.cookie; if ((oa->o_valid & OBD_MD_FLHANDLE) == 0) return -EINVAL; - spin_lock (&ec->ec_lock); - - list_for_each (el, &exp->exp_ec_data.eced_locks) { - ecl = list_entry (el, struct ec_lock, ecl_exp_chain); - found = (ecl->ecl_cookie == ulh->cookie); - if (found) { - list_del (&ecl->ecl_exp_chain); - break; - } - } - - spin_unlock (&ec->ec_lock); - - if (!found) - return (-ENOENT); - - rc = obd_cancel(ec->ec_exp, ecl->ecl_object->eco_lsm, ecl->ecl_mode, - &ecl->ecl_lock_handle); - - echo_put_object (ecl->ecl_object); - OBD_FREE (ecl, sizeof (*ecl)); - - return rc; + CDEBUG(D_INFO, "Cookie is %llx\n", cookie); + return cl_echo_cancel(ed, cookie); } static int echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, int len, void *karg, void *uarg) { - struct obd_device *obd; - struct echo_client_obd *ec; - struct ec_object *eco; + struct obd_device *obd = exp->exp_obd; + struct echo_device *ed = obd2echo_dev(obd); + struct echo_client_obd *ec = ed->ed_ec; + struct echo_object *eco; struct obd_ioctl_data *data = karg; struct obd_trans_info dummy_oti; struct oti_req_ack_lock *ack_lock; @@ -1150,15 +1742,12 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, memset(&dummy_oti, 0, sizeof(dummy_oti)); - obd = exp->exp_obd; - ec = &obd->u.echo_client; - switch (cmd) { case OBD_IOC_CREATE: /* may create echo object */ if (!cfs_capable(CFS_CAP_SYS_ADMIN)) GOTO (out, rc = -EPERM); - rc = echo_create_object (obd, 1, &data->ioc_obdo1, + rc = echo_create_object (ed, 1, &data->ioc_obdo1, data->ioc_pbuf1, data->ioc_plen1, &dummy_oti); GOTO(out, rc); @@ -1167,24 +1756,24 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, if (!cfs_capable(CFS_CAP_SYS_ADMIN)) GOTO (out, rc = -EPERM); - rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + rc = echo_get_object (&eco, ed, &data->ioc_obdo1); if (rc == 0) { oa = &data->ioc_obdo1; oa->o_gr = FILTER_GROUP_ECHO; oa->o_valid |= OBD_MD_FLGROUP; - rc = obd_destroy(ec->ec_exp, oa, eco->eco_lsm, - &dummy_oti, NULL); + rc = obd_destroy(ec->ec_exp, oa, eco->eo_lsm, + &dummy_oti, NULL, NULL); if (rc == 0) - eco->eco_deleted = 1; + eco->eo_deleted = 1; echo_put_object(eco); } GOTO(out, rc); case OBD_IOC_GETATTR: - rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + rc = echo_get_object (&eco, ed, &data->ioc_obdo1); if (rc == 0) { struct obd_info oinfo = { { { 0 } } }; - oinfo.oi_md = eco->eco_lsm; + oinfo.oi_md = eco->eo_lsm; oinfo.oi_oa = &data->ioc_obdo1; rc = obd_getattr(ec->ec_exp, &oinfo); echo_put_object(eco); @@ -1195,11 +1784,11 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, if (!cfs_capable(CFS_CAP_SYS_ADMIN)) GOTO (out, rc = -EPERM); - rc = echo_get_object (&eco, obd, &data->ioc_obdo1); + rc = echo_get_object (&eco, ed, &data->ioc_obdo1); if (rc == 0) { struct obd_info oinfo = { { { 0 } } }; oinfo.oi_oa = &data->ioc_obdo1; - oinfo.oi_md = eco->eco_lsm; + oinfo.oi_md = eco->eo_lsm; rc = 
obd_setattr(ec->ec_exp, &oinfo, NULL); echo_put_object(eco); @@ -1217,9 +1806,9 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, GOTO(out, rc); case ECHO_IOC_GET_STRIPE: - rc = echo_get_object(&eco, obd, &data->ioc_obdo1); + rc = echo_get_object(&eco, ed, &data->ioc_obdo1); if (rc == 0) { - rc = echo_copyout_lsm(eco->eco_lsm, data->ioc_pbuf1, + rc = echo_copyout_lsm(eco->eo_lsm, data->ioc_pbuf1, data->ioc_plen1); echo_put_object(eco); } @@ -1230,13 +1819,13 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, GOTO (out, rc = -EPERM); if (data->ioc_pbuf1 == NULL) { /* unset */ - rc = echo_get_object(&eco, obd, &data->ioc_obdo1); + rc = echo_get_object(&eco, ed, &data->ioc_obdo1); if (rc == 0) { - eco->eco_deleted = 1; + eco->eo_deleted = 1; echo_put_object(eco); } } else { - rc = echo_create_object(obd, 0, &data->ioc_obdo1, + rc = echo_create_object(ed, 0, &data->ioc_obdo1, data->ioc_pbuf1, data->ioc_plen1, &dummy_oti); } @@ -1248,7 +1837,8 @@ echo_client_iocontrol(unsigned int cmd, struct obd_export *exp, rc = echo_client_enqueue(exp, &data->ioc_obdo1, data->ioc_conn1, /* lock mode */ - data->ioc_offset, data->ioc_count);/*extent*/ + data->ioc_offset, + data->ioc_count);/*extent*/ GOTO (out, rc); case ECHO_IOC_CANCEL: @@ -1300,7 +1890,9 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) spin_lock_init (&ec->ec_lock); CFS_INIT_LIST_HEAD (&ec->ec_objects); + CFS_INIT_LIST_HEAD (&ec->ec_locks); ec->ec_unique = 0; + ec->ec_nstripes = 0; OBD_ALLOC(ocd, sizeof(*ocd)); if (ocd == NULL) { @@ -1309,7 +1901,8 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) return -ENOMEM; } - ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL; + ocd->ocd_connect_flags = OBD_CONNECT_VERSION | OBD_CONNECT_REQPORTAL | + OBD_CONNECT_GRANT; ocd->ocd_version = LUSTRE_VERSION_CODE; ocd->ocd_group = FILTER_GROUP_ECHO; @@ -1329,8 +1922,6 @@ static int echo_client_setup(struct obd_device *obddev, struct lustre_cfg *lcfg) static int echo_client_cleanup(struct obd_device *obddev) { - struct list_head *el; - struct ec_object *eco; struct echo_client_obd *ec = &obddev->u.echo_client; int rc; ENTRY; @@ -1340,17 +1931,6 @@ static int echo_client_cleanup(struct obd_device *obddev) RETURN(-EBUSY); } - /* XXX assuming sole access */ - while (!list_empty(&ec->ec_objects)) { - el = ec->ec_objects.next; - eco = list_entry(el, struct ec_object, eco_obj_chain); - - LASSERT(eco->eco_refcount == 0); - eco->eco_refcount = 1; - eco->eco_deleted = 1; - echo_put_object(eco); - } - rc = obd_disconnect(ec->ec_exp); if (rc != 0) CERROR("fail to disconnect device: %d\n", rc); @@ -1370,7 +1950,6 @@ static int echo_client_connect(const struct lu_env *env, rc = class_connect(conn, src, cluuid); if (rc == 0) { exp = class_conn2export(conn); - CFS_INIT_LIST_HEAD(&exp->exp_ec_data.eced_locks); class_export_put(exp); } @@ -1379,15 +1958,18 @@ static int echo_client_connect(const struct lu_env *env, static int echo_client_disconnect(struct obd_export *exp) { +#if 0 struct obd_device *obd; struct echo_client_obd *ec; struct ec_lock *ecl; +#endif int rc; ENTRY; if (exp == NULL) GOTO(out, rc = -EINVAL); +#if 0 obd = exp->exp_obd; ec = &obd->u.echo_client; @@ -1406,6 +1988,7 @@ static int echo_client_disconnect(struct obd_export *exp) echo_put_object (ecl->ecl_object); OBD_FREE (ecl, sizeof (*ecl)); } +#endif rc = class_disconnect(exp); GOTO(out, rc); @@ -1415,8 +1998,12 @@ static int echo_client_disconnect(struct obd_export *exp) static struct 
obd_ops echo_obd_ops = { .o_owner = THIS_MODULE, + +#if 0 .o_setup = echo_client_setup, .o_cleanup = echo_client_cleanup, +#endif + .o_iocontrol = echo_client_iocontrol, .o_connect = echo_client_connect, .o_disconnect = echo_client_disconnect @@ -1425,13 +2012,19 @@ static struct obd_ops echo_obd_ops = { int echo_client_init(void) { struct lprocfs_static_vars lvars = { 0 }; + int rc; lprocfs_echo_init_vars(&lvars); - return class_register_type(&echo_obd_ops, NULL, lvars.module_vars, - LUSTRE_ECHO_CLIENT_NAME, NULL); + rc = class_register_type(&echo_obd_ops, NULL, lvars.module_vars, + LUSTRE_ECHO_CLIENT_NAME, &echo_device_type); + if (rc == 0) + lu_kmem_init(echo_caches); + return rc; } void echo_client_exit(void) { class_unregister_type(LUSTRE_ECHO_CLIENT_NAME); + lu_kmem_fini(echo_caches); } + diff --git a/lustre/obdecho/echo_internal.h b/lustre/obdecho/echo_internal.h new file mode 100644 index 0000000..c45f0c6 --- /dev/null +++ b/lustre/obdecho/echo_internal.h @@ -0,0 +1,30 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + */ + +#ifndef _ECHO_INTERNAL_H +#define _ECHO_INTERNAL_H + +/* The persistent object (i.e. actually stores stuff!) */ +#define ECHO_PERSISTENT_OBJID 1ULL +#define ECHO_PERSISTENT_SIZE ((__u64)(1<<20)) + +/* block size to use for data verification */ +#define OBD_ECHO_BLOCK_SIZE (4<<10) + +#ifndef __KERNEL__ +/* Kludge here, define some functions and macros needed by liblustre -jay */ +static inline void page_cache_get(struct page *page) +{ +} + +static inline void page_cache_release(struct page *page) +{ +} + +#define READ 0 +#define WRITE 1 + +#endif /* ifndef __KERNEL__ */ + +#endif diff --git a/lustre/obdfilter/filter.c b/lustre/obdfilter/filter.c index 2809116..8b3fd41 100644 --- a/lustre/obdfilter/filter.c +++ b/lustre/obdfilter/filter.c @@ -80,7 +80,7 @@ #include "filter_internal.h" /* Group 0 is no longer a legal group, to catch uninitialized IDs */ -#define FILTER_MIN_GROUPS FILTER_GROUP_MDS0 +#define FILTER_MIN_GROUPS FILTER_GROUP_MDS1_N_BASE static struct lvfs_callback_ops filter_lvfs_ops; cfs_mem_cache_t *ll_fmd_cachep; @@ -955,7 +955,9 @@ static int filter_update_last_group(struct obd_device *obd, int group) CDEBUG(D_INODE, "error reading LAST_GROUP: rc %d\n",rc); GOTO(cleanup, rc); } - LASSERT(off == 0 || last_group >= FILTER_MIN_GROUPS); + LASSERTF(off == 0 || CHECK_MDS_GROUP(last_group), + "off = %llu and last_group = %d\n", off, last_group); + CDEBUG(D_INODE, "%s: previous %d, new %d\n", obd->obd_name, last_group, group); @@ -1145,8 +1147,6 @@ static int filter_read_groups(struct obd_device *obd, int last_group, down(&filter->fo_init_lock); old_count = filter->fo_group_count; for (group = old_count; group <= last_group; group++) { - if (group == 0) - continue; /* no group zero */ rc = filter_read_group_internal(obd, group, create); if (rc != 0) @@ -1245,7 +1245,7 @@ static int filter_prep_groups(struct obd_device *obd) if (off == 0) { last_group = FILTER_MIN_GROUPS; } else { - LASSERT(last_group >= FILTER_MIN_GROUPS); + LASSERT_MDS_GROUP(last_group); } CWARN("%s: initialize groups [%d,%d]\n", obd->obd_name, @@ -1369,7 +1369,7 @@ static void filter_post(struct obd_device *obd) if (rc) CERROR("error writing server data: rc = %d\n", rc); - for (i = 1; i < filter->fo_group_count; i++) { + for (i = 0; i < filter->fo_group_count; i++) { rc = filter_update_last_objid(obd, i, (i == filter->fo_group_count - 1)); if (rc) @@ -1416,7 +1416,6 @@ obd_id filter_last_id(struct filter_obd *filter, obd_gr 
group) spin_lock(&filter->fo_objidlock); id = filter->fo_last_objids[group]; spin_unlock(&filter->fo_objidlock); - return id; } @@ -1433,7 +1432,7 @@ struct dentry *filter_parent(struct obd_device *obd, obd_gr group, obd_id objid) struct filter_subdirs *subdirs; LASSERT(group < filter->fo_group_count); /* FIXME: object groups */ - if ((group > 0 && group < FILTER_GROUP_MDS0) || + if ((group > FILTER_GROUP_MDS0 && group < FILTER_GROUP_MDS1_N_BASE) || filter->fo_subdir_count == 0) return filter->fo_dentry_O_groups[group]; @@ -1545,7 +1544,7 @@ static int filter_prepare_destroy(struct obd_device *obd, obd_id objid, rc = ldlm_cli_enqueue_local(obd->obd_namespace, &res_id, LDLM_EXTENT, &policy, LCK_PW, &flags, ldlm_blocking_ast, ldlm_completion_ast, NULL, NULL, 0, NULL, - &lockh); + NULL, &lockh); /* We only care about the side-effects, just drop the lock. */ if (rc == ELDLM_OK) @@ -1889,6 +1888,30 @@ static void filter_iobuf_pool_done(struct filter_obd *filter) EXIT; } +static int filter_adapt_sptlrpc_conf(struct obd_device *obd, int initial) +{ + struct filter_obd *filter = &obd->u.filter; + struct sptlrpc_rule_set tmp_rset; + int rc; + + sptlrpc_rule_set_init(&tmp_rset); + rc = sptlrpc_conf_target_get_rules(obd, &tmp_rset, initial); + if (rc) { + CERROR("obd %s: failed to get sptlrpc rules: %d\n", + obd->obd_name, rc); + return rc; + } + + sptlrpc_target_update_exp_flavor(obd, &tmp_rset); + + write_lock(&filter->fo_sptlrpc_lock); + sptlrpc_rule_set_free(&filter->fo_sptlrpc_rset); + filter->fo_sptlrpc_rset = tmp_rset; + write_unlock(&filter->fo_sptlrpc_lock); + + return 0; +} + /* * pre-allocate pool of iobuf's to be used by filter_{prep,commit}rw_write(). */ @@ -1915,7 +1938,8 @@ static int filter_iobuf_pool_init(struct filter_obd *filter) * If we haven't allocated a pool entry for this thread before, do so now. */ void *filter_iobuf_get(struct filter_obd *filter, struct obd_trans_info *oti) { - int thread_id = oti ? oti->oti_thread_id : -1; + int thread_id = (oti && oti->oti_thread) ?
+ oti->oti_thread->t_id : -1; struct filter_iobuf *pool = NULL; struct filter_iobuf **pool_place = NULL; @@ -2031,7 +2055,7 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, sema_init(&filter->fo_alloc_lock, 1); init_brw_stats(&filter->fo_filter_stats); filter->fo_read_cache = 1; /* enable read-only cache by default */ - filter->fo_writethrough_cache = 1; /* disable writethrough cache */ + filter->fo_writethrough_cache = 1; /* enable writethrough cache */ filter->fo_readcache_max_filesize = FILTER_MAX_CACHE_SIZE; filter->fo_fmd_max_num = FILTER_FMD_MAX_NUM_DEFAULT; filter->fo_fmd_max_age = FILTER_FMD_MAX_AGE_DEFAULT; @@ -2039,10 +2063,7 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, CFS_INIT_LIST_HEAD(&filter->fo_llog_list); spin_lock_init(&filter->fo_llog_list_lock); - rwlock_init(&filter->fo_sptlrpc_lock); - sptlrpc_rule_set_init(&filter->fo_sptlrpc_rset); - - filter->fo_fl_oss_capa = 0; + filter->fo_fl_oss_capa = 1; CFS_INIT_LIST_HEAD(&filter->fo_capa_keys); filter->fo_capa_hash = init_capa_hash(); if (filter->fo_capa_hash == NULL) @@ -2066,6 +2087,11 @@ int filter_common_setup(struct obd_device *obd, struct lustre_cfg* lcfg, GOTO(err_post, rc); } + rwlock_init(&filter->fo_sptlrpc_lock); + sptlrpc_rule_set_init(&filter->fo_sptlrpc_rset); + /* do this after the llog has been initialized */ + filter_adapt_sptlrpc_conf(obd, 1); + rc = lquota_setup(filter_quota_interface_ref, obd); if (rc) GOTO(err_post, rc); @@ -2221,10 +2247,18 @@ static int filter_olg_fini(struct obd_llog_group *olg) rc = llog_cleanup(ctxt); ctxt = llog_group_get_ctxt(olg, LLOG_SIZE_ORIG_CTXT); - if (ctxt) + if (ctxt) { + rc2 = llog_cleanup(ctxt); + if (!rc) + rc = rc2; + } + + ctxt = llog_group_get_ctxt(olg, LLOG_CONFIG_ORIG_CTXT); + if (ctxt) { rc2 = llog_cleanup(ctxt); - if (!rc) - rc = rc2; + if (!rc) + rc = rc2; + } RETURN(rc); } @@ -2277,6 +2311,11 @@ filter_default_olg_init(struct obd_device *obd, struct obd_llog_group *olg, if (rc) GOTO(cleanup_lcm, rc); + rc = llog_setup(obd, olg, LLOG_CONFIG_ORIG_CTXT, tgt, 0, NULL, + &llog_lvfs_ops); + if (rc) + GOTO(cleanup_olg, rc); + ctxt = llog_group_get_ctxt(olg, LLOG_MDS_OST_REPL_CTXT); if (!ctxt) { CERROR("Can't get ctxt for %p:%x\n", olg, @@ -2334,19 +2373,27 @@ static int filter_llog_finish(struct obd_device *obd, int count) ENTRY; ctxt = llog_group_get_ctxt(&obd->obd_olg, LLOG_MDS_OST_REPL_CTXT); - LASSERT(ctxt != NULL); - mutex_down(&ctxt->loc_sem); - if (ctxt->loc_imp) { + if (ctxt) { + /* + * Make sure that no cached llcds are left in recov_thread. + * We actually do the sync at disconnect time, but the disconnect + * may not come, being marked rq_no_resend = 1. + */ + llog_sync(ctxt, NULL); + /* - * Balance class_import_get() in llog_receptor_accept(). This - * is safe to do here, as llog is already synchronized and its - * import may go.
*/ - class_import_put(ctxt->loc_imp); - ctxt->loc_imp = NULL; + mutex_down(&ctxt->loc_sem); + if (ctxt->loc_imp) { + class_import_put(ctxt->loc_imp); + ctxt->loc_imp = NULL; + } + mutex_up(&ctxt->loc_sem); + llog_ctxt_put(ctxt); } - mutex_up(&ctxt->loc_sem); - llog_ctxt_put(ctxt); if (filter->fo_lcm) { llog_recov_thread_fini(filter->fo_lcm, obd->obd_force); @@ -2761,8 +2808,6 @@ static int filter_connect(const struct lu_env *env, } group = data->ocd_group; - if (group == 0) - GOTO(cleanup, rc); CWARN("%s: Received MDS connection ("LPX64"); group %d\n", obd->obd_name, exp->exp_handle.h_cookie, group); @@ -2912,9 +2957,7 @@ static int filter_destroy_export(struct obd_export *exp) exp->exp_obd->obd_name, exp->exp_client_uuid.uuid, exp, exp->exp_filter_data.fed_pending); - /* Not ported yet the b1_6 quota functionality - * lquota_clearinfo(filter_quota_interface_ref, exp, exp->exp_obd); - */ + lquota_clearinfo(filter_quota_interface_ref, exp, exp->exp_obd); target_destroy_export(exp); ldlm_destroy_export(exp); @@ -2941,7 +2984,7 @@ static void filter_sync_llogs(struct obd_device *obd, struct obd_export *dexp) { struct obd_llog_group *olg_min, *olg; struct filter_obd *filter; - int worked = 0, group; + int worked = -1, group; struct llog_ctxt *ctxt; ENTRY; @@ -2978,18 +3021,20 @@ static void filter_sync_llogs(struct obd_device *obd, struct obd_export *dexp) (dexp == olg_min->olg_exp || dexp == NULL)) { int err; ctxt = llog_group_get_ctxt(olg_min, - LLOG_MDS_OST_REPL_CTXT); - LASSERT(ctxt != NULL); - err = llog_sync(ctxt, olg_min->olg_exp); - llog_ctxt_put(ctxt); - if (err) - CERROR("error flushing logs to MDS: rc %d\n", - err); + LLOG_MDS_OST_REPL_CTXT); + if (ctxt) { + err = llog_sync(ctxt, olg_min->olg_exp); + llog_ctxt_put(ctxt); + if (err) { + CERROR("error flushing logs to MDS: " + "rc %d\n", err); + } + } } } while (olg_min != NULL); } -/* also incredibly similar to mds_disconnect */ +/* Also incredibly similar to mds_disconnect */ static int filter_disconnect(struct obd_export *exp) { struct obd_device *obd = exp->exp_obd; @@ -3003,6 +3048,9 @@ static int filter_disconnect(struct obd_export *exp) filter_grant_sanity_check(obd, __FUNCTION__); filter_grant_discard(exp); + /* Flush any remaining cancel messages out to the target */ + filter_sync_llogs(obd, exp); + /* Disconnect early so that clients can't keep using export */ rc = class_disconnect(exp); if (exp->exp_obd->obd_namespace != NULL) @@ -3012,8 +3060,6 @@ static int filter_disconnect(struct obd_export *exp) lprocfs_exp_cleanup(exp); - /* flush any remaining cancel messages out to the target */ - filter_sync_llogs(obd, exp); class_export_put(exp); RETURN(rc); } @@ -3166,9 +3212,9 @@ int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, } if (ia_valid & ATTR_SIZE || ia_valid & (ATTR_UID | ATTR_GID)) { - old_size = i_size_read(inode); DQUOT_INIT(inode); LOCK_INODE_MUTEX(inode); + old_size = i_size_read(inode); locked = 1; } @@ -3288,43 +3334,52 @@ out_unlock: int filter_setattr(struct obd_export *exp, struct obd_info *oinfo, struct obd_trans_info *oti) { + struct obdo *oa = oinfo->oi_oa; + struct lustre_capa *capa = oinfo_capa(oinfo); struct ldlm_res_id res_id; struct filter_mod_data *fmd; struct lvfs_run_ctxt saved; struct filter_obd *filter; struct ldlm_resource *res; struct dentry *dentry; + __u64 opc = CAPA_OPC_META_WRITE; int rc; ENTRY; - osc_build_res_name(oinfo->oi_oa->o_id, oinfo->oi_oa->o_gr, &res_id); - rc = filter_auth_capa(exp, NULL, oinfo_mdsno(oinfo), - oinfo_capa(oinfo), CAPA_OPC_META_WRITE); + if 
(oa->o_valid & OBD_FL_TRUNC) + opc |= CAPA_OPC_OSS_TRUNC; + rc = filter_auth_capa(exp, NULL, obdo_mdsno(oa), capa, opc); if (rc) RETURN(rc); + if (oa->o_valid & (OBD_MD_FLUID | OBD_MD_FLGID)) { + rc = filter_capa_fixoa(exp, oa, obdo_mdsno(oa), capa); + if (rc) + RETURN(rc); + } + + osc_build_res_name(oa->o_id, oa->o_gr, &res_id); /* This would be very bad - accidentally truncating a file when * changing the time or similar - bug 12203. */ - if (oinfo->oi_oa->o_valid & OBD_MD_FLSIZE && + if (oa->o_valid & OBD_MD_FLSIZE && oinfo->oi_policy.l_extent.end != OBD_OBJECT_EOF) { static char mdsinum[48]; - if (oinfo->oi_oa->o_valid & OBD_MD_FLFID) + if (oa->o_valid & OBD_MD_FLFID) snprintf(mdsinum, sizeof(mdsinum) - 1, - " of inode "LPU64"/%u", oinfo->oi_oa->o_fid, - oinfo->oi_oa->o_generation); + " of inode "LPU64"/%u", oa->o_fid, + oa->o_generation); else mdsinum[0] = '\0'; CERROR("%s: setattr from %s trying to truncate objid "LPU64 " %s\n", exp->exp_obd->obd_name, obd_export_nid2str(exp), - oinfo->oi_oa->o_id, mdsinum); + oa->o_id, mdsinum); RETURN(-EPERM); } - dentry = __filter_oa2dentry(exp->exp_obd, oinfo->oi_oa, - __FUNCTION__, 1); + dentry = __filter_oa2dentry(exp->exp_obd, oa, __FUNCTION__, 1); if (IS_ERR(dentry)) RETURN(PTR_ERR(dentry)); @@ -3332,16 +3387,16 @@ int filter_setattr(struct obd_export *exp, struct obd_info *oinfo, push_ctxt(&saved, &exp->exp_obd->obd_lvfs_ctxt, NULL); lock_kernel(); - if (oinfo->oi_oa->o_valid & + if (oa->o_valid & (OBD_MD_FLMTIME | OBD_MD_FLATIME | OBD_MD_FLCTIME)) { - fmd = filter_fmd_get(exp,oinfo->oi_oa->o_id,oinfo->oi_oa->o_gr); + fmd = filter_fmd_get(exp, oa->o_id, oa->o_gr); if (fmd && fmd->fmd_mactime_xid < oti->oti_xid) fmd->fmd_mactime_xid = oti->oti_xid; filter_fmd_put(exp, fmd); } /* setting objects attributes (including owner/group) */ - rc = filter_setattr_internal(exp, dentry, oinfo->oi_oa, oti); + rc = filter_setattr_internal(exp, dentry, oa, oti); if (rc) GOTO(out_unlock, rc); @@ -3355,10 +3410,10 @@ int filter_setattr(struct obd_export *exp, struct obd_info *oinfo, ldlm_resource_putref(res); } - oinfo->oi_oa->o_valid = OBD_MD_FLID; + oa->o_valid = OBD_MD_FLID; /* Quota release need uid/gid info */ - obdo_from_inode(oinfo->oi_oa, dentry->d_inode, + obdo_from_inode(oa, dentry->d_inode, FILTER_VALID_FLAGS | OBD_MD_FLUID | OBD_MD_FLGID); EXIT; @@ -3435,7 +3490,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, ENTRY; LASSERT(oa); - LASSERT(oa->o_gr != 0); + LASSERT_MDS_GROUP(oa->o_gr); LASSERT(oa->o_valid & OBD_MD_FLGROUP); LASSERT(down_trylock(&filter->fo_create_locks[oa->o_gr]) != 0); @@ -3458,7 +3513,7 @@ static int filter_destroy_precreated(struct obd_export *exp, struct obdo *oa, for (id = last; id > oa->o_id; id--) { doa.o_id = id; - rc = filter_destroy(exp, &doa, NULL, NULL, NULL); + rc = filter_destroy(exp, &doa, NULL, NULL, NULL, NULL); if (rc && rc != -ENOENT) /* this is pretty fatal... 
*/ CEMERG("error destroying precreate objid "LPU64": %d\n", id, rc); @@ -3533,8 +3588,8 @@ static int filter_handle_precreate(struct obd_export *exp, struct obdo *oa, obd->obd_name); GOTO(out, rc = 0); } - /* only precreate if group == 0 and o_id is specified */ - if (group < FILTER_GROUP_MDS0 || oa->o_id == 0) + /* only precreate if group == 0 and o_id is specfied */ + if (group == FILTER_GROUP_LLOG || oa->o_id == 0) diff = 1; else diff = oa->o_id - filter_last_id(filter, group); @@ -3813,7 +3868,7 @@ static int filter_create(struct obd_export *exp, struct obdo *oa, CDEBUG(D_INODE, "%s: filter_create(od->o_gr="LPU64",od->o_id=" LPU64")\n", obd->obd_name, oa->o_gr, oa->o_id); - if (!(oa->o_valid & OBD_MD_FLGROUP) || group == 0) { + if (!(oa->o_valid & OBD_MD_FLGROUP)) { CERROR("!!! nid %s sent invalid object group %d\n", obd_export_nid2str(exp), group); RETURN(-EINVAL); @@ -3877,7 +3932,7 @@ static int filter_create(struct obd_export *exp, struct obdo *oa, int filter_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *md, struct obd_trans_info *oti, - struct obd_export *md_exp) + struct obd_export *md_exp, void *capa) { unsigned int qcids[MAXQUOTAS] = {0, 0}; struct obd_device *obd; @@ -3892,6 +3947,11 @@ int filter_destroy(struct obd_export *exp, struct obdo *oa, LASSERT(oa->o_valid & OBD_MD_FLGROUP); + rc = filter_auth_capa(exp, NULL, obdo_mdsno(oa), + (struct lustre_capa *)capa, CAPA_OPC_OSS_DESTROY); + if (rc) + RETURN(rc); + obd = exp->exp_obd; filter = &obd->u.filter; @@ -4036,9 +4096,8 @@ cleanup: qcids[GRPQUOTA] = oa->o_gid; rc2 = lquota_adjust(filter_quota_interface_ref, obd, qcids, NULL, rc, FSFILT_OP_UNLINK); - if (rc2) - CDEBUG(D_QUOTA, "filter adjust qunit! (rc:%d)\n", rc2); + CERROR("filter adjust qunit! (rc:%d)\n", rc2); return rc; } @@ -4060,13 +4119,10 @@ static int filter_truncate(struct obd_export *exp, struct obd_info *oinfo, ", o_size = "LPD64"\n", oinfo->oi_oa->o_id, oinfo->oi_oa->o_valid, oinfo->oi_policy.l_extent.start); - rc = filter_auth_capa(exp, NULL, oinfo_mdsno(oinfo), - oinfo_capa(oinfo), CAPA_OPC_OSS_TRUNC); - if (rc) - RETURN(rc); - oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start; + oinfo->oi_oa->o_valid |= OBD_FL_TRUNC; rc = filter_setattr(exp, oinfo, oti); + oinfo->oi_oa->o_valid &= ~OBD_FL_TRUNC; RETURN(rc); } @@ -4087,10 +4143,10 @@ static int filter_sync(struct obd_export *exp, struct obdo *oa, filter = &exp->exp_obd->u.filter; - /* an objid of zero is taken to mean "sync whole filesystem" */ + /* An objid of zero is taken to mean "sync whole filesystem" */ if (!oa || !(oa->o_valid & OBD_MD_FLID)) { rc = fsfilt_sync(exp->exp_obd, filter->fo_obt.obt_sb); - /* flush any remaining cancel messages out to the target */ + /* Flush any remaining cancel messages out to the target */ filter_sync_llogs(exp->exp_obd, exp); RETURN(rc); } @@ -4210,13 +4266,32 @@ static int filter_get_info(struct obd_export *exp, __u32 keylen, RETURN(-EINVAL); } +static inline int filter_setup_llog_group(struct obd_export *exp, + struct obd_device *obd, + int group) +{ + struct obd_llog_group *olg; + struct llog_ctxt *ctxt; + int rc; + + olg = filter_find_create_olg(obd, group); + if (IS_ERR(olg)) + RETURN(PTR_ERR(olg)); + + llog_group_set_export(olg, exp); + + ctxt = llog_group_get_ctxt(olg, LLOG_MDS_OST_REPL_CTXT); + LASSERTF(ctxt != NULL, "ctxt is null\n"); + + rc = llog_receptor_accept(ctxt, exp->exp_imp_reverse); + llog_ctxt_put(ctxt); + return rc; +} static int filter_set_info_async(struct obd_export *exp, __u32 keylen, void *key, __u32 vallen, void 
*val, struct ptlrpc_request_set *set) { struct obd_device *obd; - struct obd_llog_group *olg; - struct llog_ctxt *ctxt; int rc = 0, group; ENTRY; @@ -4235,6 +4310,12 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen, if (KEY_IS(KEY_REVIMP_UPD)) { filter_revimp_update(exp); + lquota_clearinfo(filter_quota_interface_ref, exp, exp->exp_obd); + RETURN(0); + } + + if (KEY_IS(KEY_SPTLRPC_CONF)) { + filter_adapt_sptlrpc_conf(obd, 0); RETURN(0); } @@ -4247,23 +4328,20 @@ static int filter_set_info_async(struct obd_export *exp, __u32 keylen, /* setup llog imports */ LASSERT(val != NULL); - group = (int)(*(__u32 *)val); - LASSERT(group >= FILTER_GROUP_MDS0); - - olg = filter_find_create_olg(obd, group); - if (IS_ERR(olg)) - RETURN(PTR_ERR(olg)); - - llog_group_set_export(olg, exp); - ctxt = llog_group_get_ctxt(olg, LLOG_MDS_OST_REPL_CTXT); - LASSERTF(ctxt != NULL, "ctxt is null\n"); - - rc = llog_receptor_accept(ctxt, exp->exp_imp_reverse); - llog_ctxt_put(ctxt); + group = (int)(*(__u32 *)val); + LASSERT_MDS_GROUP(group); + rc = filter_setup_llog_group(exp, obd, group); + if (rc) + goto out; - lquota_setinfo(filter_quota_interface_ref, exp, obd); + lquota_setinfo(filter_quota_interface_ref, obd, exp); + if (group == FILTER_GROUP_MDS0) { + /* setup llog group 1 for interop */ + filter_setup_llog_group(exp, obd, FILTER_GROUP_LLOG); + } +out: RETURN(rc); } @@ -4370,39 +4448,13 @@ static int filter_process_config(struct obd_device *obd, obd_count len, int rc = 0; switch (lcfg->lcfg_command) { - case LCFG_SPTLRPC_CONF: { - struct filter_obd *filter = &obd->u.filter; - struct sptlrpc_conf_log *log; - struct sptlrpc_rule_set tmp_rset; - - log = sptlrpc_conf_log_extract(lcfg); - if (IS_ERR(log)) { - rc = PTR_ERR(log); - break; - } - - sptlrpc_rule_set_init(&tmp_rset); - - rc = sptlrpc_rule_set_from_log(&tmp_rset, log); - if (rc) { - CERROR("obd %s: failed get sptlrpc rules: %d\n", - obd->obd_name, rc); - break; - } - - write_lock(&filter->fo_sptlrpc_lock); - sptlrpc_rule_set_free(&filter->fo_sptlrpc_rset); - filter->fo_sptlrpc_rset = tmp_rset; - write_unlock(&filter->fo_sptlrpc_lock); - - sptlrpc_target_update_exp_flavor(obd, &tmp_rset); - break; - } default: lprocfs_filter_init_vars(&lvars); rc = class_process_proc_param(PARAM_OST, lvars.obd_vars, lcfg, obd); + if (rc > 0) + rc = 0; break; } diff --git a/lustre/obdfilter/filter_capa.c b/lustre/obdfilter/filter_capa.c index 1fbddb6..bc43a6a 100644 --- a/lustre/obdfilter/filter_capa.c +++ b/lustre/obdfilter/filter_capa.c @@ -128,6 +128,9 @@ int filter_auth_capa(struct obd_export *exp, struct lu_fid *fid, __u64 mdsid, if (!filter->fo_fl_oss_capa) RETURN(0); + if (!(exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA)) + RETURN(0); + if (capa == NULL) { if (fid) CERROR("mdsno/fid/opc "LPU64"/"DFID"/"LPX64 @@ -164,8 +167,13 @@ int filter_auth_capa(struct obd_export *exp, struct lu_fid *fid, __u64 mdsid, RETURN(rc); } + if (capa_is_expired_sec(capa)) { + DEBUG_CAPA(D_ERROR, capa, "expired"); + RETURN(-ESTALE); + } + spin_lock(&capa_lock); - list_for_each_entry(k, &filter->fo_capa_keys, k_list) + list_for_each_entry(k, &filter->fo_capa_keys, k_list) { if (k->k_key.lk_mdsid == mdsid) { keys_ready = 1; if (k->k_key.lk_keyid == capa_keyid(capa)) { @@ -174,6 +182,7 @@ int filter_auth_capa(struct obd_export *exp, struct lu_fid *fid, __u64 mdsid, break; } } + } spin_unlock(&capa_lock); if (!keys_ready) { @@ -212,6 +221,64 @@ int filter_auth_capa(struct obd_export *exp, struct lu_fid *fid, __u64 mdsid, RETURN(0); } +int filter_capa_fixoa(struct 
obd_export *exp, struct obdo *oa, __u64 mdsid, + struct lustre_capa *capa) +{ + int rc = 0; + ENTRY; + + if (!(exp->exp_connect_flags & OBD_CONNECT_OSS_CAPA)) + RETURN(0); + + if (unlikely(!capa)) + RETURN(-EACCES); + + if (capa_flags(capa) == LC_ID_CONVERT) { + struct obd_device *obd = exp->exp_obd; + struct filter_obd *filter = &obd->u.filter; + struct filter_capa_key *k; + int found = 0; + + spin_lock(&capa_lock); + list_for_each_entry(k, &filter->fo_capa_keys, k_list) { + if (k->k_key.lk_mdsid == mdsid && + k->k_key.lk_keyid == capa_keyid(capa)) { + found = 1; + break; + } + } + spin_unlock(&capa_lock); + + if (found) { + union { + __u64 id64; + __u32 id32[2]; + } uid, gid; + __u32 d[4], s[4]; + + uid.id64 = capa_uid(capa); + gid.id64 = capa_gid(capa); + s[0] = uid.id32[0]; + s[1] = uid.id32[1]; + s[2] = gid.id32[0]; + s[3] = gid.id32[1]; + + rc = capa_decrypt_id(d, s, k->k_key.lk_key, + CAPA_HMAC_KEY_MAX_LEN); + if (unlikely(rc)) + RETURN(rc); + + oa->o_uid = d[0]; + oa->o_gid = d[2]; + } else { + DEBUG_CAPA(D_ERROR, capa, "no matched capability key for"); + rc = -ESTALE; + } + } + + RETURN(rc); +} + void filter_free_capa_keys(struct filter_obd *filter) { struct filter_capa_key *key, *n; diff --git a/lustre/obdfilter/filter_internal.h b/lustre/obdfilter/filter_internal.h index fb82e44..28578d6 100644 --- a/lustre/obdfilter/filter_internal.h +++ b/lustre/obdfilter/filter_internal.h @@ -150,7 +150,7 @@ int filter_common_setup(struct obd_device *, struct lustre_cfg *lcfg, void *option); int filter_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *md, struct obd_trans_info *, - struct obd_export *); + struct obd_export *, void *); int filter_setattr_internal(struct obd_export *exp, struct dentry *dentry, struct obdo *oa, struct obd_trans_info *oti); int filter_setattr(struct obd_export *exp, struct obd_info *oinfo, @@ -235,15 +235,11 @@ static void lprocfs_filter_init_vars(struct lprocfs_static_vars *lvars) /* Quota stuff */ extern quota_interface_t *filter_quota_interface_ref; -/* Capability */ -static inline __u64 obdo_mdsno(struct obdo *oa) -{ - return oa->o_gr - FILTER_GROUP_MDS0; -} - int filter_update_capa_key(struct obd_device *obd, struct lustre_capa_key *key); int filter_auth_capa(struct obd_export *exp, struct lu_fid *fid, __u64 mdsid, struct lustre_capa *capa, __u64 opc); +int filter_capa_fixoa(struct obd_export *exp, struct obdo *oa, __u64 mdsid, + struct lustre_capa *capa); void filter_free_capa_keys(struct filter_obd *filter); void blacklist_add(uid_t uid); diff --git a/lustre/obdfilter/filter_io.c b/lustre/obdfilter/filter_io.c index 74d5aa8..07b2f9c 100644 --- a/lustre/obdfilter/filter_io.c +++ b/lustre/obdfilter/filter_io.c @@ -328,8 +328,11 @@ void filter_invalidate_cache(struct obd_device *obd, struct obd_ioobj *obj, LASSERT(inode != NULL); for (i = 0, rnb = nb; i < obj->ioo_bufcnt; i++, rnb++) { - obd_off start = rnb->offset >> CFS_PAGE_SHIFT; - obd_off end = (rnb->offset + rnb->len) >> CFS_PAGE_SHIFT; + obd_off start; + obd_off end; + + start = rnb->offset >> CFS_PAGE_SHIFT; + end = (rnb->offset + rnb->len) >> CFS_PAGE_SHIFT; invalidate_mapping_pages(inode->i_mapping, start, end); /* just to avoid warnings */ start = 0; @@ -345,7 +348,6 @@ static int filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, struct lustre_capa *capa) { struct obd_device *obd = exp->exp_obd; - struct filter_obd *fo = &obd->u.filter; struct timeval start, end; struct lvfs_run_ctxt saved; struct niobuf_local *lnb; @@ -467,10 +469,6 @@ static int 
filter_preprw_read(int cmd, struct obd_export *exp, struct obdo *oa, } } - if (inode && (fo->fo_read_cache == 0 || - i_size_read(inode) > fo->fo_readcache_max_filesize)) - filter_invalidate_cache(obd, obj, nb, inode); - if (rc != 0) { if (dentry != NULL) f_dput(dentry); @@ -655,6 +653,13 @@ static int filter_preprw_write(int cmd, struct obd_export *exp, struct obdo *oa, GOTO(cleanup, rc = -ENOENT); } + if (oa->o_valid & (OBD_MD_FLUID | OBD_MD_FLGID) && + dentry->d_inode->i_mode & (S_ISUID | S_ISGID)) { + rc = filter_capa_fixoa(exp, oa, obdo_mdsno(oa), capa); + if (rc) + GOTO(cleanup, rc); + } + rc = filter_map_remote_to_local(objcount, obj, nb, npages, res); if (rc) GOTO(cleanup, rc); @@ -830,6 +835,7 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa, int npages, struct niobuf_local *res, struct obd_trans_info *oti, int rc) { + struct filter_obd *fo = &exp->exp_obd->u.filter; struct inode *inode = NULL; struct ldlm_res_id res_id; struct ldlm_resource *resource = NULL; @@ -846,7 +852,9 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa, resource = ldlm_resource_get(ns, NULL, &res_id, LDLM_EXTENT, 0); if (resource != NULL) { + LDLM_RESOURCE_ADDREF(resource); ns->ns_lvbo->lvbo_update(resource, NULL, 0, 1); + LDLM_RESOURCE_DELREF(resource); ldlm_resource_putref(resource); } } @@ -861,6 +869,10 @@ static int filter_commitrw_read(struct obd_export *exp, struct obdo *oa, } } + if (inode && (fo->fo_read_cache == 0 || + i_size_read(inode) > fo->fo_readcache_max_filesize)) + filter_invalidate_cache(exp->exp_obd, obj, rnb, inode); + if (res->dentry != NULL) f_dput(res->dentry); RETURN(rc); diff --git a/lustre/obdfilter/filter_io_26.c b/lustre/obdfilter/filter_io_26.c index 8940804..aa49c15 100644 --- a/lustre/obdfilter/filter_io_26.c +++ b/lustre/obdfilter/filter_io_26.c @@ -246,7 +246,8 @@ void filter_free_iobuf(struct filter_iobuf *iobuf) void filter_iobuf_put(struct filter_obd *filter, struct filter_iobuf *iobuf, struct obd_trans_info *oti) { - int thread_id = oti ? oti->oti_thread_id : -1; + int thread_id = (oti && oti->oti_thread) ? 
+ oti->oti_thread->t_id : -1; if (unlikely(thread_id < 0)) { filter_free_iobuf(iobuf); @@ -556,7 +557,8 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, struct obd_device *obd = exp->exp_obd; struct filter_obd *fo = &obd->u.filter; void *wait_handle; - int total_size = 0, rc2; + int total_size = 0; + int rec_pending = 0; unsigned int qcids[MAXQUOTAS] = {0, 0}; ENTRY; @@ -567,21 +569,11 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, if (rc != 0) GOTO(cleanup, rc); - /* Unfortunately, if quota master is too busy to handle the - * pre-dqacq in time and quota hash on ost is used up, we - * have to wait for the completion of in flight dqacq/dqrel, - * then try again */ - if ((rc2 = lquota_chkquota(filter_quota_interface_ref, obd, oa->o_uid, - oa->o_gid, niocount)) == QUOTA_RET_ACQUOTA) { - OBD_FAIL_TIMEOUT(OBD_FAIL_OST_HOLD_WRITE_RPC, 90); - lquota_acquire(filter_quota_interface_ref, obd, oa->o_uid, - oa->o_gid); - } - - if (rc2 < 0) { - rc = rc2; - GOTO(cleanup, rc); - } + /* we try to get enough quota to write here, and let ldiskfs + * decide if it is out of quota or not b=14783 */ + lquota_chkquota(filter_quota_interface_ref, obd, oa->o_uid, + oa->o_gid, niocount, &rec_pending, oti, + LQUOTA_FLAGS_BLK); iobuf = filter_iobuf_get(&obd->u.filter, oti); if (IS_ERR(iobuf)) @@ -595,9 +587,10 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, iobuf->dr_ignore_quota = 0; for (i = 0, lnb = res; i < niocount; i++, lnb++) { loff_t this_size; + __u32 flags = lnb->flags; /* If overwriting an existing block, we don't need a grant */ - if (!(lnb->flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC && + if (!(flags & OBD_BRW_GRANTED) && lnb->rc == -ENOSPC && filter_range_is_mapped(inode, lnb->offset, lnb->len)) lnb->rc = 0; @@ -627,10 +620,15 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, if (this_size > iattr.ia_size) iattr.ia_size = this_size; - /* if one page is a write-back page from client cache, or it's - * written by root, then mark the whole io request as ignore - * quota request */ - if (lnb->flags & (OBD_BRW_FROM_GRANT | OBD_BRW_NOQUOTA)) + /* if one page is a write-back page from client cache and + * not from direct_io, or it's written by root, then mark + * the whole io request as ignore quota request, remote + * client cannot break through quota.
*/ + if (exp_connect_rmtclient(exp)) + flags &= ~OBD_BRW_NOQUOTA; + if ((flags & OBD_BRW_NOQUOTA) || + (flags & (OBD_BRW_FROM_GRANT | OBD_BRW_SYNC)) == + OBD_BRW_FROM_GRANT) iobuf->dr_ignore_quota = 1; } @@ -721,6 +719,10 @@ int filter_commitrw_write(struct obd_export *exp, struct obdo *oa, fsfilt_check_slow(obd, now, "commitrw commit"); cleanup: + if (rec_pending) + lquota_pending_commit(filter_quota_interface_ref, obd, oa->o_uid, + oa->o_gid, niocount, 1); + filter_grant_commit(exp, niocount, res); switch (cleanup_phase) { diff --git a/lustre/obdfilter/filter_log.c b/lustre/obdfilter/filter_log.c index 2e1faf0..2462a07 100644 --- a/lustre/obdfilter/filter_log.c +++ b/lustre/obdfilter/filter_log.c @@ -173,7 +173,7 @@ static int filter_recov_log_unlink_cb(struct llog_ctxt *ctxt, oa->o_lcookie = *cookie; oid = oa->o_id; - rc = filter_destroy(exp, oa, NULL, NULL, NULL); + rc = filter_destroy(exp, oa, NULL, NULL, NULL, NULL); OBDO_FREE(oa); if (rc == -ENOENT) { CDEBUG(D_RPCTRACE, "object already removed, send cookie\n"); @@ -195,24 +195,34 @@ static int filter_recov_log_setattr_cb(struct llog_ctxt *ctxt, { struct obd_device *obd = ctxt->loc_obd; struct obd_export *exp = obd->obd_self_export; - struct llog_setattr_rec *lsr; struct obd_info oinfo = { { { 0 } } }; obd_id oid; int rc = 0; ENTRY; - lsr = (struct llog_setattr_rec *)rec; OBDO_ALLOC(oinfo.oi_oa); if (oinfo.oi_oa == NULL) RETURN(-ENOMEM); + if (rec->lrh_type == MDS_SETATTR_REC) { + struct llog_setattr_rec *lsr = (struct llog_setattr_rec *)rec; + + oinfo.oi_oa->o_id = lsr->lsr_oid; + oinfo.oi_oa->o_gr = lsr->lsr_ogen; + oinfo.oi_oa->o_uid = lsr->lsr_uid; + oinfo.oi_oa->o_gid = lsr->lsr_gid; + } else { + struct llog_setattr64_rec *lsr = (struct llog_setattr64_rec *)rec; + + oinfo.oi_oa->o_id = lsr->lsr_oid; + oinfo.oi_oa->o_gr = lsr->lsr_ogen; + oinfo.oi_oa->o_uid = lsr->lsr_uid; + oinfo.oi_oa->o_gid = lsr->lsr_gid; + } + oinfo.oi_oa->o_valid |= (OBD_MD_FLID | OBD_MD_FLUID | OBD_MD_FLGID | OBD_MD_FLCOOKIE); - oinfo.oi_oa->o_id = lsr->lsr_oid; - oinfo.oi_oa->o_gr = lsr->lsr_ogen; oinfo.oi_oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP; - oinfo.oi_oa->o_uid = lsr->lsr_uid; - oinfo.oi_oa->o_gid = lsr->lsr_gid; oinfo.oi_oa->o_lcookie = *cookie; oid = oinfo.oi_oa->o_id; @@ -257,6 +267,7 @@ int filter_recov_log_mds_ost_cb(struct llog_handle *llh, rc = filter_recov_log_unlink_cb(ctxt, rec, &cookie); break; case MDS_SETATTR_REC: + case MDS_SETATTR64_REC: rc = filter_recov_log_setattr_cb(ctxt, rec, &cookie); break; case LLOG_GEN_REC: { diff --git a/lustre/obdfilter/lproc_obdfilter.c b/lustre/obdfilter/lproc_obdfilter.c index cbe0753..5e5f1c7 100644 --- a/lustre/obdfilter/lproc_obdfilter.c +++ b/lustre/obdfilter/lproc_obdfilter.c @@ -103,8 +103,14 @@ static int lprocfs_filter_rd_last_id(char *page, char **start, off_t off, if (obd == NULL) return 0; + rc = snprintf(page, count, LPU64"\n",filter_last_id(filter, 0)); + if (rc < 0) + return rc; + page += rc; + count -= rc; + retval += rc; - for (i = FILTER_GROUP_MDS0; i < filter->fo_group_count; i++) { + for (i = FILTER_GROUP_MDS1_N_BASE + 1; i < filter->fo_group_count; i++) { rc = snprintf(page, count, LPU64"\n",filter_last_id(filter, i)); if (rc < 0) { retval = rc; @@ -143,7 +149,6 @@ int lprocfs_filter_wr_readcache(struct file *file, const char *buffer, return count; } - int lprocfs_filter_rd_fmd_max_num(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -242,6 +247,37 @@ static int lprocfs_filter_rd_capa_count(char *page, char **start, off_t off, 
capa_count[CAPA_SITE_SERVER]); } +static int lprocfs_rd_sec_level(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + + return snprintf(page, count, "%d\n", obd->u.filter.fo_sec_level); +} + +static int lprocfs_wr_sec_level(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + int val, rc; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val > LUSTRE_SEC_ALL || val < LUSTRE_SEC_NONE) + return -EINVAL; + + if (val == LUSTRE_SEC_SPECIFY) { + CWARN("security level %d will be supported in future.\n", + LUSTRE_SEC_SPECIFY); + return -EINVAL; + } + + obd->u.filter.fo_sec_level = val; + return count; +} + static int lprocfs_filter_rd_cache(char *page, char **start, off_t off, int count, int *eof, void *data) { @@ -318,11 +354,8 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = { lprocfs_filter_rd_readcache, lprocfs_filter_wr_readcache, 0 }, #ifdef HAVE_QUOTA_SUPPORT - { "quota_bunit_sz", lprocfs_rd_bunit, lprocfs_wr_bunit, 0}, - { "quota_btune_sz", lprocfs_rd_btune, lprocfs_wr_btune, 0}, - { "quota_iunit_sz", lprocfs_rd_iunit, lprocfs_wr_iunit, 0}, - { "quota_itune_sz", lprocfs_rd_itune, lprocfs_wr_itune, 0}, - { "quota_type", lprocfs_rd_type, lprocfs_wr_type, 0}, + { "quota_type", lprocfs_quota_rd_type, + lprocfs_quota_wr_type, 0}, #endif { "client_cache_count", lprocfs_filter_rd_fmd_max_num, lprocfs_filter_wr_fmd_max_num, 0 }, @@ -331,6 +364,8 @@ static struct lprocfs_vars lprocfs_filter_obd_vars[] = { { "capa", lprocfs_filter_rd_capa, lprocfs_filter_wr_capa, 0 }, { "capa_count", lprocfs_filter_rd_capa_count, 0, 0 }, + { "sec_level", lprocfs_rd_sec_level, + lprocfs_wr_sec_level, 0 }, { "read_cache_enable", lprocfs_filter_rd_cache, lprocfs_filter_wr_cache, 0}, { "writethrough_cache_enable", lprocfs_filter_rd_wcache, lprocfs_filter_wr_wcache, 0}, diff --git a/lustre/osc/Makefile.in b/lustre/osc/Makefile.in index 2eb2eea..438ce4c 100644 --- a/lustre/osc/Makefile.in +++ b/lustre/osc/Makefile.in @@ -1,4 +1,4 @@ MODULES := osc -osc-objs := osc_request.o lproc_osc.o osc_create.o cache.o +osc-objs := osc_request.o lproc_osc.o osc_create.o osc_dev.o osc_object.o osc_page.o osc_lock.o osc_io.o @INCLUDE_RULES@ diff --git a/lustre/osc/autoMakefile.am b/lustre/osc/autoMakefile.am index 65c588b..cf370ba 100644 --- a/lustre/osc/autoMakefile.am +++ b/lustre/osc/autoMakefile.am @@ -36,7 +36,8 @@ if LIBLUSTRE noinst_LIBRARIES = libosc.a -libosc_a_SOURCES = osc_request.c osc_create.c osc_internal.h cache.c +libosc_a_SOURCES = osc_request.c osc_create.c osc_internal.h osc_cl_internal.h osc_dev.c osc_object.c osc_page.c osc_lock.c osc_io.c + libosc_a_CPPFLAGS = $(LLCPPFLAGS) libosc_a_CFLAGS = $(LLCFLAGS) endif @@ -52,6 +53,11 @@ macos_PROGRAMS := osc osc_SOURCES := \ osc_create.c \ + osc_dev.c \ + osc_object.c \ + osc_page.c \ + osc_lock.c \ + osc_io.c \ osc_request.c osc_CFLAGS := $(EXTRA_KCFLAGS) @@ -69,4 +75,4 @@ endif install-data-hook: $(install_data_hook) MOSTLYCLEANFILES := @MOSTLYCLEANFILES@ -DIST_SOURCES = $(osc-objs:%.o=%.c) osc_internal.h +DIST_SOURCES = $(osc-objs:%.o=%.c) osc_internal.h osc_cl_internal.h diff --git a/lustre/osc/cache.c b/lustre/osc/cache.c deleted file mode 100644 index 90fb60a..0000000 --- a/lustre/osc/cache.c +++ /dev/null @@ -1,445 +0,0 @@ -/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- - * vim:expandtab:shiftwidth=8:tabstop=8: - * - * GPL HEADER START - * - * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE 
HEADER. - * - * This program is free software; you can redistribute it and/or modify - * it under the terms of the GNU General Public License version 2 only, - * as published by the Free Software Foundation. - * - * This program is distributed in the hope that it will be useful, but - * WITHOUT ANY WARRANTY; without even the implied warranty of - * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU - * General Public License version 2 for more details (a copy is included - * in the LICENSE file that accompanied this code). - * - * You should have received a copy of the GNU General Public License - * version 2 along with this program; If not, see - * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf - * - * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, - * CA 95054 USA or visit www.sun.com if you need additional information or - * have any questions. - * - * GPL HEADER END - */ -/* - * Copyright 2008 Sun Microsystems, Inc. All rights reserved - * Use is subject to license terms. - */ -/* - * This file is part of Lustre, http://www.lustre.org/ - * Lustre is a trademark of Sun Microsystems, Inc. - * - * lustre/osc/cache.c - * - * Cache of triples - object, lock, extent - */ - -#ifndef EXPORT_SYMTAB -# define EXPORT_SYMTAB -#endif -#define DEBUG_SUBSYSTEM S_OSC - -#ifdef __KERNEL__ -# include -# include -# include -#else /* __KERNEL__ */ -# include -#endif - -#include -#include -#include -#include - -#include "osc_internal.h" - -/* Adding @lock to the @cache */ -int cache_add_lock(struct lustre_cache *cache, struct lustre_handle *lockh) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - - if (!lock) // Lock disappeared under us. - return 0; - - spin_lock(&cache->lc_locks_list_lock); - list_add_tail(&lock->l_cache_locks_list, &cache->lc_locks_list); - spin_unlock(&cache->lc_locks_list_lock); - - LDLM_LOCK_PUT(lock); - - return 0; -} - -/* Tries to add @extent to lock represented by @lockh if non-NULL, otherwise - just tries to match some suitable lock by resource and data contained in - @extent */ -/* Should be called with oap->lock held (except on initial addition, see - comment in osc_request.c*/ -int cache_add_extent(struct lustre_cache *cache, struct ldlm_res_id *res, - struct osc_async_page *extent, struct lustre_handle *lockh) -{ - struct lustre_handle tmplockh; - ldlm_policy_data_t tmpex; - struct ldlm_lock *lock = NULL; - ENTRY; - - /* Don't add anything second time */ - if (!list_empty(&extent->oap_page_list)) { - LBUG(); - RETURN(0); - } - - if (lockh && lustre_handle_is_used(lockh)) { - lock = ldlm_handle2lock(lockh); - if (!lock) - RETURN(-ENOLCK); - - LASSERTF(lock->l_policy_data.l_extent.start <= - extent->oap_obj_off && - extent->oap_obj_off + CFS_PAGE_SIZE - 1 <= - lock->l_policy_data.l_extent.end, - "Got wrong lock [" LPU64 "," LPU64 "] for page with " - "offset " LPU64 "\n", - lock->l_policy_data.l_extent.start, - lock->l_policy_data.l_extent.end, extent->oap_obj_off); - } else { - int mode; - /* Real extent width calculation here once we have real - * extents - */ - tmpex.l_extent.start = extent->oap_obj_off; - tmpex.l_extent.end = tmpex.l_extent.start + CFS_PAGE_SIZE - 1; - - /* XXX find lock from extent or something like that */ - /* The lock mode does not matter. If this is dirty page - then - * there could be only one PW lock. 
If the page is clean, - * any PR lock is good - */ - - mode = ldlm_lock_match(cache->lc_obd->obd_namespace, - LDLM_FL_BLOCK_GRANTED | - LDLM_FL_CBPENDING, res, LDLM_EXTENT, - &tmpex, LCK_PW | LCK_PR, &tmplockh); - - if (mode <= 0) { - CDEBUG(D_CACHE, "No lock to attach " LPU64 "->" LPU64 - " extent to!\n", tmpex.l_extent.start, - tmpex.l_extent.end); - RETURN((mode < 0) ? mode : -ENOLCK); - } - - lock = ldlm_handle2lock(&tmplockh); - if (!lock) { // Race - lock disappeared under us (eviction?) - CDEBUG(D_CACHE, "Newly matched lock just disappeared " - "under us\n"); - RETURN(-ENOLCK); - } - ldlm_lock_decref(&tmplockh, mode); - } - - spin_lock(&lock->l_extents_list_lock); - list_add_tail(&extent->oap_page_list, &lock->l_extents_list); - spin_unlock(&lock->l_extents_list_lock); - extent->oap_ldlm_lock = lock; - LDLM_LOCK_PUT(lock); - - RETURN(0); -} - -static void cache_extent_removal_get(struct page_removal_cb_element *element) -{ - atomic_inc(&element->prce_refcnt); -} - -static void cache_extent_removal_put(struct page_removal_cb_element *element) -{ - if(atomic_dec_and_test(&element->prce_refcnt)) - OBD_FREE_PTR(element); -} - -static int cache_extent_removal_event(struct lustre_cache *cache, - void *data, int discard) -{ - struct page *page = data; - struct list_head *iter; - struct page_removal_cb_element *element; - - read_lock(&cache->lc_page_removal_cb_lock); - iter = cache->lc_page_removal_callback_list.next; - while(iter != &cache->lc_page_removal_callback_list) { - element = list_entry(iter, struct page_removal_cb_element, prce_list); - cache_extent_removal_get(element); - read_unlock(&cache->lc_page_removal_cb_lock); - - element->prce_callback(page, discard); - - read_lock(&cache->lc_page_removal_cb_lock); - iter = iter->next; - cache_extent_removal_put(element); - } - read_unlock(&cache->lc_page_removal_cb_lock); - - return 0; -} - -/* Registers set of pin/remove callbacks for extents. Current limitation is - there could be only one pin_cb per cache. - @pin_cb is called when we have the page locked to pin it in memory so that - it does not disappear after we release page lock (which we need to do - to avoid deadlocks). - @func_cb is removal callback that is called after page and all spinlocks are - released, and is supposed to clean the page and remove it from all - (vfs) caches it might be in */ -int cache_add_extent_removal_cb(struct lustre_cache *cache, - obd_page_removal_cb_t func_cb, - obd_pin_extent_cb pin_cb) -{ - struct page_removal_cb_element *element; - - if (!func_cb) - return 0; - - OBD_ALLOC_PTR(element); - if (!element) - return -ENOMEM; - element->prce_callback = func_cb; - atomic_set(&element->prce_refcnt, 1); - - write_lock(&cache->lc_page_removal_cb_lock); - list_add_tail(&element->prce_list, - &cache->lc_page_removal_callback_list); - write_unlock(&cache->lc_page_removal_cb_lock); - - cache->lc_pin_extent_cb = pin_cb; - return 0; -} -EXPORT_SYMBOL(cache_add_extent_removal_cb); - -/* Unregister exntent removal callback registered earlier. 
If the list of - registered removal callbacks becomes empty, we also clear pin callback - since it could only be one */ -int cache_del_extent_removal_cb(struct lustre_cache *cache, - obd_page_removal_cb_t func_cb) -{ - int found = 0; - struct page_removal_cb_element *element, *t; - - write_lock(&cache->lc_page_removal_cb_lock); - list_for_each_entry_safe(element, t, - &cache->lc_page_removal_callback_list, - prce_list) { - if (element->prce_callback == func_cb) { - list_del(&element->prce_list); - write_unlock(&cache->lc_page_removal_cb_lock); - found = 1; - cache_extent_removal_put(element); - write_lock(&cache->lc_page_removal_cb_lock); - /* We continue iterating the list in case this function - was registered more than once */ - } - } - write_unlock(&cache->lc_page_removal_cb_lock); - - if (list_empty(&cache->lc_page_removal_callback_list)) - cache->lc_pin_extent_cb = NULL; - - return !found; -} -EXPORT_SYMBOL(cache_del_extent_removal_cb); - -static int cache_remove_extent_nolock(struct lustre_cache *cache, - struct osc_async_page *extent) -{ - int have_lock = !!extent->oap_ldlm_lock; - /* We used to check oap_ldlm_lock for non NULL here, but it might be - NULL, in fact, due to parallel page eviction clearing it and waiting - on a lock's page list lock */ - extent->oap_ldlm_lock = NULL; - - if (!list_empty(&extent->oap_page_list)) - list_del_init(&extent->oap_page_list); - - return have_lock; -} - -/* Request the @extent to be removed from cache and locks it belongs to. */ -void cache_remove_extent(struct lustre_cache *cache, - struct osc_async_page *extent) -{ - struct ldlm_lock *lock; - - spin_lock(&extent->oap_lock); - lock = extent->oap_ldlm_lock; - - extent->oap_ldlm_lock = NULL; - spin_unlock(&extent->oap_lock); - - /* No lock - means this extent is not in any list */ - if (!lock) - return; - - spin_lock(&lock->l_extents_list_lock); - if (!list_empty(&extent->oap_page_list)) - list_del_init(&extent->oap_page_list); - spin_unlock(&lock->l_extents_list_lock); -} - -/* iterate through list of extents in given lock identified by @lockh, - calling @cb_func for every such extent. also passed @data to every call. - stops iterating prematurely if @cb_func returns nonzero. */ -int cache_iterate_extents(struct lustre_cache *cache, - struct lustre_handle *lockh, - cache_iterate_extents_cb_t cb_func, void *data) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - struct osc_async_page *extent, *t; - - if (!lock) // Lock disappeared - return 0; - /* Parallel page removal from mem pressure can race with us */ - spin_lock(&lock->l_extents_list_lock); - list_for_each_entry_safe(extent, t, &lock->l_extents_list, - oap_page_list) { - if (cb_func(cache, lockh, extent, data)) - break; - } - spin_unlock(&lock->l_extents_list_lock); - LDLM_LOCK_PUT(lock); - - return 0; -} - -static int cache_remove_extents_from_lock(struct lustre_cache *cache, - struct ldlm_lock *lock, void *data) -{ - struct osc_async_page *extent; - void *ext_data; - - LASSERT(lock); - - spin_lock(&lock->l_extents_list_lock); - while (!list_empty(&lock->l_extents_list)) { - extent = list_entry(lock->l_extents_list.next, - struct osc_async_page, oap_page_list); - - spin_lock(&extent->oap_lock); - /* If there is no lock referenced from this oap, it means - there is parallel page-removal process waiting to free that - page on l_extents_list_lock and it holds page lock. - We need this page to completely go away and for that to - happen we will just try to truncate it here too. 
- Serialisation on page lock will achieve that goal for us. */ - /* Try to add extent back to the cache first, but only if we - * cancel read lock, write locks cannot have other overlapping - * locks. If adding is not possible (or canceling pw lock), - * then remove extent from cache */ - if (!cache_remove_extent_nolock(cache, extent) || - (lock->l_granted_mode == LCK_PW) || - cache_add_extent(cache, &lock->l_resource->lr_name, extent, - NULL)) { - /* We need to remember this oap_page value now, - once we release spinlocks, extent struct - might be freed and we endup requesting - page with address 0x5a5a5a5a in - cache_extent_removal_event */ - ext_data = extent->oap_page; - cache->lc_pin_extent_cb(extent->oap_page); - spin_unlock(&extent->oap_lock); - spin_unlock(&lock->l_extents_list_lock); - cache_extent_removal_event(cache, ext_data, - lock-> - l_flags & - LDLM_FL_DISCARD_DATA); - spin_lock(&lock->l_extents_list_lock); - } else { - spin_unlock(&extent->oap_lock); - } - } - spin_unlock(&lock->l_extents_list_lock); - - return 0; -} - -/* Remoes @lock from cache after necessary checks. */ -int cache_remove_lock(struct lustre_cache *cache, struct lustre_handle *lockh) -{ - struct ldlm_lock *lock = ldlm_handle2lock(lockh); - - if (!lock) // The lock was removed by somebody just now, nothing to do - return 0; - - cache_remove_extents_from_lock(cache, lock, NULL /*data */ ); - - spin_lock(&cache->lc_locks_list_lock); - list_del_init(&lock->l_cache_locks_list); - spin_unlock(&cache->lc_locks_list_lock); - - LDLM_LOCK_PUT(lock); - - return 0; -} - -/* Supposed to iterate through all locks in the cache for given resource. - Not implemented atthe moment. */ -int cache_iterate_locks(struct lustre_cache *cache, struct ldlm_res_id *res, - cache_iterate_locks_cb_t cb_fun, void *data) -{ - return -ENOTSUPP; -} - -/* Create lustre cache and attach it to @obd */ -struct lustre_cache *cache_create(struct obd_device *obd) -{ - struct lustre_cache *cache; - - OBD_ALLOC(cache, sizeof(*cache)); - if (!cache) - GOTO(out, NULL); - spin_lock_init(&cache->lc_locks_list_lock); - CFS_INIT_LIST_HEAD(&cache->lc_locks_list); - CFS_INIT_LIST_HEAD(&cache->lc_page_removal_callback_list); - rwlock_init(&cache->lc_page_removal_cb_lock); - cache->lc_obd = obd; - - out: - return cache; -} - -/* Destroy @cache and free its memory */ -int cache_destroy(struct lustre_cache *cache) -{ - if (cache) { - spin_lock(&cache->lc_locks_list_lock); - if (!list_empty(&cache->lc_locks_list)) { - struct ldlm_lock *lock, *tmp; - CERROR("still have locks in the list on cleanup:\n"); - - list_for_each_entry_safe(lock, tmp, - &cache->lc_locks_list, - l_cache_locks_list) { - list_del_init(&lock->l_cache_locks_list); - /* XXX: Of course natural idea would be to print - offending locks here, but if we use - e.g. LDLM_ERROR, we will likely crash here, - as LDLM error tries to access e.g. - nonexisting namespace. Normally this kind of - case could only happen when somebody did not - release lock reference and we have other ways - to detect this. 
*/ - /* Make sure there are no pages left under the - lock */ - LASSERT(list_empty(&lock->l_extents_list)); - } - } - spin_unlock(&cache->lc_locks_list_lock); - LASSERT(list_empty(&cache->lc_page_removal_callback_list)); - OBD_FREE(cache, sizeof(*cache)); - } - - return 0; -} diff --git a/lustre/osc/lproc_osc.c b/lustre/osc/lproc_osc.c index e846e17..d17dbf8 100644 --- a/lustre/osc/lproc_osc.c +++ b/lustre/osc/lproc_osc.c @@ -479,6 +479,44 @@ static int osc_wr_resend_count(struct file *file, const char *buffer, return count; } +static int osc_rd_contention_seconds(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return snprintf(page, count, "%u\n", od->od_contention_time); +} + +static int osc_wr_contention_seconds(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_contention_time) ?: + count; +} + +static int osc_rd_lockless_truncate(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return snprintf(page, count, "%u\n", od->od_lockless_truncate); +} + +static int osc_wr_lockless_truncate(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = data; + struct osc_device *od = obd2osc_dev(obd); + + return lprocfs_write_helper(buffer, count, &od->od_lockless_truncate) ?: + count; +} + static struct lprocfs_vars lprocfs_osc_obd_vars[] = { { "uuid", lprocfs_rd_uuid, 0, 0 }, { "ping", 0, lprocfs_wr_ping, 0, 0, 0222 }, @@ -510,6 +548,10 @@ static struct lprocfs_vars lprocfs_osc_obd_vars[] = { { "checksum_type", osc_rd_checksum_type, osc_wd_checksum_type, 0 }, { "resend_count", osc_rd_resend_count, osc_wr_resend_count, 0}, { "timeouts", lprocfs_rd_timeouts, 0, 0 }, + { "contention_seconds", osc_rd_contention_seconds, + osc_wr_contention_seconds, 0 }, + { "lockless_truncate", osc_rd_lockless_truncate, + osc_wr_lockless_truncate, 0 }, { "import", lprocfs_rd_import, 0, 0 }, { 0 } }; @@ -637,10 +679,48 @@ static ssize_t osc_rpc_stats_seq_write(struct file *file, const char *buf, LPROC_SEQ_FOPS(osc_rpc_stats); +static int osc_stats_seq_show(struct seq_file *seq, void *v) +{ + struct timeval now; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + do_gettimeofday(&now); + + seq_printf(seq, "snapshot_time: %lu.%lu (secs.usecs)\n", + now.tv_sec, now.tv_usec); + seq_printf(seq, "lockless_write_bytes\t\t"LPU64"\n", + stats->os_lockless_writes); + seq_printf(seq, "lockless_read_bytes\t\t"LPU64"\n", + stats->os_lockless_reads); + seq_printf(seq, "lockless_truncate\t\t"LPU64"\n", + stats->os_lockless_truncates); + return 0; +} + +static ssize_t osc_stats_seq_write(struct file *file, const char *buf, + size_t len, loff_t *off) +{ + struct seq_file *seq = file->private_data; + struct obd_device *dev = seq->private; + struct osc_stats *stats = &obd2osc_dev(dev)->od_stats; + + memset(stats, 0, sizeof(*stats)); + return len; +} + +LPROC_SEQ_FOPS(osc_stats); + int lproc_osc_attach_seqstat(struct obd_device *dev) { - return lprocfs_obd_seq_create(dev, "rpc_stats", 0444, - &osc_rpc_stats_fops, dev); + int rc; + + rc = lprocfs_seq_create(dev->obd_proc_entry, "osc_stats", 0444, + &osc_stats_fops, dev); + if (rc == 0) + rc = lprocfs_obd_seq_create(dev, 
"rpc_stats", 0444, + &osc_rpc_stats_fops, dev); + return rc; } void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) diff --git a/lustre/osc/osc_cl_internal.h b/lustre/osc/osc_cl_internal.h new file mode 100644 index 0000000..6085101 --- /dev/null +++ b/lustre/osc/osc_cl_internal.h @@ -0,0 +1,420 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Internal interfaces of OSC layer. + * + * Author: Nikita Danilov + */ + +#ifndef OSC_CL_INTERNAL_H +#define OSC_CL_INTERNAL_H + +#ifdef __KERNEL__ +# include +#else +# include +#endif + +#include +/* osc_build_res_name() */ +#include +#include +#include "osc_internal.h" + +/** \addtogroup osc osc @{ */ + +/** + * State maintained by osc layer for each IO context. + */ +struct osc_io { + /** super class */ + struct cl_io_slice oi_cl; + /** true if this io is lockless. */ + int oi_lockless; + + struct obdo oi_oa; + struct osc_punch_cbargs { + int opc_rc; + struct completion opc_sync; + } oi_punch_cbarg; +}; + +/** + * State of transfer for osc. + */ +struct osc_req { + struct cl_req_slice or_cl; +}; + +/** + * State maintained by osc layer for the duration of a system call. + */ +struct osc_session { + struct osc_io os_io; +}; + +struct osc_thread_info { + struct ldlm_res_id oti_resname; + ldlm_policy_data_t oti_policy; + struct cl_lock_descr oti_descr; + struct cl_attr oti_attr; + struct lustre_handle oti_handle; + struct cl_lock_closure oti_closure; + struct cl_page_list oti_plist; +}; + +struct osc_object { + struct cl_object oo_cl; + struct lov_oinfo *oo_oinfo; + /** + * True if locking against this stripe got -EUSERS. + */ + int oo_contended; + cfs_time_t oo_contention_time; +#ifdef INVARIANT_CHECK + /** + * IO context used for invariant checks in osc_lock_has_pages(). + */ + struct cl_io oo_debug_io; + /** Serialization object for osc_object::oo_debug_io. */ + struct mutex oo_debug_mutex; +#endif + /** + * List of pages in transfer. + */ + struct list_head oo_inflight[CRT_NR]; + /** + * Lock, protecting ccc_object::cob_inflight, because a seat-belt is + * locked during take-off and landing. 
+         */
+        spinlock_t         oo_seatbelt;
+};
+
+/*
+ * Lock "micro-states" for osc layer.
+ */
+enum osc_lock_state {
+        OLS_NEW,
+        OLS_ENQUEUED,
+        OLS_UPCALL_RECEIVED,
+        OLS_GRANTED,
+        OLS_RELEASED,
+        OLS_BLOCKED,
+        OLS_CANCELLED
+};
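The states above follow, in the common case, the enqueue path NEW to ENQUEUED to UPCALL_RECEIVED to GRANTED to RELEASED, with OLS_BLOCKED and OLS_CANCELLED entered from the blocking-AST path (osc_lock_blocking() later in this file keeps OLS_BLOCKED < OLS_CANCELLED with a CLASSERT). A standalone sketch of that nominal life cycle, reconstructed from the surrounding code and purely illustrative; the checker is not part of the patch:

#include <assert.h>
#include <stdio.h>

enum osc_lock_state {           /* mirrors the enum above */
        OLS_NEW, OLS_ENQUEUED, OLS_UPCALL_RECEIVED, OLS_GRANTED,
        OLS_RELEASED, OLS_BLOCKED, OLS_CANCELLED
};

/* Nominal transitions, reconstructed from osc_lock_upcall(),
 * osc_lock_granted(), osc_lock_unuse() and osc_lock_blocking(). */
static int ols_transition_ok(enum osc_lock_state from, enum osc_lock_state to)
{
        switch (from) {
        case OLS_NEW:             return to == OLS_ENQUEUED;
        case OLS_ENQUEUED:        return to == OLS_UPCALL_RECEIVED ||
                                         to == OLS_CANCELLED;
        case OLS_UPCALL_RECEIVED: return to == OLS_GRANTED ||
                                         to == OLS_RELEASED;
        case OLS_GRANTED:         return to == OLS_RELEASED ||
                                         to == OLS_BLOCKED;
        case OLS_RELEASED:        return to == OLS_BLOCKED;
        case OLS_BLOCKED:         return to == OLS_CANCELLED;
        default:                  return 0;
        }
}

int main(void)
{
        assert(ols_transition_ok(OLS_NEW, OLS_ENQUEUED));
        assert(!ols_transition_ok(OLS_GRANTED, OLS_NEW));
        printf("nominal osc_lock transitions are consistent\n");
        return 0;
}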
+
+/**
+ * osc-private state of cl_lock.
+ *
+ * Interaction with DLM.
+ *
+ * CLIO enqueues all DLM locks through ptlrpcd (that is, in "async" mode).
+ *
+ * Once receive upcall is invoked, osc_lock remembers a handle of DLM lock in
+ * osc_lock::ols_handle and a pointer to that lock in osc_lock::ols_lock.
+ *
+ * This pointer is protected through a reference, acquired by
+ * osc_lock_upcall0(). Also, an additional reference is acquired by
+ * ldlm_lock_addref() call protecting the lock from cancellation, until
+ * osc_lock_unuse() releases it.
+ *
+ * Below is a description of how lock references are acquired and released
+ * inside of DLM.
+ *
+ * - When new lock is created and enqueued to the server (ldlm_cli_enqueue())
+ *      - ldlm_lock_create()
+ *          - ldlm_lock_new(): initializes a lock with 2 references. One for
+ *            the caller (released when reply from the server is received, or
+ *            on error), and another for the hash table.
+ *      - ldlm_lock_addref_internal(): protects the lock from cancellation.
+ *
+ * - When reply is received from the server (osc_enqueue_interpret())
+ *      - ldlm_cli_enqueue_fini()
+ *          - LDLM_LOCK_PUT(): releases caller reference acquired by
+ *            ldlm_lock_new().
+ *          - if (rc != 0)
+ *                ldlm_lock_decref(): error case: matches ldlm_cli_enqueue().
+ *      - ldlm_lock_decref(): for async locks, matches ldlm_cli_enqueue().
+ *
+ * - When lock is being cancelled (ldlm_lock_cancel())
+ *      - ldlm_lock_destroy()
+ *          - LDLM_LOCK_PUT(): releases hash-table reference acquired by
+ *            ldlm_lock_new().
+ *
+ * osc_lock is detached from ldlm_lock by osc_lock_detach() that is called
+ * either when the lock is cancelled (osc_lock_blocking()), or when the lock
+ * is deleted without cancellation (e.g., from cl_locks_prune()). In the
+ * latter case the ldlm lock remains in memory, and can be re-attached to
+ * osc_lock in the future.
+ */
+struct osc_lock {
+        struct cl_lock_slice     ols_cl;
+        /** underlying DLM lock */
+        struct ldlm_lock        *ols_lock;
+        /** lock value block */
+        struct ost_lvb           ols_lvb;
+        /** DLM flags with which osc_lock::ols_lock was enqueued */
+        int                      ols_flags;
+        /** osc_lock::ols_lock handle */
+        struct lustre_handle     ols_handle;
+        struct ldlm_enqueue_info ols_einfo;
+        enum osc_lock_state      ols_state;
+        /**
+         * true, if ldlm_lock_addref() was called against
+         * osc_lock::ols_lock. This is used for sanity checking.
+         *
+         * \see osc_lock::ols_has_ref
+         */
+        unsigned                 ols_hold :1,
+        /**
+         * this is much like osc_lock::ols_hold, except that this bit is
+         * cleared _after_ the reference is released in osc_lock_unuse().
+         * This fine distinction is needed because:
+         *
+         * - if ldlm lock still has a reference, osc_ast_data_get() needs
+         *   to return associated cl_lock (so that a flag is needed that is
+         *   cleared after ldlm_lock_decref() returned), and
+         *
+         * - ldlm_lock_decref() can invoke blocking ast (for a
+         *   LDLM_FL_CBPENDING lock), and osc_lock functions like
+         *   osc_lock_cancel() called from there need to know whether to
+         *   release lock reference (so that a flag is needed that is
+         *   cleared before ldlm_lock_decref() is called).
+         */
+                                 ols_has_ref:1,
+        /**
+         * inherit the lockless attribute from top level cl_io.
+         * If true, osc_lock_enqueue is able to tolerate the -EUSERS error.
+         */
+                                 ols_locklessable:1,
+        /**
+         * set by osc_lock_use() to wait until blocking AST enters into
+         * osc_ldlm_blocking_ast0(), so that cl_lock mutex can be used for
+         * further synchronization.
+         */
+                                 ols_ast_wait:1,
+        /**
+         * If the data of this lock has been flushed to server side.
+         */
+                                 ols_flush:1,
+        /**
+         * if set, the osc_lock is a glimpse lock. For glimpse locks, we
+         * treat the -ENAVAIL error as tolerable; this lets the upper logic
+         * wait for glimpse locks against all OSTs to complete.
+         * A glimpse lock converts to a normal lock if the server lock is
+         * granted.
+         * A glimpse lock should be destroyed immediately after use.
+         */
+                                 ols_glimpse:1;
+        /**
+         * IO that owns this lock. This field is used for a dead-lock
+         * avoidance by osc_lock_enqueue().
+         *
+         * \see osc_deadlock_is_possible()
+         */
+        struct osc_io           *ols_owner;
+};
+
+
+/**
+ * Page state private for osc layer.
+ */
+struct osc_page {
+        struct cl_page_slice  ops_cl;
+        /**
+         * Page queues used by osc to detect when RPC can be formed.
+         */
+        struct osc_async_page ops_oap;
+        /**
+         * An offset within page from which next transfer starts. This is
+         * used by cl_page_clip() to submit partial page transfers.
+         */
+        int                   ops_from;
+        /**
+         * An offset within page at which next transfer ends.
+         *
+         * \see osc_page::ops_from.
+         */
+        int                   ops_to;
+        /**
+         * Boolean, true iff page is under transfer. Used for sanity
+         * checking.
+         */
+        unsigned              ops_transfer_pinned:1,
+        /**
+         * True for a `temporary page' created by read-ahead code, probably
+         * outside of any DLM lock.
+         */
+                              ops_temp:1;
+        /**
+         * Linkage into a per-osc_object list of pages in flight. For
+         * debugging.
+         */
+        struct list_head      ops_inflight;
+        /**
+         * Thread that submitted this page for transfer. For debugging.
+         */
+        cfs_task_t           *ops_submitter;
+};
+
+extern cfs_mem_cache_t *osc_page_kmem;
+extern cfs_mem_cache_t *osc_lock_kmem;
+extern cfs_mem_cache_t *osc_object_kmem;
+extern cfs_mem_cache_t *osc_thread_kmem;
+extern cfs_mem_cache_t *osc_session_kmem;
+extern cfs_mem_cache_t *osc_req_kmem;
+
+extern struct lu_device_type osc_device_type;
+extern struct lu_context_key osc_key;
+extern struct lu_context_key osc_session_key;
+
+#define OSC_FLAGS (ASYNC_URGENT|ASYNC_READY)
+
+int osc_lock_init(const struct lu_env *env,
+                  struct cl_object *obj, struct cl_lock *lock,
+                  const struct cl_io *io);
+int osc_io_init  (const struct lu_env *env,
+                  struct cl_object *obj, struct cl_io *io);
+int osc_req_init (const struct lu_env *env, struct cl_device *dev,
+                  struct cl_req *req);
+struct lu_object *osc_object_alloc(const struct lu_env *env,
+                                   const struct lu_object_header *hdr,
+                                   struct lu_device *dev);
+struct cl_page *osc_page_init (const struct lu_env *env,
+                               struct cl_object *obj,
+                               struct cl_page *page, cfs_page_t *vmpage);
+
+void osc_lock_build_res(const struct lu_env *env, const struct osc_object *obj,
+                        struct ldlm_res_id *resname);
+void osc_index2policy  (ldlm_policy_data_t *policy, const struct cl_object *obj,
+                        pgoff_t start, pgoff_t end);
+int  osc_lvb_print     (const struct lu_env *env, void *cookie,
+                        lu_printer_t p, const struct ost_lvb *lvb);
+void osc_io_submit_page(const struct lu_env *env,
+                        struct osc_io *oio, struct osc_page *opg,
+                        enum cl_req_type crt);
+
+void osc_object_set_contended  (struct osc_object *obj);
+void osc_object_clear_contended(struct osc_object *obj);
+int  osc_object_is_contended   (struct osc_object *obj);
+
+int  osc_lock_is_lockless      (const struct osc_lock *olck);
+
+/*****************************************************************************
+ 
* + * Accessors. + * + */ + +static inline struct osc_thread_info *osc_env_info(const struct lu_env *env) +{ + struct osc_thread_info *info; + + info = lu_context_key_get(&env->le_ctx, &osc_key); + LASSERT(info != NULL); + return info; +} + +static inline struct osc_session *osc_env_session(const struct lu_env *env) +{ + struct osc_session *ses; + + ses = lu_context_key_get(env->le_ses, &osc_session_key); + LASSERT(ses != NULL); + return ses; +} + +static inline struct osc_io *osc_env_io(const struct lu_env *env) +{ + return &osc_env_session(env)->os_io; +} + +static inline int osc_is_object(const struct lu_object *obj) +{ + return obj->lo_dev->ld_type == &osc_device_type; +} + +static inline struct osc_device *lu2osc_dev(const struct lu_device *d) +{ + LINVRNT(d->ld_type == &osc_device_type); + return container_of0(d, struct osc_device, od_cl.cd_lu_dev); +} + +static inline struct obd_export *osc_export(const struct osc_object *obj) +{ + return lu2osc_dev(obj->oo_cl.co_lu.lo_dev)->od_exp; +} + +static inline struct osc_object *cl2osc(const struct cl_object *obj) +{ + LINVRNT(osc_is_object(&obj->co_lu)); + return container_of0(obj, struct osc_object, oo_cl); +} + +static inline ldlm_mode_t osc_cl_lock2ldlm(enum cl_lock_mode mode) +{ + LASSERT(mode == CLM_READ || mode == CLM_WRITE); + return mode == CLM_READ ? LCK_PR : LCK_PW; +} + +static inline enum cl_lock_mode osc_ldlm2cl_lock(ldlm_mode_t mode) +{ + LASSERT(mode == LCK_PR || mode == LCK_PW); + return mode == LCK_PR ? CLM_READ : CLM_WRITE; +} + +static inline struct osc_page *cl2osc_page(const struct cl_page_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cpl_obj->co_lu)); + return container_of0(slice, struct osc_page, ops_cl); +} + +static inline struct osc_lock *cl2osc_lock(const struct cl_lock_slice *slice) +{ + LINVRNT(osc_is_object(&slice->cls_obj->co_lu)); + return container_of0(slice, struct osc_lock, ols_cl); +} + +static inline struct osc_lock *osc_lock_at(const struct cl_lock *lock) +{ + return cl2osc_lock(cl_lock_at(lock, &osc_device_type)); +} + +/** @} osc */ + +#endif /* OSC_CL_INTERNAL_H */ diff --git a/lustre/osc/osc_create.c b/lustre/osc/osc_create.c index f6b669e..39cea5a 100644 --- a/lustre/osc/osc_create.c +++ b/lustre/osc/osc_create.c @@ -77,7 +77,7 @@ static int osc_interpret_create(const struct lu_env *env, oscc = req->rq_async_args.pointer_arg[0]; LASSERT(oscc && (oscc->oscc_obd != LP_POISON)); - + spin_lock(&oscc->oscc_lock); oscc->oscc_flags &= ~OSCC_FLAG_CREATING; switch (rc) { @@ -101,7 +101,7 @@ static int osc_interpret_create(const struct lu_env *env, DEBUG_REQ(D_INODE, req, "Got EAGAIN - resend \n"); break; case -ENOSPC: - case -EROFS: + case -EROFS: case -EFBIG: { oscc->oscc_flags |= OSCC_FLAG_NOSPC; if (body && rc == -ENOSPC) { @@ -113,7 +113,7 @@ static int osc_interpret_create(const struct lu_env *env, break; } case -EIO: { - /* filter always set body->oa.o_id as the last_id + /* filter always set body->oa.o_id as the last_id * of filter (see filter_handle_precreate for detail)*/ if (body && body->oa.o_id > oscc->oscc_last_id) oscc->oscc_last_id = body->oa.o_id; @@ -184,7 +184,7 @@ static int oscc_internal_create(struct osc_creator *oscc) spin_lock(&oscc->oscc_lock); body->oa.o_id = oscc->oscc_last_id + oscc->oscc_grow_count; body->oa.o_gr = oscc->oscc_oa.o_gr; - LASSERT(body->oa.o_gr > 0); + LASSERT_MDS_GROUP(body->oa.o_gr); body->oa.o_valid |= OBD_MD_FLID | OBD_MD_FLGROUP; spin_unlock(&oscc->oscc_lock); CDEBUG(D_RPCTRACE, "prealloc through id "LPU64" (last seen "LPU64")\n", @@ -194,7 +194,7 @@ static 
int oscc_internal_create(struct osc_creator *oscc) request->rq_async_args.pointer_arg[0] = oscc; request->rq_interpret_reply = osc_interpret_create; - ptlrpcd_add_req(request); + ptlrpcd_add_req(request, PSCOPE_OTHER); RETURN(0); } @@ -283,6 +283,9 @@ int osc_precreate(struct obd_export *exp) if (imp != NULL && imp->imp_deactive) RETURN(1000); + if (oscc_recovering(oscc)) + RETURN(2); + if (oscc->oscc_last_id < oscc->oscc_next_id) { spin_lock(&oscc->oscc_lock); if (oscc->oscc_flags & OSCC_FLAG_NOSPC) { @@ -293,11 +296,6 @@ int osc_precreate(struct obd_export *exp) spin_unlock(&oscc->oscc_lock); RETURN(1); } - if (oscc->oscc_flags & OSCC_FLAG_RECOVERING) { - spin_unlock(&oscc->oscc_lock); - RETURN(2); - } - if (oscc->oscc_flags & OSCC_FLAG_CREATING) { spin_unlock(&oscc->oscc_lock); RETURN(1); @@ -319,7 +317,7 @@ int osc_create(struct obd_export *exp, struct obdo *oa, LASSERT(oa); LASSERT(ea); - LASSERT(oa->o_gr > 0); + LASSERT_MDS_GROUP(oa->o_gr); LASSERT(oa->o_valid & OBD_MD_FLGROUP); if ((oa->o_valid & OBD_MD_FLFLAGS) && diff --git a/lustre/osc/osc_dev.c b/lustre/osc/osc_dev.c new file mode 100644 index 0000000..228b157 --- /dev/null +++ b/lustre/osc/osc_dev.c @@ -0,0 +1,253 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_device, cl_req for OSC layer. 
+ * + * Author: Nikita Danilov + */ + +/** \addtogroup osc osc @{ */ + +#define DEBUG_SUBSYSTEM S_OSC + +/* class_name2obd() */ +#include + +#include "osc_cl_internal.h" + +cfs_mem_cache_t *osc_page_kmem; +cfs_mem_cache_t *osc_lock_kmem; +cfs_mem_cache_t *osc_object_kmem; +cfs_mem_cache_t *osc_thread_kmem; +cfs_mem_cache_t *osc_session_kmem; +cfs_mem_cache_t *osc_req_kmem; + +struct lu_kmem_descr osc_caches[] = { + { + .ckd_cache = &osc_page_kmem, + .ckd_name = "osc_page_kmem", + .ckd_size = sizeof (struct osc_page) + }, + { + .ckd_cache = &osc_lock_kmem, + .ckd_name = "osc_lock_kmem", + .ckd_size = sizeof (struct osc_lock) + }, + { + .ckd_cache = &osc_object_kmem, + .ckd_name = "osc_object_kmem", + .ckd_size = sizeof (struct osc_object) + }, + { + .ckd_cache = &osc_thread_kmem, + .ckd_name = "osc_thread_kmem", + .ckd_size = sizeof (struct osc_thread_info) + }, + { + .ckd_cache = &osc_session_kmem, + .ckd_name = "osc_session_kmem", + .ckd_size = sizeof (struct osc_session) + }, + { + .ckd_cache = &osc_req_kmem, + .ckd_name = "osc_req_kmem", + .ckd_size = sizeof (struct osc_req) + }, + { + .ckd_cache = NULL + } +}; + +struct lock_class_key osc_ast_guard_class; + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_device *osc2lu_dev(struct osc_device *osc) +{ + return &osc->od_cl.cd_lu_dev; +} + +/***************************************************************************** + * + * Osc device and device type functions. + * + */ + +static void *osc_key_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_thread_info *info; + + OBD_SLAB_ALLOC_PTR(info, osc_thread_kmem); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_key_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_thread_info *info = data; + OBD_SLAB_FREE_PTR(info, osc_thread_kmem); +} + +struct lu_context_key osc_key = { + .lct_tags = LCT_CL_THREAD, + .lct_init = osc_key_init, + .lct_fini = osc_key_fini +}; + +static void *osc_session_init(const struct lu_context *ctx, + struct lu_context_key *key) +{ + struct osc_session *info; + + OBD_SLAB_ALLOC_PTR(info, osc_session_kmem); + if (info == NULL) + info = ERR_PTR(-ENOMEM); + return info; +} + +static void osc_session_fini(const struct lu_context *ctx, + struct lu_context_key *key, void *data) +{ + struct osc_session *info = data; + OBD_SLAB_FREE_PTR(info, osc_session_kmem); +} + +struct lu_context_key osc_session_key = { + .lct_tags = LCT_SESSION, + .lct_init = osc_session_init, + .lct_fini = osc_session_fini +}; + +/* type constructor/destructor: osc_type_{init,fini,start,stop}(). 
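osc_key_init()/osc_key_fini() and their session twins above implement the lu_context_key contract: a key registered with matching lct_tags has its ->lct_init() called for every context created with those tags, and ->lct_fini() when the context dies, giving each thread or session its own osc_thread_info or osc_session. A toy standalone model of that registry idea; the names are hypothetical and none of the Lustre API is used:

#include <stdio.h>
#include <stdlib.h>

/* Toy model of the lu_context_key idea: each registered key owns one slot
 * in every context; init/fini play the role of ->lct_init()/->lct_fini(). */
#define MAX_KEYS 8

struct toy_key {
        int    index;                   /* slot assigned at registration */
        void *(*init)(void);
        void  (*fini)(void *);
};

static struct toy_key *keys[MAX_KEYS];
static int nr_keys;

static void key_register(struct toy_key *k)
{
        k->index = nr_keys;
        keys[nr_keys++] = k;
}

struct toy_ctx { void *values[MAX_KEYS]; };

static int ctx_init(struct toy_ctx *c)
{
        for (int i = 0; i < nr_keys; i++)
                if ((c->values[i] = keys[i]->init()) == NULL)
                        return -1;      /* -ENOMEM analogue */
        return 0;
}

static void ctx_fini(struct toy_ctx *c)
{
        for (int i = 0; i < nr_keys; i++)
                keys[i]->fini(c->values[i]);
}

/* A "thread info" key comparable in spirit to osc_key above. */
static void *info_init(void)   { return calloc(1, 64); }
static void  info_fini(void *p) { free(p); }
static struct toy_key info_key = { .init = info_init, .fini = info_fini };

int main(void)
{
        struct toy_ctx ctx;

        key_register(&info_key);
        if (ctx_init(&ctx) == 0) {
                printf("per-context value: %p\n", ctx.values[info_key.index]);
                ctx_fini(&ctx);
        }
        return 0;
}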
*/ +LU_TYPE_INIT_FINI(osc, &osc_key, &osc_session_key); + +static int osc_cl_process_config(const struct lu_env *env, + struct lu_device *d, struct lustre_cfg *cfg) +{ + ENTRY; + RETURN(osc_process_config_base(d->ld_obd, cfg)); +} + +static const struct lu_device_operations osc_lu_ops = { + .ldo_object_alloc = osc_object_alloc, + .ldo_process_config = osc_cl_process_config, + .ldo_recovery_complete = NULL +}; + +static const struct cl_device_operations osc_cl_ops = { + .cdo_req_init = osc_req_init +}; + +static int osc_device_init(const struct lu_env *env, struct lu_device *d, + const char *name, struct lu_device *next) +{ + RETURN(0); +} + +static struct lu_device *osc_device_fini(const struct lu_env *env, + struct lu_device *d) +{ + return 0; +} + +static struct lu_device *osc_device_free(const struct lu_env *env, + struct lu_device *d) +{ + struct osc_device *od = lu2osc_dev(d); + + cl_device_fini(lu2cl_dev(d)); + OBD_FREE_PTR(od); + return NULL; +} + +static struct lu_device *osc_device_alloc(const struct lu_env *env, + struct lu_device_type *t, + struct lustre_cfg *cfg) +{ + struct lu_device *d; + struct osc_device *od; + struct obd_device *obd; + int rc; + + OBD_ALLOC_PTR(od); + if (od == NULL) + RETURN(ERR_PTR(-ENOMEM)); + + cl_device_init(&od->od_cl, t); + d = osc2lu_dev(od); + d->ld_ops = &osc_lu_ops; + od->od_cl.cd_ops = &osc_cl_ops; + + /* Setup OSC OBD */ + obd = class_name2obd(lustre_cfg_string(cfg, 0)); + LASSERT(obd != NULL); + rc = osc_setup(obd, cfg); + if (rc) { + osc_device_free(env, d); + RETURN(ERR_PTR(rc)); + } + od->od_exp = obd->obd_self_export; + RETURN(d); +} + +static const struct lu_device_type_operations osc_device_type_ops = { + .ldto_init = osc_type_init, + .ldto_fini = osc_type_fini, + + .ldto_start = osc_type_start, + .ldto_stop = osc_type_stop, + + .ldto_device_alloc = osc_device_alloc, + .ldto_device_free = osc_device_free, + + .ldto_device_init = osc_device_init, + .ldto_device_fini = osc_device_fini +}; + +struct lu_device_type osc_device_type = { + .ldt_tags = LU_DEVICE_CL, + .ldt_name = LUSTRE_OSC_NAME, + .ldt_ops = &osc_device_type_ops, + .ldt_ctx_tags = LCT_CL_THREAD +}; + +/** @} osc */ diff --git a/lustre/osc/osc_internal.h b/lustre/osc/osc_internal.h index d6c979b..b7a5143 100644 --- a/lustre/osc/osc_internal.h +++ b/lustre/osc/osc_internal.h @@ -39,6 +39,24 @@ #define OAP_MAGIC 8675309 +struct lu_env; + +enum async_flags { + ASYNC_READY = 0x1, /* ap_make_ready will not be called before this + page is added to an rpc */ + ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ + ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called + to give the caller a chance to update + or cancel the size of the io */ +}; + +struct obd_async_page_ops { + int (*ap_make_ready)(const struct lu_env *env, void *data, int cmd); + int (*ap_refresh_count)(const struct lu_env *env, void *data, int cmd); + int (*ap_completion)(const struct lu_env *env, + void *data, int cmd, struct obdo *oa, int rc); +}; + struct osc_async_page { int oap_magic; unsigned short oap_cmd; @@ -54,13 +72,11 @@ struct osc_async_page { struct brw_page oap_brw_page; - struct oig_callback_context oap_occ; - struct obd_io_group *oap_oig; struct ptlrpc_request *oap_request; struct client_obd *oap_cli; struct lov_oinfo *oap_loi; - struct obd_async_page_ops *oap_caller_ops; + const struct obd_async_page_ops *oap_caller_ops; void *oap_caller_data; struct list_head oap_page_list; struct ldlm_lock *oap_ldlm_lock; @@ -93,6 +109,64 @@ int osc_real_create(struct obd_export *exp, 
struct obdo *oa, void oscc_init(struct obd_device *obd); void osc_wake_cache_waiters(struct client_obd *cli); +/* + * cl integration. + */ +#include + +extern struct ptlrpc_request_set *PTLRPCD_SET; + +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + int *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, + void *cookie, struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async); +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode); + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + int *flags, void *data, struct lustre_handle *lockh, + int unref); + +int osc_punch_base(struct obd_export *exp, struct obdo *oa, + struct obd_capa *capa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset); + +int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, + struct lov_oinfo *loi, cfs_page_t *page, + obd_off offset, const struct obd_async_page_ops *ops, + void *data, void **res, int nocache, + struct lustre_handle *lockh); +void osc_oap_to_pending(struct osc_async_page *oap); +int osc_oap_interrupted(const struct lu_env *env, struct osc_async_page *oap); +void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi); +void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli); + +int osc_queue_async_io(const struct lu_env *env, + struct obd_export *exp, struct lov_stripe_md *lsm, + struct lov_oinfo *loi, void *cookie, + int cmd, obd_off off, int count, + obd_flag brw_flags, enum async_flags async_flags); +int osc_teardown_async_page(struct obd_export *exp, + struct lov_stripe_md *lsm, + struct lov_oinfo *loi, void *cookie); +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *cfg); +int osc_set_async_flags_base(struct client_obd *cli, + struct lov_oinfo *loi, struct osc_async_page *oap, + obd_flag async_flags); +int osc_enter_cache_try(const struct lu_env *env, + struct client_obd *cli, struct lov_oinfo *loi, + struct osc_async_page *oap, int transient); + +struct cl_page *osc_oap2cl_page(struct osc_async_page *oap); +extern spinlock_t osc_ast_guard; + +int osc_cleanup(struct obd_device *obd); +int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg); + #ifdef LPROCFS int lproc_osc_attach_seqstat(struct obd_device *dev); void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars); @@ -104,6 +178,8 @@ static inline void lprocfs_osc_init_vars(struct lprocfs_static_vars *lvars) } #endif +extern struct lu_device_type osc_device_type; + static inline int osc_recoverable_error(int rc) { return (rc == -EIO || rc == -EROFS || rc == -ENOMEM || rc == -EAGAIN); @@ -112,8 +188,8 @@ static inline int osc_recoverable_error(int rc) /* return 1 if osc should be resend request */ static inline int osc_should_resend(int resend, struct client_obd *cli) { - return atomic_read(&cli->cl_resends) ? - atomic_read(&cli->cl_resends) > resend : 1; + return atomic_read(&cli->cl_resends) ? + atomic_read(&cli->cl_resends) > resend : 1; } #ifndef min_t @@ -121,4 +197,26 @@ static inline int osc_should_resend(int resend, struct client_obd *cli) ({ type __x = (x); type __y = (y); __x < __y ? __x: __y; }) #endif +struct osc_device { + struct cl_device od_cl; + struct obd_export *od_exp; + + /* Write stats is actually protected by client_obd's lock. 
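The obd_async_page_ops vector declared above is the contract between a page's owner and the RPC engine: ap_make_ready() is consulted unless the page was queued with ASYNC_READY, ap_refresh_count() unless ASYNC_COUNT_STABLE, and ap_completion() fires when the brw RPC finishes. A hedged sketch of a trivial owner-side implementation; it would build only against the Lustre headers, every my_* name is hypothetical, and the -EAGAIN convention is illustrative:

/* Hypothetical page-owner state; not part of the patch. */
struct my_page {
        int mp_ready;   /* page prepared for transfer */
        int mp_bytes;   /* bytes valid in the page, may shrink before send */
};

static int my_make_ready(const struct lu_env *env, void *data, int cmd)
{
        struct my_page *mp = data;

        /* non-zero tells the engine to defer this page for now */
        return mp->mp_ready ? 0 : -EAGAIN;
}

static int my_refresh_count(const struct lu_env *env, void *data, int cmd)
{
        /* last chance to adjust the transfer size */
        return ((struct my_page *)data)->mp_bytes;
}

static int my_completion(const struct lu_env *env, void *data, int cmd,
                         struct obdo *oa, int rc)
{
        ((struct my_page *)data)->mp_ready = 0;     /* transfer finished */
        return rc;
}

static const struct obd_async_page_ops my_aops = {
        .ap_make_ready    = my_make_ready,
        .ap_refresh_count = my_refresh_count,
        .ap_completion    = my_completion,
};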
*/ + struct osc_stats { + uint64_t os_lockless_writes; /* by bytes */ + uint64_t os_lockless_reads; /* by bytes */ + uint64_t os_lockless_truncates; /* by times */ + } od_stats; + + /* configuration item(s) */ + int od_contention_time; + int od_lockless_truncate; +}; + +static inline struct osc_device *obd2osc_dev(const struct obd_device *d) +{ + return container_of0(d->obd_lu_dev, struct osc_device, od_cl.cd_lu_dev); +} + + #endif /* OSC_INTERNAL_H */ diff --git a/lustre/osc/osc_io.c b/lustre/osc/osc_io.c new file mode 100644 index 0000000..4b4ae0a --- /dev/null +++ b/lustre/osc/osc_io.c @@ -0,0 +1,653 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_io for OSC layer. + * + * Author: Nikita Danilov + */ + +/** \addtogroup osc osc @{ */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct osc_req *cl2osc_req(const struct cl_req_slice *slice) +{ + LINVRNT(slice->crs_dev->cd_lu_dev.ld_type == &osc_device_type); + return container_of0(slice, struct osc_req, or_cl); +} + +static struct osc_io *cl2osc_io(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct osc_io *oio = container_of0(slice, struct osc_io, oi_cl); + LINVRNT(oio == osc_env_io(env)); + return oio; +} + +static struct osc_page *osc_cl_page_osc(struct cl_page *page) +{ + const struct cl_page_slice *slice; + + slice = cl_page_at(page, &osc_device_type); + LASSERT(slice != NULL); + + return cl2osc_page(slice); +} + + +/***************************************************************************** + * + * io operations. 
+ * + */ + +static void osc_io_fini(const struct lu_env *env, const struct cl_io_slice *io) +{ +} + +struct cl_page *osc_oap2cl_page(struct osc_async_page *oap) +{ + return container_of(oap, struct osc_page, ops_oap)->ops_cl.cpl_page; +} + +static void osc_io_unplug(const struct lu_env *env, struct osc_object *osc, + struct client_obd *cli) +{ + loi_list_maint(cli, osc->oo_oinfo); + osc_check_rpcs(env, cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); +} + +/** + * How many pages osc_io_submit() queues before checking whether an RPC is + * ready. + */ +#define OSC_QUEUE_GRAIN (32) + +/** + * An implementation of cl_io_operations::cio_io_submit() method for osc + * layer. Iterates over pages in the in-queue, prepares each for io by calling + * cl_page_prep() and then either submits them through osc_io_submit_page() + * or, if page is already submitted, changes osc flags through + * osc_set_async_flags_base(). + */ +static int osc_io_submit(const struct lu_env *env, + const struct cl_io_slice *ios, + enum cl_req_type crt, struct cl_2queue *queue) +{ + struct cl_page *page; + struct cl_page *tmp; + struct osc_object *osc0 = NULL; + struct client_obd *cli = NULL; + struct osc_object *osc = NULL; /* to keep gcc happy */ + struct osc_page *opg; + struct cl_io *io; + + struct cl_page_list *qin = &queue->c2_qin; + struct cl_page_list *qout = &queue->c2_qout; + int queued = 0; + int result = 0; + + LASSERT(qin->pl_nr > 0); + + CDEBUG(D_INFO, "%i %i\n", qin->pl_nr, crt); + /* + * NOTE: here @page is a top-level page. This is done to avoid + * creation of sub-page-list. + */ + cl_page_list_for_each_safe(page, tmp, qin) { + struct osc_async_page *oap; + struct obd_export *exp; + + /* Top level IO. */ + io = page->cp_owner; + LASSERT(io != NULL); + + opg = osc_cl_page_osc(page); + oap = &opg->ops_oap; + osc = cl2osc(opg->ops_cl.cpl_obj); + exp = osc_export(osc); + + /* + * This can be checked without cli->cl_loi_list_lock, because + * ->oap_*_item are always manipulated when the page is owned. + */ + if (!list_empty(&oap->oap_urgent_item) || + !list_empty(&oap->oap_rpc_item)) { + result = -EBUSY; + break; + } + + if (osc0 == NULL) { /* first iteration */ + cli = &exp->exp_obd->u.cli; + osc0 = osc; + } else /* check that all pages are against the same object + * (for now) */ + LASSERT(osc == osc0); + if (queued++ == 0) + client_obd_list_lock(&cli->cl_loi_list_lock); + result = cl_page_prep(env, io, page, crt); + if (result == 0) { + cl_page_list_move(qout, qin, page); + if (list_empty(&oap->oap_pending_item)) { + osc_io_submit_page(env, cl2osc_io(env, ios), + opg, crt); + } else { + result = osc_set_async_flags_base(cli, + osc->oo_oinfo, + oap, + OSC_FLAGS); + if (result != 0) + break; + } + } else { + LASSERT(result < 0); + if (result != -EALREADY) + break; + /* + * Handle -EALREADY error: for read case, the page is + * already in UPTODATE state; for write, the page + * is not dirty. + */ + result = 0; + } + /* + * Don't keep client_obd_list_lock() for too long. + * + * XXX lock_need_resched() should be used here, but it is not + * available in the older of supported kernels. + */ + if (queued > OSC_QUEUE_GRAIN || cfs_need_resched()) { + queued = 0; + osc_io_unplug(env, osc, cli); + cfs_cond_resched(); + } + } + + LASSERT(ergo(result == 0, cli != NULL)); + LASSERT(ergo(result == 0, osc == osc0)); + + if (queued > 0) + osc_io_unplug(env, osc, cli); + CDEBUG(D_INFO, "%i/%i %i\n", qin->pl_nr, qout->pl_nr, result); + return qout->pl_nr > 0 ? 
+
+static void osc_page_touch_at(const struct lu_env *env,
+                              struct cl_object *obj, pgoff_t idx, unsigned to)
+{
+        struct lov_oinfo *loi  = cl2osc(obj)->oo_oinfo;
+        struct cl_attr   *attr = &osc_env_info(env)->oti_attr;
+        int valid;
+        __u64 kms;
+
+        /* offset within stripe */
+        kms = cl_offset(obj, idx) + to;
+
+        cl_object_attr_lock(obj);
+        /*
+         * XXX old code used
+         *
+         *         ll_inode_size_lock(inode, 0); lov_stripe_lock(lsm);
+         *
+         * here
+         */
+        CDEBUG(D_INODE, "stripe KMS %sincreasing "LPU64"->"LPU64" "LPU64"\n",
+               kms > loi->loi_kms ? "" : "not ", loi->loi_kms, kms,
+               loi->loi_lvb.lvb_size);
+
+        valid = 0;
+        if (kms > loi->loi_kms) {
+                attr->cat_kms = kms;
+                valid |= CAT_KMS;
+        }
+        if (kms > loi->loi_lvb.lvb_size) {
+                attr->cat_size = kms;
+                valid |= CAT_SIZE;
+        }
+        cl_object_attr_set(env, obj, attr, valid);
+        cl_object_attr_unlock(obj);
+}
+
+/**
+ * This is called when a page is accessed within a file in a way that creates
+ * a new page if one was missing (i.e., if there was a hole at that place in
+ * the file, or if the accessed page is beyond the current file size).
+ * Examples: ->commit_write() and ->nopage() methods.
+ *
+ * Expand stripe KMS if necessary.
+ */
+static void osc_page_touch(const struct lu_env *env,
+                           struct osc_page *opage, unsigned to)
+{
+        struct cl_page   *page = opage->ops_cl.cpl_page;
+        struct cl_object *obj  = opage->ops_cl.cpl_obj;
+
+        osc_page_touch_at(env, obj, page->cp_index, to);
+}
+
+/**
+ * Implements cl_io_operations::cio_prepare_write() method for osc layer.
+ *
+ * \retval -EIO transfer initiated against this osc will most likely fail
+ * \retval 0   transfer initiated against this osc will most likely succeed.
+ *
+ * The reason for this check is to return an error to the caller immediately
+ * when the import has been deactivated. Note that the import can also be
+ * deactivated later, while pages dirtied by this IO are still in the cache;
+ * that is irrelevant here, because fsync would still deliver an error to the
+ * application. Since many applications do not call fsync for performance
+ * reasons, we also want to report -EIO at write time.
+ */
+static int osc_io_prepare_write(const struct lu_env *env,
+                                const struct cl_io_slice *ios,
+                                const struct cl_page_slice *slice,
+                                unsigned from, unsigned to)
+{
+        struct osc_device *dev = lu2osc_dev(slice->cpl_obj->co_lu.lo_dev);
+        struct obd_import *imp = class_exp2cliimp(dev->od_exp);
+
+        ENTRY;
+
+        /*
+         * This implements OBD_BRW_CHECK logic from old client.
+         */
+
+        RETURN(imp == NULL || imp->imp_invalid ? -EIO : 0);
+}
+
+static int osc_io_commit_write(const struct lu_env *env,
+                               const struct cl_io_slice *ios,
+                               const struct cl_page_slice *slice,
+                               unsigned from, unsigned to)
+{
+        struct osc_page       *opg = cl2osc_page(slice);
+        struct osc_object     *obj = cl2osc(opg->ops_cl.cpl_obj);
+        struct osc_async_page *oap = &opg->ops_oap;
+        ENTRY;
+
+        LASSERT(to > 0);
+        /*
+         * XXX instead of calling osc_page_touch() here and in
+         * osc_io_fault_start() it might be more logical to introduce
+         * cl_page_touch() method, that generic cl_io_commit_write() and page
+         * fault code calls.
+ */ + osc_page_touch(env, cl2osc_page(slice), to); + if (!client_is_remote(osc_export(obj)) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + + RETURN(0); +} + +static int osc_io_fault_start(const struct lu_env *env, + const struct cl_io_slice *ios) +{ + struct cl_io *io; + struct cl_fault_io *fio; + + ENTRY; + + io = ios->cis_io; + fio = &io->u.ci_fault; + CDEBUG(D_INFO, "%lu %i %i\n", + fio->ft_index, fio->ft_writable, fio->ft_nob); + /* + * If mapping is writeable, adjust kms to cover this page, + * but do not extend kms beyond actual file size. + * See bug 10919. + */ + if (fio->ft_writable) + osc_page_touch_at(env, ios->cis_obj, + fio->ft_index, fio->ft_nob); + RETURN(0); +} + +static int osc_punch_upcall(void *a, int rc) +{ + struct osc_punch_cbargs *args = a; + + args->opc_rc = rc; + complete(&args->opc_sync); + return 0; +} + +#ifdef __KERNEL__ +/** + * Checks that there are no pages being written in the extent being truncated. + */ +static void osc_trunc_check(const struct lu_env *env, struct cl_io *io, + struct osc_io *oio, size_t size) +{ + struct osc_page *cp; + struct osc_object *obj; + struct cl_object *clob; + struct cl_page *page; + struct cl_page_list *list; + int partial; + pgoff_t start; + + clob = oio->oi_cl.cis_obj; + obj = cl2osc(clob); + start = cl_index(clob, size); + partial = cl_offset(clob, start) < size; + list = &osc_env_info(env)->oti_plist; + + /* + * Complain if there are pages in the truncated region. + * + * XXX this is quite expensive check. + */ + cl_page_list_init(list); + cl_page_gang_lookup(env, clob, io, start + partial, CL_PAGE_EOF, list); + + cl_page_list_for_each(page, list) + CL_PAGE_DEBUG(D_ERROR, env, page, "exists %lu\n", start); + + cl_page_list_disown(env, io, list); + cl_page_list_fini(env, list); + + spin_lock(&obj->oo_seatbelt); + list_for_each_entry(cp, &obj->oo_inflight[CRT_WRITE], ops_inflight) { + page = cp->ops_cl.cpl_page; + if (page->cp_index >= start + partial) { + cfs_task_t *submitter; + + submitter = cp->ops_submitter; + /* + * XXX Linux specific debugging stuff. 
+ */ + CL_PAGE_DEBUG(D_ERROR, env, page, "%s/%i %lu\n", + submitter->comm, submitter->pid, start); + libcfs_debug_dumpstack(submitter); + } + } + spin_unlock(&obj->oo_seatbelt); +} +#else /* __KERNEL__ */ +# define osc_trunc_check(env, io, oio, size) do {;} while (0) +#endif + +static int osc_io_trunc_start(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct cl_object *obj = slice->cis_obj; + struct lov_oinfo *loi = cl2osc(obj)->oo_oinfo; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + struct obdo *oa = &oio->oi_oa; + struct osc_punch_cbargs *cbargs = &oio->oi_punch_cbarg; + struct obd_capa *capa; + loff_t size = io->u.ci_truncate.tr_size; + int result; + + memset(oa, 0, sizeof(*oa)); + + osc_trunc_check(env, io, oio, size); + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + if (result == 0) { + attr->cat_size = attr->cat_kms = size; + result = cl_object_attr_set(env, obj, attr, CAT_SIZE|CAT_KMS); + } + cl_object_attr_unlock(obj); + + if (result == 0) { + oa->o_id = loi->loi_id; + oa->o_gr = loi->loi_gr; + oa->o_mtime = attr->cat_mtime; + oa->o_atime = attr->cat_atime; + oa->o_ctime = attr->cat_ctime; + oa->o_valid = OBD_MD_FLID | OBD_MD_FLGROUP | OBD_MD_FLATIME | + OBD_MD_FLCTIME | OBD_MD_FLMTIME; + if (oio->oi_lockless) { + oa->o_flags = OBD_FL_TRUNCLOCK; + oa->o_valid |= OBD_MD_FLFLAGS; + } + oa->o_size = size; + oa->o_blocks = OBD_OBJECT_EOF; + oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + + capa = io->u.ci_truncate.tr_capa; + init_completion(&cbargs->opc_sync); + result = osc_punch_base(osc_export(cl2osc(obj)), oa, capa, + osc_punch_upcall, cbargs, PTLRPCD_SET); + } + return result; +} + +static void osc_io_trunc_end(const struct lu_env *env, + const struct cl_io_slice *slice) +{ + struct cl_io *io = slice->cis_io; + struct osc_io *oio = cl2osc_io(env, slice); + struct osc_punch_cbargs *cbargs = &oio->oi_punch_cbarg; + struct obdo *oa = &oio->oi_oa; + int result; + + wait_for_completion(&cbargs->opc_sync); + + result = io->ci_result = cbargs->opc_rc; + if (result == 0) { + struct cl_object *obj = slice->cis_obj; + if (oio->oi_lockless == 0) { + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + int valid = 0; + + /* Update kms & size */ + if (oa->o_valid & OBD_MD_FLSIZE) { + attr->cat_size = oa->o_size; + attr->cat_kms = oa->o_size; + valid |= CAT_KMS|CAT_SIZE; + } + if (oa->o_valid & OBD_MD_FLBLOCKS) { + attr->cat_blocks = oa->o_blocks; + valid |= CAT_BLOCKS; + } + if (oa->o_valid & OBD_MD_FLMTIME) { + attr->cat_mtime = oa->o_mtime; + valid |= CAT_MTIME; + } + if (oa->o_valid & OBD_MD_FLCTIME) { + attr->cat_ctime = oa->o_ctime; + valid |= CAT_CTIME; + } + if (oa->o_valid & OBD_MD_FLATIME) { + attr->cat_atime = oa->o_atime; + valid |= CAT_ATIME; + } + cl_object_attr_lock(obj); + result = cl_object_attr_set(env, obj, attr, valid); + cl_object_attr_unlock(obj); + } else { /* lockless truncate */ + struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + /* XXX: Need a lock. 
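The XXX just above flags that os_lockless_truncates++ is an unserialized read-modify-write. One conventional fix for a statistics-only counter is an atomic increment; a standalone C11 sketch of that option, illustrative only and not what the patch does:

#include <stdatomic.h>
#include <stdio.h>

static atomic_ullong lockless_truncates;   /* stands in for od_stats */

static void truncate_done(void)
{
        /* relaxed ordering is enough for a pure statistic */
        atomic_fetch_add_explicit(&lockless_truncates, 1,
                                  memory_order_relaxed);
}

int main(void)
{
        truncate_done();
        printf("%llu\n",
               (unsigned long long)atomic_load(&lockless_truncates));
        return 0;
}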
*/ + osd->od_stats.os_lockless_truncates++; + } + } + + /* return result; */ +} + +static const struct cl_io_operations osc_io_ops = { + .op = { + [CIT_READ] = { + .cio_fini = osc_io_fini + }, + [CIT_WRITE] = { + .cio_fini = osc_io_fini + }, + [CIT_TRUNC] = { + .cio_start = osc_io_trunc_start, + .cio_end = osc_io_trunc_end + }, + [CIT_FAULT] = { + .cio_fini = osc_io_fini, + .cio_start = osc_io_fault_start + }, + [CIT_MISC] = { + .cio_fini = osc_io_fini + } + }, + .req_op = { + [CRT_READ] = { + .cio_submit = osc_io_submit + }, + [CRT_WRITE] = { + .cio_submit = osc_io_submit + } + }, + .cio_prepare_write = osc_io_prepare_write, + .cio_commit_write = osc_io_commit_write +}; + +/***************************************************************************** + * + * Transfer operations. + * + */ + +static int osc_req_prep(const struct lu_env *env, + const struct cl_req_slice *slice) +{ + return 0; +} + +static void osc_req_completion(const struct lu_env *env, + const struct cl_req_slice *slice, int ioret) +{ + struct osc_req *or; + + or = cl2osc_req(slice); + OBD_SLAB_FREE_PTR(or, osc_req_kmem); +} + +/** + * Implementation of struct cl_req_operations::cro_attr_set() for osc + * layer. osc is responsible for struct obdo::o_id and struct obdo::o_gr + * fields. + */ +static void osc_req_attr_set(const struct lu_env *env, + const struct cl_req_slice *slice, + const struct cl_object *obj, + struct cl_req_attr *attr, obd_valid flags) +{ + struct lov_oinfo *oinfo; + struct cl_req *clerq; + struct cl_page *apage; /* _some_ page in @clerq */ + struct cl_lock *lock; /* _some_ lock protecting @apage */ + struct osc_lock *olck; + struct osc_page *opg; + struct obdo *oa; + + oa = attr->cra_oa; + oinfo = cl2osc(obj)->oo_oinfo; + if (flags & OBD_MD_FLID) { + oa->o_id = oinfo->loi_id; + oa->o_valid |= OBD_MD_FLID; + } + if (flags & OBD_MD_FLGROUP) { + oa->o_gr = oinfo->loi_gr; + oa->o_valid |= OBD_MD_FLGROUP; + } + if (flags & OBD_MD_FLHANDLE) { + clerq = slice->crs_req; + LASSERT(!list_empty(&clerq->crq_pages)); + apage = container_of(clerq->crq_pages.next, + struct cl_page, cp_flight); + opg = osc_cl_page_osc(apage); + apage = opg->ops_cl.cpl_page; /* now apage is a sub-page */ + lock = cl_lock_at_page(env, apage->cp_obj, apage, NULL, 1, 1); + if (lock != NULL) { + olck = osc_lock_at(lock); + LASSERT(olck != NULL); + /* check for lockless io. 
*/ + if (olck->ols_lock != NULL) { + oa->o_handle = olck->ols_lock->l_remote_handle; + oa->o_valid |= OBD_MD_FLHANDLE; + } + cl_lock_put(env, lock); + } else { + /* Should only be possible with liblustre */ + LASSERT(LIBLUSTRE_CLIENT); + } + } +} + +static const struct cl_req_operations osc_req_ops = { + .cro_prep = osc_req_prep, + .cro_attr_set = osc_req_attr_set, + .cro_completion = osc_req_completion +}; + + +int osc_io_init(const struct lu_env *env, + struct cl_object *obj, struct cl_io *io) +{ + struct osc_io *oio = osc_env_io(env); + + CL_IO_SLICE_CLEAN(oio, oi_cl); + cl_io_slice_add(io, &oio->oi_cl, obj, &osc_io_ops); + return 0; +} + +int osc_req_init(const struct lu_env *env, struct cl_device *dev, + struct cl_req *req) +{ + struct osc_req *or; + int result; + + OBD_SLAB_ALLOC_PTR(or, osc_req_kmem); + if (or != NULL) { + cl_req_slice_add(req, &or->or_cl, dev, &osc_req_ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + +/** @} osc */ diff --git a/lustre/osc/osc_lock.c b/lustre/osc/osc_lock.c new file mode 100644 index 0000000..a0e3190 --- /dev/null +++ b/lustre/osc/osc_lock.c @@ -0,0 +1,1623 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_lock for OSC layer. + * + * Author: Nikita Danilov + */ + +/** \addtogroup osc osc @{ */ + +#define DEBUG_SUBSYSTEM S_OSC + +#ifdef __KERNEL__ +# include +#else +# include +#endif +/* fid_build_reg_res_name() */ +#include + +#include "osc_cl_internal.h" + +/***************************************************************************** + * + * Type conversions. + * + */ + +static const struct cl_lock_operations osc_lock_ops; +static const struct cl_lock_operations osc_lock_lockless_ops; + +int osc_lock_is_lockless(const struct osc_lock *olck) +{ + return (olck->ols_cl.cls_ops == &osc_lock_lockless_ops); +} + +/** + * Returns a weak pointer to the ldlm lock identified by a handle. Returned + * pointer cannot be dereferenced, as lock is not protected from concurrent + * reclaim. This function is a helper for osc_lock_invariant(). 
+ */ +static struct ldlm_lock *osc_handle_ptr(struct lustre_handle *handle) +{ + struct ldlm_lock *lock; + + lock = ldlm_handle2lock(handle); + if (lock != NULL) + LDLM_LOCK_PUT(lock); + return lock; +} + +/** + * Invariant that has to be true all of the time. + */ +static int osc_lock_invariant(struct osc_lock *ols) +{ + struct ldlm_lock *lock = osc_handle_ptr(&ols->ols_handle); + struct ldlm_lock *olock = ols->ols_lock; + int handle_used = lustre_handle_is_used(&ols->ols_handle); + + return + ergo(osc_lock_is_lockless(ols), + ols->ols_locklessable && ols->ols_lock == NULL) || + (ergo(olock != NULL, handle_used) && + ergo(olock != NULL, + olock->l_handle.h_cookie == ols->ols_handle.cookie) && + /* + * Check that ->ols_handle and ->ols_lock are consistent, but + * take into account that they are set at the different time. + */ + ergo(handle_used, + ergo(lock != NULL && olock != NULL, lock == olock) && + ergo(lock == NULL, olock == NULL)) && + ergo(ols->ols_state == OLS_CANCELLED, + olock == NULL && !handle_used) && + /* + * DLM lock is destroyed only after we have seen cancellation + * ast. + */ + ergo(olock != NULL && ols->ols_state < OLS_CANCELLED, + !olock->l_destroyed) && + ergo(ols->ols_state == OLS_GRANTED, + olock != NULL && + olock->l_req_mode == olock->l_granted_mode && + ols->ols_hold)); +} + +/***************************************************************************** + * + * Lock operations. + * + */ + +/** + * Breaks a link between osc_lock and dlm_lock. + */ +static void osc_lock_detach(const struct lu_env *env, struct osc_lock *olck) +{ + struct ldlm_lock *dlmlock; + + spin_lock(&osc_ast_guard); + dlmlock = olck->ols_lock; + if (dlmlock == NULL) { + spin_unlock(&osc_ast_guard); + return; + } + + olck->ols_lock = NULL; + /* wb(); --- for all who checks (ols->ols_lock != NULL) before + * call to osc_lock_detach() */ + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + + lock_res_and_lock(dlmlock); + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) { + struct cl_object *obj = olck->ols_cl.cls_obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + __u64 old_kms = cl2osc(obj)->oo_oinfo->loi_kms; + + /* Update the kms. Need to loop all granted locks. + * Not a problem for the client */ + attr->cat_kms = ldlm_extent_shift_kms(dlmlock, old_kms); + unlock_res_and_lock(dlmlock); + + cl_object_attr_lock(obj); + cl_object_attr_set(env, obj, attr, CAT_KMS); + cl_object_attr_unlock(obj); + } else + unlock_res_and_lock(dlmlock); + + /* release a reference taken in osc_lock_upcall0(). */ + lu_ref_del(&dlmlock->l_reference, "osc_lock", olck); + LDLM_LOCK_RELEASE(dlmlock); +} + +static int osc_lock_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + int result; + + LASSERT(ols->ols_state == OLS_GRANTED || + ols->ols_state == OLS_UPCALL_RECEIVED); + LINVRNT(osc_lock_invariant(ols)); + + if (ols->ols_glimpse) { + LASSERT(ols->ols_hold == 0); + return 0; + } + LASSERT(ols->ols_hold); + + /* + * Move lock into OLS_RELEASED state before calling osc_cancel_base() + * so that possible synchronous cancellation (that always happens + * e.g., for liblustre) sees that lock is released. 
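osc_handle_ptr() near the top of this file drops its reference before returning, so osc_lock_invariant() may only compare the result against ols_lock, never dereference it. The reason a stored handle stays safe where a raw pointer does not is the generation-style cookie checked at lookup; a toy standalone sketch of that idea, with hypothetical names:

#include <stdio.h>
#include <stdint.h>
#include <stddef.h>

struct toy_obj {
        uint64_t cookie;   /* matches the handle while the object lives */
        int      alive;
};

static struct toy_obj slot = { .cookie = 0xc0ffee, .alive = 1 };

/* lookup fails instead of returning a dangling pointer */
static struct toy_obj *toy_handle2obj(uint64_t cookie)
{
        return (slot.alive && slot.cookie == cookie) ? &slot : NULL;
}

int main(void)
{
        uint64_t handle = slot.cookie;

        printf("live lookup:  %p\n", (void *)toy_handle2obj(handle));
        slot.alive = 0;                        /* object destroyed */
        printf("stale lookup: %p\n", (void *)toy_handle2obj(handle));
        return 0;
}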
+ */ + ols->ols_state = OLS_RELEASED; + ols->ols_hold = 0; + result = osc_cancel_base(&ols->ols_handle, ols->ols_einfo.ei_mode); + ols->ols_has_ref = 0; + return result; +} + +static void osc_lock_fini(const struct lu_env *env, + struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + + LINVRNT(osc_lock_invariant(ols)); + /* + * ->ols_hold can still be true at this point if, for example, a + * thread that requested a lock was killed (and released a reference + * to the lock), before reply from a server was received. In this case + * lock is destroyed immediately after upcall. + */ + if (ols->ols_hold) + osc_lock_unuse(env, slice); + if (ols->ols_lock != NULL) + osc_lock_detach(env, ols); + + OBD_SLAB_FREE_PTR(ols, osc_lock_kmem); +} + +void osc_lock_build_res(const struct lu_env *env, const struct osc_object *obj, + struct ldlm_res_id *resname) +{ + const struct lu_fid *fid = lu_object_fid(&obj->oo_cl.co_lu); + if (0) { + /* + * In the perfect world of the future, where ost servers talk + * idif-fids... + */ + fid_build_reg_res_name(fid, resname); + } else { + /* + * In reality, where ost server expects ->lsm_object_id and + * ->lsm_object_gr in rename. + */ + osc_build_res_name(obj->oo_oinfo->loi_id, obj->oo_oinfo->loi_gr, + resname); + } +} + +static void osc_lock_build_policy(const struct lu_env *env, + const struct cl_lock *lock, + ldlm_policy_data_t *policy) +{ + const struct cl_lock_descr *d = &lock->cll_descr; + + osc_index2policy(policy, d->cld_obj, d->cld_start, d->cld_end); +} + +static int osc_enq2ldlm_flags(__u32 enqflags) +{ + int result = 0; + + LASSERT((enqflags & ~(CEF_NONBLOCK|CEF_ASYNC|CEF_DISCARD_DATA)) == 0); + + if (enqflags & CEF_NONBLOCK) + result |= LDLM_FL_BLOCK_NOWAIT; + if (enqflags & CEF_ASYNC) + result |= LDLM_FL_HAS_INTENT; + if (enqflags & CEF_DISCARD_DATA) + result |= LDLM_AST_DISCARD_DATA; + return result; +} + +/** + * Global spin-lock protecting consistency of ldlm_lock::l_ast_data + * pointers. Initialized in osc_init(). + */ +spinlock_t osc_ast_guard; + +static struct osc_lock *osc_ast_data_get(struct ldlm_lock *dlm_lock) +{ + struct osc_lock *olck; + + lock_res_and_lock(dlm_lock); + spin_lock(&osc_ast_guard); + olck = dlm_lock->l_ast_data; + if (olck != NULL) { + struct cl_lock *lock = olck->ols_cl.cls_lock; + /* + * If osc_lock holds a reference on ldlm lock, return it even + * when cl_lock is in CLS_FREEING state. This way + * + * osc_ast_data_get(dlmlock) == NULL + * + * guarantees that all osc references on dlmlock were + * released. osc_dlm_blocking_ast0() relies on that. + */ + if (lock->cll_state < CLS_FREEING || olck->ols_has_ref) { + cl_lock_get_trust(lock); + lu_ref_add_atomic(&lock->cll_reference, + "ast", cfs_current()); + } else + olck = NULL; + } + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(dlm_lock); + return olck; +} + +static void osc_ast_data_put(const struct lu_env *env, struct osc_lock *olck) +{ + struct cl_lock *lock; + + lock = olck->ols_cl.cls_lock; + lu_ref_del(&lock->cll_reference, "ast", cfs_current()); + cl_lock_put(env, lock); +} + +static void osc_lock_to_lockless(struct osc_lock *olck) +{ + struct cl_lock_slice *slice = &olck->ols_cl; + struct cl_lock *lock = slice->cls_lock; + + /* + * TODO: Discover which locks we need to convert the lock + * to ldlmlockless. + */ + LASSERT(cl_lock_is_mutexed(lock)); + slice->cls_ops = &osc_lock_lockless_ops; +} + +/** + * Updates object attributes from a lock value block (lvb) received together + * with the DLM lock reply from the server. 
Copy of osc_update_enqueue() + * logic. + * + * This can be optimized to not update attributes when lock is a result of a + * local match. + */ +static void osc_lock_lvb_update(const struct lu_env *env, struct osc_lock *olck, + int rc) +{ + struct ost_lvb *lvb; + struct cl_object *obj; + struct lov_oinfo *oinfo; + struct cl_attr *attr; + unsigned valid; + + ENTRY; + + if (!(olck->ols_flags & LDLM_FL_LVB_READY)) { + EXIT; + return; + } + + lvb = &olck->ols_lvb; + obj = olck->ols_cl.cls_obj; + oinfo = cl2osc(obj)->oo_oinfo; + attr = &osc_env_info(env)->oti_attr; + valid = CAT_BLOCKS | CAT_ATIME | CAT_CTIME | CAT_MTIME | CAT_SIZE; + cl_lvb2attr(attr, lvb); + + cl_object_attr_lock(obj); + if (rc == 0) { + struct ldlm_lock *dlmlock; + __u64 size; + + dlmlock = olck->ols_lock; + LASSERT(dlmlock != NULL); + + size = lvb->lvb_size; + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! */ + if (size > dlmlock->l_policy_data.l_extent.end) + size = dlmlock->l_policy_data.l_extent.end + 1; + if (size >= oinfo->loi_kms) { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss="LPU64 + ", kms="LPU64, lvb->lvb_size, size); + valid |= CAT_KMS; + attr->cat_kms = size; + } else { + LDLM_DEBUG(dlmlock, "lock acquired, setting rss=" + LPU64"; leaving kms="LPU64", end="LPU64, + lvb->lvb_size, oinfo->loi_kms, + dlmlock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match(dlmlock); + } else if (rc == -ENAVAIL && olck->ols_glimpse) { + CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" + " kms="LPU64"\n", lvb->lvb_size, oinfo->loi_kms); + } else + valid = 0; + + if (valid != 0) + cl_object_attr_set(env, obj, attr, valid); + + cl_object_attr_unlock(obj); + + EXIT; +} + +static void osc_lock_granted(const struct lu_env *env, struct osc_lock *olck, + struct ldlm_lock *dlmlock, int rc) +{ + struct ldlm_extent *ext; + struct cl_lock *lock; + struct cl_lock_descr *descr; + + LASSERT(dlmlock->l_granted_mode == dlmlock->l_req_mode); + + ENTRY; + if (olck->ols_state != OLS_GRANTED) { + lock = olck->ols_cl.cls_lock; + ext = &dlmlock->l_policy_data.l_extent; + descr = &osc_env_info(env)->oti_descr; + descr->cld_obj = lock->cll_descr.cld_obj; + + /* XXX check that ->l_granted_mode is valid. */ + descr->cld_mode = osc_ldlm2cl_lock(dlmlock->l_granted_mode); + descr->cld_start = cl_index(descr->cld_obj, ext->start); + descr->cld_end = cl_index(descr->cld_obj, ext->end); + /* + * tell upper layers the extent of the lock that was actually + * granted + */ + cl_lock_modify(env, lock, descr); + LINVRNT(osc_lock_invariant(olck)); + olck->ols_state = OLS_GRANTED; + osc_lock_lvb_update(env, olck, rc); + cl_lock_signal(env, lock); + } + EXIT; +} + +static void osc_lock_upcall0(const struct lu_env *env, struct osc_lock *olck) + +{ + struct ldlm_lock *dlmlock; + + ENTRY; + + dlmlock = ldlm_handle2lock_long(&olck->ols_handle, 0); + LASSERT(dlmlock != NULL); + + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(dlmlock->l_ast_data == olck); + LASSERT(olck->ols_lock == NULL); + olck->ols_lock = dlmlock; + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(dlmlock); + + /* + * Lock might be not yet granted. In this case, completion ast + * (osc_ldlm_completion_ast()) comes later and finishes lock + * granting. + */ + if (dlmlock->l_granted_mode == dlmlock->l_req_mode) + osc_lock_granted(env, olck, dlmlock, 0); + /* + * osc_enqueue_interpret() decrefs asynchronous locks, counter + * this. 
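[editor's note] The known-minimal-size (kms) update in osc_lock_lvb_update() above follows one rule: a lock on [start, end] can vouch for at most end + 1 bytes of the file. The same clamping arithmetic as a standalone sketch (function name hypothetical, 64-bit sizes assumed):

#include <stdio.h>
#include <stdint.h>

/* Sketch of the kms clamp in osc_lock_lvb_update(): extend kms up to the
 * end of the granted extent and no further; [x, y] covers y + 1 bytes. */
static uint64_t clamp_kms(uint64_t lvb_size, uint64_t ext_end,
                          uint64_t old_kms)
{
        uint64_t size = lvb_size;

        if (size > ext_end)
                size = ext_end + 1;
        return size >= old_kms ? size : old_kms;
}

int main(void)
{
        /* a lock on [0, 4095] cannot prove more than 4096 bytes exist */
        printf("%llu\n", (unsigned long long)clamp_kms(10000, 4095, 0));
        /* prints 4096 */
        return 0;
}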
+ */ + ldlm_lock_addref(&olck->ols_handle, olck->ols_einfo.ei_mode); + olck->ols_hold = olck->ols_has_ref = 1; + + /* lock reference taken by ldlm_handle2lock_long() is owned by + * osc_lock and released in osc_lock_detach() */ + lu_ref_add(&dlmlock->l_reference, "osc_lock", olck); +} + +/** + * Lock upcall function that is executed either when a reply to ENQUEUE rpc is + * received from a server, or after osc_enqueue_base() matched a local DLM + * lock. + */ +static int osc_lock_upcall(void *cookie, int errcode) +{ + struct osc_lock *olck = cookie; + struct cl_lock_slice *slice = &olck->ols_cl; + struct cl_lock *lock = slice->cls_lock; + struct lu_env *env; + + int refcheck; + + ENTRY; + /* + * XXX environment should be created in ptlrpcd. + */ + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + int rc; + + cl_lock_mutex_get(env, lock); + + LASSERT(lock->cll_state >= CLS_QUEUING); + if (olck->ols_state == OLS_ENQUEUED) { + olck->ols_state = OLS_UPCALL_RECEIVED; + rc = ldlm_error2errno(errcode); + } else if (olck->ols_state == OLS_CANCELLED) { + rc = -EIO; + } else { + CERROR("Impossible state: %i\n", olck->ols_state); + LBUG(); + } + if (rc) { + struct ldlm_lock *dlmlock; + + dlmlock = ldlm_handle2lock(&olck->ols_handle); + if (dlmlock != NULL) { + lock_res_and_lock(dlmlock); + spin_lock(&osc_ast_guard); + LASSERT(olck->ols_lock == NULL); + dlmlock->l_ast_data = NULL; + olck->ols_handle.cookie = 0ULL; + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(dlmlock); + LDLM_LOCK_PUT(dlmlock); + } + } else { + if (olck->ols_glimpse) + olck->ols_glimpse = 0; + osc_lock_upcall0(env, olck); + } + + /* Error handling, some errors are tolerable. */ + if (olck->ols_locklessable && rc == -EUSERS) { + /* This is a tolerable error, turn this lock into + * lockless lock. + */ + osc_object_set_contended(cl2osc(slice->cls_obj)); + LASSERT(slice->cls_ops == &osc_lock_ops); + + /* Change this lock to ldlmlock-less lock. */ + osc_lock_to_lockless(olck); + olck->ols_state = OLS_GRANTED; + rc = 0; + } else if (olck->ols_glimpse && rc == -ENAVAIL) { + osc_lock_lvb_update(env, olck, rc); + cl_lock_delete(env, lock); + /* Hide the error. */ + rc = 0; + } + + if (rc == 0) + /* on error, lock was signaled by cl_lock_error() */ + cl_lock_signal(env, lock); + else + cl_lock_error(env, lock, rc); + + cl_lock_mutex_put(env, lock); + + /* release cookie reference, acquired by osc_lock_enqueue() */ + lu_ref_del(&lock->cll_reference, "upcall", lock); + cl_lock_put(env, lock); + cl_env_put(env, &refcheck); + } else + /* should never happen, similar to osc_ldlm_blocking_ast(). */ + LBUG(); + RETURN(errcode); +} + +/** + * Core of osc_dlm_blocking_ast() logic. + */ +static void osc_lock_blocking(const struct lu_env *env, + struct ldlm_lock *dlmlock, + struct osc_lock *olck, int blocking) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LASSERT(olck->ols_lock == dlmlock); + CLASSERT(OLS_BLOCKED < OLS_CANCELLED); + LASSERT(!osc_lock_is_lockless(olck)); + + if (olck->ols_hold) + /* + * Lock might be still addref-ed here, if e.g., blocking ast + * is sent for a failed lock. + */ + osc_lock_unuse(env, &olck->ols_cl); + + if (blocking && olck->ols_state < OLS_BLOCKED) + /* + * Move osc_lock into OLS_BLOCKED before canceling the lock, + * because it recursively re-enters osc_lock_blocking(), with + * the state set to OLS_CANCELLED. + */ + olck->ols_state = OLS_BLOCKED; + /* + * cancel and destroy lock at least once no matter how blocking ast is + * entered (see comment above osc_ldlm_blocking_ast() for use + * cases). 
cl_lock_cancel() and cl_lock_delete() are idempotent. + */ + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); +} + +/** + * Helper for osc_dlm_blocking_ast() handling discrepancies between cl_lock + * and ldlm_lock caches. + */ +static int osc_dlm_blocking_ast0(const struct lu_env *env, + struct ldlm_lock *dlmlock, + void *data, int flag) +{ + struct osc_lock *olck; + struct cl_lock *lock; + int result; + int cancel; + + LASSERT(flag == LDLM_CB_BLOCKING || flag == LDLM_CB_CANCELING); + + cancel = 0; + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + LINVRNT(osc_lock_invariant(olck)); + if (olck->ols_ast_wait) { + /* wake up osc_lock_use() */ + cl_lock_signal(env, lock); + olck->ols_ast_wait = 0; + } + /* + * Lock might have been canceled while this thread was + * sleeping for lock mutex, but olck is pinned in memory. + */ + if (olck == dlmlock->l_ast_data) { + /* + * NOTE: DLM sends blocking AST's for failed locks + * (that are still in pre-OLS_GRANTED state) + * too, and they have to be canceled otherwise + * DLM lock is never destroyed and stuck in + * the memory. + * + * Alternatively, ldlm_cli_cancel() can be + * called here directly for osc_locks with + * ols_state < OLS_GRANTED to maintain an + * invariant that ->clo_cancel() is only called + * for locks that were granted. + */ + LASSERT(data == olck); + osc_lock_blocking(env, dlmlock, + olck, flag == LDLM_CB_BLOCKING); + } else + cancel = 1; + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + } else + /* + * DLM lock exists, but there is no cl_lock attached to it. + * This is a `normal' race. cl_object and its cl_lock's can be + * removed by memory pressure, together with all pages. + */ + cancel = (flag == LDLM_CB_BLOCKING); + + if (cancel) { + struct lustre_handle *lockh; + + lockh = &osc_env_info(env)->oti_handle; + ldlm_lock2handle(dlmlock, lockh); + result = ldlm_cli_cancel(lockh); + } else + result = 0; + return result; +} + +/** + * Blocking ast invoked by ldlm when dlm lock is either blocking progress of + * some other lock, or is canceled. This function is installed as a + * ldlm_lock::l_blocking_ast() for client extent locks. + * + * Control flow is tricky, because ldlm uses the same call-back + * (ldlm_lock::l_blocking_ast()) for both blocking and cancellation ast's. + * + * \param dlmlock lock for which ast occurred. + * + * \param new description of a conflicting lock in case of blocking ast. + * + * \param data value of dlmlock->l_ast_data + * + * \param flag LDLM_CB_BLOCKING or LDLM_CB_CANCELING. Used to distinguish + * cancellation and blocking ast's. + * + * Possible use cases: + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) to cancel + * lock due to lock lru pressure, or explicit user request to purge + * locks. + * + * - ldlm calls dlmlock->l_blocking_ast(..., LDLM_CB_BLOCKING) to notify + * us that dlmlock conflicts with another lock that some client is + * enqueing. Lock is canceled. + * + * - cl_lock_cancel() is called. osc_lock_cancel() calls + * ldlm_cli_cancel() that calls + * + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + * recursively entering osc_ldlm_blocking_ast(). 
+ * + * - client cancels lock voluntary (e.g., as a part of early cancellation): + * + * cl_lock_cancel()-> + * osc_lock_cancel()-> + * ldlm_cli_cancel()-> + * dlmlock->l_blocking_ast(..., LDLM_CB_CANCELING) + * + */ +static int osc_ldlm_blocking_ast(struct ldlm_lock *dlmlock, + struct ldlm_lock_desc *new, void *data, + int flag) +{ + struct lu_env *env; + struct cl_env_nest nest; + int result; + + /* + * This can be called in the context of outer IO, e.g., + * + * cl_enqueue()->... + * ->osc_enqueue_base()->... + * ->ldlm_prep_elc_req()->... + * ->ldlm_cancel_callback()->... + * ->osc_ldlm_blocking_ast() + * + * new environment has to be created to not corrupt outer context. + */ + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) + result = osc_dlm_blocking_ast0(env, dlmlock, data, flag); + else { + result = PTR_ERR(env); + /* + * XXX This should never happen, as cl_lock is + * stuck. Pre-allocated environment a la vvp_inode_fini_env + * should be used. + */ + LBUG(); + } + if (result != 0) { + if (result == -ENODATA) + result = 0; + else + CERROR("BAST failed: %d\n", result); + } + cl_env_nested_put(&nest, env); + return result; +} + +static int osc_ldlm_completion_ast(struct ldlm_lock *dlmlock, + int flags, void *data) +{ + struct lu_env *env; + void *env_cookie; + struct osc_lock *olck; + struct cl_lock *lock; + int refcheck; + int result; + int dlmrc; + + /* first, do dlm part of the work */ + dlmrc = ldlm_completion_ast_async(dlmlock, flags, data); + /* then, notify cl_lock */ + env_cookie = cl_env_reenter(); + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + /* + * ldlm_handle_cp_callback() copied LVB from request + * to lock->l_lvb_data, store it in osc_lock. + */ + LASSERT(dlmlock->l_lvb_data != NULL); + olck->ols_lvb = *(struct ost_lvb *)dlmlock->l_lvb_data; + if (olck->ols_lock == NULL) + /* + * upcall (osc_lock_upcall()) hasn't yet been + * called. Do nothing now, upcall will bind + * olck to dlmlock and signal the waiters. + * + * This maintains an invariant that osc_lock + * and ldlm_lock are always bound when + * osc_lock is in OLS_GRANTED state. + */ + ; + else if (dlmlock->l_granted_mode != LCK_MINMODE) + osc_lock_granted(env, olck, dlmlock, dlmrc); + if (dlmrc != 0) + cl_lock_error(env, lock, dlmrc); + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + result = 0; + } else + result = -ELDLM_NO_LOCK_DATA; + cl_env_put(env, &refcheck); + } else + result = PTR_ERR(env); + cl_env_reexit(env_cookie); + return dlmrc ?: result; +} + +static int osc_ldlm_glimpse_ast(struct ldlm_lock *dlmlock, void *data) +{ + struct ptlrpc_request *req = data; + struct osc_lock *olck; + struct cl_lock *lock; + struct cl_object *obj; + struct lu_env *env; + struct ost_lvb *lvb; + struct req_capsule *cap; + int result; + int refcheck; + + LASSERT(lustre_msg_get_opc(req->rq_reqmsg) == LDLM_GL_CALLBACK); + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* + * osc_ast_data_get() has to go after environment is + * allocated, because osc_ast_data() acquires a + * reference to a lock, and it can only be released in + * environment. 
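[editor's note] Because ldlm reuses one callback for both blocking and cancellation notifications, the flag argument alone selects the action, and the cancel path can re-enter the same callback, as the use-case list above describes. A toy dispatcher illustrating the shape of that contract (constants and handlers are hypothetical stand-ins, not the real ldlm API):

#include <stdio.h>

enum { CB_BLOCKING = 1, CB_CANCELING = 2 };   /* stand-ins for LDLM_CB_* */

/* One entry point, two meanings: the callback is invoked first to announce
 * a conflict (blocking) and again when the lock is torn down (canceling),
 * possibly recursively from the cancel path itself. */
static int toy_blocking_ast(int flag)
{
        switch (flag) {
        case CB_BLOCKING:
                printf("conflict seen: start cancel\n");
                return toy_blocking_ast(CB_CANCELING);  /* mimic re-entry */
        case CB_CANCELING:
                printf("lock canceled: detach and clean up\n");
                return 0;
        default:
                return -1;
        }
}

int main(void)
{
        return toy_blocking_ast(CB_BLOCKING);
}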
+ */ + olck = osc_ast_data_get(dlmlock); + if (olck != NULL) { + lock = olck->ols_cl.cls_lock; + cl_lock_mutex_get(env, lock); + cap = &req->rq_pill; + req_capsule_extend(cap, &RQF_LDLM_GL_CALLBACK); + req_capsule_set_size(cap, &RMF_DLM_LVB, RCL_SERVER, + sizeof *lvb); + result = req_capsule_server_pack(cap); + if (result == 0) { + lvb = req_capsule_server_get(cap, &RMF_DLM_LVB); + obj = lock->cll_descr.cld_obj; + result = cl_object_glimpse(env, obj, lvb); + } + cl_lock_mutex_put(env, lock); + osc_ast_data_put(env, olck); + } else { + /* + * These errors are normal races, so we don't want to + * fill the console with messages by calling + * ptlrpc_error() + */ + lustre_pack_reply(req, 1, NULL, NULL); + result = -ELDLM_NO_LOCK_DATA; + } + cl_env_put(env, &refcheck); + } else + result = PTR_ERR(env); + req->rq_status = result; + return result; +} + +static unsigned long osc_lock_weigh(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + /* + * don't need to grab coh_page_guard since we don't care the exact # + * of pages.. + */ + return cl_object_header(slice->cls_obj)->coh_pages; +} + +/** + * Get the weight of dlm lock for early cancellation. + * + * XXX: it should return the pages covered by this \a dlmlock. + */ +static unsigned long osc_ldlm_weigh_ast(struct ldlm_lock *dlmlock) +{ + struct lu_env *env; + int refcheck; + void *cookie; + struct osc_lock *lock; + struct cl_lock *cll; + unsigned long weight; + ENTRY; + + might_sleep(); + cookie = cl_env_reenter(); + /* + * osc_ldlm_weigh_ast has a complex context since it might be called + * because of lock canceling, or from user's input. We have to make + * a new environment for it. Probably it is implementation safe to use + * the upper context because cl_lock_put don't modify environment + * variables. But in case of .. + */ + env = cl_env_get(&refcheck); + if (IS_ERR(env)) { + /* Mostly because lack of memory, tend to eliminate this lock*/ + cl_env_reexit(cookie); + RETURN(0); + } + + LASSERT(dlmlock->l_resource->lr_type == LDLM_EXTENT); + lock = osc_ast_data_get(dlmlock); + if (lock == NULL) { + /* cl_lock was destroyed because of memory pressure. + * It is much reasonable to assign this type of lock + * a lower cost. + */ + GOTO(out, weight = 0); + } + + cll = lock->ols_cl.cls_lock; + cl_lock_mutex_get(env, cll); + weight = cl_lock_weigh(env, cll); + cl_lock_mutex_put(env, cll); + osc_ast_data_put(env, lock); + EXIT; + +out: + cl_env_put(env, &refcheck); + cl_env_reexit(cookie); + return weight; +} + +static void osc_lock_build_einfo(const struct lu_env *env, + const struct cl_lock *clock, + struct osc_lock *lock, + struct ldlm_enqueue_info *einfo) +{ + enum cl_lock_mode mode; + + mode = clock->cll_descr.cld_mode; + if (mode == CLM_PHANTOM) + /* + * For now, enqueue all glimpse locks in read mode. In the + * future, client might choose to enqueue LCK_PW lock for + * glimpse on a file opened for write. + */ + mode = CLM_READ; + + einfo->ei_type = LDLM_EXTENT; + einfo->ei_mode = osc_cl_lock2ldlm(mode); + einfo->ei_cb_bl = osc_ldlm_blocking_ast; + einfo->ei_cb_cp = osc_ldlm_completion_ast; + einfo->ei_cb_gl = osc_ldlm_glimpse_ast; + einfo->ei_cb_wg = osc_ldlm_weigh_ast; + einfo->ei_cbdata = lock; /* value to be put into ->l_ast_data */ +} + +/** + * Cancels \a conflict lock and waits until it reached CLS_FREEING state. This + * is called as a part of enqueuing to cancel conflicting locks early. + * + * \retval 0: success, \a conflict was cancelled and destroyed. 
+ * + * \retval CLO_REPEAT: \a conflict was cancelled, but \a lock mutex was + * released in the process. Repeat enqueing. + * + * \retval -EWOULDBLOCK: \a conflict cannot be cancelled immediately, and + * either \a lock is non-blocking, or current thread + * holds other locks, that prevent it from waiting + * for cancel to complete. + * + * \retval -ve: other error, including -EINTR. + * + */ +static int osc_lock_cancel_wait(const struct lu_env *env, struct cl_lock *lock, + struct cl_lock *conflict, int canwait) +{ + int rc; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(cl_lock_is_mutexed(conflict)); + + rc = 0; + if (conflict->cll_state != CLS_FREEING) { + cl_lock_cancel(env, conflict); + cl_lock_delete(env, conflict); + if (conflict->cll_flags & (CLF_CANCELPEND|CLF_DOOMED)) { + rc = -EWOULDBLOCK; + if (cl_lock_nr_mutexed(env) > 2) + /* + * If mutices of locks other than @lock and + * @scan are held by the current thread, it + * cannot wait on @scan state change in a + * dead-lock safe matter, so simply skip early + * cancellation in this case. + * + * This means that early cancellation doesn't + * work when there is even slight mutex + * contention, as top-lock's mutex is usually + * held at this time. + */ + ; + else if (canwait) { + /* Waiting for @scan to be destroyed */ + cl_lock_mutex_put(env, lock); + do { + rc = cl_lock_state_wait(env, conflict); + } while (!rc && + conflict->cll_state < CLS_FREEING); + /* mutex was released, repeat enqueue. */ + rc = rc ?: CLO_REPEAT; + cl_lock_mutex_get(env, lock); + } + } + LASSERT(ergo(!rc, conflict->cll_state == CLS_FREEING)); + CDEBUG(D_INFO, "lock %p was %s freed now, rc (%d)\n", + conflict, rc ? "not":"", rc); + } + return rc; +} + +/** + * Cancel all conflicting locks and wait for them to be destroyed. + * + * This function is used for two purposes: + * + * - early cancel all conflicting locks before starting IO, and + * + * - guarantee that pages added to the page cache by lockless IO are never + * covered by locks other than lockless IO lock, and, hence, are not + * visible to other threads. + */ +static int osc_lock_enqueue_wait(const struct lu_env *env, + const struct osc_lock *olck) +{ + struct cl_lock *lock = olck->ols_cl.cls_lock; + struct cl_lock_descr *descr = &lock->cll_descr; + struct cl_object_header *hdr = cl_object_header(descr->cld_obj); + struct cl_lock_closure *closure = &osc_env_info(env)->oti_closure; + struct cl_lock *scan; + struct cl_lock *temp; + int lockless = osc_lock_is_lockless(olck); + int rc = 0; + int canwait; + int stop; + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_QUEUING); + + /* + * XXX This function could be sped up if we had asynchronous + * cancellation. + */ + + canwait = + !(olck->ols_flags & LDLM_FL_BLOCK_NOWAIT) && + cl_lock_nr_mutexed(env) == 1; + cl_lock_closure_init(env, closure, lock, canwait); + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry_safe(scan, temp, &hdr->coh_locks, cll_linkage) { + if (scan == lock) + continue; + + if (scan->cll_state < CLS_QUEUING || + scan->cll_state == CLS_FREEING || + scan->cll_descr.cld_start > descr->cld_end || + scan->cll_descr.cld_end < descr->cld_start) + continue; + + /* overlapped and living locks. */ + /* A tricky case for lockless pages: + * We need to cancel the compatible locks if we're enqueuing + * a lockless lock, for example: + * imagine that client has PR lock on [0, 1000], and thread T0 + * is doing lockless IO in [500, 1500] region. 
Concurrent + * thread T1 can see lockless data in [500, 1000], which is + * wrong, because these data are possibly stale. + */ + if (!lockless && cl_lock_compatible(scan, lock)) + continue; + + /* Now @scan is conflicting with @lock, this means current + * thread have to sleep for @scan being destroyed. */ + cl_lock_get_trust(scan); + if (&temp->cll_linkage != &hdr->coh_locks) + cl_lock_get_trust(temp); + spin_unlock(&hdr->coh_lock_guard); + lu_ref_add(&scan->cll_reference, "cancel-wait", lock); + + LASSERT(list_empty(&closure->clc_list)); + rc = cl_lock_closure_build(env, scan, closure); + if (rc == 0) { + rc = osc_lock_cancel_wait(env, lock, scan, canwait); + cl_lock_disclosure(env, closure); + if (rc == -EWOULDBLOCK) + rc = 0; + } + if (rc == CLO_REPEAT && !canwait) + /* cannot wait... no early cancellation. */ + rc = 0; + + lu_ref_del(&scan->cll_reference, "cancel-wait", lock); + cl_lock_put(env, scan); + spin_lock(&hdr->coh_lock_guard); + /* + * Lock list could have been modified, while spin-lock was + * released. Check that it is safe to continue. + */ + stop = list_empty(&temp->cll_linkage); + if (&temp->cll_linkage != &hdr->coh_locks) + cl_lock_put(env, temp); + if (stop || rc != 0) + break; + } + spin_unlock(&hdr->coh_lock_guard); + cl_lock_closure_fini(closure); + RETURN(rc); +} + +/** + * Deadlock avoidance for osc_lock_enqueue(). Consider following scenario: + * + * - Thread0: obtains PR:[0, 10]. Lock is busy. + * + * - Thread1: enqueues PW:[5, 50]. Blocking ast is sent to + * PR:[0, 10], but cancellation of busy lock is postponed. + * + * - Thread0: enqueue PR:[30, 40]. Lock is locally matched to + * PW:[5, 50], and thread0 waits for the lock completion never + * releasing PR:[0, 10]---deadlock. + * + * The second PR lock can be glimpse (it is to deal with that situation that + * ll_glimpse_size() has second argument, preventing local match of + * not-yet-granted locks, see bug 10295). Similar situation is possible in the + * case of memory mapped user level buffer. + * + * To prevent this we can detect a situation when current "thread" or "io" + * already holds a lock on this object and either add LDLM_FL_BLOCK_GRANTED to + * the ols->ols_flags, or prevent local match with PW locks. + */ +static int osc_deadlock_is_possible(const struct lu_env *env, + struct cl_lock *lock) +{ + struct cl_object *obj; + struct cl_object_header *head; + struct cl_lock *scan; + struct osc_io *oio; + + int result; + + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + + oio = osc_env_io(env); + obj = lock->cll_descr.cld_obj; + head = cl_object_header(obj); + + result = 0; + spin_lock(&head->coh_lock_guard); + list_for_each_entry(scan, &head->coh_locks, cll_linkage) { + if (scan != lock) { + struct osc_lock *oscan; + + oscan = osc_lock_at(scan); + LASSERT(oscan != NULL); + if (oscan->ols_owner == oio) { + result = 1; + break; + } + } + } + spin_unlock(&head->coh_lock_guard); + RETURN(result); +} + +/** + * Implementation of cl_lock_operations::clo_enqueue() method for osc + * layer. This initiates ldlm enqueue: + * + * - checks for possible dead-lock conditions (osc_deadlock_is_possible()); + * + * - cancels conflicting locks early (osc_lock_enqueue_wait()); + * + * - calls osc_enqueue_base() to do actual enqueue. + * + * osc_enqueue_base() is supplied with an upcall function that is executed + * when lock is received either after a local cached ldlm lock is matched, or + * when a reply from the server is received. + * + * This function does not wait for the network communication to complete. 
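[editor's note] The conflict scan in osc_lock_enqueue_wait() above skips a lock as soon as its extent is disjoint from the one being enqueued; two closed intervals overlap iff neither starts past the other's end. A tiny sketch of that predicate, reusing the PR [0, 1000] / lockless [500, 1500] example from the comment:

#include <assert.h>
#include <stdint.h>

/* Closed-interval overlap test matching the skip condition in
 * osc_lock_enqueue_wait(): disjoint iff one starts past the other's end. */
static int extents_overlap(uint64_t a_start, uint64_t a_end,
                           uint64_t b_start, uint64_t b_end)
{
        return !(a_start > b_end || a_end < b_start);
}

int main(void)
{
        assert(extents_overlap(0, 1000, 500, 1500));  /* the case above */
        assert(!extents_overlap(0, 10, 30, 40));
        return 0;
}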
+ */ +static int osc_lock_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *_, __u32 enqflags) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = ols->ols_cl.cls_lock; + struct osc_object *obj = cl2osc(slice->cls_obj); + struct osc_thread_info *info = osc_env_info(env); + struct ldlm_res_id *resname = &info->oti_resname; + ldlm_policy_data_t *policy = &info->oti_policy; + struct ldlm_enqueue_info *einfo = &ols->ols_einfo; + int result; + ENTRY; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_QUEUING); + LASSERT(ols->ols_state == OLS_NEW); + + osc_lock_build_res(env, obj, resname); + osc_lock_build_policy(env, lock, policy); + ols->ols_flags = osc_enq2ldlm_flags(enqflags); + if (ols->ols_locklessable) + ols->ols_flags |= LDLM_FL_DENY_ON_CONTENTION; + if (osc_deadlock_is_possible(env, lock)) + ols->ols_flags |= LDLM_FL_BLOCK_GRANTED; + if (ols->ols_flags & LDLM_FL_HAS_INTENT) + ols->ols_glimpse = 1; + + result = osc_lock_enqueue_wait(env, ols); + if (result == 0) { + /* a reference for lock, passed as an upcall cookie */ + cl_lock_get(lock); + lu_ref_add(&lock->cll_reference, "upcall", lock); + ols->ols_state = OLS_ENQUEUED; + + /* + * XXX: this is possible blocking point as + * ldlm_lock_match(LDLM_FL_LVB_READY) waits for + * LDLM_CP_CALLBACK. + */ + result = osc_enqueue_base(osc_export(obj), resname, + &ols->ols_flags, policy, + &ols->ols_lvb, + obj->oo_oinfo->loi_kms_valid, + osc_lock_upcall, + ols, einfo, &ols->ols_handle, + PTLRPCD_SET, 1); + if (result != 0) { + lu_ref_del(&lock->cll_reference, "upcall", lock); + cl_lock_put(env, lock); + } + } + + RETURN(result); +} + +static int osc_lock_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + if (olck->ols_glimpse && olck->ols_state >= OLS_UPCALL_RECEIVED) + return 0; + + LASSERT(equi(olck->ols_state >= OLS_UPCALL_RECEIVED && + lock->cll_error == 0, olck->ols_lock != NULL)); + + return lock->cll_error ?: olck->ols_state >= OLS_GRANTED ? 0 : CLO_WAIT; +} + +/** + * An implementation of cl_lock_operations::clo_use() method that pins cached + * lock. + */ +static int osc_lock_use(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + int rc; + + LASSERT(!olck->ols_hold); + /* + * Atomically check for LDLM_FL_CBPENDING and addref a lock if this + * flag is not set. This protects us from a concurrent blocking ast. + */ + rc = ldlm_lock_addref_try(&olck->ols_handle, olck->ols_einfo.ei_mode); + if (rc == 0) { + olck->ols_hold = olck->ols_has_ref = 1; + olck->ols_state = OLS_GRANTED; + } else { + struct cl_lock *lock; + + /* + * Lock is being cancelled somewhere within + * ldlm_handle_bl_callback(): LDLM_FL_CBPENDING is already + * set, but osc_ldlm_blocking_ast() hasn't yet acquired + * cl_lock mutex. 
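[editor's note] osc_lock_use() above relies on an atomic "add a reference unless cancellation is already pending" primitive; when the try fails, the caller parks and waits for the blocking AST to signal it. A minimal sketch of that test-and-pin shape under a lock (pthread mutex standing in; this is not the real ldlm_lock_addref_try()):

#include <pthread.h>
#include <assert.h>

struct toy_lock {
        pthread_mutex_t guard;
        int             cb_pending;   /* cancellation already under way? */
        int             refs;
};

/* Sketch of the ldlm_lock_addref_try() contract assumed by osc_lock_use():
 * pin the lock only if no blocking callback is pending, atomically. */
static int toy_addref_try(struct toy_lock *lk)
{
        int rc;

        pthread_mutex_lock(&lk->guard);
        if (lk->cb_pending) {
                rc = -1;              /* caller must wait, cf. CLO_WAIT */
        } else {
                lk->refs++;
                rc = 0;
        }
        pthread_mutex_unlock(&lk->guard);
        return rc;
}

int main(void)
{
        struct toy_lock lk = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

        assert(toy_addref_try(&lk) == 0 && lk.refs == 1);
        lk.cb_pending = 1;
        assert(toy_addref_try(&lk) == -1 && lk.refs == 1);
        return 0;
}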
+ */ + lock = slice->cls_lock; + LASSERT(lock->cll_state == CLS_CACHED); + LASSERT(lock->cll_users > 0); + LASSERT(olck->ols_lock->l_flags & LDLM_FL_CBPENDING); + /* set a flag for osc_dlm_blocking_ast0() to signal the + * lock.*/ + olck->ols_ast_wait = 1; + rc = CLO_WAIT; + } + return rc; +} + +static int osc_lock_flush(struct osc_lock *ols, int discard) +{ + struct cl_lock *lock = ols->ols_cl.cls_lock; + struct cl_env_nest nest; + struct lu_env *env; + int result = 0; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + result = cl_lock_page_out(env, lock, discard); + cl_env_nested_put(&nest, env); + } else + result = PTR_ERR(env); + if (result == 0) + ols->ols_flush = 1; + return result; +} + +/** + * Implements cl_lock_operations::clo_cancel() method for osc layer. This is + * called (as part of cl_lock_cancel()) when lock is canceled either voluntary + * (LRU pressure, early cancellation, umount, etc.) or due to the conflict + * with some other lock some where in the cluster. This function does the + * following: + * + * - invalidates all pages protected by this lock (after sending dirty + * ones to the server, as necessary); + * + * - decref's underlying ldlm lock; + * + * - cancels ldlm lock (ldlm_cli_cancel()). + */ +static void osc_lock_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct cl_lock *lock = slice->cls_lock; + struct osc_lock *olck = cl2osc_lock(slice); + struct ldlm_lock *dlmlock = olck->ols_lock; + int result; + int discard; + + LASSERT(cl_lock_is_mutexed(lock)); + LINVRNT(osc_lock_invariant(olck)); + + if (dlmlock != NULL) { + discard = dlmlock->l_flags & LDLM_FL_DISCARD_DATA; + result = osc_lock_flush(olck, discard); + if (olck->ols_hold) + osc_lock_unuse(env, slice); + LASSERT(dlmlock->l_readers == 0 && dlmlock->l_writers == 0); + result = ldlm_cli_cancel(&olck->ols_handle); + if (result < 0) + CL_LOCK_DEBUG(D_ERROR, env, lock, + "lock %p cancel failure with error(%d)\n", + lock, result); + } + olck->ols_state = OLS_CANCELLED; + osc_lock_detach(env, olck); +} + +void cl_lock_page_list_fixup(const struct lu_env *env, + struct cl_io *io, struct cl_lock *lock, + struct cl_page_list *queue); + +#ifdef INVARIANT_CHECK +/** + * Returns true iff there are pages under \a olck not protected by other + * locks. 
+ */ +static int osc_lock_has_pages(struct osc_lock *olck) +{ + struct cl_lock *lock; + struct cl_lock_descr *descr; + struct cl_object *obj; + struct osc_object *oob; + struct cl_page_list *plist; + struct cl_page *page; + struct cl_env_nest nest; + struct cl_io *io; + struct lu_env *env; + int result; + + env = cl_env_nested_get(&nest); + if (!IS_ERR(env)) { + obj = olck->ols_cl.cls_obj; + oob = cl2osc(obj); + io = &oob->oo_debug_io; + lock = olck->ols_cl.cls_lock; + descr = &lock->cll_descr; + plist = &osc_env_info(env)->oti_plist; + cl_page_list_init(plist); + + mutex_lock(&oob->oo_debug_mutex); + + io->ci_obj = cl_object_top(obj); + cl_io_init(env, io, CIT_MISC, io->ci_obj); + cl_page_gang_lookup(env, obj, io, + descr->cld_start, descr->cld_end, plist); + cl_lock_page_list_fixup(env, io, lock, plist); + if (plist->pl_nr > 0) { + CL_LOCK_DEBUG(D_ERROR, env, lock, "still has pages\n"); + cl_page_list_for_each(page, plist) + CL_PAGE_DEBUG(D_ERROR, env, page, "\n"); + } + result = plist->pl_nr > 0; + cl_page_list_disown(env, io, plist); + cl_page_list_fini(env, plist); + cl_io_fini(env, io); + mutex_unlock(&oob->oo_debug_mutex); + cl_env_nested_put(&nest, env); + } else + result = 0; + return result; +} +#else +# define osc_lock_has_pages(olck) (0) +#endif /* INVARIANT_CHECK */ + +static void osc_lock_delete(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck; + + olck = cl2osc_lock(slice); + LINVRNT(osc_lock_invariant(olck)); + LINVRNT(!osc_lock_has_pages(olck)); + + if (olck->ols_hold) + osc_lock_unuse(env, slice); + osc_lock_detach(env, olck); +} + +/** + * Implements cl_lock_operations::clo_state() method for osc layer. + * + * Maintains osc_lock::ols_owner field. + * + * This assumes that lock always enters CLS_HELD (from some other state) in + * the same IO context as one that requested the lock. This should not be a + * problem, because context is by definition shared by all activity pertaining + * to the same high-level IO. + */ +static void osc_lock_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + struct osc_io *oio = osc_env_io(env); + + /* + * XXX multiple io contexts can use the lock at the same time. + */ + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD && slice->cls_lock->cll_state != CLS_HELD) { + LASSERT(lock->ols_owner == NULL); + lock->ols_owner = oio; + } else if (state != CLS_HELD) + lock->ols_owner = NULL; +} + +static int osc_lock_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct cl_lock_slice *slice) +{ + struct osc_lock *lock = cl2osc_lock(slice); + + /* + * XXX print ldlm lock and einfo properly. 
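[editor's note] DLM-backed and lockless locks share one osc_lock structure and differ only in the cl_lock_operations table installed on the slice; osc_lock_to_lockless() earlier simply swaps cls_ops, and the two tables appear just below. A generic sketch of that vtable-swap pattern, with hypothetical names:

#include <stdio.h>

struct ops {
        void (*cancel)(void);
};

static void dlm_cancel(void)      { printf("flush, decref, ldlm cancel\n"); }
static void lockless_cancel(void) { printf("flush only\n"); }

static const struct ops dlm_ops      = { .cancel = dlm_cancel };
static const struct ops lockless_ops = { .cancel = lockless_cancel };

struct slice {
        const struct ops *cls_ops;
};

/* Sketch of osc_lock_to_lockless(): same object, new behaviour, a single
 * pointer store performed while the lock mutex is held. */
static void to_lockless(struct slice *s)
{
        s->cls_ops = &lockless_ops;
}

int main(void)
{
        struct slice s = { &dlm_ops };

        s.cls_ops->cancel();
        to_lockless(&s);
        s.cls_ops->cancel();
        return 0;
}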
+ */ + (*p)(env, cookie, "%p %08x "LPU64" %d %p ", + lock->ols_lock, lock->ols_flags, lock->ols_handle.cookie, + lock->ols_state, lock->ols_owner); + osc_lvb_print(env, cookie, p, &lock->ols_lvb); + return 0; +} + +static const struct cl_lock_operations osc_lock_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_enqueue, + .clo_wait = osc_lock_wait, + .clo_unuse = osc_lock_unuse, + .clo_use = osc_lock_use, + .clo_delete = osc_lock_delete, + .clo_state = osc_lock_state, + .clo_cancel = osc_lock_cancel, + .clo_weigh = osc_lock_weigh, + .clo_print = osc_lock_print +}; + +static int osc_lock_lockless_enqueue(const struct lu_env *env, + const struct cl_lock_slice *slice, + struct cl_io *_, __u32 enqflags) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = ols->ols_cl.cls_lock; + int result; + + LASSERT(cl_lock_is_mutexed(lock)); + LASSERT(lock->cll_state == CLS_QUEUING); + LASSERT(ols->ols_state == OLS_NEW); + + result = osc_lock_enqueue_wait(env, ols); + if (result == 0) + ols->ols_state = OLS_GRANTED; + return result; +} + +static int osc_lock_lockless_unuse(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + struct cl_lock *lock = slice->cls_lock; + + LASSERT(ols->ols_state == OLS_GRANTED); + LINVRNT(osc_lock_invariant(ols)); + + cl_lock_cancel(env, lock); + cl_lock_delete(env, lock); + return 0; +} + +static void osc_lock_lockless_cancel(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *ols = cl2osc_lock(slice); + int result; + + result = osc_lock_flush(ols, 0); + if (result) + CERROR("Pages for lockless lock %p were not purged(%d)\n", + ols, result); + ols->ols_state = OLS_CANCELLED; +} + +static int osc_lock_lockless_wait(const struct lu_env *env, + const struct cl_lock_slice *slice) +{ + struct osc_lock *olck = cl2osc_lock(slice); + struct cl_lock *lock = olck->ols_cl.cls_lock; + + LINVRNT(osc_lock_invariant(olck)); + LASSERT(olck->ols_state >= OLS_UPCALL_RECEIVED); + + return lock->cll_error; +} + +static void osc_lock_lockless_state(const struct lu_env *env, + const struct cl_lock_slice *slice, + enum cl_lock_state state) +{ + struct osc_lock *lock = cl2osc_lock(slice); + struct osc_io *oio = osc_env_io(env); + + LINVRNT(osc_lock_invariant(lock)); + if (state == CLS_HELD) { + LASSERT(lock->ols_owner == NULL); + lock->ols_owner = oio; + oio->oi_lockless = 1; + } else + lock->ols_owner = NULL; +} + +static int osc_lock_lockless_fits_into(const struct lu_env *env, + const struct cl_lock_slice *slice, + const struct cl_lock_descr *need, + const struct cl_io *io) +{ + return 0; +} + +static const struct cl_lock_operations osc_lock_lockless_ops = { + .clo_fini = osc_lock_fini, + .clo_enqueue = osc_lock_lockless_enqueue, + .clo_wait = osc_lock_lockless_wait, + .clo_unuse = osc_lock_lockless_unuse, + .clo_state = osc_lock_lockless_state, + .clo_fits_into = osc_lock_lockless_fits_into, + .clo_cancel = osc_lock_lockless_cancel, + .clo_print = osc_lock_print +}; + +int osc_lock_init(const struct lu_env *env, + struct cl_object *obj, struct cl_lock *lock, + const struct cl_io *io) +{ + struct osc_lock *clk; + struct osc_io *oio = osc_env_io(env); + struct osc_object *oob = cl2osc(obj); + int result; + + OBD_SLAB_ALLOC_PTR(clk, osc_lock_kmem); + if (clk != NULL) { + const struct cl_lock_operations *ops; + const struct osc_device *osd = lu2osc_dev(obj->co_lu.lo_dev); + struct obd_connect_data *ocd; + + osc_lock_build_einfo(env, lock, clk, &clk->ols_einfo); + 
clk->ols_state = OLS_NEW; + + /* + * Check if we need to do lockless IO here. + * Following conditions must be satisfied: + * - the current IO must be locklessable; + * - the stripe is in contention; + * - requested lock is not a glimpse. + * + * if not, we have to inherit the locklessable flag to + * osc_lock, and let ost make the decision. + * + * Additional policy can be implemented here, e.g., never do + * lockless-io for large extents. + */ + LASSERT(io->ci_lockreq == CILR_MANDATORY || + io->ci_lockreq == CILR_MAYBE || + io->ci_lockreq == CILR_NEVER); + ocd = &class_exp2cliimp(osc_export(oob))->imp_connect_data; + clk->ols_locklessable = (io->ci_type != CIT_TRUNC) && + (io->ci_lockreq == CILR_MAYBE) && + (ocd->ocd_connect_flags & OBD_CONNECT_SRVLOCK); + ops = &osc_lock_ops; + if (io->ci_lockreq == CILR_NEVER || + /* lockless IO */ + (clk->ols_locklessable && osc_object_is_contended(oob)) || + /* lockless truncate */ + (io->ci_type == CIT_TRUNC && + (ocd->ocd_connect_flags & OBD_CONNECT_TRUNCLOCK) && + osd->od_lockless_truncate)) { + ops = &osc_lock_lockless_ops; + oio->oi_lockless = 1; + clk->ols_locklessable = 1; + } + + cl_lock_slice_add(lock, &clk->ols_cl, obj, ops); + result = 0; + } else + result = -ENOMEM; + return result; +} + + +/** @} osc */ diff --git a/lustre/osc/osc_object.c b/lustre/osc/osc_object.c new file mode 100644 index 0000000..1f099b8 --- /dev/null +++ b/lustre/osc/osc_object.c @@ -0,0 +1,243 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_object for OSC layer. + * + * Author: Nikita Danilov + */ + +/** \addtogroup osc osc @{ */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +/***************************************************************************** + * + * Type conversions. + * + */ + +static struct lu_object *osc2lu(struct osc_object *osc) +{ + return &osc->oo_cl.co_lu; +} + +static struct osc_object *lu2osc(const struct lu_object *obj) +{ + LINVRNT(osc_is_object(obj)); + return container_of0(obj, struct osc_object, oo_cl.co_lu); +} + +/***************************************************************************** + * + * Object operations. 
+ * + */ + +static int osc_object_init(const struct lu_env *env, struct lu_object *obj, + const struct lu_object_conf *conf) +{ + struct osc_object *osc = lu2osc(obj); + const struct cl_object_conf *cconf = lu2cl_conf(conf); + int i; + + osc->oo_oinfo = cconf->u.coc_oinfo; +#ifdef INVARIANT_CHECK + mutex_init(&osc->oo_debug_mutex); +#endif + spin_lock_init(&osc->oo_seatbelt); + for (i = 0; i < CRT_NR; ++i) + CFS_INIT_LIST_HEAD(&osc->oo_inflight[i]); + return 0; +} + +static void osc_object_free(const struct lu_env *env, struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + int i; + + for (i = 0; i < CRT_NR; ++i) + LASSERT(list_empty(&osc->oo_inflight[i])); + + lu_object_fini(obj); + OBD_SLAB_FREE_PTR(osc, osc_object_kmem); +} + +int osc_lvb_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct ost_lvb *lvb) +{ + return (*p)(env, cookie, "size: "LPU64" mtime: "LPU64" atime: "LPU64" " + "ctime: "LPU64" blocks: "LPU64, + lvb->lvb_size, lvb->lvb_mtime, lvb->lvb_atime, + lvb->lvb_ctime, lvb->lvb_blocks); +} + +static int osc_object_print(const struct lu_env *env, void *cookie, + lu_printer_t p, const struct lu_object *obj) +{ + struct osc_object *osc = lu2osc(obj); + struct lov_oinfo *oinfo = osc->oo_oinfo; + struct osc_async_rc *ar = &oinfo->loi_ar; + + (*p)(env, cookie, "id: "LPU64" gr: "LPU64" " + "idx: %d gen: %d kms_valid: %u kms "LPU64" " + "rc: %d force_sync: %d min_xid: "LPU64" ", + oinfo->loi_id, oinfo->loi_gr, oinfo->loi_ost_idx, + oinfo->loi_ost_gen, oinfo->loi_kms_valid, oinfo->loi_kms, + ar->ar_rc, ar->ar_force_sync, ar->ar_min_xid); + osc_lvb_print(env, cookie, p, &oinfo->loi_lvb); + return 0; +} + + +static int osc_attr_get(const struct lu_env *env, struct cl_object *obj, + struct cl_attr *attr) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + cl_lvb2attr(attr, &oinfo->loi_lvb); + attr->cat_kms = oinfo->loi_kms_valid ? oinfo->loi_kms : 0; + return 0; +} + +int osc_attr_set(const struct lu_env *env, struct cl_object *obj, + const struct cl_attr *attr, unsigned valid) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + struct ost_lvb *lvb = &oinfo->loi_lvb; + + if (valid & CAT_SIZE) + lvb->lvb_size = attr->cat_size; + if (valid & CAT_MTIME) + lvb->lvb_mtime = attr->cat_mtime; + if (valid & CAT_ATIME) + lvb->lvb_atime = attr->cat_atime; + if (valid & CAT_CTIME) + lvb->lvb_ctime = attr->cat_ctime; + if (valid & CAT_BLOCKS) + lvb->lvb_blocks = attr->cat_blocks; + if (valid & CAT_KMS) + loi_kms_set(oinfo, attr->cat_kms); + return 0; +} + +static int osc_object_glimpse(const struct lu_env *env, + const struct cl_object *obj, struct ost_lvb *lvb) +{ + struct lov_oinfo *oinfo = cl2osc(obj)->oo_oinfo; + + ENTRY; + lvb->lvb_size = oinfo->loi_kms; + lvb->lvb_blocks = oinfo->loi_lvb.lvb_blocks; + RETURN(0); +} + + +void osc_object_set_contended(struct osc_object *obj) +{ + obj->oo_contention_time = cfs_time_current(); + /* mb(); */ + obj->oo_contended = 1; +} + +void osc_object_clear_contended(struct osc_object *obj) +{ + obj->oo_contended = 0; +} + +int osc_object_is_contended(struct osc_object *obj) +{ + struct osc_device *dev = lu2osc_dev(obj->oo_cl.co_lu.lo_dev); + int osc_contention_time = dev->od_contention_time; + cfs_time_t cur_time = cfs_time_current(); + cfs_time_t retry_time; + + if (OBD_FAIL_CHECK(OBD_FAIL_OSC_OBJECT_CONTENTION)) + return 1; + + if (!obj->oo_contended) + return 0; + + /* + * I like copy-paste. the code is copied from + * ll_file_is_contended. 
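[editor's note] osc_object_is_contended(), completed just below, lets the contended flag decay after a configurable number of seconds, clearing it lazily the next time someone asks. A portable sketch of the same time-window test using wall-clock seconds (the real code uses the cfs_time_* jiffies helpers):

#include <time.h>
#include <stdio.h>

struct toy_obj {
        int    contended;
        time_t contention_time;      /* when contention was last seen */
};

/* Sketch of the decay test in osc_object_is_contended(): contention is
 * remembered only for 'window' seconds, then forgotten on inquiry. */
static int toy_is_contended(struct toy_obj *obj, int window)
{
        if (!obj->contended)
                return 0;
        if (time(NULL) > obj->contention_time + window) {
                obj->contended = 0;  /* window expired, clear the flag */
                return 0;
        }
        return 1;
}

int main(void)
{
        struct toy_obj obj = { 1, time(NULL) - 60 };

        printf("%d\n", toy_is_contended(&obj, 30)); /* 0: 60s > 30s window */
        return 0;
}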
+ */ + retry_time = cfs_time_add(obj->oo_contention_time, + cfs_time_seconds(osc_contention_time)); + if (cfs_time_after(cur_time, retry_time)) { + osc_object_clear_contended(obj); + return 0; + } + return 1; +} + +static const struct cl_object_operations osc_ops = { + .coo_page_init = osc_page_init, + .coo_lock_init = osc_lock_init, + .coo_io_init = osc_io_init, + .coo_attr_get = osc_attr_get, + .coo_attr_set = osc_attr_set, + .coo_glimpse = osc_object_glimpse +}; + +static const struct lu_object_operations osc_lu_obj_ops = { + .loo_object_init = osc_object_init, + .loo_object_delete = NULL, + .loo_object_release = NULL, + .loo_object_free = osc_object_free, + .loo_object_print = osc_object_print, + .loo_object_invariant = NULL +}; + +struct lu_object *osc_object_alloc(const struct lu_env *env, + const struct lu_object_header *_, + struct lu_device *dev) +{ + struct osc_object *osc; + struct lu_object *obj; + + OBD_SLAB_ALLOC_PTR(osc, osc_object_kmem); + if (osc != NULL) { + obj = osc2lu(osc); + lu_object_init(obj, NULL, dev); + osc->oo_cl.co_ops = &osc_ops; + obj->lo_ops = &osc_lu_obj_ops; + } else + obj = NULL; + return obj; +} + +/** @} osc */ diff --git a/lustre/osc/osc_page.c b/lustre/osc/osc_page.c new file mode 100644 index 0000000..a583eef --- /dev/null +++ b/lustre/osc/osc_page.c @@ -0,0 +1,532 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + * + * Implementation of cl_page for OSC layer. 
+ * + * Author: Nikita Danilov + */ + +/** \addtogroup osc osc @{ */ + +#define DEBUG_SUBSYSTEM S_OSC + +#include "osc_cl_internal.h" + +static int osc_page_is_dlocked(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int pending, int unref) +{ + struct cl_page *page; + struct osc_object *obj; + struct osc_thread_info *info; + struct ldlm_res_id *resname; + struct lustre_handle *lockh; + ldlm_policy_data_t *policy; + ldlm_mode_t dlmmode; + int flags; + + info = osc_env_info(env); + resname = &info->oti_resname; + policy = &info->oti_policy; + lockh = &info->oti_handle; + page = opg->ops_cl.cpl_page; + obj = cl2osc(opg->ops_cl.cpl_obj); + + flags = LDLM_FL_TEST_LOCK | LDLM_FL_BLOCK_GRANTED; + if (pending) + flags |= LDLM_FL_CBPENDING; + + dlmmode = osc_cl_lock2ldlm(mode) | LCK_PW; + osc_lock_build_res(env, obj, resname); + osc_index2policy(policy, page->cp_obj, page->cp_index, page->cp_index); + return osc_match_base(osc_export(obj), resname, LDLM_EXTENT, policy, + dlmmode, &flags, NULL, lockh, unref); +} + +static int osc_page_protected(const struct lu_env *env, + const struct osc_page *opg, + enum cl_lock_mode mode, int unref) +{ + struct cl_object_header *hdr; + struct cl_lock *scan; + struct cl_page *page; + struct cl_lock_descr *descr; + int result; + + LINVRNT(!opg->ops_temp); + + result = osc_page_is_dlocked(env, opg, mode, 1, unref); + if (result == 0) { + /* maybe this page is a part of a lockless io? */ + hdr = cl_object_header(opg->ops_cl.cpl_obj); + page = opg->ops_cl.cpl_page; + descr = &osc_env_info(env)->oti_descr; + descr->cld_mode = mode; + descr->cld_start = page->cp_index; + descr->cld_end = page->cp_index; + spin_lock(&hdr->coh_lock_guard); + list_for_each_entry(scan, &hdr->coh_locks, cll_linkage) { + /* + * Lock-less sub-lock has to be either in HELD state + * (when io is actively going on), or in CACHED state, + * when top-lock is being unlocked: + * cl_io_unlock()->cl_unuse()->...->lov_lock_unuse(). + */ + if ((scan->cll_state == CLS_HELD || + scan->cll_state == CLS_CACHED) && + cl_lock_ext_match(&scan->cll_descr, descr)) { + struct osc_lock *olck; + + olck = osc_lock_at(scan); + result = osc_lock_is_lockless(olck); + break; + } + } + spin_unlock(&hdr->coh_lock_guard); + } + return result; +} + +/***************************************************************************** + * + * Page operations. + * + */ +static void osc_page_fini(const struct lu_env *env, + struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + CDEBUG(D_TRACE, "%p\n", opg); + OBD_SLAB_FREE_PTR(opg, osc_page_kmem); +} + +static void osc_page_transfer_get(struct osc_page *opg, const char *label) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + LASSERT(!opg->ops_transfer_pinned); + cl_page_get(page); + lu_ref_add_atomic(&page->cp_reference, label, page); + opg->ops_transfer_pinned = 1; +} + +static void osc_page_transfer_put(const struct lu_env *env, + struct osc_page *opg) +{ + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + + if (opg->ops_transfer_pinned) { + lu_ref_del(&page->cp_reference, "transfer", page); + opg->ops_transfer_pinned = 0; + cl_page_put(env, page); + } +} + +/** + * This is called once for every page when it is submitted for a transfer + * either opportunistic (osc_page_cache_add()), or immediate + * (osc_page_submit()). 
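[editor's note] The osc_page_transfer_get()/osc_page_transfer_put() pair above implements a guarded pin: at most one transfer reference per page, tracked by a flag so the put side is idempotent. A distilled sketch of that pattern (refcounting reduced to a plain counter, names hypothetical):

#include <assert.h>

struct toy_page {
        int refs;
        int transfer_pinned;
};

/* Sketch of the osc_page_transfer_get()/put() discipline: the pinned flag
 * guarantees the transfer reference is taken and dropped exactly once. */
static void transfer_get(struct toy_page *p)
{
        assert(!p->transfer_pinned);
        p->refs++;
        p->transfer_pinned = 1;
}

static void transfer_put(struct toy_page *p)
{
        if (p->transfer_pinned) {    /* idempotent on the put side */
                p->transfer_pinned = 0;
                p->refs--;
        }
}

int main(void)
{
        struct toy_page p = { 1, 0 };  /* one base reference */

        transfer_get(&p);
        transfer_put(&p);
        transfer_put(&p);              /* second put is a no-op */
        assert(p.refs == 1 && !p.transfer_pinned);
        return 0;
}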
+ */ +static void osc_page_transfer_add(const struct lu_env *env, + struct osc_page *opg, enum cl_req_type crt) +{ + struct osc_object *obj; + + obj = cl2osc(opg->ops_cl.cpl_obj); + spin_lock(&obj->oo_seatbelt); + list_add(&opg->ops_inflight, &obj->oo_inflight[crt]); + opg->ops_submitter = cfs_current(); + spin_unlock(&obj->oo_seatbelt); +} + +static int osc_page_cache_add(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + struct osc_io *oio = osc_env_io(env); + int result; + int brw_flags; + int noquota = 0; + + LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 0)); + ENTRY; + + /* Set the OBD_BRW_SRVLOCK before the page is queued. */ + brw_flags = oio->oi_lockless ? OBD_BRW_SRVLOCK : 0; + if (!client_is_remote(osc_export(obj)) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) { + brw_flags |= OBD_BRW_NOQUOTA; + noquota = OBD_BRW_NOQUOTA; + } + + osc_page_transfer_get(opg, "transfer\0cache"); + result = osc_queue_async_io(env, osc_export(obj), NULL, obj->oo_oinfo, + &opg->ops_oap, OBD_BRW_WRITE | noquota, + 0, 0, brw_flags, 0); + if (result != 0) + osc_page_transfer_put(env, opg); + else + osc_page_transfer_add(env, opg, CRT_WRITE); + RETURN(result); +} + +void osc_index2policy(ldlm_policy_data_t *policy, const struct cl_object *obj, + pgoff_t start, pgoff_t end) +{ + memset(policy, 0, sizeof *policy); + policy->l_extent.start = cl_offset(obj, start); + policy->l_extent.end = cl_offset(obj, end + 1) - 1; +} + +static int osc_page_is_under_lock(const struct lu_env *env, + const struct cl_page_slice *slice, + struct cl_io *_) +{ + struct cl_lock *lock; + int result; + + ENTRY; + lock = cl_lock_at_page(env, slice->cpl_obj, slice->cpl_page, + NULL, 1, 0); + if (lock != NULL) { + cl_lock_put(env, lock); + result = -EBUSY; + } else + result = -ENODATA; + RETURN(result); +} + +static int osc_page_fail(const struct lu_env *env, + const struct cl_page_slice *slice, struct cl_io *_) +{ + /* + * Cached read? + */ + LBUG(); + return 0; +} + + +static const char *osc_list(struct list_head *head) +{ + return list_empty(head) ? 
"-" : "+"; +} + +static int osc_page_print(const struct lu_env *env, + const struct cl_page_slice *slice, + void *cookie, lu_printer_t printer) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + return (*printer)(env, cookie, LUSTRE_OSC_NAME"-page@%p: " + "%#x %d %u %s %s %s %llu %u %#x %p %p %p %p %p\n", + opg, oap->oap_magic, oap->oap_cmd, + oap->oap_interrupted, + osc_list(&oap->oap_pending_item), + osc_list(&oap->oap_urgent_item), + osc_list(&oap->oap_rpc_item), + oap->oap_obj_off, oap->oap_page_off, + oap->oap_async_flags, oap->oap_request, + oap->oap_cli, oap->oap_loi, oap->oap_caller_ops, + oap->oap_caller_data); +} + +static void osc_page_delete(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + struct osc_async_page *oap = &opg->ops_oap; + int rc; + + LINVRNT(opg->ops_temp || osc_page_protected(env, opg, CLM_READ, 1)); + + ENTRY; + CDEBUG(D_TRACE, "%p\n", opg); + osc_page_transfer_put(env, opg); + rc = osc_teardown_async_page(osc_export(obj), NULL, obj->oo_oinfo, oap); + LASSERTF(rc == 0, "%i\n", rc); + spin_lock(&obj->oo_seatbelt); + list_del_init(&opg->ops_inflight); + spin_unlock(&obj->oo_seatbelt); + EXIT; +} + +void osc_page_clip(const struct lu_env *env, const struct cl_page_slice *slice, + int from, int to) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + opg->ops_from = from; + opg->ops_to = to; + oap->oap_async_flags |= ASYNC_COUNT_STABLE; +} + +static int osc_page_cancel(const struct lu_env *env, + const struct cl_page_slice *slice) +{ + struct osc_page *opg = cl2osc_page(slice); + struct osc_async_page *oap = &opg->ops_oap; + int rc = 0; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 0)); + + client_obd_list_lock(&oap->oap_cli->cl_loi_list_lock); + /* Check if the transferring against this page + * is completed, or not even queued. */ + if (opg->ops_transfer_pinned) + /* FIXME: may not be interrupted.. */ + rc = osc_oap_interrupted(env, oap); + LASSERT(ergo(rc == 0, opg->ops_transfer_pinned == 0)); + client_obd_list_unlock(&oap->oap_cli->cl_loi_list_lock); + return rc; +} + +static const struct cl_page_operations osc_page_ops = { + .cpo_fini = osc_page_fini, + .cpo_print = osc_page_print, + .cpo_delete = osc_page_delete, + .cpo_is_under_lock = osc_page_is_under_lock, + .io = { + [CRT_READ] = { + .cpo_cache_add = osc_page_fail + }, + [CRT_WRITE] = { + .cpo_cache_add = osc_page_cache_add + } + }, + .cpo_clip = osc_page_clip, + .cpo_cancel = osc_page_cancel +}; + +static int osc_make_ready(const struct lu_env *env, void *data, int cmd) +{ + struct osc_page *opg = data; + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + int result; + + LASSERT(cmd == OBD_BRW_WRITE); /* no cached reads */ + LINVRNT(osc_page_protected(env, opg, CLM_WRITE, 1)); + + ENTRY; + result = cl_page_make_ready(env, page, CRT_WRITE); + RETURN(result); +} + +static int osc_refresh_count(const struct lu_env *env, void *data, int cmd) +{ + struct cl_page *page; + struct osc_page *osc = data; + struct cl_object *obj; + struct cl_attr *attr = &osc_env_info(env)->oti_attr; + + int result; + loff_t kms; + + LINVRNT(osc_page_protected(env, osc, CLM_READ, 1)); + + /* readpage queues with _COUNT_STABLE, shouldn't get here. 
*/ + LASSERT(!(cmd & OBD_BRW_READ)); + LASSERT(osc != NULL); + page = osc->ops_cl.cpl_page; + obj = osc->ops_cl.cpl_obj; + + cl_object_attr_lock(obj); + result = cl_object_attr_get(env, obj, attr); + cl_object_attr_unlock(obj); + if (result < 0) + return result; + kms = attr->cat_kms; + if (cl_offset(obj, page->cp_index) >= kms) + /* catch race with truncate */ + return 0; + else if (cl_offset(obj, page->cp_index + 1) > kms) + /* catch sub-page write at end of file */ + return kms % CFS_PAGE_SIZE; + else + return CFS_PAGE_SIZE; +} + +static int osc_completion(const struct lu_env *env, + void *data, int cmd, struct obdo *oa, int rc) +{ + struct osc_page *opg = data; + struct osc_async_page *oap = &opg->ops_oap; + struct cl_page *page = cl_page_top(opg->ops_cl.cpl_page); + struct osc_object *obj = cl2osc(opg->ops_cl.cpl_obj); + enum cl_req_type crt; + + LINVRNT(osc_page_protected(env, opg, CLM_READ, 1)); + + ENTRY; + + cmd &= ~OBD_BRW_NOQUOTA; + LASSERT(equi(page->cp_state == CPS_PAGEIN, cmd == OBD_BRW_READ)); + LASSERT(equi(page->cp_state == CPS_PAGEOUT, cmd == OBD_BRW_WRITE)); + LASSERT(opg->ops_transfer_pinned); + + /* + * page->cp_req can be NULL if io submission failed before + * cl_req was allocated. + */ + if (page->cp_req != NULL) + cl_req_page_done(env, page); + LASSERT(page->cp_req == NULL); + + /* As the transfer for this page is being done, clear the flags */ + oap->oap_async_flags = 0; + + crt = cmd == OBD_BRW_READ ? CRT_READ : CRT_WRITE; + /* Clear opg->ops_transfer_pinned before VM lock is released. */ + opg->ops_transfer_pinned = 0; + + spin_lock(&obj->oo_seatbelt); + LASSERT(opg->ops_submitter != NULL); + LASSERT(!list_empty(&opg->ops_inflight)); + list_del_init(&opg->ops_inflight); + spin_unlock(&obj->oo_seatbelt); + + cl_page_completion(env, page, crt, rc); + + /* statistic */ + if (rc == 0 && oap->oap_brw_flags & OBD_BRW_SRVLOCK) { + struct lu_device *ld = opg->ops_cl.cpl_obj->co_lu.lo_dev; + struct osc_stats *stats = &lu2osc_dev(ld)->od_stats; + int bytes = opg->ops_to - opg->ops_from; + + if (crt == CRT_READ) + stats->os_lockless_reads += bytes; + else + stats->os_lockless_writes += bytes; + } + + /* + * This has to be the last operation with the page, as locks are + * released in cl_page_completion() and nothing except for the + * reference counter protects page from concurrent reclaim. + */ + lu_ref_del(&page->cp_reference, "transfer", page); + /* + * As page->cp_obj is pinned by a reference from page->cp_req, it is + * safe to call cl_page_put() without risking object destruction in a + * non-blocking context. + */ + cl_page_put(env, page); + RETURN(0); +} + +const static struct obd_async_page_ops osc_async_page_ops = { + .ap_make_ready = osc_make_ready, + .ap_refresh_count = osc_refresh_count, + .ap_completion = osc_completion +}; + +struct cl_page *osc_page_init(const struct lu_env *env, + struct cl_object *obj, + struct cl_page *page, cfs_page_t *vmpage) +{ + struct osc_object *osc = cl2osc(obj); + struct osc_page *opg; + int result; + + OBD_SLAB_ALLOC_PTR(opg, osc_page_kmem); + if (opg != NULL) { + void *oap = &opg->ops_oap; + + opg->ops_from = 0; + opg->ops_to = CFS_PAGE_SIZE; + + result = osc_prep_async_page(osc_export(osc), + NULL, osc->oo_oinfo, vmpage, + cl_offset(obj, page->cp_index), + &osc_async_page_ops, + opg, (void **)&oap, 1, NULL); + if (result == 0) + cl_page_slice_add(page, &opg->ops_cl, obj, + &osc_page_ops); + /* + * Cannot assert osc_page_protected() here as read-ahead + * creates temporary pages outside of a lock. 
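[editor's note] osc_refresh_count() above decides how many bytes of a dirty page are worth sending by comparing the page's file offset with kms: nothing past kms, a partial tail for the page straddling kms, a full page otherwise. The same arithmetic in a standalone sketch (4096-byte pages and a hypothetical helper name assumed):

#include <stdio.h>
#include <stdint.h>

#define TOY_PAGE_SIZE 4096ULL

/* Sketch of the byte-count logic in osc_refresh_count(): how much of
 * page 'index' lies below the known minimal size (kms)? */
static uint64_t bytes_to_send(uint64_t index, uint64_t kms)
{
        uint64_t start = index * TOY_PAGE_SIZE;

        if (start >= kms)
                return 0;                     /* raced with truncate */
        if (start + TOY_PAGE_SIZE > kms)
                return kms % TOY_PAGE_SIZE;   /* partial page at EOF */
        return TOY_PAGE_SIZE;
}

int main(void)
{
        printf("%llu\n", (unsigned long long)bytes_to_send(2, 10000));
        /* page [8192, 12288) with kms 10000 -> prints 1808 */
        return 0;
}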
+ */ +#ifdef INVARIANT_CHECK + opg->ops_temp = !osc_page_protected(env, opg, CLM_READ, 1); +#endif + CFS_INIT_LIST_HEAD(&opg->ops_inflight); + } else + result = -ENOMEM; + return ERR_PTR(result); +} + +void osc_io_submit_page(const struct lu_env *env, + struct osc_io *oio, struct osc_page *opg, + enum cl_req_type crt) +{ + struct osc_async_page *oap = &opg->ops_oap; + struct client_obd *cli = oap->oap_cli; + + LINVRNT(osc_page_protected(env, opg, + crt == CRT_WRITE ? CLM_WRITE : CLM_READ, 1)); + + oap->oap_page_off = opg->ops_from; + oap->oap_count = opg->ops_to - opg->ops_from; + oap->oap_brw_flags |= OBD_BRW_SYNC; + if (oio->oi_lockless) + oap->oap_brw_flags |= OBD_BRW_SRVLOCK; + + oap->oap_cmd = crt == CRT_WRITE ? OBD_BRW_WRITE : OBD_BRW_READ; + if (!client_is_remote(osc_export(cl2osc(opg->ops_cl.cpl_obj))) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) { + oap->oap_brw_flags |= OBD_BRW_NOQUOTA; + oap->oap_cmd |= OBD_BRW_NOQUOTA; + } + + oap->oap_async_flags |= OSC_FLAGS; + if (oap->oap_cmd & OBD_BRW_READ) + oap->oap_async_flags |= ASYNC_COUNT_STABLE; + else if (!(oap->oap_brw_page.flag & OBD_BRW_FROM_GRANT)) + osc_enter_cache_try(env, cli, oap->oap_loi, oap, 1); + + osc_oap_to_pending(oap); + osc_page_transfer_get(opg, "transfer\0imm"); + osc_page_transfer_add(env, opg, crt); +} + +/** @} osc */ diff --git a/lustre/osc/osc_request.c b/lustre/osc/osc_request.c index a00185f..9bf1afa 100644 --- a/lustre/osc/osc_request.c +++ b/lustre/osc/osc_request.c @@ -61,7 +61,6 @@ #include #include #include -#include #include "osc_internal.h" static quota_interface_t *quota_interface = NULL; @@ -97,7 +96,7 @@ static int osc_packmd(struct obd_export *exp, struct lov_mds_md **lmmp, if (lsm) { LASSERT(lsm->lsm_object_id); - LASSERT(lsm->lsm_object_gr); + LASSERT_MDS_GROUP(lsm->lsm_object_gr); (*lmmp)->lmm_object_id = cpu_to_le64(lsm->lsm_object_id); (*lmmp)->lmm_object_gr = cpu_to_le64(lsm->lsm_object_gr); } @@ -154,7 +153,7 @@ static int osc_unpackmd(struct obd_export *exp, struct lov_stripe_md **lsmp, (*lsmp)->lsm_object_id = le64_to_cpu (lmm->lmm_object_id); (*lsmp)->lsm_object_gr = le64_to_cpu (lmm->lmm_object_gr); LASSERT((*lsmp)->lsm_object_id); - LASSERT((*lsmp)->lsm_object_gr); + LASSERT_MDS_GROUP((*lsmp)->lsm_object_gr); } (*lsmp)->lsm_maxbytes = LUSTRE_STRIPE_MAXBYTES; @@ -313,8 +312,10 @@ static int osc_setattr(struct obd_export *exp, struct obd_info *oinfo, int rc; ENTRY; - LASSERT(!(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP) || - oinfo->oi_oa->o_gr > 0); + LASSERTF(!(oinfo->oi_oa->o_valid & OBD_MD_FLGROUP) || + CHECK_MDS_GROUP(oinfo->oi_oa->o_gr), + "oinfo->oi_oa->o_valid="LPU64" oinfo->oi_oa->o_gr="LPU64"\n", + oinfo->oi_oa->o_valid, oinfo->oi_oa->o_gr); req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_SETATTR); if (req == NULL) @@ -399,7 +400,7 @@ static int osc_setattr_async(struct obd_export *exp, struct obd_info *oinfo, /* do mds to ost setattr asynchronously */ if (!rqset) { /* Do not wait for response. 
*/ - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); } else { req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_setattr_interpret; @@ -501,7 +502,7 @@ out: static int osc_punch_interpret(const struct lu_env *env, struct ptlrpc_request *req, - struct osc_async_args *aa, int rc) + struct osc_punch_args *aa, int rc) { struct ost_body *body; ENTRY; @@ -513,32 +514,28 @@ static int osc_punch_interpret(const struct lu_env *env, if (body == NULL) GOTO(out, rc = -EPROTO); - *aa->aa_oi->oi_oa = body->oa; + *aa->pa_oa = body->oa; out: - rc = aa->aa_oi->oi_cb_up(aa->aa_oi, rc); + rc = aa->pa_upcall(aa->pa_cookie, rc); RETURN(rc); } -static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, - struct obd_trans_info *oti, - struct ptlrpc_request_set *rqset) +int osc_punch_base(struct obd_export *exp, struct obdo *oa, + struct obd_capa *capa, + obd_enqueue_update_f upcall, void *cookie, + struct ptlrpc_request_set *rqset) { struct ptlrpc_request *req; - struct osc_async_args *aa; + struct osc_punch_args *aa; struct ost_body *body; int rc; ENTRY; - if (!oinfo->oi_oa) { - CDEBUG(D_INFO, "oa NULL\n"); - RETURN(-EINVAL); - } - req = ptlrpc_request_alloc(class_exp2cliimp(exp), &RQF_OST_PUNCH); if (req == NULL) RETURN(-ENOMEM); - osc_set_capa_size(req, &RMF_CAPA1, oinfo->oi_capa); + osc_set_capa_size(req, &RMF_CAPA1, capa); rc = ptlrpc_request_pack(req, LUSTRE_OST_VERSION, OST_PUNCH); if (rc) { ptlrpc_request_free(req); @@ -546,26 +543,40 @@ static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, } req->rq_request_portal = OST_IO_PORTAL; /* bug 7198 */ ptlrpc_at_set_req_timeout(req); - osc_pack_req_body(req, oinfo); - /* overload the size and blocks fields in the oa with start/end */ body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); LASSERT(body); - body->oa.o_size = oinfo->oi_policy.l_extent.start; - body->oa.o_blocks = oinfo->oi_policy.l_extent.end; - body->oa.o_valid |= (OBD_MD_FLSIZE | OBD_MD_FLBLOCKS); + body->oa = *oa; + osc_pack_capa(req, body, capa); + ptlrpc_request_set_replen(req); req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_punch_interpret; CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); aa = ptlrpc_req_async_args(req); - aa->aa_oi = oinfo; - ptlrpc_set_add_req(rqset, req); + aa->pa_oa = oa; + aa->pa_upcall = upcall; + aa->pa_cookie = cookie; + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PSCOPE_OTHER); + else + ptlrpc_set_add_req(rqset, req); RETURN(0); } +static int osc_punch(struct obd_export *exp, struct obd_info *oinfo, + struct obd_trans_info *oti, + struct ptlrpc_request_set *rqset) +{ + oinfo->oi_oa->o_size = oinfo->oi_policy.l_extent.start; + oinfo->oi_oa->o_blocks = oinfo->oi_policy.l_extent.end; + oinfo->oi_oa->o_valid |= OBD_MD_FLSIZE | OBD_MD_FLBLOCKS; + return osc_punch_base(exp, oinfo->oi_oa, oinfo->oi_capa, + oinfo->oi_cb_up, oinfo, rqset); +} + static int osc_sync(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *md, obd_size start, obd_size end, void *capa) @@ -685,7 +696,7 @@ static int osc_can_send_destroy(struct client_obd *cli) * cookies to the MDS after committing destroy transactions. 
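The osc_punch_base() rework above threads an explicit (upcall, cookie) pair through the request instead of an obd_info, so the interpret callback can report completion without knowing anything about the caller. The shape of that pattern in isolation; all names here (toy_request, toy_interpret, my_upcall) are invented for the sketch:

#include <stdio.h>

/* stand-in for obd_enqueue_update_f: a completion upcall plus an
 * opaque cookie chosen by whoever queued the request */
typedef int (*update_upcall_f)(void *cookie, int rc);

struct toy_request {
        update_upcall_f tr_upcall;
        void           *tr_cookie;
};

/* interpret callback: forward the server rc to the queuer */
static int toy_interpret(struct toy_request *req, int rc)
{
        return req->tr_upcall(req->tr_cookie, rc);
}

static int my_upcall(void *cookie, int rc)
{
        printf("punch on %s completed: rc=%d\n", (char *)cookie, rc);
        return rc;
}

int main(void)
{
        struct toy_request req = { .tr_upcall = my_upcall,
                                   .tr_cookie = "object-0x1234" };
        return toy_interpret(&req, 0);
}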
*/ static int osc_destroy(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_export) + struct obd_export *md_export, void *capa) { struct client_obd *cli = &exp->exp_obd->u.cli; struct ptlrpc_request *req; @@ -708,6 +719,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, RETURN(-ENOMEM); } + osc_set_capa_size(req, &RMF_CAPA1, (struct obd_capa *)capa); rc = ldlm_prep_elc_req(exp, req, LUSTRE_OST_VERSION, OST_DESTROY, 0, &cancels, count); if (rc) { @@ -725,6 +737,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, LASSERT(body); body->oa = *oa; + osc_pack_capa(req, body, (struct obd_capa *)capa); ptlrpc_request_set_replen(req); if (!osc_can_send_destroy(cli)) { @@ -739,7 +752,7 @@ static int osc_destroy(struct obd_export *exp, struct obdo *oa, } /* Do not wait for response */ - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } @@ -753,13 +766,16 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, oa->o_valid |= bits; client_obd_list_lock(&cli->cl_loi_list_lock); oa->o_dirty = cli->cl_dirty; - if (cli->cl_dirty > cli->cl_dirty_max) { - CERROR("dirty %lu > dirty_max %lu\n", - cli->cl_dirty, cli->cl_dirty_max); + if (cli->cl_dirty - cli->cl_dirty_transit > cli->cl_dirty_max) { + CERROR("dirty %lu - %lu > dirty_max %lu\n", + cli->cl_dirty, cli->cl_dirty_transit, cli->cl_dirty_max); oa->o_undirty = 0; - } else if (atomic_read(&obd_dirty_pages) > obd_max_dirty_pages) { - CERROR("dirty %d > system dirty_max %d\n", - atomic_read(&obd_dirty_pages), obd_max_dirty_pages); + } else if (atomic_read(&obd_dirty_pages) - + atomic_read(&obd_dirty_transit_pages) > obd_max_dirty_pages){ + CERROR("dirty %d - %d > system dirty_max %d\n", + atomic_read(&obd_dirty_pages), + atomic_read(&obd_dirty_transit_pages), + obd_max_dirty_pages); oa->o_undirty = 0; } else if (cli->cl_dirty_max - cli->cl_dirty > 0x7fffffff) { CERROR("dirty %lu - dirty_max %lu too big???\n", @@ -782,6 +798,7 @@ static void osc_announce_cached(struct client_obd *cli, struct obdo *oa, static void osc_consume_write_grant(struct client_obd *cli, struct brw_page *pga) { + LASSERT(!(pga->flag & OBD_BRW_FROM_GRANT)); atomic_inc(&obd_dirty_pages); cli->cl_dirty += CFS_PAGE_SIZE; cli->cl_avail_grant -= CFS_PAGE_SIZE; @@ -807,6 +824,11 @@ static void osc_release_write_grant(struct client_obd *cli, pga->flag &= ~OBD_BRW_FROM_GRANT; atomic_dec(&obd_dirty_pages); cli->cl_dirty -= CFS_PAGE_SIZE; + if (pga->flag & OBD_BRW_NOCACHE) { + pga->flag &= ~OBD_BRW_NOCACHE; + atomic_dec(&obd_dirty_transit_pages); + cli->cl_dirty_transit -= CFS_PAGE_SIZE; + } if (!sent) { cli->cl_lost_grant += CFS_PAGE_SIZE; CDEBUG(D_CACHE, "lost grant: %lu avail grant: %lu dirty: %lu\n", @@ -977,7 +999,7 @@ static int check_write_rcs(struct ptlrpc_request *req, static inline int can_merge_pages(struct brw_page *p1, struct brw_page *p2) { if (p1->flag != p2->flag) { - unsigned mask = ~OBD_BRW_FROM_GRANT; + unsigned mask = ~(OBD_BRW_FROM_GRANT|OBD_BRW_NOCACHE); /* warn if we try to combine flags that we don't know to be * safe to combine */ @@ -1030,7 +1052,7 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, struct lov_stripe_md *lsm, obd_count page_count, struct brw_page **pga, struct ptlrpc_request **reqp, - struct obd_capa *ocapa) + struct obd_capa *ocapa, int reserve) { struct ptlrpc_request *req; struct ptlrpc_bulk_desc *desc; @@ -1057,7 +1079,6 @@ static int osc_brw_prep_request(int cmd, struct 
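The osc_announce_cached() change above stops counting pages that are already in transit against the dirty limit, since they are leaving the cache anyway. Just the adjusted check, with toy counters standing in for cl_dirty, cl_dirty_transit and cl_dirty_max:

#include <stdio.h>

struct toy_cli {
        unsigned long dirty;         /* bytes dirtied under grant */
        unsigned long dirty_transit; /* dirty bytes already in flight */
        unsigned long dirty_max;     /* per-client cap */
};

/* nonzero if the client is over its cap once transit pages are
 * discounted; assumes dirty >= dirty_transit, which the patch's
 * accounting maintains */
static int over_dirty_limit(const struct toy_cli *cli)
{
        return cli->dirty - cli->dirty_transit > cli->dirty_max;
}

int main(void)
{
        struct toy_cli cli = { .dirty = 96, .dirty_transit = 32,
                               .dirty_max = 64 };
        /* 96 - 32 = 64 <= 64, so not over the limit: prints 0 */
        printf("over limit: %d\n", over_dirty_limit(&cli));
        return 0;
}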
client_obd *cli,struct obdo *oa, opc = OST_READ; req = ptlrpc_request_alloc(cli->cl_import, &RQF_OST_BRW); } - if (req == NULL) RETURN(-ENOMEM); @@ -1201,6 +1222,8 @@ static int osc_brw_prep_request(int cmd, struct client_obd *cli,struct obdo *oa, aa->aa_ppga = pga; aa->aa_cli = cli; CFS_INIT_LIST_HEAD(&aa->aa_oaps); + if (ocapa && reserve) + aa->aa_ocapa = capa_get(ocapa); *reqp = req; RETURN(0); @@ -1430,7 +1453,7 @@ static int osc_brw_internal(int cmd, struct obd_export *exp, struct obdo *oa, restart_bulk: rc = osc_brw_prep_request(cmd, &exp->exp_obd->u.cli, oa, lsm, - page_count, pga, &req, ocapa); + page_count, pga, &req, ocapa, 0); if (rc != 0) return (rc); @@ -1477,18 +1500,13 @@ int osc_brw_redo_request(struct ptlrpc_request *request, } DEBUG_REQ(D_ERROR, request, "redo for recoverable error"); -/* - body = lustre_msg_buf(request->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); - if (body->oa.o_valid & OBD_MD_FLOSSCAPA) - ocapa = lustre_unpack_capa(request->rq_reqmsg, - REQ_REC_OFF + 3); -*/ + rc = osc_brw_prep_request(lustre_msg_get_opc(request->rq_reqmsg) == OST_WRITE ? OBD_BRW_WRITE :OBD_BRW_READ, aa->aa_cli, aa->aa_oa, NULL /* lsm unused by osc currently */, aa->aa_page_count, aa->aa_ppga, - &new_req, NULL /* ocapa */); + &new_req, aa->aa_ocapa, 0); if (rc) RETURN(rc); @@ -1526,6 +1544,9 @@ int osc_brw_redo_request(struct ptlrpc_request *request, } } + new_aa->aa_ocapa = aa->aa_ocapa; + aa->aa_ocapa = NULL; + /* use ptlrpc_set_add_req is safe because interpret functions work * in check_set context. only one way exist with access to request * from different thread got -EINTR - this way protected with @@ -1538,63 +1559,6 @@ int osc_brw_redo_request(struct ptlrpc_request *request, RETURN(0); } -static int async_internal(int cmd, struct obd_export *exp, struct obdo *oa, - struct lov_stripe_md *lsm, obd_count page_count, - struct brw_page **pga, struct ptlrpc_request_set *set, - struct obd_capa *ocapa) -{ - struct ptlrpc_request *req; - struct client_obd *cli = &exp->exp_obd->u.cli; - int rc, i; - struct osc_brw_async_args *aa; - ENTRY; - - /* Consume write credits even if doing a sync write - - * otherwise we may run out of space on OST due to grant. 
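The osc_brw_redo_request() path above moves the capability reference into the new request's async args and clears the old pointer, so exactly one owner ever drops it. The ownership-transfer idiom in miniature, with a toy refcount rather than Lustre's capa API:

#include <stdio.h>

struct toy_capa {
        int refs;
};

static struct toy_capa *capa_ref(struct toy_capa *c) { c->refs++; return c; }
static void capa_unref(struct toy_capa *c)           { c->refs--; }

struct toy_args {
        struct toy_capa *ocapa;
};

/* on redo, move -- do not copy -- the reference into the new args */
static void redo_handoff(struct toy_args *old_aa, struct toy_args *new_aa)
{
        new_aa->ocapa = old_aa->ocapa;
        old_aa->ocapa = NULL;
}

int main(void)
{
        struct toy_capa capa = { 0 };
        struct toy_args aa = { capa_ref(&capa) }, new_aa = { NULL };

        redo_handoff(&aa, &new_aa);
        if (new_aa.ocapa)               /* the interpret path drops it once */
                capa_unref(new_aa.ocapa);
        printf("refs=%d\n", capa.refs); /* back to 0: no leak, no double put */
        return 0;
}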
*/ - if (cmd == OBD_BRW_WRITE) { - spin_lock(&cli->cl_loi_list_lock); - for (i = 0; i < page_count; i++) { - if (cli->cl_avail_grant >= CFS_PAGE_SIZE) - osc_consume_write_grant(cli, pga[i]); - } - spin_unlock(&cli->cl_loi_list_lock); - } - - rc = osc_brw_prep_request(cmd, cli, oa, lsm, page_count, pga, - &req, ocapa); - - aa = ptlrpc_req_async_args(req); - if (cmd == OBD_BRW_READ) { - lprocfs_oh_tally_log2(&cli->cl_read_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_read_rpc_hist, cli->cl_r_in_flight); - } else { - lprocfs_oh_tally_log2(&cli->cl_write_page_hist, page_count); - lprocfs_oh_tally(&cli->cl_write_rpc_hist, - cli->cl_w_in_flight); - } - ptlrpc_lprocfs_brw(req, aa->aa_requested_nob); - - LASSERT(list_empty(&aa->aa_oaps)); - if (rc == 0) { - req->rq_interpret_reply = brw_interpret; - ptlrpc_set_add_req(set, req); - client_obd_list_lock(&cli->cl_loi_list_lock); - if (cmd == OBD_BRW_READ) - cli->cl_r_in_flight++; - else - cli->cl_w_in_flight++; - client_obd_list_unlock(&cli->cl_loi_list_lock); - OBD_FAIL_TIMEOUT(OBD_FAIL_OSC_DIO_PAUSE, 3); - } else if (cmd == OBD_BRW_WRITE) { - client_obd_list_lock(&cli->cl_loi_list_lock); - for (i = 0; i < page_count; i++) - osc_release_write_grant(cli, pga[i], 0); - osc_wake_cache_waiters(cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); - } - RETURN (rc); -} - /* * ugh, we want disk allocation on the target to happen in offset order. we'll * follow sedgewicks advice and stick to the dead simple shellsort -- it'll do @@ -1743,76 +1707,6 @@ out: RETURN(rc); } -static int osc_brw_async(int cmd, struct obd_export *exp, - struct obd_info *oinfo, obd_count page_count, - struct brw_page *pga, struct obd_trans_info *oti, - struct ptlrpc_request_set *set) -{ - struct brw_page **ppga, **orig; - struct client_obd *cli = &exp->exp_obd->u.cli; - int page_count_orig; - int rc = 0; - ENTRY; - - if (cmd & OBD_BRW_CHECK) { - struct obd_import *imp = class_exp2cliimp(exp); - /* The caller just wants to know if there's a chance that this - * I/O can succeed */ - - if (imp == NULL || imp->imp_invalid) - RETURN(-EIO); - RETURN(0); - } - - orig = ppga = osc_build_ppga(pga, page_count); - if (ppga == NULL) - RETURN(-ENOMEM); - page_count_orig = page_count; - - sort_brw_pages(ppga, page_count); - while (page_count) { - struct brw_page **copy; - obd_count pages_per_brw; - - pages_per_brw = min_t(obd_count, page_count, - cli->cl_max_pages_per_rpc); - - pages_per_brw = max_unfragmented_pages(ppga, pages_per_brw); - - /* use ppga only if single RPC is going to fly */ - if (pages_per_brw != page_count_orig || ppga != orig) { - OBD_ALLOC(copy, sizeof(*copy) * pages_per_brw); - if (copy == NULL) - GOTO(out, rc = -ENOMEM); - memcpy(copy, ppga, sizeof(*copy) * pages_per_brw); - } else - copy = ppga; - - rc = async_internal(cmd, exp, oinfo->oi_oa, oinfo->oi_md, - pages_per_brw, copy, set, oinfo->oi_capa); - - if (rc != 0) { - if (copy != ppga) - OBD_FREE(copy, sizeof(*copy) * pages_per_brw); - break; - } - if (copy == orig) { - /* we passed it to async_internal() which is - * now responsible for releasing memory */ - orig = NULL; - } - - page_count -= pages_per_brw; - ppga += pages_per_brw; - } -out: - if (orig) - osc_release_ppga(orig, page_count_orig); - RETURN(rc); -} - -static void osc_check_rpcs(struct client_obd *cli); - /* The companion to osc_enter_cache(), called when @oap is no longer part of * the dirty accounting. Writeback completes or truncate happens before * writing starts. Must be called with the loi lock held. 
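The "dead simple shellsort" mentioned above only has to order brw pages by disk offset so the target allocates in offset order. One plausible standalone rendering of that idea (toy_page is a stand-in; the real sort_brw_pages() body is not shown in these hunks):

#include <stdio.h>

struct toy_page {
        unsigned long long off;      /* disk offset of the page */
};

static void sort_pages(struct toy_page **pg, int n)
{
        int gap, i, j;
        struct toy_page *tmp;

        /* classic halving gap sequence; fine for the small arrays an
         * RPC carries */
        for (gap = n / 2; gap > 0; gap /= 2)
                for (i = gap; i < n; i++)
                        for (j = i - gap;
                             j >= 0 && pg[j]->off > pg[j + gap]->off;
                             j -= gap) {
                                tmp = pg[j];
                                pg[j] = pg[j + gap];
                                pg[j + gap] = tmp;
                        }
}

int main(void)
{
        struct toy_page a = { 8192 }, b = { 0 }, c = { 4096 };
        struct toy_page *pg[] = { &a, &b, &c };

        sort_pages(pg, 3);
        printf("%llu %llu %llu\n", pg[0]->off, pg[1]->off, pg[2]->off);
        return 0;
}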
*/ @@ -1883,7 +1777,7 @@ static void on_list(struct list_head *item, struct list_head *list, /* maintain the loi's cli list membership invariants so that osc_send_oap_rpc * can find pages to build into rpcs quickly */ -static void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi) +void loi_list_maint(struct client_obd *cli, struct lov_oinfo *loi) { on_list(&loi->loi_cli_item, &cli->cl_loi_ready_list, lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE) || @@ -1906,34 +1800,35 @@ static void lop_update_pending(struct client_obd *cli, cli->cl_pending_r_pages += delta; } -/* this is called when a sync waiter receives an interruption. Its job is to +/** + * This is called when a sync waiter receives an interruption. Its job is to * get the caller woken as soon as possible. If its page hasn't been put in an * rpc yet it can dequeue immediately. Otherwise it has to mark the rpc as * desiring interruption which will forcefully complete the rpc once the rpc - * has timed out */ -static void osc_occ_interrupted(struct oig_callback_context *occ) + * has timed out. + */ +int osc_oap_interrupted(const struct lu_env *env, struct osc_async_page *oap) { - struct osc_async_page *oap; struct loi_oap_pages *lop; struct lov_oinfo *loi; + int rc = -EBUSY; ENTRY; - /* XXX member_of() */ - oap = list_entry(occ, struct osc_async_page, oap_occ); - - client_obd_list_lock(&oap->oap_cli->cl_loi_list_lock); - + LASSERT(!oap->oap_interrupted); oap->oap_interrupted = 1; /* ok, it's been put in an rpc. only one oap gets a request reference */ if (oap->oap_request != NULL) { ptlrpc_mark_interrupted(oap->oap_request); ptlrpcd_wake(oap->oap_request); - GOTO(unlock, 0); + ptlrpc_req_finished(oap->oap_request); + oap->oap_request = NULL; } - /* we don't get interruption callbacks until osc_trigger_group_io() - * has been called and put the sync oaps in the pending/urgent lists.*/ + /* + * page completion may be called only if ->cpo_prep() method was + * executed by osc_io_submit(), which also adds the page to the pending list + */ if (!list_empty(&oap->oap_pending_item)) { list_del_init(&oap->oap_pending_item); list_del_init(&oap->oap_urgent_item); @@ -1943,13 +1838,12 @@ static void osc_occ_interrupted(struct oig_callback_context *occ) &loi->loi_write_lop : &loi->loi_read_lop; lop_update_pending(oap->oap_cli, lop, oap->oap_cmd, -1); loi_list_maint(oap->oap_cli, oap->oap_loi); - - oig_complete_one(oap->oap_oig, &oap->oap_occ, -EINTR); - oap->oap_oig = NULL; + rc = oap->oap_caller_ops->ap_completion(env, + oap->oap_caller_data, + oap->oap_cmd, NULL, -EINTR); } -unlock: - client_obd_list_unlock(&oap->oap_cli->cl_loi_list_lock); + RETURN(rc); } /* this is trying to propagate async writeback errors back up to the @@ -1974,7 +1868,7 @@ static void osc_process_ar(struct osc_async_rc *ar, __u64 xid, ar->ar_force_sync = 0; } -static void osc_oap_to_pending(struct osc_async_page *oap) +void osc_oap_to_pending(struct osc_async_page *oap) { struct loi_oap_pages *lop; @@ -1991,7 +1885,8 @@ static void osc_oap_to_pending(struct osc_async_page *oap) /* this must be called holding the loi list lock to give coverage to exit_cache, * async_flag maintenance, and oap_request */ -static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, +static void osc_ap_completion(const struct lu_env *env, + struct client_obd *cli, struct obdo *oa, struct osc_async_page *oap, int sent, int rc) { __u64 xid = 0; @@ -2022,15 +1917,7 @@ static void osc_ap_completion(struct client_obd *cli, struct obdo *oa, oap->oap_loi->loi_lvb.lvb_ctime =
oa->o_ctime; } - if (oap->oap_oig) { - osc_exit_cache(cli, oap, sent); - oig_complete_one(oap->oap_oig, &oap->oap_occ, rc); - oap->oap_oig = NULL; - EXIT; - return; - } - - rc = oap->oap_caller_ops->ap_completion(oap->oap_caller_data, + rc = oap->oap_caller_ops->ap_completion(env, oap->oap_caller_data, oap->oap_cmd, oa, rc); /* ll_ap_completion (from llite) drops PG_locked. so, a new @@ -2049,6 +1936,7 @@ static int brw_interpret(const struct lu_env *env, { struct osc_brw_async_args *aa = data; struct client_obd *cli; + int async; ENTRY; rc = osc_brw_fini_request(req, rc); @@ -2059,6 +1947,11 @@ static int brw_interpret(const struct lu_env *env, RETURN(0); } + if (aa->aa_ocapa) { + capa_put(aa->aa_ocapa); + aa->aa_ocapa = NULL; + } + cli = aa->aa_cli; client_obd_list_lock(&cli->cl_loi_list_lock); @@ -2071,13 +1964,14 @@ static int brw_interpret(const struct lu_env *env, else cli->cl_r_in_flight--; - if (!list_empty(&aa->aa_oaps)) { /* from osc_send_oap_rpc() */ + async = list_empty(&aa->aa_oaps); + if (!async) { /* from osc_send_oap_rpc() */ struct osc_async_page *oap, *tmp; /* the caller may re-use the oap after the completion call so * we need to clean it up a little */ list_for_each_entry_safe(oap, tmp, &aa->aa_oaps, oap_rpc_item) { list_del_init(&oap->oap_rpc_item); - osc_ap_completion(cli, aa->aa_oa, oap, 1, rc); + osc_ap_completion(env, cli, aa->aa_oa, oap, 1, rc); } OBDO_FREE(aa->aa_oa); } else { /* from async_internal() */ @@ -2086,14 +1980,16 @@ static int brw_interpret(const struct lu_env *env, osc_release_write_grant(aa->aa_cli, aa->aa_ppga[i], 1); } osc_wake_cache_waiters(cli); - osc_check_rpcs(cli); + osc_check_rpcs(env, cli); client_obd_list_unlock(&cli->cl_loi_list_lock); - + if (!async) + cl_req_completion(env, aa->aa_clerq, rc); osc_release_ppga(aa->aa_ppga, aa->aa_page_count); RETURN(rc); } -static struct ptlrpc_request *osc_build_req(struct client_obd *cli, +static struct ptlrpc_request *osc_build_req(const struct lu_env *env, + struct client_obd *cli, struct list_head *rpc_list, int page_count, int cmd) { @@ -2101,19 +1997,24 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, struct brw_page **pga = NULL; struct osc_brw_async_args *aa; struct obdo *oa = NULL; - struct obd_async_page_ops *ops = NULL; + const struct obd_async_page_ops *ops = NULL; void *caller_data = NULL; - struct obd_capa *ocapa; struct osc_async_page *oap; + struct osc_async_page *tmp; + struct ost_body *body; + struct cl_req *clerq = NULL; + enum cl_req_type crt = (cmd & OBD_BRW_WRITE) ? 
CRT_WRITE : CRT_READ; struct ldlm_lock *lock = NULL; + struct cl_req_attr crattr; int i, rc; ENTRY; LASSERT(!list_empty(rpc_list)); + memset(&crattr, 0, sizeof crattr); OBD_ALLOC(pga, sizeof(*pga) * page_count); if (pga == NULL) - RETURN(ERR_PTR(-ENOMEM)); + GOTO(out, req = ERR_PTR(-ENOMEM)); OBDO_ALLOC(oa); if (oa == NULL) @@ -2121,9 +2022,16 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, i = 0; list_for_each_entry(oap, rpc_list, oap_rpc_item) { + struct cl_page *page = osc_oap2cl_page(oap); if (ops == NULL) { ops = oap->oap_caller_ops; caller_data = oap->oap_caller_data; + + clerq = cl_req_alloc(env, page, crt, + 1 /* only 1-object rpcs for + * now */); + if (IS_ERR(clerq)) + GOTO(out, req = (void *)clerq); lock = oap->oap_ldlm_lock; } pga[i] = &oap->oap_brw_page; @@ -2131,21 +2039,28 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, CDEBUG(0, "put page %p index %lu oap %p flg %x to pga\n", pga[i]->pg, cfs_page_index(oap->oap_page), oap, pga[i]->flag); i++; + cl_req_page_add(env, clerq, page); } /* always get the data for the obdo for the rpc */ LASSERT(ops != NULL); - ops->ap_fill_obdo(caller_data, cmd, oa); - ocapa = ops->ap_lookup_capa(caller_data, cmd); + crattr.cra_oa = oa; + crattr.cra_capa = NULL; + cl_req_attr_set(env, clerq, &crattr, ~0ULL); if (lock) { oa->o_handle = lock->l_remote_handle; oa->o_valid |= OBD_MD_FLHANDLE; } + rc = cl_req_prep(env, clerq); + if (rc != 0) { + CERROR("cl_req_prep failed: %d\n", rc); + GOTO(out, req = ERR_PTR(rc)); + } + sort_brw_pages(pga, page_count); rc = osc_brw_prep_request(cmd, cli, oa, NULL, page_count, - pga, &req, ocapa); - capa_put(ocapa); + pga, &req, crattr.cra_capa, 1); if (rc != 0) { CERROR("prep_req failed: %d\n", rc); GOTO(out, req = ERR_PTR(rc)); @@ -2156,27 +2071,45 @@ static struct ptlrpc_request *osc_build_req(struct client_obd *cli, * later setattr before earlier BRW (as determined by the request xid), * the OST will not use BRW timestamps. Sadly, there is no obvious * way to do this in a single call. bug 10150 */ - ops->ap_update_obdo(caller_data, cmd, oa, - OBD_MD_FLMTIME | OBD_MD_FLCTIME | OBD_MD_FLATIME); + body = req_capsule_client_get(&req->rq_pill, &RMF_OST_BODY); + cl_req_attr_set(env, clerq, &crattr, + OBD_MD_FLMTIME|OBD_MD_FLCTIME|OBD_MD_FLATIME); CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); aa = ptlrpc_req_async_args(req); CFS_INIT_LIST_HEAD(&aa->aa_oaps); list_splice(rpc_list, &aa->aa_oaps); CFS_INIT_LIST_HEAD(rpc_list); - + aa->aa_clerq = clerq; out: + capa_put(crattr.cra_capa); if (IS_ERR(req)) { if (oa) OBDO_FREE(oa); if (pga) OBD_FREE(pga, sizeof(*pga) * page_count); + /* this should happen rarely and is pretty bad, it makes the + * pending list not follow the dirty order */ + client_obd_list_lock(&cli->cl_loi_list_lock); + list_for_each_entry_safe(oap, tmp, rpc_list, oap_rpc_item) { + list_del_init(&oap->oap_rpc_item); + + /* queued sync pages can be torn down while the pages + * were between the pending list and the rpc */ + if (oap->oap_interrupted) { + CDEBUG(D_INODE, "oap %p interrupted\n", oap); + osc_ap_completion(env, cli, NULL, oap, 0, + oap->oap_count); + continue; + } + osc_ap_completion(env, cli, NULL, oap, 0, PTR_ERR(req)); + } + if (clerq && !IS_ERR(clerq)) + cl_req_completion(env, clerq, PTR_ERR(req)); } RETURN(req); } -/* the loi lock is held across this function but it's allowed to release - * and reacquire it during its work */ /** * prepare pages for ASYNC io and put pages in send queue. 
* @@ -2188,18 +2121,21 @@ out: * * \return zero if pages were successfully added to the send queue. * \return nonzero if an error occurred. */ -static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, - int cmd, struct loi_oap_pages *lop) +static int +osc_send_oap_rpc(const struct lu_env *env, struct client_obd *cli, + struct lov_oinfo *loi, + int cmd, struct loi_oap_pages *lop) { struct ptlrpc_request *req; obd_count page_count = 0; struct osc_async_page *oap = NULL, *tmp; struct osc_brw_async_args *aa; - struct obd_async_page_ops *ops; + const struct obd_async_page_ops *ops; CFS_LIST_HEAD(rpc_list); unsigned int ending_offset; unsigned starting_offset = 0; int srvlock = 0; + struct cl_object *clob = NULL; ENTRY; /* first we find the pages we're allowed to work with */ @@ -2209,6 +2145,13 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, LASSERT(oap->oap_magic == OAP_MAGIC); + if (clob == NULL) { + /* pin object in memory, so that completion call-backs + * can be safely called under client_obd_list lock. */ + clob = osc_oap2cl_page(oap)->cp_obj; + cl_object_get(clob); + } + if (page_count != 0 && srvlock != !!(oap->oap_brw_flags & OBD_BRW_SRVLOCK)) { CDEBUG(D_PAGE, "SRVLOCK flag mismatch," @@ -2226,7 +2169,8 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, * will still be on the dirty list). we could call in * at the end of ll_file_write to process the queue again. */ if (!(oap->oap_async_flags & ASYNC_READY)) { - int rc = ops->ap_make_ready(oap->oap_caller_data, cmd); + int rc = ops->ap_make_ready(env, oap->oap_caller_data, + cmd); if (rc < 0) CDEBUG(D_INODE, "oap %p page %p returned %d " "instead of ready\n", oap, @@ -2264,11 +2208,20 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, * ->ap_make_ready() or by higher layers. */ #if defined(__KERNEL__) && defined(__linux__) - if(!(PageLocked(oap->oap_page) && - (CheckWriteback(oap->oap_page, cmd) || oap->oap_oig !=NULL))) { - CDEBUG(D_PAGE, "page %p lost wb %lx/%x\n", - oap->oap_page, (long)oap->oap_page->flags, oap->oap_async_flags); - LBUG(); + { + struct cl_page *page; + + page = osc_oap2cl_page(oap); + + if (page->cp_type == CPT_CACHEABLE && + !(PageLocked(oap->oap_page) && + (CheckWriteback(oap->oap_page, cmd)))) { + CDEBUG(D_PAGE, "page %p lost wb %lx/%x\n", + oap->oap_page, + (long)oap->oap_page->flags, + oap->oap_async_flags); + LBUG(); + } } #endif /* If there is a gap at the start of this page, it can't merge @@ -2287,13 +2240,17 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, (PTLRPC_MAX_BRW_SIZE - 1); /* ask the caller for the size of the io as the rpc leaves.
*/ - if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) + if (!(oap->oap_async_flags & ASYNC_COUNT_STABLE)) { oap->oap_count = - ops->ap_refresh_count(oap->oap_caller_data,cmd); + ops->ap_refresh_count(env, oap->oap_caller_data, + cmd); + LASSERT(oap->oap_page_off + oap->oap_count <= CFS_PAGE_SIZE); + } if (oap->oap_count <= 0) { CDEBUG(D_CACHE, "oap %p count %d, completing\n", oap, oap->oap_count); - osc_ap_completion(cli, NULL, oap, 0, oap->oap_count); + osc_ap_completion(env, cli, NULL, + oap, 0, oap->oap_count); continue; } @@ -2322,31 +2279,21 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, osc_wake_cache_waiters(cli); - if (page_count == 0) - RETURN(0); - loi_list_maint(cli, loi); client_obd_list_unlock(&cli->cl_loi_list_lock); - req = osc_build_req(cli, &rpc_list, page_count, cmd); - if (IS_ERR(req)) { - /* this should happen rarely and is pretty bad, it makes the - * pending list not follow the dirty order */ + if (clob != NULL) + cl_object_put(env, clob); + + if (page_count == 0) { client_obd_list_lock(&cli->cl_loi_list_lock); - list_for_each_entry_safe(oap, tmp, &rpc_list, oap_rpc_item) { - list_del_init(&oap->oap_rpc_item); + RETURN(0); + } - /* queued sync pages can be torn down while the pages - * were between the pending list and the rpc */ - if (oap->oap_interrupted) { - CDEBUG(D_INODE, "oap %p interrupted\n", oap); - osc_ap_completion(cli, NULL, oap, 0, - oap->oap_count); - continue; - } - osc_ap_completion(cli, NULL, oap, 0, PTR_ERR(req)); - } + req = osc_build_req(env, cli, &rpc_list, page_count, cmd); + if (IS_ERR(req)) { + LASSERT(list_empty(&rpc_list)); loi_list_maint(cli, loi); RETURN(PTR_ERR(req)); } @@ -2394,7 +2341,7 @@ static int osc_send_oap_rpc(struct client_obd *cli, struct lov_oinfo *loi, page_count, aa, cli->cl_r_in_flight, cli->cl_w_in_flight); req->rq_interpret_reply = brw_interpret; - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_BRW); RETURN(1); } @@ -2441,7 +2388,7 @@ struct lov_oinfo *osc_next_loi(struct client_obd *cli) } /* called with the loi list lock held */ -static void osc_check_rpcs(struct client_obd *cli) +void osc_check_rpcs(const struct lu_env *env, struct client_obd *cli) { struct lov_oinfo *loi; int rc = 0, race_counter = 0; @@ -2460,7 +2407,7 @@ static void osc_check_rpcs(struct client_obd *cli) * partial read pending queue when we're given this object to * do io on writes while there are cache waiters */ if (lop_makes_rpc(cli, &loi->loi_write_lop, OBD_BRW_WRITE)) { - rc = osc_send_oap_rpc(cli, loi, OBD_BRW_WRITE, + rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_WRITE, &loi->loi_write_lop); if (rc < 0) break; @@ -2470,7 +2417,7 @@ static void osc_check_rpcs(struct client_obd *cli) race_counter++; } if (lop_makes_rpc(cli, &loi->loi_read_lop, OBD_BRW_READ)) { - rc = osc_send_oap_rpc(cli, loi, OBD_BRW_READ, + rc = osc_send_oap_rpc(env, cli, loi, OBD_BRW_READ, &loi->loi_read_lop); if (rc < 0) break; @@ -2520,9 +2467,32 @@ static int ocw_granted(struct client_obd *cli, struct osc_cache_waiter *ocw) RETURN(rc); }; +/** + * Non-blocking version of osc_enter_cache() that consumes grant only when it + * is available. 
+ */ +int osc_enter_cache_try(const struct lu_env *env, + struct client_obd *cli, struct lov_oinfo *loi, + struct osc_async_page *oap, int transient) +{ + int has_grant; + + has_grant = cli->cl_avail_grant >= CFS_PAGE_SIZE; + if (has_grant) { + osc_consume_write_grant(cli, &oap->oap_brw_page); + if (transient) { + cli->cl_dirty_transit += CFS_PAGE_SIZE; + atomic_inc(&obd_dirty_transit_pages); + oap->oap_brw_flags |= OBD_BRW_NOCACHE; + } + } + return has_grant; +} + /* Caller must hold loi_list_lock - we drop/regain it if we need to wait for * grant or cache space. */ -static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi, +static int osc_enter_cache(const struct lu_env *env, + struct client_obd *cli, struct lov_oinfo *loi, struct osc_async_page *oap) { struct osc_cache_waiter ocw; @@ -2542,13 +2512,10 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi, RETURN(-EDQUOT); /* Hopefully normal case - cache space and write credits available */ - if ((cli->cl_dirty + CFS_PAGE_SIZE <= cli->cl_dirty_max) && - (atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages) && - (cli->cl_avail_grant >= CFS_PAGE_SIZE)) { - /* account for ourselves */ - osc_consume_write_grant(cli, &oap->oap_brw_page); + if (cli->cl_dirty + CFS_PAGE_SIZE <= cli->cl_dirty_max && + atomic_read(&obd_dirty_pages) + 1 <= obd_max_dirty_pages && + osc_enter_cache_try(env, cli, loi, oap, 0)) RETURN(0); - } /* Make sure that there are write rpcs in flight to wait for. This * is a little silly as this object may not have any pending but @@ -2560,7 +2527,7 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi, ocw.ocw_rc = 0; loi_list_maint(cli, loi); - osc_check_rpcs(cli); + osc_check_rpcs(env, cli); client_obd_list_unlock(&cli->cl_loi_list_lock); CDEBUG(D_CACHE, "sleeping for cache space\n"); @@ -2577,84 +2544,15 @@ static int osc_enter_cache(struct client_obd *cli, struct lov_oinfo *loi, RETURN(-EDQUOT); } -/** - * Checks if requested extent lock is compatible with a lock under the page. - * - * Checks if the lock under \a page is compatible with a read or write lock - * (specified by \a rw) for an extent [\a start , \a end]. - * - * \param exp osc export - * \param lsm striping information for the file - * \param res osc_async_page placeholder - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param start start of the requested extent - * \param end end of the requested extent - * \param cookie transparent parameter for passing locking context - * - * \post result == 1, *cookie == context, appropriate lock is referenced or - * \post result == 0 - * - * \retval 1 owned lock is reused for the request - * \retval 0 no lock reused for the request - * - * \see osc_release_short_lock - */ -static int osc_reget_short_lock(struct obd_export *exp, - struct lov_stripe_md *lsm, - void **res, int rw, - obd_off start, obd_off end, - void **cookie) -{ - struct osc_async_page *oap = *res; - int rc; - - ENTRY; - - spin_lock(&oap->oap_lock); - rc = ldlm_lock_fast_match(oap->oap_ldlm_lock, rw, - start, end, cookie); - spin_unlock(&oap->oap_lock); - - RETURN(rc); -} - -/** - * Releases a reference to a lock taken in a "fast" way. - * - * Releases a read or a write (specified by \a rw) lock - * referenced by \a cookie. 
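osc_enter_cache() above now funnels its fast path through osc_enter_cache_try(): take a page worth of grant if it is there, otherwise fall back to waiting as a cache waiter. A stripped-down sketch of the try-then-wait split, with toy counters in place of client_obd and no real sleeping:

#include <stdio.h>

#define PAGE_SZ 4096UL

struct toy_cli {
        unsigned long avail_grant;  /* server-granted bytes still unused */
        unsigned long dirty;        /* bytes currently dirtied */
};

/* non-blocking fast path: consume grant only when it is available */
static int enter_cache_try(struct toy_cli *cli)
{
        if (cli->avail_grant < PAGE_SZ)
                return 0;
        cli->avail_grant -= PAGE_SZ;
        cli->dirty += PAGE_SZ;
        return 1;
}

/* slow path stand-in: the real code queues an osc_cache_waiter and
 * kicks write RPCs; here we only report the miss */
static int enter_cache(struct toy_cli *cli)
{
        if (enter_cache_try(cli))
                return 0;
        printf("no grant: would sleep until writeback frees some\n");
        return -1;
}

int main(void)
{
        struct toy_cli cli = { .avail_grant = PAGE_SZ, .dirty = 0 };

        printf("first page: %d\n", enter_cache(&cli));  /* fast path */
        printf("second page: %d\n", enter_cache(&cli)); /* would wait */
        return 0;
}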
- * - * \param exp osc export - * \param lsm striping information for the file - * \param end end of the locked extent - * \param rw OBD_BRW_READ if requested for reading, - * OBD_BRW_WRITE if requested for writing - * \param cookie transparent parameter for passing locking context - * - * \post appropriate lock is dereferenced - * - * \see osc_reget_short_lock - */ -static int osc_release_short_lock(struct obd_export *exp, - struct lov_stripe_md *lsm, obd_off end, - void *cookie, int rw) -{ - ENTRY; - ldlm_lock_fast_release(cookie, rw); - /* no error could have happened at this layer */ - RETURN(0); -} int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, struct lov_oinfo *loi, cfs_page_t *page, - obd_off offset, struct obd_async_page_ops *ops, + obd_off offset, const struct obd_async_page_ops *ops, void *data, void **res, int nocache, struct lustre_handle *lockh) { struct osc_async_page *oap; - struct ldlm_res_id oid; - int rc = 0; + ENTRY; if (!page) @@ -2670,28 +2568,18 @@ int osc_prep_async_page(struct obd_export *exp, struct lov_stripe_md *lsm, oap->oap_page = page; oap->oap_obj_off = offset; + if (!client_is_remote(exp) && + cfs_capable(CFS_CAP_SYS_RESOURCE)) + oap->oap_brw_flags = OBD_BRW_NOQUOTA; + + LASSERT(!(offset & ~CFS_PAGE_MASK)); CFS_INIT_LIST_HEAD(&oap->oap_pending_item); CFS_INIT_LIST_HEAD(&oap->oap_urgent_item); CFS_INIT_LIST_HEAD(&oap->oap_rpc_item); CFS_INIT_LIST_HEAD(&oap->oap_page_list); - oap->oap_occ.occ_interrupted = osc_occ_interrupted; - spin_lock_init(&oap->oap_lock); - - /* If the page was marked as notcacheable - don't add to any locks */ - if (!nocache) { - osc_build_res_name(loi->loi_id, loi->loi_gr, &oid); - /* This is the only place where we can call cache_add_extent - without oap_lock, because this page is locked now, and - the lock we are adding it to is referenced, so cannot lose - any pages either. 
*/ - rc = cache_add_extent(oap->oap_cli->cl_cache, &oid, oap, lockh); - if (rc) - RETURN(rc); - } - CDEBUG(D_CACHE, "oap %p page %p obj off "LPU64"\n", oap, page, offset); RETURN(0); } @@ -2704,10 +2592,11 @@ struct osc_async_page *oap_from_cookie(void *cookie) return oap; }; -static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, enum async_flags async_flags) +int osc_queue_async_io(const struct lu_env *env, + struct obd_export *exp, struct lov_stripe_md *lsm, + struct lov_oinfo *loi, void *cookie, + int cmd, obd_off off, int count, + obd_flag brw_flags, enum async_flags async_flags) { struct client_obd *cli = &exp->exp_obd->u.cli; struct osc_async_page *oap; @@ -2727,32 +2616,29 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, RETURN(-EBUSY); /* check if the file's owner/group is over quota */ -#ifdef HAVE_QUOTA_SUPPORT - if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)){ - struct obd_async_page_ops *ops; - struct obdo *oa; + if ((cmd & OBD_BRW_WRITE) && !(cmd & OBD_BRW_NOQUOTA)) { + struct cl_object *obj; + struct cl_attr attr; /* XXX put attr into thread info */ - OBDO_ALLOC(oa); - if (oa == NULL) - RETURN(-ENOMEM); + obj = cl_object_top(osc_oap2cl_page(oap)->cp_obj); - ops = oap->oap_caller_ops; - ops->ap_fill_obdo(oap->oap_caller_data, cmd, oa); - if (lquota_chkdq(quota_interface, cli, oa->o_uid, oa->o_gid) == - NO_QUOTA) - rc = -EDQUOT; + cl_object_attr_lock(obj); + rc = cl_object_attr_get(env, obj, &attr); + cl_object_attr_unlock(obj); - OBDO_FREE(oa); + if (rc == 0 && lquota_chkdq(quota_interface, cli, attr.cat_uid, + attr.cat_gid) == NO_QUOTA) + rc = -EDQUOT; if (rc) RETURN(rc); } -#endif if (loi == NULL) loi = lsm->lsm_oinfo[0]; client_obd_list_lock(&cli->cl_loi_list_lock); + LASSERT(off + count <= CFS_PAGE_SIZE); oap->oap_cmd = cmd; oap->oap_page_off = off; oap->oap_count = count; @@ -2760,7 +2646,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, oap->oap_async_flags = async_flags; if (cmd & OBD_BRW_WRITE) { - rc = osc_enter_cache(cli, loi, oap); + rc = osc_enter_cache(env, cli, loi, oap); if (rc) { client_obd_list_unlock(&cli->cl_loi_list_lock); RETURN(rc); @@ -2773,7 +2659,7 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, LOI_DEBUG(loi, "oap %p page %p added for cmd %d\n", oap, oap->oap_page, cmd); - osc_check_rpcs(cli); + osc_check_rpcs(env, cli); client_obd_list_unlock(&cli->cl_loi_list_lock); RETURN(0); @@ -2782,50 +2668,27 @@ static int osc_queue_async_io(struct obd_export *exp, struct lov_stripe_md *lsm, /* aka (~was & now & flag), but this is more clear :) */ #define SETTING(was, now, flag) (!(was & flag) && (now & flag)) -static int osc_set_async_flags(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - obd_flag async_flags) +int osc_set_async_flags_base(struct client_obd *cli, + struct lov_oinfo *loi, struct osc_async_page *oap, + obd_flag async_flags) { - struct client_obd *cli = &exp->exp_obd->u.cli; struct loi_oap_pages *lop; - struct osc_async_page *oap; - int rc = 0; ENTRY; - oap = oap_from_cookie(cookie); - if (IS_ERR(oap)) - RETURN(PTR_ERR(oap)); - - /* - * bug 7311: OST-side locking is only supported for liblustre for now - * (and liblustre never calls obd_set_async_flags(). 
I hope.), generic - * implementation has to handle case where OST-locked page was picked - * up by, e.g., ->writepage(). - */ - LASSERT(!(oap->oap_brw_flags & OBD_BRW_SRVLOCK)); - LASSERT(!LIBLUSTRE_CLIENT); /* check that liblustre angels do fear to - * tread here. */ - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) RETURN(-EIO); - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - if (oap->oap_cmd & OBD_BRW_WRITE) { lop = &loi->loi_write_lop; } else { lop = &loi->loi_read_lop; } - client_obd_list_lock(&cli->cl_loi_list_lock); - if (list_empty(&oap->oap_pending_item)) - GOTO(out, rc = -EINVAL); + RETURN(-EINVAL); if ((oap->oap_async_flags & async_flags) == async_flags) - GOTO(out, rc = 0); + RETURN(0); if (SETTING(oap->oap_async_flags, async_flags, ASYNC_READY)) oap->oap_async_flags |= ASYNC_READY; @@ -2839,106 +2702,12 @@ static int osc_set_async_flags(struct obd_export *exp, LOI_DEBUG(loi, "oap %p page %p has flags %x\n", oap, oap->oap_page, oap->oap_async_flags); -out: - osc_check_rpcs(cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(rc); -} - -static int osc_queue_group_io(struct obd_export *exp, struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, - obd_flag async_flags) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - struct osc_async_page *oap; - struct loi_oap_pages *lop; - int rc = 0; - ENTRY; - - oap = oap_from_cookie(cookie); - if (IS_ERR(oap)) - RETURN(PTR_ERR(oap)); - - if (cli->cl_import == NULL || cli->cl_import->imp_invalid) - RETURN(-EIO); - - if (!list_empty(&oap->oap_pending_item) || - !list_empty(&oap->oap_urgent_item) || - !list_empty(&oap->oap_rpc_item)) - RETURN(-EBUSY); - - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - - client_obd_list_lock(&cli->cl_loi_list_lock); - - oap->oap_cmd = cmd; - oap->oap_page_off = off; - oap->oap_count = count; - oap->oap_brw_flags = brw_flags; - oap->oap_async_flags = async_flags; - - if (cmd & OBD_BRW_WRITE) - lop = &loi->loi_write_lop; - else - lop = &loi->loi_read_lop; - - list_add_tail(&oap->oap_pending_item, &lop->lop_pending_group); - if (oap->oap_async_flags & ASYNC_GROUP_SYNC) { - oap->oap_oig = oig; - rc = oig_add_one(oig, &oap->oap_occ); - } - - LOI_DEBUG(loi, "oap %p page %p on group pending: rc %d\n", - oap, oap->oap_page, rc); - - client_obd_list_unlock(&cli->cl_loi_list_lock); - - RETURN(rc); -} - -static void osc_group_to_pending(struct client_obd *cli, struct lov_oinfo *loi, - struct loi_oap_pages *lop, int cmd) -{ - struct list_head *pos, *tmp; - struct osc_async_page *oap; - - list_for_each_safe(pos, tmp, &lop->lop_pending_group) { - oap = list_entry(pos, struct osc_async_page, oap_pending_item); - list_del(&oap->oap_pending_item); - osc_oap_to_pending(oap); - } - loi_list_maint(cli, loi); -} - -static int osc_trigger_group_io(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig) -{ - struct client_obd *cli = &exp->exp_obd->u.cli; - ENTRY; - - if (loi == NULL) - loi = lsm->lsm_oinfo[0]; - - client_obd_list_lock(&cli->cl_loi_list_lock); - - osc_group_to_pending(cli, loi, &loi->loi_write_lop, OBD_BRW_WRITE); - osc_group_to_pending(cli, loi, &loi->loi_read_lop, OBD_BRW_READ); - - osc_check_rpcs(cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); - RETURN(0); } -static int osc_teardown_async_page(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie) +int osc_teardown_async_page(struct obd_export *exp, + 
struct lov_stripe_md *lsm, + struct lov_oinfo *loi, void *cookie) { struct client_obd *cli = &exp->exp_obd->u.cli; struct loi_oap_pages *lop; @@ -2976,85 +2745,44 @@ static int osc_teardown_async_page(struct obd_export *exp, lop_update_pending(cli, lop, oap->oap_cmd, -1); } loi_list_maint(cli, loi); - cache_remove_extent(cli->cl_cache, oap); - LOI_DEBUG(loi, "oap %p page %p torn down\n", oap, oap->oap_page); out: client_obd_list_unlock(&cli->cl_loi_list_lock); RETURN(rc); } -int osc_extent_blocking_cb(struct ldlm_lock *lock, - struct ldlm_lock_desc *new, void *data, - int flag) +static void osc_set_lock_data_with_check(struct ldlm_lock *lock, + struct ldlm_enqueue_info *einfo, + int flags) { - struct lustre_handle lockh = { 0 }; - int rc; - ENTRY; - - if ((unsigned long)data > 0 && (unsigned long)data < 0x1000) { - LDLM_ERROR(lock, "cancelling lock with bad data %p", data); - LBUG(); - } + void *data = einfo->ei_cbdata; - switch (flag) { - case LDLM_CB_BLOCKING: - ldlm_lock2handle(lock, &lockh); - rc = ldlm_cli_cancel(&lockh); - if (rc != ELDLM_OK) - CERROR("ldlm_cli_cancel failed: %d\n", rc); - break; - case LDLM_CB_CANCELING: { - - ldlm_lock2handle(lock, &lockh); - /* This lock wasn't granted, don't try to do anything */ - if (lock->l_req_mode != lock->l_granted_mode) - RETURN(0); + LASSERT(lock != NULL); + LASSERT(lock->l_blocking_ast == einfo->ei_cb_bl); + LASSERT(lock->l_resource->lr_type == einfo->ei_type); + LASSERT(lock->l_completion_ast == einfo->ei_cb_cp); + LASSERT(lock->l_glimpse_ast == einfo->ei_cb_gl); - cache_remove_lock(lock->l_conn_export->exp_obd->u.cli.cl_cache, - &lockh); - - if (lock->l_conn_export->exp_obd->u.cli.cl_ext_lock_cancel_cb) - lock->l_conn_export->exp_obd->u.cli.cl_ext_lock_cancel_cb( - lock, new, data,flag); - break; - } - default: - LBUG(); - } - - RETURN(0); + lock_res_and_lock(lock); + spin_lock(&osc_ast_guard); + LASSERT(lock->l_ast_data == NULL || lock->l_ast_data == data); + lock->l_ast_data = data; + spin_unlock(&osc_ast_guard); + unlock_res_and_lock(lock); } -EXPORT_SYMBOL(osc_extent_blocking_cb); -static void osc_set_data_with_check(struct lustre_handle *lockh, void *data, +static void osc_set_data_with_check(struct lustre_handle *lockh, + struct ldlm_enqueue_info *einfo, int flags) { struct ldlm_lock *lock = ldlm_handle2lock(lockh); - if (lock == NULL) { - CERROR("lockh %p, data %p - client evicted?\n", lockh, data); - return; - } - lock_res_and_lock(lock); -#if defined (__KERNEL__) && defined (__linux__) - /* Liang XXX: Darwin and Winnt checking should be added */ - if (lock->l_ast_data && lock->l_ast_data != data) { - struct inode *new_inode = data; - struct inode *old_inode = lock->l_ast_data; - if (!(old_inode->i_state & I_FREEING)) - LDLM_ERROR(lock, "inconsistent l_ast_data found"); - LASSERTF(old_inode->i_state & I_FREEING, - "Found existing inode %p/%lu/%u state %lu in lock: " - "setting data to %p/%lu/%u\n", old_inode, - old_inode->i_ino, old_inode->i_generation, - old_inode->i_state, - new_inode, new_inode->i_ino, new_inode->i_generation); - } -#endif - lock->l_ast_data = data; - unlock_res_and_lock(lock); - LDLM_LOCK_PUT(lock); + if (lock != NULL) { + osc_set_lock_data_with_check(lock, einfo, flags); + LDLM_LOCK_PUT(lock); + } else + CERROR("lockh %p, data %p - client evicted?\n", + lockh, einfo->ei_cbdata); } static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, @@ -3068,9 +2796,11 @@ static int osc_change_cbdata(struct obd_export *exp, struct lov_stripe_md *lsm, return 0; } -static int 
osc_enqueue_fini(struct obd_device *obd, struct ptlrpc_request *req, - struct obd_info *oinfo, int intent, int rc) +static int osc_enqueue_fini(struct ptlrpc_request *req, struct ost_lvb *lvb, + obd_enqueue_update_f upcall, void *cookie, + int *flags, int rc) { + int intent = *flags & LDLM_FL_HAS_INTENT; ENTRY; if (intent) { @@ -3087,17 +2817,13 @@ static int osc_enqueue_fini(struct obd_device *obd, struct ptlrpc_request *req, } if ((intent && rc == ELDLM_LOCK_ABORTED) || !rc) { + *flags |= LDLM_FL_LVB_READY; CDEBUG(D_INODE,"got kms "LPU64" blocks "LPU64" mtime "LPU64"\n", - oinfo->oi_md->lsm_oinfo[0]->loi_lvb.lvb_size, - oinfo->oi_md->lsm_oinfo[0]->loi_lvb.lvb_blocks, - oinfo->oi_md->lsm_oinfo[0]->loi_lvb.lvb_mtime); + lvb->lvb_size, lvb->lvb_blocks, lvb->lvb_mtime); } - if (!rc) - cache_add_lock(obd->u.cli.cl_cache, oinfo->oi_lockh); - /* Call the update callback. */ - rc = oinfo->oi_cb_up(oinfo, rc); + rc = (*upcall)(cookie, rc); RETURN(rc); } @@ -3105,36 +2831,87 @@ static int osc_enqueue_interpret(const struct lu_env *env, struct ptlrpc_request *req, struct osc_enqueue_args *aa, int rc) { - int intent = aa->oa_oi->oi_flags & LDLM_FL_HAS_INTENT; - struct lov_stripe_md *lsm = aa->oa_oi->oi_md; struct ldlm_lock *lock; + struct lustre_handle handle; + __u32 mode; + + /* Make a local copy of a lock handle and a mode, because aa->oa_* + * might be freed anytime after lock upcall has been called. */ + lustre_handle_copy(&handle, aa->oa_lockh); + mode = aa->oa_ei->ei_mode; /* ldlm_cli_enqueue is holding a reference on the lock, so it must * be valid. */ - lock = ldlm_handle2lock(aa->oa_oi->oi_lockh); + lock = ldlm_handle2lock(&handle); + + /* Take an additional reference so that a blocking AST that + * ldlm_cli_enqueue_fini() might post for a failed lock, is guaranteed + * to arrive after an upcall has been executed by + * osc_enqueue_fini(). */ + ldlm_lock_addref(&handle, mode); /* Complete obtaining the lock procedure. */ rc = ldlm_cli_enqueue_fini(aa->oa_exp, req, aa->oa_ei->ei_type, 1, - aa->oa_ei->ei_mode, - &aa->oa_oi->oi_flags, - &lsm->lsm_oinfo[0]->loi_lvb, - sizeof(lsm->lsm_oinfo[0]->loi_lvb), - lustre_swab_ost_lvb, - aa->oa_oi->oi_lockh, rc); - + mode, aa->oa_flags, aa->oa_lvb, + sizeof(*aa->oa_lvb), lustre_swab_ost_lvb, + &handle, rc); /* Complete osc stuff. */ - rc = osc_enqueue_fini(aa->oa_exp->exp_obd, req, aa->oa_oi, intent, rc); - + rc = osc_enqueue_fini(req, aa->oa_lvb, + aa->oa_upcall, aa->oa_cookie, aa->oa_flags, rc); /* Release the lock for async request. */ - if (lustre_handle_is_used(aa->oa_oi->oi_lockh) && rc == ELDLM_OK) - ldlm_lock_decref(aa->oa_oi->oi_lockh, aa->oa_ei->ei_mode); + if (lustre_handle_is_used(&handle) && rc == ELDLM_OK) + /* + * Releases a reference taken by ldlm_cli_enqueue(), if it is + * not already released by + * ldlm_cli_enqueue_fini()->failed_lock_cleanup() + */ + ldlm_lock_decref(&handle, mode); LASSERTF(lock != NULL, "lockh %p, req %p, aa %p - client evicted?\n", - aa->oa_oi->oi_lockh, req, aa); + aa->oa_lockh, req, aa); + ldlm_lock_decref(&handle, mode); LDLM_LOCK_PUT(lock); return rc; } +void osc_update_enqueue(struct lustre_handle *lov_lockhp, + struct lov_oinfo *loi, int flags, + struct ost_lvb *lvb, __u32 mode, int rc) +{ + if (rc == ELDLM_OK) { + struct ldlm_lock *lock = ldlm_handle2lock(lov_lockhp); + __u64 tmp; + + LASSERT(lock != NULL); + loi->loi_lvb = *lvb; + tmp = loi->loi_lvb.lvb_size; + /* Extend KMS up to the end of this lock and no further + * A lock on [x,y] means a KMS of up to y + 1 bytes! 
*/ + if (tmp > lock->l_policy_data.l_extent.end) + tmp = lock->l_policy_data.l_extent.end + 1; + if (tmp >= loi->loi_kms) { + LDLM_DEBUG(lock, "lock acquired, setting rss="LPU64 + ", kms="LPU64, loi->loi_lvb.lvb_size, tmp); + loi_kms_set(loi, tmp); + } else { + LDLM_DEBUG(lock, "lock acquired, setting rss=" + LPU64"; leaving kms="LPU64", end="LPU64, + loi->loi_lvb.lvb_size, loi->loi_kms, + lock->l_policy_data.l_extent.end); + } + ldlm_lock_allow_match(lock); + LDLM_LOCK_PUT(lock); + } else if (rc == ELDLM_LOCK_ABORTED && (flags & LDLM_FL_HAS_INTENT)) { + loi->loi_lvb = *lvb; + CDEBUG(D_INODE, "glimpsed, setting rss="LPU64"; leaving" + " kms="LPU64"\n", loi->loi_lvb.lvb_size, loi->loi_kms); + rc = ELDLM_OK; + } +} +EXPORT_SYMBOL(osc_update_enqueue); + +struct ptlrpc_request_set *PTLRPCD_SET = (void *)1; + /* When enqueuing asynchronously, locks are not ordered, we can obtain a lock * from the 2nd OSC before a lock from the 1st one. This does not deadlock with * other synchronous requests, however keeping some locks and trying to obtain @@ -3142,28 +2919,33 @@ static int osc_enqueue_interpret(const struct lu_env *env, * when other sync requests do not get released lock from a client, the client * is excluded from the cluster -- such scenarious make the life difficult, so * release locks just after they are obtained. */ -static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, - struct ldlm_enqueue_info *einfo, - struct ptlrpc_request_set *rqset) +int osc_enqueue_base(struct obd_export *exp, struct ldlm_res_id *res_id, + int *flags, ldlm_policy_data_t *policy, + struct ost_lvb *lvb, int kms_valid, + obd_enqueue_update_f upcall, void *cookie, + struct ldlm_enqueue_info *einfo, + struct lustre_handle *lockh, + struct ptlrpc_request_set *rqset, int async) { - struct ldlm_res_id res_id; struct obd_device *obd = exp->exp_obd; struct ptlrpc_request *req = NULL; - int intent = oinfo->oi_flags & LDLM_FL_HAS_INTENT; + int intent = *flags & LDLM_FL_HAS_INTENT; ldlm_mode_t mode; int rc; ENTRY; - - osc_build_res_name(oinfo->oi_md->lsm_object_id, - oinfo->oi_md->lsm_object_gr, &res_id); /* Filesystem lock extents are extended to page boundaries so that * dealing with the page cache is a little smoother. */ - oinfo->oi_policy.l_extent.start -= - oinfo->oi_policy.l_extent.start & ~CFS_PAGE_MASK; - oinfo->oi_policy.l_extent.end |= ~CFS_PAGE_MASK; + policy->l_extent.start -= policy->l_extent.start & ~CFS_PAGE_MASK; + policy->l_extent.end |= ~CFS_PAGE_MASK; - if (oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid == 0) + /* + * kms is not valid when either object is completely fresh (so that no + * locks are cached), or object was evicted. In the latter case cached + * lock cannot be used, because it would prime inode state with + * potentially stale LVB. + */ + if (!kms_valid) goto no_match; /* Next, search for already existing extent locks that will cover us */ @@ -3182,32 +2964,37 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, if (einfo->ei_mode == LCK_PR) mode |= LCK_PW; mode = ldlm_lock_match(obd->obd_namespace, - oinfo->oi_flags | LDLM_FL_LVB_READY, &res_id, - einfo->ei_type, &oinfo->oi_policy, mode, - oinfo->oi_lockh); + *flags | LDLM_FL_LVB_READY, res_id, + einfo->ei_type, policy, mode, lockh, 0); if (mode) { - /* addref the lock only if not async requests and PW lock is - * matched whereas we asked for PR. 
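The kms update in osc_update_enqueue() above clamps the server-reported size to the end of the just-granted extent lock and only ever raises the cached value. The arithmetic on its own, with invented names (update_kms, lock_end) and 64-bit unsigned math:

#include <stdio.h>

/* lvb_size: size reported by the server; lock_end: last byte covered
 * by the granted extent lock; kms: cached known minimum size */
static unsigned long long update_kms(unsigned long long kms,
                                     unsigned long long lvb_size,
                                     unsigned long long lock_end)
{
        unsigned long long tmp = lvb_size;

        /* a lock on [x,y] can only vouch for sizes up to y + 1 bytes */
        if (tmp > lock_end)
                tmp = lock_end + 1;
        /* kms only grows here; a smaller value leaves it untouched */
        return tmp >= kms ? tmp : kms;
}

int main(void)
{
        /* object is 1 MiB but the lock covers only the first 64 KiB,
         * so kms is clamped to 65536 rather than the full size */
        printf("%llu\n", update_kms(0, 1ULL << 20, 65535));
        return 0;
}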
*/ - if (!rqset && einfo->ei_mode != mode) - ldlm_lock_addref(oinfo->oi_lockh, LCK_PR); - osc_set_data_with_check(oinfo->oi_lockh, einfo->ei_cbdata, - oinfo->oi_flags); - if (intent) { - /* I would like to be able to ASSERT here that rss <= - * kms, but I can't, for reasons which are explained in - * lov_enqueue() */ - } - - /* We already have a lock, and it's referenced */ - oinfo->oi_cb_up(oinfo, ELDLM_OK); + struct ldlm_lock *matched = ldlm_handle2lock(lockh); + + if (matched->l_ast_data == NULL || + matched->l_ast_data == einfo->ei_cbdata) { + /* addref the lock only if not async requests and PW + * lock is matched whereas we asked for PR. */ + if (!rqset && einfo->ei_mode != mode) + ldlm_lock_addref(lockh, LCK_PR); + osc_set_lock_data_with_check(matched, einfo, *flags); + if (intent) { + /* I would like to be able to ASSERT here that + * rss <= kms, but I can't, for reasons which + * are explained in lov_enqueue() */ + } - /* For async requests, decref the lock. */ - if (einfo->ei_mode != mode) - ldlm_lock_decref(oinfo->oi_lockh, LCK_PW); - else if (rqset) - ldlm_lock_decref(oinfo->oi_lockh, einfo->ei_mode); + /* We already have a lock, and it's referenced */ + (*upcall)(cookie, ELDLM_OK); - RETURN(ELDLM_OK); + /* For async requests, decref the lock. */ + if (einfo->ei_mode != mode) + ldlm_lock_decref(lockh, LCK_PW); + else if (rqset) + ldlm_lock_decref(lockh, einfo->ei_mode); + LDLM_LOCK_PUT(matched); + RETURN(ELDLM_OK); + } else + ldlm_lock_decref(lockh, mode); + LDLM_LOCK_PUT(matched); } no_match: @@ -3223,56 +3010,76 @@ static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, RETURN(rc); req_capsule_set_size(&req->rq_pill, &RMF_DLM_LVB, RCL_SERVER, - sizeof(oinfo->oi_md->lsm_oinfo[0]->loi_lvb)); + sizeof *lvb); ptlrpc_request_set_replen(req); } /* users of osc_enqueue() can pass this flag for ldlm_lock_match() */ - oinfo->oi_flags &= ~LDLM_FL_BLOCK_GRANTED; + *flags &= ~LDLM_FL_BLOCK_GRANTED; - rc = ldlm_cli_enqueue(exp, &req, einfo, &res_id, - &oinfo->oi_policy, &oinfo->oi_flags, - &oinfo->oi_md->lsm_oinfo[0]->loi_lvb, - sizeof(oinfo->oi_md->lsm_oinfo[0]->loi_lvb), - lustre_swab_ost_lvb, oinfo->oi_lockh, - rqset ? 
1 : 0); + rc = ldlm_cli_enqueue(exp, &req, einfo, res_id, policy, flags, lvb, + sizeof(*lvb), lustre_swab_ost_lvb, lockh, async); if (rqset) { if (!rc) { struct osc_enqueue_args *aa; CLASSERT (sizeof(*aa) <= sizeof(req->rq_async_args)); aa = ptlrpc_req_async_args(req); - aa->oa_oi = oinfo; aa->oa_ei = einfo; aa->oa_exp = exp; + aa->oa_flags = flags; + aa->oa_upcall = upcall; + aa->oa_cookie = cookie; + aa->oa_lvb = lvb; + aa->oa_lockh = lockh; req->rq_interpret_reply = (ptlrpc_interpterer_t)osc_enqueue_interpret; - ptlrpc_set_add_req(rqset, req); + if (rqset == PTLRPCD_SET) + ptlrpcd_add_req(req, PSCOPE_OTHER); + else + ptlrpc_set_add_req(rqset, req); } else if (intent) { ptlrpc_req_finished(req); } RETURN(rc); } - rc = osc_enqueue_fini(obd, req, oinfo, intent, rc); + rc = osc_enqueue_fini(req, lvb, upcall, cookie, flags, rc); if (intent) ptlrpc_req_finished(req); RETURN(rc); } -static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, - __u32 type, ldlm_policy_data_t *policy, __u32 mode, - int *flags, void *data, struct lustre_handle *lockh) +static int osc_enqueue(struct obd_export *exp, struct obd_info *oinfo, + struct ldlm_enqueue_info *einfo, + struct ptlrpc_request_set *rqset) { struct ldlm_res_id res_id; + int rc; + ENTRY; + + osc_build_res_name(oinfo->oi_md->lsm_object_id, + oinfo->oi_md->lsm_object_gr, &res_id); + + rc = osc_enqueue_base(exp, &res_id, &oinfo->oi_flags, &oinfo->oi_policy, + &oinfo->oi_md->lsm_oinfo[0]->loi_lvb, + oinfo->oi_md->lsm_oinfo[0]->loi_kms_valid, + oinfo->oi_cb_up, oinfo, einfo, oinfo->oi_lockh, + rqset, rqset != NULL); + RETURN(rc); +} + +int osc_match_base(struct obd_export *exp, struct ldlm_res_id *res_id, + __u32 type, ldlm_policy_data_t *policy, __u32 mode, + int *flags, void *data, struct lustre_handle *lockh, + int unref) +{ struct obd_device *obd = exp->exp_obd; int lflags = *flags; ldlm_mode_t rc; ENTRY; - osc_build_res_name(lsm->lsm_object_id, lsm->lsm_object_gr, &res_id); - if (OBD_FAIL_CHECK(OBD_FAIL_OSC_MATCH)) RETURN(-EIO); @@ -3289,9 +3096,10 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, if (mode == LCK_PR) rc |= LCK_PW; rc = ldlm_lock_match(obd->obd_namespace, lflags | LDLM_FL_LVB_READY, - &res_id, type, policy, rc, lockh); + res_id, type, policy, rc, lockh, unref); if (rc) { - osc_set_data_with_check(lockh, data, lflags); + if (data != NULL) + osc_set_data_with_check(lockh, data, lflags); if (!(lflags & LDLM_FL_TEST_LOCK) && mode != rc) { ldlm_lock_addref(lockh, LCK_PR); ldlm_lock_decref(lockh, LCK_PW); @@ -3301,8 +3109,7 @@ static int osc_match(struct obd_export *exp, struct lov_stripe_md *lsm, RETURN(rc); } -static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md, - __u32 mode, struct lustre_handle *lockh) +int osc_cancel_base(struct lustre_handle *lockh, __u32 mode) { ENTRY; @@ -3314,6 +3121,13 @@ static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md, RETURN(0); } +static int osc_cancel(struct obd_export *exp, struct lov_stripe_md *md, + __u32 mode, struct lustre_handle *lockh) +{ + ENTRY; + RETURN(osc_cancel_base(lockh, mode)); +} + static int osc_cancel_unused(struct obd_export *exp, struct lov_stripe_md *lsm, int flags, void *opaque) @@ -3782,6 +3596,11 @@ static int osc_set_info_async(struct obd_export *exp, obd_count keylen, RETURN(0); } + if (KEY_IS(KEY_SPTLRPC_CONF)) { + sptlrpc_conf_client_adapt(obd); + RETURN(0); + } + if (KEY_IS(KEY_FLUSH_CTX)) { sptlrpc_import_flush_my_ctx(imp); RETURN(0); @@ -3822,7 +3641,7 @@ static int osc_set_info_async(struct 
obd_export *exp, obd_count keylen, oscc->oscc_oa.o_gr = (*(__u32 *)val); oscc->oscc_oa.o_valid |= OBD_MD_FLGROUP; - LASSERT(oscc->oscc_oa.o_gr > 0); + LASSERT_MDS_GROUP(oscc->oscc_oa.o_gr); req->rq_interpret_reply = osc_setinfo_mds_conn_interpret; } @@ -3935,14 +3754,21 @@ static int osc_reconnect(const struct lu_env *env, static int osc_disconnect(struct obd_export *exp) { struct obd_device *obd = class_exp2obd(exp); - struct llog_ctxt *ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + struct llog_ctxt *ctxt; int rc; - if (obd->u.cli.cl_conn_count == 1) - /* flush any remaining cancel messages out to the target */ - llog_sync(ctxt, exp); - - llog_ctxt_put(ctxt); + ctxt = llog_get_context(obd, LLOG_SIZE_REPL_CTXT); + if (ctxt) { + if (obd->u.cli.cl_conn_count == 1) { + /* Flush any remaining cancel messages out to the + * target */ + llog_sync(ctxt, exp); + } + llog_ctxt_put(ctxt); + } else { + CDEBUG(D_HA, "No LLOG_SIZE_REPL_CTXT found in obd %p\n", + obd); + } rc = client_disconnect_export(exp); return rc; @@ -3981,16 +3807,23 @@ static int osc_import_event(struct obd_device *obd, } case IMP_EVENT_INVALIDATE: { struct ldlm_namespace *ns = obd->obd_namespace; + struct lu_env *env; + int refcheck; + + env = cl_env_get(&refcheck); + if (!IS_ERR(env)) { + /* Reset grants */ + cli = &obd->u.cli; + client_obd_list_lock(&cli->cl_loi_list_lock); + /* all pages go to failing rpcs due to the invalid + * import */ + osc_check_rpcs(env, cli); + client_obd_list_unlock(&cli->cl_loi_list_lock); - /* Reset grants */ - cli = &obd->u.cli; - client_obd_list_lock(&cli->cl_loi_list_lock); - /* all pages go to failing rpcs due to the invalid import */ - osc_check_rpcs(cli); - client_obd_list_unlock(&cli->cl_loi_list_lock); - - ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); - + ldlm_namespace_cleanup(ns, LDLM_FL_LOCAL_ONLY); + cl_env_put(env, &refcheck); + } else + rc = PTR_ERR(env); break; } case IMP_EVENT_ACTIVE: { @@ -4059,11 +3892,6 @@ int osc_setup(struct obd_device *obd, struct lustre_cfg *lcfg) ptlrpc_init_rq_pool(cli->cl_max_rpcs_in_flight + 2, OST_MAXREQSIZE, ptlrpc_add_rqs_to_pool); - cli->cl_cache = cache_create(obd); - if (!cli->cl_cache) { - osc_cleanup(obd); - rc = -ENOMEM; - } } RETURN(rc); @@ -4091,12 +3919,17 @@ static int osc_precleanup(struct obd_device *obd, enum obd_cleanup_stage stage) client import will not have been cleaned. 
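 * (Editor's note: the hunk below now wraps the invalidate/free/destroy
 * sequence in down_write(&obd->u.cli.cl_sem)/up_write() and NULLs
 * imp_rq_pool after freeing it, so a repeated or racing cleanup pass
 * cannot free the request pool twice.)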
*/ if (obd->u.cli.cl_import) { struct obd_import *imp; + down_write(&obd->u.cli.cl_sem); imp = obd->u.cli.cl_import; CDEBUG(D_CONFIG, "%s: client import never connected\n", obd->obd_name); ptlrpc_invalidate_import(imp); - ptlrpc_free_rq_pool(imp->imp_rq_pool); + if (imp->imp_rq_pool) { + ptlrpc_free_rq_pool(imp->imp_rq_pool); + imp->imp_rq_pool = NULL; + } class_destroy_import(imp); + up_write(&obd->u.cli.cl_sem); obd->u.cli.cl_import = NULL; } rc = obd_llog_finish(obd, 0); @@ -4125,71 +3958,36 @@ int osc_cleanup(struct obd_device *obd) /* free memory of osc quota cache */ lquota_cleanup(quota_interface, obd); - cache_destroy(obd->u.cli.cl_cache); rc = client_obd_cleanup(obd); ptlrpcd_decref(); RETURN(rc); } -static int osc_register_page_removal_cb(struct obd_export *exp, - obd_page_removal_cb_t func, - obd_pin_extent_cb pin_cb) -{ - return cache_add_extent_removal_cb(exp->exp_obd->u.cli.cl_cache, func, - pin_cb); -} - -static int osc_unregister_page_removal_cb(struct obd_export *exp, - obd_page_removal_cb_t func) -{ - return cache_del_extent_removal_cb(exp->exp_obd->u.cli.cl_cache, func); -} - -static int osc_register_lock_cancel_cb(struct obd_export *exp, - obd_lock_cancel_cb cb) -{ - LASSERT(exp->exp_obd->u.cli.cl_ext_lock_cancel_cb == NULL); - - exp->exp_obd->u.cli.cl_ext_lock_cancel_cb = cb; - return 0; -} - -static int osc_unregister_lock_cancel_cb(struct obd_export *exp, - obd_lock_cancel_cb cb) +int osc_process_config_base(struct obd_device *obd, struct lustre_cfg *lcfg) { - if (exp->exp_obd->u.cli.cl_ext_lock_cancel_cb != cb) { - CERROR("Unregistering cancel cb %p, while only %p was " - "registered\n", cb, - exp->exp_obd->u.cli.cl_ext_lock_cancel_cb); - RETURN(-EINVAL); - } - - exp->exp_obd->u.cli.cl_ext_lock_cancel_cb = NULL; - return 0; -} - -static int osc_process_config(struct obd_device *obd, obd_count len, void *buf) -{ - struct lustre_cfg *lcfg = buf; struct lprocfs_static_vars lvars = { 0 }; int rc = 0; lprocfs_osc_init_vars(&lvars); switch (lcfg->lcfg_command) { - case LCFG_SPTLRPC_CONF: - rc = sptlrpc_cliobd_process_config(obd, lcfg); - break; default: rc = class_process_proc_param(PARAM_OSC, lvars.obd_vars, lcfg, obd); + if (rc > 0) + rc = 0; break; } return(rc); } +static int osc_process_config(struct obd_device *obd, obd_count len, void *buf) +{ + return osc_process_config_base(obd, buf); +} + struct obd_ops osc_obd_ops = { .o_owner = THIS_MODULE, .o_setup = osc_setup, @@ -4212,19 +4010,9 @@ struct obd_ops osc_obd_ops = { .o_setattr = osc_setattr, .o_setattr_async = osc_setattr_async, .o_brw = osc_brw, - .o_brw_async = osc_brw_async, - .o_prep_async_page = osc_prep_async_page, - .o_reget_short_lock = osc_reget_short_lock, - .o_release_short_lock = osc_release_short_lock, - .o_queue_async_io = osc_queue_async_io, - .o_set_async_flags = osc_set_async_flags, - .o_queue_group_io = osc_queue_group_io, - .o_trigger_group_io = osc_trigger_group_io, - .o_teardown_async_page = osc_teardown_async_page, .o_punch = osc_punch, .o_sync = osc_sync, .o_enqueue = osc_enqueue, - .o_match = osc_match, .o_change_cbdata = osc_change_cbdata, .o_cancel = osc_cancel, .o_cancel_unused = osc_cancel_unused, @@ -4235,18 +4023,25 @@ struct obd_ops osc_obd_ops = { .o_llog_init = osc_llog_init, .o_llog_finish = osc_llog_finish, .o_process_config = osc_process_config, - .o_register_page_removal_cb = osc_register_page_removal_cb, - .o_unregister_page_removal_cb = osc_unregister_page_removal_cb, - .o_register_lock_cancel_cb = osc_register_lock_cancel_cb, - .o_unregister_lock_cancel_cb = 
osc_unregister_lock_cancel_cb, }; +extern struct lu_kmem_descr osc_caches[]; +extern spinlock_t osc_ast_guard; +extern struct lock_class_key osc_ast_guard_class; + int __init osc_init(void) { struct lprocfs_static_vars lvars = { 0 }; int rc; ENTRY; + /* print an address of _any_ initialized kernel symbol from this + * module, to allow debugging with gdb that doesn't support data + * symbols from modules.*/ + CDEBUG(D_CONSOLE, "Lustre OSC module (%p).\n", &osc_caches); + + rc = lu_kmem_init(osc_caches); + lprocfs_osc_init_vars(&lvars); request_module("lquota"); @@ -4255,24 +4050,31 @@ int __init osc_init(void) init_obd_quota_ops(quota_interface, &osc_obd_ops); rc = class_register_type(&osc_obd_ops, NULL, lvars.module_vars, - LUSTRE_OSC_NAME, NULL); + LUSTRE_OSC_NAME, &osc_device_type); if (rc) { if (quota_interface) PORTAL_SYMBOL_PUT(osc_quota_interface); + lu_kmem_fini(osc_caches); RETURN(rc); } + spin_lock_init(&osc_ast_guard); + lockdep_set_class(&osc_ast_guard, &osc_ast_guard_class); + RETURN(rc); } #ifdef __KERNEL__ static void /*__exit*/ osc_exit(void) { + lu_device_type_fini(&osc_device_type); + lquota_exit(quota_interface); if (quota_interface) PORTAL_SYMBOL_PUT(osc_quota_interface); class_unregister_type(LUSTRE_OSC_NAME); + lu_kmem_fini(osc_caches); } MODULE_AUTHOR("Sun Microsystems, Inc. "); diff --git a/lustre/osd/osd_handler.c b/lustre/osd/osd_handler.c index 5d9a155..0d4b6be 100644 --- a/lustre/osd/osd_handler.c +++ b/lustre/osd/osd_handler.c @@ -82,6 +82,14 @@ #include "osd_internal.h" #include "osd_igif.h" +/* llo_* api support */ +#include + +static const char MDT_XATTR_NAME[] = "trusted.lma"; +static const char dot[] = "."; +static const char dotdot[] = ".."; +static const char remote_obj_dir[] = "REM_OBJ_DIR"; + struct osd_directory { struct iam_container od_container; struct iam_descr od_descr; @@ -102,6 +110,14 @@ struct osd_object { struct osd_directory *oo_dir; /** protects inode attributes. */ spinlock_t oo_guard; + /** + * Following two members are used to indicate the presence of dot and + * dotdot in the given directory. This is required for interop mode + * (b11826). 
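+ * Both flags start out clear for a freshly allocated object; + * osd_add_dot_dotdot() (later in this file) sets oo_compat_dot_created + * when "." is inserted and oo_compat_dotdot_created once ".." has been + * added via ldiskfs_add_dot_dotdot().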
+ */ + int oo_compat_dot_created; + int oo_compat_dotdot_created; + const struct lu_env *oo_owner; #ifdef CONFIG_LOCKDEP struct lockdep_map oo_dep_map; @@ -141,41 +157,64 @@ static int osd_fid_lookup (const struct lu_env *env, const struct lu_fid *fid); static void osd_inode_getattr (const struct lu_env *env, struct inode *inode, struct lu_attr *attr); -static void osd_inode_setattr (const struct lu_env *env, +static int osd_inode_setattr (const struct lu_env *env, struct inode *inode, const struct lu_attr *attr); static int osd_param_is_sane (const struct osd_device *dev, const struct txn_param *param); -static int osd_index_lookup (const struct lu_env *env, - struct dt_object *dt, - struct dt_rec *rec, const struct dt_key *key, - struct lustre_capa *capa); -static int osd_index_insert (const struct lu_env *env, - struct dt_object *dt, - const struct dt_rec *rec, - const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa); -static int osd_index_delete (const struct lu_env *env, - struct dt_object *dt, const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa); -static int osd_index_probe (const struct lu_env *env, - struct osd_object *o, - const struct dt_index_features *feat); +static int osd_index_iam_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa); +static int osd_index_ea_lookup(const struct lu_env *env, + struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa); +static int osd_index_iam_insert(const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *handle, + struct lustre_capa *capa, + int ignore_quota); +static int osd_index_ea_insert (const struct lu_env *env, + struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, + struct thandle *handle, + struct lustre_capa *capa, + int ignore_quota); +static int osd_index_iam_delete(const struct lu_env *env, + struct dt_object *dt, const struct dt_key *key, + struct thandle *handle, + struct lustre_capa *capa); +static int osd_index_ea_delete (const struct lu_env *env, + struct dt_object *dt, const struct dt_key *key, + struct thandle *handle, + struct lustre_capa *capa); + +static int osd_iam_index_probe (const struct lu_env *env, + struct osd_object *o, + const struct dt_index_features *feat); static int osd_index_try (const struct lu_env *env, struct dt_object *dt, const struct dt_index_features *feat); static void osd_index_fini (struct osd_object *o); -static void osd_it_fini (const struct lu_env *env, struct dt_it *di); -static int osd_it_get (const struct lu_env *env, +static void osd_it_iam_fini (const struct lu_env *env, struct dt_it *di); +static int osd_it_iam_get (const struct lu_env *env, + struct dt_it *di, const struct dt_key *key); +static void osd_it_iam_put (const struct lu_env *env, struct dt_it *di); +static int osd_it_iam_next (const struct lu_env *env, struct dt_it *di); +static int osd_it_iam_key_size (const struct lu_env *env, + const struct dt_it *di); +static void osd_it_ea_fini (const struct lu_env *env, struct dt_it *di); +static int osd_it_ea_get (const struct lu_env *env, struct dt_it *di, const struct dt_key *key); -static void osd_it_put (const struct lu_env *env, struct dt_it *di); -static int osd_it_next (const struct lu_env *env, struct dt_it *di); -static int osd_it_del (const struct lu_env *env, struct dt_it *di, - struct thandle *th); -static int
osd_it_key_size (const struct lu_env *env, +static void osd_it_ea_put (const struct lu_env *env, struct dt_it *di); +static int osd_it_ea_next (const struct lu_env *env, struct dt_it *di); +static int osd_it_ea_key_size(const struct lu_env *env, const struct dt_it *di); + static void osd_conf_get (const struct lu_env *env, const struct dt_device *dev, struct dt_device_param *param); @@ -201,13 +240,21 @@ static struct inode *osd_iget (struct osd_thread_info *info, struct osd_device *dev, const struct osd_inode_id *id); static struct super_block *osd_sb (const struct osd_device *dev); -static struct dt_it *osd_it_init (const struct lu_env *env, - struct dt_object *dt, int wable, +static struct dt_it *osd_it_iam_init (const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *capa); +static struct dt_key *osd_it_iam_key (const struct lu_env *env, + const struct dt_it *di); +static struct dt_rec *osd_it_iam_rec (const struct lu_env *env, + const struct dt_it *di); +static struct dt_it *osd_it_ea_init (const struct lu_env *env, + struct dt_object *dt, struct lustre_capa *capa); -static struct dt_key *osd_it_key (const struct lu_env *env, +static struct dt_key *osd_it_ea_key (const struct lu_env *env, const struct dt_it *di); -static struct dt_rec *osd_it_rec (const struct lu_env *env, +static struct dt_rec *osd_it_ea_rec (const struct lu_env *env, const struct dt_it *di); + static struct timespec *osd_inode_time (const struct lu_env *env, struct inode *inode, __u64 seconds); @@ -216,16 +263,23 @@ static struct thandle *osd_trans_start (const struct lu_env *env, struct txn_param *p); static journal_t *osd_journal (const struct osd_device *dev); +static int __osd_ea_add_rec(struct osd_thread_info *info, + struct osd_object *pobj, + struct osd_object *cobj, + const char *name, + struct thandle *th); + static const struct lu_device_type_operations osd_device_type_ops; -static struct lu_device_type osd_device_type; +static struct lu_device_type osd_device_type; static const struct lu_object_operations osd_lu_obj_ops; -static struct obd_ops osd_obd_device_ops; +static struct obd_ops osd_obd_device_ops; static const struct lu_device_operations osd_lu_ops; -static struct lu_context_key osd_key; +static struct lu_context_key osd_key; static const struct dt_object_operations osd_obj_ops; +static const struct dt_object_operations osd_obj_ea_ops; static const struct dt_body_operations osd_body_ops; -static const struct dt_index_operations osd_index_ops; -static const struct dt_index_operations osd_index_compat_ops; +static const struct dt_index_operations osd_index_iam_ops; +static const struct dt_index_operations osd_index_ea_ops; struct osd_thandle { struct thandle ot_super; @@ -236,6 +290,31 @@ struct osd_thandle { }; +#ifdef HAVE_QUOTA_SUPPORT +static inline void +osd_push_ctxt(const struct lu_env *env, struct osd_ctxt *save) +{ + struct md_ucred *uc = md_ucred(env); + + LASSERT(uc != NULL); + + save->oc_uid = current->fsuid; + save->oc_gid = current->fsgid; + save->oc_cap = current->cap_effective; + current->fsuid = uc->mu_fsuid; + current->fsgid = uc->mu_fsgid; + current->cap_effective = uc->mu_cap; +} + +static inline void +osd_pop_ctxt(struct osd_ctxt *save) +{ + current->fsuid = save->oc_uid; + current->fsgid = save->oc_gid; + current->cap_effective = save->oc_cap; +} +#endif + /* * Invariants, assertions. 
*/ @@ -317,7 +396,11 @@ static struct lu_object *osd_object_alloc(const struct lu_env *env, l = &mo->oo_dt.do_lu; dt_object_init(&mo->oo_dt, NULL, d); - mo->oo_dt.do_ops = &osd_obj_ops; + if (osd_dev(d)->od_iop_mode) + mo->oo_dt.do_ops = &osd_obj_ea_ops; + else + mo->oo_dt.do_ops = &osd_obj_ops; + l->lo_ops = &osd_lu_obj_ops; init_rwsem(&mo->oo_sem); spin_lock_init(&mo->oo_guard); @@ -372,11 +455,18 @@ static void osd_object_free(const struct lu_env *env, struct lu_object *l) OBD_FREE_PTR(obj); } -static struct iam_path_descr *osd_ipd_get(const struct lu_env *env, - const struct iam_container *bag) +static struct iam_path_descr *osd_it_ipd_get(const struct lu_env *env, + const struct iam_container *bag) +{ + return bag->ic_descr->id_ops->id_ipd_alloc(bag, + osd_oti_get(env)->oti_it_ipd); +} + +static struct iam_path_descr *osd_idx_ipd_get(const struct lu_env *env, + const struct iam_container *bag) { return bag->ic_descr->id_ops->id_ipd_alloc(bag, - osd_oti_get(env)->oti_ipd); + osd_oti_get(env)->oti_idx_ipd); } static void osd_ipd_put(const struct lu_env *env, @@ -433,7 +523,7 @@ static int osd_inode_remove(const struct lu_env *env, struct osd_object *obj) struct thandle *th; int result; - txn_param_init(prm, OSD_TXN_OI_DELETE_CREDITS + + txn_param_init(prm, OSD_TXN_OI_DELETE_CREDITS + OSD_TXN_INODE_DELETE_CREDITS); th = osd_trans_start(env, &osd->od_dt_dev, prm); if (!IS_ERR(th)) { @@ -460,8 +550,6 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l) /* * If object is unlinked remove fid->ino mapping from object index. - * - * File body will be deleted by iput(). */ osd_index_fini(obj); @@ -475,6 +563,7 @@ static void osd_object_delete(const struct lu_env *env, struct lu_object *l) "Failed to cleanup: %d\n", result); } + iput(inode); obj->oo_inode = NULL; } @@ -532,7 +621,7 @@ int osd_statfs(const struct lu_env *env, struct dt_device *d, } if (likely(result == 0)) - *sfs = osd->od_kstatfs; + *sfs = osd->od_kstatfs; spin_unlock(&osd->od_osfs_lock); return result; @@ -553,6 +642,19 @@ static void osd_conf_get(const struct lu_env *env, param->ddp_block_shift = osd_sb(osd_dt_dev(dev))->s_blocksize_bits; } +/** + * Helper function to get and fill the buffer with input values. + */ +static struct lu_buf *osd_buf_get(const struct lu_env *env, void *area, ssize_t len) +{ + struct lu_buf *buf; + + buf = &osd_oti_get(env)->oti_buf; + buf->lb_buf = area; + buf->lb_len = len; + return buf; +} + /* * Journal */ @@ -572,7 +674,7 @@ static int osd_param_is_sane(const struct osd_device *dev, static void osd_trans_commit_cb(struct journal_callback *jcb, int error) { struct osd_thandle *oh = container_of0(jcb, struct osd_thandle, ot_jcb); - struct thandle *th = &oh->ot_super; + struct thandle *th = &oh->ot_super; struct dt_device *dev = th->th_dev; struct lu_device *lud = &dev->dd_lu_dev; @@ -703,6 +805,28 @@ static int osd_sync(const struct lu_env *env, struct dt_device *d) return ldiskfs_force_commit(osd_sb(osd_dt_dev(d))); } +/** + * Start commit for OSD device. + * + * An implementation of dt_commit_async method for OSD device. + * Asynchronously starts underlying fs sync and thereby a transaction + * commit. + * + * \param env environment + * \param d dt device + * + * \see dt_device_operations + */ +static int osd_commit_async(const struct lu_env *env, + struct dt_device *d) +{ + struct super_block *s = osd_sb(osd_dt_dev(d)); + ENTRY; + + CDEBUG(D_HA, "async commit OSD %s\n", LUSTRE_OSD_NAME); + RETURN(s->s_op->sync_fs(s, 0)); +} + /* * Concurrency: shouldn't matter.
*/ @@ -719,6 +843,7 @@ static void osd_ro(const struct lu_env *env, struct dt_device *d) EXIT; } + /* * Concurrency: serialization provided by callers. */ @@ -736,46 +861,161 @@ static int osd_init_capa_ctxt(const struct lu_env *env, struct dt_device *d, RETURN(0); } -/* Note: we did not count into QUOTA here, If we mount with --data_journal - * we may need more*/ -static const int osd_dto_credits[DTO_NR] = { - /* - * Insert/Delete. IAM EXT3_INDEX_EXTRA_TRANS_BLOCKS(8) + - * EXT3_SINGLEDATA_TRANS_BLOCKS 8 XXX Note: maybe iam need more,since - * iam have more level than Ext3 htree +/** + * Concurrency: serialization provided by callers. + */ +static void osd_init_quota_ctxt(const struct lu_env *env, struct dt_device *d, + struct dt_quota_ctxt *ctxt, void *data) +{ + struct obd_device *obd = (void *)ctxt; + struct vfsmount *mnt = (struct vfsmount *)data; + ENTRY; + + obd->u.obt.obt_sb = mnt->mnt_root->d_inode->i_sb; + OBD_SET_CTXT_MAGIC(&obd->obd_lvfs_ctxt); + obd->obd_lvfs_ctxt.pwdmnt = mnt; + obd->obd_lvfs_ctxt.pwd = mnt->mnt_root; + obd->obd_lvfs_ctxt.fs = get_ds(); + + EXIT; +} + +/** + * Note: we do not count into QUOTA here. + * If we mount with --data_journal we may need more. + */ +static const int osd_dto_credits_noquota[DTO_NR] = { + /** + * Insert/Delete. + * INDEX_EXTRA_TRANS_BLOCKS(8) + + * SINGLEDATA_TRANS_BLOCKS(8) + * XXX Note: maybe iam needs more, since iam has more levels than the + * EXT3 htree. */ [DTO_INDEX_INSERT] = 16, [DTO_INDEX_DELETE] = 16, + /** + * Unused now + */ [DTO_IDNEX_UPDATE] = 16, - /* - * Create a object. Same as create object in Ext3 filesystem, but did - * not count QUOTA i EXT3_DATA_TRANS_BLOCKS(12) + - * INDEX_EXTRA_BLOCKS(8) + 3(inode bits,groups, GDT) + /** + * Create an object. The same as create object in EXT3. + * DATA_TRANS_BLOCKS(14) + + * INDEX_EXTRA_BLOCKS(8) + + * 3(inode bits, groups, GDT) */ - [DTO_OBJECT_CREATE] = 23, - [DTO_OBJECT_DELETE] = 23, + [DTO_OBJECT_CREATE] = 25, + /** + * Unused now + */ + [DTO_OBJECT_DELETE] = 25, + /** + * Attr set credits. + * 3(inode bits, group, GDT) + */ + [DTO_ATTR_SET_BASE] = 3, + /** + * Xattr set. The same as xattr of EXT3. + * DATA_TRANS_BLOCKS(14) + * XXX Note: in original MDS implementation INDEX_EXTRA_TRANS_BLOCKS are + * also counted in. Do not know why? + */ + [DTO_XATTR_SET] = 14, + [DTO_LOG_REC] = 14, + /** + * credits for inode change during write. + */ + [DTO_WRITE_BASE] = 3, + /** + * credits for single block write. + */ + [DTO_WRITE_BLOCK] = 14, + /** + * Attr set credits for chown. + * 3 (inode bit, group, GDT) + */ + [DTO_ATTR_SET_CHOWN]= 3 }; + +/** + * Note: we count into QUOTA here. + * If we mount with --data_journal we may need more. + */ +static const int osd_dto_credits_quota[DTO_NR] = { + /** + * INDEX_EXTRA_TRANS_BLOCKS(8) + + * SINGLEDATA_TRANS_BLOCKS(8) + + * 2 * QUOTA_TRANS_BLOCKS(2) + */ + [DTO_INDEX_INSERT] = 20, + /** + * INDEX_EXTRA_TRANS_BLOCKS(8) + + * SINGLEDATA_TRANS_BLOCKS(8) + + * 2 * QUOTA_TRANS_BLOCKS(2) + */ + [DTO_INDEX_DELETE] = 20, + /** + * Unused now. + */ + [DTO_IDNEX_UPDATE] = 16, /* - * Attr set credits 3 inode, group, GDT + * Create an object. Same as create object in EXT3 filesystem. + * DATA_TRANS_BLOCKS(16) + + * INDEX_EXTRA_BLOCKS(8) + + * 3(inode bits, groups, GDT) + + * 2 * QUOTA_INIT_BLOCKS(25) */ - [DTO_ATTR_SET] = 3, + [DTO_OBJECT_CREATE] = 77, /* - * XATTR_SET. SAME AS XATTR of EXT3 EXT3_DATA_TRANS_BLOCKS XXX Note: - * in original MDS implmentation EXT3_INDEX_EXTRA_TRANS_BLOCKS are - * also counted in. Do not know why? + * Unused now.
+ * DATA_TRANS_BLOCKS(16) + + * INDEX_EXTRA_BLOCKS(8) + + * 3(inode bits, groups, GDT) + + * QUOTA(?) + */ + [DTO_OBJECT_DELETE] = 27, + /** + * Attr set credits. + * 3 (inode bit, group, GDT) + + */ + [DTO_ATTR_SET_BASE] = 3, + /** + * Xattr set. The same as xattr of EXT3. + * DATA_TRANS_BLOCKS(16) + * XXX Note: in original MDS implementation INDEX_EXTRA_TRANS_BLOCKS are + * also counted in. Do not know why? */ [DTO_XATTR_SET] = 16, [DTO_LOG_REC] = 16, - /* creadits for inode change during write */ + /** + * credits for inode change during write. + */ [DTO_WRITE_BASE] = 3, - /* credits for single block write */ - [DTO_WRITE_BLOCK] = 12 + /** + * credits for single block write. + */ + [DTO_WRITE_BLOCK] = 16, + /** + * Attr set credits for chown. + * 3 (inode bit, group, GDT) + + * 2 * QUOTA_INIT_BLOCKS(25) + + * 2 * QUOTA_DEL_BLOCKS(9) + */ + [DTO_ATTR_SET_CHOWN]= 71 }; static int osd_credit_get(const struct lu_env *env, struct dt_device *d, enum dt_txn_op op) { - LASSERT(0 <= op && op < ARRAY_SIZE(osd_dto_credits)); - return osd_dto_credits[op]; + LASSERT(ARRAY_SIZE(osd_dto_credits_noquota) == + ARRAY_SIZE(osd_dto_credits_quota)); + LASSERT(0 <= op && op < ARRAY_SIZE(osd_dto_credits_noquota)); +#ifdef HAVE_QUOTA_SUPPORT + if (test_opt(osd_sb(osd_dt_dev(d)), QUOTA)) + return osd_dto_credits_quota[op]; + else +#endif + return osd_dto_credits_noquota[op]; } static const struct dt_device_operations osd_dt_ops = { @@ -786,8 +1026,10 @@ static const struct dt_device_operations osd_dt_ops = { .dt_conf_get = osd_conf_get, .dt_sync = osd_sync, .dt_ro = osd_ro, + .dt_commit_async = osd_commit_async, .dt_credit_get = osd_credit_get, .dt_init_capa_ctxt = osd_init_capa_ctxt, + .dt_init_quota_ctxt= osd_init_quota_ctxt, }; static void osd_object_read_lock(const struct lu_env *env, @@ -801,8 +1043,8 @@ static void osd_object_read_lock(const struct lu_env *env, LASSERT(obj->oo_owner != env); down_read_nested(&obj->oo_sem, role); - LASSERT(obj->oo_owner == NULL); - oti->oti_r_locks++; + LASSERT(obj->oo_owner == NULL); + oti->oti_r_locks++; } static void osd_object_write_lock(const struct lu_env *env, @@ -816,21 +1058,21 @@ static void osd_object_write_lock(const struct lu_env *env, LASSERT(obj->oo_owner != env); down_write_nested(&obj->oo_sem, role); - LASSERT(obj->oo_owner == NULL); - obj->oo_owner = env; - oti->oti_w_locks++; + LASSERT(obj->oo_owner == NULL); + obj->oo_owner = env; + oti->oti_w_locks++; } static void osd_object_read_unlock(const struct lu_env *env, struct dt_object *dt) { struct osd_object *obj = osd_dt_obj(dt); - struct osd_thread_info *oti = osd_oti_get(env); + struct osd_thread_info *oti = osd_oti_get(env); LINVRNT(osd_invariant(obj)); - LASSERT(oti->oti_r_locks > 0); - oti->oti_r_locks--; + LASSERT(oti->oti_r_locks > 0); + oti->oti_r_locks--; up_read(&obj->oo_sem); } @@ -838,14 +1080,14 @@ static void osd_object_write_unlock(const struct lu_env *env, struct dt_object *dt) { struct osd_object *obj = osd_dt_obj(dt); - struct osd_thread_info *oti = osd_oti_get(env); + struct osd_thread_info *oti = osd_oti_get(env); LINVRNT(osd_invariant(obj)); - LASSERT(obj->oo_owner == env); - LASSERT(oti->oti_w_locks > 0); - oti->oti_w_locks--; - obj->oo_owner = NULL; + LASSERT(obj->oo_owner == env); + LASSERT(oti->oti_w_locks > 0); + oti->oti_w_locks--; + obj->oo_owner = NULL; up_write(&obj->oo_sem); } @@ -855,6 +1097,7 @@ static int capa_is_sane(const struct lu_env *env, struct lustre_capa_key *keys) { struct osd_thread_info *oti = osd_oti_get(env); + struct lustre_capa *tcapa = &oti->oti_capa;
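+       /* (Editor's note) verification below proceeds in three steps:
+        * reject the capa if capa_is_expired_sec() says its expiry second
+        * has passed, find the server key whose lk_keyid matches lc_keyid
+        * under capa_lock, then recompute the HMAC into the per-thread
+        * tcapa buffer and memcmp() it against the client-supplied
+        * lc_hmac. */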
struct obd_capa *oc; int i, rc = 0; ENTRY; @@ -869,6 +1112,11 @@ static int capa_is_sane(const struct lu_env *env, RETURN(rc); } + if (capa_is_expired_sec(capa)) { + DEBUG_CAPA(D_ERROR, capa, "expired"); + RETURN(-ESTALE); + } + spin_lock(&capa_lock); for (i = 0; i < 2; i++) { if (keys[i].lk_keyid == capa->lc_keyid) { @@ -883,11 +1131,11 @@ static int capa_is_sane(const struct lu_env *env, RETURN(-ESTALE); } - rc = capa_hmac(oti->oti_capa.lc_hmac, capa, oti->oti_capa_key.lk_key); + rc = capa_hmac(tcapa->lc_hmac, capa, oti->oti_capa_key.lk_key); if (rc) RETURN(rc); - if (memcmp(oti->oti_capa.lc_hmac, capa->lc_hmac, sizeof(capa->lc_hmac))) - { + + if (memcmp(tcapa->lc_hmac, capa->lc_hmac, sizeof(capa->lc_hmac))) { DEBUG_CAPA(D_ERROR, capa, "HMAC mismatch"); RETURN(-EACCES); } @@ -903,6 +1151,7 @@ static int osd_object_auth(const struct lu_env *env, struct dt_object *dt, { const struct lu_fid *fid = lu_object_fid(&dt->do_lu); struct osd_device *dev = osd_dev(dt->do_lu.lo_dev); + struct md_capainfo *ci; int rc; if (!dev->od_fl_capa) @@ -911,6 +1160,13 @@ static int osd_object_auth(const struct lu_env *env, struct dt_object *dt, if (capa == BYPASS_CAPA) return 0; + ci = md_capainfo(env); + if (unlikely(!ci)) + return 0; + + if (ci->mc_auth == LC_ID_NONE) + return 0; + if (!capa) { CERROR("no capability is provided for fid "DFID"\n", PFID(fid)); return -EACCES; @@ -961,6 +1217,7 @@ static int osd_attr_set(const struct lu_env *env, struct lustre_capa *capa) { struct osd_object *obj = osd_dt_obj(dt); + int rc; LASSERT(handle != NULL); LASSERT(dt_object_exists(dt)); @@ -970,11 +1227,12 @@ static int osd_attr_set(const struct lu_env *env, return -EACCES; spin_lock(&obj->oo_guard); - osd_inode_setattr(env, obj->oo_inode, attr); + rc = osd_inode_setattr(env, obj->oo_inode, attr); spin_unlock(&obj->oo_guard); - mark_inode_dirty(obj->oo_inode); - return 0; + if (!rc) + mark_inode_dirty(obj->oo_inode); + return rc; } static struct timespec *osd_inode_time(const struct lu_env *env, @@ -989,8 +1247,8 @@ static struct timespec *osd_inode_time(const struct lu_env *env, return t; } -static void osd_inode_setattr(const struct lu_env *env, - struct inode *inode, const struct lu_attr *attr) +static int osd_inode_setattr(const struct lu_env *env, + struct inode *inode, const struct lu_attr *attr) { __u64 bits; @@ -998,6 +1256,24 @@ static void osd_inode_setattr(const struct lu_env *env, LASSERT(!(bits & LA_TYPE)); /* Huh? You want too much. */ +#ifdef HAVE_QUOTA_SUPPORT + if ((bits & LA_UID && attr->la_uid != inode->i_uid) || + (bits & LA_GID && attr->la_gid != inode->i_gid)) { + struct osd_ctxt *save = &osd_oti_get(env)->oti_ctxt; + struct iattr iattr; + int rc; + + iattr.ia_valid = bits & (LA_UID | LA_GID); + iattr.ia_uid = attr->la_uid; + iattr.ia_gid = attr->la_gid; + osd_push_ctxt(env, save); + rc = DQUOT_TRANSFER(inode, &iattr) ? -EDQUOT : 0; + osd_pop_ctxt(save); + if (rc != 0) + return rc; + } +#endif + if (bits & LA_ATIME) inode->i_atime = *osd_inode_time(env, inode, attr->la_atime); if (bits & LA_CTIME) @@ -1008,8 +1284,14 @@ static void osd_inode_setattr(const struct lu_env *env, LDISKFS_I(inode)->i_disksize = attr->la_size; i_size_write(inode, attr->la_size); } +# if 0 + /* + * OSD should not change "i_blocks" which is used by quota. + * "i_blocks" should be changed by ldiskfs only. + * Disable this assignment until SOM to fix some EA field. 
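+ * (i_blocks is adjusted by ldiskfs itself as blocks are allocated
+ * and freed, and quota accounts against it -- see the DQUOT_TRANSFER()
+ * call earlier in this function -- so copying la_blocks over it here
+ * would risk corrupting that accounting.)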
*/ if (bits & LA_BLOCKS) inode->i_blocks = attr->la_blocks; +#endif if (bits & LA_MODE) inode->i_mode = (inode->i_mode & S_IFMT) | (attr->la_mode & ~S_IFMT); @@ -1028,6 +1310,7 @@ static void osd_inode_setattr(const struct lu_env *env, li->i_flags = (li->i_flags & ~LDISKFS_FL_USER_MODIFIABLE) | (attr->la_flags & LDISKFS_FL_USER_MODIFIABLE); } + return 0; } /* @@ -1053,6 +1336,43 @@ static int osd_create_post(struct osd_thread_info *info, struct osd_object *obj, extern struct inode *ldiskfs_create_inode(handle_t *handle, struct inode * dir, int mode); +extern int ldiskfs_add_entry(handle_t *handle, struct dentry *dentry, + struct inode *inode); +extern int ldiskfs_delete_entry(handle_t *handle, + struct inode * dir, + struct ldiskfs_dir_entry_2 * de_del, + struct buffer_head * bh); +extern struct buffer_head * ldiskfs_find_entry(struct dentry *dentry, + struct ldiskfs_dir_entry_2 + ** res_dir); +extern int ldiskfs_add_dot_dotdot(handle_t *handle, struct inode *dir, + struct inode *inode); + +extern int ldiskfs_xattr_set_handle(handle_t *handle, struct inode *inode, + int name_index, const char *name, + const void *value, size_t value_len, + int flags); + +static struct dentry * osd_child_dentry_get(const struct lu_env *env, + struct osd_object *obj, + const char *name, + const int namelen) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct dentry *child_dentry = &info->oti_child_dentry; + struct dentry *obj_dentry = &info->oti_obj_dentry; + + obj_dentry->d_inode = obj->oo_inode; + obj_dentry->d_sb = osd_sb(osd_obj2dev(obj)); + obj_dentry->d_name.hash = 0; + + child_dentry->d_name.hash = 0; + child_dentry->d_parent = obj_dentry; + child_dentry->d_name.name = name; + child_dentry->d_name.len = namelen; + return child_dentry; +} + static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, umode_t mode, @@ -1062,23 +1382,34 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, int result; struct osd_device *osd = osd_obj2dev(obj); struct osd_thandle *oth; - struct inode *parent; + struct dt_object *parent; struct inode *inode; +#ifdef HAVE_QUOTA_SUPPORT + struct osd_ctxt *save = &info->oti_ctxt; +#endif LINVRNT(osd_invariant(obj)); LASSERT(obj->oo_inode == NULL); - LASSERT(osd->od_obj_area != NULL); oth = container_of(th, struct osd_thandle, ot_super); LASSERT(oth->ot_handle->h_transaction != NULL); if (hint && hint->dah_parent) - parent = osd_dt_obj(hint->dah_parent)->oo_inode; + parent = hint->dah_parent; else - parent = osd->od_obj_area->d_inode; - LASSERT(parent->i_op != NULL); + parent = osd->od_obj_area; + + LASSERT(parent != NULL); + LASSERT(osd_dt_obj(parent)->oo_inode->i_op != NULL); - inode = ldiskfs_create_inode(oth->ot_handle, parent, mode); +#ifdef HAVE_QUOTA_SUPPORT + osd_push_ctxt(info->oti_env, save); +#endif + inode = ldiskfs_create_inode(oth->ot_handle, + osd_dt_obj(parent)->oo_inode, mode); +#ifdef HAVE_QUOTA_SUPPORT + osd_pop_ctxt(save); +#endif if (!IS_ERR(inode)) { obj->oo_inode = inode; result = 0; @@ -1092,6 +1423,10 @@ static int osd_mkfile(struct osd_thread_info *info, struct osd_object *obj, extern int iam_lvar_create(struct inode *obj, int keysize, int ptrsize, int recsize, handle_t *handle); +extern int iam_lfix_create(struct inode *obj, int keysize, int ptrsize, + int recsize, handle_t *handle); + + enum { OSD_NAME_LEN = 255 }; @@ -1099,22 +1434,25 @@ enum { static int osd_mkdir(struct osd_thread_info *info, struct osd_object *obj, struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format 
*dof, struct thandle *th) { int result; struct osd_thandle *oth; + struct osd_device *osd = osd_obj2dev(obj); + __u32 mode = (attr->la_mode & (S_IFMT | S_IRWXUGO | S_ISVTX)); LASSERT(S_ISDIR(attr->la_mode)); oth = container_of(th, struct osd_thandle, ot_super); LASSERT(oth->ot_handle->h_transaction != NULL); - result = osd_mkfile(info, obj, (attr->la_mode & - (S_IFMT | S_IRWXUGO | S_ISVTX)), hint, th); - if (result == 0) { + result = osd_mkfile(info, obj, mode, hint, th); + if (result == 0 && osd->od_iop_mode == 0) { LASSERT(obj->oo_inode != NULL); /* * XXX uh-oh... call low-level iam function directly. */ + result = iam_lvar_create(obj->oo_inode, OSD_NAME_LEN, 4, sizeof (struct lu_fid_pack), oth->ot_handle); @@ -1122,9 +1460,47 @@ static int osd_mkdir(struct osd_thread_info *info, struct osd_object *obj, return result; } +static int osd_mk_index(struct osd_thread_info *info, struct osd_object *obj, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + int result; + struct osd_thandle *oth; + const struct dt_index_features *feat = dof->u.dof_idx.di_feat; + + __u32 mode = (attr->la_mode & (S_IFMT | S_IRWXUGO | S_ISVTX)); + + LASSERT(S_ISREG(attr->la_mode)); + + oth = container_of(th, struct osd_thandle, ot_super); + LASSERT(oth->ot_handle->h_transaction != NULL); + + result = osd_mkfile(info, obj, mode, hint, th); + if (result == 0) { + LASSERT(obj->oo_inode != NULL); + if (feat->dif_flags & DT_IND_VARKEY) + result = iam_lvar_create(obj->oo_inode, + feat->dif_keysize_max, + feat->dif_ptrsize, + feat->dif_recsize_max, + oth->ot_handle); + else + result = iam_lfix_create(obj->oo_inode, + feat->dif_keysize_max, + feat->dif_ptrsize, + feat->dif_recsize_max, + oth->ot_handle); + + } + return result; +} + static int osd_mkreg(struct osd_thread_info *info, struct osd_object *obj, struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *th) { LASSERT(S_ISREG(attr->la_mode)); @@ -1135,6 +1511,7 @@ static int osd_mkreg(struct osd_thread_info *info, struct osd_object *obj, static int osd_mksym(struct osd_thread_info *info, struct osd_object *obj, struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *th) { LASSERT(S_ISLNK(attr->la_mode)); @@ -1145,22 +1522,17 @@ static int osd_mksym(struct osd_thread_info *info, struct osd_object *obj, static int osd_mknod(struct osd_thread_info *info, struct osd_object *obj, struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *th) { - int result; - struct osd_device *osd = osd_obj2dev(obj); - struct inode *dir; umode_t mode = attr->la_mode & (S_IFMT | S_IRWXUGO | S_ISVTX); + int result; LINVRNT(osd_invariant(obj)); LASSERT(obj->oo_inode == NULL); - LASSERT(osd->od_obj_area != NULL); LASSERT(S_ISCHR(mode) || S_ISBLK(mode) || S_ISFIFO(mode) || S_ISSOCK(mode)); - dir = osd->od_obj_area->d_inode; - LASSERT(dir->i_op != NULL); - result = osd_mkfile(info, obj, mode, hint, th); if (result == 0) { LASSERT(obj->oo_inode != NULL); @@ -1173,28 +1545,30 @@ static int osd_mknod(struct osd_thread_info *info, struct osd_object *obj, typedef int (*osd_obj_type_f)(struct osd_thread_info *, struct osd_object *, struct lu_attr *, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *); -static osd_obj_type_f osd_create_type_f(__u32 mode) +static osd_obj_type_f osd_create_type_f(enum dt_format_type type) { osd_obj_type_f result; - switch (mode) { - case 
S_IFDIR: + switch (type) { + case DFT_DIR: result = osd_mkdir; break; - case S_IFREG: + case DFT_REGULAR: result = osd_mkreg; break; - case S_IFLNK: + case DFT_SYM: result = osd_mksym; break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + case DFT_NODE: result = osd_mknod; break; + case DFT_INDEX: + result = osd_mk_index; + break; + default: LBUG(); break; @@ -1213,19 +1587,62 @@ static void osd_ah_init(const struct lu_env *env, struct dt_allocation_hint *ah, ah->dah_mode = child_mode; } +/** + * Helper function for osd_object_create() + * + * \retval 0, on success + */ +static int __osd_object_create(struct osd_thread_info *info, + struct osd_object *obj, struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + + int result; -/* - * Concurrency: @dt is write locked. + result = osd_create_pre(info, obj, attr, th); + if (result == 0) { + result = osd_create_type_f(dof->dof_type)(info, obj, + attr, hint, dof, th); + if (result == 0) + result = osd_create_post(info, obj, attr, th); + } + return result; +} + +/** + * Helper function for osd_object_create() + * + * \retval 0, on success */ +static int __osd_oi_insert(const struct lu_env *env, struct osd_object *obj, + const struct lu_fid *fid, struct thandle *th) +{ + struct osd_thread_info *info = osd_oti_get(env); + struct osd_inode_id *id = &info->oti_id; + struct osd_device *osd = osd_obj2dev(obj); + struct md_ucred *uc = md_ucred(env); + + LASSERT(obj->oo_inode != NULL); + LASSERT(uc != NULL); + + id->oii_ino = obj->oo_inode->i_ino; + id->oii_gen = obj->oo_inode->i_generation; + + return osd_oi_insert(info, &osd->od_oi, fid, id, th, + uc->mu_cap & CFS_CAP_SYS_RESOURCE_MASK); +} + static int osd_object_create(const struct lu_env *env, struct dt_object *dt, - struct lu_attr *attr, + struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *th) { - const struct lu_fid *fid = lu_object_fid(&dt->do_lu); - struct osd_object *obj = osd_dt_obj(dt); - struct osd_device *osd = osd_obj2dev(obj); - struct osd_thread_info *info = osd_oti_get(env); + const struct lu_fid *fid = lu_object_fid(&dt->do_lu); + struct osd_object *obj = osd_dt_obj(dt); + struct osd_thread_info *info = osd_oti_get(env); int result; ENTRY; @@ -1235,77 +1652,219 @@ static int osd_object_create(const struct lu_env *env, struct dt_object *dt, LASSERT(osd_write_locked(env, obj)); LASSERT(th != NULL); - /* - * XXX missing: Quote handling. - */ - - result = osd_create_pre(info, obj, attr, th); - if (result == 0) { - result = osd_create_type_f(attr->la_mode & S_IFMT)(info, obj, - attr, hint, th); - if (result == 0) - result = osd_create_post(info, obj, attr, th); - } - if (result == 0) { - struct osd_inode_id *id = &info->oti_id; - - LASSERT(obj->oo_inode != NULL); - - id->oii_ino = obj->oo_inode->i_ino; - id->oii_gen = obj->oo_inode->i_generation; - - result = osd_oi_insert(info, &osd->od_oi, fid, id, th); - } + result = __osd_object_create(info, obj, attr, hint, dof, th); + if (result == 0) + result = __osd_oi_insert(env, obj, fid, th); LASSERT(ergo(result == 0, dt_object_exists(dt))); - LINVRNT(osd_invariant(obj)); + LASSERT(osd_invariant(obj)); RETURN(result); } -/* - * Concurrency: @dt is write locked. 
*/ +/** + * Helper function for osd_xattr_set() */ -static void osd_object_ref_add(const struct lu_env *env, - struct dt_object *dt, - struct thandle *th) +static int __osd_xattr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, const char *name, int fl) { - struct osd_object *obj = osd_dt_obj(dt); - struct inode *inode = obj->oo_inode; + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + struct osd_thread_info *info = osd_oti_get(env); + struct dentry *dentry = &info->oti_child_dentry; + struct timespec *t = &info->oti_time; + int fs_flags = 0; + int rc; - LINVRNT(osd_invariant(obj)); LASSERT(dt_object_exists(dt)); + LASSERT(inode->i_op != NULL && inode->i_op->setxattr != NULL); LASSERT(osd_write_locked(env, obj)); - LASSERT(th != NULL); - spin_lock(&obj->oo_guard); - LASSERT(inode->i_nlink < LDISKFS_LINK_MAX); - inode->i_nlink++; - spin_unlock(&obj->oo_guard); - mark_inode_dirty(inode); - LINVRNT(osd_invariant(obj)); + if (fl & LU_XATTR_REPLACE) + fs_flags |= XATTR_REPLACE; + + if (fl & LU_XATTR_CREATE) + fs_flags |= XATTR_CREATE; + + dentry->d_inode = inode; + *t = inode->i_ctime; + rc = inode->i_op->setxattr(dentry, name, buf->lb_buf, + buf->lb_len, fs_flags); + if (likely(rc == 0)) { + spin_lock(&obj->oo_guard); + inode->i_ctime = *t; + spin_unlock(&obj->oo_guard); + mark_inode_dirty(inode); + } + return rc; } -/* - * Concurrency: @dt is write locked. +/** + * Put the fid into lustre_mdt_attrs, and then place the structure in the + * inode's ea. This fid should not be altered during the lifetime + * of the inode. + * + * \retval +ve, on success + * \retval -ve, on error + * + * FIXME: It is good to have/use ldiskfs_xattr_set_handle() here */ -static void osd_object_ref_del(const struct lu_env *env, - struct dt_object *dt, - struct thandle *th) +static int osd_ea_fid_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_fid *fid) { - struct osd_object *obj = osd_dt_obj(dt); - struct inode *inode = obj->oo_inode; + struct osd_thread_info *info = osd_oti_get(env); + struct lustre_mdt_attrs *mdt_attrs = &info->oti_mdt_attrs; - LINVRNT(osd_invariant(obj)); - LASSERT(dt_object_exists(dt)); - LASSERT(osd_write_locked(env, obj)); - LASSERT(th != NULL); + fid_cpu_to_be(&mdt_attrs->lma_self_fid, fid); - spin_lock(&obj->oo_guard); - LASSERT(inode->i_nlink > 0); - inode->i_nlink--; - spin_unlock(&obj->oo_guard); - mark_inode_dirty(inode); - LINVRNT(osd_invariant(obj)); + return __osd_xattr_set(env, dt, + osd_buf_get(env, mdt_attrs, sizeof *mdt_attrs), + MDT_XATTR_NAME, LU_XATTR_CREATE); + +} + +/** + * Helper function to form igif + */ +static inline void osd_igif_get(const struct lu_env *env, struct dentry *dentry, + struct lu_fid *fid) +{ + struct inode *inode = dentry->d_inode; + lu_igif_build(fid, inode->i_ino, inode->i_generation); +} + +/** + * Helper function to pack the fid + */ +static inline void osd_fid_pack(const struct lu_env *env, const struct lu_fid *fid, + struct lu_fid_pack *pack) +{ + fid_pack(pack, fid, &osd_oti_get(env)->oti_fid); +} + +/** + * Try to read the fid from inode ea into dt_rec; if the return value + * i.e.
rc is +ve, then we got the fid; otherwise we will have to form an igif + * + * \param rec the data-structure into which fid/igif is read + * + * \retval 0, on success + */ +static int osd_ea_fid_get(const struct lu_env *env, struct dentry *dentry, + struct dt_rec *rec) +{ + struct inode *inode = dentry->d_inode; + struct osd_thread_info *info = osd_oti_get(env); + struct lustre_mdt_attrs *mdt_attrs = &info->oti_mdt_attrs; + struct lu_fid *fid = &info->oti_fid; + int rc; + + LASSERT(inode->i_op != NULL && inode->i_op->getxattr != NULL); + + rc = inode->i_op->getxattr(dentry, MDT_XATTR_NAME, (void *)mdt_attrs, + sizeof *mdt_attrs); + + if (rc > 0) { + fid_be_to_cpu(fid, &mdt_attrs->lma_self_fid); + rc = 0; + } else if (rc == -ENODATA) { + osd_igif_get(env, dentry, fid); + rc = 0; + } + + if (rc == 0) + osd_fid_pack(env, fid, (struct lu_fid_pack*)rec); + + return rc; +} + +/** + * OSD layer object create function for interoperability mode (b11826). + * This is mostly similar to osd_object_create(). The only difference is that + * the fid is inserted into the inode ea here. + * + * \retval 0, on success + * \retval -ve, on error + */ +static int osd_object_ea_create(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th) +{ + const struct lu_fid *fid = lu_object_fid(&dt->do_lu); + struct osd_object *obj = osd_dt_obj(dt); + struct osd_thread_info *info = osd_oti_get(env); + int result; + int is_root = 0; + + ENTRY; + + LASSERT(osd_invariant(obj)); + LASSERT(!dt_object_exists(dt)); + LASSERT(osd_write_locked(env, obj)); + LASSERT(th != NULL); + + result = __osd_object_create(info, obj, attr, hint, dof, th); + + if (hint && hint->dah_parent) + is_root = osd_object_is_root(osd_dt_obj(hint->dah_parent)); + + /* objects under osd root should have igif fid, so don't add fid EA */ + if (result == 0 && is_root == 0) + result = osd_ea_fid_set(env, dt, fid); + + if (result == 0) + result = __osd_oi_insert(env, obj, fid, th); + + LASSERT(ergo(result == 0, dt_object_exists(dt))); + LINVRNT(osd_invariant(obj)); + RETURN(result); +} + +/* + * Concurrency: @dt is write locked.
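+ * + * Like osd_object_ref_add() above, the i_nlink update below is done + * under the oo_guard spinlock and followed by mark_inode_dirty(); the + * LASSERTs keep the link count above zero here and below + * LDISKFS_LINK_MAX in ref_add.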
+ */ +static void osd_object_ref_del(const struct lu_env *env, + struct dt_object *dt, + struct thandle *th) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct inode *inode = obj->oo_inode; + + LINVRNT(osd_invariant(obj)); + LASSERT(dt_object_exists(dt)); + LASSERT(osd_write_locked(env, obj)); + LASSERT(th != NULL); + + spin_lock(&obj->oo_guard); + LASSERT(inode->i_nlink > 0); + inode->i_nlink--; + spin_unlock(&obj->oo_guard); + mark_inode_dirty(inode); + LINVRNT(osd_invariant(obj)); } /* @@ -1320,7 +1879,7 @@ static int osd_xattr_get(const struct lu_env *env, struct osd_object *obj = osd_dt_obj(dt); struct inode *inode = obj->oo_inode; struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_dentry; + struct dentry *dentry = &info->oti_obj_dentry; LASSERT(dt_object_exists(dt)); LASSERT(inode->i_op != NULL && inode->i_op->getxattr != NULL); @@ -1333,6 +1892,7 @@ static int osd_xattr_get(const struct lu_env *env, return inode->i_op->getxattr(dentry, name, buf->lb_buf, buf->lb_len); } + /* * Concurrency: @dt is write locked. */ @@ -1340,39 +1900,12 @@ static int osd_xattr_set(const struct lu_env *env, struct dt_object *dt, const struct lu_buf *buf, const char *name, int fl, struct thandle *handle, struct lustre_capa *capa) { - struct osd_object *obj = osd_dt_obj(dt); - struct inode *inode = obj->oo_inode; - struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_dentry; - struct timespec *t = &info->oti_time; - int fs_flags = 0, rc; - - LASSERT(dt_object_exists(dt)); - LASSERT(inode->i_op != NULL && inode->i_op->setxattr != NULL); - LASSERT(osd_write_locked(env, obj)); LASSERT(handle != NULL); if (osd_object_auth(env, dt, capa, CAPA_OPC_META_WRITE)) return -EACCES; - if (fl & LU_XATTR_REPLACE) - fs_flags |= XATTR_REPLACE; - - if (fl & LU_XATTR_CREATE) - fs_flags |= XATTR_CREATE; - - dentry->d_inode = inode; - *t = inode->i_ctime; - rc = inode->i_op->setxattr(dentry, name, - buf->lb_buf, buf->lb_len, fs_flags); - if (likely(rc == 0)) { - /* ctime should not be updated with server-side time. 
*/ - spin_lock(&obj->oo_guard); - inode->i_ctime = *t; - spin_unlock(&obj->oo_guard); - mark_inode_dirty(inode); - } - return rc; + return __osd_xattr_set(env, dt, buf, name, fl); } /* @@ -1386,7 +1919,7 @@ static int osd_xattr_list(const struct lu_env *env, struct osd_object *obj = osd_dt_obj(dt); struct inode *inode = obj->oo_inode; struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_dentry; + struct dentry *dentry = &info->oti_obj_dentry; LASSERT(dt_object_exists(dt)); LASSERT(inode->i_op != NULL && inode->i_op->listxattr != NULL); @@ -1411,7 +1944,7 @@ static int osd_xattr_del(const struct lu_env *env, struct osd_object *obj = osd_dt_obj(dt); struct inode *inode = obj->oo_inode; struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_dentry; + struct dentry *dentry = &info->oti_obj_dentry; struct timespec *t = &info->oti_time; int rc; @@ -1448,6 +1981,7 @@ static struct obd_capa *osd_capa_get(const struct lu_env *env, struct lustre_capa_key *key = &info->oti_capa_key; struct lustre_capa *capa = &info->oti_capa; struct obd_capa *oc; + struct md_capainfo *ci; int rc; ENTRY; @@ -1461,10 +1995,41 @@ static struct obd_capa *osd_capa_get(const struct lu_env *env, if (old && osd_object_auth(env, dt, old, opc)) RETURN(ERR_PTR(-EACCES)); + ci = md_capainfo(env); + if (unlikely(!ci)) + RETURN(ERR_PTR(-ENOENT)); + + switch (ci->mc_auth) { + case LC_ID_NONE: + RETURN(NULL); + case LC_ID_PLAIN: + capa->lc_uid = obj->oo_inode->i_uid; + capa->lc_gid = obj->oo_inode->i_gid; + capa->lc_flags = LC_ID_PLAIN; + break; + case LC_ID_CONVERT: { + __u32 d[4], s[4]; + + s[0] = obj->oo_inode->i_uid; + get_random_bytes(&(s[1]), sizeof(__u32)); + s[2] = obj->oo_inode->i_gid; + get_random_bytes(&(s[3]), sizeof(__u32)); + rc = capa_encrypt_id(d, s, key->lk_key, CAPA_HMAC_KEY_MAX_LEN); + if (unlikely(rc)) + RETURN(ERR_PTR(rc)); + + capa->lc_uid = ((__u64)d[1] << 32) | d[0]; + capa->lc_gid = ((__u64)d[3] << 32) | d[2]; + capa->lc_flags = LC_ID_CONVERT; + break; + } + default: + RETURN(ERR_PTR(-EINVAL)); + } + capa->lc_fid = *fid; capa->lc_opc = opc; - capa->lc_uid = 0; - capa->lc_flags = dev->od_capa_alg << 24; + capa->lc_flags |= dev->od_capa_alg << 24; capa->lc_timeout = dev->od_capa_timeout; capa->lc_expiry = 0; @@ -1497,7 +2062,7 @@ static int osd_object_sync(const struct lu_env *env, struct dt_object *dt) struct osd_object *obj = osd_dt_obj(dt); struct inode *inode = obj->oo_inode; struct osd_thread_info *info = osd_oti_get(env); - struct dentry *dentry = &info->oti_dentry; + struct dentry *dentry = &info->oti_obj_dentry; struct file *file = &info->oti_file; ENTRY; @@ -1531,6 +2096,30 @@ static const struct dt_object_operations osd_obj_ops = { .do_object_sync = osd_object_sync, }; +/** + * dt_object_operations for interoperability mode + * (i.e. 
to run 2.0 mds on 1.8 disk) (b11826) + */ +static const struct dt_object_operations osd_obj_ea_ops = { + .do_read_lock = osd_object_read_lock, + .do_write_lock = osd_object_write_lock, + .do_read_unlock = osd_object_read_unlock, + .do_write_unlock = osd_object_write_unlock, + .do_attr_get = osd_attr_get, + .do_attr_set = osd_attr_set, + .do_ah_init = osd_ah_init, + .do_create = osd_object_ea_create, + .do_index_try = osd_index_try, + .do_ref_add = osd_object_ref_add, + .do_ref_del = osd_object_ref_del, + .do_xattr_get = osd_xattr_get, + .do_xattr_set = osd_xattr_set, + .do_xattr_del = osd_xattr_del, + .do_xattr_list = osd_xattr_list, + .do_capa_get = osd_capa_get, + .do_object_sync = osd_object_sync, +}; + /* * Body operations. */ @@ -1564,11 +2153,15 @@ static ssize_t osd_read(const struct lu_env *env, struct dt_object *dt, static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, const struct lu_buf *buf, loff_t *pos, - struct thandle *handle, struct lustre_capa *capa) + struct thandle *handle, struct lustre_capa *capa, + int ignore_quota) { struct inode *inode = osd_dt_obj(dt)->oo_inode; struct osd_thandle *oh; ssize_t result; +#ifdef HAVE_QUOTA_SUPPORT + cfs_cap_t save = current->cap_effective; +#endif LASSERT(handle != NULL); @@ -1577,8 +2170,17 @@ static ssize_t osd_write(const struct lu_env *env, struct dt_object *dt, oh = container_of(handle, struct osd_thandle, ot_super); LASSERT(oh->ot_handle->h_transaction != NULL); +#ifdef HAVE_QUOTA_SUPPORT + if (ignore_quota) + current->cap_effective |= CFS_CAP_SYS_RESOURCE_MASK; + else + current->cap_effective &= ~CFS_CAP_SYS_RESOURCE_MASK; +#endif result = fsfilt_ldiskfs_write_handle(inode, buf->lb_buf, buf->lb_len, pos, oh->ot_handle); +#ifdef HAVE_QUOTA_SUPPORT + current->cap_effective = save; +#endif if (result == 0) result = buf->lb_len; return result; @@ -1598,10 +2200,11 @@ static int osd_object_is_root(const struct osd_object *obj) return osd_sb(osd_obj2dev(obj))->s_root->d_inode == obj->oo_inode; } -static int osd_index_probe(const struct lu_env *env, struct osd_object *o, +static int osd_iam_index_probe(const struct lu_env *env, struct osd_object *o, const struct dt_index_features *feat) { struct iam_descr *descr; + struct dt_object *dt = &o->oo_dt; if (osd_object_is_root(o)) return feat == &dt_directory_features; @@ -1609,14 +2212,23 @@ static int osd_index_probe(const struct lu_env *env, struct osd_object *o, LASSERT(o->oo_dir != NULL); descr = o->oo_dir->od_container.ic_descr; - if (feat == &dt_directory_features) - return descr == &iam_htree_compat_param || - (descr->id_rec_size == sizeof(struct lu_fid_pack) && - 1 /* - * XXX check that index looks like directory. - */ - ); - else + if (feat == &dt_directory_features) { + if (descr->id_rec_size == sizeof(struct lu_fid_pack)) + return 1; + + if (descr == &iam_htree_compat_param) { + /* if it is an HTREE dir then there is a good chance + * that we are dealing with an ext3 directory here with + * no FIDs.
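+ * The record-size check below encodes that: a plain ldiskfs + * directory entry stores only the 32-bit inode number, so an + * id_rec_size equal to the size of ldiskfs_dir_entry_2's inode + * field means no packed fids are present and the EA-based index + * ops are installed instead.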
*/ + + if (descr->id_rec_size == + sizeof ((struct ldiskfs_dir_entry_2 *)NULL)->inode) { + + dt->do_index_ops = &osd_index_ea_ops; + return 1; + } + } + return 0; + } else { return feat->dif_keysize_min <= descr->id_key_size && descr->id_key_size <= feat->dif_keysize_max && @@ -1627,11 +2239,12 @@ static int osd_index_probe(const struct lu_env *env, struct osd_object *o, ergo(feat->dif_flags & DT_IND_UPDATE, 1 /* XXX check that object (and file system) is * writable */); + } } -static int osd_container_init(const struct lu_env *env, - struct osd_object *obj, - struct osd_directory *dir) +static int osd_iam_container_init(const struct lu_env *env, + struct osd_object *obj, + struct osd_directory *dir) { int result; struct iam_container *bag; @@ -1641,7 +2254,7 @@ static int osd_container_init(const struct lu_env *env, if (result == 0) { result = iam_container_setup(bag); if (result == 0) - obj->oo_dt.do_index_ops = &osd_index_ops; + obj->oo_dt.do_index_ops = &osd_index_iam_ops; else iam_container_fini(bag); } @@ -1655,14 +2268,23 @@ static int osd_index_try(const struct lu_env *env, struct dt_object *dt, const struct dt_index_features *feat) { int result; + int ea_dir = 0; struct osd_object *obj = osd_dt_obj(dt); + struct osd_device *osd = osd_obj2dev(obj); LINVRNT(osd_invariant(obj)); LASSERT(dt_object_exists(dt)); if (osd_object_is_root(obj)) { - dt->do_index_ops = &osd_index_compat_ops; + dt->do_index_ops = &osd_index_ea_ops; result = 0; + } else if (feat == &dt_directory_features && osd->od_iop_mode) { + dt->do_index_ops = &osd_index_ea_ops; + if (S_ISDIR(obj->oo_inode->i_mode)) + result = 0; + else + result = -ENOTDIR; + ea_dir = 1; } else if (!osd_has_index(obj)) { struct osd_directory *dir; @@ -1688,7 +2310,7 @@ static int osd_index_try(const struct lu_env *env, struct dt_object *dt, * recheck under lock. 
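 * (Editor's note: this is the check / lock / re-check pattern --
 * osd_has_index() was tested optimistically before od_sem was taken,
 * and is tested again here so that two racing threads cannot both run
 * osd_iam_container_init() on the same object.)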
*/ if (!osd_has_index(obj)) - result = osd_container_init(env, obj, dir); + result = osd_iam_container_init(env, obj, dir); else result = 0; up(&obj->oo_dir->od_sem); @@ -1697,8 +2319,8 @@ static int osd_index_try(const struct lu_env *env, struct dt_object *dt, } else result = 0; - if (result == 0) { - if (!osd_index_probe(env, obj, feat)) + if (result == 0 && ea_dir == 0) { + if (!osd_iam_index_probe(env, obj, feat)) result = -ENOTDIR; } LINVRNT(osd_invariant(obj)); @@ -1706,9 +2328,21 @@ static int osd_index_try(const struct lu_env *env, struct dt_object *dt, return result; } -static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, - const struct dt_key *key, struct thandle *handle, - struct lustre_capa *capa) +/** + * delete a (key, value) pair from index \a dt specified by \a key + * + * \param dt_object osd index object + * \param key key for index + * \param rec record reference + * \param handle transaction handler + * + * \retval 0 success + * \retval -ve failure + */ + +static int osd_index_iam_delete(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *handle, + struct lustre_capa *capa) { struct osd_object *obj = osd_dt_obj(dt); struct osd_thandle *oh; @@ -1726,7 +2360,7 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_DELETE)) RETURN(-EACCES); - ipd = osd_ipd_get(env, bag); + ipd = osd_idx_ipd_get(env, bag); if (unlikely(ipd == NULL)) RETURN(-ENOMEM); @@ -1740,438 +2374,947 @@ static int osd_index_delete(const struct lu_env *env, struct dt_object *dt, RETURN(rc); } -static int osd_index_lookup(const struct lu_env *env, struct dt_object *dt, - struct dt_rec *rec, const struct dt_key *key, - struct lustre_capa *capa) +/** + * Index delete function for interoperability mode (b11826). + * It will remove the directory entry added by osd_index_ea_insert(). + * This entry is needed to maintain name->fid mapping. + * + * \param key, key i.e. file entry to be deleted + * + * \retval 0, on success + * \retval -ve, on error + */ +static int osd_index_ea_delete(const struct lu_env *env, struct dt_object *dt, + const struct dt_key *key, struct thandle *handle, + struct lustre_capa *capa) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct inode *dir = obj->oo_inode; + struct dentry *dentry; + struct osd_thandle *oh; + struct ldiskfs_dir_entry_2 *de; + struct buffer_head *bh; + + int rc; + + ENTRY; + + LINVRNT(osd_invariant(obj)); + LASSERT(dt_object_exists(dt)); + LASSERT(handle != NULL); + + oh = container_of(handle, struct osd_thandle, ot_super); + LASSERT(oh->ot_handle != NULL); + LASSERT(oh->ot_handle->h_transaction != NULL); + + if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_DELETE)) + RETURN(-EACCES); + + dentry = osd_child_dentry_get(env, obj, + (char *)key, strlen((char *)key)); + bh = ldiskfs_find_entry(dentry, &de); + if (bh) { + rc = ldiskfs_delete_entry(oh->ot_handle, + dir, de, bh); + if (!rc) + mark_inode_dirty(dir); + brelse(bh); + } else + rc = -ENOENT; + + LASSERT(osd_invariant(obj)); + RETURN(rc); +} + +/** + * Lookup index for \a key and copy record to \a rec. 
+ * + * \param dt_object osd index object + * \param key key for index + * \param rec record reference + * + * \retval +ve success : exact mach + * \retval 0 return record with key not greater than \a key + * \retval -ve failure + */ +static int osd_index_iam_lookup(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa) { struct osd_object *obj = osd_dt_obj(dt); struct iam_path_descr *ipd; struct iam_container *bag = &obj->oo_dir->od_container; + struct osd_thread_info *oti = osd_oti_get(env); + struct iam_iterator *it = &oti->oti_idx_it; int rc; - ENTRY; - LINVRNT(osd_invariant(obj)); + LASSERT(osd_invariant(obj)); LASSERT(dt_object_exists(dt)); LASSERT(bag->ic_object == obj->oo_inode); if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_LOOKUP)) - return -EACCES; + RETURN(-EACCES); - ipd = osd_ipd_get(env, bag); - if (unlikely(ipd == NULL)) + ipd = osd_idx_ipd_get(env, bag); + if (IS_ERR(ipd)) RETURN(-ENOMEM); - rc = iam_lookup(bag, (const struct iam_key *)key, - (struct iam_rec *)rec, ipd); + /* got ipd now we can start iterator. */ + iam_it_init(it, bag, 0, ipd); + + rc = iam_it_get(it, (struct iam_key *)key); + if (rc >= 0) + iam_reccpy(&it->ii_path.ip_leaf, (struct iam_rec *)rec); + + iam_it_put(it); + iam_it_fini(it); osd_ipd_put(env, bag, ipd); + LINVRNT(osd_invariant(obj)); RETURN(rc); } -static int osd_index_insert(const struct lu_env *env, struct dt_object *dt, - const struct dt_rec *rec, const struct dt_key *key, - struct thandle *th, struct lustre_capa *capa) +/** + * Inserts (key, value) pair in \a dt index object. + * + * \param dt osd index object + * \param key key for index + * \param rec record reference + * \param th transaction handler + * + * \retval 0 success + * \retval -ve failure + */ +static int osd_index_iam_insert(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, const struct dt_key *key, + struct thandle *th, struct lustre_capa *capa, + int ignore_quota) { struct osd_object *obj = osd_dt_obj(dt); struct iam_path_descr *ipd; struct osd_thandle *oh; struct iam_container *bag = &obj->oo_dir->od_container; +#ifdef HAVE_QUOTA_SUPPORT + cfs_cap_t save = current->cap_effective; +#endif + int rc; + + ENTRY; + + LINVRNT(osd_invariant(obj)); + LASSERT(dt_object_exists(dt)); + LASSERT(bag->ic_object == obj->oo_inode); + LASSERT(th != NULL); + + if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_INSERT)) + return -EACCES; + + ipd = osd_idx_ipd_get(env, bag); + if (unlikely(ipd == NULL)) + RETURN(-ENOMEM); + + oh = container_of0(th, struct osd_thandle, ot_super); + LASSERT(oh->ot_handle != NULL); + LASSERT(oh->ot_handle->h_transaction != NULL); +#ifdef HAVE_QUOTA_SUPPORT + if (ignore_quota) + current->cap_effective |= CFS_CAP_SYS_RESOURCE_MASK; + else + current->cap_effective &= ~CFS_CAP_SYS_RESOURCE_MASK; +#endif + rc = iam_insert(oh->ot_handle, bag, (const struct iam_key *)key, + (struct iam_rec *)rec, ipd); +#ifdef HAVE_QUOTA_SUPPORT + current->cap_effective = save; +#endif + osd_ipd_put(env, bag, ipd); + LINVRNT(osd_invariant(obj)); + RETURN(rc); +} + +/** + * Calls ldiskfs_add_dot_dotdot() to add dot and dotdot entries + * into the directory.Also sets flags into osd object to + * indicate dot and dotdot are created. This is required for + * interoperability mode (b11826) + * + * \param dir directory for dot and dotdot fixup. 
+ * \param obj child object for linking + * + * \retval 0, on success + * \retval -ve, on error + */ +static int osd_add_dot_dotdot(struct osd_thread_info *info, + struct osd_object *dir, + struct osd_object *obj, const char *name, + struct thandle *th) +{ + struct inode *parent_dir = obj->oo_inode; + struct inode *inode = dir->oo_inode; + struct osd_thandle *oth; + int result = 0; + + oth = container_of(th, struct osd_thandle, ot_super); + LASSERT(oth->ot_handle->h_transaction != NULL); + LASSERT(S_ISDIR(dir->oo_inode->i_mode)); + + if (strcmp(name, dot) == 0) { + if (dir->oo_compat_dot_created) { + result = -EEXIST; + } else { + LASSERT(obj == dir); + dir->oo_compat_dot_created = 1; + result = 0; + } + } else if (strcmp(name, dotdot) == 0) { + if (!dir->oo_compat_dot_created) + return -EINVAL; + if (dir->oo_compat_dotdot_created) + return __osd_ea_add_rec(info, dir, obj, name, th); + + result = ldiskfs_add_dot_dotdot(oth->ot_handle, parent_dir, inode); + if (result == 0) + dir->oo_compat_dotdot_created = 1; + } + + return result; +} + +/** + * Calls ldiskfs_add_entry() to add a directory entry + * into the directory. This is required for + * interoperability mode (b11826) + * + * \retval 0, on success + * \retval -ve, on error + */ +static int __osd_ea_add_rec(struct osd_thread_info *info, + struct osd_object *pobj, + struct osd_object *cobj, + const char *name, + struct thandle *th) +{ + struct dentry *child; + struct osd_thandle *oth; + struct inode *cinode = cobj->oo_inode; + int rc; + + oth = container_of(th, struct osd_thandle, ot_super); + LASSERT(oth->ot_handle != NULL); + LASSERT(oth->ot_handle->h_transaction != NULL); + + child = osd_child_dentry_get(info->oti_env, pobj, name, strlen(name)); + rc = ldiskfs_add_entry(oth->ot_handle, child, cinode); + + RETURN(rc); +} + +/** + * Calls the appropriate osd_add* function and returns the + * value returned by that function. + */ +static int osd_ea_add_rec(const struct lu_env *env, + struct osd_object *pobj, + struct osd_object *cobj, + const char *name, + struct thandle *th) +{ + struct osd_thread_info *info = osd_oti_get(env); + int rc; + + if (name[0] == '.' && (name[1] == '\0' || (name[1] == '.' && + name[2] == '\0'))) + rc = osd_add_dot_dotdot(info, pobj, cobj, name, th); + else + rc = __osd_ea_add_rec(info, pobj, cobj, name, th); + + return rc; +} + +/** + * Calls ->lookup() to find the dentry. From the dentry it gets the inode and + * reads the inode's EA to get the fid.
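
The name test in osd_ea_add_rec() above must match "." and ".." exactly, never a regular name that merely begins with a dot. A stand-alone sketch of that predicate (plain C, no kernel types):

#include <stdio.h>

/* Same predicate osd_ea_add_rec() uses: the name is "." or ".."
 * exactly, not just a name that starts with dots. */
static int is_dot_or_dotdot(const char *name)
{
        return name[0] == '.' &&
               (name[1] == '\0' || (name[1] == '.' && name[2] == '\0'));
}

int main(void)
{
        const char *names[] = { ".", "..", ".hidden", "..data", "file" };
        unsigned int i;

        for (i = 0; i < sizeof(names) / sizeof(names[0]); i++)
                printf("%-7s -> %s\n", names[i],
                       is_dot_or_dotdot(names[i]) ? "dot_dotdot path"
                                                  : "regular entry");
        return 0;
}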
This is required for interoperability + * mode (b11826) + * + * \retval 0, on success + * \retval -ve, on error + */ +static int osd_ea_lookup_rec(const struct lu_env *env, struct osd_object *obj, + struct dt_rec *rec, const struct dt_key *key) +{ + struct inode *dir = obj->oo_inode; + struct osd_thread_info *info = osd_oti_get(env); + struct dentry *dentry; + struct osd_device *dev = osd_dev(obj->oo_dt.do_lu.lo_dev); + struct osd_inode_id *id = &info->oti_id; + struct ldiskfs_dir_entry_2 *de; + struct buffer_head *bh; + struct inode *inode; + int ino; + int rc; + + LASSERT(dir->i_op != NULL && dir->i_op->lookup != NULL); + + dentry = osd_child_dentry_get(env, obj, + (char *)key, strlen((char *)key)); + bh = ldiskfs_find_entry(dentry, &de); + if (bh) { + ino = le32_to_cpu(de->inode); + brelse(bh); + id->oii_ino = ino; + id->oii_gen = OSD_OII_NOGEN; + + inode = osd_iget(info, dev, id); + if (!IS_ERR(inode)) { + dentry->d_inode = inode; + + rc = osd_ea_fid_get(env, dentry, rec); + iput(inode); + } else + rc = -ENOENT; + } else + rc = -ENOENT; + + RETURN (rc); +} + +/** + * Find the osd object for given fid. + * + * \param fid, need to find the osd object having this fid + * + * \retval osd_object, on success + * \retval -ve, on error + */ +struct osd_object *osd_object_find(const struct lu_env *env, + struct dt_object *dt, + const struct lu_fid *fid) +{ + struct lu_device *ludev = dt->do_lu.lo_dev; + struct osd_object *child = NULL; + struct lu_object *luch; + struct lu_object *lo; + + luch = lu_object_find(env, ludev, fid, NULL); + if (!IS_ERR(luch)) { + if (lu_object_exists(luch)) { + lo = lu_object_locate(luch->lo_header, ludev->ld_type); + if (lo != NULL) + child = osd_obj(lo); + else + LU_OBJECT_DEBUG(D_ERROR, env, luch, + "lu_object can't be located" + ""DFID"\n", PFID(fid)); + + if (child == NULL) { + lu_object_put(env, luch); + CERROR("Unable to get osd_object\n"); + child = ERR_PTR(-ENOENT); + } + } else { + LU_OBJECT_DEBUG(D_ERROR, env, luch, + "lu_object does not exists "DFID"\n", + PFID(fid)); + child = ERR_PTR(-ENOENT); + } + } else + child = (void *)luch; + + return child; +} + +/** + * Put the osd object once done with it. + * + * \param obj, osd object that needs to be put + */ +static inline void osd_object_put(const struct lu_env *env, + struct osd_object *obj) +{ + lu_object_put(env, &obj->oo_dt.do_lu); +} + +/** + * Index add function for interoperability mode (b11826). + * It will add the directory entry.This entry is needed to + * maintain name->fid mapping. + * + * \param key, it is key i.e. file entry to be inserted + * \param rec, it is value of given key i.e. 
fid + * + * \retval 0, on success + * \retval -ve, on error + */ +static int osd_index_ea_insert(const struct lu_env *env, struct dt_object *dt, + const struct dt_rec *rec, + const struct dt_key *key, struct thandle *th, + struct lustre_capa *capa, int ignore_quota) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct lu_fid *fid = &osd_oti_get(env)->oti_fid; + const struct lu_fid_pack *pack = (const struct lu_fid_pack *)rec; + const char *name = (const char *)key; + struct osd_object *child; +#ifdef HAVE_QUOTA_SUPPORT + cfs_cap_t save = current->cap_effective; +#endif int rc; ENTRY; - LINVRNT(osd_invariant(obj)); + LASSERT(osd_invariant(obj)); LASSERT(dt_object_exists(dt)); - LASSERT(bag->ic_object == obj->oo_inode); LASSERT(th != NULL); if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_INSERT)) - return -EACCES; + RETURN(-EACCES); - ipd = osd_ipd_get(env, bag); - if (unlikely(ipd == NULL)) - RETURN(-ENOMEM); + rc = fid_unpack(pack, fid); + if (rc != 0) + RETURN(rc); + child = osd_object_find(env, dt, fid); + if (!IS_ERR(child)) { +#ifdef HAVE_QUOTA_SUPPORT + if (ignore_quota) + current->cap_effective |= CFS_CAP_SYS_RESOURCE_MASK; + else + current->cap_effective &= ~CFS_CAP_SYS_RESOURCE_MASK; +#endif + rc = osd_ea_add_rec(env, obj, child, name, th); - oh = container_of0(th, struct osd_thandle, ot_super); - LASSERT(oh->ot_handle != NULL); - LASSERT(oh->ot_handle->h_transaction != NULL); - rc = iam_insert(oh->ot_handle, bag, (const struct iam_key *)key, - (struct iam_rec *)rec, ipd); - osd_ipd_put(env, bag, ipd); - LINVRNT(osd_invariant(obj)); +#ifdef HAVE_QUOTA_SUPPORT + current->cap_effective = save; +#endif + osd_object_put(env, child); + } else { + rc = PTR_ERR(child); + } + + LASSERT(osd_invariant(obj)); RETURN(rc); } -/* - * Iterator operations. +/** + * Initialize osd Iterator for given osd index object. + * + * \param dt osd index object */ -struct osd_it { - struct osd_object *oi_obj; - struct iam_path_descr *oi_ipd; - struct iam_iterator oi_it; -}; -static struct dt_it *osd_it_init(const struct lu_env *env, - struct dt_object *dt, int writable, +static struct dt_it *osd_it_iam_init(const struct lu_env *env, + struct dt_object *dt, struct lustre_capa *capa) { - struct osd_it *it; + struct osd_it_iam *it; + struct osd_thread_info *oti = osd_oti_get(env); struct osd_object *obj = osd_dt_obj(dt); struct lu_object *lo = &dt->do_lu; struct iam_path_descr *ipd; struct iam_container *bag = &obj->oo_dir->od_container; - __u32 flags; LASSERT(lu_object_exists(lo)); - if (osd_object_auth(env, dt, capa, writable ? CAPA_OPC_BODY_WRITE : - CAPA_OPC_BODY_READ)) + if (osd_object_auth(env, dt, capa, CAPA_OPC_BODY_READ)) return ERR_PTR(-EACCES); - flags = writable ? IAM_IT_MOVE|IAM_IT_WRITE : IAM_IT_MOVE; - OBD_ALLOC_PTR(it); - if (it != NULL) { - /* - * XXX: as ipd is allocated within osd_thread_info, assignment - * below implies that iterator usage is confined within single - * environment. - */ - ipd = osd_ipd_get(env, bag); - if (likely(ipd != NULL)) { - it->oi_obj = obj; - it->oi_ipd = ipd; - lu_object_get(lo); - iam_it_init(&it->oi_it, bag, flags, ipd); - return (struct dt_it *)it; - } else - OBD_FREE_PTR(it); + it = &oti->oti_it; + ipd = osd_it_ipd_get(env, bag); + if (likely(ipd != NULL)) { + it->oi_obj = obj; + it->oi_ipd = ipd; + lu_object_get(lo); + iam_it_init(&it->oi_it, bag, IAM_IT_MOVE, ipd); + return (struct dt_it *)it; } return ERR_PTR(-ENOMEM); } -static void osd_it_fini(const struct lu_env *env, struct dt_it *di) +/** + * free given Iterator. 
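
Both osd_index_iam_insert() and osd_index_ea_insert() above bracket the actual insertion with the same HAVE_QUOTA_SUPPORT pattern: save current->cap_effective, set or clear the resource capability depending on ignore_quota, and unconditionally restore the saved value afterwards. A user-space model of that save/mask/restore idiom follows; the global word and the mask value are stand-ins for the per-task capability set, not the CFS definitions:

#include <stdio.h>
#include <stdint.h>

/* stand-in for CFS_CAP_SYS_RESOURCE_MASK (value is illustrative) */
#define SYS_RESOURCE_MASK (1u << 24)

/* stand-in for the per-task current->cap_effective word */
static uint32_t cap_effective = 0x00ffffffu & ~SYS_RESOURCE_MASK;

static int do_insert(void)
{
        /* the quota code would test the capability at allocation time */
        printf("insert, quota %s\n",
               (cap_effective & SYS_RESOURCE_MASK) ? "bypassed" : "enforced");
        return 0;
}

static int index_insert(int ignore_quota)
{
        uint32_t save = cap_effective;              /* save */
        int rc;

        if (ignore_quota)
                cap_effective |= SYS_RESOURCE_MASK;     /* exempt from quota */
        else
                cap_effective &= ~SYS_RESOURCE_MASK;    /* subject to quota */

        rc = do_insert();
        cap_effective = save;                       /* always restore */
        return rc;
}

int main(void)
{
        index_insert(0);
        index_insert(1);
        return 0;
}

Restoring the saved word unconditionally, rather than inverting the earlier change, keeps the function safe even when the caller already held the capability.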
+ */ + +static void osd_it_iam_fini(const struct lu_env *env, struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; struct osd_object *obj = it->oi_obj; iam_it_fini(&it->oi_it); osd_ipd_put(env, &obj->oo_dir->od_container, it->oi_ipd); lu_object_put(env, &obj->oo_dt.do_lu); - OBD_FREE_PTR(it); } -static int osd_it_get(const struct lu_env *env, +/** + * Move Iterator to record specified by \a key + * + * \param di osd iterator + * \param key key for index + * + * \retval +ve di points to record with least key not larger than key + * \retval 0 di points to exact matched key + * \retval -ve failure + */ + +static int osd_it_iam_get(const struct lu_env *env, struct dt_it *di, const struct dt_key *key) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return iam_it_get(&it->oi_it, (const struct iam_key *)key); } -static void osd_it_put(const struct lu_env *env, struct dt_it *di) +/** + * Release Iterator + * + * \param di osd iterator + */ + +static void osd_it_iam_put(const struct lu_env *env, struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; iam_it_put(&it->oi_it); } -static int osd_it_next(const struct lu_env *env, struct dt_it *di) +/** + * Move iterator by one record + * + * \param di osd iterator + * + * \retval +1 end of container reached + * \retval 0 success + * \retval -ve failure + */ + +static int osd_it_iam_next(const struct lu_env *env, struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return iam_it_next(&it->oi_it); } -static int osd_it_del(const struct lu_env *env, struct dt_it *di, - struct thandle *th) -{ - struct osd_it *it = (struct osd_it *)di; - struct osd_thandle *oh; - - LASSERT(th != NULL); - - oh = container_of0(th, struct osd_thandle, ot_super); - LASSERT(oh->ot_handle != NULL); - LASSERT(oh->ot_handle->h_transaction != NULL); - - return iam_it_rec_delete(oh->ot_handle, &it->oi_it); -} +/** + * Return pointer to the key under iterator. + */ -static struct dt_key *osd_it_key(const struct lu_env *env, +static struct dt_key *osd_it_iam_key(const struct lu_env *env, const struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return (struct dt_key *)iam_it_key_get(&it->oi_it); } -static int osd_it_key_size(const struct lu_env *env, const struct dt_it *di) +/** + * Return size of key under iterator (in bytes) + */ + +static int osd_it_iam_key_size(const struct lu_env *env, const struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return iam_it_key_size(&it->oi_it); } -static struct dt_rec *osd_it_rec(const struct lu_env *env, +/** + * Return pointer to the record under iterator. + */ +static struct dt_rec *osd_it_iam_rec(const struct lu_env *env, const struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return (struct dt_rec *)iam_it_rec_get(&it->oi_it); } -static __u64 osd_it_store(const struct lu_env *env, const struct dt_it *di) +/** + * Returns cookie for current Iterator position. 
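
The renamed iterator methods above keep the dt_it calling convention: init, get to position (0 for an exact hit, +ve for the nearest key), next until it returns +1, then put/fini. A toy in-memory index showing how a caller is expected to drive those return codes (everything here is a stand-in, no IAM involved):

#include <stdio.h>
#include <string.h>

/* Toy sorted index standing in for an IAM container. */
static const char *keys[] = { "alpha", "beta", "delta" };
static const int nkeys = 3;

struct it { int pos; };

/* get: position at first key >= wanted; 0 = exact, +1 = nearest, -1 = none */
static int it_get(struct it *it, const char *want)
{
        for (it->pos = 0; it->pos < nkeys; it->pos++) {
                int cmp = strcmp(keys[it->pos], want);
                if (cmp == 0)
                        return 0;
                if (cmp > 0)
                        return +1;
        }
        return -1;
}

/* next: 0 = moved, +1 = end of container (same convention as osd_it_iam_next) */
static int it_next(struct it *it)
{
        return ++it->pos < nkeys ? 0 : +1;
}

int main(void)
{
        struct it it;

        /* typical caller sequence: get, then iterate with next until +1 */
        if (it_get(&it, "b") >= 0) {
                do
                        printf("key: %s\n", keys[it.pos]);
                while (it_next(&it) == 0);
        }
        return 0;
}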
+ */ +static __u64 osd_it_iam_store(const struct lu_env *env, const struct dt_it *di) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return iam_it_store(&it->oi_it); } -static int osd_it_load(const struct lu_env *env, +/** + * Restore iterator from cookie. + * + * \param di osd iterator + * \param hash Iterator location cookie + * + * \retval +ve di points to record with least key not larger than key. + * \retval 0 di points to exact matched key + * \retval -ve failure + */ + +static int osd_it_iam_load(const struct lu_env *env, const struct dt_it *di, __u64 hash) { - struct osd_it *it = (struct osd_it *)di; + struct osd_it_iam *it = (struct osd_it_iam *)di; return iam_it_load(&it->oi_it, hash); } -static const struct dt_index_operations osd_index_ops = { - .dio_lookup = osd_index_lookup, - .dio_insert = osd_index_insert, - .dio_delete = osd_index_delete, +static const struct dt_index_operations osd_index_iam_ops = { + .dio_lookup = osd_index_iam_lookup, + .dio_insert = osd_index_iam_insert, + .dio_delete = osd_index_iam_delete, .dio_it = { - .init = osd_it_init, - .fini = osd_it_fini, - .get = osd_it_get, - .put = osd_it_put, - .del = osd_it_del, - .next = osd_it_next, - .key = osd_it_key, - .key_size = osd_it_key_size, - .rec = osd_it_rec, - .store = osd_it_store, - .load = osd_it_load + .init = osd_it_iam_init, + .fini = osd_it_iam_fini, + .get = osd_it_iam_get, + .put = osd_it_iam_put, + .next = osd_it_iam_next, + .key = osd_it_iam_key, + .key_size = osd_it_iam_key_size, + .rec = osd_it_iam_rec, + .store = osd_it_iam_store, + .load = osd_it_iam_load } }; -static int osd_index_compat_delete(const struct lu_env *env, - struct dt_object *dt, - const struct dt_key *key, - struct thandle *handle, - struct lustre_capa *capa) +/** + * Creates or initializes the iterator context. + * + * \retval struct osd_it_ea, iterator structure on success + * + */ +static struct dt_it *osd_it_ea_init(const struct lu_env *env, + struct dt_object *dt, + struct lustre_capa *capa) +{ + struct osd_object *obj = osd_dt_obj(dt); + struct osd_thread_info *info = osd_oti_get(env); + struct osd_it_ea *it = &info->oti_it_ea; + struct lu_object *lo = &dt->do_lu; + struct dentry *obj_dentry = &info->oti_it_dentry; + ENTRY; + LASSERT(lu_object_exists(lo)); + + obj_dentry->d_inode = obj->oo_inode; + obj_dentry->d_sb = osd_sb(osd_obj2dev(obj)); + obj_dentry->d_name.hash = 0; + + it->oie_namelen = 0; + it->oie_curr_pos = 0; + it->oie_next_pos = 0; + it->oie_obj = obj; + it->oie_file.f_dentry = obj_dentry; + it->oie_file.f_mapping = obj->oo_inode->i_mapping; + it->oie_file.f_op = obj->oo_inode->i_fop; + it->oie_file.private_data = NULL; + lu_object_get(lo); + + RETURN((struct dt_it*) it); +} + +/** + * Destroys or finalizes the iterator context. + * + * \param di, struct osd_it_ea, iterator structure to be destroyed + */ +static void osd_it_ea_fini(const struct lu_env *env, struct dt_it *di) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_it_ea *it = (struct osd_it_ea *)di; + struct osd_object *obj = it->oie_obj; + - LASSERT(handle != NULL); - LASSERT(S_ISDIR(obj->oo_inode->i_mode)); ENTRY; + lu_object_put(env, &obj->oo_dt.do_lu); + EXIT; +} -#if 0 - if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_DELETE)) - RETURN(-EACCES); -#endif +/** + * It positions the iterator at the given key, so that the next lookup continues + * from that key. It is similar to dio_it->load(), but based on a key + * rather than a file position.
+ * + * As a special convention, osd_it_ea_get(env, di, "") has to rewind iterator + * to the beginning. + * + * TODO: Presently return +1 considering it is only used by mdd_dir_is_empty(). + */ +static int osd_it_ea_get(const struct lu_env *env, + struct dt_it *di, const struct dt_key *key) +{ + struct osd_it_ea *it = (struct osd_it_ea *)di; - RETURN(-EOPNOTSUPP); + ENTRY; + LASSERT(((const char *)key)[0] == '\0'); + it->oie_namelen = 0; + it->oie_curr_pos = 0; + it->oie_next_pos = 0; + + RETURN(+1); } -/* - * Compatibility index operations. +/** + * Does nothing */ +static void osd_it_ea_put(const struct lu_env *env, struct dt_it *di) +{ +} - -static void osd_build_pack(const struct lu_env *env, struct osd_device *osd, - struct dentry *dentry, struct lu_fid_pack *pack) +/** + * It is called internally by ->readdir(). It fills the + * iterator's in-memory data structure with required + * information i.e. name, namelen, rec_size etc. + * + * \param buf, in which information to be filled in. + * \param name, name of the file in given dir + * + * \retval 0, on success + * \retval 1, on buffer full + */ +static int osd_ldiskfs_filldir(char *buf, const char *name, int namelen, + loff_t offset, ino_t ino, + unsigned int d_type) { - struct inode *inode = dentry->d_inode; - struct lu_fid *fid = &osd_oti_get(env)->oti_fid; + struct osd_it_ea *it = (struct osd_it_ea *)buf; + struct dirent64 *dirent = &it->oie_dirent64; + int reclen = LDISKFS_DIR_REC_LEN(namelen); - lu_igif_build(fid, inode->i_ino, inode->i_generation); - fid_cpu_to_be(fid, fid); - pack->fp_len = sizeof *fid + 1; - memcpy(pack->fp_area, fid, sizeof *fid); + + ENTRY; + if (it->oie_namelen) + RETURN(-ENOENT); + + if (namelen == 0 || namelen > LDISKFS_NAME_LEN) + RETURN(-EIO); + + strncpy(dirent->d_name, name, LDISKFS_NAME_LEN); + dirent->d_name[namelen] = 0; + dirent->d_ino = ino; + dirent->d_off = offset; + dirent->d_reclen = reclen; + it->oie_namelen = namelen; + it->oie_curr_pos = offset; + + RETURN(0); } -static int osd_index_compat_lookup(const struct lu_env *env, - struct dt_object *dt, - struct dt_rec *rec, const struct dt_key *key, - struct lustre_capa *capa) +/** + * Calls ->readdir() to load a directory entry at a time + * and stored it in iterator's in-memory data structure. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval 0, on success + * \retval -ve, on error + */ +int osd_ldiskfs_it_fill(const struct dt_it *di) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_it_ea *it = (struct osd_it_ea *)di; + struct osd_object *obj = it->oie_obj; + struct inode *inode = obj->oo_inode; + int result = 0; - struct osd_device *osd = osd_obj2dev(obj); - struct osd_thread_info *info = osd_oti_get(env); - struct inode *dir; + ENTRY; + it->oie_namelen = 0; + it->oie_file.f_pos = it->oie_curr_pos; - int result; + result = inode->i_fop->readdir(&it->oie_file, it, + (filldir_t) osd_ldiskfs_filldir); - /* - * XXX temporary solution. 
- */ - struct dentry *dentry; - struct dentry *parent; + it->oie_next_pos = it->oie_file.f_pos; - LINVRNT(osd_invariant(obj)); - LASSERT(S_ISDIR(obj->oo_inode->i_mode)); - LASSERT(osd_has_index(obj)); + if(!result && it->oie_namelen == 0) + result = -EIO; - if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_LOOKUP)) - return -EACCES; + RETURN(result); +} - info->oti_str.name = (const char *)key; - info->oti_str.len = strlen((const char *)key); +/** + * It calls osd_ldiskfs_it_fill() which will use ->readdir() + * to load a directory entry at a time and stored it in + * iterator's in-memory data structure. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval +ve, iterator reached to end + * \retval 0, iterator not reached to end + * \retval -ve, on error + */ +static int osd_it_ea_next(const struct lu_env *env, struct dt_it *di) +{ + struct osd_it_ea *it = (struct osd_it_ea *)di; + int rc; - dir = obj->oo_inode; - LASSERT(dir->i_op != NULL && dir->i_op->lookup != NULL); + ENTRY; + it->oie_curr_pos = it->oie_next_pos; - parent = d_alloc_root(dir); - if (parent == NULL) - return -ENOMEM; - igrab(dir); - dentry = d_alloc(parent, &info->oti_str); - if (dentry != NULL) { - struct dentry *d; + if (it->oie_curr_pos == LDISKFS_HTREE_EOF) + rc = +1; + else + rc = osd_ldiskfs_it_fill(di); - /* - * XXX passing NULL for nameidata should work for - * ext3/ldiskfs. - */ - d = dir->i_op->lookup(dir, dentry, NULL); - if (d == NULL) { - /* - * normal case, result is in @dentry. - */ - if (dentry->d_inode != NULL) { - osd_build_pack(env, osd, dentry, - (struct lu_fid_pack *)rec); - result = 0; - } else - result = -ENOENT; - } else { - /* What? Disconnected alias? Ppheeeww... */ - CERROR("Aliasing where not expected\n"); - result = -EIO; - dput(d); - } - dput(dentry); - } else - result = -ENOMEM; - dput(parent); - LINVRNT(osd_invariant(obj)); - return result; + RETURN(rc); } -static int osd_add_rec(struct osd_thread_info *info, struct osd_device *dev, - struct inode *dir, struct inode *inode, const char *name) +/** + * Returns the key at current position from iterator's in memory structure. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval key i.e. struct dt_key on success + */ +static struct dt_key *osd_it_ea_key(const struct lu_env *env, + const struct dt_it *di) { - struct dentry *old; - struct dentry *new; - struct dentry *parent; + struct osd_it_ea *it = (struct osd_it_ea *)di; + ENTRY; + RETURN((struct dt_key *)it->oie_dirent64.d_name); +} - int result; +/** + * Returns the key's size at current position from iterator's in memory structure. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval key_size i.e. 
number of bytes in the key, on success + */ +static int osd_it_ea_key_size(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_it_ea *it = (struct osd_it_ea *)di; + ENTRY; + RETURN(it->oie_namelen); +} - info->oti_str.name = name; - info->oti_str.len = strlen(name); - - LASSERT(atomic_read(&dir->i_count) > 0); - result = -ENOMEM; - old = d_alloc(dev->od_obj_area, &info->oti_str); - if (old != NULL) { - d_instantiate(old, inode); - igrab(inode); - LASSERT(atomic_read(&dir->i_count) > 0); - parent = d_alloc_root(dir); - if (parent != NULL) { - igrab(dir); - LASSERT(atomic_read(&dir->i_count) > 1); - new = d_alloc(parent, &info->oti_str); - LASSERT(atomic_read(&dir->i_count) > 1); - if (new != NULL) { - LASSERT(atomic_read(&dir->i_count) > 1); - result = dir->i_op->link(old, dir, new); - LASSERT(atomic_read(&dir->i_count) > 1); - dput(new); - LASSERT(atomic_read(&dir->i_count) > 1); - } - LASSERT(atomic_read(&dir->i_count) > 1); - dput(parent); - LASSERT(atomic_read(&dir->i_count) > 0); - } - dput(old); +/** + * Returns the value (i.e. fid/igif) at current position from iterator's + * in memory structure. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval value i.e. struct dt_rec on success + */ +static struct dt_rec *osd_it_ea_rec(const struct lu_env *env, + const struct dt_it *di) +{ + struct osd_it_ea *it = (struct osd_it_ea *)di; + struct osd_object *obj = it->oie_obj; + struct osd_thread_info *info = osd_oti_get(env); + struct osd_inode_id *id = &info->oti_id; + struct lu_fid_pack *rec = &info->oti_pack; + struct lu_device *ldev = obj->oo_dt.do_lu.lo_dev; + struct dentry *dentry = &info->oti_child_dentry; + struct osd_device *dev; + struct inode *inode; + int rc; + + ENTRY; + dev = osd_dev(ldev); + id->oii_ino = it->oie_dirent64.d_ino; + id->oii_gen = OSD_OII_NOGEN; + inode = osd_iget(info, dev, id); + if (!IS_ERR(inode)) { + dentry->d_inode = inode; + LASSERT(dentry->d_inode->i_sb == osd_sb(dev)); + } else { + CERROR("Error getting inode for ino = %d", id->oii_ino); + RETURN((struct dt_rec *) PTR_ERR(inode)); } - LASSERT(atomic_read(&dir->i_count) > 0); - return result; + + rc = osd_ea_fid_get(env, dentry, (struct dt_rec*) rec); + + iput(inode); + RETURN((struct dt_rec *)rec); + } +/** + * Returns a cookie for the current position of the iterator head, so that + * the user can use this cookie to load/start the iterator next time. + * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval cookie for current position, on success + */ +static __u64 osd_it_ea_store(const struct lu_env *env, const struct dt_it *di) +{ + struct osd_it_ea *it = (struct osd_it_ea *)di; + ENTRY; + RETURN(it->oie_curr_pos); +} -/* - * XXX Temporary stuff. +/** + * It calls osd_ldiskfs_it_fill() which will use ->readdir() + * to load a directory entry at a time and store it + * in the iterator's in-memory data structure.
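
osd_ldiskfs_filldir(), osd_ldiskfs_it_fill() and osd_it_ea_next() above cooperate: ->readdir() invokes the filldir callback once per entry, the callback captures exactly one entry and returns non-zero so enumeration stops, and next() replays ->readdir() from the saved file position until the EOF cookie is reached. A user-space model of that whole loop; the fake directory, the callback signature and the EOF constant are simplified stand-ins, not the ldiskfs API:

#include <stdio.h>
#include <string.h>

#define FAKE_EOF 0x7fffffffL        /* stand-in for LDISKFS_HTREE_EOF */

static const char *names[] = { "a.txt", "b.txt", "c.txt" };
#define NENTRIES 3

struct ea_it {                       /* models struct osd_it_ea */
        char name[256];
        int  namelen;                /* 0: nothing captured yet */
        long curr_pos, next_pos;
};

/* filldir-style callback: keep the first entry, then say "stop" */
static int capture_one(struct ea_it *it, const char *name, long pos)
{
        if (it->namelen)             /* already have one: stop readdir */
                return -1;
        it->namelen = (int)strlen(name);
        memcpy(it->name, name, it->namelen + 1);
        it->curr_pos = pos;
        return 0;
}

/* models osd_ldiskfs_it_fill(): replay ->readdir() from curr_pos,
 * remember where it stopped in next_pos */
static int it_fill(struct ea_it *it)
{
        long pos = it->curr_pos;

        it->namelen = 0;
        while (pos < NENTRIES && capture_one(it, names[pos], pos) == 0)
                pos++;
        it->next_pos = pos < NENTRIES ? pos : FAKE_EOF;
        return it->namelen ? 0 : -1;     /* -EIO path in the real code */
}

/* models osd_it_ea_next(): +1 once the EOF cookie is reached */
static int it_next(struct ea_it *it)
{
        it->curr_pos = it->next_pos;
        if (it->curr_pos == FAKE_EOF)
                return +1;
        return it_fill(it);
}

int main(void)
{
        struct ea_it it = { .namelen = 0, .curr_pos = 0, .next_pos = 0 };

        if (it_fill(&it) == 0)
                do
                        printf("%s (store cookie %ld)\n", it.name, it.curr_pos);
                while (it_next(&it) == 0);
        return 0;
}

Because the only state carried between calls is the position cookie, store()/load() fall out for free: store() hands back curr_pos and load() refills from it.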
+ * + * \param di, struct osd_it_ea, iterator's in memory structure + * + * \retval +ve, on success + * \retval -ve, on error */ -static int osd_index_compat_insert(const struct lu_env *env, - struct dt_object *dt, - const struct dt_rec *rec, - const struct dt_key *key, struct thandle *th, - struct lustre_capa *capa) +static int osd_it_ea_load(const struct lu_env *env, + const struct dt_it *di, __u64 hash) { - struct osd_object *obj = osd_dt_obj(dt); + struct osd_it_ea *it = (struct osd_it_ea *)di; + int rc; + + ENTRY; + it->oie_curr_pos = it->oie_next_pos = hash; - const char *name = (const char *)key; + rc = osd_ldiskfs_it_fill(di); + if (rc == 0) + rc = +1; - struct lu_device *ludev = dt->do_lu.lo_dev; - struct lu_object *luch; + RETURN(rc); +} +/** + * Index and Iterator operations for interoperability + * mode (i.e. to run 2.0 mds on 1.8 disk) (b11826) + */ +static const struct dt_index_operations osd_index_ea_ops = { + .dio_lookup = osd_index_ea_lookup, + .dio_insert = osd_index_ea_insert, + .dio_delete = osd_index_ea_delete, + .dio_it = { + .init = osd_it_ea_init, + .fini = osd_it_ea_fini, + .get = osd_it_ea_get, + .put = osd_it_ea_put, + .next = osd_it_ea_next, + .key = osd_it_ea_key, + .key_size = osd_it_ea_key_size, + .rec = osd_it_ea_rec, + .store = osd_it_ea_store, + .load = osd_it_ea_load + } +}; - struct osd_thread_info *info = osd_oti_get(env); - const struct lu_fid_pack *pack = (const struct lu_fid_pack *)rec; - struct lu_fid *fid = &osd_oti_get(env)->oti_fid; +/** + * Index lookup function for interoperability mode (b11826). + * + * \param key, key i.e. file name to be searched + * + * \retval +ve, on success + * \retval -ve, on error + */ +static int osd_index_ea_lookup(const struct lu_env *env, struct dt_object *dt, + struct dt_rec *rec, const struct dt_key *key, + struct lustre_capa *capa) +{ + struct osd_object *obj = osd_dt_obj(dt); + int rc = 0; - int result; + ENTRY; LASSERT(S_ISDIR(obj->oo_inode->i_mode)); LINVRNT(osd_invariant(obj)); - LASSERT(th != NULL); - if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_INSERT)) + if (osd_object_auth(env, dt, capa, CAPA_OPC_INDEX_LOOKUP)) return -EACCES; - result = fid_unpack(pack, fid); - if (result != 0) - return result; + rc = osd_ea_lookup_rec(env, obj, rec, key); - luch = lu_object_find(env, ludev, fid, NULL); - if (!IS_ERR(luch)) { - if (lu_object_exists(luch)) { - struct osd_object *child; - - child = osd_obj(lu_object_locate(luch->lo_header, - ludev->ld_type)); - if (child != NULL) - result = osd_add_rec(info, osd_obj2dev(obj), - obj->oo_inode, - child->oo_inode, name); - else { - CERROR("No osd slice.\n"); - result = -ENOENT; - } - LINVRNT(osd_invariant(obj)); - LINVRNT(osd_invariant(child)); - } else { - CERROR("Sorry.\n"); - result = -ENOENT; - } - lu_object_put(env, luch); - } else - result = PTR_ERR(luch); - LINVRNT(osd_invariant(obj)); - return result; + if (rc == 0) + rc = +1; + RETURN(rc); } -static const struct dt_index_operations osd_index_compat_ops = { - .dio_lookup = osd_index_compat_lookup, - .dio_insert = osd_index_compat_insert, - .dio_delete = osd_index_compat_delete -}; - /* type constructor/destructor: osd_type_init, osd_type_fini */ LU_TYPE_INIT_FINI(osd, &osd_key); @@ -2212,11 +3355,15 @@ static int osd_device_init(const struct lu_env *env, struct lu_device *d, const char *name, struct lu_device *next) { int rc; + struct lu_context *ctx; + /* context for commit hooks */ - rc = lu_context_init(&osd_dev(d)->od_env_for_commit.le_ctx, - LCT_MD_THREAD); - if (rc == 0) + ctx = 
&osd_dev(d)->od_env_for_commit.le_ctx; + rc = lu_context_init(ctx, LCT_MD_THREAD|LCT_REMEMBER|LCT_NOREF); + if (rc == 0) { rc = osd_procfs_init(osd_dev(d), name); + ctx->lc_cookie = 0x3; + } return rc; } @@ -2225,7 +3372,7 @@ static int osd_shutdown(const struct lu_env *env, struct osd_device *o) struct osd_thread_info *info = osd_oti_get(env); ENTRY; if (o->od_obj_area != NULL) { - dput(o->od_obj_area); + lu_object_put(env, &o->od_obj_area->do_lu); o->od_obj_area = NULL; } osd_oi_fini(info, &o->od_oi); @@ -2238,8 +3385,8 @@ static int osd_mount(const struct lu_env *env, { struct lustre_mount_info *lmi; const char *dev = lustre_cfg_string(cfg, 0); - struct osd_thread_info *info = osd_oti_get(env); - int result; + struct lustre_disk_data *ldd; + struct lustre_sb_info *lsi; ENTRY; @@ -2259,20 +3406,17 @@ static int osd_mount(const struct lu_env *env, /* save lustre_mount_info in dt_device */ o->od_mount = lmi; - result = osd_oi_init(info, &o->od_oi, &o->od_dt_dev); - if (result == 0) { - struct dentry *d; + lsi = s2lsi(lmi->lmi_sb); + ldd = lsi->lsi_ldd; - d = simple_mkdir(osd_sb(o)->s_root, lmi->lmi_mnt, "*OBJ-TEMP*", - 0777, 1); - if (!IS_ERR(d)) { - o->od_obj_area = d; - } else - result = PTR_ERR(d); - } - if (result != 0) - osd_shutdown(env, o); - RETURN(result); + if (ldd->ldd_flags & LDD_F_IAM_DIR) { + o->od_iop_mode = 0; + LCONSOLE_WARN("OSD: IAM mode enabled\n"); + } else + o->od_iop_mode = 1; + + o->od_obj_area = NULL; + RETURN(0); } static struct lu_device *osd_device_fini(const struct lu_env *env, @@ -2359,13 +3503,14 @@ static int osd_process_config(const struct lu_env *env, err = osd_shutdown(env, o); break; default: - err = -ENOTTY; + err = -ENOSYS; } RETURN(err); } + extern void ldiskfs_orphan_cleanup (struct super_block * sb, - struct ldiskfs_super_block * es); + struct ldiskfs_super_block * es); static int osd_recovery_complete(const struct lu_env *env, struct lu_device *d) @@ -2377,6 +3522,49 @@ static int osd_recovery_complete(const struct lu_env *env, RETURN(0); } +static int osd_prepare(const struct lu_env *env, + struct lu_device *pdev, + struct lu_device *dev) +{ + struct osd_device *osd = osd_dev(dev); + struct lustre_sb_info *lsi; + struct lustre_disk_data *ldd; + struct lustre_mount_info *lmi; + struct osd_thread_info *oti = osd_oti_get(env); + struct dt_object *d; + int result; + + ENTRY; + /* 1. initialize oi before any file create or file open */ + result = osd_oi_init(oti, &osd->od_oi, + &osd->od_dt_dev, lu2md_dev(pdev)); + if (result != 0) + RETURN(result); + + lmi = osd->od_mount; + lsi = s2lsi(lmi->lmi_sb); + ldd = lsi->lsi_ldd; + + /* 2. setup local objects */ + result = llo_local_objects_setup(env, lu2md_dev(pdev), lu2dt_dev(dev)); + if (result) + goto out; + + /* 3. open remote object dir */ + d = dt_store_open(env, lu2dt_dev(dev), "", + remote_obj_dir, &oti->oti_fid); + if (!IS_ERR(d)) { + osd->od_obj_area = d; + result = 0; + } else { + result = PTR_ERR(d); + osd->od_obj_area = NULL; + } + +out: + RETURN(result); +} + static struct inode *osd_iget(struct osd_thread_info *info, struct osd_device *dev, const struct osd_inode_id *id) @@ -2391,7 +3579,8 @@ static struct inode *osd_iget(struct osd_thread_info *info, CERROR("bad inode\n"); iput(inode); inode = ERR_PTR(-ENOENT); - } else if (inode->i_generation != id->oii_gen) { + } else if (id->oii_gen != OSD_OII_NOGEN && + inode->i_generation != id->oii_gen) { CERROR("stale inode\n"); iput(inode); inode = ERR_PTR(-ESTALE); @@ -2420,7 +3609,7 @@ static int osd_fid_lookup(const struct lu_env *env, * fids. 
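
The osd_iget() hunk above adds OSD_OII_NOGEN so that EA-mode lookups, which learn only an inode number from the directory entry, can skip the generation comparison, while a genuine mismatch still maps to -ESTALE. A sketch of that check with invented stand-in structs (the error numbers mirror -ENOENT/-ESTALE):

#include <stdio.h>

#define OSD_OII_NOGEN 0   /* "generation unknown" marker, as in the patch */

struct id        { unsigned ino, gen; };
struct disk_ino  { unsigned ino, generation; int bad; };

/* 0 = ok, -2 = -ENOENT (bad inode), -116 = -ESTALE (generation mismatch) */
static int iget_check(const struct disk_ino *inode, const struct id *id)
{
        if (inode->bad)
                return -2;
        if (id->gen != OSD_OII_NOGEN && inode->generation != id->gen)
                return -116;
        return 0;
}

int main(void)
{
        struct disk_ino disk = { .ino = 42, .generation = 7 };
        struct id from_oi    = { 42, 7 };             /* OI lookup: full id */
        struct id from_dent  = { 42, OSD_OII_NOGEN }; /* EA readdir: ino only */
        struct id stale      = { 42, 6 };

        printf("oi: %d  dirent: %d  stale: %d\n",
               iget_check(&disk, &from_oi),
               iget_check(&disk, &from_dent),
               iget_check(&disk, &stale));
        return 0;
}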
Unfortunately it is somewhat expensive (does a * cache-lookup). Disabling it for production/acceptance-testing. */ - LASSERT(1 || fid_is_local(ldev->ld_site, fid)); + LASSERT(1 || fid_is_local(env, ldev->ld_site, fid)); ENTRY; @@ -2438,6 +3627,10 @@ static int osd_fid_lookup(const struct lu_env *env, if (!IS_ERR(inode)) { obj->oo_inode = inode; LASSERT(obj->oo_inode->i_sb == osd_sb(dev)); + if (dev->od_iop_mode) { + obj->oo_compat_dot_created = 1; + obj->oo_compat_dotdot_created = 1; + } result = 0; } else /* @@ -2451,6 +3644,7 @@ static int osd_fid_lookup(const struct lu_env *env, } else if (result == -ENOENT) result = 0; LINVRNT(osd_invariant(obj)); + RETURN(result); } @@ -2550,7 +3744,8 @@ static const struct lu_object_operations osd_lu_obj_ops = { static const struct lu_device_operations osd_lu_ops = { .ldo_object_alloc = osd_object_alloc, .ldo_process_config = osd_process_config, - .ldo_recovery_complete = osd_recovery_complete + .ldo_recovery_complete = osd_recovery_complete, + .ldo_prepare = osd_prepare, }; static const struct lu_device_type_operations osd_device_type_ops = { @@ -2581,10 +3776,19 @@ static struct obd_ops osd_obd_device_ops = { .o_owner = THIS_MODULE }; +static struct lu_local_obj_desc llod_osd_rem_obj_dir = { + .llod_name = remote_obj_dir, + .llod_oid = OSD_REM_OBJ_DIR_OID, + .llod_is_index = 1, + .llod_feat = &dt_directory_features, +}; + static int __init osd_mod_init(void) { struct lprocfs_static_vars lvars; + osd_oi_mod_init(); + llo_local_obj_register(&llod_osd_rem_obj_dir); lprocfs_osd_init_vars(&lvars); return class_register_type(&osd_obd_device_ops, NULL, lvars.module_vars, LUSTRE_OSD_NAME, &osd_device_type); @@ -2592,6 +3796,7 @@ static int __init osd_mod_init(void) static void __exit osd_mod_exit(void) { + llo_local_obj_unregister(&llod_osd_rem_obj_dir); class_unregister_type(LUSTRE_OSD_NAME); } diff --git a/lustre/osd/osd_internal.h b/lustre/osd/osd_internal.h index 4a476db..8fd25f2 100644 --- a/lustre/osd/osd_internal.h +++ b/lustre/osd/osd_internal.h @@ -54,6 +54,8 @@ /* struct dentry */ #include #include +/* struct dirent64 */ +#include /* LUSTRE_OSD_NAME */ #include @@ -66,6 +68,17 @@ struct inode; +#define OSD_OII_NOGEN (0) +#define OSD_COUNTERS (0) + +#ifdef HAVE_QUOTA_SUPPORT +struct osd_ctxt { + __u32 oc_uid; + __u32 oc_gid; + __u32 oc_cap; +}; +#endif + /* * osd device. */ @@ -80,7 +93,7 @@ struct osd_device { * XXX temporary stuff for object index: directory where every object * is named by its fid. */ - struct dentry *od_obj_area; + struct dt_object *od_obj_area; /* Environment for transaction commit callback. * Currently, OSD is based on ext3/JBD. Transaction commit in ext3/JBD @@ -98,7 +111,7 @@ struct osd_device { __u32 od_capa_alg; struct lustre_capa_key *od_capa_keys; struct hlist_head *od_capa_hash; - + cfs_proc_dir_entry_t *od_proc_entry; struct lprocfs_stats *od_stats; /* @@ -107,23 +120,62 @@ struct osd_device { cfs_time_t od_osfs_age; struct kstatfs od_kstatfs; spinlock_t od_osfs_lock; + + /** + * The following flag indicates, if it is interop mode or not. + * It will be initialized, using mount param. + */ + __u32 od_iop_mode; }; +/** + * This is iterator's in-memory data structure in interoperability + * mode (i.e. 
iterator over ldiskfs style directory) + */ +struct osd_it_ea { + struct osd_object *oie_obj; + /** used in ldiskfs iterator, to stored file pointer */ + struct file oie_file; + /** used in ldiskfs iterator, to store directory entry */ + struct dirent64 oie_dirent64; + /** current file position */ + __u64 oie_curr_pos; + /** next file position */ + __u64 oie_next_pos; + /** namelen of the file */ + __u8 oie_namelen; + +}; + +/** + * Iterator's in-memory data structure for IAM mode. + */ +struct osd_it_iam { + struct osd_object *oi_obj; + struct iam_path_descr *oi_ipd; + struct iam_iterator oi_it; +}; struct osd_thread_info { const struct lu_env *oti_env; + /** + * used for index operations. + */ + struct dentry oti_obj_dentry; + struct dentry oti_child_dentry; + + /** dentry for Iterator context. */ + struct dentry oti_it_dentry; struct lu_fid oti_fid; struct osd_inode_id oti_id; /* * XXX temporary: for ->i_op calls. */ - struct qstr oti_str; struct txn_param oti_txn; /* * XXX temporary: fake dentry used by xattr calls. */ - struct dentry oti_dentry; struct timespec oti_time; /* * XXX temporary: fake struct file for osd_object_sync @@ -137,14 +189,46 @@ struct osd_thread_info { struct lu_fid_pack oti_pack; - /* union to guarantee that ->oti_ipd[] has proper alignment. */ + /** + * following ipd and it structures are used for osd_index_iam_lookup() + * these are defined separately as we might do index operation + * in open iterator session. + */ + + /** osd iterator context used for iterator session */ + + union { + struct osd_it_iam oti_it; + /** ldiskfs iterator data structure, see osd_it_ea_{init, fini} */ + struct osd_it_ea oti_it_ea; + }; + + + /** IAM iterator for index operation. */ + struct iam_iterator oti_idx_it; + + /** union to guarantee that ->oti_ipd[] has proper alignment. 
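
osd_thread_info below overlays the two iterator states, oti_it (IAM) and oti_it_ea (ldiskfs), in a union, which is safe only because a thread drives at most one directory iterator at a time, and it shrinks the per-thread context accordingly. A sketch of the space saving; the struct sizes are rough stand-ins, only the footprint matters:

#include <stdio.h>

/* rough stand-ins: only the footprint matters here, not the fields */
struct it_iam { void *obj, *ipd; char iter[256]; };
struct it_ea  { void *obj; char file[128]; char dirent[280]; long pos[2]; };

struct info_separate { struct it_iam iam; struct it_ea ea; };
struct info_shared   { union { struct it_iam iam; struct it_ea ea; } u; };

int main(void)
{
        /* a thread drives at most one directory iterator at a time,
         * so overlapping the two states is safe and saves memory */
        printf("separate: %zu bytes, shared: %zu bytes\n",
               sizeof(struct info_separate), sizeof(struct info_shared));
        return 0;
}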
*/ union { - char oti_ipd[DX_IPD_MAX_SIZE]; + char oti_it_ipd[DX_IPD_MAX_SIZE]; long long oti_alignment_lieutenant; }; + + union { + char oti_idx_ipd[DX_IPD_MAX_SIZE]; + long long oti_alignment_lieutenant_colonel; + }; + + int oti_r_locks; int oti_w_locks; int oti_txns; + /** used in osd_fid_set() to put xattr */ + struct lu_buf oti_buf; + /** used in osd_ea_fid_set() to set fid into common ea */ + struct lustre_mdt_attrs oti_mdt_attrs; +#ifdef HAVE_QUOTA_SUPPORT + struct osd_ctxt oti_ctxt; +#endif }; #ifdef LPROCFS diff --git a/lustre/osd/osd_oi.c b/lustre/osd/osd_oi.c index a2e086c..5a43a54 100644 --- a/lustre/osd/osd_oi.c +++ b/lustre/osd/osd_oi.c @@ -77,50 +77,89 @@ struct oi_descr { int fid_size; char *name; + __u32 oid; +}; + +/** to serialize concurrent OI index initialization */ +static struct mutex oi_init_lock; + +static struct dt_index_features oi_feat = { + .dif_flags = DT_IND_UPDATE, + .dif_recsize_min = sizeof(struct osd_inode_id), + .dif_recsize_max = sizeof(struct osd_inode_id), + .dif_ptrsize = 4 }; static const struct oi_descr oi_descr[OSD_OI_FID_NR] = { [OSD_OI_FID_SMALL] = { .fid_size = 5, - .name = "oi.5" + .name = "oi.5", + .oid = OSD_OI_FID_SMALL_OID }, [OSD_OI_FID_OTHER] = { .fid_size = sizeof(struct lu_fid), - .name = "oi.16" + .name = "oi.16", + .oid = OSD_OI_FID_OTHER_OID } }; +static int osd_oi_index_create(struct osd_thread_info *info, + struct dt_device *dev, + struct md_device *mdev) +{ + const struct lu_env *env; + struct lu_fid *oi_fid = &info->oti_fid; + struct md_object *mdo; + int i; + int rc; + + env = info->oti_env; + + for (i = rc = 0; i < OSD_OI_FID_NR && rc == 0; ++i) { + char *name; + name = oi_descr[i].name; + lu_local_obj_fid(oi_fid, oi_descr[i].oid); + oi_feat.dif_keysize_min = oi_descr[i].fid_size, + oi_feat.dif_keysize_max = oi_descr[i].fid_size, + + mdo = llo_store_create_index(env, mdev, dev, + "", name, + oi_fid, &oi_feat); + + if (IS_ERR(mdo)) + RETURN(PTR_ERR(mdo)); + + lu_object_put(env, &mdo->mo_lu); + } + return 0; +} + int osd_oi_init(struct osd_thread_info *info, - struct osd_oi *oi, struct dt_device *dev) + struct osd_oi *oi, + struct dt_device *dev, + struct md_device *mdev) { + const struct lu_env *env; int rc; int i; - const struct lu_env *env; CLASSERT(ARRAY_SIZE(oi->oi_dir) == ARRAY_SIZE(oi_descr)); env = info->oti_env; - + mutex_lock(&oi_init_lock); memset(oi, 0, sizeof *oi); - - for (i = rc = 0; i < ARRAY_SIZE(oi->oi_dir) && rc == 0; ++i) { +retry: + for (i = rc = 0; i < OSD_OI_FID_NR && rc == 0; ++i) { const char *name; - /* - * Allocate on stack---this is initialization. 
- */ - const struct dt_index_features feat = { - .dif_flags = DT_IND_UPDATE, - .dif_keysize_min = oi_descr[i].fid_size, - .dif_keysize_max = oi_descr[i].fid_size, - .dif_recsize_min = sizeof(struct osd_inode_id), - .dif_recsize_max = sizeof(struct osd_inode_id) - }; struct dt_object *obj; name = oi_descr[i].name; - obj = dt_store_open(env, dev, name, &info->oti_fid); + oi_feat.dif_keysize_min = oi_descr[i].fid_size, + oi_feat.dif_keysize_max = oi_descr[i].fid_size, + + obj = dt_store_open(env, dev, "", name, &info->oti_fid); if (!IS_ERR(obj)) { - rc = obj->do_ops->do_index_try(env, obj, &feat); + rc = obj->do_ops->do_index_try(env, obj, &oi_feat); if (rc == 0) { LASSERT(obj->do_index_ops != NULL); oi->oi_dir[i] = obj; @@ -130,17 +169,25 @@ int osd_oi_init(struct osd_thread_info *info, } } else { rc = PTR_ERR(obj); + if (rc == -ENOENT) { + rc = osd_oi_index_create(info, dev, mdev); + if (!rc) + goto retry; + } CERROR("Cannot open \"%s\": %d\n", name, rc); } } if (rc != 0) osd_oi_fini(info, oi); + + mutex_unlock(&oi_init_lock); return rc; } void osd_oi_fini(struct osd_thread_info *info, struct osd_oi *oi) { int i; + for (i = 0; i < ARRAY_SIZE(oi->oi_dir); ++i) { if (oi->oi_dir[i] != NULL) { lu_object_put(info->oti_env, &oi->oi_dir[i]->do_lu); @@ -171,6 +218,16 @@ static const struct dt_key *oi_fid_key(struct osd_thread_info *info, return NULL; } +static inline int fid_is_oi_fid(const struct lu_fid *fid) +{ + /* We need to filter-out oi obj's fid. As we can not store it, while + * oi-index create operation. + */ + return (unlikely(fid_seq(fid) == FID_SEQ_LOCAL_FILE && + (fid_oid(fid) == OSD_OI_FID_SMALL_OID || + fid_oid(fid) == OSD_OI_FID_OTHER_OID))); +} + int osd_oi_lookup(struct osd_thread_info *info, struct osd_oi *oi, const struct lu_fid *fid, struct osd_inode_id *id) { @@ -183,19 +240,26 @@ int osd_oi_lookup(struct osd_thread_info *info, struct osd_oi *oi, struct dt_object *idx; const struct dt_key *key; + if (fid_is_oi_fid(fid)) + return -ENOENT; + key = oi_fid_key(info, oi, fid, &idx); rc = idx->do_index_ops->dio_lookup(info->oti_env, idx, (struct dt_rec *)id, key, BYPASS_CAPA); - id->oii_ino = be32_to_cpu(id->oii_ino); - id->oii_gen = be32_to_cpu(id->oii_gen); + if (rc > 0) { + id->oii_ino = be32_to_cpu(id->oii_ino); + id->oii_gen = be32_to_cpu(id->oii_gen); + rc = 0; + } else if (rc == 0) + rc = -ENOENT; } return rc; } int osd_oi_insert(struct osd_thread_info *info, struct osd_oi *oi, const struct lu_fid *fid, const struct osd_inode_id *id0, - struct thandle *th) + struct thandle *th, int ignore_quota) { struct dt_object *idx; struct osd_inode_id *id; @@ -204,13 +268,17 @@ int osd_oi_insert(struct osd_thread_info *info, struct osd_oi *oi, if (fid_is_igif(fid)) return 0; + if (fid_is_oi_fid(fid)) + return 0; + key = oi_fid_key(info, oi, fid, &idx); id = &info->oti_id; id->oii_ino = cpu_to_be32(id0->oii_ino); id->oii_gen = cpu_to_be32(id0->oii_gen); return idx->do_index_ops->dio_insert(info->oti_env, idx, (const struct dt_rec *)id, - key, th, BYPASS_CAPA); + key, th, BYPASS_CAPA, + ignore_quota); } int osd_oi_delete(struct osd_thread_info *info, @@ -227,3 +295,9 @@ int osd_oi_delete(struct osd_thread_info *info, return idx->do_index_ops->dio_delete(info->oti_env, idx, key, th, BYPASS_CAPA); } + +int osd_oi_mod_init() +{ + mutex_init(&oi_init_lock); + return 0; +} diff --git a/lustre/osd/osd_oi.h b/lustre/osd/osd_oi.h index b1d2dc8..fe87768 100644 --- a/lustre/osd/osd_oi.h +++ b/lustre/osd/osd_oi.h @@ -54,6 +54,7 @@ /* struct rw_semaphore */ #include #include +#include struct lu_fid; 
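
osd_oi_init() above now retries: when dt_store_open() fails with -ENOENT the OI indexes are created through osd_oi_index_create() and the open loop reruns, all under oi_init_lock so concurrent setup cannot race the creation. A sketch of that open-or-create retry pattern; the two helpers are toy stand-ins for the dt_store_open()/llo_store_create_index() calls:

#include <errno.h>
#include <stdio.h>

static int index_exists = 0;          /* simulated on-disk OI state */

static int store_open(void)   { return index_exists ? 0 : -ENOENT; }
static int index_create(void) { index_exists = 1; return 0; }

/* models osd_oi_init(): open, and on -ENOENT create the index once
 * and retry; successful creation guarantees the retry can succeed */
static int oi_init(void)
{
        int rc;
retry:
        rc = store_open();
        if (rc == -ENOENT) {
                rc = index_create();
                if (rc == 0)
                        goto retry;
        }
        return rc;
}

int main(void)
{
        printf("oi_init -> %d\n", oi_init());
        return 0;
}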
struct osd_thread_info; @@ -90,15 +91,18 @@ struct osd_inode_id { __u32 oii_gen; /* inode generation */ }; -int osd_oi_init(struct osd_thread_info *info, - struct osd_oi *oi, struct dt_device *dev); +int osd_oi_mod_init(void); +int osd_oi_init(struct osd_thread_info *info, + struct osd_oi *oi, + struct dt_device *dev, + struct md_device *mdev); void osd_oi_fini(struct osd_thread_info *info, struct osd_oi *oi); int osd_oi_lookup(struct osd_thread_info *info, struct osd_oi *oi, const struct lu_fid *fid, struct osd_inode_id *id); int osd_oi_insert(struct osd_thread_info *info, struct osd_oi *oi, const struct lu_fid *fid, const struct osd_inode_id *id, - struct thandle *th); + struct thandle *th, int ingore_quota); int osd_oi_delete(struct osd_thread_info *info, struct osd_oi *oi, const struct lu_fid *fid, struct thandle *th); diff --git a/lustre/ost/ost_handler.c b/lustre/ost/ost_handler.c index 3e6f0f2..d76b2e2 100644 --- a/lustre/ost/ost_handler.c +++ b/lustre/ost/ost_handler.c @@ -69,6 +69,18 @@ static int oss_num_create_threads; CFS_MODULE_PARM(oss_num_create_threads, "i", int, 0444, "number of OSS create threads to start"); +/** + * Do not return server-side uid/gid to remote client + */ +static void ost_drop_id(struct obd_export *exp, struct obdo *oa) +{ + if (exp_connect_rmtclient(exp)) { + oa->o_uid = -1; + oa->o_gid = -1; + oa->o_valid &= ~(OBD_MD_FLUID | OBD_MD_FLGID); + } +} + void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) { struct oti_req_ack_lock *ack_lock; @@ -86,7 +98,7 @@ void oti_to_request(struct obd_trans_info *oti, struct ptlrpc_request *req) if (!ack_lock->mode) break; /* XXX not even calling target_send_reply in some cases... */ - ptlrpc_save_lock (req, &ack_lock->lock, ack_lock->mode); + ptlrpc_save_lock (req, &ack_lock->lock, ack_lock->mode, 0); } } @@ -95,6 +107,7 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req, { struct ost_body *body, *repbody; __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; + struct lustre_capa *capa = NULL; int rc; ENTRY; @@ -115,6 +128,9 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req, ldlm_request_cancel(req, dlm, 0); } + if (body->oa.o_valid & OBD_MD_FLOSSCAPA) + capa = lustre_unpack_capa(req->rq_reqmsg, REQ_REC_OFF + 2); + rc = lustre_pack_reply(req, 2, size, NULL); if (rc) RETURN(rc); @@ -124,7 +140,7 @@ static int ost_destroy(struct obd_export *exp, struct ptlrpc_request *req, repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); - req->rq_status = obd_destroy(exp, &body->oa, NULL, oti, NULL); + req->rq_status = obd_destroy(exp, &body->oa, NULL, oti, NULL, capa); RETURN(0); } @@ -154,6 +170,7 @@ static int ost_getattr(struct obd_export *exp, struct ptlrpc_request *req) oinfo.oi_capa = lustre_unpack_capa(req->rq_reqmsg, REQ_REC_OFF + 1); req->rq_status = obd_getattr(exp, &oinfo); + ost_drop_id(exp, &repbody->oa); RETURN(0); } @@ -253,7 +270,8 @@ static int ost_punch_lock_get(struct obd_export *exp, struct obdo *oa, RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id, LDLM_EXTENT, &policy, LCK_PW, &flags, ldlm_blocking_ast, ldlm_completion_ast, - ldlm_glimpse_ast, NULL, 0, NULL, lh)); + ldlm_glimpse_ast, NULL, 0, NULL, + NULL, lh)); } /* @@ -282,10 +300,9 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, /* check that we do support OBD_CONNECT_TRUNCLOCK. 
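
ost_drop_id() above scrubs server-side identity from replies to remote (cross-realm) clients: uid and gid are set to -1 and the corresponding valid bits are cleared so the client does not trust them. A sketch of the masking; the flag values and struct layout are stand-ins for the OBD_MD_* bits and struct obdo:

#include <stdio.h>
#include <stdint.h>

#define MD_FLUID (1u << 0)   /* stand-ins for OBD_MD_FLUID/OBD_MD_FLGID */
#define MD_FLGID (1u << 1)

struct obdo { uint32_t o_uid, o_gid, o_valid; };

static void drop_id(int remote_client, struct obdo *oa)
{
        if (remote_client) {
                oa->o_uid = (uint32_t)-1;
                oa->o_gid = (uint32_t)-1;
                oa->o_valid &= ~(MD_FLUID | MD_FLGID);
        }
}

int main(void)
{
        struct obdo oa = { .o_uid = 500, .o_gid = 500,
                           .o_valid = MD_FLUID | MD_FLGID };
        drop_id(1, &oa);
        printf("uid=%u gid=%u valid=%#x\n", oa.o_uid, oa.o_gid, oa.o_valid);
        return 0;
}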
*/ CLASSERT(OST_CONNECT_SUPPORTED & OBD_CONNECT_TRUNCLOCK); - body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), - lustre_swab_ost_body); - if (body == NULL) - RETURN(-EFAULT); + /* ost_body is varified and swabbed in ost_hpreq_handler() */ + body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); + LASSERT(body != NULL); oinfo.oi_oa = &body->oa; oinfo.oi_policy.l_extent.start = oinfo.oi_oa->o_size; @@ -319,6 +336,7 @@ static int ost_punch(struct obd_export *exp, struct ptlrpc_request *req, ost_punch_lock_put(exp, oinfo.oi_oa, &lh); } repbody->oa = *oinfo.oi_oa; + ost_drop_id(exp, &repbody->oa); RETURN(rc); } @@ -347,6 +365,7 @@ static int ost_sync(struct obd_export *exp, struct ptlrpc_request *req) memcpy(&repbody->oa, &body->oa, sizeof(body->oa)); req->rq_status = obd_sync(exp, &repbody->oa, NULL, repbody->oa.o_size, repbody->oa.o_blocks, capa); + ost_drop_id(exp, &repbody->oa); RETURN(0); } @@ -377,6 +396,7 @@ static int ost_setattr(struct obd_export *exp, struct ptlrpc_request *req, oinfo.oi_capa = lustre_unpack_capa(req->rq_reqmsg, REQ_REC_OFF + 1); req->rq_status = obd_setattr(exp, &oinfo, oti); + ost_drop_id(exp, &repbody->oa); RETURN(0); } @@ -452,7 +472,8 @@ static int ost_brw_lock_get(int mode, struct obd_export *exp, RETURN(ldlm_cli_enqueue_local(exp->exp_obd->obd_namespace, &res_id, LDLM_EXTENT, &policy, mode, &flags, ldlm_blocking_ast, ldlm_completion_ast, - ldlm_glimpse_ast, NULL, 0, NULL, lh)); + ldlm_glimpse_ast, NULL, 0, NULL, + NULL, lh)); } static void ost_brw_lock_put(int mode, @@ -471,7 +492,9 @@ static void ost_brw_lock_put(int mode, struct ost_prolong_data { struct obd_export *opd_exp; ldlm_policy_data_t opd_policy; + struct obdo *opd_oa; ldlm_mode_t opd_mode; + int opd_lock_match; }; static int ost_prolong_locks_iter(struct ldlm_lock *lock, void *data) @@ -501,6 +524,14 @@ static int ost_prolong_locks_iter(struct ldlm_lock *lock, void *data) return LDLM_ITER_CONTINUE; } + /* Fill the obdo with the matched lock handle. + * XXX: it is possible in some cases the IO RPC is covered by several + * locks, even for the write case, so it may need to be a lock list. */ + if (opd->opd_oa && !(opd->opd_oa->o_valid & OBD_MD_FLHANDLE)) { + opd->opd_oa->o_handle.cookie = lock->l_handle.h_cookie; + opd->opd_oa->o_valid |= OBD_MD_FLHANDLE; + } + if (!(lock->l_flags & LDLM_FL_AST_SENT)) { /* ignore locks not being cancelled */ return LDLM_ITER_CONTINUE; @@ -509,17 +540,18 @@ static int ost_prolong_locks_iter(struct ldlm_lock *lock, void *data) /* OK. this is a possible lock the user holds doing I/O * let's refresh eviction timer for it */ ldlm_refresh_waiting_lock(lock); + opd->opd_lock_match = 1; return LDLM_ITER_CONTINUE; } -static void ost_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj, - struct niobuf_remote *nb, struct obdo *oa, - ldlm_mode_t mode) +static int ost_rw_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj, + struct niobuf_remote *nb, struct obdo *oa, + ldlm_mode_t mode) { struct ldlm_res_id res_id; int nrbufs = obj->ioo_bufcnt; - struct ost_prolong_data opd; + struct ost_prolong_data opd = { 0 }; ENTRY; osc_build_res_name(obj->ioo_id, obj->ioo_gr, &res_id); @@ -540,16 +572,28 @@ static void ost_prolong_locks(struct obd_export *exp, struct obd_ioobj *obj, lock = ldlm_handle2lock(&oa->o_handle); if (lock != NULL) { ost_prolong_locks_iter(lock, &opd); + if (opd.opd_lock_match) { + LDLM_LOCK_PUT(lock); + RETURN(1); + } + + /* Check if the lock covers the whole IO region, + * otherwise iterate through the resource. 
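
Just below, ost_rw_prolong_locks() short-circuits when the lock handle carried in the obdo already matched, and otherwise falls back to the more expensive resource iteration only when that lock does not fully cover the IO extent. The coverage test is plain interval containment; a sketch:

#include <stdio.h>
#include <stdint.h>

struct extent { uint64_t start, end; };   /* inclusive, like ldlm extents */

/* 1 if the lock's extent fully contains the IO region, as checked in
 * ost_rw_prolong_locks() before giving up on the handle fast path */
static int covers(const struct extent *lock, const struct extent *io)
{
        return lock->start <= io->start && lock->end >= io->end;
}

int main(void)
{
        struct extent lock = { 0, 1048575 };          /* first 1 MiB */
        struct extent io1  = { 4096, 8191 };
        struct extent io2  = { 1040000, 2000000 };    /* spills past the lock */

        printf("io1 covered: %d, io2 covered: %d\n",
               covers(&lock, &io1), covers(&lock, &io2));
        return 0;
}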
*/ + if (lock->l_policy_data.l_extent.end >= + opd.opd_policy.l_extent.end && + lock->l_policy_data.l_extent.start <= + opd.opd_policy.l_extent.start) { + LDLM_LOCK_PUT(lock); + RETURN(0); + } LDLM_LOCK_PUT(lock); - EXIT; - return; } } + opd.opd_oa = oa; ldlm_resource_iterate(exp->exp_obd->obd_namespace, &res_id, ost_prolong_locks_iter, &opd); - - EXIT; + RETURN(opd.opd_lock_match); } static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) @@ -564,7 +608,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) struct l_wait_info lwi; struct lustre_handle lockh = { 0 }; __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*body) }; - int objcount, niocount, npages, nob = 0, rc, i; + int niocount, npages, nob = 0, rc, i; int no_reply = 0; ENTRY; @@ -586,49 +630,18 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) if (exp->exp_failed) GOTO(out, rc = -ENOTCONN); - body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), - lustre_swab_ost_body); - if (body == NULL) { - CERROR("Missing/short ost_body\n"); - GOTO(out, rc = -EFAULT); - } - - objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) / - sizeof(*ioo); - if (objcount == 0) { - CERROR("Missing/short ioobj\n"); - GOTO(out, rc = -EFAULT); - } - if (objcount > 1) { - CERROR("too many ioobjs (%d)\n", objcount); - GOTO(out, rc = -EFAULT); - } + /* ost_body, ioobj & noibuf_remote are verified and swabbed in + * ost_rw_hpreq_check(). */ + body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); + LASSERT(body != NULL); - ioo = lustre_swab_reqbuf(req, REQ_REC_OFF + 1, sizeof(*ioo), - lustre_swab_obd_ioobj); - if (ioo == NULL) { - CERROR("Missing/short ioobj\n"); - GOTO(out, rc = -EFAULT); - } + ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, sizeof(*ioo)); + LASSERT(ioo != NULL); niocount = ioo->ioo_bufcnt; - if (niocount > PTLRPC_MAX_BRW_PAGES) { - DEBUG_REQ(D_ERROR, req, "bulk has too many pages (%d)", - niocount); - GOTO(out, rc = -EFAULT); - } - - remote_nb = lustre_swab_reqbuf(req, REQ_REC_OFF + 2, - niocount * sizeof(*remote_nb), - lustre_swab_niobuf_remote); - if (remote_nb == NULL) { - CERROR("Missing/short niobuf\n"); - GOTO(out, rc = -EFAULT); - } - if (lustre_msg_swabbed(req->rq_reqmsg)) { /* swab remaining niobufs */ - for (i = 1; i < niocount; i++) - lustre_swab_niobuf_remote (&remote_nb[i]); - } + remote_nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, + niocount * sizeof(*remote_nb)); + LASSERT(remote_nb != NULL); if (body->oa.o_valid & OBD_MD_FLOSSCAPA) capa = lustre_unpack_capa(req->rq_reqmsg, REQ_REC_OFF + 3); @@ -673,7 +686,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) if (desc == NULL) /* XXX: check all cleanup stuff */ GOTO(out, rc = -ENOMEM); - ost_prolong_locks(exp, ioo, remote_nb, &body->oa, LCK_PW | LCK_PR); + ost_rw_prolong_locks(exp, ioo, remote_nb, &body->oa, LCK_PW | LCK_PR); nob = 0; for (i = 0; i < npages; i++) { @@ -748,7 +761,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, - !ptlrpc_bulk_active(desc) || + !ptlrpc_server_bulk_active(desc) || exp->exp_failed, &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed deadline */ @@ -790,6 +803,7 @@ static int ost_brw_read(struct ptlrpc_request *req, struct obd_trans_info *oti) repbody = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repbody)); memcpy(&repbody->oa, &body->oa, sizeof(repbody->oa)); + 
ost_drop_id(exp, &repbody->oa); } out_lock: @@ -837,10 +851,11 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) __u32 *rcs; __u32 size[3] = { sizeof(struct ptlrpc_body), sizeof(*body) }; int objcount, niocount, npages; - int rc, swab, i, j; + int rc, i, j; obd_count client_cksum = 0, server_cksum = 0; cksum_type_t cksum_type = OBD_CKSUM_CRC32; int no_reply = 0; + __u32 o_uid = 0, o_gid = 0; ENTRY; req->rq_bulk_write = 1; @@ -864,56 +879,22 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (exp->exp_failed) GOTO(out, rc = -ENOTCONN); - swab = lustre_msg_swabbed(req->rq_reqmsg); - body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), - lustre_swab_ost_body); - if (body == NULL) { - CERROR("Missing/short ost_body\n"); - GOTO(out, rc = -EFAULT); - } + /* ost_body, ioobj & noibuf_remote are verified and swabbed in + * ost_rw_hpreq_check(). */ + body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); + LASSERT(body != NULL); - lustre_set_req_swabbed(req, REQ_REC_OFF + 1); objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) / - sizeof(*ioo); - if (objcount == 0) { - CERROR("Missing/short ioobj\n"); - GOTO(out, rc = -EFAULT); - } - if (objcount > 1) { - CERROR("too many ioobjs (%d)\n", objcount); - GOTO(out, rc = -EFAULT); - } - + sizeof(*ioo); ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, objcount * sizeof(*ioo)); - LASSERT (ioo != NULL); - for (niocount = i = 0; i < objcount; i++) { - if (swab) - lustre_swab_obd_ioobj(&ioo[i]); - if (ioo[i].ioo_bufcnt == 0) { - CERROR("ioo[%d] has zero bufcnt\n", i); - GOTO(out, rc = -EFAULT); - } + LASSERT(ioo != NULL); + for (niocount = i = 0; i < objcount; i++) niocount += ioo[i].ioo_bufcnt; - } - if (niocount > PTLRPC_MAX_BRW_PAGES) { - DEBUG_REQ(D_ERROR, req, "bulk has too many pages (%d)", - niocount); - GOTO(out, rc = -EFAULT); - } - - remote_nb = lustre_swab_reqbuf(req, REQ_REC_OFF + 2, - niocount * sizeof(*remote_nb), - lustre_swab_niobuf_remote); - if (remote_nb == NULL) { - CERROR("Missing/short niobuf\n"); - GOTO(out, rc = -EFAULT); - } - if (swab) { /* swab the remaining niobufs */ - for (i = 1; i < niocount; i++) - lustre_swab_niobuf_remote (&remote_nb[i]); - } + remote_nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, + niocount * sizeof(*remote_nb)); + LASSERT(remote_nb != NULL); if (body->oa.o_valid & OBD_MD_FLOSSCAPA) capa = lustre_unpack_capa(req->rq_reqmsg, REQ_REC_OFF + 3); @@ -951,7 +932,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) GOTO(out_lock, rc = -ETIMEDOUT); } - ost_prolong_locks(exp, ioo, remote_nb,&body->oa, LCK_PW); + ost_rw_prolong_locks(exp, ioo, remote_nb,&body->oa, LCK_PW); /* obd_preprw clobbers oa->valid, so save what we need */ if (body->oa.o_valid & OBD_MD_FLCKSUM) { @@ -968,6 +949,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) body->oa.o_valid &= ~OBD_MD_FLGRANT; } + if (exp_connect_rmtclient(exp)) { + o_uid = body->oa.o_uid; + o_gid = body->oa.o_gid; + } npages = OST_THREAD_POOL_SIZE; rc = obd_preprw(OBD_BRW_WRITE, exp, &body->oa, objcount, ioo, remote_nb, &npages, local_nb, oti, capa); @@ -991,7 +976,7 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) if (desc->bd_export->exp_failed) rc = -ENOTCONN; else - rc = ptlrpc_start_bulk_transfer (desc); + rc = ptlrpc_start_bulk_transfer(desc); if (rc == 0) { time_t start = cfs_time_current_sec(); do { @@ -1002,7 +987,7 @@ static int ost_brw_write(struct 
ptlrpc_request *req, struct obd_trans_info *oti) lwi = LWI_TIMEOUT_INTERVAL(timeout, cfs_time_seconds(1), ost_bulk_timeout, desc); rc = l_wait_event(desc->bd_waitq, - !ptlrpc_bulk_active(desc) || + !ptlrpc_server_bulk_active(desc) || desc->bd_export->exp_failed, &lwi); LASSERT(rc == 0 || rc == -ETIMEDOUT); /* Wait again if we changed deadline */ @@ -1063,6 +1048,10 @@ static int ost_brw_write(struct ptlrpc_request *req, struct obd_trans_info *oti) /* Must commit after prep above in all cases */ rc = obd_commitrw(OBD_BRW_WRITE, exp, &repbody->oa, objcount, ioo, remote_nb, npages, local_nb, oti, rc); + if (exp_connect_rmtclient(exp)) { + repbody->oa.o_uid = o_uid; + repbody->oa.o_gid = o_gid; + } if (unlikely(client_cksum != server_cksum && rc == 0)) { int new_cksum = ost_checksum_bulk(desc, OST_WRITE, cksum_type); @@ -1228,26 +1217,25 @@ static int ost_get_info(struct obd_export *exp, struct ptlrpc_request *req) RETURN(rc); } +#ifdef HAVE_QUOTA_SUPPORT static int ost_handle_quotactl(struct ptlrpc_request *req) { struct obd_quotactl *oqctl, *repoqc; - __u32 size[2] = { sizeof(struct ptlrpc_body), sizeof(*repoqc) }; int rc; ENTRY; - oqctl = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*oqctl), - lustre_swab_obd_quotactl); + oqctl = req_capsule_client_get(&req->rq_pill, &RMF_OBD_QUOTACTL); if (oqctl == NULL) GOTO(out, rc = -EPROTO); - rc = lustre_pack_reply(req, 2, size, NULL); + rc = req_capsule_server_pack(&req->rq_pill); if (rc) GOTO(out, rc); - repoqc = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, sizeof(*repoqc)); - + repoqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); req->rq_status = obd_quotactl(req->rq_export, oqctl); *repoqc = *oqctl; + out: RETURN(rc); } @@ -1263,15 +1251,38 @@ static int ost_handle_quotacheck(struct ptlrpc_request *req) RETURN(-EPROTO); rc = req_capsule_server_pack(&req->rq_pill); - if (rc) { - CERROR("ost: out of memory while packing quotacheck reply\n"); + if (rc) RETURN(-ENOMEM); - } req->rq_status = obd_quotacheck(req->rq_export, oqctl); RETURN(0); } +static int ost_handle_quota_adjust_qunit(struct ptlrpc_request *req) +{ + struct quota_adjust_qunit *oqaq, *repoqa; + struct lustre_quota_ctxt *qctxt; + int rc; + ENTRY; + + qctxt = &req->rq_export->exp_obd->u.obt.obt_qctxt; + oqaq = req_capsule_client_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT); + if (oqaq == NULL) + GOTO(out, rc = -EPROTO); + + rc = req_capsule_server_pack(&req->rq_pill); + if (rc) + GOTO(out, rc); + + repoqa = req_capsule_server_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT); + req->rq_status = obd_quota_adjust_qunit(req->rq_export, oqaq, qctxt); + *repoqa = *oqaq; + + out: + RETURN(rc); +} +#endif + static int ost_llog_handle_connect(struct obd_export *exp, struct ptlrpc_request *req) { @@ -1284,41 +1295,171 @@ static int ost_llog_handle_connect(struct obd_export *exp, RETURN(rc); } -static int filter_export_check_flavor(struct filter_obd *filter, - struct obd_export *exp, - struct ptlrpc_request *req) +#define ost_init_sec_none(reply, exp) \ +do { \ + reply->ocd_connect_flags &= ~(OBD_CONNECT_RMT_CLIENT | \ + OBD_CONNECT_RMT_CLIENT_FORCE | \ + OBD_CONNECT_OSS_CAPA); \ + spin_lock(&exp->exp_lock); \ + exp->exp_connect_flags = reply->ocd_connect_flags; \ + spin_unlock(&exp->exp_lock); \ +} while (0) + +static int ost_init_sec_level(struct ptlrpc_request *req) { - int rc = 0; - - /* FIXME - * this should be done in filter_connect()/filter_reconnect(), but - * we can't obtain information like NID, which stored in incoming - * request, thus can't decide what flavor to use. 
so we do it here. - * - * This hack should be removed after the OST stack be rewritten, just - * like what we are doing in mdt_obd_connect()/mdt_obd_reconnect(). - */ - if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_INVALID) - return 0; - - CDEBUG(D_SEC, "from %s\n", sptlrpc_part2name(req->rq_sp_from)); - spin_lock(&exp->exp_lock); - exp->exp_sp_peer = req->rq_sp_from; - - read_lock(&filter->fo_sptlrpc_lock); - sptlrpc_rule_set_choose(&filter->fo_sptlrpc_rset, exp->exp_sp_peer, - req->rq_peer.nid, &exp->exp_flvr); - read_unlock(&filter->fo_sptlrpc_lock); - - if (exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { - CERROR("invalid rpc flavor %x, expect %x, from %s\n", - req->rq_flvr.sf_rpc, exp->exp_flvr.sf_rpc, - libcfs_nid2str(req->rq_peer.nid)); - exp->exp_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; - rc = -EACCES; + struct obd_export *exp = req->rq_export; + struct req_capsule *pill = &req->rq_pill; + struct obd_device *obd = exp->exp_obd; + struct filter_obd *filter = &obd->u.filter; + char *client = libcfs_nid2str(req->rq_peer.nid); + struct obd_connect_data *data, *reply; + int rc = 0, remote; + ENTRY; + + data = req_capsule_client_get(pill, &RMF_CONNECT_DATA); + reply = req_capsule_server_get(pill, &RMF_CONNECT_DATA); + if (data == NULL || reply == NULL) + RETURN(-EFAULT); + + /* connection from MDT is always trusted */ + if (req->rq_auth_usr_mdt) { + ost_init_sec_none(reply, exp); + RETURN(0); + } + + /* no GSS support case */ + if (!req->rq_auth_gss) { + if (filter->fo_sec_level > LUSTRE_SEC_NONE) { + CWARN("client %s -> target %s does not use GSS, " + "cannot run under security level %d.\n", + client, obd->obd_name, filter->fo_sec_level); + RETURN(-EACCES); + } else { + ost_init_sec_none(reply, exp); + RETURN(0); + } + } + + /* old version case */ + if (unlikely(!(data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT) || + !(data->ocd_connect_flags & OBD_CONNECT_OSS_CAPA))) { + if (filter->fo_sec_level > LUSTRE_SEC_NONE) { + CWARN("client %s -> target %s uses an old version, " + "cannot run under security level %d.\n", + client, obd->obd_name, filter->fo_sec_level); + RETURN(-EACCES); + } else { + CWARN("client %s -> target %s uses an old version, " + "run under security level %d.\n", + client, obd->obd_name, filter->fo_sec_level); + ost_init_sec_none(reply, exp); + RETURN(0); + } + } + + remote = data->ocd_connect_flags & OBD_CONNECT_RMT_CLIENT_FORCE; + if (remote) { + if (!req->rq_auth_remote) + CDEBUG(D_SEC, "client (local realm) %s -> target %s " + "asked to be remote.\n", client, obd->obd_name); + } else if (req->rq_auth_remote) { + remote = 1; + CDEBUG(D_SEC, "client (remote realm) %s -> target %s is set " + "as remote by default.\n", client, obd->obd_name); + } + + if (remote) { + if (!filter->fo_fl_oss_capa) { + CDEBUG(D_SEC, "client %s -> target %s is set as remote," + " but OSS capabilities are not enabled: %d.\n", + client, obd->obd_name, filter->fo_fl_oss_capa); + RETURN(-EACCES); + } + } + + switch (filter->fo_sec_level) { + case LUSTRE_SEC_NONE: + if (!remote) { + ost_init_sec_none(reply, exp); + break; + } else { + CDEBUG(D_SEC, "client %s -> target %s is set as remote, " + "cannot run under security level %d.\n", + client, obd->obd_name, filter->fo_sec_level); + RETURN(-EACCES); + } + case LUSTRE_SEC_REMOTE: + if (!remote) + ost_init_sec_none(reply, exp); + break; + case LUSTRE_SEC_ALL: + if (!remote) { + reply->ocd_connect_flags &= ~(OBD_CONNECT_RMT_CLIENT | + OBD_CONNECT_RMT_CLIENT_FORCE); + if (!filter->fo_fl_oss_capa) + reply->ocd_connect_flags &= ~OBD_CONNECT_OSS_CAPA; + +
spin_lock(&exp->exp_lock); + exp->exp_connect_flags = reply->ocd_connect_flags; + spin_unlock(&exp->exp_lock); + } + break; + default: + RETURN(-EINVAL); } - spin_unlock(&exp->exp_lock); + RETURN(rc); +} + +/* + * FIXME + * this should be done in filter_connect()/filter_reconnect(), but + * we can't obtain information like the NID, which is stored in the + * incoming request, and thus can't decide what flavor to use. so we + * do it here. + * + * This hack should be removed after the OST stack is rewritten, just + * like what we are doing in mdt_obd_connect()/mdt_obd_reconnect(). + */ +static int ost_connect_check_sptlrpc(struct ptlrpc_request *req) +{ + struct obd_export *exp = req->rq_export; + struct filter_obd *filter = &exp->exp_obd->u.filter; + struct sptlrpc_flavor flvr; + int rc = 0; + + if (exp->exp_flvr.sf_rpc == SPTLRPC_FLVR_INVALID) { + read_lock(&filter->fo_sptlrpc_lock); + sptlrpc_target_choose_flavor(&filter->fo_sptlrpc_rset, + req->rq_sp_from, + req->rq_peer.nid, + &flvr); + read_unlock(&filter->fo_sptlrpc_lock); + + spin_lock(&exp->exp_lock); + + exp->exp_sp_peer = req->rq_sp_from; + exp->exp_flvr = flvr; + + if (exp->exp_flvr.sf_rpc != SPTLRPC_FLVR_ANY && + exp->exp_flvr.sf_rpc != req->rq_flvr.sf_rpc) { + CERROR("unauthorized rpc flavor %x from %s, " + "expect %x\n", req->rq_flvr.sf_rpc, + libcfs_nid2str(req->rq_peer.nid), + exp->exp_flvr.sf_rpc); + rc = -EACCES; + } + + spin_unlock(&exp->exp_lock); + } else { + if (exp->exp_sp_peer != req->rq_sp_from) { + CERROR("RPC source %s doesn't match %s\n", + sptlrpc_part2name(req->rq_sp_from), + sptlrpc_part2name(exp->exp_sp_peer)); + rc = -EACCES; + } else { + rc = sptlrpc_target_export_check(exp, req); + } + } return rc; } @@ -1380,8 +1521,11 @@ int ost_msg_check_version(struct lustre_msg *msg) case OST_SYNC: case OST_SET_INFO: case OST_GET_INFO: +#ifdef HAVE_QUOTA_SUPPORT case OST_QUOTACHECK: case OST_QUOTACTL: + case OST_QUOTA_ADJUST_QUNIT: +#endif rc = lustre_msg_check_version(msg, LUSTRE_OST_VERSION); if (rc) CERROR("bad opc %u version %08x, expecting %08x\n", @@ -1417,6 +1561,250 @@ int ost_msg_check_version(struct lustre_msg *msg) return rc; } +static int ost_rw_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct niobuf_remote *nb; + struct obd_ioobj *ioo; + struct ost_body *body; + int objcount, niocount; + int mode, opc, i; + __u64 start, end; + ENTRY; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + LASSERT(opc == OST_READ || opc == OST_WRITE); + + /* As the request may be covered by several locks, do not look at + * o_handle, look at the RPC IO region. */ + body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), + lustre_swab_obdo); + objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) / + sizeof(*ioo); + ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, + objcount * sizeof(*ioo)); + LASSERT(ioo != NULL); + for (niocount = i = 0; i < objcount; i++) + niocount += ioo[i].ioo_bufcnt; + + nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, + niocount * sizeof(*nb)); + LASSERT(nb != NULL); + + mode = LCK_PW; + if (opc == OST_READ) + mode |= LCK_PR; + + start = nb[0].offset & CFS_PAGE_MASK; + end = (nb[ioo->ioo_bufcnt - 1].offset + + nb[ioo->ioo_bufcnt - 1].len - 1) | ~CFS_PAGE_MASK; + + if (!(lock->l_granted_mode & mode)) + RETURN(0); + + if (lock->l_policy_data.l_extent.end < start || + lock->l_policy_data.l_extent.start > end) + RETURN(0); + + RETURN(1); +} + +/** + * Swab buffers needed to call ost_rw_prolong_locks() and call it.
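+ * Prolonging the locks that cover this IO keeps their cancellation + * timeout from expiring while the bulk transfer is still in flight, so + * the client is not evicted for a lock it is actively using.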
+ * Return the value from ost_rw_prolong_locks() which is non-zero if + * there is a cancelled lock which is waiting for this IO request. + */ +static int ost_rw_hpreq_check(struct ptlrpc_request *req) +{ + struct niobuf_remote *nb; + struct obd_ioobj *ioo; + struct ost_body *body; + int objcount, niocount; + int mode, opc, i; + ENTRY; + + opc = lustre_msg_get_opc(req->rq_reqmsg); + LASSERT(opc == OST_READ || opc == OST_WRITE); + + body = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, sizeof(*body)); + LASSERT(body != NULL); + + objcount = lustre_msg_buflen(req->rq_reqmsg, REQ_REC_OFF + 1) / + sizeof(*ioo); + ioo = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 1, + objcount * sizeof(*ioo)); + LASSERT(ioo != NULL); + + for (niocount = i = 0; i < objcount; i++) + niocount += ioo[i].ioo_bufcnt; + nb = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF + 2, + niocount * sizeof(*nb)); + LASSERT(nb != NULL); + LASSERT(niocount == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK)); + + mode = LCK_PW; + if (opc == OST_READ) + mode |= LCK_PR; + RETURN(ost_rw_prolong_locks(req->rq_export, ioo, nb, &body->oa, mode)); +} + +static int ost_punch_prolong_locks(struct obd_export *exp, struct obdo *oa) +{ + struct ldlm_res_id res_id = { .name = { oa->o_id } }; + struct ost_prolong_data opd = { 0 }; + __u64 start, end; + ENTRY; + + start = oa->o_size; + end = start + oa->o_blocks; + + opd.opd_mode = LCK_PW; + opd.opd_exp = exp; + opd.opd_policy.l_extent.start = start & CFS_PAGE_MASK; + if (oa->o_blocks == OBD_OBJECT_EOF || end < start) + opd.opd_policy.l_extent.end = OBD_OBJECT_EOF; + else + opd.opd_policy.l_extent.end = end | ~CFS_PAGE_MASK; + + CDEBUG(D_DLMTRACE,"refresh locks: "LPU64"/"LPU64" ("LPU64"->"LPU64")\n", + res_id.name[0], res_id.name[1], opd.opd_policy.l_extent.start, + opd.opd_policy.l_extent.end); + + opd.opd_oa = oa; + ldlm_resource_iterate(exp->exp_obd->obd_namespace, &res_id, + ost_prolong_locks_iter, &opd); + RETURN(opd.opd_lock_match); +} + +static int ost_punch_hpreq_lock_match(struct ptlrpc_request *req, + struct ldlm_lock *lock) +{ + struct ost_body *body; + ENTRY; + + body = lustre_swab_reqbuf(req, REQ_REC_OFF, sizeof(*body), + lustre_swab_obdo); + LASSERT(body != NULL); + + if (body->oa.o_valid & OBD_MD_FLHANDLE && + body->oa.o_handle.cookie == lock->l_handle.h_cookie) + RETURN(1); + RETURN(0); +} + +static int ost_punch_hpreq_check(struct ptlrpc_request *req) +{ + struct ost_body *body = lustre_msg_buf(req->rq_reqmsg, + REQ_REC_OFF, sizeof(*body)); + LASSERT(body != NULL); + LASSERT(!(body->oa.o_valid & OBD_MD_FLFLAGS) || + !(body->oa.o_flags & OBD_FL_TRUNCLOCK)); + + RETURN(ost_punch_prolong_locks(req->rq_export, &body->oa)); +} + +struct ptlrpc_hpreq_ops ost_hpreq_rw = { + .hpreq_lock_match = ost_rw_hpreq_lock_match, + .hpreq_check = ost_rw_hpreq_check, +}; + +struct ptlrpc_hpreq_ops ost_hpreq_punch = { + .hpreq_lock_match = ost_punch_hpreq_lock_match, + .hpreq_check = ost_punch_hpreq_check, +}; + +/** Assign high priority operations to the request if needed. 
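+ * + * OST_READ and OST_WRITE requests whose pages are covered by client-held + * locks (no OBD_BRW_SRVLOCK on the first niobuf) get ost_hpreq_rw, and + * OST_PUNCH requests that do not carry OBD_FL_TRUNCLOCK get ost_hpreq_punch, + * so that IO blocking a lock cancel can be serviced ahead of other requests.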
*/ +static int ost_hpreq_handler(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export) { + int opc = lustre_msg_get_opc(req->rq_reqmsg); + struct ost_body *body; + + if (opc == OST_READ || opc == OST_WRITE) { + struct niobuf_remote *nb; + struct obd_ioobj *ioo; + int objcount, niocount; + int swab, i; + + body = lustre_swab_reqbuf(req, REQ_REC_OFF, + sizeof(*body), + lustre_swab_obdo); + if (!body) { + CERROR("Missing/short ost_body\n"); + RETURN(-EFAULT); + } + objcount = lustre_msg_buflen(req->rq_reqmsg, + REQ_REC_OFF + 1) / + sizeof(*ioo); + if (objcount == 0) { + CERROR("Missing/short ioobj\n"); + RETURN(-EFAULT); + } + if (objcount > 1) { + CERROR("too many ioobjs (%d)\n", objcount); + RETURN(-EFAULT); + } + + swab = !lustre_req_swabbed(req, REQ_REC_OFF + 1) && + lustre_msg_swabbed(req->rq_reqmsg); + ioo = lustre_swab_reqbuf(req, REQ_REC_OFF + 1, + objcount * sizeof(*ioo), + lustre_swab_obd_ioobj); + if (!ioo) { + CERROR("Missing/short ioobj\n"); + RETURN(-EFAULT); + } + for (niocount = i = 0; i < objcount; i++) { + if (i > 0 && swab) + lustre_swab_obd_ioobj(&ioo[i]); + if (ioo[i].ioo_bufcnt == 0) { + CERROR("ioo[%d] has zero bufcnt\n", i); + RETURN(-EFAULT); + } + niocount += ioo[i].ioo_bufcnt; + } + if (niocount > PTLRPC_MAX_BRW_PAGES) { + DEBUG_REQ(D_ERROR, req, "bulk has too many " + "pages (%d)", niocount); + RETURN(-EFAULT); + } + + swab = !lustre_req_swabbed(req, REQ_REC_OFF + 2) && + lustre_msg_swabbed(req->rq_reqmsg); + nb = lustre_swab_reqbuf(req, REQ_REC_OFF + 2, + niocount * sizeof(*nb), + lustre_swab_niobuf_remote); + if (!nb) { + CERROR("Missing/short niobuf\n"); + RETURN(-EFAULT); + } + + if (swab) { + /* swab remaining niobufs */ + for (i = 1; i < niocount; i++) + lustre_swab_niobuf_remote(&nb[i]); + } + + if (niocount == 0 || !(nb[0].flags & OBD_BRW_SRVLOCK)) + req->rq_ops = &ost_hpreq_rw; + } else if (opc == OST_PUNCH) { + body = lustre_swab_reqbuf(req, REQ_REC_OFF, + sizeof(*body), + lustre_swab_obdo); + if (!body) { + CERROR("Missing/short ost_body\n"); + RETURN(-EFAULT); + } + + if (!(body->oa.o_valid & OBD_MD_FLFLAGS) || + !(body->oa.o_flags & OBD_FL_TRUNCLOCK)) + req->rq_ops = &ost_hpreq_punch; + } + } + RETURN(0); +} + /* TODO: handle requests in a similar way as MDT: see mdt_handle_common() */ int ost_handle(struct ptlrpc_request *req) { @@ -1485,12 +1873,9 @@ int ost_handle(struct ptlrpc_request *req) if (OBD_FAIL_CHECK(OBD_FAIL_OST_CONNECT_NET2)) RETURN(0); if (!rc) { - struct obd_export *exp = req->rq_export; - - obd = exp->exp_obd; - - rc = filter_export_check_flavor(&obd->u.filter, - exp, req); + rc = ost_init_sec_level(req); + if (!rc) + rc = ost_connect_check_sptlrpc(req); } break; } @@ -1596,6 +1981,7 @@ int ost_handle(struct ptlrpc_request *req) DEBUG_REQ(D_INODE, req, "get_info"); rc = ost_get_info(req->rq_export, req); break; +#ifdef HAVE_QUOTA_SUPPORT case OST_QUOTACHECK: CDEBUG(D_INODE, "quotacheck\n"); req_capsule_set(&req->rq_pill, &RQF_OST_QUOTACHECK); @@ -1610,6 +1996,12 @@ int ost_handle(struct ptlrpc_request *req) RETURN(0); rc = ost_handle_quotactl(req); break; + case OST_QUOTA_ADJUST_QUNIT: + CDEBUG(D_INODE, "quota_adjust_qunit\n"); + req_capsule_set(&req->rq_pill, &RQF_OST_QUOTA_ADJUST_QUNIT); + rc = ost_handle_quota_adjust_qunit(req); + break; +#endif case OBD_PING: DEBUG_REQ(D_INODE, req, "ping"); req_capsule_set(&req->rq_pill, &RQF_OBD_PING); @@ -1773,7 +2165,7 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) /* Insure a 4x range for dynamic threads */ if (oss_min_threads > OSS_THREADS_MAX / 4) 
oss_min_threads = OSS_THREADS_MAX / 4; - oss_max_threads = min(OSS_THREADS_MAX, oss_min_threads * 4); + oss_max_threads = min(OSS_THREADS_MAX, oss_min_threads * 4 + 1); } ost->ost_service = @@ -1783,7 +2175,7 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) ost_handle, LUSTRE_OSS_NAME, obd->obd_proc_entry, target_print_req, oss_min_threads, oss_max_threads, - "ll_ost", LCT_DT_THREAD); + "ll_ost", LCT_DT_THREAD, NULL); if (ost->ost_service == NULL) { CERROR("failed to start service\n"); GOTO(out_lprocfs, rc = -ENOMEM); @@ -1812,7 +2204,7 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) ost_handle, "ost_create", obd->obd_proc_entry, target_print_req, oss_min_create_threads, oss_max_create_threads, - "ll_ost_creat", LCT_DT_THREAD); + "ll_ost_creat", LCT_DT_THREAD, NULL); if (ost->ost_create_service == NULL) { CERROR("failed to start OST create service\n"); GOTO(out_service, rc = -ENOMEM); @@ -1829,7 +2221,7 @@ static int ost_setup(struct obd_device *obd, struct lustre_cfg* lcfg) ost_handle, "ost_io", obd->obd_proc_entry, target_print_req, oss_min_threads, oss_max_threads, - "ll_ost_io", LCT_DT_THREAD); + "ll_ost_io", LCT_DT_THREAD, ost_hpreq_handler); if (ost->ost_io_service == NULL) { CERROR("failed to start OST I/O service\n"); GOTO(out_create, rc = -ENOMEM); diff --git a/lustre/ptlrpc/client.c b/lustre/ptlrpc/client.c index de5f94b..0913e85 100644 --- a/lustre/ptlrpc/client.c +++ b/lustre/ptlrpc/client.c @@ -101,8 +101,8 @@ static inline struct ptlrpc_bulk_desc *new_bulk(int npages, int type, int portal return desc; } -struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp (struct ptlrpc_request *req, - int npages, int type, int portal) +struct ptlrpc_bulk_desc *ptlrpc_prep_bulk_imp(struct ptlrpc_request *req, + int npages, int type, int portal) { struct obd_import *imp = req->rq_import; struct ptlrpc_bulk_desc *desc; @@ -356,9 +356,9 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) struct list_head *l, *tmp; struct ptlrpc_request *req; - if (!pool) - return; + LASSERT(pool != NULL); + spin_lock(&pool->prp_lock); list_for_each_safe(l, tmp, &pool->prp_req_list) { req = list_entry(l, struct ptlrpc_request, rq_list); list_del(&req->rq_list); @@ -367,6 +367,7 @@ void ptlrpc_free_rq_pool(struct ptlrpc_request_pool *pool) OBD_FREE(req->rq_reqbuf, pool->prp_rq_size); OBD_FREE(req, sizeof(*req)); } + spin_unlock(&pool->prp_lock); OBD_FREE(pool, sizeof(*pool)); } @@ -407,8 +408,9 @@ void ptlrpc_add_rqs_to_pool(struct ptlrpc_request_pool *pool, int num_rq) return; } -struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int num_rq, int msgsize, - void (*populate_pool)(struct ptlrpc_request_pool *, int)) +struct ptlrpc_request_pool * +ptlrpc_init_rq_pool(int num_rq, int msgsize, + void (*populate_pool)(struct ptlrpc_request_pool *, int)) { struct ptlrpc_request_pool *pool; @@ -434,7 +436,8 @@ struct ptlrpc_request_pool *ptlrpc_init_rq_pool(int num_rq, int msgsize, return pool; } -static struct ptlrpc_request *ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) +static struct ptlrpc_request * +ptlrpc_prep_req_from_pool(struct ptlrpc_request_pool *pool) { struct ptlrpc_request *request; struct lustre_msg *reqbuf; @@ -455,7 +458,7 @@ static struct ptlrpc_request *ptlrpc_prep_req_from_pool(struct ptlrpc_request_po request = list_entry(pool->prp_req_list.next, struct ptlrpc_request, rq_list); - list_del(&request->rq_list); + list_del_init(&request->rq_list); spin_unlock(&pool->prp_lock); LASSERT(request->rq_reqbuf); @@ -476,6 +479,7 @@ static void 
__ptlrpc_free_req_to_pool(struct ptlrpc_request *request) spin_lock(&pool->prp_lock); LASSERT(list_empty(&request->rq_list)); + LASSERT(!request->rq_receiving_reply); list_add_tail(&request->rq_list, &pool->prp_req_list); spin_unlock(&pool->prp_lock); } @@ -517,7 +521,9 @@ static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, request->rq_reply_cbid.cbid_fn = reply_in_callback; request->rq_reply_cbid.cbid_arg = request; + request->rq_reply_deadline = 0; request->rq_phase = RQ_PHASE_NEW; + request->rq_next_phase = RQ_PHASE_UNDEFINED; request->rq_request_portal = imp->imp_client->cli_request_portal; request->rq_reply_portal = imp->imp_client->cli_reply_portal; @@ -532,6 +538,7 @@ static int __ptlrpc_request_bufs_pack(struct ptlrpc_request *request, CFS_INIT_LIST_HEAD(&request->rq_ctx_chain); CFS_INIT_LIST_HEAD(&request->rq_set_chain); CFS_INIT_LIST_HEAD(&request->rq_history_list); + CFS_INIT_LIST_HEAD(&request->rq_exp_list); cfs_waitq_init(&request->rq_reply_waitq); request->rq_xid = ptlrpc_next_xid(); atomic_set(&request->rq_refcount, 1); @@ -771,8 +778,6 @@ void ptlrpc_set_add_req(struct ptlrpc_request_set *set, list_add_tail(&req->rq_set_chain, &set->set_requests); req->rq_set = set; set->set_remaining++; - - atomic_inc(&req->rq_import->imp_inflight); } /** @@ -878,12 +883,12 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) /* serialise with network callback */ spin_lock(&req->rq_lock); - if (req->rq_replied) + if (ptlrpc_client_replied(req)) GOTO(out, rc = 1); if (req->rq_net_err && !req->rq_timedout) { spin_unlock(&req->rq_lock); - rc = ptlrpc_expire_one_request(req); + rc = ptlrpc_expire_one_request(req, 0); spin_lock(&req->rq_lock); GOTO(out, rc); } @@ -897,7 +902,7 @@ static int ptlrpc_check_reply(struct ptlrpc_request *req) if (req->rq_restart) GOTO(out, rc = 1); - if (req->rq_early) { + if (ptlrpc_client_early(req)) { ptlrpc_at_recv_early_reply(req); GOTO(out, rc = 0); /* keep waiting */ } @@ -1062,7 +1067,7 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) if (req->rq_sent && (req->rq_sent > cfs_time_current_sec())) RETURN (0); - req->rq_phase = RQ_PHASE_RPC; + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); imp = req->rq_import; spin_lock(&imp->imp_lock); @@ -1070,18 +1075,17 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) req->rq_import_generation = imp->imp_generation; if (ptlrpc_import_delay_req(imp, req, &rc)) { - spin_lock (&req->rq_lock); + spin_lock(&req->rq_lock); req->rq_waiting = 1; - spin_unlock (&req->rq_lock); + spin_unlock(&req->rq_lock); DEBUG_REQ(D_HA, req, "req from PID %d waiting for recovery: " - "(%s != %s)", - lustre_msg_get_status(req->rq_reqmsg) , + "(%s != %s)", lustre_msg_get_status(req->rq_reqmsg), ptlrpc_import_state_name(req->rq_send_state), ptlrpc_import_state_name(imp->imp_state)); - LASSERT(list_empty (&req->rq_list)); - + LASSERT(list_empty(&req->rq_list)); list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&req->rq_import->imp_inflight); spin_unlock(&imp->imp_lock); RETURN(0); } @@ -1089,13 +1093,13 @@ static int ptlrpc_send_new_req(struct ptlrpc_request *req) if (rc != 0) { spin_unlock(&imp->imp_lock); req->rq_status = rc; - req->rq_phase = RQ_PHASE_INTERPRET; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); RETURN(rc); } - /* XXX this is the same as ptlrpc_queue_wait */ LASSERT(list_empty(&req->rq_list)); list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&req->rq_import->imp_inflight); spin_unlock(&imp->imp_lock); lustre_msg_set_status(req->rq_reqmsg, 
cfs_curproc_pid()); @@ -1147,6 +1151,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) ptlrpc_send_new_req(req)) { force_timer_recalc = 1; } + /* delayed send - skip */ if (req->rq_phase == RQ_PHASE_NEW && req->rq_sent) continue; @@ -1154,50 +1159,84 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) if (!(req->rq_phase == RQ_PHASE_RPC || req->rq_phase == RQ_PHASE_BULK || req->rq_phase == RQ_PHASE_INTERPRET || + req->rq_phase == RQ_PHASE_UNREGISTERING || req->rq_phase == RQ_PHASE_COMPLETE)) { DEBUG_REQ(D_ERROR, req, "bad phase %x", req->rq_phase); LBUG(); } + if (req->rq_phase == RQ_PHASE_UNREGISTERING) { + LASSERT(req->rq_next_phase != req->rq_phase); + LASSERT(req->rq_next_phase != RQ_PHASE_UNDEFINED); + + /* + * Skip processing until reply is unlinked. We + * can't return to pool before that and we can't + * call interpret before that. We need to make + * sure that all rdma transfers finished and will + * not corrupt any data. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + + /* + * Turn fail_loc off to prevent it from looping + * forever. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK, + OBD_FAIL_ONCE); + } + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK)) { + OBD_FAIL_CHECK_ORSET(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK, + OBD_FAIL_ONCE); + } + + /* + * Move to next phase if reply was successfully + * unlinked. + */ + ptlrpc_rqphase_move(req, req->rq_next_phase); + } + if (req->rq_phase == RQ_PHASE_COMPLETE) continue; if (req->rq_phase == RQ_PHASE_INTERPRET) GOTO(interpret, req->rq_status); - if (req->rq_net_err && !req->rq_timedout) - ptlrpc_expire_one_request(req); + /* + * Note that this also will start async reply unlink. + */ + if (req->rq_net_err && !req->rq_timedout) { + ptlrpc_expire_one_request(req, 1); + + /* + * Check if we still need to wait for unlink. + */ + if (ptlrpc_client_recv_or_unlink(req) || + ptlrpc_client_bulk_active(req)) + continue; + } if (req->rq_err) { - ptlrpc_unregister_reply(req); req->rq_replied = 0; if (req->rq_status == 0) req->rq_status = -EIO; - req->rq_phase = RQ_PHASE_INTERPRET; - - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); - + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); GOTO(interpret, req->rq_status); } /* ptlrpc_queue_wait->l_wait_event guarantees that rq_intr - * will only be set after rq_timedout, but the oig waiting - * path sets rq_intr irrespective of whether ptlrpcd has - * seen a timeout. our policy is to only interpret + * will only be set after rq_timedout, but the synchronous IO + * waiting path sets rq_intr irrespective of whether ptlrpcd + * has seen a timeout. 
our policy is to only interpret * interrupted rpcs after they have timed out */ if (req->rq_intr && (req->rq_timedout || req->rq_waiting || req->rq_wait_ctx)) { - /* NB could be on delayed list */ - ptlrpc_unregister_reply(req); req->rq_status = -EINTR; - req->rq_phase = RQ_PHASE_INTERPRET; - - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); - + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); GOTO(interpret, req->rq_status); } @@ -1206,7 +1245,8 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) req->rq_waiting || req->rq_wait_ctx) { int status; - ptlrpc_unregister_reply(req); + if (!ptlrpc_unregister_reply(req, 1)) + continue; spin_lock(&imp->imp_lock); @@ -1215,19 +1255,22 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) continue; } - list_del_init(&req->rq_list); if (status != 0) { req->rq_status = status; - req->rq_phase = RQ_PHASE_INTERPRET; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); spin_unlock(&imp->imp_lock); GOTO(interpret, req->rq_status); } if (req->rq_no_resend && !req->rq_wait_ctx) { req->rq_status = -ENOTCONN; - req->rq_phase = RQ_PHASE_INTERPRET; + ptlrpc_rqphase_move(req, + RQ_PHASE_INTERPRET); spin_unlock(&imp->imp_lock); GOTO(interpret, req->rq_status); } + + list_del_init(&req->rq_list); list_add_tail(&req->rq_list, &imp->imp_sending_list); @@ -1242,11 +1285,13 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); if (req->rq_bulk) { - __u64 old_xid = req->rq_xid; + __u64 old_xid; - ptlrpc_unregister_bulk (req); + if (!ptlrpc_unregister_bulk(req, 1)) + continue; /* ensure previous bulk fails */ + old_xid = req->rq_xid; req->rq_xid = ptlrpc_next_xid(); CDEBUG(D_HA, "resend bulk " "old x"LPU64 @@ -1262,6 +1307,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) if (status) { if (req->rq_err) { req->rq_status = status; + req->rq_wait_ctx = 0; force_timer_recalc = 1; } else { req->rq_wait_ctx = 1; @@ -1285,36 +1331,33 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) spin_lock(&req->rq_lock); - if (req->rq_early) { + if (ptlrpc_client_early(req)) { ptlrpc_at_recv_early_reply(req); spin_unlock(&req->rq_lock); continue; } /* Still waiting for a reply? */ - if (req->rq_receiving_reply) { + if (ptlrpc_client_recv(req)) { spin_unlock(&req->rq_lock); continue; } /* Did we actually receive a reply? */ - if (!req->rq_replied) { + if (!ptlrpc_client_replied(req)) { spin_unlock(&req->rq_lock); continue; } spin_unlock(&req->rq_lock); - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); - req->rq_status = after_reply(req); if (req->rq_resend) { /* Add this req to the delayed list so it can be errored if the import is evicted after recovery. */ spin_lock(&imp->imp_lock); + list_del_init(&req->rq_list); list_add_tail(&req->rq_list, &imp->imp_delayed_list); spin_unlock(&imp->imp_lock); @@ -1323,19 +1366,19 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) /* If there is no bulk associated with this request, * then we're done and should let the interpreter - * process the reply. Similarly if the RPC returned + * process the reply. Similarly if the RPC returned * an error, and therefore the bulk will never arrive. 
*/ if (req->rq_bulk == NULL || req->rq_status != 0) { - req->rq_phase = RQ_PHASE_INTERPRET; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); GOTO(interpret, req->rq_status); } - req->rq_phase = RQ_PHASE_BULK; + ptlrpc_rqphase_move(req, RQ_PHASE_BULK); } LASSERT(req->rq_phase == RQ_PHASE_BULK); - if (ptlrpc_bulk_active(req->rq_bulk)) + if (ptlrpc_client_bulk_active(req)) continue; if (!req->rq_bulk->bd_success) { @@ -1347,24 +1390,31 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) LBUG(); } - req->rq_phase = RQ_PHASE_INTERPRET; + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); interpret: LASSERT(req->rq_phase == RQ_PHASE_INTERPRET); - LASSERT(!req->rq_receiving_reply); - ptlrpc_unregister_reply(req); - if (req->rq_bulk != NULL) - ptlrpc_unregister_bulk (req); + /* This moves the request to the "unregistering" phase; we need + * to wait for reply unlink. */ + if (!ptlrpc_unregister_reply(req, 1)) + continue; + + if (!ptlrpc_unregister_bulk(req, 1)) + continue; + + /* By the time we call interpret, receiving should already be + * finished. */ + LASSERT(!req->rq_receiving_reply); if (req->rq_interpret_reply != NULL) { ptlrpc_interpterer_t interpreter = req->rq_interpret_reply; - req->rq_status = interpreter(NULL, req, + req->rq_status = interpreter(env, req, &req->rq_async_args, req->rq_status); } - req->rq_phase = RQ_PHASE_COMPLETE; + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); CDEBUG(D_RPCTRACE, "Completed RPC pname:cluuid:pid:xid:nid:" "opc %s:%s:%d:"LPU64":%s:%d\n", cfs_curproc_comm(), @@ -1373,7 +1423,17 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) libcfs_nid2str(imp->imp_connection->c_peer.nid), lustre_msg_get_opc(req->rq_reqmsg)); - atomic_dec(&imp->imp_inflight); + spin_lock(&imp->imp_lock); + /* The request may no longer be on the sending or delayed list. + * This may happen when it was marked erroneous because + * ptlrpc_import_delay_req(req, status) found it impossible to + * allow sending this rpc and returned *status != 0. */ + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + spin_unlock(&imp->imp_lock); + set->set_remaining--; cfs_waitq_signal(&imp->imp_recovery_waitq); } @@ -1383,7 +1443,7 @@ int ptlrpc_check_set(const struct lu_env *env, struct ptlrpc_request_set *set) } /* Return 1 if we should give up, else 0 */ -int ptlrpc_expire_one_request(struct ptlrpc_request *req) +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink) { struct obd_import *imp = req->rq_import; int rc = 0; @@ -1412,14 +1472,12 @@ int ptlrpc_expire_one_request(struct ptlrpc_request *req) req->rq_timedout = 1; spin_unlock(&req->rq_lock); - ptlrpc_unregister_reply (req); + ptlrpc_unregister_reply(req, async_unlink); + ptlrpc_unregister_bulk(req, async_unlink); if (obd_dump_on_timeout) libcfs_debug_dumplog(); - if (req->rq_bulk != NULL) - ptlrpc_unregister_bulk (req); - if (imp == NULL) { DEBUG_REQ(D_HA, req, "NULL import: already cleaned up?"); RETURN(1); } @@ -1465,28 +1523,36 @@ int ptlrpc_expired_set(void *data) LASSERT(set != NULL); - /* A timeout expired; see which reqs it applies to... */ + /* + * A timeout expired. See which reqs it applies to... + */ list_for_each (tmp, &set->set_requests) { struct ptlrpc_request *req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); - /* request in-flight?
*/ - if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting && - !req->rq_resend) || - (req->rq_phase == RQ_PHASE_BULK))) + /* don't expire request waiting for context */ + if (req->rq_wait_ctx) continue; - if (req->rq_timedout || /* already dealt with */ - req->rq_deadline > now) /* not expired */ + /* Request in-flight? */ + if (!((req->rq_phase == RQ_PHASE_RPC && + !req->rq_waiting && !req->rq_resend) || + (req->rq_phase == RQ_PHASE_BULK))) + continue; + + if (req->rq_timedout || /* already dealt with */ + req->rq_deadline > now) /* not expired */ continue; - /* deal with this guy */ - ptlrpc_expire_one_request (req); + /* Deal with this guy. Do it asynchronously so as not to block + * the ptlrpcd thread. */ + ptlrpc_expire_one_request(req, 1); } - /* When waiting for a whole set, we always to break out of the + /* + * When waiting for a whole set, we always break out of the * sleep so we can recalculate the timeout, or enable interrupts - * iff everyone's timed out. + * if everyone's timed out. */ RETURN(1); } @@ -1510,14 +1576,17 @@ void ptlrpc_interrupted_set(void *data) struct ptlrpc_request *req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); - if (req->rq_phase != RQ_PHASE_RPC) + if (req->rq_phase != RQ_PHASE_RPC && + req->rq_phase != RQ_PHASE_UNREGISTERING) continue; ptlrpc_mark_interrupted(req); } } -/* get the smallest timeout in the set; this does NOT set a timeout. */ +/** + * Get the smallest timeout in the set; this does NOT set a timeout. + */ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) { struct list_head *tmp; @@ -1532,16 +1601,24 @@ int ptlrpc_set_next_timeout(struct ptlrpc_request_set *set) list_for_each(tmp, &set->set_requests) { req = list_entry(tmp, struct ptlrpc_request, rq_set_chain); - /* request in-flight? */ - if (!((req->rq_phase == RQ_PHASE_RPC && !req->rq_waiting) || + /* + * Request in-flight? + */ + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || (req->rq_phase == RQ_PHASE_BULK) || (req->rq_phase == RQ_PHASE_NEW))) continue; - if (req->rq_timedout) /* already timed out */ + /* + * Already timed out. + */ + if (req->rq_timedout) continue; - if (req->rq_wait_ctx) /* waiting for ctx */ + /* + * Waiting for ctx. + */ + if (req->rq_wait_ctx) continue; if (req->rq_phase == RQ_PHASE_NEW) @@ -1642,6 +1719,7 @@ static void __ptlrpc_free_req(struct ptlrpc_request *request, int locked) LASSERTF(request->rq_rqbd == NULL, "req %p\n",request);/* client-side */ LASSERTF(list_empty(&request->rq_list), "req %p\n", request); LASSERTF(list_empty(&request->rq_set_chain), "req %p\n", request); + LASSERTF(list_empty(&request->rq_exp_list), "req %p\n", request); LASSERTF(!request->rq_replay, "req %p\n", request); LASSERT(request->rq_cli_ctx); @@ -1737,23 +1815,54 @@ EXPORT_SYMBOL(ptlrpc_req_xid); * IDEMPOTENT, but _not_ safe against concurrent callers. * The request owner (i.e. the thread doing the I/O) must call... */ -void ptlrpc_unregister_reply (struct ptlrpc_request *request) +int ptlrpc_unregister_reply(struct ptlrpc_request *request, int async) { int rc; cfs_waitq_t *wq; struct l_wait_info lwi; - LASSERT(!in_interrupt ()); /* might sleep */ + /* + * Might sleep. + */ + LASSERT(!in_interrupt()); + + /* + * Let's set up a deadline for reply unlink. + */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_REPL_UNLINK) && + async && request->rq_reply_deadline == 0) + request->rq_reply_deadline = cfs_time_current_sec()+LONG_UNLINK; + + /* + * Nothing left to do.
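+ * The reply buffer is no longer posted to the network, so the + * request can be completed without any waiting.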
+ */ if (!ptlrpc_client_recv_or_unlink(request)) - /* Nothing left to do */ - return; + RETURN(1); - LNetMDUnlink (request->rq_reply_md_h); + LNetMDUnlink(request->rq_reply_md_h); - /* We have to l_wait_event() whatever the result, to give liblustre - * a chance to run reply_in_callback(), and to make sure we've - * unlinked before returning a req to the pool */ + /* + * Let's check it once again. + */ + if (!ptlrpc_client_recv_or_unlink(request)) + RETURN(1); + /* + * Move to "Unregistering" phase as reply was not unlinked yet. + */ + ptlrpc_rqphase_move(request, RQ_PHASE_UNREGISTERING); + + /* + * Do not wait for unlink to finish. + */ + if (async) + RETURN(0); + + /* + * We have to l_wait_event() whatever the result, to give liblustre + * a chance to run reply_in_callback(), and to make sure we've + * unlinked before returning a req to the pool. + */ if (request->rq_set != NULL) wq = &request->rq_set->set_waitq; else @@ -1762,17 +1871,21 @@ void ptlrpc_unregister_reply (struct ptlrpc_request *request) for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL); - rc = l_wait_event (*wq, !ptlrpc_client_recv_or_unlink(request), - &lwi); - if (rc == 0) - return; - - LASSERT (rc == -ETIMEDOUT); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_recv_or_unlink(request), + &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(request, request->rq_next_phase); + RETURN(1); + } + + LASSERT(rc == -ETIMEDOUT); DEBUG_REQ(D_WARNING, request, "Unexpectedly long timeout " "rvcng=%d unlnk=%d", request->rq_receiving_reply, request->rq_must_unlink); } + RETURN(0); } /* caller must hold imp->imp_lock */ @@ -1795,7 +1908,6 @@ void ptlrpc_free_committed(struct obd_import *imp) EXIT; return; } - CDEBUG(D_RPCTRACE, "%s: committing for last_committed "LPU64" gen %d\n", imp->imp_obd->obd_name, imp->imp_peer_committed_transno, imp->imp_generation); @@ -1866,7 +1978,7 @@ void ptlrpc_resend_req(struct ptlrpc_request *req) CDEBUG(D_HA, "resend bulk old x"LPU64" new x"LPU64"\n", old_xid, req->rq_xid); } - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); } @@ -1879,7 +1991,7 @@ void ptlrpc_restart_req(struct ptlrpc_request *req) spin_lock(&req->rq_lock); req->rq_restart = 1; req->rq_timedout = 0; - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); } @@ -1888,15 +2000,19 @@ static int expired_request(void *data) struct ptlrpc_request *req = data; ENTRY; - /* some failure can suspend regular timeouts */ + /* + * Some failure can suspend regular timeouts. + */ if (ptlrpc_check_suspend()) RETURN(1); - /* deadline may have changed with an early reply */ + /* + * Deadline may have changed with an early reply. 
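+ * An early reply extends rq_deadline through the adaptive timeout + * machinery, so re-check it before expiring the request.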
+ */ if (req->rq_deadline > cfs_time_current_sec()) RETURN(1); - RETURN(ptlrpc_expire_one_request(req)); + RETURN(ptlrpc_expire_one_request(req, 0)); } static void interrupted_request(void *data) @@ -1973,7 +2089,6 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) LASSERT(req->rq_set == NULL); LASSERT(!req->rq_receiving_reply); - atomic_inc(&imp->imp_inflight); /* for distributed debugging */ lustre_msg_set_status(req->rq_reqmsg, cfs_curproc_pid()); @@ -1986,15 +2101,15 @@ int ptlrpc_queue_wait(struct ptlrpc_request *req) lustre_msg_get_opc(req->rq_reqmsg)); /* Mark phase here for a little debug help */ - req->rq_phase = RQ_PHASE_RPC; + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); spin_lock(&imp->imp_lock); req->rq_import_generation = imp->imp_generation; restart: if (ptlrpc_import_delay_req(imp, req, &rc)) { - list_del(&req->rq_list); - + list_del_init(&req->rq_list); list_add_tail(&req->rq_list, &imp->imp_delayed_list); + atomic_inc(&imp->imp_inflight); spin_unlock(&imp->imp_lock); DEBUG_REQ(D_HA, req, "\"%s\" waiting for recovery: (%s != %s)", @@ -2014,6 +2129,7 @@ restart: spin_lock(&imp->imp_lock); list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); if (req->rq_err) { /* rq_status was set locally */ @@ -2032,7 +2148,6 @@ restart: } if (rc != 0) { - list_del_init(&req->rq_list); spin_unlock(&imp->imp_lock); req->rq_status = rc; // XXX this ok? GOTO(out, rc); @@ -2042,7 +2157,7 @@ restart: lustre_msg_add_flags(req->rq_reqmsg, MSG_RESENT); if (req->rq_bulk != NULL) { - ptlrpc_unregister_bulk (req); + ptlrpc_unregister_bulk(req, 0); /* bulk requests are supposed to be * idempotent, so we are free to bump the xid @@ -2060,6 +2175,7 @@ restart: /* XXX this is the same as ptlrpc_set_wait */ LASSERT(list_empty(&req->rq_list)); list_add_tail(&req->rq_list, &imp->imp_sending_list); + atomic_inc(&imp->imp_inflight); spin_unlock(&imp->imp_lock); rc = sptlrpc_req_refresh_ctx(req, 0); @@ -2070,9 +2186,11 @@ restart: */ spin_lock(&imp->imp_lock); list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); spin_unlock(&imp->imp_lock); - CERROR("Failed to refresh ctx of req %p: %d\n", req, rc); + CERROR("Failed to refresh ctx of req %p: %d\n", + req, rc); GOTO(out, rc); } /* simulating we got error during send rpc */ @@ -2082,7 +2200,6 @@ restart: rc = ptl_send_rpc(req, 0); if (rc) DEBUG_REQ(D_HA, req, "send failed (%d); recovering", rc); - repeat: timeoutl = req->rq_deadline - cfs_time_current_sec(); timeout = (timeoutl <= 0 || rc) ? CFS_TICK : @@ -2104,15 +2221,15 @@ after_send: libcfs_nid2str(imp->imp_connection->c_peer.nid), lustre_msg_get_opc(req->rq_reqmsg)); - spin_lock(&imp->imp_lock); - list_del_init(&req->rq_list); - spin_unlock(&imp->imp_lock); - /* If the reply was received normally, this just grabs the spinlock * (ensuring the reply callback has returned), sees that * req->rq_receiving_reply is clear and returns. */ - ptlrpc_unregister_reply (req); + ptlrpc_unregister_reply(req, 0); + spin_lock(&imp->imp_lock); + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + spin_unlock(&imp->imp_lock); if (req->rq_err) { DEBUG_REQ(D_RPCTRACE, req, "err rc=%d status=%d", @@ -2141,7 +2258,7 @@ after_send: GOTO(out, rc = -ETIMEDOUT); } - if (!req->rq_replied) { + if (!ptlrpc_client_replied(req)) { /* How can this be? -eeb */ DEBUG_REQ(D_ERROR, req, "!rq_replied: "); LBUG(); @@ -2165,7 +2282,7 @@ after_send: * me. 
*/ lwi = LWI_TIMEOUT(timeout, NULL, NULL); brc = l_wait_event(req->rq_reply_waitq, - !ptlrpc_bulk_active(req->rq_bulk), + !ptlrpc_client_bulk_active(req), &lwi); LASSERT(brc == 0 || brc == -ETIMEDOUT); if (brc != 0) { @@ -2178,13 +2295,11 @@ after_send: } } if (rc < 0) - ptlrpc_unregister_bulk (req); + ptlrpc_unregister_bulk(req, 0); } LASSERT(!req->rq_receiving_reply); - req->rq_phase = RQ_PHASE_INTERPRET; - - atomic_dec(&imp->imp_inflight); + ptlrpc_rqphase_move(req, RQ_PHASE_INTERPRET); cfs_waitq_signal(&imp->imp_recovery_waitq); RETURN(rc); } @@ -2196,7 +2311,7 @@ struct ptlrpc_replay_async_args { static int ptlrpc_replay_interpret(const struct lu_env *env, struct ptlrpc_request *req, - void * data, int rc) + void * data, int rc) { struct ptlrpc_replay_async_args *aa = data; struct obd_import *imp = req->rq_import; @@ -2204,7 +2319,7 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, ENTRY; atomic_dec(&imp->imp_replay_inflight); - if (!req->rq_replied) { + if (!ptlrpc_client_replied(req)) { CERROR("request replay timed out, restarting recovery\n"); GOTO(out, rc = -ETIMEDOUT); } @@ -2224,7 +2339,7 @@ static int ptlrpc_replay_interpret(const struct lu_env *env, if (req->rq_replay_cb) req->rq_replay_cb(req); - if (req->rq_replied && + if (ptlrpc_client_replied(req) && lustre_msg_get_status(req->rq_repmsg) != aa->praa_old_status) { DEBUG_REQ(D_ERROR, req, "status %d, old was %d", lustre_msg_get_status(req->rq_repmsg), @@ -2274,6 +2389,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) aa->praa_old_state = req->rq_send_state; req->rq_send_state = LUSTRE_IMP_REPLAY; req->rq_phase = RQ_PHASE_NEW; + req->rq_next_phase = RQ_PHASE_UNDEFINED; if (req->rq_repmsg) aa->praa_old_status = lustre_msg_get_status(req->rq_repmsg); req->rq_status = 0; @@ -2286,7 +2402,7 @@ int ptlrpc_replay_req(struct ptlrpc_request *req) atomic_inc(&req->rq_import->imp_replay_inflight); ptlrpc_request_addref(req); /* ptlrpcd needs a ref */ - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } @@ -2315,7 +2431,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; req->rq_status = -EINTR; - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); } spin_unlock (&req->rq_lock); } @@ -2330,7 +2446,7 @@ void ptlrpc_abort_inflight(struct obd_import *imp) if (req->rq_import_generation < imp->imp_generation) { req->rq_err = 1; req->rq_status = -EINTR; - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); } spin_unlock (&req->rq_lock); } @@ -2347,24 +2463,24 @@ void ptlrpc_abort_inflight(struct obd_import *imp) void ptlrpc_abort_set(struct ptlrpc_request_set *set) { - struct list_head *tmp, *n; + struct list_head *tmp, *pos; LASSERT(set != NULL); - list_for_each_safe(tmp, n, &set->set_requests) { + list_for_each_safe(pos, tmp, &set->set_requests) { struct ptlrpc_request *req = - list_entry(tmp, struct ptlrpc_request, rq_set_chain); + list_entry(pos, struct ptlrpc_request, rq_set_chain); - spin_lock (&req->rq_lock); + spin_lock(&req->rq_lock); if (req->rq_phase != RQ_PHASE_RPC) { - spin_unlock (&req->rq_lock); + spin_unlock(&req->rq_lock); continue; } req->rq_err = 1; req->rq_status = -EINTR; - ptlrpc_wake_client_req(req); - spin_unlock (&req->rq_lock); + ptlrpc_client_wake_req(req); + spin_unlock(&req->rq_lock); } } diff --git a/lustre/ptlrpc/connection.c b/lustre/ptlrpc/connection.c index 9442bf7..fb86c6c 100644 --- a/lustre/ptlrpc/connection.c +++ b/lustre/ptlrpc/connection.c @@ -143,7 +143,7 @@ int 
ptlrpc_connection_init(void) { ENTRY; - conn_hash = lustre_hash_init("CONN_HASH", 32, 32768, + conn_hash = lustre_hash_init("CONN_HASH", 5, 15, &conn_hash_ops, LH_REHASH); if (!conn_hash) RETURN(-ENOMEM); diff --git a/lustre/ptlrpc/events.c b/lustre/ptlrpc/events.c index 7cca13f..3747f07 100644 --- a/lustre/ptlrpc/events.c +++ b/lustre/ptlrpc/events.c @@ -39,10 +39,12 @@ #ifndef __KERNEL__ # include #else +# include # ifdef __mips64__ # include # endif #endif + #include #include #include @@ -77,7 +79,7 @@ void request_out_callback(lnet_event_t *ev) req->rq_net_err = 1; spin_unlock(&req->rq_lock); - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); } ptlrpc_req_finished(req); @@ -161,7 +163,7 @@ void reply_in_callback(lnet_event_t *ev) out_wake: /* NB don't unlock till after wakeup; req can disappear under us * since we don't have our own ref */ - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); spin_unlock(&req->rq_lock); EXIT; } @@ -201,7 +203,7 @@ void client_bulk_callback (lnet_event_t *ev) /* NB don't unlock till after wakeup; desc can disappear under us * otherwise */ - ptlrpc_wake_client_req(desc->bd_req); + ptlrpc_client_wake_req(desc->bd_req); spin_unlock(&desc->bd_lock); EXIT; @@ -341,7 +343,9 @@ void reply_out_callback(lnet_event_t *ev) * until ptlrpc_server_handle_reply() is done with it */ spin_lock(&svc->srv_lock); rs->rs_on_net = 0; - ptlrpc_schedule_difficult_reply (rs); + if (!rs->rs_no_ack || + rs->rs_transno <= rs->rs_export->exp_obd->obd_last_committed) + ptlrpc_schedule_difficult_reply (rs); spin_unlock(&svc->srv_lock); } @@ -677,14 +681,14 @@ void liblustre_wait_idle(void) { static int recursed = 0; - + struct list_head *tmp; struct liblustre_wait_callback *llwc; int idle = 0; LASSERT(!recursed); recursed = 1; - + do { liblustre_wait_event(0); @@ -693,13 +697,13 @@ liblustre_wait_idle(void) list_for_each(tmp, &liblustre_idle_callbacks) { llwc = list_entry(tmp, struct liblustre_wait_callback, llwc_list); - + if (!llwc->llwc_fn(llwc->llwc_arg)) { idle = 0; break; } } - + } while (!idle); recursed = 0; @@ -720,11 +724,12 @@ int ptlrpc_init_portals(void) liblustre_register_wait_callback("liblustre_check_services", &liblustre_check_services, NULL); + init_completion_module(liblustre_wait_event); #endif rc = ptlrpcd_addref(); if (rc == 0) return 0; - + CERROR("rpcd initialisation failed\n"); #ifndef __KERNEL__ liblustre_deregister_wait_callback(liblustre_services_callback); diff --git a/lustre/ptlrpc/gss/gss_cli_upcall.c b/lustre/ptlrpc/gss/gss_cli_upcall.c index 196fb9f..f3df6d2 100644 --- a/lustre/ptlrpc/gss/gss_cli_upcall.c +++ b/lustre/ptlrpc/gss/gss_cli_upcall.c @@ -248,8 +248,8 @@ int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count) int rc; if (count != sizeof(param)) { - CERROR("ioctl size %lu, expect %lu, please check lgssd version\n", - count, (unsigned long) sizeof(param)); + CERROR("ioctl size %lu, expect %lu, please check lgss_keyring " + "version\n", count, (unsigned long) sizeof(param)); RETURN(-EINVAL); } if (copy_from_user(¶m, buffer, sizeof(param))) { @@ -275,11 +275,40 @@ int gss_do_ctx_init_rpc(__user char *buffer, unsigned long count) RETURN(-EINVAL); } + if (unlikely(!obd->obd_set_up)) { + CERROR("obd %s not setup\n", obdname); + RETURN(-EINVAL); + } + + spin_lock(&obd->obd_dev_lock); + if (obd->obd_stopping) { + CERROR("obd %s has stopped\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) && + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) 
&& + strcmp(obd->obd_type->typ_name, LUSTRE_MGC_NAME)) { + CERROR("obd %s is not a client device\n", obdname); + spin_unlock(&obd->obd_dev_lock); + RETURN(-EINVAL); + } + spin_unlock(&obd->obd_dev_lock); + + down_read(&obd->u.cli.cl_sem); + if (obd->u.cli.cl_import == NULL) { + CERROR("import has gone\n"); + RETURN(-EINVAL); + } imp = class_import_get(obd->u.cli.cl_import); - LASSERT(imp->imp_sec); + up_read(&obd->u.cli.cl_sem); - /* force this import to use v2 msg */ - imp->imp_msg_magic = LUSTRE_MSG_MAGIC_V2; + if (imp->imp_deactive) { + CERROR("import has been deactivated\n"); + class_import_put(imp); + RETURN(-EINVAL); + } req = ptlrpc_request_alloc_pack(imp, &RQF_SEC_CTX, LUSTRE_OBD_VERSION, SEC_CTX_INIT); diff --git a/lustre/ptlrpc/gss/gss_internal.h b/lustre/ptlrpc/gss/gss_internal.h index 4725059..afbb6144 100644 --- a/lustre/ptlrpc/gss/gss_internal.h +++ b/lustre/ptlrpc/gss/gss_internal.h @@ -106,9 +106,9 @@ enum ptlrpc_gss_proc { }; enum ptlrpc_gss_tgt { - LUSTRE_GSS_TGT_MDS = 0, - LUSTRE_GSS_TGT_OSS = 1, - LUSTRE_GSS_TGT_MGS = 2, + LUSTRE_GSS_TGT_MGS = 0, + LUSTRE_GSS_TGT_MDS = 1, + LUSTRE_GSS_TGT_OSS = 2, }; enum ptlrpc_gss_header_flags { @@ -121,6 +121,8 @@ __u32 import_to_gss_svc(struct obd_import *imp) { const char *name = imp->imp_obd->obd_type->typ_name; + if (!strcmp(name, LUSTRE_MGC_NAME)) + return LUSTRE_GSS_TGT_MGS; if (!strcmp(name, LUSTRE_MDC_NAME)) return LUSTRE_GSS_TGT_MDS; if (!strcmp(name, LUSTRE_OSC_NAME)) diff --git a/lustre/ptlrpc/gss/gss_keyring.c b/lustre/ptlrpc/gss/gss_keyring.c index 8301482..74c786d 100644 --- a/lustre/ptlrpc/gss/gss_keyring.c +++ b/lustre/ptlrpc/gss/gss_keyring.c @@ -1236,7 +1236,9 @@ int gss_kt_instantiate(struct key *key, const void *data, size_t datalen) */ LASSERT(cfs_current()->signal->session_keyring); + lockdep_off(); rc = key_link(cfs_current()->signal->session_keyring, key); + lockdep_on(); if (unlikely(rc)) { CERROR("failed to link key %08x to keyring %08x: %d\n", key->serial, @@ -1267,13 +1269,13 @@ int gss_kt_update(struct key *key, const void *data, size_t datalen) RETURN(-EINVAL); } - /* there's a race between userspace parent - child processes. if - * child finish negotiation too fast and call kt_update(), the ctx + /* if the upcall finished negotiation too fast (most likely because + * a local error happened) and called kt_update(), the ctx * might be still NULL. but the key will finally be associate * with a context, or be revoked. if key status is fine, return * -EAGAIN to allow userspace sleep a while and call again. */ if (ctx == NULL) { - CWARN("race in userspace.
key %p(%x) flags %lx\n", + CDEBUG(D_SEC, "update too soon: key %p(%x) flags %lx\n", key, key->serial, key->flags); rc = key_validate(key); diff --git a/lustre/ptlrpc/gss/gss_krb5.h b/lustre/ptlrpc/gss/gss_krb5.h index fa5db1a..feddfc5 100644 --- a/lustre/ptlrpc/gss/gss_krb5.h +++ b/lustre/ptlrpc/gss/gss_krb5.h @@ -49,8 +49,6 @@ #ifndef PTLRPC_GSS_KRB5_H #define PTLRPC_GSS_KRB5_H -extern spinlock_t krb5_seq_lock; - /* * RFC 4142 */ diff --git a/lustre/ptlrpc/gss/gss_krb5_mech.c b/lustre/ptlrpc/gss/gss_krb5_mech.c index 8d68ff5..a9a5388 100644 --- a/lustre/ptlrpc/gss/gss_krb5_mech.c +++ b/lustre/ptlrpc/gss/gss_krb5_mech.c @@ -77,7 +77,7 @@ #include "gss_asn1.h" #include "gss_krb5.h" -spinlock_t krb5_seq_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t krb5_seq_lock; struct krb5_enctype { char *ke_dispname; @@ -1326,6 +1326,8 @@ int __init init_kerberos_module(void) { int status; + spin_lock_init(&krb5_seq_lock); + status = lgss_mech_register(&gss_kerberos_mech); if (status) CERROR("Failed to register kerberos gss mechanism!\n"); diff --git a/lustre/ptlrpc/gss/gss_pipefs.c b/lustre/ptlrpc/gss/gss_pipefs.c index cf96883..ade0f53 100644 --- a/lustre/ptlrpc/gss/gss_pipefs.c +++ b/lustre/ptlrpc/gss/gss_pipefs.c @@ -1210,7 +1210,7 @@ int __init gss_init_pipefs_upcall(void) de_pipes[MECH_KRB5] = de; CFS_INIT_LIST_HEAD(&upcall_lists[MECH_KRB5]); - upcall_locks[MECH_KRB5] = SPIN_LOCK_UNLOCKED; + spin_lock_init(&upcall_locks[MECH_KRB5]); return 0; } diff --git a/lustre/ptlrpc/gss/gss_svc_upcall.c b/lustre/ptlrpc/gss/gss_svc_upcall.c index 321ebfd..e97ae44 100644 --- a/lustre/ptlrpc/gss/gss_svc_upcall.c +++ b/lustre/ptlrpc/gss/gss_svc_upcall.c @@ -74,7 +74,7 @@ #define GSS_SVC_UPCALL_TIMEOUT (20) -static spinlock_t __ctx_index_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t __ctx_index_lock; static __u64 __ctx_index; __u64 gss_get_next_ctx_index(void) @@ -1416,6 +1416,16 @@ int __init gss_init_svc_upcall(void) { int i; + spin_lock_init(&__ctx_index_lock); + /* + * this helps reduce context index conflicts. after a server reboot, + * conflicting requests from clients might be filtered out by initial + * sequence number checking, thus there is no chance to send an error + * notification back to clients. + */ + get_random_bytes(&__ctx_index, sizeof(__ctx_index)); + + cache_register(&rsi_cache); cache_register(&rsc_cache); @@ -1436,12 +1446,6 @@ int __init gss_init_svc_upcall(void) CWARN("Init channel is not opened by lsvcgssd, following " "request might be dropped until lsvcgssd is active\n"); - /* this helps reducing context index confliction. after server reboot, - * conflicting request from clients might be filtered out by initial - * sequence number checking, thus no chance to sent error notification - * back to clients.
*/ - get_random_bytes(&__ctx_index, sizeof(__ctx_index)); - return 0; } diff --git a/lustre/ptlrpc/gss/lproc_gss.c b/lustre/ptlrpc/gss/lproc_gss.c index 208bad8..6398661e 100644 --- a/lustre/ptlrpc/gss/lproc_gss.c +++ b/lustre/ptlrpc/gss/lproc_gss.c @@ -75,7 +75,6 @@ static struct { atomic_t oos_svc_replay[3]; /* server replay detected */ atomic_t oos_svc_pass[3]; /* server verified ok */ } gss_stat_oos = { - .oos_lock = SPIN_LOCK_UNLOCKED, .oos_cli_count = ATOMIC_INIT(0), .oos_cli_behind = 0, .oos_svc_replay = { ATOMIC_INIT(0), }, @@ -156,6 +155,8 @@ int gss_init_lproc(void) struct proc_dir_entry *ent; int rc; + spin_lock_init(&gss_stat_oos.oos_lock); + gss_proc_root = lprocfs_register("gss", sptlrpc_proc_root, gss_lprocfs_vars, NULL); diff --git a/lustre/ptlrpc/gss/sec_gss.c b/lustre/ptlrpc/gss/sec_gss.c index 1229091..f3aae3f 100644 --- a/lustre/ptlrpc/gss/sec_gss.c +++ b/lustre/ptlrpc/gss/sec_gss.c @@ -1087,7 +1087,7 @@ int gss_sec_create_common(struct gss_sec *gsec, sec->ps_id = sptlrpc_get_next_secid(); sec->ps_flvr = *sf; sec->ps_import = class_import_get(imp); - sec->ps_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sec->ps_lock); CFS_INIT_LIST_HEAD(&sec->ps_gc_list); if (!svcctx) { diff --git a/lustre/ptlrpc/import.c b/lustre/ptlrpc/import.c index b575270..5021179 100644 --- a/lustre/ptlrpc/import.c +++ b/lustre/ptlrpc/import.c @@ -151,12 +151,11 @@ int ptlrpc_set_import_discon(struct obd_import *imp, __u32 conn_cnt) libcfs_nid2str(imp->imp_connection->c_peer.nid)); } else { LCONSOLE_ERROR_MSG(0x166, "%s: Connection to service " - "%.*s via nid %s was lost; in progress" - "operations using this service will" - "fail.\n", - imp->imp_obd->obd_name, - target_len, target_start, - libcfs_nid2str(imp->imp_connection->c_peer.nid)); + "%.*s via nid %s was lost; in progress " + "operations using this service will fail.\n", + imp->imp_obd->obd_name, + target_len, target_start, + libcfs_nid2str(imp->imp_connection->c_peer.nid)); } ptlrpc_deactivate_timeouts(imp); IMPORT_SET_STATE_NOLOCK(imp, LUSTRE_IMP_DISCON); @@ -207,6 +206,46 @@ void ptlrpc_deactivate_import(struct obd_import *imp) ptlrpc_deactivate_and_unlock_import(imp); } +static unsigned int +ptlrpc_inflight_deadline(struct ptlrpc_request *req, time_t now) +{ + long dl; + + if (!(((req->rq_phase == RQ_PHASE_RPC) && !req->rq_waiting) || + (req->rq_phase == RQ_PHASE_BULK) || + (req->rq_phase == RQ_PHASE_NEW))) + return 0; + + if (req->rq_timedout) + return 0; + + if (req->rq_phase == RQ_PHASE_NEW) + dl = req->rq_sent; + else + dl = req->rq_deadline; + + if (dl <= now) + return 0; + + return dl - now; +} + +static unsigned int ptlrpc_inflight_timeout(struct obd_import *imp) +{ + time_t now = cfs_time_current_sec(); + struct list_head *tmp, *n; + struct ptlrpc_request *req; + unsigned int timeout = 0; + + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, rq_list); + timeout = max(ptlrpc_inflight_deadline(req, now), timeout); + } + spin_unlock(&imp->imp_lock); + return timeout; +} + /* * This function will invalidate the import, if necessary, then block * for all the RPC completions, and finally notify the obd to @@ -218,6 +257,7 @@ void ptlrpc_invalidate_import(struct obd_import *imp) struct list_head *tmp, *n; struct ptlrpc_request *req; struct l_wait_info lwi; + unsigned int timeout; int rc; atomic_inc(&imp->imp_inval_count); @@ -234,32 +274,78 @@ void ptlrpc_invalidate_import(struct obd_import *imp) LASSERT(imp->imp_invalid); - /* wait for all requests to 
error out and call completion callbacks. - Cap it at obd_timeout -- these should all have been locally - cancelled by ptlrpc_abort_inflight. */ - lwi = LWI_TIMEOUT_INTERVAL( - cfs_timeout_cap(cfs_time_seconds(obd_timeout)), - cfs_time_seconds(1), NULL, NULL); - rc = l_wait_event(imp->imp_recovery_waitq, - (atomic_read(&imp->imp_inflight) == 0), &lwi); + /* Wait forever until inflight == 0. We really can't do it another + * way because in some cases we need to wait for a very long reply + * unlink. We can't do anything before that because there is really + * no guarantee that some RDMA transfer is not in progress right now. */ + do { + /* Calculate the max timeout for waiting on rpcs to error + * out. Use obd_timeout if the calculated value is zero. */ + timeout = ptlrpc_inflight_timeout(imp); + timeout += timeout / 3; + + if (timeout == 0) + timeout = obd_timeout; + + CDEBUG(D_RPCTRACE, "Sleeping %d sec for inflight to error out\n", + timeout); + + /* Wait for all requests to error out and call completion + * callbacks. Cap it at obd_timeout -- these should all + * have been locally cancelled by ptlrpc_abort_inflight. */ + lwi = LWI_TIMEOUT_INTERVAL( + cfs_timeout_cap(cfs_time_seconds(timeout)), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(imp->imp_recovery_waitq, + (atomic_read(&imp->imp_inflight) == 0), &lwi); + if (rc) { + const char *cli_tgt = obd2cli_tgt(imp->imp_obd); - if (rc) { - CERROR("%s: rc = %d waiting for callback (%d != 0)\n", - obd2cli_tgt(imp->imp_obd), rc, - atomic_read(&imp->imp_inflight)); - spin_lock(&imp->imp_lock); - list_for_each_safe(tmp, n, &imp->imp_sending_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "still on sending list"); - } - list_for_each_safe(tmp, n, &imp->imp_delayed_list) { - req = list_entry(tmp, struct ptlrpc_request, rq_list); - DEBUG_REQ(D_ERROR, req, "still on delayed list"); - } - spin_unlock(&imp->imp_lock); - LASSERT(atomic_read(&imp->imp_inflight) == 0); - } + CERROR("%s: rc = %d waiting for callback (%d != 0)\n", + cli_tgt, rc, atomic_read(&imp->imp_inflight)); + spin_lock(&imp->imp_lock); + list_for_each_safe(tmp, n, &imp->imp_sending_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, "still on sending list"); + } + list_for_each_safe(tmp, n, &imp->imp_delayed_list) { + req = list_entry(tmp, struct ptlrpc_request, + rq_list); + DEBUG_REQ(D_ERROR, req, "still on delayed list"); + } + + if (atomic_read(&imp->imp_unregistering) == 0) { + /* We know that only "unregistering" rpcs may + * still survive in the sending or delayed lists + * (they are waiting for a long reply unlink on + * sluggish nets). Let's check this. If there + * is no unregistering and inflight != 0, this + * is a bug. */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); + + /* Let's save one loop as soon as inflight has + * dropped to zero. No new inflights are possible + * at this point. */ + rc = 0; + } else { + CERROR("%s: RPCs in \"%s\" phase found (%d). " + "Network is sluggish? Waiting for them " + "to error out.\n", cli_tgt, + ptlrpc_phase2str(RQ_PHASE_UNREGISTERING), + atomic_read(&imp->imp_unregistering)); + } + spin_unlock(&imp->imp_lock); + } + } while (rc != 0); + + /* + * Let's additionally check that no new rpcs were added to the import + * while in "invalidate" state.
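+ *
+ * Worked example of the loop above (illustrative only, not part of
+ * this change): if the furthest deadline among the sending-list rpcs
+ * is 30 seconds away, ptlrpc_inflight_timeout() returns 30 and the
+ * pass waits 30 + 30/3 = 40 seconds; if every rpc has already timed
+ * out it returns 0 and the wait falls back to obd_timeout.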
+ */ + LASSERT(atomic_read(&imp->imp_inflight) == 0); out: obd_import_event(imp->imp_obd, imp, IMP_EVENT_INVALIDATE); sptlrpc_import_flush_all_ctx(imp); @@ -569,11 +655,25 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) GOTO(out, rc); } + /* Report the rpc service time to the server so that it knows how long + * to wait for clients to join recovery */ + lustre_msg_set_service_time(request->rq_reqmsg, + at_timeout2est(request->rq_timeout)); + + /* The amount of time we give the server to process the connect req. + * import_select_connection will increase the net latency on + * repeated reconnect attempts to cover slow networks. + * We override/ignore the server rpc completion estimate here, + * which may be large if this is a reconnect attempt */ + request->rq_timeout = INITIAL_CONNECT_TIMEOUT; + lustre_msg_set_timeout(request->rq_reqmsg, request->rq_timeout); + #ifndef __KERNEL__ lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_LIBCLIENT); #endif lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_NEXT_VER); + request->rq_no_resend = request->rq_no_delay = 1; request->rq_send_state = LUSTRE_IMP_CONNECTING; /* Allow a slightly larger reply for future growth compatibility */ req_capsule_set_size(&request->rq_pill, &RMF_CONNECT_DATA, RCL_SERVER, @@ -594,10 +694,6 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) spin_unlock(&imp->imp_lock); lustre_msg_add_op_flags(request->rq_reqmsg, MSG_CONNECT_INITIAL); - if (AT_OFF) - /* AT will use INITIAL_CONNECT_TIMEOUT the first - time, adaptive after that. */ - request->rq_timeout = INITIAL_CONNECT_TIMEOUT; } if (set_transno) @@ -605,7 +701,7 @@ int ptlrpc_connect_import(struct obd_import *imp, char *new_uuid) MSG_CONNECT_TRANSNO); DEBUG_REQ(D_RPCTRACE, request, "(re)connect request"); - ptlrpcd_add_req(request); + ptlrpcd_add_req(request, PSCOPE_OTHER); rc = 0; out: if (rc != 0) { @@ -638,8 +734,8 @@ static void ptlrpc_maybe_ping_import_soon(struct obd_import *imp) * to have two identical connections in imp_conn_list. We must * compare not conn's pointers but NIDs, otherwise we can defeat * connection throttling. (See bug 14774.) 
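 *
 * (A hedged note, not part of the original comment: the assumption
 * behind the fix below is that c_self names the local end of a
 * connection and so cannot tell two remote targets apart, while
 * c_peer.nid identifies the remote end, which is what the throttling
 * check actually needs to compare.)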
*/ - if (imp->imp_conn_current->oic_conn->c_self != - imp_conn->oic_conn->c_self) { + if (imp->imp_conn_current->oic_conn->c_peer.nid != + imp_conn->oic_conn->c_peer.nid) { ptlrpc_ping_import_soon(imp); wake_pinger = 1; } @@ -1043,7 +1139,7 @@ out: static int completed_replay_interpret(const struct lu_env *env, struct ptlrpc_request *req, - void * data, int rc) + void * data, int rc) { ENTRY; atomic_dec(&req->rq_import->imp_replay_inflight); @@ -1081,7 +1177,7 @@ static int signal_completed_replay(struct obd_import *imp) req->rq_timeout *= 3; req->rq_interpret_reply = completed_replay_interpret; - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } diff --git a/lustre/ptlrpc/layout.c b/lustre/ptlrpc/layout.c index 562335c..764957e 100644 --- a/lustre/ptlrpc/layout.c +++ b/lustre/ptlrpc/layout.c @@ -105,6 +105,10 @@ static const struct req_msg_field *quotactl_only[] = { &RMF_PTLRPC_BODY, &RMF_OBD_QUOTACTL }; +static const struct req_msg_field *quota_adjust_qunit_only[] = { + &RMF_PTLRPC_BODY, + &RMF_QUOTA_ADJUST_QUNIT +}; static const struct req_msg_field *qunit_data_only[] = { &RMF_PTLRPC_BODY, @@ -240,7 +244,9 @@ static const struct req_msg_field *mds_last_unlink_server[] = { &RMF_PTLRPC_BODY, &RMF_MDT_BODY, &RMF_MDT_MD, - &RMF_LOGCOOKIES + &RMF_LOGCOOKIES, + &RMF_CAPA1, + &RMF_CAPA2 }; static const struct req_msg_field *mds_reint_setattr_client[] = { @@ -465,7 +471,8 @@ static const struct req_msg_field *ost_body_capa[] = { static const struct req_msg_field *ost_destroy_client[] = { &RMF_PTLRPC_BODY, &RMF_OST_BODY, - &RMF_DLM_REQ + &RMF_DLM_REQ, + &RMF_CAPA1 }; @@ -518,10 +525,10 @@ static const struct req_msg_field *ost_get_fiemap_server[] = { static const struct req_format *req_formats[] = { &RQF_OBD_PING, &RQF_SEC_CTX, - &RQF_SEQ_QUERY, - &RQF_FLD_QUERY, &RQF_MGS_TARGET_REG, &RQF_MGS_SET_INFO, + &RQF_SEQ_QUERY, + &RQF_FLD_QUERY, &RQF_MDS_CONNECT, &RQF_MDS_DISCONNECT, &RQF_MDS_SET_INFO, @@ -552,10 +559,12 @@ static const struct req_format *req_formats[] = { &RQF_MDS_QUOTACHECK, &RQF_MDS_QUOTACTL, &RQF_MDS_QUOTA_DQACQ, + &RQF_QC_CALLBACK, &RQF_OST_CONNECT, &RQF_OST_DISCONNECT, &RQF_OST_QUOTACHECK, &RQF_OST_QUOTACTL, + &RQF_OST_QUOTA_ADJUST_QUNIT, &RQF_OST_GETATTR, &RQF_OST_SETATTR, &RQF_OST_CREATE, @@ -647,7 +656,7 @@ EXPORT_SYMBOL(RMF_SEQ_OPC); const struct req_msg_field RMF_SEQ_RANGE = DEFINE_MSGF("seq_query_range", 0, - sizeof(struct lu_range), lustre_swab_lu_range); + sizeof(struct lu_seq_range), lustre_swab_lu_seq_range); EXPORT_SYMBOL(RMF_SEQ_RANGE); const struct req_msg_field RMF_FLD_OPC = @@ -657,7 +666,7 @@ EXPORT_SYMBOL(RMF_FLD_OPC); const struct req_msg_field RMF_FLD_MDFLD = DEFINE_MSGF("fld_query_mdfld", 0, - sizeof(struct md_fld), lustre_swab_md_fld); + sizeof(struct lu_seq_range), lustre_swab_lu_seq_range); EXPORT_SYMBOL(RMF_FLD_MDFLD); const struct req_msg_field RMF_MDT_BODY = @@ -670,6 +679,12 @@ const struct req_msg_field RMF_OBD_QUOTACTL = sizeof(struct obd_quotactl), lustre_swab_obd_quotactl); EXPORT_SYMBOL(RMF_OBD_QUOTACTL); +const struct req_msg_field RMF_QUOTA_ADJUST_QUNIT = + DEFINE_MSGF("quota_adjust_qunit", 0, + sizeof(struct quota_adjust_qunit), + lustre_swab_quota_adjust_qunit); +EXPORT_SYMBOL(RMF_QUOTA_ADJUST_QUNIT); + const struct req_msg_field RMF_QUNIT_DATA = DEFINE_MSGF("qunit_data", 0, sizeof(struct qunit_data), NULL); @@ -781,7 +796,7 @@ EXPORT_SYMBOL(RMF_REC_JOINFILE); const struct req_msg_field RMF_EADATA = DEFINE_MSGF("eadata", 0, -1, NULL); EXPORT_SYMBOL(RMF_EADATA); -const struct req_msg_field RMF_ACL = +const struct 
req_msg_field RMF_ACL = DEFINE_MSGF("acl", 0, LUSTRE_POSIX_ACL_MAX_SIZE, NULL); EXPORT_SYMBOL(RMF_ACL); @@ -799,7 +814,7 @@ const struct req_msg_field RMF_CAPA2 = lustre_swab_lustre_capa); EXPORT_SYMBOL(RMF_CAPA2); -/* +/* * OST request field. */ const struct req_msg_field RMF_OST_BODY = @@ -863,11 +878,11 @@ DEFINE_REQ_FMT(name, client, ARRAY_SIZE(client), server, ARRAY_SIZE(server)) const struct req_format RQF_OBD_PING = DEFINE_REQ_FMT0("OBD_PING", empty, empty); EXPORT_SYMBOL(RQF_OBD_PING); - + const struct req_format RQF_SEC_CTX = DEFINE_REQ_FMT0("SEC_CTX", empty, empty); EXPORT_SYMBOL(RQF_SEC_CTX); - + const struct req_format RQF_MGS_TARGET_REG = DEFINE_REQ_FMT0("MGS_TARGET_REG", mgs_target_info_only, mgs_target_info_only); @@ -878,6 +893,14 @@ const struct req_format RQF_MGS_SET_INFO = mgs_set_info); EXPORT_SYMBOL(RQF_MGS_SET_INFO); +const struct req_format RQF_SEQ_QUERY = + DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); +EXPORT_SYMBOL(RQF_SEQ_QUERY); + +const struct req_format RQF_FLD_QUERY = + DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); +EXPORT_SYMBOL(RQF_FLD_QUERY); + const struct req_format RQF_LOG_CANCEL = DEFINE_REQ_FMT0("OBD_LOG_CANCEL", log_cancel_client, empty); EXPORT_SYMBOL(RQF_LOG_CANCEL); @@ -898,6 +921,11 @@ const struct req_format RQF_OST_QUOTACTL = DEFINE_REQ_FMT0("OST_QUOTACTL", quotactl_only, quotactl_only); EXPORT_SYMBOL(RQF_OST_QUOTACTL); +const struct req_format RQF_OST_QUOTA_ADJUST_QUNIT = + DEFINE_REQ_FMT0("OST_QUOTA_ADJUST_QUNIT", quota_adjust_qunit_only, + quota_adjust_qunit_only); +EXPORT_SYMBOL(RQF_OST_QUOTA_ADJUST_QUNIT); + const struct req_format RQF_QC_CALLBACK = DEFINE_REQ_FMT0("QC_CALLBACK", quotactl_only, empty); EXPORT_SYMBOL(RQF_QC_CALLBACK); @@ -906,14 +934,6 @@ const struct req_format RQF_MDS_QUOTA_DQACQ = DEFINE_REQ_FMT0("MDS_QUOTA_DQACQ", qunit_data_only, qunit_data_only); EXPORT_SYMBOL(RQF_MDS_QUOTA_DQACQ); -const struct req_format RQF_SEQ_QUERY = - DEFINE_REQ_FMT0("SEQ_QUERY", seq_query_client, seq_query_server); -EXPORT_SYMBOL(RQF_SEQ_QUERY); - -const struct req_format RQF_FLD_QUERY = - DEFINE_REQ_FMT0("FLD_QUERY", fld_query_client, fld_query_server); -EXPORT_SYMBOL(RQF_FLD_QUERY); - const struct req_format RQF_MDS_GETSTATUS = DEFINE_REQ_FMT0("MDS_GETSTATUS", mdt_body_only, mdt_body_capa); EXPORT_SYMBOL(RQF_MDS_GETSTATUS); @@ -1002,11 +1022,11 @@ EXPORT_SYMBOL(RQF_MDS_CONNECT); const struct req_format RQF_MDS_DISCONNECT = DEFINE_REQ_FMT0("MDS_DISCONNECT", empty, empty); EXPORT_SYMBOL(RQF_MDS_DISCONNECT); - + const struct req_format RQF_MDS_SET_INFO = DEFINE_REQ_FMT0("MDS_SET_INFO", mds_set_info_client, empty); EXPORT_SYMBOL(RQF_MDS_SET_INFO); - + const struct req_format RQF_LDLM_ENQUEUE = DEFINE_REQ_FMT0("LDLM_ENQUEUE", ldlm_enqueue_client, ldlm_enqueue_lvb_server); @@ -1305,7 +1325,7 @@ int req_capsule_filled_sizes(struct req_capsule *pill, for (i = 0; i < fmt->rf_fields[loc].nr; ++i) { if (pill->rc_area[loc][i] == -1) { - pill->rc_area[loc][i] = + pill->rc_area[loc][i] = fmt->rf_fields[loc].d[i]->rmf_size; if (pill->rc_area[loc][i] == -1) { /* skip the following fields */ diff --git a/lustre/ptlrpc/lproc_ptlrpc.c b/lustre/ptlrpc/lproc_ptlrpc.c index 519593c..d5af015 100644 --- a/lustre/ptlrpc/lproc_ptlrpc.c +++ b/lustre/ptlrpc/lproc_ptlrpc.c @@ -72,6 +72,7 @@ struct ll_rpc_opcode { { OST_SET_INFO, "ost_set_info" }, { OST_QUOTACHECK, "ost_quotacheck" }, { OST_QUOTACTL, "ost_quotactl" }, + { OST_QUOTA_ADJUST_QUNIT, "ost_quota_adjust_qunit" }, { MDS_GETATTR, "mds_getattr" }, { MDS_GETATTR_NAME, 
"mds_getattr_lock" }, { MDS_CLOSE, "mds_close" }, @@ -106,7 +107,7 @@ struct ll_rpc_opcode { { MGS_SET_INFO, "mgs_set_info" }, { OBD_PING, "obd_ping" }, { OBD_LOG_CANCEL, "llog_origin_handle_cancel" }, - { OBD_QC_CALLBACK, "obd_qc_callback" }, + { OBD_QC_CALLBACK, "obd_quota_callback" }, { LLOG_ORIGIN_HANDLE_CREATE, "llog_origin_handle_create" }, { LLOG_ORIGIN_HANDLE_NEXT_BLOCK, "llog_origin_handle_next_block" }, { LLOG_ORIGIN_HANDLE_READ_HEADER,"llog_origin_handle_read_header" }, @@ -120,7 +121,9 @@ struct ll_rpc_opcode { { SEQ_QUERY, "seq_query" }, { SEC_CTX_INIT, "sec_ctx_init" }, { SEC_CTX_INIT_CONT,"sec_ctx_init_cont" }, - { SEC_CTX_FINI, "sec_ctx_fini" } + { SEC_CTX_FINI, "sec_ctx_fini" }, + { QUOTA_DQACQ, "quota_acquire" }, + { QUOTA_DQREL, "quota_release" } }; struct ll_eopcode { @@ -132,12 +135,13 @@ struct ll_eopcode { { LDLM_EXTENT_ENQUEUE, "ldlm_extent_enqueue" }, { LDLM_FLOCK_ENQUEUE, "ldlm_flock_enqueue" }, { LDLM_IBITS_ENQUEUE, "ldlm_ibits_enqueue" }, + { MDS_REINT_SETATTR, "mds_reint_setattr" }, { MDS_REINT_CREATE, "mds_reint_create" }, { MDS_REINT_LINK, "mds_reint_link" }, - { MDS_REINT_OPEN, "mds_reint_open" }, - { MDS_REINT_SETATTR, "mds_reint_setattr" }, - { MDS_REINT_RENAME, "mds_reint_rename" }, { MDS_REINT_UNLINK, "mds_reint_unlink" }, + { MDS_REINT_RENAME, "mds_reint_rename" }, + { MDS_REINT_OPEN, "mds_reint_open" }, + { MDS_REINT_SETXATTR, "mds_reint_setxattr" }, { BRW_READ_BYTES, "read_bytes" }, { BRW_WRITE_BYTES, "write_bytes" }, }; @@ -145,15 +149,19 @@ struct ll_eopcode { const char *ll_opcode2str(__u32 opcode) { /* When one of the assertions below fail, chances are that: - * 1) A new opcode was added in lustre_idl.h, but was - * is missing from the table above. + * 1) A new opcode was added in include/lustre/lustre_idl.h, + * but is missing from the table above. * or 2) The opcode space was renumbered or rearranged, * and the opcode_offset() function in * ptlrpc_internal.h needs to be modified. 
*/ __u32 offset = opcode_offset(opcode); - LASSERT(offset < LUSTRE_MAX_OPCODES); - LASSERT(ll_rpc_opcode_table[offset].opcode == opcode); + LASSERTF(offset < LUSTRE_MAX_OPCODES, + "offset %u >= LUSTRE_MAX_OPCODES %u\n", + offset, LUSTRE_MAX_OPCODES); + LASSERTF(ll_rpc_opcode_table[offset].opcode == opcode, + "ll_rpc_opcode_table[%u].opcode %u != opcode %u\n", + offset, ll_rpc_opcode_table[offset].opcode, opcode); return ll_rpc_opcode_table[offset].opname; } @@ -506,6 +514,32 @@ static int ptlrpc_lprocfs_rd_timeouts(char *page, char **start, off_t off, return rc; } +static int ptlrpc_lprocfs_rd_hp_ratio(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct ptlrpc_service *svc = data; + int rc = snprintf(page, count, "%d", svc->srv_hpreq_ratio); + return rc; +} + +static int ptlrpc_lprocfs_wr_hp_ratio(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct ptlrpc_service *svc = data; + int rc, val; + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc < 0) + return rc; + if (val < 0) + return -ERANGE; + + spin_lock(&svc->srv_lock); + svc->srv_hpreq_ratio = val; + spin_unlock(&svc->srv_lock); + return count; +} + void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, struct ptlrpc_service *svc) { @@ -521,6 +555,10 @@ void ptlrpc_lprocfs_register_service(struct proc_dir_entry *entry, {.name = "timeouts", .read_fptr = ptlrpc_lprocfs_rd_timeouts, .data = svc}, + {.name = "high_priority_ratio", + .read_fptr = ptlrpc_lprocfs_rd_hp_ratio, + .write_fptr = ptlrpc_lprocfs_wr_hp_ratio, + .data = svc}, {NULL} }; static struct file_operations req_history_fops = { diff --git a/lustre/ptlrpc/niobuf.c b/lustre/ptlrpc/niobuf.c index d79ad5b..786f929 100644 --- a/lustre/ptlrpc/niobuf.c +++ b/lustre/ptlrpc/niobuf.c @@ -95,7 +95,7 @@ static int ptl_send_buf (lnet_handle_md_t *mdh, void *base, int len, RETURN (0); } -int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc) +int ptlrpc_start_bulk_transfer(struct ptlrpc_bulk_desc *desc) { struct ptlrpc_connection *conn = desc->bd_export->exp_connection; int rc; @@ -162,16 +162,16 @@ int ptlrpc_start_bulk_transfer (struct ptlrpc_bulk_desc *desc) RETURN(0); } -void ptlrpc_abort_bulk (struct ptlrpc_bulk_desc *desc) +/* Server side bulk abort. Idempotent. Not thread-safe (i.e. only + * serialises with completion callback) */ +void ptlrpc_abort_bulk(struct ptlrpc_bulk_desc *desc) { - /* Server side bulk abort. Idempotent. Not thread-safe (i.e. 
only - * serialises with completion callback) */ - struct l_wait_info lwi; - int rc; + struct l_wait_info lwi; + int rc; - LASSERT (!in_interrupt ()); /* might sleep */ + LASSERT(!in_interrupt()); /* might sleep */ - if (!ptlrpc_bulk_active(desc)) /* completed or */ + if (!ptlrpc_server_bulk_active(desc)) /* completed or */ return; /* never started */ /* Do not send any meaningful data over the wire for evicted clients */ @@ -183,14 +183,15 @@ * but we must still l_wait_event() in this case, to give liblustre * a chance to run server_bulk_callback()*/ - LNetMDUnlink (desc->bd_md_h); + LNetMDUnlink(desc->bd_md_h); for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT (cfs_time_seconds(300), NULL, NULL); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); rc = l_wait_event(desc->bd_waitq, - !ptlrpc_bulk_active(desc), &lwi); + !ptlrpc_server_bulk_active(desc), &lwi); if (rc == 0) return; @@ -199,7 +200,7 @@ } } -int ptlrpc_register_bulk (struct ptlrpc_request *req) +int ptlrpc_register_bulk(struct ptlrpc_request *req) { struct ptlrpc_bulk_desc *desc = req->rq_bulk; lnet_process_id_t peer; @@ -272,28 +273,44 @@ RETURN(0); } -void ptlrpc_unregister_bulk (struct ptlrpc_request *req) +/* Disconnect a bulk desc from the network. Idempotent. Not + * thread-safe (i.e. only interlocks with completion callback). */ +int ptlrpc_unregister_bulk(struct ptlrpc_request *req, int async) { - /* Disconnect a bulk desc from the network. Idempotent. Not - * thread-safe (i.e. only interlocks with completion callback). */ struct ptlrpc_bulk_desc *desc = req->rq_bulk; cfs_waitq_t *wq; struct l_wait_info lwi; int rc; + ENTRY; + + LASSERT(!in_interrupt()); /* might sleep */ - LASSERT (!in_interrupt ()); /* might sleep */ + /* Let's set up a deadline for bulk unlink. */ + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_LONG_BULK_UNLINK) && + async && req->rq_bulk_deadline == 0) + req->rq_bulk_deadline = cfs_time_current_sec() + LONG_UNLINK; - if (!ptlrpc_bulk_active(desc)) /* completed or */ - return; /* never registered */ + if (!ptlrpc_client_bulk_active(req)) /* completed or */ + RETURN(1); /* never registered */ - LASSERT (desc->bd_req == req); /* bd_req NULL until registered */ + LASSERT(desc->bd_req == req); /* bd_req NULL until registered */ /* the unlink ensures the callback happens ASAP and is the last * one. If it fails, it must be because completion just happened, * but we must still l_wait_event() in this case to give liblustre * a chance to run client_bulk_callback() */ - LNetMDUnlink (desc->bd_md_h); + LNetMDUnlink(desc->bd_md_h); + + if (!ptlrpc_client_bulk_active(req)) /* completed or */ + RETURN(1); /* never registered */ + + /* Move to "Unregistering" phase as bulk was not unlinked yet. */ + ptlrpc_rqphase_move(req, RQ_PHASE_UNREGISTERING); + + /* Do not wait for unlink to finish.
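+ *
+ * Illustrative caller pattern (an assumption, not code from this
+ * patch): an async caller that gets 0 back leaves the request in
+ * RQ_PHASE_UNREGISTERING and calls ptlrpc_unregister_bulk(req, 1)
+ * again on a later pass, until a return of 1 reports that the bulk
+ * md is finally unlinked.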
*/ + if (async) + RETURN(0); if (req->rq_set != NULL) wq = &req->rq_set->set_waitq; @@ -303,15 +320,19 @@ for (;;) { /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT (cfs_time_seconds(300), NULL, NULL); - rc = l_wait_event(*wq, !ptlrpc_bulk_active(desc), &lwi); - if (rc == 0) - return; + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); + rc = l_wait_event(*wq, !ptlrpc_client_bulk_active(req), &lwi); + if (rc == 0) { + ptlrpc_rqphase_move(req, req->rq_next_phase); + RETURN(1); + } - LASSERT (rc == -ETIMEDOUT); - DEBUG_REQ(D_WARNING,req,"Unexpectedly long timeout: desc %p", + LASSERT(rc == -ETIMEDOUT); + DEBUG_REQ(D_WARNING, req, "Unexpectedly long timeout: desc %p", desc); } + RETURN(0); } static void ptlrpc_at_set_reply(struct ptlrpc_request *req, int flags) @@ -322,6 +343,7 @@ if (!(flags & PTLRPC_REPLY_EARLY) && (req->rq_type != PTL_RPC_MSG_ERR) && + (req->rq_reqmsg != NULL) && !(lustre_msg_get_flags(req->rq_reqmsg) & (MSG_RESENT | MSG_REPLAY | MSG_LAST_REPLAY))) { /* early replies, errors and recovery requests don't count @@ -356,7 +378,7 @@ } } -int ptlrpc_send_reply (struct ptlrpc_request *req, int flags) +int ptlrpc_send_reply(struct ptlrpc_request *req, int flags) { struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; struct ptlrpc_reply_state *rs = req->rq_reply_state; @@ -421,7 +443,8 @@ req->rq_sent = cfs_time_current_sec(); rc = ptl_send_buf (&rs->rs_md_h, rs->rs_repbuf, rs->rs_repdata_len, - rs->rs_difficult ? LNET_ACK_REQ : LNET_NOACK_REQ, + (rs->rs_difficult && !rs->rs_no_ack) ? + LNET_ACK_REQ : LNET_NOACK_REQ, &rs->rs_cb_id, conn, svc->srv_rep_portal, req->rq_xid, req->rq_reply_off); out: @@ -484,7 +507,7 @@ int ptl_send_rpc(struct ptlrpc_request *request, int noreply) /* If this is a re-transmit, we're required to have disengaged * cleanly from the previous attempt */ - LASSERT (!request->rq_receiving_reply); + LASSERT(!request->rq_receiving_reply); if (request->rq_import->imp_obd && request->rq_import->imp_obd->obd_fail) { @@ -632,16 +655,16 @@ rc2 = LNetMEUnlink(reply_me_h); LASSERT (rc2 == 0); /* UNLINKED callback called synchronously */ - LASSERT (!request->rq_receiving_reply); + LASSERT(!request->rq_receiving_reply); cleanup_bulk: - if (request->rq_bulk != NULL) - ptlrpc_unregister_bulk(request); - + /* We do a sync unlink here as there was no real transfer, so the + * chance of a long unlink due to a sluggish net is smaller.
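+ * (A hedged aside inferred from the hunks above, not a comment from
+ * the patch: ptlrpc_abort_bulk() keeps the server-side predicate
+ * ptlrpc_server_bulk_active(desc), while client paths now key off
+ * ptlrpc_client_bulk_active(req), which is what allows the async
+ * deadline handling introduced in ptlrpc_unregister_bulk().)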
*/ + ptlrpc_unregister_bulk(request, 0); return rc; } -int ptlrpc_register_rqbd (struct ptlrpc_request_buffer_desc *rqbd) +int ptlrpc_register_rqbd(struct ptlrpc_request_buffer_desc *rqbd) { struct ptlrpc_service *service = rqbd->rqbd_service; static lnet_process_id_t match_id = {LNET_NID_ANY, LNET_PID_ANY}; diff --git a/lustre/ptlrpc/pack_generic.c b/lustre/ptlrpc/pack_generic.c index 42790f1..1cd90f0 100644 --- a/lustre/ptlrpc/pack_generic.c +++ b/lustre/ptlrpc/pack_generic.c @@ -90,7 +90,7 @@ int lustre_msg_check_version(struct lustre_msg *msg, __u32 version) case LUSTRE_MSG_MAGIC_V1: case LUSTRE_MSG_MAGIC_V1_SWABBED: CERROR("msg v1 not supported - please upgrade your system\n"); - return -EINVAL; + return -EINVAL; case LUSTRE_MSG_MAGIC_V2: case LUSTRE_MSG_MAGIC_V2_SWABBED: return lustre_msg_check_version_v2(msg, version); @@ -516,7 +516,7 @@ static int lustre_unpack_msg_v2(struct lustre_msg_v2 *m, int len) len, m->lm_bufcount); return -EINVAL; } - + for (i = 0; i < m->lm_bufcount; i++) { if (flipped) __swab32s(&m->lm_buflens[i]); @@ -733,6 +733,7 @@ void *lustre_swab_buf(struct lustre_msg *msg, int index, int min_size, { void *ptr = NULL; + LASSERT(msg != NULL); switch (msg->lm_magic) { case LUSTRE_MSG_MAGIC_V2: case LUSTRE_MSG_MAGIC_V2_SWABBED: @@ -753,6 +754,9 @@ void *lustre_swab_reqbuf(struct ptlrpc_request *req, int index, int min_size, void *swabber) { + if (lustre_req_swabbed(req, index)) + return lustre_msg_buf(req->rq_reqmsg, index, min_size); + lustre_set_req_swabbed(req, index); return lustre_swab_buf(req->rq_reqmsg, index, min_size, swabber); } @@ -760,6 +764,9 @@ void *lustre_swab_repbuf(struct ptlrpc_request *req, int index, int min_size, void *swabber) { + if (lustre_rep_swabbed(req, index)) + return lustre_msg_buf(req->rq_repmsg, index, min_size); + lustre_set_rep_swabbed(req, index); return lustre_swab_buf(req->rq_repmsg, index, min_size, swabber); } @@ -1744,6 +1751,15 @@ void lustre_swab_obd_quotactl (struct obd_quotactl *q) lustre_swab_obd_dqblk (&q->qc_dqblk); } +void lustre_swab_quota_adjust_qunit (struct quota_adjust_qunit *q) +{ + __swab32s (&q->qaq_flags); + __swab32s (&q->qaq_id); + __swab64s (&q->qaq_bunit_sz); + __swab64s (&q->qaq_iunit_sz); + __swab64s (&q->padding1); +} + void lustre_swab_mds_remote_perm (struct mds_remote_perm *p) { __swab32s (&p->rp_uid); @@ -1892,12 +1908,15 @@ void lustre_swab_mds_rec_rename (struct mds_rec_rename *rn) void lustre_swab_mdt_rec_reint (struct mdt_rec_reint *rr) { __swab32s (&rr->rr_opcode); + __swab32s (&rr->rr_cap); __swab32s (&rr->rr_fsuid); + /* rr_fsuid_h is unused */ __swab32s (&rr->rr_fsgid); - __swab32s (&rr->rr_cap); + /* rr_fsgid_h is unused */ __swab32s (&rr->rr_suppgid1); + /* rr_suppgid1_h is unused */ __swab32s (&rr->rr_suppgid2); - /* handle is opaque */ + /* rr_suppgid2_h is unused */ lustre_swab_lu_fid (&rr->rr_fid1); lustre_swab_lu_fid (&rr->rr_fid2); __swab64s (&rr->rr_mtime); @@ -1923,9 +1942,9 @@ void lustre_swab_lov_desc (struct lov_desc *ld) __swab32s (&ld->ld_tgt_count); __swab32s (&ld->ld_active_tgt_count); __swab32s (&ld->ld_default_stripe_count); + __swab32s (&ld->ld_pattern); __swab64s (&ld->ld_default_stripe_size); __swab64s (&ld->ld_default_stripe_offset); - __swab32s (&ld->ld_pattern); __swab32s (&ld->ld_qos_maxage); /* uuid endian insensitive */ } @@ -1937,12 +1956,6 @@ void lustre_swab_lmv_desc (struct lmv_desc *ld) __swab32s
(&ld->ld_active_tgt_count); /* uuid endian insensitive */ } -/*end adding MDT by huanghua@clusterfs.com*/ -void lustre_swab_md_fld (struct md_fld *mf) -{ - __swab64s(&mf->mf_seq); - __swab64s(&mf->mf_mds); -} static void print_lum (struct lov_user_md *lum) { @@ -2101,54 +2114,92 @@ void lustre_swab_qdata(struct qunit_data *d) __swab32s (&d->qd_id); __swab32s (&d->qd_flags); __swab64s (&d->qd_count); -} - -void lustre_swab_qdata_old(struct qunit_data_old *d) -{ - __swab32s (&d->qd_id); - __swab32s (&d->qd_type); - __swab32s (&d->qd_count); - __swab32s (&d->qd_isblk); + __swab64s (&d->qd_qunit); + __swab64s (&d->padding); } #ifdef __KERNEL__ -struct qunit_data *lustre_quota_old_to_new(struct qunit_data_old *d) + +/** + * got qdata from request(req/rep) + */ +int quota_get_qdata(void *request, struct qunit_data *qdata, + int is_req, int is_exp) { - struct qunit_data_old tmp; - struct qunit_data *ret; - ENTRY; + struct ptlrpc_request *req = (struct ptlrpc_request *)request; + struct qunit_data *new; + __u64 flags = is_exp ? req->rq_export->exp_connect_flags : + req->rq_import->imp_connect_data.ocd_connect_flags; + int rc = 0; - if (!d) - return NULL; + LASSERT(req); + LASSERT(qdata); - tmp = *d; - ret = (struct qunit_data *)d; - ret->qd_id = tmp.qd_id; - ret->qd_flags = (tmp.qd_type ? QUOTA_IS_GRP : 0) | (tmp.qd_isblk ? QUOTA_IS_BLOCK : 0); - ret->qd_count = tmp.qd_count; - RETURN(ret); + /* support for quota64 and change_qs */ + if (flags & OBD_CONNECT_CHANGE_QS) { + if (!(flags & OBD_CONNECT_QUOTA64)) { + CDEBUG(D_ERROR, "Wire protocol for qunit is broken!\n"); + return -EINVAL; + } + if (is_req == QUOTA_REQUEST) + new = lustre_swab_reqbuf(req, REQ_REC_OFF, + sizeof(struct qunit_data), + lustre_swab_qdata); + else + new = lustre_swab_repbuf(req, REPLY_REC_OFF, + sizeof(struct qunit_data), + lustre_swab_qdata); + if (new == NULL) + GOTO(out, rc = -EPROTO); + *qdata = *new; + QDATA_SET_CHANGE_QS(qdata); + return 0; + } else { + QDATA_CLR_CHANGE_QS(qdata); + } +out: + return rc; } -EXPORT_SYMBOL(lustre_quota_old_to_new); +EXPORT_SYMBOL(quota_get_qdata); -struct qunit_data_old *lustre_quota_new_to_old(struct qunit_data *d) +/** + * copy qdata to request(req/rep) + */ +int quota_copy_qdata(void *request, struct qunit_data *qdata, + int is_req, int is_exp) { - struct qunit_data tmp; - struct qunit_data_old *ret; - ENTRY; + struct ptlrpc_request *req = (struct ptlrpc_request *)request; + void *target; + __u64 flags = is_exp ? req->rq_export->exp_connect_flags : + req->rq_import->imp_connect_data.ocd_connect_flags; + int rc = 0; - if (!d) - return NULL; + LASSERT(req); + LASSERT(qdata); + + /* support for quota64 and change_qs */ + if (flags & OBD_CONNECT_CHANGE_QS) { + if (!(flags & OBD_CONNECT_QUOTA64)) { + CERROR("Wire protocol for qunit is broken!\n"); + return -EINVAL; + } + if (is_req == QUOTA_REQUEST) + target = lustre_msg_buf(req->rq_reqmsg, REQ_REC_OFF, + sizeof(struct qunit_data)); + else + target = lustre_msg_buf(req->rq_repmsg, REPLY_REC_OFF, + sizeof(struct qunit_data)); + if (!target) + GOTO(out, rc = -EPROTO); + memcpy(target, qdata, sizeof(*qdata)); + return 0; + } - tmp = *d; - ret = (struct qunit_data_old *)d; - ret->qd_id = tmp.qd_id; - ret->qd_type = ((tmp.qd_flags & QUOTA_IS_GRP) ? GRPQUOTA : USRQUOTA); - ret->qd_count = (__u32)tmp.qd_count; - ret->qd_isblk = ((tmp.qd_flags & QUOTA_IS_BLOCK) ? 
1 : 0); - RETURN(ret); +out: + return rc; } -EXPORT_SYMBOL(lustre_quota_new_to_old); +EXPORT_SYMBOL(quota_copy_qdata); #endif /* __KERNEL__ */ static inline int req_ptlrpc_body_swabbed(struct ptlrpc_request *req) @@ -2203,7 +2254,7 @@ void _debug_req(struct ptlrpc_request *req, __u32 mask, (char *)req->rq_export->exp_connection->c_remote_uuid.uuid : "", req->rq_request_portal, req->rq_reply_portal, req->rq_reqlen, req->rq_replen, - req->rq_early_count, req->rq_timeout, req->rq_deadline, + req->rq_early_count, !!req->rq_timeout, req->rq_deadline, atomic_read(&req->rq_refcount), DEBUG_REQ_FLAGS(req), req->rq_reqmsg && req_ptlrpc_body_swabbed(req) ? lustre_msg_get_flags(req->rq_reqmsg) : -1, @@ -2219,11 +2270,12 @@ void lustre_swab_lustre_capa(struct lustre_capa *c) { lustre_swab_lu_fid(&c->lc_fid); __swab64s (&c->lc_opc); - __swab32s (&c->lc_uid); + __swab64s (&c->lc_uid); + __swab64s (&c->lc_gid); __swab32s (&c->lc_flags); __swab32s (&c->lc_keyid); __swab32s (&c->lc_timeout); - __swab64s (&c->lc_expiry); + __swab32s (&c->lc_expiry); } void lustre_swab_lustre_capa_key (struct lustre_capa_key *k) diff --git a/lustre/ptlrpc/pinger.c b/lustre/ptlrpc/pinger.c index 0da3caa..a9e4aba 100644 --- a/lustre/ptlrpc/pinger.c +++ b/lustre/ptlrpc/pinger.c @@ -69,7 +69,7 @@ int ptlrpc_ping(struct obd_import *imp) imp->imp_obd->obd_uuid.uuid, obd2cli_tgt(imp->imp_obd)); req->rq_no_resend = req->rq_no_delay = 1; ptlrpc_request_set_replen(req); - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); RETURN(0); } @@ -550,6 +550,7 @@ static int pinger_check_rpcs(void *arg) struct ptlrpc_request *req; struct ptlrpc_request_set *set; struct list_head *iter; + struct obd_import *imp; struct pinger_data *pd = &pinger_args; int rc; @@ -615,7 +616,7 @@ static int pinger_check_rpcs(void *arg) req->rq_no_resend = 1; ptlrpc_request_set_replen(req); req->rq_send_state = LUSTRE_IMP_FULL; - req->rq_phase = RQ_PHASE_RPC; + ptlrpc_rqphase_move(req, RQ_PHASE_RPC); req->rq_import_generation = generation; ptlrpc_set_add_req(set, req); } else { @@ -661,17 +662,23 @@ do_check_set: if (req->rq_phase == RQ_PHASE_COMPLETE) continue; - req->rq_phase = RQ_PHASE_COMPLETE; - atomic_dec(&req->rq_import->imp_inflight); - set->set_remaining--; - /* If it was disconnected, don't sweat it. */ - if (list_empty(&req->rq_import->imp_pinger_chain)) { - ptlrpc_unregister_reply(req); - continue; - } + CDEBUG(D_RPCTRACE, "Pinger initiate expire request(%p)\n", + req); - CDEBUG(D_RPCTRACE, "pinger initiate expire_one_request\n"); - ptlrpc_expire_one_request(req); + /* This will also unregister reply. */ + ptlrpc_expire_one_request(req, 0); + + /* We're done with this req, let's finally move it to complete + * phase and take care of inflights. 
*/ + ptlrpc_rqphase_move(req, RQ_PHASE_COMPLETE); + imp = req->rq_import; + spin_lock(&imp->imp_lock); + if (!list_empty(&req->rq_list)) { + list_del_init(&req->rq_list); + atomic_dec(&imp->imp_inflight); + } + spin_unlock(&imp->imp_lock); + set->set_remaining--; } mutex_up(&pinger_sem); diff --git a/lustre/ptlrpc/ptlrpc_internal.h b/lustre/ptlrpc/ptlrpc_internal.h index bdea8bd..ea6704a 100644 --- a/lustre/ptlrpc/ptlrpc_internal.h +++ b/lustre/ptlrpc/ptlrpc_internal.h @@ -81,7 +81,7 @@ void ptlrpc_lprocfs_do_request_stat (struct ptlrpc_request *req, /* recovd_thread.c */ -int ptlrpc_expire_one_request(struct ptlrpc_request *req); +int ptlrpc_expire_one_request(struct ptlrpc_request *req, int async_unlink); /* pers.c */ void ptlrpc_fill_bulk_md(lnet_md_t *md, struct ptlrpc_bulk_desc *desc); @@ -124,8 +124,17 @@ int sptlrpc_lproc_init(void); void sptlrpc_lproc_fini(void); /* sec_gc.c */ -int sptlrpc_gc_start_thread(void); -void sptlrpc_gc_stop_thread(void); +int sptlrpc_gc_init(void); +void sptlrpc_gc_fini(void); + +/* sec_config.c */ +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf); +int sptlrpc_conf_init(void); +void sptlrpc_conf_fini(void); /* sec.c */ int __init sptlrpc_init(void); diff --git a/lustre/ptlrpc/ptlrpc_module.c b/lustre/ptlrpc/ptlrpc_module.c index 76e0727..363f399 100644 --- a/lustre/ptlrpc/ptlrpc_module.c +++ b/lustre/ptlrpc/ptlrpc_module.c @@ -211,6 +211,7 @@ EXPORT_SYMBOL(ptlrpc_start_thread); EXPORT_SYMBOL(ptlrpc_unregister_service); EXPORT_SYMBOL(ptlrpc_daemonize); EXPORT_SYMBOL(ptlrpc_service_health_check); +EXPORT_SYMBOL(ptlrpc_hpreq_reorder); /* pack_generic.c */ EXPORT_SYMBOL(lustre_msg_swabbed); @@ -264,7 +265,7 @@ EXPORT_SYMBOL(lustre_swab_ldlm_lock_desc); EXPORT_SYMBOL(lustre_swab_ldlm_request); EXPORT_SYMBOL(lustre_swab_ldlm_reply); EXPORT_SYMBOL(lustre_swab_qdata); -EXPORT_SYMBOL(lustre_swab_qdata_old); +EXPORT_SYMBOL(lustre_swab_quota_adjust_qunit); EXPORT_SYMBOL(lustre_msg_get_flags); EXPORT_SYMBOL(lustre_msg_add_flags); EXPORT_SYMBOL(lustre_msg_set_flags); @@ -297,7 +298,6 @@ EXPORT_SYMBOL(lustre_msg_set_transno); EXPORT_SYMBOL(lustre_msg_set_status); EXPORT_SYMBOL(lustre_msg_set_conn_cnt); EXPORT_SYMBOL(lustre_swab_mgs_target_info); -EXPORT_SYMBOL(lustre_swab_md_fld); EXPORT_SYMBOL(lustre_swab_generic_32s); EXPORT_SYMBOL(lustre_swab_lustre_capa); EXPORT_SYMBOL(lustre_swab_lustre_capa_key); diff --git a/lustre/ptlrpc/ptlrpcd.c b/lustre/ptlrpc/ptlrpcd.c index 934e1e7..a52e62b 100644 --- a/lustre/ptlrpc/ptlrpcd.c +++ b/lustre/ptlrpc/ptlrpcd.c @@ -51,10 +51,44 @@ #include #include /* for obd_zombie */ #include /* for OBD_FAIL_CHECK */ +#include /* cl_env_{get,put}() */ #include -static struct ptlrpcd_ctl ptlrpcd_pc; -static struct ptlrpcd_ctl ptlrpcd_recovery_pc; +enum pscope_thread { + PT_NORMAL, + PT_RECOVERY, + PT_NR +}; + +struct ptlrpcd_scope_ctl { + struct ptlrpcd_thread { + const char *pt_name; + struct ptlrpcd_ctl pt_ctl; + } pscope_thread[PT_NR]; +}; + +static struct ptlrpcd_scope_ctl ptlrpcd_scopes[PSCOPE_NR] = { + [PSCOPE_BRW] = { + .pscope_thread = { + [PT_NORMAL] = { + .pt_name = "ptlrpcd-brw" + }, + [PT_RECOVERY] = { + .pt_name = "ptlrpcd-brw-rcv" + } + } + }, + [PSCOPE_OTHER] = { + .pscope_thread = { + [PT_NORMAL] = { + .pt_name = "ptlrpcd" + }, + [PT_RECOVERY] = { + .pt_name = "ptlrpcd-rcv" + } + } + } +}; struct semaphore ptlrpcd_sem; static int ptlrpcd_users = 0; @@ -68,24 +102,26 @@ void ptlrpcd_wake(struct ptlrpc_request *req) 
cfs_waitq_signal(&rq_set->set_waitq); } -/* +/* * Requests that are added to the ptlrpcd queue are sent via * ptlrpcd_check->ptlrpc_check_set(). */ -void ptlrpcd_add_req(struct ptlrpc_request *req) +void ptlrpcd_add_req(struct ptlrpc_request *req, enum ptlrpcd_scope scope) { struct ptlrpcd_ctl *pc; + enum pscope_thread pt; int rc; - if (req->rq_send_state == LUSTRE_IMP_FULL) - pc = &ptlrpcd_pc; - else - pc = &ptlrpcd_recovery_pc; - + LASSERT(scope < PSCOPE_NR); + pt = req->rq_send_state == LUSTRE_IMP_FULL ? PT_NORMAL : PT_RECOVERY; + pc = &ptlrpcd_scopes[scope].pscope_thread[pt].pt_ctl; rc = ptlrpc_set_add_new_req(pc, req); - if (rc) { + /* + * XXX disable this for CLIO: environment is needed for interpreter. + */ + if (rc && 0) { ptlrpc_interpterer_t interpreter; - + interpreter = req->rq_interpret_reply; /* @@ -110,16 +146,13 @@ static int ptlrpcd_check(const struct lu_env *env, struct ptlrpcd_ctl *pc) int rc = 0; ENTRY; - if (test_bit(LIOD_STOP, &pc->pc_flags)) - RETURN(1); - spin_lock(&pc->pc_set->set_new_req_lock); list_for_each_safe(pos, tmp, &pc->pc_set->set_new_requests) { req = list_entry(pos, struct ptlrpc_request, rq_set_chain); list_del_init(&req->rq_set_chain); ptlrpc_set_add_req(pc->pc_set, req); - /* - * Need to calculate its timeout. + /* + * Need to calculate its timeout. */ rc = 1; } @@ -128,9 +161,9 @@ if (pc->pc_set->set_remaining) { rc = rc | ptlrpc_check_set(env, pc->pc_set); - /* + /* * XXX: our set never completes, so we prune the completed - * reqs after each iteration. boy could this be smarter. + * reqs after each iteration. boy could this be smarter. */ list_for_each_safe(pos, tmp, &pc->pc_set->set_requests) { req = list_entry(pos, struct ptlrpc_request, @@ -145,8 +178,8 @@ } if (rc == 0) { - /* - * If new requests have been added, make sure to wake up. + /* + * If new requests have been added, make sure to wake up. */ spin_lock(&pc->pc_set->set_new_req_lock); rc = !list_empty(&pc->pc_set->set_new_requests); @@ -157,7 +190,7 @@ } #ifdef __KERNEL__ -/* +/* * ptlrpc's code paths like to execute in process context, so we have this * thread which spins on a set which contains the io rpcs. llite specifies * ptlrpcd's set when it pushes pages down into the oscs. @@ -165,54 +198,88 @@ static int ptlrpcd(void *arg) { struct ptlrpcd_ctl *pc = arg; - int rc; + struct lu_env env = { .le_ses = NULL }; + int rc, exit = 0; ENTRY; - if ((rc = cfs_daemonize_ctxt(pc->pc_name))) { - complete(&pc->pc_starting); - goto out; + rc = cfs_daemonize_ctxt(pc->pc_name); + if (rc == 0) { + /* + * XXX So far only "client" ptlrpcd uses an environment. In + * the future, a ptlrpcd thread (or a thread-set) has to be + * given an argument, describing its "scope". + */ + rc = lu_context_init(&env.le_ctx, + LCT_CL_THREAD|LCT_REMEMBER|LCT_NOREF); } complete(&pc->pc_starting); - /* + if (rc != 0) + RETURN(rc); + env.le_ctx.lc_cookie = 0x7; + + /* * This mainloop strongly resembles ptlrpc_set_wait() except that our * set never completes. ptlrpcd_check() calls ptlrpc_check_set() when - * there are requests in the set. New requests come in on the set's - * new_req_list and ptlrpcd_check() moves them into the set. + * there are requests in the set.
New requests come in on the set's + * new_req_list and ptlrpcd_check() moves them into the set. */ - while (1) { + do { struct l_wait_info lwi; - cfs_duration_t timeout; + int timeout; + + rc = lu_env_refill(&env); + if (rc != 0) { + /* + * XXX This is very awkward situation, because + * execution can neither continue (request + * interpreters assume that env is set up), nor repeat + * the loop (as this potentially results in a tight + * loop of -ENOMEM's). + * + * Fortunately, refill only ever does something when + * new modules are loaded, i.e., early during boot up. + */ + CERROR("Failure to refill session: %d\n", rc); + continue; + } - timeout = cfs_time_seconds(ptlrpc_set_next_timeout(pc->pc_set)); - lwi = LWI_TIMEOUT(timeout, ptlrpc_expired_set, pc->pc_set); + timeout = ptlrpc_set_next_timeout(pc->pc_set); + lwi = LWI_TIMEOUT(cfs_time_seconds(timeout ? timeout : 1), + ptlrpc_expired_set, pc->pc_set); - lu_context_enter(&pc->pc_env.le_ctx); + lu_context_enter(&env.le_ctx); l_wait_event(pc->pc_set->set_waitq, - ptlrpcd_check(&pc->pc_env, pc), &lwi); - lu_context_exit(&pc->pc_env.le_ctx); + ptlrpcd_check(&env, pc), &lwi); + lu_context_exit(&env.le_ctx); /* * Abort inflight rpcs for forced stop case. */ - if (test_bit(LIOD_STOP_FORCE, &pc->pc_flags)) - ptlrpc_abort_set(pc->pc_set); + if (test_bit(LIOD_STOP, &pc->pc_flags)) { + if (test_bit(LIOD_FORCE, &pc->pc_flags)) + ptlrpc_abort_set(pc->pc_set); + exit++; + } - if (test_bit(LIOD_STOP, &pc->pc_flags)) - break; - } + /* + * Let's make one more loop to make sure that ptlrpcd_check() + * copied all raced new rpcs into the set so we can kill them. + */ + } while (exit < 2); - /* - * Wait for inflight requests to drain. + /* + * Wait for inflight requests to drain. */ if (!list_empty(&pc->pc_set->set_requests)) ptlrpc_set_wait(pc->pc_set); - + lu_context_fini(&env.le_ctx); complete(&pc->pc_finishing); -out: + clear_bit(LIOD_START, &pc->pc_flags); clear_bit(LIOD_STOP, &pc->pc_flags); + clear_bit(LIOD_FORCE, &pc->pc_flags); return 0; } @@ -221,10 +288,10 @@ out: int ptlrpcd_check_async_rpcs(void *arg) { struct ptlrpcd_ctl *pc = arg; - int rc = 0; + int rc = 0; - /* - * Single threaded!! + /* + * Single threaded!! */ pc->pc_recurred++; @@ -234,10 +301,10 @@ int ptlrpcd_check_async_rpcs(void *arg) lu_context_exit(&pc->pc_env.le_ctx); if (!rc) ptlrpc_expired_set(pc->pc_set); - /* - * XXX: send replay requests. + /* + * XXX: send replay requests. */ - if (pc == &ptlrpcd_recovery_pc) + if (test_bit(LIOD_RECOVERY, &pc->pc_flags)) rc = ptlrpcd_check(&pc->pc_env, pc); } @@ -255,13 +322,13 @@ int ptlrpcd_idle(void *arg) #endif -int ptlrpcd_start(char *name, struct ptlrpcd_ctl *pc) +int ptlrpcd_start(const char *name, struct ptlrpcd_ctl *pc) { int rc; ENTRY; - - /* - * Do not allow start second thread for one pc. + + /* + * Do not allow start second thread for one pc. 
*/ if (test_and_set_bit(LIOD_START, &pc->pc_flags)) { CERROR("Starting second thread (%s) for same pc %p\n", @@ -319,7 +386,7 @@ void ptlrpcd_stop(struct ptlrpcd_ctl *pc, int force) set_bit(LIOD_STOP, &pc->pc_flags); if (force) - set_bit(LIOD_STOP_FORCE, &pc->pc_flags); + set_bit(LIOD_FORCE, &pc->pc_flags); cfs_waitq_signal(&pc->pc_set->set_waitq); #ifdef __KERNEL__ wait_for_completion(&pc->pc_finishing); @@ -331,28 +398,52 @@ ptlrpc_set_destroy(pc->pc_set); } -int ptlrpcd_addref(void) +void ptlrpcd_fini(void) { - int rc = 0; + int i; + int j; + ENTRY; - mutex_down(&ptlrpcd_sem); - if (++ptlrpcd_users != 1) - GOTO(out, rc); + for (i = 0; i < PSCOPE_NR; ++i) { + for (j = 0; j < PT_NR; ++j) { + struct ptlrpcd_ctl *pc; - rc = ptlrpcd_start("ptlrpcd", &ptlrpcd_pc); - if (rc) { - --ptlrpcd_users; - GOTO(out, rc); + pc = &ptlrpcd_scopes[i].pscope_thread[j].pt_ctl; + + if (test_bit(LIOD_START, &pc->pc_flags)) + ptlrpcd_stop(pc, 0); + } } + EXIT; +} - rc = ptlrpcd_start("ptlrpcd-recov", &ptlrpcd_recovery_pc); - if (rc) { - ptlrpcd_stop(&ptlrpcd_pc, 0); - --ptlrpcd_users; - GOTO(out, rc); +int ptlrpcd_addref(void) +{ + int rc = 0; + int i; + int j; + ENTRY; + + mutex_down(&ptlrpcd_sem); + if (++ptlrpcd_users == 1) { + for (i = 0; rc == 0 && i < PSCOPE_NR; ++i) { + for (j = 0; rc == 0 && j < PT_NR; ++j) { + struct ptlrpcd_thread *pt; + struct ptlrpcd_ctl *pc; + + pt = &ptlrpcd_scopes[i].pscope_thread[j]; + pc = &pt->pt_ctl; + if (j == PT_RECOVERY) + set_bit(LIOD_RECOVERY, &pc->pc_flags); + rc = ptlrpcd_start(pt->pt_name, pc); + } + } + if (rc != 0) { + --ptlrpcd_users; + ptlrpcd_fini(); + } } -out: mutex_up(&ptlrpcd_sem); RETURN(rc); } @@ -360,9 +451,7 @@ out: void ptlrpcd_decref(void) { mutex_down(&ptlrpcd_sem); - if (--ptlrpcd_users == 0) { - ptlrpcd_stop(&ptlrpcd_pc, 0); - ptlrpcd_stop(&ptlrpcd_recovery_pc, 0); - } + if (--ptlrpcd_users == 0) + ptlrpcd_fini(); mutex_up(&ptlrpcd_sem); } diff --git a/lustre/ptlrpc/recov_thread.c b/lustre/ptlrpc/recov_thread.c index 01918db..69aec83 100644 --- a/lustre/ptlrpc/recov_thread.c +++ b/lustre/ptlrpc/recov_thread.c @@ -76,28 +76,52 @@ enum { LLOG_LCM_FL_EXIT = 1 << 1 }; -/** +static void llcd_print(struct llog_canceld_ctxt *llcd, + const char *func, int line) +{ + CDEBUG(D_RPCTRACE, "Llcd (%p) at %s:%d:\n", llcd, func, line); + CDEBUG(D_RPCTRACE, " size: %d\n", llcd->llcd_size); + CDEBUG(D_RPCTRACE, " ctxt: %p\n", llcd->llcd_ctxt); + CDEBUG(D_RPCTRACE, " lcm : %p\n", llcd->llcd_lcm); + CDEBUG(D_RPCTRACE, " cookiebytes : %d\n", llcd->llcd_cookiebytes); +} + +/** * Allocate new llcd from cache, init it and return to caller. * Bumps number of objects allocated. */ -static struct llog_canceld_ctxt *llcd_alloc(void) +static struct llog_canceld_ctxt *llcd_alloc(struct llog_commit_master *lcm) { struct llog_canceld_ctxt *llcd; - int llcd_size; + int size, overhead; - /* - * Payload of lustre_msg V2 is bigger. + LASSERT(lcm != NULL); + + /* + * We want to send one page of cookies with the rpc header. This buffer + * will be assigned later to the rpc; this is why we preserve the + * space for the rpc header.
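+ *
+ * Worked sizing example (illustrative; assumes 4K CFS_PAGE_SIZE and
+ * a 32-byte struct llog_cookie): size is 4096 minus the lustre_msg
+ * V2 header, so a full llcd carries on the order of a hundred
+ * cookies and, together with the rpc header, still fits in a single
+ * page.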
*/ - llcd_size = CFS_PAGE_SIZE - - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL); - llcd_size += offsetof(struct llog_canceld_ctxt, llcd_cookies); - OBD_SLAB_ALLOC(llcd, llcd_cache, CFS_ALLOC_STD, llcd_size); + size = CFS_PAGE_SIZE - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL); + overhead = offsetof(struct llog_canceld_ctxt, llcd_cookies); + OBD_SLAB_ALLOC(llcd, llcd_cache, CFS_ALLOC_STD, size + overhead); if (!llcd) return NULL; - llcd->llcd_size = llcd_size; + CFS_INIT_LIST_HEAD(&llcd->llcd_list); llcd->llcd_cookiebytes = 0; + llcd->llcd_size = size; + + spin_lock(&lcm->lcm_lock); + llcd->llcd_lcm = lcm; + atomic_inc(&lcm->lcm_count); + list_add_tail(&llcd->llcd_list, &lcm->lcm_llcds); + spin_unlock(&lcm->lcm_lock); atomic_inc(&llcd_count); + + CDEBUG(D_RPCTRACE, "Alloc llcd %p on lcm %p (%d)\n", + llcd, lcm, atomic_read(&lcm->lcm_count)); + return llcd; } @@ -106,41 +130,53 @@ static struct llog_canceld_ctxt *llcd_alloc(void) */ static void llcd_free(struct llog_canceld_ctxt *llcd) { + struct llog_commit_master *lcm = llcd->llcd_lcm; + int size; + + if (lcm) { + if (atomic_read(&lcm->lcm_count) == 0) { + CERROR("Invalid llcd free %p\n", llcd); + llcd_print(llcd, __FUNCTION__, __LINE__); + LBUG(); + } + spin_lock(&lcm->lcm_lock); + LASSERT(!list_empty(&llcd->llcd_list)); + list_del_init(&llcd->llcd_list); + atomic_dec(&lcm->lcm_count); + spin_unlock(&lcm->lcm_lock); + + CDEBUG(D_RPCTRACE, "Free llcd %p on lcm %p (%d)\n", + llcd, lcm, atomic_read(&lcm->lcm_count)); + } + LASSERT(atomic_read(&llcd_count) > 0); - OBD_SLAB_FREE(llcd, llcd_cache, llcd->llcd_size); atomic_dec(&llcd_count); -} -/** - * Copy passed @cookies to @llcd. - */ -static void llcd_copy(struct llog_canceld_ctxt *llcd, - struct llog_cookie *cookies) -{ - memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes, - cookies, sizeof(*cookies)); - llcd->llcd_cookiebytes += sizeof(*cookies); + size = offsetof(struct llog_canceld_ctxt, llcd_cookies) + + llcd->llcd_size; + OBD_SLAB_FREE(llcd, llcd_cache, size); } /** * Checks if passed cookie fits into llcd free space buffer. Returns * 1 if yes and 0 otherwise. */ -static int llcd_fit(struct llog_canceld_ctxt *llcd, - struct llog_cookie *cookies) +static inline int +llcd_fit(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies) { - return (llcd->llcd_size - - llcd->llcd_cookiebytes) >= sizeof(*cookies); + return (llcd->llcd_size - llcd->llcd_cookiebytes >= sizeof(*cookies)); } -static void llcd_print(struct llog_canceld_ctxt *llcd, - const char *func, int line) +/** + * Copy passed @cookies to @llcd. + */ +static inline void +llcd_copy(struct llog_canceld_ctxt *llcd, struct llog_cookie *cookies) { - CDEBUG(D_RPCTRACE, "Llcd (%p) at %s:%d:\n", llcd, func, line); - CDEBUG(D_RPCTRACE, " size: %d\n", llcd->llcd_size); - CDEBUG(D_RPCTRACE, " ctxt: %p\n", llcd->llcd_ctxt); - CDEBUG(D_RPCTRACE, " lcm : %p\n", llcd->llcd_lcm); - CDEBUG(D_RPCTRACE, " cookiebytes : %d\n", llcd->llcd_cookiebytes); + LASSERT(llcd_fit(llcd, cookies)); + memcpy((char *)llcd->llcd_cookies + llcd->llcd_cookiebytes, + cookies, sizeof(*cookies)); + llcd->llcd_cookiebytes += sizeof(*cookies); } /** @@ -148,19 +184,19 @@ static void llcd_print(struct llog_canceld_ctxt *llcd, * sending result. Error is passed in @rc. Note, that this will be called * in cleanup time when all inflight rpcs aborted. 
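/*
 * A minimal sketch of the llcd_fit()/llcd_copy() contract (an
 * inference from the LASSERT added above; the real caller is
 * llog_obd_repl_cancel() below). Check the fit first, push and
 * reallocate when full, and only then copy:
 *
 *	if (!llcd_fit(llcd, cookies)) {
 *		rc = llcd_push(ctxt);
 *		if (rc == 0)
 *			llcd = llcd_get(ctxt);
 *	}
 *	if (llcd != NULL)
 *		llcd_copy(llcd, cookies);
 *
 * since llcd_copy() now asserts llcd_fit() instead of silently
 * overflowing the cookie buffer.
 */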
-static int +llcd_interpret(const struct lu_env *env, struct ptlrpc_request *req, void *noused, int rc) { struct llog_canceld_ctxt *llcd = req->rq_async_args.pointer_arg[0]; - CDEBUG(D_RPCTRACE, "Sent llcd %p (%d)\n", llcd, rc); + CDEBUG(D_RPCTRACE, "Sent llcd %p (%d) - killing it\n", llcd, rc); llcd_free(llcd); return 0; } - + /** * Send @llcd to remote node. Free llcd upon completion or error. Sending - is performed in async style so this function will return asap without + is performed in async style so this function will return asap without blocking. */ static int llcd_send(struct llog_canceld_ctxt *llcd) @@ -175,7 +211,7 @@ ctxt = llcd->llcd_ctxt; if (!ctxt) { - CERROR("Invalid llcd with NULL ctxt found (%p)\n", + CERROR("Invalid llcd with NULL ctxt found (%p)\n", llcd); llcd_print(llcd, __FUNCTION__, __LINE__); LBUG(); @@ -186,10 +222,10 @@ GOTO(exit, rc = 0); lcm = llcd->llcd_lcm; - - /* + + /* * Check if we're in exit stage. Do not send llcd in - * this case. + * this case. */ if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags)) GOTO(exit, rc = -ENODEV); @@ -197,9 +233,9 @@ CDEBUG(D_RPCTRACE, "Sending llcd %p\n", llcd); import = llcd->llcd_ctxt->loc_imp; - if (!import || (import == LP_POISON) || + if (!import || (import == LP_POISON) || (import->imp_client == LP_POISON)) { - CERROR("Invalid import %p for llcd %p\n", + CERROR("Invalid import %p for llcd %p\n", import, llcd); GOTO(exit, rc = -ENODEV); } @@ -207,12 +243,12 @@ OBD_FAIL_TIMEOUT(OBD_FAIL_PTLRPC_DELAY_RECOV, 10); /* - * No need to get import here as it is already done in + * No need to get import here as it is already done in * llog_receptor_accept().
*/ req = ptlrpc_request_alloc(import, &RQF_LOG_CANCEL); if (req == NULL) { - CERROR("Can't allocate request for sending llcd %p\n", + CERROR("Can't allocate request for sending llcd %p\n", llcd); GOTO(exit, rc = -ENOMEM); } @@ -253,18 +289,15 @@ exit: static int llcd_attach(struct llog_ctxt *ctxt, struct llog_canceld_ctxt *llcd) { - struct llog_commit_master *lcm; - LASSERT(ctxt != NULL && llcd != NULL); LASSERT_SEM_LOCKED(&ctxt->loc_sem); LASSERT(ctxt->loc_llcd == NULL); - lcm = ctxt->loc_lcm; - atomic_inc(&lcm->lcm_count); - CDEBUG(D_RPCTRACE, "Attach llcd %p to ctxt %p (%d)\n", - llcd, ctxt, atomic_read(&lcm->lcm_count)); llcd->llcd_ctxt = llog_ctxt_get(ctxt); - llcd->llcd_lcm = ctxt->loc_lcm; ctxt->loc_llcd = llcd; + + CDEBUG(D_RPCTRACE, "Attach llcd %p to ctxt %p\n", + llcd, ctxt); + return 0; } @@ -274,7 +307,6 @@ llcd_attach(struct llog_ctxt *ctxt, struct llog_canceld_ctxt *llcd) */ static struct llog_canceld_ctxt *llcd_detach(struct llog_ctxt *ctxt) { - struct llog_commit_master *lcm; struct llog_canceld_ctxt *llcd; LASSERT(ctxt != NULL); @@ -284,18 +316,10 @@ static struct llog_canceld_ctxt *llcd_detach(struct llog_ctxt *ctxt) if (!llcd) return NULL; - lcm = ctxt->loc_lcm; - if (atomic_read(&lcm->lcm_count) == 0) { - CERROR("Invalid detach occured %p:%p\n", ctxt, llcd); - llcd_print(llcd, __FUNCTION__, __LINE__); - LBUG(); - } - atomic_dec(&lcm->lcm_count); - ctxt->loc_llcd = NULL; - - CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p (%d)\n", - llcd, ctxt, atomic_read(&lcm->lcm_count)); + CDEBUG(D_RPCTRACE, "Detach llcd %p from ctxt %p\n", + llcd, ctxt); + ctxt->loc_llcd = NULL; llog_ctxt_put(ctxt); return llcd; } @@ -308,9 +332,9 @@ static struct llog_canceld_ctxt *llcd_get(struct llog_ctxt *ctxt) { struct llog_canceld_ctxt *llcd; - llcd = llcd_alloc(); + llcd = llcd_alloc(ctxt->loc_lcm); if (!llcd) { - CERROR("Couldn't alloc an llcd for ctxt %p\n", ctxt); + CERROR("Can't alloc an llcd for ctxt %p\n", ctxt); return NULL; } llcd_attach(ctxt, llcd); @@ -322,16 +346,11 @@ static struct llog_canceld_ctxt *llcd_get(struct llog_ctxt *ctxt) */ static void llcd_put(struct llog_ctxt *ctxt) { - struct llog_commit_master *lcm; struct llog_canceld_ctxt *llcd; - lcm = ctxt->loc_lcm; llcd = llcd_detach(ctxt); if (llcd) llcd_free(llcd); - - if (atomic_read(&lcm->lcm_count) == 0) - cfs_waitq_signal(&lcm->lcm_waitq); } /** @@ -344,7 +363,7 @@ static int llcd_push(struct llog_ctxt *ctxt) int rc; /* - * Make sure that this llcd will not be sent again as we detach + * Make sure that this llcd will not be sent again as we detach * it from ctxt. */ llcd = llcd_detach(ctxt); @@ -353,7 +372,7 @@ static int llcd_push(struct llog_ctxt *ctxt) llcd_print(llcd, __FUNCTION__, __LINE__); LBUG(); } - + rc = llcd_send(llcd); if (rc) CERROR("Couldn't send llcd %p (%d)\n", llcd, rc); @@ -372,11 +391,10 @@ int llog_recov_thread_start(struct llog_commit_master *lcm) rc = ptlrpcd_start(lcm->lcm_name, &lcm->lcm_pc); if (rc) { - CERROR("Error %d while starting recovery thread %s\n", + CERROR("Error %d while starting recovery thread %s\n", rc, lcm->lcm_name); RETURN(rc); } - lcm->lcm_set = lcm->lcm_pc.pc_set; RETURN(rc); } EXPORT_SYMBOL(llog_recov_thread_start); @@ -386,27 +404,52 @@ EXPORT_SYMBOL(llog_recov_thread_start); */ void llog_recov_thread_stop(struct llog_commit_master *lcm, int force) { - struct l_wait_info lwi = LWI_INTR(LWI_ON_SIGNAL_NOOP, NULL); ENTRY; - /** - * Let all know that we're stopping. This will also make + /* + * Let all know that we're stopping. 
This will also make * llcd_send() refuse any new llcds. */ set_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags); - /** + /* * Stop processing thread. No new rpcs will be accepted * for processing now. */ ptlrpcd_stop(&lcm->lcm_pc, force); /* - * Wait for llcd number == 0. Note, this is infinite wait. - * All other parts should make sure that no lost llcd is left. + * By this point no live inflight llcds should be left. Only + * those forgotten in sync may still be attached to ctxt. Let's + * print them. */ - l_wait_event(lcm->lcm_waitq, - atomic_read(&lcm->lcm_count) == 0, &lwi); + if (atomic_read(&lcm->lcm_count) != 0) { + struct llog_canceld_ctxt *llcd; + struct list_head *tmp; + + CERROR("Busy llcds found (%d) on lcm %p\n", + atomic_read(&lcm->lcm_count), lcm); + + spin_lock(&lcm->lcm_lock); + list_for_each(tmp, &lcm->lcm_llcds) { + llcd = list_entry(tmp, struct llog_canceld_ctxt, + llcd_list); + llcd_print(llcd, __FUNCTION__, __LINE__); + } + spin_unlock(&lcm->lcm_lock); + + /* + * No point in going further with busy llcds at this + * point, as this is a clear bug. It might mean we got + * a hanging rpc which holds an import ref and this + * means we will not be able to clean up anyway. + * + * Or we simply failed to kill them when they were not + * attached to ctxt. In this case our slab will remind + * us about this a bit later. + */ + LBUG(); + } EXIT; } EXPORT_SYMBOL(llog_recov_thread_stop); @@ -427,12 +470,12 @@ struct llog_commit_master *llog_recov_thread_init(char *name) /* * Try to create threads with unique names. */ - snprintf(lcm->lcm_name, sizeof(lcm->lcm_name), + snprintf(lcm->lcm_name, sizeof(lcm->lcm_name), "ll_log_commit_%s", name); - strncpy(lcm->lcm_name, name, sizeof(lcm->lcm_name)); - cfs_waitq_init(&lcm->lcm_waitq); atomic_set(&lcm->lcm_count, 0); + spin_lock_init(&lcm->lcm_lock); + CFS_INIT_LIST_HEAD(&lcm->lcm_llcds); rc = llog_recov_thread_start(lcm); if (rc) { CERROR("Can't start commit thread, rc %d\n", rc); @@ -457,7 +500,7 @@ void llog_recov_thread_fini(struct llog_commit_master *lcm, int force) } EXPORT_SYMBOL(llog_recov_thread_fini); -static int llog_recov_thread_replay(struct llog_ctxt *ctxt, +static int llog_recov_thread_replay(struct llog_ctxt *ctxt, void *cb, void *arg) { struct obd_device *obd = ctxt->loc_obd; @@ -486,7 +529,7 @@ OBD_FREE_PTR(lpca); RETURN(-ENODEV); } - rc = cfs_kernel_thread(llog_cat_process_thread, lpca, + rc = cfs_kernel_thread(llog_cat_process_thread, lpca, CLONE_VM | CLONE_FILES); if (rc < 0) { CERROR("Error starting llog_cat_process_thread(): %d\n", rc); @@ -507,14 +550,14 @@ int llog_obd_repl_connect(struct llog_ctxt *ctxt, int rc; ENTRY; - /* + /* * Send back cached llcd from llog before recovery if we have any. * This is a no-op if nothing cached is found there. */ llog_sync(ctxt, NULL); - /* - * Start recovery in separate thread. + /* + * Start recovery in separate thread. */ mutex_down(&ctxt->loc_sem); ctxt->loc_gen = *gen; @@ -525,7 +568,7 @@ } EXPORT_SYMBOL(llog_obd_repl_connect); -/** +/** * Deleted objects have a commit callback that cancels the MDS * log record for the deletion. The commit callback calls this * function. @@ -543,7 +586,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, mutex_down(&ctxt->loc_sem); lcm = ctxt->loc_lcm; - + /* * Let's check if we have all structures alive. We also check for * possible shutdown. Do nothing if we're stopping.
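/*
 * A hedged summary of the cancel path as reshaped by the hunks below
 * (the exact flow is an inference from the patch, not its own text):
 * under loc_sem, llog_obd_repl_cancel() bails out with -ENODEV once
 * LLOG_LCM_FL_EXIT is set, otherwise it llcd_get()s a buffer on
 * demand, pushes a full one with llcd_push() before copying new
 * cookies in, and pushes immediately when OBD_LLOG_FL_SENDNOW is
 * passed.
 */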
@@ -553,13 +596,8 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, GOTO(out, rc = -ENODEV); } - if (ctxt->loc_obd->obd_stopping) { - CDEBUG(D_RPCTRACE, "Obd is stopping for ctxt %p\n", ctxt); - GOTO(out, rc = -ENODEV); - } - if (test_bit(LLOG_LCM_FL_EXIT, &lcm->lcm_flags)) { - CDEBUG(D_RPCTRACE, "Commit thread is stopping for ctxt %p\n", + CDEBUG(D_RPCTRACE, "Commit thread is stopping for ctxt %p\n", ctxt); GOTO(out, rc = -ENODEV); } @@ -568,7 +606,7 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, if (count > 0 && cookies != NULL) { /* - * Get new llcd from ctxt if required. + * Get a new llcd from ctxt if required. */ if (!llcd) { llcd = llcd_get(ctxt); @@ -583,8 +621,8 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, } /* - * Llcd does not have enough room for @cookies. Let's push - * it out and allocate new one. + * Llcd does not have enough room for @cookies. Let's push + * it out and allocate a new one. */ if (!llcd_fit(llcd, cookies)) { rc = llcd_push(ctxt); @@ -602,15 +640,18 @@ int llog_obd_repl_cancel(struct llog_ctxt *ctxt, } /* - * Copy cookies to @llcd, no matter old or new allocated one. + * Copy cookies to @llcd, no matter old or new allocated + * one. */ llcd_copy(llcd, cookies); } /* - * Let's check if we need to send copied @cookies asap. If yes - do it. + * Let's check if we need to send copied @cookies asap. If yes + * then do it. */ if (llcd && (flags & OBD_LLOG_FL_SENDNOW)) { + CDEBUG(D_RPCTRACE, "Sync llcd %p\n", llcd); rc = llcd_push(ctxt); if (rc) GOTO(out, rc); @@ -629,16 +670,25 @@ int llog_obd_repl_sync(struct llog_ctxt *ctxt, struct obd_export *exp) int rc = 0; ENTRY; + /* + * Flush any remaining llcd. + */ mutex_down(&ctxt->loc_sem); if (exp && (ctxt->loc_imp == exp->exp_imp_reverse)) { - CDEBUG(D_RPCTRACE, "Reverse import disconnect\n"); /* - * Check for llcd which might be left attached to @ctxt. - * Let's kill it. + * This is an ost->mds connection; we can't be sure that the + * mds can still receive cookies, so let's kill the cached llcd. */ + CDEBUG(D_RPCTRACE, "Kill cached llcd\n"); llcd_put(ctxt); mutex_up(&ctxt->loc_sem); } else { + /* + * This is either llog_sync() from generic llog code or a sync + * on client disconnect. Either way, let's do it and send the + * llcds to the target, waiting for completion. + */ + CDEBUG(D_RPCTRACE, "Sync cached llcd\n"); mutex_up(&ctxt->loc_sem); rc = llog_cancel(ctxt, NULL, 0, NULL, OBD_LLOG_FL_SENDNOW); } @@ -663,7 +713,7 @@ int llog_recov_init(void) { int llcd_size; - llcd_size = CFS_PAGE_SIZE - + llcd_size = CFS_PAGE_SIZE - lustre_msg_size(LUSTRE_MSG_MAGIC_V2, 1, NULL); llcd_size += offsetof(struct llog_canceld_ctxt, llcd_cookies); llcd_cache = cfs_mem_cache_create("llcd_cache", llcd_size, 0, 0); @@ -680,7 +730,7 @@ int llog_recov_init(void) void llog_recov_fini(void) { /* - * Kill llcd cache when thread is stopped and we're sure no + * Kill llcd cache when thread is stopped and we're sure no * llcd in use left. */ if (llcd_cache) { /* * In 2.6.22 cfs_mem_cache_destroy() will not return error * for busy resources. Let's check it another way. */ - LASSERTF(atomic_read(&llcd_count) == 0, + LASSERTF(atomic_read(&llcd_count) == 0, "Can't destroy llcd cache!
Number of " "busy llcds: %d\n", atomic_read(&llcd_count)); cfs_mem_cache_destroy(llcd_cache); diff --git a/lustre/ptlrpc/recover.c b/lustre/ptlrpc/recover.c index 03d8270..4adf785 100644 --- a/lustre/ptlrpc/recover.c +++ b/lustre/ptlrpc/recover.c @@ -110,7 +110,7 @@ int ptlrpc_replay_next(struct obd_import *imp, int *inflight) */ list_for_each_safe(tmp, pos, &imp->imp_replay_list) { req = list_entry(tmp, struct ptlrpc_request, rq_replay_list); - + /* If need to resend the last sent transno (because a reconnect has occurred), then stop on the matching req and send it again. If, however, the last sent @@ -186,7 +186,7 @@ void ptlrpc_wake_delayed(struct obd_import *imp) req = list_entry(tmp, struct ptlrpc_request, rq_list); DEBUG_REQ(D_HA, req, "waking (set %p):", req->rq_set); - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); } spin_unlock(&imp->imp_lock); } @@ -245,19 +245,18 @@ int ptlrpc_set_import_active(struct obd_import *imp, int active) if (!active) { LCONSOLE_WARN("setting import %s INACTIVE by administrator " "request\n", obd2cli_tgt(imp->imp_obd)); - ptlrpc_invalidate_import(imp); + /* set before invalidate to avoid messages about imp_inval + * set without imp_deactive in ptlrpc_import_delay_req */ spin_lock(&imp->imp_lock); imp->imp_deactive = 1; spin_unlock(&imp->imp_lock); + + ptlrpc_invalidate_import(imp); } /* When activating, mark import valid, and attempt recovery */ if (active) { - spin_lock(&imp->imp_lock); - imp->imp_deactive = 0; - spin_unlock(&imp->imp_lock); - CDEBUG(D_HA, "setting import %s VALID\n", obd2cli_tgt(imp->imp_obd)); rc = ptlrpc_recover_import(imp, NULL); @@ -272,6 +271,13 @@ int ptlrpc_recover_import(struct obd_import *imp, char *new_uuid) int rc; ENTRY; + spin_lock(&imp->imp_lock); + if (atomic_read(&imp->imp_inval_count)) { + spin_unlock(&imp->imp_lock); + RETURN(-EINVAL); + } + spin_unlock(&imp->imp_lock); + /* force import to be disconnected. 
*/ ptlrpc_set_import_discon(imp, 0); diff --git a/lustre/ptlrpc/sec.c b/lustre/ptlrpc/sec.c index 977ac89..d268380 100644 --- a/lustre/ptlrpc/sec.c +++ b/lustre/ptlrpc/sec.c @@ -289,7 +289,7 @@ void sptlrpc_cli_ctx_wakeup(struct ptlrpc_cli_ctx *ctx) spin_lock(&ctx->cc_lock); list_for_each_entry_safe(req, next, &ctx->cc_req_list, rq_ctx_chain) { list_del_init(&req->rq_ctx_chain); - ptlrpc_wake_client_req(req); + ptlrpc_client_wake_req(req); } spin_unlock(&ctx->cc_lock); } @@ -305,7 +305,7 @@ int sptlrpc_cli_ctx_display(struct ptlrpc_cli_ctx *ctx, char *buf, int bufsize) return ctx->cc_ops->display(ctx, buf, bufsize); } -static int sptlrpc_import_sec_check_expire(struct obd_import *imp) +static int import_sec_check_expire(struct obd_import *imp) { int adapt = 0; @@ -324,34 +324,47 @@ static int sptlrpc_import_sec_check_expire(struct obd_import *imp) return sptlrpc_import_sec_adapt(imp, NULL, 0); } -int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +static int import_sec_validate_get(struct obd_import *imp, + struct ptlrpc_sec **sec) { - struct obd_import *imp = req->rq_import; - struct ptlrpc_sec *sec; - int rc; - ENTRY; - - LASSERT(!req->rq_cli_ctx); - LASSERT(imp); + int rc; if (unlikely(imp->imp_sec_expire)) { - rc = sptlrpc_import_sec_check_expire(imp); + rc = import_sec_check_expire(imp); if (rc) - RETURN(rc); + return rc; } - sec = sptlrpc_import_sec_ref(imp); - if (sec == NULL) { - CERROR("import %p (%s) with no ptlrpc_sec\n", + *sec = sptlrpc_import_sec_ref(imp); + if (*sec == NULL) { + CERROR("import %p (%s) with no sec\n", imp, ptlrpc_import_state_name(imp->imp_state)); - RETURN(-EACCES); + return -EACCES; } - if (unlikely(sec->ps_dying)) { + if (unlikely((*sec)->ps_dying)) { - CERROR("attempt to use dying sec %p\n", sec); + CERROR("attempt to use dying sec %p\n", *sec); + sptlrpc_sec_put(*sec); return -EACCES; } + return 0; +} + +int sptlrpc_req_get_ctx(struct ptlrpc_request *req) +{ + struct obd_import *imp = req->rq_import; + struct ptlrpc_sec *sec; + int rc; + ENTRY; + + LASSERT(!req->rq_cli_ctx); + LASSERT(imp); + + rc = import_sec_validate_get(imp, &sec); + if (rc) + RETURN(rc); + req->rq_cli_ctx = get_my_ctx(sec); sptlrpc_sec_put(sec); @@ -447,9 +460,13 @@ int sptlrpc_req_ctx_switch(struct ptlrpc_request *req, return rc; } -/* +/** + * if the current context has died, or if we resend after the flavor + * has been switched, call this func to switch to a new context. if no + * switch is needed, the request will end up with the same context. + * * request must have a context. in any case of failure, restore the - * restore the old one. a request must have a ctx. + * old one - a request must have a context.
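The import_sec_validate_get() helper introduced here follows a convention worth spelling out: on success the caller owns exactly one reference on *sec, and on any failure it owns nothing, which is why the dying-sec branch does sptlrpc_sec_put() before returning. A minimal userspace sketch of that contract, with hypothetical names in place of the real refcounting API:

    #include <errno.h>
    #include <stddef.h>

    struct sec {
            int refcount;
            int dying;
    };

    static struct sec *sec_ref(struct sec *s)
    {
            if (s != NULL)
                    s->refcount++;
            return s;
    }

    static void sec_put(struct sec *s)
    {
            s->refcount--;
    }

    /* On rc == 0 the caller holds one ref on *out; on error it holds none. */
    static int sec_validate_get(struct sec *imp_sec, struct sec **out)
    {
            *out = sec_ref(imp_sec);
            if (*out == NULL)
                    return -EACCES;
            if ((*out)->dying) {
                    sec_put(*out);          /* drop the ref we just took */
                    *out = NULL;
                    return -EACCES;
            }
            return 0;
    }

    int main(void)
    {
            struct sec s = { 1, 0 };
            struct sec *ref;

            return sec_validate_get(&s, &ref);  /* 0: caller owns one ref */
    }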
*/ int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) { @@ -459,7 +476,6 @@ int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) ENTRY; LASSERT(oldctx); - LASSERT(test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags)); sptlrpc_cli_ctx_get(oldctx); sptlrpc_req_put_ctx(req, 0); @@ -477,13 +493,16 @@ int sptlrpc_req_replace_dead_ctx(struct ptlrpc_request *req) LASSERT(newctx); if (unlikely(newctx == oldctx)) { - /* - * still get the old ctx, usually means system busy - */ - CWARN("ctx (%p, fl %lx) doesn't switch, relax a little bit\n", - newctx, newctx->cc_flags); - - cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, HZ); + if (test_bit(PTLRPC_CTX_DEAD_BIT, &oldctx->cc_flags)) { + /* + * still get the old ctx, usually means system busy + */ + CWARN("ctx (%p, fl %lx) doesn't switch, " + "relax a little bit\n", + newctx, newctx->cc_flags); + + cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, HZ); + } } else { rc = sptlrpc_req_ctx_switch(req, oldctx, newctx); if (rc) { @@ -518,7 +537,7 @@ int ctx_refresh_timeout(void *data) /* conn_cnt is needed in expire_one_request */ lustre_msg_set_conn_cnt(req->rq_reqmsg, req->rq_import->imp_conn_cnt); - rc = ptlrpc_expire_one_request(req); + rc = ptlrpc_expire_one_request(req, 1); /* if we started recovery, we should mark this ctx dead; otherwise * in case of lgssd died nobody would retire this ctx, following * connecting will still find the same ctx thus cause deadlock. @@ -564,23 +583,35 @@ void req_off_ctx_list(struct ptlrpc_request *req, struct ptlrpc_cli_ctx *ctx) int sptlrpc_req_refresh_ctx(struct ptlrpc_request *req, long timeout) { struct ptlrpc_cli_ctx *ctx = req->rq_cli_ctx; + struct ptlrpc_sec *sec; struct l_wait_info lwi; int rc; ENTRY; LASSERT(ctx); + if (req->rq_ctx_init || req->rq_ctx_fini) + RETURN(0); + /* * during the process a request's context might change type even * (e.g. from gss ctx to plain ctx), so each loop we need to re-check * everything */ again: - /* skip special ctxs */ - if (cli_ctx_is_eternal(ctx) || req->rq_ctx_init || req->rq_ctx_fini) + rc = import_sec_validate_get(req->rq_import, &sec); + if (rc) + RETURN(rc); + + if (sec->ps_flvr.sf_rpc != req->rq_flvr.sf_rpc) + sptlrpc_req_replace_dead_ctx(req); + + sptlrpc_sec_put(sec); + + if (cli_ctx_is_eternal(ctx)) RETURN(0); - if (test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags)) { + if (unlikely(test_bit(PTLRPC_CTX_NEW_BIT, &ctx->cc_flags))) { LASSERT(ctx->cc_ops->refresh); ctx->cc_ops->refresh(ctx); } @@ -630,6 +661,15 @@ again: } if (unlikely(test_bit(PTLRPC_CTX_DEAD_BIT, &ctx->cc_flags))) { + /* + * don't switch ctx if import was deactivated + */ + if (req->rq_import->imp_deactive) { + req_off_ctx_list(req, ctx); + req->rq_err = 1; + RETURN(-EINTR); + } + rc = sptlrpc_req_replace_dead_ctx(req); if (rc) { LASSERT(ctx == req->rq_cli_ctx); @@ -659,9 +699,8 @@ again: list_add(&req->rq_ctx_chain, &ctx->cc_req_list); spin_unlock(&ctx->cc_lock); - if (timeout < 0) { + if (timeout < 0) RETURN(-EWOULDBLOCK); - } /* Clear any flags that may be present from previous sends */ LASSERT(req->rq_receiving_reply == 0); @@ -682,7 +721,7 @@ again: * - timedout, and we don't want recover from the failure; * - timedout, and waked up upon recovery finished; * - someone else mark this ctx dead by force; - * - someone invalidate the req and call wake_client_req(), + * - someone invalidate the req and call ptlrpc_client_wake_req(), * e.g. 
ptlrpc_abort_inflight(); */ if (!cli_ctx_is_refreshed(ctx)) { @@ -794,7 +833,7 @@ int sptlrpc_import_check_ctx(struct obd_import *imp) sptlrpc_sec_put(sec); if (!ctx) - RETURN(1); + RETURN(-ENOMEM); if (cli_ctx_is_eternal(ctx) || ctx->cc_ops->validate(ctx) == 0) { @@ -802,6 +841,11 @@ int sptlrpc_import_check_ctx(struct obd_import *imp) RETURN(0); } + if (cli_ctx_is_error(ctx)) { + sptlrpc_cli_ctx_put(ctx, 1); + RETURN(-EACCES); + } + OBD_ALLOC_PTR(req); if (!req) RETURN(-ENOMEM); @@ -811,6 +855,7 @@ int sptlrpc_import_check_ctx(struct obd_import *imp) CFS_INIT_LIST_HEAD(&req->rq_ctx_chain); cfs_waitq_init(&req->rq_reply_waitq); req->rq_import = imp; + req->rq_flvr = sec->ps_flvr; req->rq_cli_ctx = ctx; rc = sptlrpc_req_refresh_ctx(req, 0); @@ -1275,17 +1320,28 @@ int sptlrpc_import_sec_adapt(struct obd_import *imp, enum lustre_sec_part sp; int rc; + might_sleep(); + if (imp == NULL) return 0; conn = imp->imp_connection; if (svc_ctx == NULL) { - /* normal import, determine flavor from rule set */ - sptlrpc_rule_set_choose(&imp->imp_obd->u.cli.cl_sptlrpc_rset, - LUSTRE_SP_ANY, conn->c_self, &sf); - - sp = imp->imp_obd->u.cli.cl_sec_part; + struct client_obd *cliobd = &imp->imp_obd->u.cli; + /* + * normal import, determine flavor from rule set, except + * for mgc the flavor is predetermined. + */ + if (cliobd->cl_sp_me == LUSTRE_SP_MGC) + sf = cliobd->cl_flvr_mgc; + else + sptlrpc_conf_choose_flavor(cliobd->cl_sp_me, + cliobd->cl_sp_to, + &cliobd->cl_target_uuid, + conn->c_self, &sf); + + sp = imp->imp_obd->u.cli.cl_sp_me; } else { /* reverse import, determine flavor from incoming reqeust */ sf.sf_rpc = rpc_flavor; @@ -1599,7 +1655,7 @@ static int flavor_allowed(struct sptlrpc_flavor *exp, { struct sptlrpc_flavor *flvr = &req->rq_flvr; - if (exp->sf_rpc == flvr->sf_rpc) + if (exp->sf_rpc == SPTLRPC_FLVR_ANY || exp->sf_rpc == flvr->sf_rpc) return 1; if ((req->rq_ctx_init || req->rq_ctx_fini) && @@ -1770,11 +1826,23 @@ int sptlrpc_target_export_check(struct obd_export *exp, spin_unlock(&exp->exp_lock); - CWARN("req %p: (%u|%u|%u|%u|%u) with unauthorized flavor %x\n", + CWARN("exp %p(%s): req %p (%u|%u|%u|%u|%u) with " + "unauthorized flavor %x, expect %x|%x(%+ld)|%x(%+ld)\n", + exp, exp->exp_obd->obd_name, req, req->rq_auth_gss, req->rq_ctx_init, req->rq_ctx_fini, - req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_flvr.sf_rpc); + req->rq_auth_usr_root, req->rq_auth_usr_mdt, req->rq_flvr.sf_rpc, + exp->exp_flvr.sf_rpc, + exp->exp_flvr_old[0].sf_rpc, + exp->exp_flvr_expire[0] ? + (unsigned long) (exp->exp_flvr_expire[0] - + cfs_time_current_sec()) : 0, + exp->exp_flvr_old[1].sf_rpc, + exp->exp_flvr_expire[1] ? + (unsigned long) (exp->exp_flvr_expire[1] - + cfs_time_current_sec()) : 0); return -EACCES; } +EXPORT_SYMBOL(sptlrpc_target_export_check); void sptlrpc_target_update_exp_flavor(struct obd_device *obd, struct sptlrpc_rule_set *rset) @@ -1794,15 +1862,16 @@ void sptlrpc_target_update_exp_flavor(struct obd_device *obd, * (exp_flvr_changed == 1), this will override the * previous one. 
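The expanded CWARN above prints not only the expected flavor but also the two previously installed ones and how long each remains valid. A plausible reading of those exp_flvr_old[]/exp_flvr_expire[] fields, reduced to a standalone check (hypothetical struct; in the real code these fields live on obd_export), is:

    #include <stdint.h>
    #include <time.h>

    struct flvr_state {
            uint16_t cur;              /* like exp_flvr.sf_rpc */
            uint16_t old[2];           /* like exp_flvr_old[].sf_rpc */
            time_t   old_expire[2];    /* like exp_flvr_expire[]; 0 = unused */
    };

    /* A request flavor is allowed if it matches the current flavor or a
     * recently retired one whose grace window has not yet expired. */
    static int flavor_allowed(const struct flvr_state *fs, uint16_t rq, time_t now)
    {
            int i;

            if (rq == fs->cur)
                    return 1;
            for (i = 0; i < 2; i++)
                    if (fs->old_expire[i] != 0 && now < fs->old_expire[i] &&
                        rq == fs->old[i])
                            return 1;
            return 0;
    }

    int main(void)
    {
            struct flvr_state fs = { 0x0004, { 0x0001, 0 }, { 0, 0 } };

            return flavor_allowed(&fs, 0x0004, time(NULL)) ? 0 : 1;
    }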
*/ spin_lock(&exp->exp_lock); - sptlrpc_rule_set_choose(rset, exp->exp_sp_peer, - exp->exp_connection->c_peer.nid, - &new_flvr); + sptlrpc_target_choose_flavor(rset, exp->exp_sp_peer, + exp->exp_connection->c_peer.nid, + &new_flvr); if (exp->exp_flvr_changed || memcmp(&new_flvr, &exp->exp_flvr, sizeof(new_flvr))) { exp->exp_flvr_old[1] = new_flvr; exp->exp_flvr_expire[1] = 0; exp->exp_flvr_changed = 1; exp->exp_flvr_adapt = 1; + CDEBUG(D_SEC, "exp %p (%s): updated flavor %x->%x\n", exp, sptlrpc_part2name(exp->exp_sp_peer), exp->exp_flvr.sf_rpc, @@ -1824,6 +1893,7 @@ static int sptlrpc_svc_check_from(struct ptlrpc_request *req, int svc_rc) case LUSTRE_SP_CLI: case LUSTRE_SP_MDT: case LUSTRE_SP_OST: + case LUSTRE_SP_MGC: case LUSTRE_SP_MGS: case LUSTRE_SP_ANY: break; @@ -2271,35 +2341,6 @@ EXPORT_SYMBOL(sec2target_str); * crypto API helper/alloc blkciper * ****************************************/ -#ifdef __KERNEL__ -#ifndef HAVE_ASYNC_BLOCK_CIPHER -struct ll_crypto_cipher *ll_crypto_alloc_blkcipher(const char * algname, - u32 type, u32 mask) -{ - char buf[CRYPTO_MAX_ALG_NAME + 1]; - const char *pan = algname; - u32 flag = 0; - - if (strncmp("cbc(", algname, 4) == 0) - flag |= CRYPTO_TFM_MODE_CBC; - else if (strncmp("ecb(", algname, 4) == 0) - flag |= CRYPTO_TFM_MODE_ECB; - if (flag) { - char *vp = strnchr(algname, CRYPTO_MAX_ALG_NAME, ')'); - if (vp) { - memcpy(buf, algname + 4, vp - algname - 4); - buf[vp - algname - 4] = '\0'; - pan = buf; - } else { - flag = 0; - } - } - return crypto_alloc_tfm(pan, flag); -} -EXPORT_SYMBOL(ll_crypto_alloc_blkcipher); -#endif -#endif - /**************************************** * initialize/finalize * ****************************************/ @@ -2310,14 +2351,18 @@ int __init sptlrpc_init(void) rwlock_init(&policy_lock); - rc = sptlrpc_gc_start_thread(); + rc = sptlrpc_gc_init(); if (rc) goto out; - rc = sptlrpc_enc_pool_init(); + rc = sptlrpc_conf_init(); if (rc) goto out_gc; + rc = sptlrpc_enc_pool_init(); + if (rc) + goto out_conf; + rc = sptlrpc_null_init(); if (rc) goto out_pool; @@ -2338,8 +2383,10 @@ out_null: sptlrpc_null_fini(); out_pool: sptlrpc_enc_pool_fini(); +out_conf: + sptlrpc_conf_fini(); out_gc: - sptlrpc_gc_stop_thread(); + sptlrpc_gc_fini(); out: return rc; } @@ -2350,5 +2397,6 @@ void __exit sptlrpc_fini(void) sptlrpc_plain_fini(); sptlrpc_null_fini(); sptlrpc_enc_pool_fini(); - sptlrpc_gc_stop_thread(); + sptlrpc_conf_fini(); + sptlrpc_gc_fini(); } diff --git a/lustre/ptlrpc/sec_config.c b/lustre/ptlrpc/sec_config.c index 67287e8..b54a3a4 100644 --- a/lustre/ptlrpc/sec_config.c +++ b/lustre/ptlrpc/sec_config.c @@ -53,7 +53,10 @@ #include #include #include +#include +#include #include +#include #include #include "ptlrpc_internal.h" @@ -67,6 +70,8 @@ const char *sptlrpc_part2name(enum lustre_sec_part part) return "mdt"; case LUSTRE_SP_OST: return "ost"; + case LUSTRE_SP_MGC: + return "mgc"; case LUSTRE_SP_MGS: return "mgs"; case LUSTRE_SP_ANY: @@ -117,11 +122,11 @@ static void get_default_flavor(struct sptlrpc_flavor *sf) sf->sf_flags = 0; } -static void get_flavor_by_rpc(struct sptlrpc_rule *rule, __u16 rpc_flavor) +static void get_flavor_by_rpc(struct sptlrpc_flavor *flvr, __u16 rpc_flavor) { - get_default_flavor(&rule->sr_flvr); + get_default_flavor(flvr); - rule->sr_flvr.sf_rpc = rpc_flavor; + flvr->sf_rpc = rpc_flavor; switch (rpc_flavor) { case SPTLRPC_FLVR_NULL: @@ -129,46 +134,46 @@ static void get_flavor_by_rpc(struct sptlrpc_rule *rule, __u16 rpc_flavor) case SPTLRPC_FLVR_PLAIN: case SPTLRPC_FLVR_KRB5N: case 
SPTLRPC_FLVR_KRB5A: - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_DEFAULT; + flvr->sf_bulk_hash = BULK_HASH_ALG_DEFAULT; break; case SPTLRPC_FLVR_KRB5P: - rule->sr_flvr.sf_bulk_ciph = BULK_CIPH_ALG_AES128; + flvr->sf_bulk_ciph = BULK_CIPH_ALG_AES128; /* fall through */ case SPTLRPC_FLVR_KRB5I: - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_SHA1; + flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1; break; default: LBUG(); } } -static void get_flavor_by_bulk(struct sptlrpc_rule *rule, +static void get_flavor_by_bulk(struct sptlrpc_flavor *flvr, __u16 rpc_flavor, bulk_type_t bulk_type) { switch (bulk_type) { case BULK_TYPE_N: - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_NULL; - rule->sr_flvr.sf_bulk_ciph = BULK_CIPH_ALG_NULL; + flvr->sf_bulk_hash = BULK_HASH_ALG_NULL; + flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL; break; case BULK_TYPE_I: switch (rpc_flavor) { case SPTLRPC_FLVR_PLAIN: case SPTLRPC_FLVR_KRB5N: case SPTLRPC_FLVR_KRB5A: - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_DEFAULT; + flvr->sf_bulk_hash = BULK_HASH_ALG_DEFAULT; break; case SPTLRPC_FLVR_KRB5I: case SPTLRPC_FLVR_KRB5P: - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_SHA1; + flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1; break; default: LBUG(); } - rule->sr_flvr.sf_bulk_ciph = BULK_CIPH_ALG_NULL; + flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL; break; case BULK_TYPE_P: - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_SHA1; - rule->sr_flvr.sf_bulk_ciph = BULK_CIPH_ALG_AES128; + flvr->sf_bulk_hash = BULK_HASH_ALG_SHA1; + flvr->sf_bulk_ciph = BULK_CIPH_ALG_AES128; break; default: LBUG(); @@ -195,7 +200,7 @@ static __u16 __flavors[] = { * krb5i-bulkp * krb5i-bulkp:sha512/arc4 */ -static int parse_flavor(char *str, struct sptlrpc_rule *rule) +int sptlrpc_parse_flavor(const char *str, struct sptlrpc_flavor *flvr) { const char *f; char *bulk, *alg, *enc; @@ -205,7 +210,7 @@ static int parse_flavor(char *str, struct sptlrpc_rule *rule) ENTRY; if (str == NULL || str[0] == '\0') { - rule->sr_flvr.sf_rpc = SPTLRPC_FLVR_INVALID; + flvr->sf_rpc = SPTLRPC_FLVR_INVALID; goto out; } @@ -231,7 +236,7 @@ static int parse_flavor(char *str, struct sptlrpc_rule *rule) if (strcmp(buf, f) != 0) GOTO(invalid, -EINVAL); - get_flavor_by_rpc(rule, __flavors[i]); + get_flavor_by_rpc(flvr, __flavors[i]); if (bulk == NULL) goto out; @@ -243,8 +248,8 @@ static int parse_flavor(char *str, struct sptlrpc_rule *rule) /* verify bulk section */ if (strcmp(bulk, "bulkn") == 0) { - rule->sr_flvr.sf_bulk_hash = BULK_HASH_ALG_NULL; - rule->sr_flvr.sf_bulk_ciph = BULK_CIPH_ALG_NULL; + flvr->sf_bulk_hash = BULK_HASH_ALG_NULL; + flvr->sf_bulk_ciph = BULK_CIPH_ALG_NULL; bulk_type = BULK_TYPE_N; } else if (strcmp(bulk, "bulki") == 0) bulk_type = BULK_TYPE_I; @@ -261,7 +266,7 @@ static int parse_flavor(char *str, struct sptlrpc_rule *rule) if (__flavors[i] == SPTLRPC_FLVR_PLAIN && bulk_type == BULK_TYPE_P) GOTO(invalid, -EINVAL); - get_flavor_by_bulk(rule, __flavors[i], bulk_type); + get_flavor_by_bulk(flvr, __flavors[i], bulk_type); if (alg == NULL) goto out; @@ -274,7 +279,7 @@ static int parse_flavor(char *str, struct sptlrpc_rule *rule) /* checksum algorithm */ for (i = 0; i < BULK_HASH_ALG_MAX; i++) { if (strcmp(alg, sptlrpc_get_hash_name(i)) == 0) { - rule->sr_flvr.sf_bulk_hash = i; + flvr->sf_bulk_hash = i; break; } } @@ -285,7 +290,7 @@ static int parse_flavor(char *str, struct sptlrpc_rule *rule) if (enc) { for (i = 0; i < BULK_CIPH_ALG_MAX; i++) { if (strcmp(enc, sptlrpc_get_ciph_name(i)) == 0) { - rule->sr_flvr.sf_bulk_ciph = i; + flvr->sf_bulk_ciph = i; break; } } @@ -297,17 +302,17 @@ static 
int parse_flavor(char *str, struct sptlrpc_rule *rule) * bulk combination sanity checks */ if (bulk_type == BULK_TYPE_P && - rule->sr_flvr.sf_bulk_ciph == BULK_CIPH_ALG_NULL) + flvr->sf_bulk_ciph == BULK_CIPH_ALG_NULL) GOTO(invalid, -EINVAL); if (bulk_type == BULK_TYPE_I && - (rule->sr_flvr.sf_bulk_hash == BULK_HASH_ALG_NULL || - rule->sr_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL)) + (flvr->sf_bulk_hash == BULK_HASH_ALG_NULL || + flvr->sf_bulk_ciph != BULK_CIPH_ALG_NULL)) GOTO(invalid, -EINVAL); if (bulk_type == BULK_TYPE_N && - (rule->sr_flvr.sf_bulk_hash != BULK_HASH_ALG_NULL || - rule->sr_flvr.sf_bulk_ciph != BULK_CIPH_ALG_NULL)) + (flvr->sf_bulk_hash != BULK_HASH_ALG_NULL || + flvr->sf_bulk_ciph != BULK_CIPH_ALG_NULL)) GOTO(invalid, -EINVAL); out: @@ -316,6 +321,7 @@ invalid: CERROR("invalid flavor string: %s\n", str); return -EINVAL; } +EXPORT_SYMBOL(sptlrpc_parse_flavor); /**************************************** * configure rules * @@ -382,7 +388,7 @@ int sptlrpc_parse_rule(char *param, struct sptlrpc_rule *rule) } /* 2.1 flavor */ - rc = parse_flavor(flavor, rule); + rc = sptlrpc_parse_flavor(flavor, &rule->sr_flvr); if (rc) RETURN(-EINVAL); @@ -418,22 +424,21 @@ int sptlrpc_rule_set_expand(struct sptlrpc_rule_set *rset, int expand) if (expand == 0) return -E2BIG; - if (rset->srs_nslot == 0) - nslot = 8; - else - nslot = rset->srs_nslot + 8; + nslot = rset->srs_nslot + 8; /* better use realloc() if available */ OBD_ALLOC(rules, nslot * sizeof(*rset->srs_rules)); if (rules == NULL) return -ENOMEM; - memcpy(rules, rset->srs_rules, - rset->srs_nrule * sizeof(*rset->srs_rules)); + if (rset->srs_nrule) { + LASSERT(rset->srs_nslot && rset->srs_rules); + memcpy(rules, rset->srs_rules, + rset->srs_nrule * sizeof(*rset->srs_rules)); - if (rset->srs_rules) OBD_FREE(rset->srs_rules, rset->srs_nslot * sizeof(*rset->srs_rules)); + } rset->srs_rules = rules; rset->srs_nslot = nslot; @@ -543,7 +548,7 @@ int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, memcpy(&rset->srs_rules[n], rule, sizeof(*rule)); rset->srs_nrule++; } else { - CWARN("ignore the unmatched deletion\n"); + CDEBUG(D_CONFIG, "ignore the unmatched deletion\n"); } } @@ -551,35 +556,15 @@ int sptlrpc_rule_set_merge(struct sptlrpc_rule_set *rset, } EXPORT_SYMBOL(sptlrpc_rule_set_merge); -int sptlrpc_rule_set_from_log(struct sptlrpc_rule_set *rset, - struct sptlrpc_conf_log *log) -{ - LASSERT(rset); - LASSERT(log); - - sptlrpc_rule_set_free(rset); - - if (log->scl_nrule == 0) - return 0; - - OBD_ALLOC(rset->srs_rules, log->scl_nrule * sizeof(*log->scl_rules)); - if (!rset->srs_rules) - return -ENOMEM; - - memcpy(rset->srs_rules, log->scl_rules, - log->scl_nrule * sizeof(*log->scl_rules)); - rset->srs_nslot = rset->srs_nrule = log->scl_nrule; - return 0; -} -EXPORT_SYMBOL(sptlrpc_rule_set_from_log); - -/* - * according to NID/from choose a flavor from rule set. +/** + * given from/to/nid, determine a matching flavor in ruleset. + * return 1 if a match found, otherwise return 0. 
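As the new doc comment says, the reworked chooser returns 1 on a match and 0 otherwise, leaving the default-flavor fallback to its callers, and a rule now has to agree on three axes: network, source part and target part, where ANY on either side makes an axis match. A compact userspace model of that matching loop follows; rule ordering (most specific first) is assumed to have been established by the merge step, and the names are stand-ins:

    #include <stdint.h>

    enum sec_part { SP_CLI, SP_MDT, SP_OST, SP_MGC, SP_MGS, SP_ANY };
    #define NET_ANY 0xffffffffu

    struct rule { uint32_t net; enum sec_part from, to; uint16_t flvr; };

    static int rule_set_choose(const struct rule *r, int n,
                               enum sec_part from, enum sec_part to,
                               uint32_t net, uint16_t *flvr)
    {
            int i;

            for (i = 0; i < n; i++, r++) {
                    if (r->net != NET_ANY && net != NET_ANY && net != r->net)
                            continue;
                    if (r->from != SP_ANY && from != SP_ANY && from != r->from)
                            continue;
                    if (r->to != SP_ANY && to != SP_ANY && to != r->to)
                            continue;
                    *flvr = r->flvr;
                    return 1;       /* first match wins */
            }
            return 0;               /* caller falls back to default flavor */
    }

    int main(void)
    {
            const struct rule rules[] = {
                    { 7, SP_CLI, SP_MDT, 0x0003 },          /* specific rule */
                    { NET_ANY, SP_ANY, SP_ANY, 0x0001 },    /* catch-all */
            };
            uint16_t flvr = 0;

            return rule_set_choose(rules, 2, SP_CLI, SP_MDT, 7, &flvr) ? 0 : 1;
    }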
*/ -void sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, - enum lustre_sec_part from, - lnet_nid_t nid, - struct sptlrpc_flavor *sf) +int sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + enum lustre_sec_part to, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) { struct sptlrpc_rule *r; int n; @@ -596,12 +581,15 @@ void sptlrpc_rule_set_choose(struct sptlrpc_rule_set *rset, from != r->sr_from) continue; + if (to != LUSTRE_SP_ANY && r->sr_to != LUSTRE_SP_ANY && + to != r->sr_to) + continue; + *sf = r->sr_flvr; - return; + return 1; } - /* no match found, set as default flavor */ - get_default_flavor(sf); + return 0; } EXPORT_SYMBOL(sptlrpc_rule_set_choose); @@ -618,71 +606,16 @@ void sptlrpc_rule_set_dump(struct sptlrpc_rule_set *rset) } EXPORT_SYMBOL(sptlrpc_rule_set_dump); -/**************************************** - * sptlrpc config log * - ****************************************/ - -struct sptlrpc_conf_log *sptlrpc_conf_log_alloc(void) -{ - struct sptlrpc_conf_log *log; - - OBD_ALLOC_PTR(log); - if (log == NULL) - return ERR_PTR(-ENOMEM); - - log->scl_max = SPTLRPC_CONF_LOG_MAX; - return log; -} -EXPORT_SYMBOL(sptlrpc_conf_log_alloc); - -void sptlrpc_conf_log_free(struct sptlrpc_conf_log *log) -{ - LASSERT(log->scl_max == SPTLRPC_CONF_LOG_MAX); - OBD_FREE_PTR(log); -} -EXPORT_SYMBOL(sptlrpc_conf_log_free); - -static __u32 get_log_rule_flags(enum lustre_sec_part from, - enum lustre_sec_part to, - unsigned int fl_udesc) -{ - /* MDT->MDT; MDT->OST */ - if (from == LUSTRE_SP_MDT) - return PTLRPC_SEC_FL_ROOTONLY; - /* CLI->OST */ - if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) - return PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; - /* CLI->MDT */ - if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) - if (fl_udesc) - return PTLRPC_SEC_FL_UDESC; - - return 0; -} - -/* - * generate config log: merge general and target rules, which - * match @from @to - */ -int sptlrpc_conf_log_populate(struct sptlrpc_rule_set *gen, - struct sptlrpc_rule_set *tgt, - enum lustre_sec_part from, - enum lustre_sec_part to, - unsigned int fl_udesc, - struct sptlrpc_conf_log *log) +static int sptlrpc_rule_set_extract(struct sptlrpc_rule_set *gen, + struct sptlrpc_rule_set *tgt, + enum lustre_sec_part from, + enum lustre_sec_part to, + struct sptlrpc_rule_set *rset) { struct sptlrpc_rule_set *src[2] = { gen, tgt }; - struct sptlrpc_rule_set dst; struct sptlrpc_rule *rule; - __u32 flags; int i, n, rc; - LASSERT(log); - - dst.srs_nslot = log->scl_max; - dst.srs_nrule = 0; - dst.srs_rules = log->scl_rules; - /* merge general rules firstly, then target-specific rules */ for (i = 0; i < 2; i++) { if (src[i] == NULL) @@ -700,7 +633,7 @@ int sptlrpc_conf_log_populate(struct sptlrpc_rule_set *gen, rule->sr_to != to) continue; - rc = sptlrpc_rule_set_merge(&dst, rule, 0); + rc = sptlrpc_rule_set_merge(rset, rule, 1); if (rc) { CERROR("can't merge: %d\n", rc); return rc; @@ -708,125 +641,757 @@ int sptlrpc_conf_log_populate(struct sptlrpc_rule_set *gen, } } - log->scl_nrule = dst.srs_nrule; + return 0; +} + +/********************************** + * sptlrpc configuration support * + **********************************/ + +struct sptlrpc_conf_tgt { + struct list_head sct_list; + char sct_name[MAX_OBD_NAME]; + struct sptlrpc_rule_set sct_rset; +}; + +struct sptlrpc_conf { + struct list_head sc_list; + char sc_fsname[MTI_NAME_MAXLEN]; + unsigned int sc_modified; /* modified during updating */ + unsigned int sc_updated:1, /* updated copy from MGS */ + sc_local:1; /* local copy from target */ 
+ struct sptlrpc_rule_set sc_rset; /* fs general rules */ + struct list_head sc_tgts; /* target-specific rules */ +}; - /* set flags for each rule */ - flags = get_log_rule_flags(from, to, fl_udesc); +static struct mutex sptlrpc_conf_lock; +static CFS_LIST_HEAD(sptlrpc_confs); - for (i = 0; i < log->scl_nrule; i++) { - log->scl_rules[i].sr_flvr.sf_flags = flags; +static inline int is_hex(char c) +{ + return ((c >= '0' && c <= '9') || + (c >= 'a' && c <= 'f')); +} - /* also clear the from/to fields which don't need to be known - * accordingly. @from == ANY means this log is for target, - * otherwise for client. */ - if (from != LUSTRE_SP_ANY) - log->scl_rules[i].sr_from = LUSTRE_SP_ANY; - log->scl_rules[i].sr_to = LUSTRE_SP_ANY; +static void target2fsname(const char *tgt, char *fsname, int buflen) +{ + const char *ptr; + int len; + + ptr = strrchr(tgt, '-'); + if (ptr) { + if ((strncmp(ptr, "-MDT", 4) != 0 && + strncmp(ptr, "-OST", 4) != 0) || + !is_hex(ptr[4]) || !is_hex(ptr[5]) || + !is_hex(ptr[6]) || !is_hex(ptr[7])) + ptr = NULL; } - return 0; + /* if we didn't find the pattern, treat the whole string as fsname */ + if (ptr == NULL) + len = strlen(tgt); + else + len = ptr - tgt; + + len = min(len, buflen - 1); + memcpy(fsname, tgt, len); + fsname[len] = '\0'; } -EXPORT_SYMBOL(sptlrpc_conf_log_populate); -/* - * extract config log from @lcfg +static void sptlrpc_conf_free_rsets(struct sptlrpc_conf *conf) +{ + struct sptlrpc_conf_tgt *conf_tgt, *conf_tgt_next; + + sptlrpc_rule_set_free(&conf->sc_rset); + + list_for_each_entry_safe(conf_tgt, conf_tgt_next, + &conf->sc_tgts, sct_list) { + sptlrpc_rule_set_free(&conf_tgt->sct_rset); + list_del(&conf_tgt->sct_list); + OBD_FREE_PTR(conf_tgt); + } + LASSERT(list_empty(&conf->sc_tgts)); + + conf->sc_updated = 0; + conf->sc_local = 0; +} + +static void sptlrpc_conf_free(struct sptlrpc_conf *conf) +{ + CDEBUG(D_SEC, "free sptlrpc conf %s\n", conf->sc_fsname); + + sptlrpc_conf_free_rsets(conf); + list_del(&conf->sc_list); + OBD_FREE_PTR(conf); +} + +static +struct sptlrpc_conf_tgt *sptlrpc_conf_get_tgt(struct sptlrpc_conf *conf, + const char *name, + int create) +{ + struct sptlrpc_conf_tgt *conf_tgt; + + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + if (strcmp(conf_tgt->sct_name, name) == 0) + return conf_tgt; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf_tgt); + if (conf_tgt) { + strncpy(conf_tgt->sct_name, name, sizeof(conf_tgt->sct_name)); + sptlrpc_rule_set_init(&conf_tgt->sct_rset); + list_add(&conf_tgt->sct_list, &conf->sc_tgts); + } + + return conf_tgt; +} + +static +struct sptlrpc_conf *sptlrpc_conf_get(const char *fsname, + int create) +{ + struct sptlrpc_conf *conf; + + list_for_each_entry(conf, &sptlrpc_confs, sc_list) { + if (strcmp(conf->sc_fsname, fsname) == 0) + return conf; + } + + if (!create) + return NULL; + + OBD_ALLOC_PTR(conf); + if (conf == NULL) + return NULL; + + strcpy(conf->sc_fsname, fsname); + sptlrpc_rule_set_init(&conf->sc_rset); + CFS_INIT_LIST_HEAD(&conf->sc_tgts); + list_add(&conf->sc_list, &sptlrpc_confs); + + CDEBUG(D_SEC, "create sptlrpc conf %s\n", conf->sc_fsname); + return conf; +} + +/** + * caller must hold conf_lock already. 
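The fsname derivation used throughout this file is self-contained enough to test in isolation. Below is a standalone copy of the logic with a tiny driver; only main() is an addition, the parsing mirrors target2fsname() above: strip a trailing -MDTxxxx/-OSTxxxx suffix (four lowercase hex digits) if present, otherwise take the whole string as the fsname.

    #include <stdio.h>
    #include <string.h>

    static int is_hex(char c)
    {
            return (c >= '0' && c <= '9') || (c >= 'a' && c <= 'f');
    }

    /* "lustre-MDT0000" -> "lustre"; a name without a valid target suffix
     * is copied verbatim (truncated to the buffer if necessary). */
    static void target2fsname(const char *tgt, char *fsname, int buflen)
    {
            const char *ptr = strrchr(tgt, '-');
            size_t len;

            if (ptr && ((strncmp(ptr, "-MDT", 4) && strncmp(ptr, "-OST", 4)) ||
                        !is_hex(ptr[4]) || !is_hex(ptr[5]) ||
                        !is_hex(ptr[6]) || !is_hex(ptr[7])))
                    ptr = NULL;

            len = ptr ? (size_t)(ptr - tgt) : strlen(tgt);
            if (len > (size_t)buflen - 1)
                    len = buflen - 1;
            memcpy(fsname, tgt, len);
            fsname[len] = '\0';
    }

    int main(void)
    {
            char buf[16];

            target2fsname("lustre-MDT0000", buf, sizeof(buf));
            printf("%s\n", buf);    /* prints "lustre" */
            return 0;
    }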
*/ -struct sptlrpc_conf_log *sptlrpc_conf_log_extract(struct lustre_cfg *lcfg) +static int sptlrpc_conf_merge_rule(struct sptlrpc_conf *conf, + const char *target, + struct sptlrpc_rule *rule) { - struct sptlrpc_conf_log *log; - struct sptlrpc_rule *r; - int i; + struct sptlrpc_conf_tgt *conf_tgt; + struct sptlrpc_rule_set *rule_set; + + /* fsname == target means general rules for the whole fs */ + if (strcmp(conf->sc_fsname, target) == 0) { + rule_set = &conf->sc_rset; + } else { + conf_tgt = sptlrpc_conf_get_tgt(conf, target, 1); + if (conf_tgt) { + rule_set = &conf_tgt->sct_rset; + } else { + CERROR("out of memory, can't merge rule!\n"); + return -ENOMEM; + } + } + + return sptlrpc_rule_set_merge(rule_set, rule, 1); +} + +/** + * process one LCFG_SPTLRPC_CONF record. if \a conf is NULL, we + * find one through the target name in the record inside conf_lock; + * otherwise means caller already hold conf_lock. + */ +static int __sptlrpc_process_config(struct lustre_cfg *lcfg, + struct sptlrpc_conf *conf) +{ + char *target, *param; + char fsname[MTI_NAME_MAXLEN]; + struct sptlrpc_rule rule; + int rc; ENTRY; - log = lustre_cfg_buf(lcfg, 1); - if (log == NULL) { - CERROR("no sptlrpc config data\n"); - RETURN(ERR_PTR(-EINVAL)); + target = lustre_cfg_string(lcfg, 1); + if (target == NULL) { + CERROR("missing target name\n"); + RETURN(-EINVAL); } - if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { - __swab32s(&log->scl_max); - __swab32s(&log->scl_nrule); + param = lustre_cfg_string(lcfg, 2); + if (param == NULL) { + CERROR("missing parameter\n"); + RETURN(-EINVAL); } - if (LUSTRE_CFG_BUFLEN(lcfg, 1) < - log->scl_max * sizeof(log->scl_rules[0])) { - CERROR("mal-formed config log\n"); - RETURN(ERR_PTR(-EINVAL)); + CDEBUG(D_SEC, "got one rule: %s.%s\n", target, param); + + /* parse rule to make sure the format is correct */ + if (strncmp(param, PARAM_SRPC_FLVR, sizeof(PARAM_SRPC_FLVR) - 1) != 0) { + CERROR("Invalid sptlrpc parameter: %s\n", param); + RETURN(-EINVAL); } + param += sizeof(PARAM_SRPC_FLVR) - 1; + + rc = sptlrpc_parse_rule(param, &rule); + if (rc) + RETURN(-EINVAL); - if (lcfg->lcfg_version == __swab32(LUSTRE_CFG_VERSION)) { - for (i = 0; i < log->scl_nrule; i++) { - r = &log->scl_rules[i]; - __swab32s(&r->sr_netid); - __swab16s(&r->sr_flvr.sf_rpc); - __swab32s(&r->sr_flvr.sf_flags); + if (conf == NULL) { + target2fsname(target, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("can't find conf\n"); + rc = -ENOMEM; + } else { + rc = sptlrpc_conf_merge_rule(conf, target, &rule); } + mutex_unlock(&sptlrpc_conf_lock); + } else { + LASSERT(mutex_is_locked(&sptlrpc_conf_lock)); + rc = sptlrpc_conf_merge_rule(conf, target, &rule); } - RETURN(log); + if (rc == 0) + conf->sc_modified++; + + RETURN(rc); } -EXPORT_SYMBOL(sptlrpc_conf_log_extract); -void sptlrpc_conf_log_cleanup(struct sptlrpc_conf_log *log) +int sptlrpc_process_config(struct lustre_cfg *lcfg) { - log->scl_nrule = 0; - memset(log->scl_rules, 0, sizeof(log->scl_rules)); + return __sptlrpc_process_config(lcfg, NULL); } -EXPORT_SYMBOL(sptlrpc_conf_log_cleanup); +EXPORT_SYMBOL(sptlrpc_process_config); -void sptlrpc_conf_log_dump(struct sptlrpc_conf_log *log) +static int logname2fsname(const char *logname, char *buf, int buflen) { - struct sptlrpc_rule *r; - int n; + char *ptr; + int len; - CWARN("max %u, rule# %u part %u\n", - log->scl_max, log->scl_nrule, log->scl_part); + ptr = strrchr(logname, '-'); + if (ptr == NULL || strcmp(ptr, "-sptlrpc")) 
{ + CERROR("%s is not a sptlrpc config log\n", logname); + return -EINVAL; + } - for (n = 0; n < log->scl_nrule; n++) { - r = &log->scl_rules[n]; - CWARN("<%02d> %x -> %x, net %x, rpc %x\n", n, - r->sr_from, r->sr_to, r->sr_netid, r->sr_flvr.sf_rpc); + len = min((int) (ptr - logname), buflen - 1); + + memcpy(buf, logname, len); + buf[len] = '\0'; + return 0; +} + +void sptlrpc_conf_log_update_begin(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf && conf->sc_local) { + LASSERT(conf->sc_updated == 0); + sptlrpc_conf_free_rsets(conf); } + conf->sc_modified = 0; + + mutex_unlock(&sptlrpc_conf_lock); } -EXPORT_SYMBOL(sptlrpc_conf_log_dump); +EXPORT_SYMBOL(sptlrpc_conf_log_update_begin); -/* - * caller should guarantee that no concurrent calls to this function +/** + * mark a config log has been updated + */ +void sptlrpc_conf_log_update_end(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf) { + /* + * if original state is not updated, make sure the + * modified counter > 0 to enforce updating local copy. + */ + if (conf->sc_updated == 0) + conf->sc_modified++; + + conf->sc_updated = 1; + } + + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_update_end); + +void sptlrpc_conf_log_start(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 1); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_start); + +void sptlrpc_conf_log_stop(const char *logname) +{ + struct sptlrpc_conf *conf; + char fsname[16]; + + if (logname2fsname(logname, fsname, sizeof(fsname))) + return; + + mutex_lock(&sptlrpc_conf_lock); + conf = sptlrpc_conf_get(fsname, 0); + if (conf) + sptlrpc_conf_free(conf); + mutex_unlock(&sptlrpc_conf_lock); +} +EXPORT_SYMBOL(sptlrpc_conf_log_stop); + +static void inline flavor_set_flags(struct sptlrpc_flavor *sf, + enum lustre_sec_part from, + enum lustre_sec_part to, + unsigned int fl_udesc) +{ + if (from == LUSTRE_SP_MDT) { + /* MDT->MDT; MDT->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_OST) { + /* CLI->OST */ + sf->sf_flags |= PTLRPC_SEC_FL_ROOTONLY | PTLRPC_SEC_FL_BULK; + } else if (from == LUSTRE_SP_CLI && to == LUSTRE_SP_MDT) { + /* CLI->MDT */ + if (fl_udesc && sf->sf_rpc != SPTLRPC_FLVR_NULL) + sf->sf_flags |= PTLRPC_SEC_FL_UDESC; + } +} + +void sptlrpc_conf_choose_flavor(enum lustre_sec_part from, + enum lustre_sec_part to, + struct obd_uuid *target, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + char name[MTI_NAME_MAXLEN]; + int len, rc = 0; + + target2fsname(target->uuid, name, sizeof(name)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(name, 0); + if (conf == NULL) + goto out; + + /* convert uuid name (supposed end with _UUID) to target name */ + len = strlen(target->uuid); + LASSERT(len > 5); + memcpy(name, target->uuid, len - 5); + name[len - 5] = '\0'; + + conf_tgt = sptlrpc_conf_get_tgt(conf, name, 0); + if (conf_tgt) { + rc = sptlrpc_rule_set_choose(&conf_tgt->sct_rset, + from, to, nid, 
sf); + if (rc) + goto out; + } + + rc = sptlrpc_rule_set_choose(&conf->sc_rset, from, to, nid, sf); +out: + mutex_unlock(&sptlrpc_conf_lock); + + if (rc == 0) + get_default_flavor(sf); + + flavor_set_flags(sf, from, to, 1); +} + +/** + * called by target devices, determine the expected flavor from + * certain peer (from, nid). */ +void sptlrpc_target_choose_flavor(struct sptlrpc_rule_set *rset, + enum lustre_sec_part from, + lnet_nid_t nid, + struct sptlrpc_flavor *sf) +{ + if (sptlrpc_rule_set_choose(rset, from, LUSTRE_SP_ANY, nid, sf) == 0) + get_default_flavor(sf); +} +EXPORT_SYMBOL(sptlrpc_target_choose_flavor); + #define SEC_ADAPT_DELAY (10) -int sptlrpc_cliobd_process_config(struct obd_device *obd, - struct lustre_cfg *lcfg) +/** + * called by client devices, notify the sptlrpc config has changed and + * do import_sec_adapt later. + */ +void sptlrpc_conf_client_adapt(struct obd_device *obd) +{ + struct obd_import *imp; + ENTRY; + + LASSERT(strcmp(obd->obd_type->typ_name, LUSTRE_MDC_NAME) == 0 || + strcmp(obd->obd_type->typ_name, LUSTRE_OSC_NAME) ==0); + CDEBUG(D_SEC, "obd %s\n", obd->u.cli.cl_target_uuid.uuid); + + /* serialize with connect/disconnect import */ + down_read(&obd->u.cli.cl_sem); + + imp = obd->u.cli.cl_import; + if (imp) { + spin_lock(&imp->imp_lock); + if (imp->imp_sec) + imp->imp_sec_expire = cfs_time_current_sec() + + SEC_ADAPT_DELAY; + spin_unlock(&imp->imp_lock); + } + + up_read(&obd->u.cli.cl_sem); + EXIT; +} +EXPORT_SYMBOL(sptlrpc_conf_client_adapt); + +#ifdef __KERNEL__ + +static void rule2string(struct sptlrpc_rule *r, char *buf, int buflen) +{ + char dirbuf[8]; + char *net; + char *ptr = buf; + + if (r->sr_netid == LNET_NIDNET(LNET_NID_ANY)) + net = "default"; + else + net = libcfs_net2str(r->sr_netid); + + if (r->sr_from == LUSTRE_SP_ANY && r->sr_to == LUSTRE_SP_ANY) + dirbuf[0] = '\0'; + else + snprintf(dirbuf, sizeof(dirbuf), ".%s2%s", + sptlrpc_part2name(r->sr_from), + sptlrpc_part2name(r->sr_to)); + + ptr += snprintf(buf, buflen, "srpc.flavor.%s%s=", net, dirbuf); + + sptlrpc_flavor2name(&r->sr_flvr, ptr, buflen - (ptr - buf)); + buf[buflen - 1] = '\0'; +} + +static int sptlrpc_record_rule_set(struct llog_handle *llh, + char *target, + struct sptlrpc_rule_set *rset) +{ + struct lustre_cfg_bufs bufs; + struct lustre_cfg *lcfg; + struct llog_rec_hdr rec; + int buflen; + char param[48]; + int i, rc; + + for (i = 0; i < rset->srs_nrule; i++) { + rule2string(&rset->srs_rules[i], param, sizeof(param)); + + lustre_cfg_bufs_reset(&bufs, NULL); + lustre_cfg_bufs_set_string(&bufs, 1, target); + lustre_cfg_bufs_set_string(&bufs, 2, param); + lcfg = lustre_cfg_new(LCFG_SPTLRPC_CONF, &bufs); + LASSERT(lcfg); + + buflen = lustre_cfg_len(lcfg->lcfg_bufcount, + lcfg->lcfg_buflens); + rec.lrh_len = llog_data_len(buflen); + rec.lrh_type = OBD_CFG_REC; + rc = llog_write_rec(llh, &rec, NULL, 0, (void *)lcfg, -1); + if (rc) + CERROR("failed to write a rec: rc = %d\n", rc); + lustre_cfg_free(lcfg); + } + return 0; +} + +static int sptlrpc_record_rules(struct llog_handle *llh, + struct sptlrpc_conf *conf) { - struct sptlrpc_conf_log *log; - struct obd_import *imp; - int rc; + struct sptlrpc_conf_tgt *conf_tgt; + + sptlrpc_record_rule_set(llh, conf->sc_fsname, &conf->sc_rset); - log = sptlrpc_conf_log_extract(lcfg); - if (IS_ERR(log)) { - CERROR("extract log error: %ld\n", PTR_ERR(log)); - return PTR_ERR(log); + list_for_each_entry(conf_tgt, &conf->sc_tgts, sct_list) { + sptlrpc_record_rule_set(llh, conf_tgt->sct_name, + &conf_tgt->sct_rset); } + return 0; +} - 
obd->u.cli.cl_sec_part = log->scl_part; +#define LOG_SPTLRPC_TMP "sptlrpc.tmp" +#define LOG_SPTLRPC "sptlrpc" + +static +int sptlrpc_target_local_copy_conf(struct obd_device *obd, + struct sptlrpc_conf *conf) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + struct lvfs_run_ctxt saved; + struct dentry *dentry; + int rc; + ENTRY; + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt == NULL) { + CERROR("missing llog context\n"); + RETURN(-EINVAL); + } + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + dentry = lookup_one_len(MOUNT_CONFIGS_DIR, current->fs->pwd, + strlen(MOUNT_CONFIGS_DIR)); + if (IS_ERR(dentry)) { + rc = PTR_ERR(dentry); + CERROR("cannot lookup %s directory: rc = %d\n", + MOUNT_CONFIGS_DIR, rc); + GOTO(out_ctx, rc); + } + + /* erase the old tmp log */ + rc = llog_create(ctxt, &llh, NULL, LOG_SPTLRPC_TMP); + if (rc == 0) { + rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL); + if (rc == 0) { + rc = llog_destroy(llh); + llog_free_handle(llh); + } else { + llog_close(llh); + } + } - rc = sptlrpc_rule_set_from_log(&obd->u.cli.cl_sptlrpc_rset, log); if (rc) { - CERROR("failed create rule set: %d\n", rc); - return rc; + CERROR("target %s: cannot erase temporary sptlrpc log: " + "rc = %d\n", obd->obd_name, rc); + GOTO(out_dput, rc); } - imp = obd->u.cli.cl_import; - if (imp == NULL) - return 0; - - /* even if imp_sec_expire is already set, we'll override it to a - * newer (later) time */ - spin_lock(&imp->imp_lock); - if (imp->imp_sec) - imp->imp_sec_expire = cfs_time_current_sec() + SEC_ADAPT_DELAY; - spin_unlock(&imp->imp_lock); + /* write temporary log */ + rc = llog_create(ctxt, &llh, NULL, LOG_SPTLRPC_TMP); + if (rc) + GOTO(out_dput, rc); + rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL); + if (rc) + GOTO(out_close, rc); + + rc = sptlrpc_record_rules(llh, conf); + +out_close: + llog_close(llh); + + if (rc == 0) { + rc = lustre_rename(dentry, obd->obd_lvfs_ctxt.pwdmnt, + LOG_SPTLRPC_TMP, LOG_SPTLRPC); + } + +out_dput: + l_dput(dentry); +out_ctx: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + CDEBUG(D_SEC, "target %s: write local sptlrpc conf: rc = %d\n", + obd->obd_name, rc); + RETURN(rc); +} + +static int local_read_handler(struct llog_handle *llh, + struct llog_rec_hdr *rec, + void *data) +{ + struct sptlrpc_conf *conf = (struct sptlrpc_conf *) data; + struct lustre_cfg *lcfg = (struct lustre_cfg *)(rec + 1); + int cfg_len, rc; + ENTRY; + + if (rec->lrh_type != OBD_CFG_REC) { + CERROR("unhandled lrh_type: %#x\n", rec->lrh_type); + RETURN(-EINVAL); + } + + cfg_len = rec->lrh_len - sizeof(struct llog_rec_hdr) - + sizeof(struct llog_rec_tail); + + rc = lustre_cfg_sanity_check(lcfg, cfg_len); + if (rc) { + CERROR("Insane cfg\n"); + RETURN(rc); + } + + if (lcfg->lcfg_command != LCFG_SPTLRPC_CONF) { + CERROR("invalid command (%x)\n", lcfg->lcfg_command); + RETURN(-EINVAL); + } + + RETURN(__sptlrpc_process_config(lcfg, conf)); +} + +static +int sptlrpc_target_local_read_conf(struct obd_device *obd, + struct sptlrpc_conf *conf) +{ + struct llog_handle *llh = NULL; + struct llog_ctxt *ctxt; + struct lvfs_run_ctxt saved; + int rc; + ENTRY; + + LASSERT(conf->sc_updated == 0 && conf->sc_local == 0); + + ctxt = llog_get_context(obd, LLOG_CONFIG_ORIG_CTXT); + if (ctxt == NULL) { + CERROR("missing llog context\n"); + RETURN(-EINVAL); + } + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + rc = llog_create(ctxt, &llh, NULL, LOG_SPTLRPC); + if (rc) + GOTO(out_pop, rc); + + rc = llog_init_handle(llh, LLOG_F_IS_PLAIN, NULL); + if 
(rc) + GOTO(out_close, rc); + + if (llog_get_size(llh) <= 1) { + CDEBUG(D_SEC, "no local sptlrpc copy found\n"); + GOTO(out_close, rc = 0); + } + + rc = llog_process(llh, local_read_handler, (void *) conf, NULL); + + if (rc == 0) { + conf->sc_local = 1; + } else { + sptlrpc_conf_free_rsets(conf); + } + +out_close: + llog_close(llh); +out_pop: + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + llog_ctxt_put(ctxt); + CDEBUG(D_SEC, "target %s: read local sptlrpc conf: rc = %d\n", + obd->obd_name, rc); + RETURN(rc); +} + +#endif /* __KERNEL__ */ + +/** + * called by target devices, extract sptlrpc rules which apply to + * this target, to be used for future rpc flavor checking. + */ +int sptlrpc_conf_target_get_rules(struct obd_device *obd, + struct sptlrpc_rule_set *rset, + int initial) +{ + struct sptlrpc_conf *conf; + struct sptlrpc_conf_tgt *conf_tgt; + enum lustre_sec_part sp_dst; + char fsname[MTI_NAME_MAXLEN]; + int rc = 0; + ENTRY; + + if (strcmp(obd->obd_type->typ_name, LUSTRE_MDT_NAME) == 0) { + sp_dst = LUSTRE_SP_MDT; + } else if (strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME) == 0) { + sp_dst = LUSTRE_SP_OST; + } else { + CERROR("unexpected obd type %s\n", obd->obd_type->typ_name); + RETURN(-EINVAL); + } + CDEBUG(D_SEC, "get rules for target %s\n", obd->obd_uuid.uuid); + + target2fsname(obd->obd_uuid.uuid, fsname, sizeof(fsname)); + + mutex_lock(&sptlrpc_conf_lock); + + conf = sptlrpc_conf_get(fsname, 0); + if (conf == NULL) { + CERROR("missing sptlrpc config log\n"); + GOTO(out, rc); + } + +#ifdef __KERNEL__ + if (conf->sc_updated == 0) { + /* + * always read from the local copy. another option here + * would be: if we already have a local copy (read by + * another target device hosted on the same node), + * simply use that. + */ + if (conf->sc_local) + sptlrpc_conf_free_rsets(conf); + + sptlrpc_target_local_read_conf(obd, conf); + } else { + LASSERT(conf->sc_local == 0); + + /* write a local copy */ + if (initial || conf->sc_modified) + sptlrpc_target_local_copy_conf(obd, conf); + else + CDEBUG(D_SEC, "unchanged, skip updating local copy\n"); + } +#endif + + /* extract rule set for this target */ + conf_tgt = sptlrpc_conf_get_tgt(conf, obd->obd_name, 0); + + rc = sptlrpc_rule_set_extract(&conf->sc_rset, + conf_tgt ?
&conf_tgt->sct_rset: NULL, + LUSTRE_SP_ANY, sp_dst, rset); +out: + mutex_unlock(&sptlrpc_conf_lock); + RETURN(rc); +} +EXPORT_SYMBOL(sptlrpc_conf_target_get_rules); + +int sptlrpc_conf_init(void) +{ + mutex_init(&sptlrpc_conf_lock); return 0; } -EXPORT_SYMBOL(sptlrpc_cliobd_process_config); + +void sptlrpc_conf_fini(void) +{ + struct sptlrpc_conf *conf, *conf_next; + + mutex_lock(&sptlrpc_conf_lock); + list_for_each_entry_safe(conf, conf_next, &sptlrpc_confs, sc_list) { + sptlrpc_conf_free(conf); + } + LASSERT(list_empty(&sptlrpc_confs)); + mutex_unlock(&sptlrpc_conf_lock); +} diff --git a/lustre/ptlrpc/sec_gc.c b/lustre/ptlrpc/sec_gc.c index bc1fe0d..76c4296 100644 --- a/lustre/ptlrpc/sec_gc.c +++ b/lustre/ptlrpc/sec_gc.c @@ -58,12 +58,12 @@ #ifdef __KERNEL__ -static DECLARE_MUTEX(sec_gc_mutex); +static struct mutex sec_gc_mutex; static CFS_LIST_HEAD(sec_gc_list); -static spinlock_t sec_gc_list_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t sec_gc_list_lock; static CFS_LIST_HEAD(sec_gc_ctx_list); -static spinlock_t sec_gc_ctx_list_lock = SPIN_LOCK_UNLOCKED; +static spinlock_t sec_gc_ctx_list_lock; static struct ptlrpc_thread sec_gc_thread; static atomic_t sec_gc_wait_del = ATOMIC_INIT(0); @@ -100,8 +100,8 @@ void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) spin_unlock(&sec_gc_list_lock); /* barrier */ - mutex_down(&sec_gc_mutex); - mutex_up(&sec_gc_mutex); + mutex_lock(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); atomic_dec(&sec_gc_wait_del); @@ -190,19 +190,19 @@ again: * to trace each sec as order of expiry time. * another issue here is we wakeup as fixed interval instead of * according to each sec's expiry time */ - mutex_down(&sec_gc_mutex); + mutex_lock(&sec_gc_mutex); list_for_each_entry(sec, &sec_gc_list, ps_gc_list) { /* if someone is waiting to be deleted, let it * proceed as soon as possible. 
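The back-to-back mutex_lock()/mutex_unlock() pair in sptlrpc_gc_del_sec() deserves a note: it is a barrier, not a critical section. Once the sec is off the list, taking and releasing the mutex that the GC loop holds for the whole scan guarantees that any scan which might still reference the sec has finished. A toy pthread version of the same idiom, with hypothetical names:

    #include <pthread.h>
    #include <stddef.h>

    struct item { struct item *next; };

    static struct item *gc_list;
    static pthread_mutex_t list_lock = PTHREAD_MUTEX_INITIALIZER;
    static pthread_mutex_t gc_mutex  = PTHREAD_MUTEX_INITIALIZER; /* held by GC scans */

    static void gc_del(struct item *it)
    {
            struct item **pp;

            pthread_mutex_lock(&list_lock);
            for (pp = &gc_list; *pp != NULL; pp = &(*pp)->next) {
                    if (*pp == it) {
                            *pp = it->next;         /* unlink under list lock */
                            break;
                    }
            }
            pthread_mutex_unlock(&list_lock);

            /* barrier: a GC pass holds gc_mutex for its whole scan, so once
             * we acquire it, no scan started before the unlink is running */
            pthread_mutex_lock(&gc_mutex);
            pthread_mutex_unlock(&gc_mutex);

            /* 'it' can now be freed safely */
    }

    int main(void)
    {
            struct item a = { NULL };

            gc_list = &a;
            gc_del(&a);
            return 0;
    }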
*/ if (atomic_read(&sec_gc_wait_del)) { CWARN("deletion pending, start over\n"); - mutex_up(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); goto again; } sec_do_gc(sec); } - mutex_up(&sec_gc_mutex); + mutex_unlock(&sec_gc_mutex); /* check ctx list again before sleep */ sec_process_ctx_list(); @@ -223,11 +223,15 @@ again: return 0; } -int sptlrpc_gc_start_thread(void) +int sptlrpc_gc_init(void) { struct l_wait_info lwi = { 0 }; int rc; + mutex_init(&sec_gc_mutex); + spin_lock_init(&sec_gc_list_lock); + spin_lock_init(&sec_gc_ctx_list_lock); + /* initialize thread control */ memset(&sec_gc_thread, 0, sizeof(sec_gc_thread)); cfs_waitq_init(&sec_gc_thread.t_ctl_waitq); @@ -244,7 +248,7 @@ int sptlrpc_gc_start_thread(void) return 0; } -void sptlrpc_gc_stop_thread(void) +void sptlrpc_gc_fini(void) { struct l_wait_info lwi = { 0 }; @@ -263,11 +267,11 @@ void sptlrpc_gc_add_sec(struct ptlrpc_sec *sec) void sptlrpc_gc_del_sec(struct ptlrpc_sec *sec) { } -int sptlrpc_gc_start_thread(void) +int sptlrpc_gc_init(void) { return 0; } -void sptlrpc_gc_stop_thread(void) +void sptlrpc_gc_fini(void) { } diff --git a/lustre/ptlrpc/sec_null.c b/lustre/ptlrpc/sec_null.c index c63401b..7b4368d 100644 --- a/lustre/ptlrpc/sec_null.c +++ b/lustre/ptlrpc/sec_null.c @@ -96,7 +96,7 @@ int null_ctx_sign(struct ptlrpc_cli_ctx *ctx, struct ptlrpc_request *req) if (!req->rq_import->imp_dlm_fake) { struct obd_device *obd = req->rq_import->imp_obd; null_encode_sec_part(req->rq_reqbuf, - obd->u.cli.cl_sec_part); + obd->u.cli.cl_sp_me); } req->rq_reqdata_len = req->rq_reqlen; return 0; diff --git a/lustre/ptlrpc/sec_plain.c b/lustre/ptlrpc/sec_plain.c index fd4e723..eb9ee82 100644 --- a/lustre/ptlrpc/sec_plain.c +++ b/lustre/ptlrpc/sec_plain.c @@ -318,7 +318,7 @@ struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, /* * initialize plain_sec */ - plsec->pls_lock = RW_LOCK_UNLOCKED; + rwlock_init(&plsec->pls_lock); plsec->pls_ctx = NULL; sec = &plsec->pls_base; @@ -328,7 +328,7 @@ struct ptlrpc_sec *plain_create_sec(struct obd_import *imp, sec->ps_id = sptlrpc_get_next_secid(); sec->ps_import = class_import_get(imp); sec->ps_flvr = *sf; - sec->ps_lock = SPIN_LOCK_UNLOCKED; + spin_lock_init(&sec->ps_lock); CFS_INIT_LIST_HEAD(&sec->ps_gc_list); sec->ps_gc_interval = 0; sec->ps_gc_next = 0; diff --git a/lustre/ptlrpc/service.c b/lustre/ptlrpc/service.c index 05ff23d..2422ac5 100644 --- a/lustre/ptlrpc/service.c +++ b/lustre/ptlrpc/service.c @@ -169,7 +169,7 @@ ptlrpc_grow_req_bufs(struct ptlrpc_service *svc) void ptlrpc_save_lock (struct ptlrpc_request *req, - struct lustre_handle *lock, int mode) + struct lustre_handle *lock, int mode, int no_ack) { struct ptlrpc_reply_state *rs = req->rq_reply_state; int idx; @@ -181,26 +181,29 @@ ptlrpc_save_lock (struct ptlrpc_request *req, rs->rs_locks[idx] = *lock; rs->rs_modes[idx] = mode; rs->rs_difficult = 1; + rs->rs_no_ack = !!no_ack; } void ptlrpc_schedule_difficult_reply (struct ptlrpc_reply_state *rs) { struct ptlrpc_service *svc = rs->rs_service; + ENTRY; -#ifdef CONFIG_SMP - LASSERT (spin_is_locked (&svc->srv_lock)); -#endif + LASSERT_SPIN_LOCKED(&svc->srv_lock); LASSERT (rs->rs_difficult); rs->rs_scheduled_ever = 1; /* flag any notification attempt */ - if (rs->rs_scheduled) /* being set up or already notified */ + if (rs->rs_scheduled) { /* being set up or already notified */ + EXIT; return; + } rs->rs_scheduled = 1; list_del (&rs->rs_list); list_add (&rs->rs_list, &svc->srv_reply_queue); cfs_waitq_signal (&svc->srv_waitq); + EXIT; } void @@ -208,6 +211,7 @@ 
ptlrpc_commit_replies (struct obd_device *obd) { struct list_head *tmp; struct list_head *nxt; + ENTRY; /* Find any replies that have been committed and get their service * to attend to complete them. */ @@ -232,6 +236,7 @@ ptlrpc_commit_replies (struct obd_device *obd) } spin_unlock(&obd->obd_uncommitted_replies_lock); + EXIT; } static int @@ -293,7 +298,7 @@ struct ptlrpc_service *ptlrpc_init_svc_conf(struct ptlrpc_service_conf *c, c->psc_watchdog_factor, h, name, proc_entry, prntfn, c->psc_min_threads, c->psc_max_threads, - threadname, c->psc_ctx_tags); + threadname, c->psc_ctx_tags, NULL); } EXPORT_SYMBOL(ptlrpc_init_svc_conf); @@ -313,7 +318,8 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, cfs_proc_dir_entry_t *proc_entry, svcreq_printfn_t svcreq_printfn, int min_threads, int max_threads, - char *threadname, __u32 ctx_tags) + char *threadname, __u32 ctx_tags, + svc_hpreq_handler_t hp_handler) { int rc; struct ptlrpc_service *service; @@ -348,11 +354,16 @@ ptlrpc_init_svc(int nbufs, int bufsize, int max_req_size, int max_reply_size, service->srv_threads_max = max_threads; service->srv_thread_name = threadname; service->srv_ctx_tags = ctx_tags; + service->srv_hpreq_handler = hp_handler; + service->srv_hpreq_ratio = PTLRPC_SVC_HP_RATIO; + service->srv_hpreq_count = 0; + service->srv_n_hpreq = 0; rc = LNetSetLazyPortal(service->srv_req_portal); LASSERT (rc == 0); CFS_INIT_LIST_HEAD(&service->srv_request_queue); + CFS_INIT_LIST_HEAD(&service->srv_request_hpq); CFS_INIT_LIST_HEAD(&service->srv_idle_rqbds); CFS_INIT_LIST_HEAD(&service->srv_active_rqbds); CFS_INIT_LIST_HEAD(&service->srv_history_rqbds); @@ -513,6 +524,11 @@ static void ptlrpc_server_finish_request(struct ptlrpc_request *req) { struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + if (req->rq_export) { + class_export_put(req->rq_export); + req->rq_export = NULL; + } + if (req->rq_phase != RQ_PHASE_NEW) /* incorrect message magic */ DEBUG_REQ(D_INFO, req, "free req"); @@ -530,7 +546,7 @@ static void ptlrpc_server_finish_request(struct ptlrpc_request *req) static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) { struct obd_export *oldest_exp; - time_t oldest_time; + time_t oldest_time, new_time; ENTRY; @@ -541,9 +557,13 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) of the list, we can be really lazy here - we don't have to evict at the exact right moment. Eventually, all silent exports will make it to the top of the list. */ - exp->exp_last_request_time = max(exp->exp_last_request_time, - cfs_time_current_sec() + extra_delay); + /* Do not pay attention on 1sec or smaller renewals. */ + new_time = cfs_time_current_sec() + extra_delay; + if (exp->exp_last_request_time + 1 /*second */ >= new_time) + RETURN_EXIT; + + exp->exp_last_request_time = new_time; CDEBUG(D_HA, "updating export %s at "CFS_TIME_T" exp %p\n", exp->exp_client_uuid.uuid, exp->exp_last_request_time, exp); @@ -556,8 +576,7 @@ static void ptlrpc_update_export_timer(struct obd_export *exp, long extra_delay) if (list_empty(&exp->exp_obd_chain_timed)) { /* this one is not timed */ spin_unlock(&exp->exp_obd->obd_dev_lock); - EXIT; - return; + RETURN_EXIT; } list_move_tail(&exp->exp_obd_chain_timed, @@ -917,6 +936,167 @@ static int ptlrpc_at_check_timed(struct ptlrpc_service *svc) RETURN(0); } +/** + * Put the request to the export list if the request may become + * a high priority one. 
+ */ +static int ptlrpc_hpreq_init(struct ptlrpc_service *svc, + struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + if (svc->srv_hpreq_handler) { + rc = svc->srv_hpreq_handler(req); + if (rc) + RETURN(rc); + } + if (req->rq_export && req->rq_ops) { + spin_lock(&req->rq_export->exp_lock); + list_add(&req->rq_exp_list, &req->rq_export->exp_queued_rpc); + spin_unlock(&req->rq_export->exp_lock); + } + + RETURN(0); +} + +/** Remove the request from the export list. */ +static void ptlrpc_hpreq_fini(struct ptlrpc_request *req) +{ + ENTRY; + if (req->rq_export && req->rq_ops) { + spin_lock(&req->rq_export->exp_lock); + list_del_init(&req->rq_exp_list); + spin_unlock(&req->rq_export->exp_lock); + } + EXIT; +} + +/** + * Make the request a high priority one. + * + * All the high priority requests are queued in a separate FIFO + * ptlrpc_service::srv_request_hpq list which is parallel to + * ptlrpc_service::srv_request_queue list but has a higher priority + * for handling. + * + * \see ptlrpc_server_handle_request(). + */ +static void ptlrpc_hpreq_reorder_nolock(struct ptlrpc_service *svc, + struct ptlrpc_request *req) +{ + ENTRY; + LASSERT(svc != NULL); + spin_lock(&req->rq_lock); + if (req->rq_hp == 0) { + int opc = lustre_msg_get_opc(req->rq_reqmsg); + + /* Add to the high priority queue. */ + list_move_tail(&req->rq_list, &svc->srv_request_hpq); + req->rq_hp = 1; + if (opc != OBD_PING) + DEBUG_REQ(D_NET, req, "high priority req"); + } + spin_unlock(&req->rq_lock); + EXIT; +} + +void ptlrpc_hpreq_reorder(struct ptlrpc_request *req) +{ + struct ptlrpc_service *svc = req->rq_rqbd->rqbd_service; + ENTRY; + + spin_lock(&svc->srv_lock); + /* It may happen that the request is already taken for processing + * but still in the export list, do not re-add it into the HP list. */ + if (req->rq_phase == RQ_PHASE_NEW) + ptlrpc_hpreq_reorder_nolock(svc, req); + spin_unlock(&svc->srv_lock); + EXIT; +} + +/** Check if the request is a high priority one. */ +static int ptlrpc_server_hpreq_check(struct ptlrpc_request *req) +{ + int opc, rc = 0; + ENTRY; + + /* Check by request opc. */ + opc = lustre_msg_get_opc(req->rq_reqmsg); + if (opc == OBD_PING) + RETURN(1); + + /* Perform request specific check. */ + if (req->rq_ops && req->rq_ops->hpreq_check) + rc = req->rq_ops->hpreq_check(req); + RETURN(rc); +} + +/** Add a request to the service queue; put it on the high priority queue + * if it qualifies. */ +static int ptlrpc_server_request_add(struct ptlrpc_service *svc, + struct ptlrpc_request *req) +{ + int rc; + ENTRY; + + rc = ptlrpc_server_hpreq_check(req); + if (rc < 0) + RETURN(rc); + + spin_lock(&svc->srv_lock); + /* Before inserting the request into the queue, check if it is not + * inserted yet, or even already handled -- it may happen due to + * a racing ldlm_server_blocking_ast(). */ + if (req->rq_phase == RQ_PHASE_NEW && list_empty(&req->rq_list)) { + if (rc) + ptlrpc_hpreq_reorder_nolock(svc, req); + else + list_add_tail(&req->rq_list, &svc->srv_request_queue); + } + spin_unlock(&svc->srv_lock); + + RETURN(0); +} + +/* Only allow normal priority requests on a service that has a high-priority + * queue if forced (i.e. cleanup), if there are other high priority requests + * already being processed (i.e. those threads can service more high-priority + * requests), or if there are enough idle threads that a later thread can do + * a high priority request.
*/ +static int ptlrpc_server_allow_normal(struct ptlrpc_service *svc, int force) +{ + return force || !svc->srv_hpreq_handler || svc->srv_n_hpreq > 0 || + svc->srv_n_active_reqs < svc->srv_threads_running - 2; +} + +static struct ptlrpc_request * +ptlrpc_server_request_get(struct ptlrpc_service *svc, int force) +{ + struct ptlrpc_request *req = NULL; + ENTRY; + + if (ptlrpc_server_allow_normal(svc, force) && + !list_empty(&svc->srv_request_queue) && + (list_empty(&svc->srv_request_hpq) || + svc->srv_hpreq_count >= svc->srv_hpreq_ratio)) { + req = list_entry(svc->srv_request_queue.next, + struct ptlrpc_request, rq_list); + svc->srv_hpreq_count = 0; + } else if (!list_empty(&svc->srv_request_hpq)) { + req = list_entry(svc->srv_request_hpq.next, + struct ptlrpc_request, rq_list); + svc->srv_hpreq_count++; + } + RETURN(req); +} + +static int ptlrpc_server_request_pending(struct ptlrpc_service *svc, int force) +{ + return ((ptlrpc_server_allow_normal(svc, force) && + !list_empty(&svc->srv_request_queue)) || + !list_empty(&svc->srv_request_hpq)); +} + /* Handle freshly incoming reqs, add to timed early reply list, pass on to regular request queue */ static int @@ -996,10 +1176,9 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) "illegal security flavor,"); } - class_export_put(req->rq_export); - req->rq_export = NULL; if (rc) goto err_req; + ptlrpc_update_export_timer(req->rq_export, 0); } /* req_in handling should/must be fast */ @@ -1020,12 +1199,15 @@ ptlrpc_server_handle_req_in(struct ptlrpc_service *svc) } ptlrpc_at_add_timed(req); + rc = ptlrpc_hpreq_init(svc, req); + if (rc) + GOTO(err_req, rc); /* Move it over to the request processing queue */ - spin_lock(&svc->srv_lock); - list_add_tail(&req->rq_list, &svc->srv_request_queue); + rc = ptlrpc_server_request_add(svc, req); + if (rc) + GOTO(err_req, rc); cfs_waitq_signal(&svc->srv_waitq); - spin_unlock(&svc->srv_lock); RETURN(1); err_req: @@ -1047,13 +1229,14 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, struct timeval work_start; struct timeval work_end; long timediff; - int rc; + int opc, rc; + int fail_opc = 0; ENTRY; LASSERT(svc); spin_lock(&svc->srv_lock); - if (unlikely(list_empty (&svc->srv_request_queue) || + if (unlikely(!ptlrpc_server_request_pending(svc, 0) || ( #ifndef __KERNEL__ /* !@%$# liblustre only has 1 thread */ @@ -1066,16 +1249,47 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, * That means we always need at least 2 service threads. 
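ptlrpc_server_request_get above prefers the high-priority queue, but after every srv_hpreq_ratio consecutive high-priority requests it lets one normal request through, so neither queue can starve the other. The same selection policy reduced to a standalone sketch (simple counters instead of the kernel list heads; HP_RATIO stands in for PTLRPC_SVC_HP_RATIO):

#include <stdio.h>

#define HP_RATIO 2   /* stand-in for PTLRPC_SVC_HP_RATIO */

struct queue { int len; };

/* Pick which queue to serve next: high-priority first, but after
 * HP_RATIO consecutive HP picks let one normal request through. */
static const char *pick_queue(struct queue *normal, struct queue *hp,
                              int *hp_count)
{
        if (normal->len > 0 &&
            (hp->len == 0 || *hp_count >= HP_RATIO)) {
                *hp_count = 0;
                normal->len--;
                return "normal";
        }
        if (hp->len > 0) {
                (*hp_count)++;
                hp->len--;
                return "high-priority";
        }
        return "idle";
}

int main(void)
{
        struct queue normal = { .len = 3 }, hp = { .len = 6 };
        int hp_count = 0;

        /* Prints hp, hp, normal, hp, hp, normal, ... */
        for (int i = 0; i < 9; i++)
                printf("%d: %s\n", i, pick_queue(&normal, &hp, &hp_count));
        return 0;
}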
*/ spin_unlock(&svc->srv_lock); RETURN(0); + } + + request = ptlrpc_server_request_get(svc, 0); + if (request == NULL) { + spin_unlock(&svc->srv_lock); + RETURN(0); } - request = list_entry (svc->srv_request_queue.next, - struct ptlrpc_request, rq_list); - list_del_init (&request->rq_list); + opc = lustre_msg_get_opc(request->rq_reqmsg); + if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT; + else if (OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_HPREQ_TIMEOUT)) + fail_opc = OBD_FAIL_PTLRPC_HPREQ_TIMEOUT; + + if (unlikely(fail_opc)) { + if (request->rq_export && request->rq_ops) { + spin_unlock(&svc->srv_lock); + OBD_FAIL_TIMEOUT(fail_opc, 4); + spin_lock(&svc->srv_lock); + request = ptlrpc_server_request_get(svc, 0); + if (request == NULL) { + spin_unlock(&svc->srv_lock); + RETURN(0); + } + LASSERT(ptlrpc_server_request_pending(svc, 0)); + } + } + + list_del_init(&request->rq_list); svc->srv_n_queued_reqs--; svc->srv_n_active_reqs++; + if (request->rq_hp) + svc->srv_n_hpreq++; + /* The phase is changed under the lock here because we need to know + * the request is under processing (see ptlrpc_hpreq_reorder()). */ + ptlrpc_rqphase_move(request, RQ_PHASE_INTERPRET); spin_unlock(&svc->srv_lock); + ptlrpc_hpreq_fini(request); + if(OBD_FAIL_CHECK(OBD_FAIL_PTLRPC_DUMP_LOG)) libcfs_debug_dumplog(); @@ -1092,12 +1306,14 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, at_get(&svc->srv_at_estimate)); } - rc = lu_context_init(&request->rq_session, LCT_SESSION); + rc = lu_context_init(&request->rq_session, + LCT_SESSION|LCT_REMEMBER|LCT_NOREF); if (rc) { CERROR("Failure to initialize session: %d\n", rc); goto out_req; } request->rq_session.lc_thread = thread; + request->rq_session.lc_cookie = 0x5; lu_context_enter(&request->rq_session); CDEBUG(D_NET, "got req "LPU64"\n", request->rq_xid); @@ -1106,9 +1322,6 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, if (thread) request->rq_svc_thread->t_env->le_ses = &request->rq_session; - request->rq_export = class_conn2export( - lustre_msg_get_handle(request->rq_reqmsg)); - if (likely(request->rq_export)) { if (unlikely(ptlrpc_check_req(request))) goto put_conn; @@ -1129,8 +1342,6 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, goto put_rpc_export; } - request->rq_phase = RQ_PHASE_INTERPRET; - CDEBUG(D_RPCTRACE, "Handling RPC pname:cluuid+ref:pid:xid:nid:opc " "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), (request->rq_export ? 
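The OBD_FAIL_CHECK/OBD_FAIL_TIMEOUT pair used above is a test hook: the harness arms a global fail location, and a thread reaching the matching check stalls, so races such as the HP reordering can be provoked deterministically. Roughly how such a hook works, as a self-contained sketch (the names and the 0x508 value are illustrative, not the real libcfs definitions):

#include <stdio.h>
#include <unistd.h>

/* Global "fail location" set by the test harness; 0 means disabled. */
static unsigned long fail_loc;

static int fail_check(unsigned long id)
{
        return fail_loc == id;
}

/* Stall at the injection point only when the matching fail_loc is armed. */
static void fail_timeout(unsigned long id, int secs)
{
        if (fail_check(id)) {
                printf("fail_loc 0x%lx hit, sleeping %ds\n", id, secs);
                sleep(secs);
        }
}

#define FAIL_HPREQ_NOTIMEOUT 0x508   /* illustrative value */

int main(void)
{
        fail_timeout(FAIL_HPREQ_NOTIMEOUT, 1);  /* no-op: not armed */
        fail_loc = FAIL_HPREQ_NOTIMEOUT;        /* harness arms it */
        fail_timeout(FAIL_HPREQ_NOTIMEOUT, 1);  /* now stalls here */
        return 0;
}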
@@ -1145,7 +1356,7 @@ ptlrpc_server_handle_request(struct ptlrpc_service *svc, rc = svc->srv_handler(request); - request->rq_phase = RQ_PHASE_COMPLETE; + ptlrpc_rqphase_move(request, RQ_PHASE_COMPLETE); CDEBUG(D_RPCTRACE, "Handled RPC pname:cluuid+ref:pid:xid:nid:opc " "%s:%s+%d:%d:x"LPU64":%s:%d\n", cfs_curproc_comm(), @@ -1161,9 +1372,6 @@ put_rpc_export: if (export != NULL) class_export_rpc_put(export); put_conn: - if (likely(request->rq_export != NULL)) - class_export_put(request->rq_export); - lu_context_exit(&request->rq_session); lu_context_fini(&request->rq_session); @@ -1208,6 +1416,10 @@ put_conn: } out_req: + spin_lock(&svc->srv_lock); + if (request->rq_hp) + svc->srv_n_hpreq--; + spin_unlock(&svc->srv_lock); ptlrpc_server_finish_request(request); RETURN(1); @@ -1293,6 +1505,11 @@ ptlrpc_server_handle_reply (struct ptlrpc_service *svc) if (!rs->rs_on_net) { /* Off the net */ svc->srv_n_difficult_replies--; + if (svc->srv_n_difficult_replies == 0 && svc->srv_is_stopping) + /* wake up threads that are being stopped by + ptlrpc_unregister_service/ptlrpc_stop_threads + and sleep waiting svr_n_difficult_replies == 0 */ + cfs_waitq_broadcast(&svc->srv_waitq); spin_unlock(&svc->srv_lock); class_export_put (exp); @@ -1401,7 +1618,6 @@ static int ptlrpc_main(void *arg) struct ptlrpc_thread *thread = data->thread; struct obd_device *dev = data->dev; struct ptlrpc_reply_state *rs; - struct lc_watchdog *watchdog; #ifdef WITH_GROUP_INFO struct group_info *ginfo = NULL; #endif @@ -1445,12 +1661,14 @@ static int ptlrpc_main(void *arg) goto out; } - rc = lu_context_init(&env.le_ctx, svc->srv_ctx_tags); + rc = lu_context_init(&env.le_ctx, + svc->srv_ctx_tags|LCT_REMEMBER|LCT_NOREF); if (rc) goto out_srv_fini; thread->t_env = &env; env.le_ctx.lc_thread = thread; + env.le_ctx.lc_cookie = 0x6; /* Alloc reply state structure for this one */ OBD_ALLOC_GFP(rs, svc->srv_max_reply_size, CFS_ALLOC_STD); @@ -1467,9 +1685,10 @@ static int ptlrpc_main(void *arg) */ cfs_waitq_signal(&thread->t_ctl_waitq); - watchdog = lc_watchdog_add(max_t(int, obd_timeout, AT_OFF ? 0 : - at_get(&svc->srv_at_estimate)) * - svc->srv_watchdog_factor, NULL, NULL); + thread->t_watchdog = lc_watchdog_add(max_t(int, obd_timeout, AT_OFF ? 0 : + at_get(&svc->srv_at_estimate)) + * svc->srv_watchdog_factor, + NULL, NULL); spin_lock(&svc->srv_lock); svc->srv_threads_running++; @@ -1477,8 +1696,8 @@ static int ptlrpc_main(void *arg) spin_unlock(&svc->srv_lock); cfs_waitq_signal(&svc->srv_free_rs_waitq); - CDEBUG(D_NET, "service thread %d (#%d)started\n", thread->t_id, - svc->srv_threads_running); + CDEBUG(D_NET, "service thread %d (#%d) started\n", thread->t_id, + svc->srv_threads_running); /* XXX maintain a list of all managed devices: insert here */ @@ -1488,7 +1707,7 @@ static int ptlrpc_main(void *arg) struct l_wait_info lwi = LWI_TIMEOUT(svc->srv_rqbd_timeout, ptlrpc_retry_rqbds, svc); - lc_watchdog_disable(watchdog); + lc_watchdog_disable(thread->t_watchdog); cond_resched(); @@ -1499,13 +1718,13 @@ static int ptlrpc_main(void *arg) svc->srv_rqbd_timeout == 0) || !list_empty(&svc->srv_req_in_queue) || !list_empty(&svc->srv_reply_queue) || - (!list_empty(&svc->srv_request_queue) && + (ptlrpc_server_request_pending(svc, 0) && (svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) || svc->srv_at_check, &lwi); - lc_watchdog_touch_ms(watchdog, max_t(int, obd_timeout, + lc_watchdog_touch_ms(thread->t_watchdog, max_t(int, obd_timeout, AT_OFF ? 
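The new wakeup when srv_n_difficult_replies reaches zero exists because ptlrpc_unregister_service and ptlrpc_stop_threads now sleep until every difficult reply has left the net. The same drain-then-broadcast pattern in portable pthreads (a sketch; the counter and flag are stand-ins for the service fields):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t  waitq = PTHREAD_COND_INITIALIZER;
static int n_difficult_replies = 3;
static int is_stopping = 1;

/* Called as each in-flight reply completes. */
static void *reply_done(void *arg)
{
        pthread_mutex_lock(&lock);
        if (--n_difficult_replies == 0 && is_stopping)
                pthread_cond_broadcast(&waitq);  /* wake the stopper */
        pthread_mutex_unlock(&lock);
        return NULL;
}

int main(void)
{
        pthread_t t[3];

        for (int i = 0; i < 3; i++)
                pthread_create(&t[i], NULL, reply_done, NULL);

        pthread_mutex_lock(&lock);
        while (n_difficult_replies != 0)        /* stopper drains */
                pthread_cond_wait(&waitq, &lock);
        pthread_mutex_unlock(&lock);

        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        printf("all difficult replies drained, safe to stop\n");
        return 0;
}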
0 : at_get(&svc->srv_at_estimate)) * svc->srv_watchdog_factor); @@ -1534,7 +1753,7 @@ static int ptlrpc_main(void *arg) ptlrpc_at_check_timed(svc); /* don't handle requests in the last thread */ - if (!list_empty (&svc->srv_request_queue) && + if (ptlrpc_server_request_pending(svc, 0) && (svc->srv_n_active_reqs < (svc->srv_threads_running - 1))) { lu_context_enter(&env.le_ctx); ptlrpc_server_handle_request(svc, thread); @@ -1552,7 +1771,8 @@ static int ptlrpc_main(void *arg) } } - lc_watchdog_delete(watchdog); + lc_watchdog_delete(thread->t_watchdog); + thread->t_watchdog = NULL; out_srv_fini: /* @@ -1580,7 +1800,9 @@ static void ptlrpc_stop_thread(struct ptlrpc_service *svc, struct ptlrpc_thread *thread) { struct l_wait_info lwi = { 0 }; + ENTRY; + CDEBUG(D_RPCTRACE, "Stopping thread %p\n", thread); spin_lock(&svc->srv_lock); thread->t_flags = SVC_STOPPING; spin_unlock(&svc->srv_lock); @@ -1594,11 +1816,13 @@ static void ptlrpc_stop_thread(struct ptlrpc_service *svc, spin_unlock(&svc->srv_lock); OBD_FREE_PTR(thread); + EXIT; } void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) { struct ptlrpc_thread *thread; + ENTRY; spin_lock(&svc->srv_lock); while (!list_empty(&svc->srv_threads)) { @@ -1611,6 +1835,7 @@ void ptlrpc_stop_all_threads(struct ptlrpc_service *svc) } spin_unlock(&svc->srv_lock); + EXIT; } int ptlrpc_start_threads(struct obd_device *dev, struct ptlrpc_service *svc) @@ -1705,7 +1930,9 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) struct l_wait_info lwi; struct list_head *tmp; struct ptlrpc_reply_state *rs, *t; + ENTRY; + service->srv_is_stopping = 1; cfs_timer_disarm(&service->srv_at_timer); ptlrpc_stop_all_threads(service); @@ -1749,7 +1976,8 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) /* Network access will complete in finite time but the HUGE * timeout lets us CWARN for visibility of sluggish NALs */ - lwi = LWI_TIMEOUT(cfs_time_seconds(LONG_UNLINK), NULL, NULL); + lwi = LWI_TIMEOUT_INTERVAL(cfs_time_seconds(LONG_UNLINK), + cfs_time_seconds(1), NULL, NULL); rc = l_wait_event(service->srv_waitq, service->srv_nrqbd_receiving == 0, &lwi); @@ -1782,16 +2010,14 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) service->srv_n_active_reqs++; ptlrpc_server_finish_request(req); } - while (!list_empty(&service->srv_request_queue)) { - struct ptlrpc_request *req = - list_entry(service->srv_request_queue.next, - struct ptlrpc_request, - rq_list); + while (ptlrpc_server_request_pending(service, 1)) { + struct ptlrpc_request *req; + req = ptlrpc_server_request_get(service, 1); list_del(&req->rq_list); service->srv_n_queued_reqs--; service->srv_n_active_reqs++; - + ptlrpc_hpreq_fini(req); ptlrpc_server_finish_request(req); } LASSERT(service->srv_n_queued_reqs == 0); @@ -1835,7 +2061,7 @@ int ptlrpc_unregister_service(struct ptlrpc_service *service) cfs_timer_disarm(&service->srv_at_timer); OBD_FREE_PTR(service); - return 0; + RETURN(0); } /* Returns 0 if the service is healthy. @@ -1855,14 +2081,18 @@ int ptlrpc_service_health_check(struct ptlrpc_service *svc) do_gettimeofday(&right_now); spin_lock(&svc->srv_lock); - if (list_empty(&svc->srv_request_queue)) { + if (!ptlrpc_server_request_pending(svc, 1)) { spin_unlock(&svc->srv_lock); return 0; } /* How long has the next entry been waiting? 
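Moving the watchdog into thread->t_watchdog makes the idle/busy discipline explicit: the thread disarms it before blocking for work and re-arms it with the per-request time budget before handling. A toy version of that loop (the watchdog struct is illustrative, not the lc_watchdog API):

#include <stdio.h>

struct watchdog { int armed; int timeout; };

static void wd_disable(struct watchdog *wd)      { wd->armed = 0; }
static void wd_touch(struct watchdog *wd, int t) { wd->armed = 1; wd->timeout = t; }

static int next_request(int i) { return i < 3; }  /* fake work source */

int main(void)
{
        struct watchdog wd = { 0 };

        for (int i = 0; ; i++) {
                wd_disable(&wd);          /* idle waits must not trip it */
                /* ... block until work or a stop signal arrives ... */
                if (!next_request(i))
                        break;
                wd_touch(&wd, 30);        /* time budget for this request */
                printf("handling request %d under a %ds watchdog\n",
                       i, wd.timeout);
        }
        wd_disable(&wd);
        return 0;
}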
*/ - request = list_entry(svc->srv_request_queue.next, - struct ptlrpc_request, rq_list); + if (list_empty(&svc->srv_request_queue)) + request = list_entry(svc->srv_request_hpq.next, + struct ptlrpc_request, rq_list); + else + request = list_entry(svc->srv_request_queue.next, + struct ptlrpc_request, rq_list); timediff = cfs_timeval_sub(&right_now, &request->rq_arrival_time, NULL); spin_unlock(&svc->srv_lock); diff --git a/lustre/ptlrpc/wiretest.c b/lustre/ptlrpc/wiretest.c index 0df9d23..d35ec0d 100644 --- a/lustre/ptlrpc/wiretest.c +++ b/lustre/ptlrpc/wiretest.c @@ -66,8 +66,8 @@ void lustre_assert_wire_constants(void) { /* Wire protocol assertions generated by 'wirecheck' * (make -C lustre/utils newwiretest) - * running on Linux xlab.hostel 2.6.23.15-80.fc7 #1 SMP Sun Feb 10 17:29:10 EST 2008 i686 i68 - * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-7) */ + * running on Linux vb1 2.6.18-build.1 #1 SMP Thu Mar 27 14:34:21 MDT 2008 i686 i686 i386 GNU + * with gcc version 4.1.2 20070626 (Red Hat 4.1.2-14) */ /* Constants... */ @@ -129,7 +129,9 @@ void lustre_assert_wire_constants(void) (long long)OST_QUOTACHECK); LASSERTF(OST_QUOTACTL == 19, " found %lld\n", (long long)OST_QUOTACTL); - LASSERTF(OST_LAST_OPC == 20, " found %lld\n", + LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, " found %lld\n", + (long long)OST_QUOTA_ADJUST_QUNIT); + LASSERTF(OST_LAST_OPC == 21, " found %lld\n", (long long)OST_LAST_OPC); LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL," found %lld\n", (long long)OBD_OBJECT_EOF); @@ -237,9 +239,9 @@ void lustre_assert_wire_constants(void) (long long)LCK_NL); LASSERTF(LCK_GROUP == 64, " found %lld\n", (long long)LCK_GROUP); - LASSERTF(LCK_MAXMODE == 65, " found %lld\n", + LASSERTF(LCK_MAXMODE == 129, " found %lld\n", (long long)LCK_MAXMODE); - LASSERTF(LCK_MODE_NUM == 7, " found %lld\n", + LASSERTF(LCK_MODE_NUM == 8, " found %lld\n", (long long)LCK_MODE_NUM); CLASSERT(LDLM_PLAIN == 10); CLASSERT(LDLM_EXTENT == 11); @@ -253,9 +255,9 @@ void lustre_assert_wire_constants(void) (long long)OBD_QC_CALLBACK); LASSERTF(OBD_LAST_OPC == 403, " found %lld\n", (long long)OBD_LAST_OPC); - LASSERTF(QUOTA_DQACQ == 601, " found %lld\n", + LASSERTF(QUOTA_DQACQ == 901, " found %lld\n", (long long)QUOTA_DQACQ); - LASSERTF(QUOTA_DQREL == 602, " found %lld\n", + LASSERTF(QUOTA_DQREL == 902, " found %lld\n", (long long)QUOTA_DQREL); LASSERTF(MGS_CONNECT == 250, " found %lld\n", (long long)MGS_CONNECT); @@ -461,8 +463,8 @@ void lustre_assert_wire_constants(void) CLASSERT(OBD_CONNECT_JOIN == 0x00002000ULL); CLASSERT(OBD_CONNECT_ATTRFID == 0x00004000ULL); CLASSERT(OBD_CONNECT_NODEVOH == 0x00008000ULL); - CLASSERT(OBD_CONNECT_LCL_CLIENT == 0x00010000ULL); - CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00020000ULL); + CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL); + CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL); CLASSERT(OBD_CONNECT_BRW_SIZE == 0x00040000ULL); CLASSERT(OBD_CONNECT_QUOTA64 == 0x00080000ULL); CLASSERT(OBD_CONNECT_MDS_CAPA == 0x00100000ULL); @@ -700,6 +702,67 @@ void lustre_assert_wire_constants(void) LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n", (long long)LOV_PATTERN_RAID1); + /* Checks for struct lov_mds_md_v3 */ + LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, " found %lld\n", + (long long)(int)sizeof(struct lov_mds_md_v3)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, " found %lld\n", + (long 
long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_id) == 8, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_id)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id) == 8, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_gr) == 16, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_gr)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr) == 8, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name) == 32, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name) == 16, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name)); + LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects) == 48, " found %lld\n", + (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects)); + LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects) == 0, " found %lld\n", + (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects)); + + /* Checks for struct lov_ost_data_v1 */ + LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, " found %lld\n", + (long long)(int)sizeof(struct lov_ost_data_v1)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_id) == 0, " found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_object_id)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id) == 8, " found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_gr) == 8, " found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_object_gr)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr) == 8, " found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, " found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen)); + LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, " found %lld\n", + (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx)); + LASSERTF((int)sizeof(((struct lov_ost_data_v1 
*)0)->l_ost_idx) == 4, " found %lld\n", + (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx)); + CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0); + LASSERTF(LOV_PATTERN_RAID0 == 1, " found %lld\n", + (long long)LOV_PATTERN_RAID0); + LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n", + (long long)LOV_PATTERN_RAID1); + /* Checks for struct lov_mds_md_join */ LASSERTF((int)sizeof(struct lov_mds_md_join) == 56, " found %lld\n", (long long)(int)sizeof(struct lov_mds_md_join)); @@ -1584,6 +1647,38 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n", (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks)); + /* Checks for struct cfg_marker */ + LASSERTF((int)sizeof(struct cfg_marker) == 160, " found %lld\n", + (long long)(int)sizeof(struct cfg_marker)); + LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_step)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step)); + LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_flags)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags)); + LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_vers)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers)); + LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_createtime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_canceltime)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime)); + LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_tgtname)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname)); + LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, " found %lld\n", + (long long)(int)offsetof(struct cfg_marker, cm_comment)); + LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, " found %lld\n", + (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment)); + /* Checks for struct llog_logid */ LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n", (long long)(int)sizeof(struct llog_logid)); @@ -1802,6 +1897,46 @@ void lustre_assert_wire_constants(void) LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n", (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail)); + /* Checks for struct llog_setattr64_rec */ + LASSERTF((int)sizeof(struct llog_setattr64_rec) == 56, " found %lld\n", + (long long)(int)sizeof(struct llog_setattr64_rec)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr)); + 
LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oid) == 16, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid) == 8, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_ogen) == 24, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_ogen)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogen) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogen)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, padding) == 28, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, padding)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->padding) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->padding)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h)); + LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 48, " found %lld\n", + (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail)); + LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, " found %lld\n", + (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail)); + /* Checks for struct llog_size_change_rec */ LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n", (long long)(int)sizeof(struct llog_size_change_rec)); @@ -2012,7 +2147,7 @@ void lustre_assert_wire_constants(void) (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_lmm)); /* Checks for struct qunit_data */ - LASSERTF((int)sizeof(struct qunit_data) == 16, " found %lld\n", + LASSERTF((int)sizeof(struct qunit_data) == 32, " found %lld\n", (long long)(int)sizeof(struct qunit_data)); LASSERTF((int)offsetof(struct qunit_data, qd_id) == 0, " found %lld\n", (long long)(int)offsetof(struct qunit_data, qd_id)); @@ -2026,26 +2161,38 @@ void lustre_assert_wire_constants(void) (long long)(int)offsetof(struct qunit_data, qd_count)); LASSERTF((int)sizeof(((struct qunit_data *)0)->qd_count) == 8, " found %lld\n", (long long)(int)sizeof(((struct qunit_data *)0)->qd_count)); - 
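Each LASSERTF in wiretest pins the on-wire offset and size of one field, so a layout change such as qunit_data growing from 16 to 32 bytes here fails loudly instead of silently corrupting messages between versions. The same idea in standalone C11 using _Static_assert, on an illustrative struct that mirrors the new qunit_data layout (field names copied from the asserts; the struct itself is a demo, not the Lustre header):

#include <stddef.h>
#include <stdint.h>

/* Illustrative wire structure mirroring the 32-byte qunit_data checks. */
struct qunit_data_demo {
        uint32_t qd_id;
        uint32_t qd_flags;
        uint64_t qd_count;
        uint64_t qd_qunit;
        uint64_t padding;
};

/* Compile-time pinning of the wire format, like the LASSERTF checks. */
_Static_assert(sizeof(struct qunit_data_demo) == 32, "wire size changed");
_Static_assert(offsetof(struct qunit_data_demo, qd_count) == 8,
               "qd_count moved");
_Static_assert(offsetof(struct qunit_data_demo, qd_qunit) == 16,
               "qd_qunit moved");

int main(void) { return 0; }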
- /* Checks for struct qunit_data_old */ - LASSERTF((int)sizeof(struct qunit_data_old) == 16, " found %lld\n", - (long long)(int)sizeof(struct qunit_data_old)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_id) == 0, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_id)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_id) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_id)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_type) == 4, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_type)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_type) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_type)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_count) == 8, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_count)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_count) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_count)); - LASSERTF((int)offsetof(struct qunit_data_old, qd_isblk) == 12, " found %lld\n", - (long long)(int)offsetof(struct qunit_data_old, qd_isblk)); - LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_isblk) == 4, " found %lld\n", - (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_isblk)); + LASSERTF((int)offsetof(struct qunit_data, qd_qunit) == 16, " found %lld\n", + (long long)(int)offsetof(struct qunit_data, qd_qunit)); + LASSERTF((int)sizeof(((struct qunit_data *)0)->qd_qunit) == 8, " found %lld\n", + (long long)(int)sizeof(((struct qunit_data *)0)->qd_qunit)); + LASSERTF((int)offsetof(struct qunit_data, padding) == 24, " found %lld\n", + (long long)(int)offsetof(struct qunit_data, padding)); + LASSERTF((int)sizeof(((struct qunit_data *)0)->padding) == 8, " found %lld\n", + (long long)(int)sizeof(((struct qunit_data *)0)->padding)); + + /* Checks for struct quota_adjust_qunit */ + LASSERTF((int)sizeof(struct quota_adjust_qunit) == 32, " found %lld\n", + (long long)(int)sizeof(struct quota_adjust_qunit)); + LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_flags) == 0, " found %lld\n", + (long long)(int)offsetof(struct quota_adjust_qunit, qaq_flags)); + LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_flags) == 4, " found %lld\n", + (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_flags)); + LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_id) == 4, " found %lld\n", + (long long)(int)offsetof(struct quota_adjust_qunit, qaq_id)); + LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_id) == 4, " found %lld\n", + (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_id)); + LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_bunit_sz) == 8, " found %lld\n", + (long long)(int)offsetof(struct quota_adjust_qunit, qaq_bunit_sz)); + LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_bunit_sz) == 8, " found %lld\n", + (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_bunit_sz)); + LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_iunit_sz) == 16, " found %lld\n", + (long long)(int)offsetof(struct quota_adjust_qunit, qaq_iunit_sz)); + LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_iunit_sz) == 8, " found %lld\n", + (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_iunit_sz)); + LASSERTF((int)offsetof(struct quota_adjust_qunit, padding1) == 24, " found %lld\n", + (long long)(int)offsetof(struct quota_adjust_qunit, padding1)); + LASSERTF((int)sizeof(((struct quota_adjust_qunit 
*)0)->padding1) == 8, " found %lld\n", + (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->padding1)); /* Checks for struct mgs_target_info */ LASSERTF((int)sizeof(struct mgs_target_info) == 4544, " found %lld\n", diff --git a/lustre/quota/Makefile.in b/lustre/quota/Makefile.in index e42dff1..f052b42 100644 --- a/lustre/quota/Makefile.in +++ b/lustre/quota/Makefile.in @@ -1,7 +1,7 @@ MODULES := lquota lquota-objs := quota_check.o quota_context.o quota_ctl.o quota_interface.o -lquota-objs += quota_master.o +lquota-objs += quota_master.o quota_adjust_qunit.o lproc_quota.o @INCLUDE_RULES@ diff --git a/lustre/quota/autoMakefile.am b/lustre/quota/autoMakefile.am index e070f4c..9a20d28 100644 --- a/lustre/quota/autoMakefile.am +++ b/lustre/quota/autoMakefile.am @@ -36,12 +36,12 @@ if LIBLUSTRE noinst_LIBRARIES = libquota.a -libquota_a_SOURCES = quota_check.c quota_ctl.c quota_interface.c +libquota_a_SOURCES = quota_check.c quota_ctl.c quota_interface.c quota_adjust_qunit.c libquota_a_CPPFLAGS = $(LLCPPFLAGS) libquota_a_CFLAGS = $(LLCFLAGS) endif -if MODULES +if QUOTA modulefs_DATA = lquota$(KMODEXT) endif diff --git a/lustre/quota/lproc_quota.c b/lustre/quota/lproc_quota.c new file mode 100644 index 0000000..a76807a --- /dev/null +++ b/lustre/quota/lproc_quota.c @@ -0,0 +1,667 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. 
+ */ + +#define DEBUG_SUBSYSTEM S_LQUOTA + +#include +#include +#include +#include +#include + +#include "quota_internal.h" + +#ifdef HAVE_QUOTA_SUPPORT + +#ifdef LPROCFS +int lprocfs_quota_rd_bunit(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_bunit_sz); +} +EXPORT_SYMBOL(lprocfs_quota_rd_bunit); + +int lprocfs_quota_wr_bunit(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val % QUOTABLOCK_SIZE || + val <= obd->u.obt.obt_qctxt.lqc_btune_sz) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_bunit_sz = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_bunit); + +int lprocfs_quota_rd_btune(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_btune_sz); +} +EXPORT_SYMBOL(lprocfs_quota_rd_btune); + +int lprocfs_quota_wr_btune(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= QUOTABLOCK_SIZE * MIN_QLIMIT || val % QUOTABLOCK_SIZE || + val >= obd->u.obt.obt_qctxt.lqc_bunit_sz) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_btune_sz = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_btune); + +int lprocfs_quota_rd_iunit(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_iunit_sz); +} +EXPORT_SYMBOL(lprocfs_quota_rd_iunit); + +int lprocfs_quota_wr_iunit(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= obd->u.obt.obt_qctxt.lqc_itune_sz) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_iunit_sz = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_iunit); + +int lprocfs_quota_rd_itune(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_itune_sz); +} +EXPORT_SYMBOL(lprocfs_quota_rd_itune); + +int lprocfs_quota_wr_itune(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= MIN_QLIMIT || + val >= obd->u.obt.obt_qctxt.lqc_iunit_sz) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_itune_sz = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_itune); + +#define USER_QUOTA 1 +#define GROUP_QUOTA 2 + +#define MAX_STYPE_SIZE 5 + +int lprocfs_quota_rd_type(char *page, char **start, off_t off, int count, + int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + char 
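Every tunable in this file follows one pattern: a read handler that prints the current value, and a write handler that parses the input and range-checks it against its partner (btune must stay below bunit, and bunit must remain a multiple of the quota block size) before storing. The shape of that pair with the lprocfs plumbing replaced by stdio (the block size and all names are stand-ins):

#include <stdio.h>
#include <stdlib.h>

#define QUOTABLOCK_SIZE 1024          /* illustrative */

static unsigned long bunit_sz = 128 * 1024 * 1024;
static unsigned long btune_sz =  64 * 1024 * 1024;

static int rd_bunit(char *page, int count)
{
        return snprintf(page, count, "%lu\n", bunit_sz);
}

static int wr_bunit(const char *buffer)
{
        long val = strtol(buffer, NULL, 10);

        /* Keep the invariant btune < bunit and block-size alignment. */
        if (val % QUOTABLOCK_SIZE || val <= (long)btune_sz)
                return -1;      /* -EINVAL in the kernel version */

        bunit_sz = val;
        return 0;
}

int main(void)
{
        char page[64];

        rd_bunit(page, sizeof(page));
        printf("before: %s", page);
        if (wr_bunit("268435456") == 0) {   /* 256M: aligned, > btune */
                rd_bunit(page, sizeof(page));
                printf("after:  %s", page);
        }
        return 0;
}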
stype[MAX_STYPE_SIZE + 1] = ""; + int oq_type; + struct obd_device_target *obt; + + LASSERT(obd != NULL); + + obt = &obd->u.obt; + + /* Collect the needed information */ + oq_type = obd->u.obt.obt_qctxt.lqc_flags; + + /* Transform the collected data into a user-readable string */ + if (oq_type & LQC_USRQUOTA_FLAG) + strcat(stype, "u"); + if (oq_type & LQC_GRPQUOTA_FLAG) + strcat(stype, "g"); + + strcat(stype, "3"); + + return snprintf(page, count, "%s\n", stype); +} +EXPORT_SYMBOL(lprocfs_quota_rd_type); + +static int auto_quota_on(struct obd_device *obd, int type, + struct super_block *sb, int is_master) +{ + struct obd_quotactl *oqctl; + struct lvfs_run_ctxt saved; + int rc = 0, id; + struct obd_device_target *obt; + ENTRY; + + LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA); + + obt = &obd->u.obt; + + OBD_ALLOC_PTR(oqctl); + if (!oqctl) + RETURN(-ENOMEM); + + if (!atomic_dec_and_test(&obt->obt_quotachecking)) { + CDEBUG(D_INFO, "other people are doing quotacheck\n"); + atomic_inc(&obt->obt_quotachecking); + RETURN(-EBUSY); + } + + id = UGQUOTA2LQC(type); + /* quota already turned on */ + if ((obt->obt_qctxt.lqc_flags & id) == id) { + rc = 0; + goto out; + } + + oqctl->qc_type = type; + oqctl->qc_cmd = Q_QUOTAON; + oqctl->qc_id = obt->obt_qfmt; + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (is_master) { + struct mds_obd *mds = &obd->u.mds; + + down(&mds->mds_qonoff_sem); + /* turn on cluster wide quota */ + rc = mds_admin_quota_on(obd, oqctl); + if (rc) + CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, + "auto-enable admin quota failed. rc=%d\n", rc); + up(&mds->mds_qonoff_sem); + + } + if (!rc) { + /* turn on local quota */ + rc = fsfilt_quotactl(obd, sb, oqctl); + if (rc) + CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, + "auto-enable local quota failed. 
rc=%d\n", rc); + else + obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(type); + } + + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + +out: + atomic_inc(&obt->obt_quotachecking); + + OBD_FREE_PTR(oqctl); + RETURN(rc); +} + +int lprocfs_quota_wr_type(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + struct obd_device_target *obt; + int type = 0, is_mds; + unsigned long i; + char stype[MAX_STYPE_SIZE + 1] = ""; + + LASSERT(obd != NULL); + + obt = &obd->u.obt; + + is_mds = !strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME); + + if (count > MAX_STYPE_SIZE) + return -EINVAL; + + if (copy_from_user(stype, buffer, count)) + return -EFAULT; + + for (i = 0 ; i < count ; i++) { + switch (stype[i]) { + case 'u' : + type |= USER_QUOTA; + break; + case 'g' : + type |= GROUP_QUOTA; + break; + case '1' : + case '2' : + CWARN("quota_type options 1 and 2 are obsolete, " + "they will be ignored\n"); + break; + case '3' : /* the only valid version spec, do nothing */ + default : /* just skip stray symbols like \n */ + break; + } + } + + if (type != 0) + auto_quota_on(obd, type - 1, obt->obt_sb, is_mds); + + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_type); + +int lprocfs_quota_rd_switch_seconds(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%d\n", + obd->u.obt.obt_qctxt.lqc_switch_seconds); +} +EXPORT_SYMBOL(lprocfs_quota_rd_switch_seconds); + +int lprocfs_quota_wr_switch_seconds(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val <= 10) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_switch_seconds = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_switch_seconds); + +int lprocfs_quota_rd_sync_blk(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "%d\n", + obd->u.obt.obt_qctxt.lqc_sync_blk); +} +EXPORT_SYMBOL(lprocfs_quota_rd_sync_blk); + +int lprocfs_quota_wr_sync_blk(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 0) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_sync_blk = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_sync_blk); + +int lprocfs_quota_rd_switch_qs(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + return snprintf(page, count, "changing qunit size is %s\n", + obd->u.obt.obt_qctxt.lqc_switch_qs ? 
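lprocfs_quota_wr_type accepts a short string such as "u", "g" or "ug", optionally followed by an on-disk format version; the old "1"/"2" version digits are ignored with a warning and "3" is the only valid spec. A standalone sketch of that parser (the flag values are illustrative):

#include <stdio.h>

#define USER_QUOTA  1
#define GROUP_QUOTA 2

static int parse_quota_type(const char *stype)
{
        int type = 0;

        for (const char *p = stype; *p; p++) {
                switch (*p) {
                case 'u': type |= USER_QUOTA;  break;
                case 'g': type |= GROUP_QUOTA; break;
                case '1':
                case '2':
                        fprintf(stderr, "quota_type versions 1/2 are "
                                "obsolete and ignored\n");
                        break;
                case '3':       /* the only valid version spec */
                default:        /* skip stray characters like '\n' */
                        break;
                }
        }
        return type;
}

int main(void)
{
        printf("\"ug3\" -> %d\n", parse_quota_type("ug3"));  /* 3 */
        printf("\"u\"   -> %d\n", parse_quota_type("u"));    /* 1 */
        return 0;
}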
+ "enabled" : "disabled"); +} +EXPORT_SYMBOL(lprocfs_quota_rd_switch_qs); + +int lprocfs_quota_wr_switch_qs(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val) + obd->u.obt.obt_qctxt.lqc_switch_qs = 1; + else + obd->u.obt.obt_qctxt.lqc_switch_qs = 0; + + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_switch_qs); + +int lprocfs_quota_rd_boundary_factor(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_cqs_boundary_factor); +} +EXPORT_SYMBOL(lprocfs_quota_rd_boundary_factor); + +int lprocfs_quota_wr_boundary_factor(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 2) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_cqs_boundary_factor = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_boundary_factor); + +int lprocfs_quota_rd_least_bunit(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_cqs_least_bunit); +} +EXPORT_SYMBOL(lprocfs_quota_rd_least_bunit); + +int lprocfs_quota_wr_least_bunit(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < PTLRPC_MAX_BRW_SIZE || + val >= obd->u.obt.obt_qctxt.lqc_bunit_sz) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_cqs_least_bunit = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_least_bunit); + +int lprocfs_quota_rd_least_iunit(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_cqs_least_iunit); +} +EXPORT_SYMBOL(lprocfs_quota_rd_least_iunit); + +int lprocfs_quota_wr_least_iunit(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 1 || val >= obd->u.obt.obt_qctxt.lqc_iunit_sz) + return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_cqs_least_iunit = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_least_iunit); + +int lprocfs_quota_rd_qs_factor(char *page, char **start, off_t off, + int count, int *eof, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + LASSERT(obd != NULL); + + + return snprintf(page, count, "%lu\n", + obd->u.obt.obt_qctxt.lqc_cqs_qs_factor); +} +EXPORT_SYMBOL(lprocfs_quota_rd_qs_factor); + +int lprocfs_quota_wr_qs_factor(struct file *file, const char *buffer, + unsigned long count, void *data) +{ + struct obd_device *obd = (struct obd_device *)data; + int val, rc; + LASSERT(obd != NULL); + + rc = lprocfs_write_helper(buffer, count, &val); + if (rc) + return rc; + + if (val < 2) + 
return -EINVAL; + + obd->u.obt.obt_qctxt.lqc_cqs_qs_factor = val; + return count; +} +EXPORT_SYMBOL(lprocfs_quota_wr_qs_factor); + +struct lprocfs_vars lprocfs_quota_common_vars[] = { + { "quota_bunit_sz", lprocfs_quota_rd_bunit, + lprocfs_quota_wr_bunit, 0}, + { "quota_btune_sz", lprocfs_quota_rd_btune, + lprocfs_quota_wr_btune, 0}, + { "quota_iunit_sz", lprocfs_quota_rd_iunit, + lprocfs_quota_wr_iunit, 0}, + { "quota_itune_sz", lprocfs_quota_rd_itune, + lprocfs_quota_wr_itune, 0}, + { "quota_type", lprocfs_quota_rd_type, + lprocfs_quota_wr_type, 0}, + { "quota_switch_seconds", lprocfs_quota_rd_switch_seconds, + lprocfs_quota_wr_switch_seconds, 0 }, + { "quota_sync_blk", lprocfs_quota_rd_sync_blk, + lprocfs_quota_wr_sync_blk, 0}, +}; + +struct lprocfs_vars lprocfs_quota_master_vars[] = { + { "quota_switch_qs", lprocfs_quota_rd_switch_qs, + lprocfs_quota_wr_switch_qs, 0 }, + { "quota_boundary_factor", lprocfs_quota_rd_boundary_factor, + lprocfs_quota_wr_boundary_factor, 0 }, + { "quota_least_bunit", lprocfs_quota_rd_least_bunit, + lprocfs_quota_wr_least_bunit, 0 }, + { "quota_least_iunit", lprocfs_quota_rd_least_iunit, + lprocfs_quota_wr_least_iunit, 0 }, + { "quota_qs_factor", lprocfs_quota_rd_qs_factor, + lprocfs_quota_wr_qs_factor, 0 }, +}; + +int lquota_proc_setup(struct obd_device *obd, int is_master) +{ + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + int rc = 0; + ENTRY; + + LASSERT(lquota_type_proc_dir && obd); + qctxt->lqc_proc_dir = lprocfs_register(obd->obd_name, + lquota_type_proc_dir, + lprocfs_quota_common_vars, obd); + if (IS_ERR(qctxt->lqc_proc_dir)) { + rc = PTR_ERR(qctxt->lqc_proc_dir); + CERROR("error %d setting up lprocfs for %s\n", rc, + obd->obd_name); + qctxt->lqc_proc_dir = NULL; + GOTO(out, rc); + } + + if (is_master) { + rc = lprocfs_add_vars(qctxt->lqc_proc_dir, + lprocfs_quota_master_vars, obd); + if (rc) { + CERROR("error %d setting up lprocfs for %s" + "(quota master)\n", rc, obd->obd_name); + GOTO(out_free_proc, rc); + } + } + + qctxt->lqc_stats = lprocfs_alloc_stats(LQUOTA_LAST_STAT - + LQUOTA_FIRST_STAT, 0); + if (!qctxt->lqc_stats) + GOTO(out_free_proc, rc = -ENOMEM); + + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_SYNC_ACQ, + LPROCFS_CNTR_AVGMINMAX, "sync_acq_req", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_SYNC_REL, + LPROCFS_CNTR_AVGMINMAX, "sync_rel_req", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_ASYNC_ACQ, + LPROCFS_CNTR_AVGMINMAX, "async_acq_req", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_ASYNC_REL, + LPROCFS_CNTR_AVGMINMAX, "async_rel_req", "us"); + + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_CHK_BLK, + LPROCFS_CNTR_AVGMINMAX, + "wait_for_blk_quota(lquota_chkquota)", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_CHK_INO, + LPROCFS_CNTR_AVGMINMAX, + "wait_for_ino_quota(lquota_chkquota)", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_COMMIT_BLK, + LPROCFS_CNTR_AVGMINMAX, + "wait_for_blk_quota(lquota_pending_commit)", + "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_FOR_COMMIT_INO, + LPROCFS_CNTR_AVGMINMAX, + "wait_for_ino_quota(lquota_pending_commit)", + "us"); + + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_PENDING_BLK_QUOTA, + LPROCFS_CNTR_AVGMINMAX, + "wait_for_pending_blk_quota_req" + "(qctxt_wait_pending_dqacq)", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_WAIT_PENDING_INO_QUOTA, + LPROCFS_CNTR_AVGMINMAX, + "wait_for_pending_ino_quota_req" + "(qctxt_wait_pending_dqacq)", "us"); + 
lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_NOWAIT_PENDING_BLK_QUOTA, + LPROCFS_CNTR_AVGMINMAX, + "nowait_for_pending_blk_quota_req" + "(qctxt_wait_pending_dqacq)", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_NOWAIT_PENDING_INO_QUOTA, + LPROCFS_CNTR_AVGMINMAX, + "nowait_for_pending_ino_quota_req" + "(qctxt_wait_pending_dqacq)", "us"); + + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_QUOTA_CTL, + LPROCFS_CNTR_AVGMINMAX, "quota_ctl", "us"); + lprocfs_counter_init(qctxt->lqc_stats, LQUOTA_ADJUST_QUNIT, + LPROCFS_CNTR_AVGMINMAX, "adjust_qunit", "us"); + + lprocfs_register_stats(qctxt->lqc_proc_dir, "stats", qctxt->lqc_stats); + + RETURN(rc); + +out_free_proc: + lprocfs_remove(&qctxt->lqc_proc_dir); +out: + RETURN(rc); +} + +int lquota_proc_cleanup(struct lustre_quota_ctxt *qctxt) +{ + if (!qctxt || !qctxt->lqc_proc_dir) + return -EINVAL; + + if (qctxt->lqc_stats != NULL) + lprocfs_free_stats(&qctxt->lqc_stats); + + lprocfs_remove(&qctxt->lqc_proc_dir); + return 0; +} + +#endif /* LPROCFS */ +#endif diff --git a/lustre/quota/quota_adjust_qunit.c b/lustre/quota/quota_adjust_qunit.c new file mode 100644 index 0000000..20ee26b --- /dev/null +++ b/lustre/quota/quota_adjust_qunit.c @@ -0,0 +1,419 @@ +/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- + * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ +#ifndef EXPORT_SYMTAB +# define EXPORT_SYMTAB +#endif +#define DEBUG_SUBSYSTEM S_LQUOTA + +#ifdef __KERNEL__ +# include +# include +# include +# include +# include +# include +# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +# include +# include +# include +# include +# else +# include +# endif +#else /* __KERNEL__ */ +# include +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include "quota_internal.h" + +#ifdef HAVE_QUOTA_SUPPORT + +#ifdef __KERNEL__ +/** + * This function is in charge of recording lqs_ino_rec and + * lqs_blk_rec. It is called when a quota slave checks a quota + * request (check_cur_qunit) and when it finishes one + * (dqacq_completion).
+ * is_chk: whether it is checking quota; otherwise, it is finishing + * is_acq: whether it is acquiring; otherwise, it is releasing + */ +void quota_compute_lqs(struct qunit_data *qdata, struct lustre_qunit_size *lqs, + int is_chk, int is_acq) +{ + int is_blk; + + LASSERT(qdata && lqs); + LASSERT_SPIN_LOCKED(&lqs->lqs_lock); + is_blk = QDATA_IS_BLK(qdata); + + if (is_chk) { + if (is_acq) { + if (is_blk) + lqs->lqs_blk_rec += qdata->qd_count; + else + lqs->lqs_ino_rec += qdata->qd_count; + } else { + if (is_blk) + lqs->lqs_blk_rec -= qdata->qd_count; + else + lqs->lqs_ino_rec -= qdata->qd_count; + } + } else { + if (is_acq) { + if (is_blk) + lqs->lqs_blk_rec -= qdata->qd_count; + else + lqs->lqs_ino_rec -= qdata->qd_count; + } else { + if (is_blk) + lqs->lqs_blk_rec += qdata->qd_count; + else + lqs->lqs_ino_rec += qdata->qd_count; + } + } +} + +void qdata_to_oqaq(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq) +{ + LASSERT(qdata); + LASSERT(oqaq); + + oqaq->qaq_flags = qdata->qd_flags; + oqaq->qaq_id = qdata->qd_id; + if (QDATA_IS_ADJBLK(qdata)) + oqaq->qaq_bunit_sz = qdata->qd_qunit; + if (QDATA_IS_ADJINO(qdata)) + oqaq->qaq_iunit_sz = qdata->qd_qunit; +} + +int quota_search_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt, + struct lustre_qunit_size **lqs_return) +{ + struct quota_adjust_qunit *oqaq_tmp = NULL; + ENTRY; + + LASSERT(*lqs_return == NULL); + LASSERT(oqaq || qdata); + + if (!oqaq) { + OBD_ALLOC_PTR(oqaq_tmp); + if (!oqaq_tmp) + RETURN(-ENOMEM); + qdata_to_oqaq(qdata, oqaq_tmp); + } else { + oqaq_tmp = oqaq; + } + + *lqs_return = lustre_hash_lookup(qctxt->lqc_lqs_hash, oqaq_tmp); + if (*lqs_return) + LQS_DEBUG((*lqs_return), "show lqs\n"); + + if (!oqaq) + OBD_FREE_PTR(oqaq_tmp); + RETURN(0); +} + +int quota_create_lqs(struct qunit_data *qdata, struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt, + struct lustre_qunit_size **lqs_return) +{ + struct lustre_qunit_size *lqs = NULL; + int rc = 0; + ENTRY; + + LASSERT(*lqs_return == NULL); + LASSERT(oqaq || qdata); + + OBD_ALLOC_PTR(lqs); + if (!lqs) + GOTO(out, rc = -ENOMEM); + + if (!oqaq) + qdata_to_oqaq(qdata, &lqs->lqs_key); + else + lqs->lqs_key = *oqaq; + + spin_lock_init(&lqs->lqs_lock); + lqs->lqs_bwrite_pending = 0; + lqs->lqs_iwrite_pending = 0; + lqs->lqs_ino_rec = 0; + lqs->lqs_blk_rec = 0; + lqs->lqs_id = lqs->lqs_key.qaq_id; + lqs->lqs_flags = QAQ_IS_GRP(&lqs->lqs_key); + lqs->lqs_bunit_sz = qctxt->lqc_bunit_sz; + lqs->lqs_iunit_sz = qctxt->lqc_iunit_sz; + lqs->lqs_btune_sz = qctxt->lqc_btune_sz; + lqs->lqs_itune_sz = qctxt->lqc_itune_sz; + lqs->lqs_ctxt = qctxt; + if (qctxt->lqc_handler) { + lqs->lqs_last_bshrink = 0; + lqs->lqs_last_ishrink = 0; + } + lqs_initref(lqs); + rc = lustre_hash_add_unique(qctxt->lqc_lqs_hash, + &lqs->lqs_key, &lqs->lqs_hash); + LQS_DEBUG(lqs, "create lqs\n"); + if (!rc) { + lqs_getref(lqs); + *lqs_return = lqs; + } +out: + if (rc && lqs) + OBD_FREE_PTR(lqs); + RETURN(rc); +} + +int quota_adjust_slave_lqs(struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt) +{ + struct lustre_qunit_size *lqs = NULL; + unsigned long *lbunit, *liunit, *lbtune, *litune; + signed long b_tmp = 0, i_tmp = 0; + cfs_time_t time_limit = 0; + int rc = 0; + ENTRY; + + LASSERT(qctxt); +search_lqs: + rc = quota_search_lqs(NULL, oqaq, qctxt, &lqs); + + /* deleting the lqs, because a user sets lfs quota 0 0 0 0 */ + if (!oqaq->qaq_bunit_sz && !oqaq->qaq_iunit_sz && QAQ_IS_ADJBLK(oqaq) && + QAQ_IS_ADJINO(oqaq)) { + if (lqs) { + 
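The four branches of quota_compute_lqs collapse to one rule: a pending record grows while a request is being checked and shrinks when it completes, with acquires and releases moving the balance in opposite directions. A compact restatement that is easy to unit-test (plain longs stand in for the lqs fields):

#include <stdio.h>

/* Pending-record delta for one event:
 *   is_chk = 1 when checking (request going out), 0 on completion;
 *   is_acq = 1 for an acquire, 0 for a release. */
static long lqs_delta(long count, int is_chk, int is_acq)
{
        /* check+acquire and completion+release add; the other two subtract */
        return (is_chk == is_acq) ? count : -count;
}

int main(void)
{
        long blk_rec = 0;

        blk_rec += lqs_delta(100, 1, 1);   /* check an acquire: +100 */
        blk_rec += lqs_delta(100, 0, 1);   /* that acquire completes: -100 */
        printf("blk_rec after acquire round-trip: %ld\n", blk_rec);  /* 0 */
        return 0;
}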
LQS_DEBUG(lqs, "release lqs\n"); + /* this is for quota_search_lqs */ + lqs_putref(lqs); + /* kill lqs */ + lqs_putref(lqs); + } + RETURN(rc); + } + + if (!lqs) { + rc = quota_create_lqs(NULL, oqaq, qctxt, &lqs); + if (rc == -EALREADY) + goto search_lqs; + if (rc < 0) + RETURN(rc); + } + + lbunit = &lqs->lqs_bunit_sz; + liunit = &lqs->lqs_iunit_sz; + lbtune = &lqs->lqs_btune_sz; + litune = &lqs->lqs_itune_sz; + + CDEBUG(D_QUOTA, "before: bunit: %lu, iunit: %lu.\n", *lbunit, *liunit); + spin_lock(&lqs->lqs_lock); + /* adjust the slave's block qunit size */ + if (QAQ_IS_ADJBLK(oqaq)) { + cfs_duration_t sec = cfs_time_seconds(qctxt->lqc_switch_seconds); + + b_tmp = *lbunit - oqaq->qaq_bunit_sz; + + if (qctxt->lqc_handler && b_tmp > 0) + lqs->lqs_last_bshrink = cfs_time_current(); + + if (qctxt->lqc_handler && b_tmp < 0) { + time_limit = cfs_time_add(lqs->lqs_last_bshrink, sec); + if (!lqs->lqs_last_bshrink || + cfs_time_after(cfs_time_current(), time_limit)) { + *lbunit = oqaq->qaq_bunit_sz; + *lbtune = (*lbunit) / 2; + } else { + b_tmp = 0; + } + } else { + *lbunit = oqaq->qaq_bunit_sz; + *lbtune = (*lbunit) / 2; + } + } + + /* adjust the slave's file qunit size */ + if (QAQ_IS_ADJINO(oqaq)) { + i_tmp = *liunit - oqaq->qaq_iunit_sz; + + if (qctxt->lqc_handler && i_tmp > 0) + lqs->lqs_last_ishrink = cfs_time_current(); + + if (qctxt->lqc_handler && i_tmp < 0) { + time_limit = cfs_time_add(lqs->lqs_last_ishrink, + cfs_time_seconds(qctxt-> + lqc_switch_seconds)); + if (!lqs->lqs_last_ishrink || + cfs_time_after(cfs_time_current(), time_limit)) { + *liunit = oqaq->qaq_iunit_sz; + *litune = (*liunit) / 2; + } else { + i_tmp = 0; + } + } else { + *liunit = oqaq->qaq_iunit_sz; + *litune = (*liunit) / 2; + } + } + spin_unlock(&lqs->lqs_lock); + CDEBUG(D_QUOTA, "after: bunit: %lu, iunit: %lu.\n", *lbunit, *liunit); + + lqs_putref(lqs); + + if (b_tmp > 0) + rc |= LQS_BLK_DECREASE; + else if (b_tmp < 0) + rc |= LQS_BLK_INCREASE; + + if (i_tmp > 0) + rc |= LQS_INO_DECREASE; + else if (i_tmp < 0) + rc |= LQS_INO_INCREASE; + + RETURN(rc); +} + +int filter_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt) +{ + struct obd_device *obd = exp->exp_obd; + unsigned int uid = 0, gid = 0; + int rc = 0; + ENTRY; + + LASSERT(oqaq); + LASSERT(QAQ_IS_ADJBLK(oqaq)); + rc = quota_adjust_slave_lqs(oqaq, qctxt); + if (rc < 0) { + CERROR("adjust mds slave's qunit size failed!(rc:%d)\n", rc); + RETURN(rc); + } + if (QAQ_IS_GRP(oqaq)) + gid = oqaq->qaq_id; + else + uid = oqaq->qaq_id; + + if (rc > 0) { + rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 0, NULL); + if (rc == -EDQUOT || rc == -EBUSY || rc == -EAGAIN) { + CDEBUG(D_QUOTA, "rc: %d.\n", rc); + rc = 0; + } + if (rc) + CERROR("slave adjust block quota failed!(rc:%d)\n", rc); + } + RETURN(rc); +} +#endif /* __KERNEL__ */ +#endif + +int client_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt) +{ + struct ptlrpc_request *req; + struct quota_adjust_qunit *oqa; + int rc = 0; + ENTRY; + + /* client don't support this kind of operation, abort it */ + if (!(exp->exp_connect_flags & OBD_CONNECT_CHANGE_QS)) { + CDEBUG(D_QUOTA, "osc: %s don't support change qunit size\n", + exp->exp_obd->obd_name); + RETURN(rc); + } + if (strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) + RETURN(-EINVAL); + + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), + &RQF_OST_QUOTA_ADJUST_QUNIT, + LUSTRE_OST_VERSION, + OST_QUOTA_ADJUST_QUNIT); + if (req == NULL) 
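quota_adjust_slave_lqs above applies shrink requests immediately, but defers qunit enlargements until lqc_switch_seconds have passed since the last shrink, which damps oscillation between shrinking and growing. The core of that throttle as a sketch with a toy clock (SWITCH_SECONDS and all names are illustrative stand-ins):

#include <stdio.h>

static long now;                 /* toy clock, in seconds */
#define SWITCH_SECONDS 300       /* illustrative lqc_switch_seconds */

static long unit_sz = 64;
static long last_shrink;         /* 0 = never shrank */

/* Apply a requested qunit size; enlargements are deferred if a shrink
 * happened less than SWITCH_SECONDS ago. Returns 1 if applied. */
static int adjust_unit(long requested)
{
        if (requested < unit_sz) {      /* shrink: always allowed */
                last_shrink = now;
                unit_sz = requested;
                return 1;
        }
        if (last_shrink && now < last_shrink + SWITCH_SECONDS)
                return 0;               /* too soon to grow back */
        unit_sz = requested;
        return 1;
}

int main(void)
{
        now = 1000; printf("shrink to 32: %d\n", adjust_unit(32));       /* 1 */
        now = 1100; printf("grow to 128 after 100s: %d\n", adjust_unit(128)); /* 0 */
        now = 1400; printf("grow to 128 after 400s: %d\n", adjust_unit(128)); /* 1 */
        return 0;
}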
+ RETURN(-ENOMEM); + + oqa = req_capsule_client_get(&req->rq_pill, &RMF_QUOTA_ADJUST_QUNIT); + *oqa = *oqaq; + + ptlrpc_request_set_replen(req); + + rc = ptlrpc_queue_wait(req); + if (rc) + CERROR("%s: %s failed: rc = %d\n", exp->exp_obd->obd_name, + __FUNCTION__, rc); + ptlrpc_req_finished(req); + RETURN (rc); +} + +int lov_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lov_obd *lov = &obd->u.lov; + int i, rc = 0; + ENTRY; + + if (!QAQ_IS_ADJBLK(oqaq)) { + CERROR("bad qaq_flags %x for lov obd.\n", oqaq->qaq_flags); + RETURN(-EFAULT); + } + + for (i = 0; i < lov->desc.ld_tgt_count; i++) { + int err; + + if (!lov->lov_tgts[i]->ltd_active) { + CDEBUG(D_HA, "ost %d is inactive\n", i); + continue; + } + + err = obd_quota_adjust_qunit(lov->lov_tgts[i]->ltd_exp, oqaq, + NULL); + if (err) { + if (lov->lov_tgts[i]->ltd_active && !rc) + rc = err; + continue; + } + } + RETURN(rc); +} diff --git a/lustre/quota/quota_check.c b/lustre/quota/quota_check.c index 20ffb9b..62fc1f0 100644 --- a/lustre/quota/quota_check.c +++ b/lustre/quota/quota_check.c @@ -33,10 +33,12 @@ * This file is part of Lustre, http://www.lustre.org/ * Lustre is a trademark of Sun Microsystems, Inc. */ + + #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_MDS +#define DEBUG_SUBSYSTEM S_LQUOTA #ifdef __KERNEL__ # include @@ -62,6 +64,7 @@ #include #include "quota_internal.h" +#ifdef HAVE_QUOTA_SUPPORT #ifdef __KERNEL__ static int target_quotacheck_callback(struct obd_export *exp, struct obd_quotactl *oqctl) @@ -71,7 +74,7 @@ static int target_quotacheck_callback(struct obd_export *exp, int rc; ENTRY; - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), &RQF_QC_CALLBACK, + req = ptlrpc_request_alloc_pack(exp->exp_imp_reverse, &RQF_QC_CALLBACK, LUSTRE_OBD_VERSION, OBD_QC_CALLBACK); if (req == NULL) RETURN(-ENOMEM); @@ -99,7 +102,7 @@ static int target_quotacheck_thread(void *data) ptlrpc_daemonize("quotacheck"); exp = qta->qta_exp; - obd = exp->exp_obd; + obd = qta->qta_obd; oqctl = &qta->qta_oqctl; push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); @@ -118,9 +121,9 @@ static int target_quotacheck_thread(void *data) return rc; } -int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) +int target_quota_check(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl) { - struct obd_device *obd = exp->exp_obd; struct obd_device_target *obt = &obd->u.obt; struct quotacheck_thread_args *qta; int rc = 0; @@ -136,7 +139,9 @@ int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) GOTO(out, rc = -ENOMEM); qta->qta_exp = exp; + qta->qta_obd = obd; qta->qta_oqctl = *oqctl; + qta->qta_oqctl.qc_id = obt->obt_qfmt; /* override qfmt version */ qta->qta_sb = obt->obt_sb; qta->qta_sem = &obt->obt_quotachecking; @@ -166,27 +171,31 @@ out: } #endif /* __KERNEL__ */ +#endif /* HAVE_QUOTA_SUPPORT */ -int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) +int client_quota_check(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) { - struct client_obd *cli = &exp->exp_obd->u.cli; - struct ptlrpc_request *req; - struct obd_quotactl *body; - int ver, opc, rc; + struct client_obd *cli = &exp->exp_obd->u.cli; + struct ptlrpc_request *req; + struct obd_quotactl *body; + const struct req_format *rf; + int ver, opc, rc; ENTRY; if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) { + rf = 
&RQF_MDS_QUOTACHECK; ver = LUSTRE_MDS_VERSION; opc = MDS_QUOTACHECK; } else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + rf = &RQF_OST_QUOTACHECK; ver = LUSTRE_OST_VERSION; opc = OST_QUOTACHECK; } else { RETURN(-EINVAL); } - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_MDS_QUOTACHECK, ver, opc); + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), rf, ver, opc); if (req == NULL) RETURN(-ENOMEM); @@ -220,18 +229,44 @@ int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk) qchk->obd_uuid = cli->cl_target_uuid; /* FIXME change strncmp to strcmp and save the strlen op */ if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME, - strlen(LUSTRE_OSC_NAME))) + strlen(LUSTRE_OSC_NAME)) == 0) memcpy(qchk->obd_type, LUSTRE_OST_NAME, strlen(LUSTRE_OST_NAME)); else if (strncmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME, - strlen(LUSTRE_MDC_NAME))) + strlen(LUSTRE_MDC_NAME)) == 0) memcpy(qchk->obd_type, LUSTRE_MDS_NAME, strlen(LUSTRE_MDS_NAME)); RETURN(rc); } -int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) +int lmv_quota_check(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt; + int i, rc = 0; + ENTRY; + + for (i = 0, tgt = lmv->tgts; i < lmv->desc.ld_tgt_count; i++, tgt++) { + int err; + + if (!tgt->ltd_active) { + CERROR("lmv idx %d inactive\n", i); + RETURN(-EIO); + } + + err = obd_quotacheck(tgt->ltd_exp, oqctl); + if (err && !rc) + rc = err; + } + + RETURN(rc); +} + +int lov_quota_check(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) { struct obd_device *obd = class_exp2obd(exp); struct lov_obd *lov = &obd->u.lov; @@ -247,7 +282,7 @@ int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl) } err = obd_quotacheck(lov->lov_tgts[i]->ltd_exp, oqctl); - if (err && lov->lov_tgts[i]->ltd_active && !rc) + if (err && !rc) rc = err; } diff --git a/lustre/quota/quota_context.c b/lustre/quota/quota_context.c index 2b7635f..1e928c3 100644 --- a/lustre/quota/quota_context.c +++ b/lustre/quota/quota_context.c @@ -44,7 +44,7 @@ # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_MDS +#define DEBUG_SUBSYSTEM S_LQUOTA #include #include @@ -57,33 +57,94 @@ #include #include #include +#include +#include #include "quota_internal.h" -unsigned long default_bunit_sz = 100 * 1024 * 1024; /* 100M bytes */ -unsigned long default_btune_ratio = 50; /* 50 percentage */ -unsigned long default_iunit_sz = 5000; /* 5000 inodes */ -unsigned long default_itune_ratio = 50; /* 50 percentage */ +#ifdef HAVE_QUOTA_SUPPORT + +static lustre_hash_ops_t lqs_hash_ops; + +unsigned long default_bunit_sz = 128 * 1024 * 1024; /* 128M bytes */ +unsigned long default_btune_ratio = 50; /* 50 percentage */ +unsigned long default_iunit_sz = 5120; /* 5120 inodes */ +unsigned long default_itune_ratio = 50; /* 50 percentage */ cfs_mem_cache_t *qunit_cachep = NULL; struct list_head qunit_hash[NR_DQHASH]; spinlock_t qunit_hash_lock = SPIN_LOCK_UNLOCKED; +/* please sync qunit_state with qunit_state_names */ +enum qunit_state { + /** + * a qunit is created + */ + QUNIT_CREATED = 0, + /** + * a qunit is added into qunit hash, that means + * a quota req will be sent or is flying + */ + QUNIT_IN_HASH = 1, + /** + * a qunit is removed from qunit hash, that + * means a quota req is handled and comes back + */ + QUNIT_RM_FROM_HASH = 2, + /** + * qunit can wake up 
all threads waiting for it + */ + QUNIT_FINISHED = 3, +}; + +static const char *qunit_state_names[] = { + [QUNIT_CREATED] = "CREATED", + [QUNIT_IN_HASH] = "IN_HASH", + [QUNIT_RM_FROM_HASH] = "RM_FROM_HASH", + [QUNIT_FINISHED] = "FINISHED", +}; + struct lustre_qunit { - struct list_head lq_hash; /* Hash list in memory */ - atomic_t lq_refcnt; /* Use count */ - struct lustre_quota_ctxt *lq_ctxt; /* Quota context this applies to */ - struct qunit_data lq_data; /* See qunit_data */ - unsigned int lq_opc; /* QUOTA_DQACQ, QUOTA_DQREL */ - struct list_head lq_waiters; /* All write threads waiting for this qunit */ + struct list_head lq_hash; /** Hash list in memory */ + atomic_t lq_refcnt; /** Use count */ + struct lustre_quota_ctxt *lq_ctxt; /** Quota context this applies to */ + struct qunit_data lq_data; /** See qunit_data */ + unsigned int lq_opc; /** QUOTA_DQACQ, QUOTA_DQREL */ + cfs_waitq_t lq_waitq; /** Threads waiting for this qunit */ + spinlock_t lq_lock; /** Protect the whole structure */ + enum qunit_state lq_state; /** Present the status of qunit */ + int lq_rc; /** The rc of lq_data */ }; +#define QUNIT_SET_STATE(qunit, state) \ +do { \ + spin_lock(&qunit->lq_lock); \ + QDATA_DEBUG((&qunit->lq_data), "qunit(%p) lq_state(%s->%s), " \ + "lq_rc(%d)\n", \ + qunit, qunit_state_names[qunit->lq_state], \ + qunit_state_names[state], qunit->lq_rc); \ + qunit->lq_state = state; \ + spin_unlock(&qunit->lq_lock); \ +} while(0) + +#define QUNIT_SET_STATE_AND_RC(qunit, state, rc) \ +do { \ + spin_lock(&qunit->lq_lock); \ + qunit->lq_rc = rc; \ + QDATA_DEBUG((&qunit->lq_data), "qunit(%p) lq_state(%s->%s), " \ + "lq_rc(%d)\n", \ + qunit, qunit_state_names[qunit->lq_state], \ + qunit_state_names[state], qunit->lq_rc); \ + qunit->lq_state = state; \ + spin_unlock(&qunit->lq_lock); \ +} while(0) + + int should_translate_quota (struct obd_import *imp) { ENTRY; LASSERT(imp); - if ((imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_QUOTA64) && - !OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) + if (imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_QUOTA64) RETURN(0); else RETURN(1); @@ -135,66 +196,13 @@ static inline int qunit_hashfn(struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata) { unsigned int id = qdata->qd_id; - unsigned int type = qdata->qd_flags & QUOTA_IS_GRP; + unsigned int type = QDATA_IS_GRP(qdata); unsigned long tmp = ((unsigned long)qctxt >> L1_CACHE_SHIFT) ^ id; tmp = (tmp * (MAXQUOTAS - type)) % NR_DQHASH; return tmp; } -/* compute the remaining quota for certain gid or uid b=11693 */ -int compute_remquota(struct obd_device *obd, - struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata) -{ - struct super_block *sb = qctxt->lqc_sb; - __u64 usage, limit; - struct obd_quotactl *qctl; - int ret = QUOTA_RET_OK; - __u32 qdata_type = qdata->qd_flags & QUOTA_IS_GRP; - ENTRY; - - if (!sb_any_quota_enabled(sb)) - RETURN(QUOTA_RET_NOQUOTA); - - /* ignore root user */ - if (qdata->qd_id == 0 && qdata_type == USRQUOTA) - RETURN(QUOTA_RET_NOLIMIT); - - OBD_ALLOC_PTR(qctl); - if (qctl == NULL) - RETURN(-ENOMEM); - - /* get fs quota usage & limit */ - qctl->qc_cmd = Q_GETQUOTA; - qctl->qc_id = qdata->qd_id; - qctl->qc_type = qdata_type; - ret = fsfilt_quotactl(obd, sb, qctl); - if (ret) { - if (ret == -ESRCH) /* no limit */ - ret = QUOTA_RET_NOLIMIT; - else - CDEBUG(D_QUOTA, "can't get fs quota usage! 
(rc:%d)", - ret); - GOTO(out, ret); - } - - usage = qctl->qc_dqblk.dqb_curspace; - limit = qctl->qc_dqblk.dqb_bhardlimit << QUOTABLOCK_BITS; - if (!limit){ /* no limit */ - ret = QUOTA_RET_NOLIMIT; - GOTO(out, ret); - } - - if (limit >= usage) - qdata->qd_count = limit - usage; - else - qdata->qd_count = 0; - EXIT; -out: - OBD_FREE_PTR(qctl); - return ret; -} - /* caller must hold qunit_hash_lock */ static inline struct lustre_qunit *find_qunit(unsigned int hashent, struct lustre_quota_ctxt *qctxt, @@ -207,7 +215,9 @@ static inline struct lustre_qunit *find_qunit(unsigned int hashent, list_for_each_entry(qunit, qunit_hash + hashent, lq_hash) { tmp = &qunit->lq_data; if (qunit->lq_ctxt == qctxt && - qdata->qd_id == tmp->qd_id && qdata->qd_flags == tmp->qd_flags) + qdata->qd_id == tmp->qd_id && + (qdata->qd_flags & LQUOTA_QUNIT_FLAGS) == + (tmp->qd_flags & LQUOTA_QUNIT_FLAGS)) return qunit; } return NULL; @@ -218,9 +228,9 @@ static inline struct lustre_qunit *find_qunit(unsigned int hashent, * @qdata: the type of quota unit to be checked * * return: 1 - need acquire qunit; - * 2 - need release qunit; - * 0 - need do nothing. - * < 0 - error. + * 2 - need release qunit; + * 0 - need do nothing. + * < 0 - error. */ static int check_cur_qunit(struct obd_device *obd, @@ -228,16 +238,23 @@ check_cur_qunit(struct obd_device *obd, { struct super_block *sb = qctxt->lqc_sb; unsigned long qunit_sz, tune_sz; - __u64 usage, limit; + __u64 usage, limit, limit_org, pending_write = 0; + long long record = 0; struct obd_quotactl *qctl; + struct lustre_qunit_size *lqs = NULL; int ret = 0; - __u32 qdata_type = qdata->qd_flags & QUOTA_IS_GRP; - __u32 is_blk = (qdata->qd_flags & QUOTA_IS_BLOCK) >> 1; ENTRY; if (!sb_any_quota_enabled(sb)) RETURN(0); + spin_lock(&qctxt->lqc_lock); + if (!qctxt->lqc_valid){ + spin_unlock(&qctxt->lqc_lock); + RETURN(0); + } + spin_unlock(&qctxt->lqc_lock); + OBD_ALLOC_PTR(qctl); if (qctl == NULL) RETURN(-ENOMEM); @@ -245,7 +262,7 @@ check_cur_qunit(struct obd_device *obd, /* get fs quota usage & limit */ qctl->qc_cmd = Q_GETQUOTA; qctl->qc_id = qdata->qd_id; - qctl->qc_type = qdata_type; + qctl->qc_type = QDATA_IS_GRP(qdata); ret = fsfilt_quotactl(obd, sb, qctl); if (ret) { if (ret == -ESRCH) /* no limit */ @@ -255,57 +272,149 @@ check_cur_qunit(struct obd_device *obd, GOTO(out, ret); } - if (is_blk) { + if (QDATA_IS_BLK(qdata)) { usage = qctl->qc_dqblk.dqb_curspace; limit = qctl->qc_dqblk.dqb_bhardlimit << QUOTABLOCK_BITS; - qunit_sz = qctxt->lqc_bunit_sz; - tune_sz = qctxt->lqc_btune_sz; - - LASSERT(!(qunit_sz % QUOTABLOCK_SIZE)); } else { usage = qctl->qc_dqblk.dqb_curinodes; limit = qctl->qc_dqblk.dqb_ihardlimit; - qunit_sz = qctxt->lqc_iunit_sz; - tune_sz = qctxt->lqc_itune_sz; } - /* ignore the no quota limit case */ + /* ignore the no quota limit case; and it can avoid creating + * unnecessary lqs for uid/gid */ if (!limit) GOTO(out, ret = 0); + search_lqs: + quota_search_lqs(qdata, NULL, qctxt, &lqs); + if (!lqs) { + CDEBUG(D_QUOTA, "Can't find the lustre qunit size!\n"); + ret = quota_create_lqs(qdata, NULL, qctxt, &lqs); + if (ret == -EALREADY) { + ret = 0; + goto search_lqs; + } + if (ret < 0) + GOTO (out, ret); + } + spin_lock(&lqs->lqs_lock); + + if (QDATA_IS_BLK(qdata)) { + qunit_sz = lqs->lqs_bunit_sz; + tune_sz = lqs->lqs_btune_sz; + pending_write = lqs->lqs_bwrite_pending * CFS_PAGE_SIZE; + record = lqs->lqs_blk_rec; + LASSERT(!(qunit_sz % QUOTABLOCK_SIZE)); + } else { + /* we didn't need change inode qunit size now */ + qunit_sz = lqs->lqs_iunit_sz; + tune_sz = 
lqs->lqs_itune_sz; + pending_write = lqs->lqs_iwrite_pending; + record = lqs->lqs_ino_rec; + } + /* we don't count the MIN_QLIMIT */ - if ((limit == MIN_QLIMIT && !is_blk) || - (toqb(limit) == MIN_QLIMIT && is_blk)) + if ((limit == MIN_QLIMIT && !QDATA_IS_BLK(qdata)) || + (toqb(limit) == MIN_QLIMIT && QDATA_IS_BLK(qdata))) limit = 0; + usage += pending_write; + limit_org = limit; + /* when a releasing quota req is sent, before it returned + limit is assigned a small value. limit will overflow */ + if (limit + record < 0) + usage -= record; + else + limit += record; + LASSERT(qdata->qd_count == 0); if (limit <= usage + tune_sz) { - while (qdata->qd_count + limit <= usage + tune_sz) + while (qdata->qd_count + limit <= + usage + tune_sz) qdata->qd_count += qunit_sz; ret = 1; - } else if (limit > usage + qunit_sz + tune_sz) { - while (limit - qdata->qd_count > usage + qunit_sz + tune_sz) + } else if (limit > usage + qunit_sz + tune_sz && + limit_org > qdata->qd_count + qunit_sz) { + while (limit - qdata->qd_count > usage + qunit_sz + tune_sz && + limit_org > qdata->qd_count + qunit_sz) qdata->qd_count += qunit_sz; ret = 2; + /* if there are other pending writes for this uid/gid, releasing + * quota is put off until the last pending write b=16645 */ + if (ret == 2 && pending_write) { + CDEBUG(D_QUOTA, "delay quota release\n"); + ret = 0; + } } + CDEBUG(D_QUOTA, "type: %c, limit: "LPU64", usage: "LPU64 + ", pending_write: "LPU64", record: "LPD64 + ", qunit_sz: %lu, tune_sz: %lu, ret: %d.\n", + QDATA_IS_BLK(qdata) ? 'b' : 'i', limit, usage, pending_write, + record, qunit_sz, tune_sz, ret); LASSERT(ret == 0 || qdata->qd_count); + + spin_unlock(&lqs->lqs_lock); + lqs_putref(lqs); EXIT; -out: + out: OBD_FREE_PTR(qctl); return ret; } -/* caller must hold qunit_hash_lock */ -static struct lustre_qunit *dqacq_in_flight(struct lustre_quota_ctxt *qctxt, - struct qunit_data *qdata) +/** + * Compute the remaining quota for certain gid or uid b=11693 + */ +int compute_remquota(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, + struct qunit_data *qdata, int isblk) { - unsigned int hashent = qunit_hashfn(qctxt, qdata); - struct lustre_qunit *qunit; + struct super_block *sb = qctxt->lqc_sb; + __u64 usage, limit; + struct obd_quotactl *qctl; + int ret = QUOTA_RET_OK; ENTRY; - LASSERT_SPIN_LOCKED(&qunit_hash_lock); - qunit = find_qunit(hashent, qctxt, qdata); - RETURN(qunit); + if (!sb_any_quota_enabled(sb)) + RETURN(QUOTA_RET_NOQUOTA); + + /* ignore root user */ + if (qdata->qd_id == 0 && QDATA_IS_GRP(qdata) == USRQUOTA) + RETURN(QUOTA_RET_NOLIMIT); + + OBD_ALLOC_PTR(qctl); + if (qctl == NULL) + RETURN(-ENOMEM); + + /* get fs quota usage & limit */ + qctl->qc_cmd = Q_GETQUOTA; + qctl->qc_id = qdata->qd_id; + qctl->qc_type = QDATA_IS_GRP(qdata); + ret = fsfilt_quotactl(obd, sb, qctl); + if (ret) { + if (ret == -ESRCH) /* no limit */ + ret = QUOTA_RET_NOLIMIT; + else + CDEBUG(D_QUOTA, "can't get fs quota usage! (rc:%d)", + ret); + GOTO(out, ret); + } + + usage = isblk ? qctl->qc_dqblk.dqb_curspace : + qctl->qc_dqblk.dqb_curinodes; + limit = isblk ? 
qctl->qc_dqblk.dqb_bhardlimit << QUOTABLOCK_BITS : + qctl->qc_dqblk.dqb_ihardlimit; + if (!limit){ /* no limit */ + ret = QUOTA_RET_NOLIMIT; + GOTO(out, ret); + } + + if (limit >= usage) + qdata->qd_count = limit - usage; + else + qdata->qd_count = 0; + EXIT; +out: + OBD_FREE_PTR(qctl); + return ret; } static struct lustre_qunit *alloc_qunit(struct lustre_quota_ctxt *qctxt, @@ -319,12 +428,13 @@ static struct lustre_qunit *alloc_qunit(struct lustre_quota_ctxt *qctxt, RETURN(NULL); CFS_INIT_LIST_HEAD(&qunit->lq_hash); - CFS_INIT_LIST_HEAD(&qunit->lq_waiters); + init_waitqueue_head(&qunit->lq_waitq); atomic_set(&qunit->lq_refcnt, 1); qunit->lq_ctxt = qctxt; memcpy(&qunit->lq_data, qdata, sizeof(*qdata)); qunit->lq_opc = opc; - + qunit->lq_lock = SPIN_LOCK_UNLOCKED; + QUNIT_SET_STATE_AND_RC(qunit, QUNIT_CREATED, 0); RETURN(qunit); } @@ -345,102 +455,95 @@ static void qunit_put(struct lustre_qunit *qunit) free_qunit(qunit); } +/* caller must hold qunit_hash_lock and release ref of qunit after using it */ +static struct lustre_qunit *dqacq_in_flight(struct lustre_quota_ctxt *qctxt, + struct qunit_data *qdata) +{ + unsigned int hashent = qunit_hashfn(qctxt, qdata); + struct lustre_qunit *qunit; + ENTRY; + + LASSERT_SPIN_LOCKED(&qunit_hash_lock); + qunit = find_qunit(hashent, qctxt, qdata); + if (qunit) + qunit_get(qunit); + RETURN(qunit); +} + static void insert_qunit_nolock(struct lustre_quota_ctxt *qctxt, struct lustre_qunit *qunit) { struct list_head *head; LASSERT(list_empty(&qunit->lq_hash)); + qunit_get(qunit); head = qunit_hash + qunit_hashfn(qctxt, &qunit->lq_data); list_add(&qunit->lq_hash, head); + QUNIT_SET_STATE(qunit, QUNIT_IN_HASH); +} + +static void compute_lqs_after_removing_qunit(struct lustre_qunit *qunit) +{ + struct lustre_qunit_size *lqs = NULL; + + quota_search_lqs(&qunit->lq_data, NULL, qunit->lq_ctxt, &lqs); + if (lqs) { + spin_lock(&lqs->lqs_lock); + if (qunit->lq_opc == QUOTA_DQACQ) + quota_compute_lqs(&qunit->lq_data, lqs, 0, 1); + if (qunit->lq_opc == QUOTA_DQREL) + quota_compute_lqs(&qunit->lq_data, lqs, 0, 0); + spin_unlock(&lqs->lqs_lock); + /* this is for quota_search_lqs */ + lqs_putref(lqs); + /* this is for schedule_dqacq */ + lqs_putref(lqs); + } + } static void remove_qunit_nolock(struct lustre_qunit *qunit) { LASSERT(!list_empty(&qunit->lq_hash)); + LASSERT_SPIN_LOCKED(&qunit_hash_lock); + list_del_init(&qunit->lq_hash); + QUNIT_SET_STATE(qunit, QUNIT_RM_FROM_HASH); + qunit_put(qunit); } -struct qunit_waiter { - struct list_head qw_entry; - cfs_waitq_t qw_waitq; - int qw_rc; -}; - #define INC_QLIMIT(limit, count) (limit == MIN_QLIMIT) ? \ (limit = count) : (limit += count) -/* FIXME check if this mds is the master of specified id */ -static int -is_master(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, - unsigned int id, int type) +static inline int is_master(struct lustre_quota_ctxt *qctxt) { return qctxt->lqc_handler ? 1 : 0; } static int schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, - struct qunit_data *qdata, int opc, int wait); - -static int split_before_schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, - struct qunit_data *qdata, int opc, int wait) -{ - int rc = 0; - unsigned long factor; - struct qunit_data tmp_qdata; - ENTRY; - - LASSERT(qdata && qdata->qd_count); - QDATA_DEBUG(qdata, "%s quota split.\n", - (qdata->qd_flags & QUOTA_IS_BLOCK) ? 
"block" : "inode"); - if (qdata->qd_flags & QUOTA_IS_BLOCK) - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz * - qctxt->lqc_bunit_sz; - else - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz * - qctxt->lqc_iunit_sz; - - if (qctxt->lqc_import && should_translate_quota(qctxt->lqc_import) && - qdata->qd_count > factor) { - tmp_qdata = *qdata; - tmp_qdata.qd_count = factor; - qdata->qd_count -= tmp_qdata.qd_count; - QDATA_DEBUG((&tmp_qdata), "be split.\n"); - rc = schedule_dqacq(obd, qctxt, &tmp_qdata, opc, wait); - } else{ - QDATA_DEBUG(qdata, "don't be split.\n"); - rc = schedule_dqacq(obd, qctxt, qdata, opc, wait); - } - - RETURN(rc); -} + struct qunit_data *qdata, int opc, int wait, + struct obd_trans_info *oti); static int -dqacq_completion(struct obd_device *obd, - struct lustre_quota_ctxt *qctxt, +dqacq_completion(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata, int rc, int opc) { struct lustre_qunit *qunit = NULL; struct super_block *sb = qctxt->lqc_sb; - unsigned long qunit_sz; - struct qunit_waiter *qw, *tmp; int err = 0; - __u32 qdata_type = qdata->qd_flags & QUOTA_IS_GRP; - __u32 is_blk = (qdata->qd_flags & QUOTA_IS_BLOCK) >> 1; - __u64 qd_tmp = qdata->qd_count; - unsigned long div_r; + struct quota_adjust_qunit *oqaq = NULL; + int rc1 = 0; ENTRY; LASSERT(qdata); - qunit_sz = is_blk ? qctxt->lqc_bunit_sz : qctxt->lqc_iunit_sz; - div_r = do_div(qd_tmp, qunit_sz); - LASSERTF(!div_r, "qunit_sz: %lu, return qunit_sz: "LPU64"\n", - qunit_sz, qd_tmp); + QDATA_DEBUG(qdata, "obd(%s): complete %s quota req\n", + obd->obd_name, (opc == QUOTA_DQACQ) ? "acq" : "rel"); /* update local operational quota file */ if (rc == 0) { - __u32 count = QUSG(qdata->qd_count, is_blk); + __u64 count = QUSG(qdata->qd_count, QDATA_IS_BLK(qdata)); struct obd_quotactl *qctl; __u64 *hardlimit; @@ -453,14 +556,14 @@ dqacq_completion(struct obd_device *obd, * set fs quota limit */ qctl->qc_cmd = Q_GETQUOTA; qctl->qc_id = qdata->qd_id; - qctl->qc_type = qdata_type; + qctl->qc_type = QDATA_IS_GRP(qdata); err = fsfilt_quotactl(obd, sb, qctl); if (err) { CERROR("error get quota fs limit! (rc:%d)\n", err); GOTO(out_mem, err); } - if (is_blk) { + if (QDATA_IS_BLK(qdata)) { qctl->qc_dqblk.dqb_valid = QIF_BLIMITS; hardlimit = &qctl->qc_dqblk.dqb_bhardlimit; } else { @@ -468,20 +571,24 @@ dqacq_completion(struct obd_device *obd, hardlimit = &qctl->qc_dqblk.dqb_ihardlimit; } + CDEBUG(D_QUOTA, "hardlimt: "LPU64"\n", *hardlimit); + + if (*hardlimit == 0) + goto out_mem; + switch (opc) { case QUOTA_DQACQ: - CDEBUG(D_QUOTA, "%s(acq):count: %d, hardlimt: "LPU64 - ",type: %s.\n", obd->obd_name, count, *hardlimit, - qdata_type ? "grp": "usr"); INC_QLIMIT(*hardlimit, count); break; case QUOTA_DQREL: - CDEBUG(D_QUOTA, "%s(rel):count: %d, hardlimt: "LPU64 - ",type: %s.\n", obd->obd_name, count, *hardlimit, - qdata_type ? "grp": "usr"); - LASSERTF(count < *hardlimit, - "count: %d, hardlimit: "LPU64".\n", - count, *hardlimit); + LASSERTF(count < *hardlimit, + "id(%u) flag(%u) type(%c) isblk(%c) " + "count("LPU64") qd_qunit("LPU64") " + "hardlimit("LPU64").\n", + qdata->qd_id, qdata->qd_flags, + QDATA_IS_GRP(qdata) ? 'g' : 'u', + QDATA_IS_BLK(qdata) ? 'b': 'i', + qdata->qd_count, qdata->qd_qunit, *hardlimit); *hardlimit -= count; break; default: @@ -516,40 +623,59 @@ out: /* this qunit has been removed by qctxt_cleanup() */ if (!qunit) { spin_unlock(&qunit_hash_lock); + QDATA_DEBUG(qdata, "%s is discarded because qunit isn't found\n", + opc == QUOTA_DQACQ ? 
"DQACQ" : "DQREL"); RETURN(err); } LASSERT(opc == qunit->lq_opc); + /* remove this qunit from lq_hash so that new processes cannot be added + * to qunit->lq_waiters */ remove_qunit_nolock(qunit); + spin_unlock(&qunit_hash_lock); - /* wake up all waiters */ - list_for_each_entry_safe(qw, tmp, &qunit->lq_waiters, qw_entry) { - list_del_init(&qw->qw_entry); - qw->qw_rc = rc; - wake_up(&qw->qw_waitq); - } + compute_lqs_after_removing_qunit(qunit); - spin_unlock(&qunit_hash_lock); + /* wake up all waiters */ + QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, rc); + wake_up_all(&qunit->lq_waitq); + /* this is for dqacq_in_flight() */ + qunit_put(qunit); + /* this is for alloc_qunit() */ qunit_put(qunit); + if (rc < 0 && rc != -EDQUOT) + RETURN(err); /* don't reschedule in such cases: - * - acq/rel failure, but not for quota recovery. + * - acq/rel failure and qunit isn't changed, + * but not for quota recovery. * - local dqacq/dqrel. * - local disk io failure. */ - if (err || (rc && rc != -EBUSY) || - is_master(obd, qctxt, qdata->qd_id, qdata_type)) + OBD_ALLOC_PTR(oqaq); + if (!oqaq) + RETURN(-ENOMEM); + qdata_to_oqaq(qdata, oqaq); + /* adjust the qunit size in slaves */ + rc1 = quota_adjust_slave_lqs(oqaq, qctxt); + OBD_FREE_PTR(oqaq); + if (rc1 < 0) { + CERROR("adjust slave's qunit size failed!(rc:%d)\n", rc1); + RETURN(rc1); + } + if (err || (rc && rc != -EBUSY && rc1 == 0) || is_master(qctxt)) RETURN(err); /* reschedule another dqacq/dqrel if needed */ qdata->qd_count = 0; - rc = check_cur_qunit(obd, qctxt, qdata); - if (rc > 0) { + qdata->qd_flags &= LQUOTA_QUNIT_FLAGS; + rc1 = check_cur_qunit(obd, qctxt, qdata); + if (rc1 > 0) { int opc; - opc = rc == 1 ? QUOTA_DQACQ : QUOTA_DQREL; - rc = split_before_schedule_dqacq(obd, qctxt, qdata, opc, 0); - QDATA_DEBUG(qdata, "reschedudle opc(%d) rc(%d)\n", opc, rc); + opc = rc1 == 1 ? QUOTA_DQACQ : QUOTA_DQREL; + rc1 = schedule_dqacq(obd, qctxt, qdata, opc, 0, NULL); + QDATA_DEBUG(qdata, "reschedudle opc(%d) rc(%d)\n", opc, rc1); } RETURN(err); } @@ -564,185 +690,304 @@ static int dqacq_interpret(const struct lu_env *env, { struct dqacq_async_args *aa = (struct dqacq_async_args *)data; struct lustre_quota_ctxt *qctxt = aa->aa_ctxt; + struct obd_device_target *obt = qctxt->lqc_obt; struct lustre_qunit *qunit = aa->aa_qunit; struct obd_device *obd = req->rq_import->imp_obd; struct qunit_data *qdata = NULL; - struct qunit_data_old *qdata_old = NULL; + int rc1 = 0; ENTRY; LASSERT(req); LASSERT(req->rq_import); - if ((req->rq_import->imp_connect_data.ocd_connect_flags & - OBD_CONNECT_QUOTA64) && - !OBD_FAIL_CHECK(OBD_FAIL_QUOTA_QD_COUNT_32BIT)) { - CDEBUG(D_QUOTA, "qd_count is 64bit!\n"); - - qdata = req_capsule_server_swab_get(&req->rq_pill, - &RMF_QUNIT_DATA, - (void*)lustre_swab_qdata); - } else { - CDEBUG(D_QUOTA, "qd_count is 32bit!\n"); + /* there are several forms of qunit(historic causes), so we need to + * adjust qunit from slaves to the same form here */ + OBD_ALLOC(qdata, sizeof(struct qunit_data)); + if (!qdata) + RETURN(-ENOMEM); - qdata = req_capsule_server_swab_get(&req->rq_pill, - &RMF_QUNIT_DATA, - (void*)lustre_swab_qdata_old); - qdata = lustre_quota_old_to_new(qdata_old); - } - if (qdata == NULL) { - DEBUG_REQ(D_ERROR, req, "error unpacking qunit_data"); - RETURN(-EPROTO); + down_read(&obt->obt_rwsem); + /* if a quota req timeouts or is dropped, we should update quota + * statistics which will be handled in dqacq_completion. 
And in + * this situation we should get qdata from request instead of + * reply */ + rc1 = quota_get_qdata(req, qdata, + (rc != 0) ? QUOTA_REQUEST : QUOTA_REPLY, + QUOTA_IMPORT); + if (rc1 < 0) { + DEBUG_REQ(D_ERROR, req, + "error unpacking qunit_data(rc: %d)\n", rc1); + GOTO(exit, rc = rc1); } - LASSERT(qdata->qd_id == qunit->lq_data.qd_id && - (qdata->qd_flags & QUOTA_IS_GRP) == - (qunit->lq_data.qd_flags & QUOTA_IS_GRP) && - (qdata->qd_count == qunit->lq_data.qd_count || - qdata->qd_count == 0)); + QDATA_DEBUG(qdata, "qdata: interpret rc(%d).\n", rc); + QDATA_DEBUG((&qunit->lq_data), "lq_data: \n"); - QDATA_DEBUG(qdata, "%s interpret rc(%d).\n", - lustre_msg_get_opc(req->rq_reqmsg) == QUOTA_DQACQ ? - "DQACQ" : "DQREL", rc); + if (qdata->qd_id != qunit->lq_data.qd_id || + OBD_FAIL_CHECK(OBD_FAIL_QUOTA_RET_QDATA)) { + CDEBUG(D_ERROR, "the returned qd_id isn't expected!" + "(qdata: %u, lq_data: %u)\n", qdata->qd_id, + qunit->lq_data.qd_id); + qdata->qd_id = qunit->lq_data.qd_id; + rc = -EPROTO; + } + if (QDATA_IS_GRP(qdata) != QDATA_IS_GRP(&qunit->lq_data)) { + CDEBUG(D_ERROR, "the returned grp/usr isn't expected!" + "(qdata: %u, lq_data: %u)\n", qdata->qd_flags, + qunit->lq_data.qd_flags); + if (QDATA_IS_GRP(&qunit->lq_data)) + QDATA_SET_GRP(qdata); + else + QDATA_CLR_GRP(qdata); + rc = -EPROTO; + } + if (qdata->qd_count > qunit->lq_data.qd_count) { + CDEBUG(D_ERROR, "the returned qd_count isn't expected!" + "(qdata: "LPU64", lq_data: "LPU64")\n", qdata->qd_count, + qunit->lq_data.qd_count); + rc = -EPROTO; + } rc = dqacq_completion(obd, qctxt, qdata, rc, lustre_msg_get_opc(req->rq_reqmsg)); +exit: + up_read(&obt->obt_rwsem); + OBD_FREE(qdata, sizeof(struct qunit_data)); + RETURN(rc); } -static int got_qunit(struct qunit_waiter *waiter) +/** + * check if quota master is online + */ +int check_qm(struct lustre_quota_ctxt *qctxt) { - int rc = 0; + int rc; ENTRY; - spin_lock(&qunit_hash_lock); - rc = list_empty(&waiter->qw_entry); - spin_unlock(&qunit_hash_lock); + + spin_lock(&qctxt->lqc_lock); + /* quit waiting when mds is back or qctxt is cleaned up */ + rc = qctxt->lqc_import || !qctxt->lqc_valid; + spin_unlock(&qctxt->lqc_lock); + + RETURN(rc); +} + +static int got_qunit(struct lustre_qunit *qunit) +{ + int rc; + ENTRY; + + spin_lock(&qunit->lq_lock); + switch (qunit->lq_state) { + case QUNIT_IN_HASH: + case QUNIT_RM_FROM_HASH: + rc = 0; + break; + case QUNIT_FINISHED: + rc = 1; + break; + default: + rc = 0; + CERROR("invalid qunit state %d\n", qunit->lq_state); + } + spin_unlock(&qunit->lq_lock); RETURN(rc); } static int -schedule_dqacq(struct obd_device *obd, - struct lustre_quota_ctxt *qctxt, - struct qunit_data *qdata, int opc, int wait) +schedule_dqacq(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, + struct qunit_data *qdata, int opc, int wait, + struct obd_trans_info *oti) { struct lustre_qunit *qunit, *empty; - struct qunit_waiter qw; struct l_wait_info lwi = { 0 }; struct ptlrpc_request *req; - struct qunit_data *reqdata; struct dqacq_async_args *aa; - unsigned long factor; + struct obd_import *imp = NULL; + struct lustre_qunit_size *lqs = NULL; + struct timeval work_start; + struct timeval work_end; + long timediff; int rc = 0; ENTRY; - CFS_INIT_LIST_HEAD(&qw.qw_entry); - init_waitqueue_head(&qw.qw_waitq); - qw.qw_rc = 0; - + LASSERT(opc == QUOTA_DQACQ || opc == QUOTA_DQREL); + do_gettimeofday(&work_start); if ((empty = alloc_qunit(qctxt, qdata, opc)) == NULL) RETURN(-ENOMEM); spin_lock(&qunit_hash_lock); - qunit = dqacq_in_flight(qctxt, qdata); if (qunit) { - if 
(wait) - list_add_tail(&qw.qw_entry, &qunit->lq_waiters); spin_unlock(&qunit_hash_lock); + qunit_put(empty); - free_qunit(empty); goto wait_completion; } qunit = empty; + qunit_get(qunit); insert_qunit_nolock(qctxt, qunit); - if (wait) - list_add_tail(&qw.qw_entry, &qunit->lq_waiters); spin_unlock(&qunit_hash_lock); - LASSERT(qunit); + quota_search_lqs(qdata, NULL, qctxt, &lqs); + if (lqs) { + spin_lock(&lqs->lqs_lock); + quota_compute_lqs(qdata, lqs, 1, (opc == QUOTA_DQACQ) ? 1 : 0); + /* when this qdata returned from mds, it will call lqs_putref */ + lqs_getref(lqs); + spin_unlock(&lqs->lqs_lock); + /* this is for quota_search_lqs */ + lqs_putref(lqs); + } else { + CDEBUG(D_ERROR, "Can't find the lustre qunit size!\n"); + } + QDATA_DEBUG(qdata, "obd(%s): send %s quota req\n", + obd->obd_name, (opc == QUOTA_DQACQ) ? "acq" : "rel"); /* master is going to dqacq/dqrel from itself */ - if (is_master(obd, qctxt, qdata->qd_id, qdata->qd_flags & QUOTA_IS_GRP)) - { + if (is_master(qctxt)) { int rc2; QDATA_DEBUG(qdata, "local %s.\n", opc == QUOTA_DQACQ ? "DQACQ" : "DQREL"); + QDATA_SET_CHANGE_QS(qdata); rc = qctxt->lqc_handler(obd, qdata, opc); rc2 = dqacq_completion(obd, qctxt, qdata, rc, opc); - RETURN((rc && rc != -EDQUOT) ? rc : rc2); + /* this is for qunit_get() */ + qunit_put(qunit); + + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + if (opc == QUOTA_DQACQ) + lprocfs_counter_add(qctxt->lqc_stats, + wait ? LQUOTA_SYNC_ACQ : LQUOTA_ASYNC_ACQ, + timediff); + else + lprocfs_counter_add(qctxt->lqc_stats, + wait ? LQUOTA_SYNC_REL : LQUOTA_ASYNC_REL, + timediff); + RETURN(rc ? rc : rc2); + } + + spin_lock(&qctxt->lqc_lock); + if (!qctxt->lqc_import) { + spin_unlock(&qctxt->lqc_lock); + QDATA_DEBUG(qdata, "lqc_import is invalid.\n"); + + spin_lock(&qunit_hash_lock); + remove_qunit_nolock(qunit); + spin_unlock(&qunit_hash_lock); + + compute_lqs_after_removing_qunit(qunit); + + QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, -EAGAIN); + wake_up_all(&qunit->lq_waitq); + + /* this is for qunit_get() */ + qunit_put(qunit); + /* this for alloc_qunit() */ + qunit_put(qunit); + spin_lock(&qctxt->lqc_lock); + if (wait && !qctxt->lqc_import) { + spin_unlock(&qctxt->lqc_lock); + + LASSERT(oti && oti->oti_thread && + oti->oti_thread->t_watchdog); + + lc_watchdog_disable(oti->oti_thread->t_watchdog); + CDEBUG(D_QUOTA, "sleep for quota master\n"); + l_wait_event(qctxt->lqc_wait_for_qmaster, + check_qm(qctxt), &lwi); + CDEBUG(D_QUOTA, "wake up when quota master is back\n"); + lc_watchdog_touch(oti->oti_thread->t_watchdog); + } else { + spin_unlock(&qctxt->lqc_lock); + } + + RETURN(-EAGAIN); } + imp = class_import_get(qctxt->lqc_import); + spin_unlock(&qctxt->lqc_lock); /* build dqacq/dqrel request */ - LASSERT(qctxt->lqc_import); + LASSERT(imp); - req = ptlrpc_request_alloc_pack(qctxt->lqc_import, &RQF_MDS_QUOTA_DQACQ, + req = ptlrpc_request_alloc_pack(imp, &RQF_MDS_QUOTA_DQACQ, LUSTRE_MDS_VERSION, opc); + class_import_put(imp); if (req == NULL) { + CDEBUG(D_ERROR, "Can't alloc request\n"); dqacq_completion(obd, qctxt, qdata, -ENOMEM, opc); + /* this is for qunit_get() */ + qunit_put(qunit); RETURN(-ENOMEM); } - if (qdata->qd_flags & QUOTA_IS_BLOCK) - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_bunit_sz * - qctxt->lqc_bunit_sz; - else - factor = MAX_QUOTA_COUNT32 / qctxt->lqc_iunit_sz * - qctxt->lqc_iunit_sz; - - LASSERT(!should_translate_quota(qctxt->lqc_import) || - qdata->qd_count <= factor); - if (should_translate_quota(qctxt->lqc_import)) - { - struct qunit_data_old 
*reqdata_old, *tmp; - - reqdata_old = req_capsule_client_get(&req->rq_pill, - &RMF_QUNIT_DATA); - - tmp = lustre_quota_new_to_old(qdata); - *reqdata_old = *tmp; - req_capsule_set_size(&req->rq_pill, &RMF_QUNIT_DATA, RCL_SERVER, - sizeof(*reqdata_old)); - CDEBUG(D_QUOTA, "qd_count is 32bit!\n"); - } else { - reqdata = req_capsule_client_get(&req->rq_pill, - &RMF_QUNIT_DATA); - - *reqdata = *qdata; - req_capsule_set_size(&req->rq_pill, &RMF_QUNIT_DATA, RCL_SERVER, - sizeof(*reqdata)); - CDEBUG(D_QUOTA, "qd_count is 64bit!\n"); - } ptlrpc_request_set_replen(req); + req->rq_no_resend = req->rq_no_delay = 1; + rc = quota_copy_qdata(req, qdata, QUOTA_REQUEST, QUOTA_IMPORT); + if (rc < 0) { + CDEBUG(D_ERROR, "Can't pack qunit_data(rc: %d)\n", rc); + ptlrpc_req_finished(req); + dqacq_completion(obd, qctxt, qdata, -EPROTO, opc); + /* this is for qunit_get() */ + qunit_put(qunit); + RETURN(rc); + } CLASSERT(sizeof(*aa) <= sizeof(req->rq_async_args)); - aa = (struct dqacq_async_args *)&req->rq_async_args; + aa = ptlrpc_req_async_args(req); aa->aa_ctxt = qctxt; aa->aa_qunit = qunit; req->rq_interpret_reply = dqacq_interpret; - ptlrpcd_add_req(req); + ptlrpcd_add_req(req, PSCOPE_OTHER); QDATA_DEBUG(qdata, "%s scheduled.\n", opc == QUOTA_DQACQ ? "DQACQ" : "DQREL"); wait_completion: if (wait && qunit) { struct qunit_data *p = &qunit->lq_data; - QDATA_DEBUG(p, "wait for dqacq.\n"); - l_wait_event(qw.qw_waitq, got_qunit(&qw), &lwi); - if (qw.qw_rc == 0) + QDATA_DEBUG(p, "qunit(%p) is waiting for dqacq.\n", qunit); + l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi); + /* rc = -EAGAIN, it means a quota req is finished; + * rc = -EDQUOT, it means out of quota + * rc = -EBUSY, it means recovery is happening + * other rc < 0, it means real errors, functions who call + * schedule_dqacq should take care of this */ + spin_lock(&qunit->lq_lock); + if (qunit->lq_rc == 0) rc = -EAGAIN; - - CDEBUG(D_QUOTA, "wait dqacq done. (rc:%d)\n", qw.qw_rc); + else + rc = qunit->lq_rc; + spin_unlock(&qunit->lq_lock); + CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n", + qunit, rc); } + + qunit_put(qunit); + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + if (opc == QUOTA_DQACQ) + lprocfs_counter_add(qctxt->lqc_stats, + wait ? LQUOTA_SYNC_ACQ : LQUOTA_ASYNC_ACQ, + timediff); + else + lprocfs_counter_add(qctxt->lqc_stats, + wait ? LQUOTA_SYNC_REL : LQUOTA_ASYNC_REL, + timediff); + RETURN(rc); } int qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, - uid_t uid, gid_t gid, __u32 isblk, int wait) + uid_t uid, gid_t gid, __u32 isblk, int wait, + struct obd_trans_info *oti) { - int ret, rc = 0, i = USRQUOTA; + int rc = 0, i = USRQUOTA; __u32 id[MAXQUOTAS] = { uid, gid }; struct qunit_data qdata[MAXQUOTAS]; ENTRY; @@ -753,20 +998,26 @@ qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, for (i = 0; i < MAXQUOTAS; i++) { qdata[i].qd_id = id[i]; - qdata[i].qd_flags = 0; - qdata[i].qd_flags |= i; - qdata[i].qd_flags |= isblk ? QUOTA_IS_BLOCK : 0; + qdata[i].qd_flags = i; + if (isblk) + QDATA_SET_BLK(&qdata[i]); qdata[i].qd_count = 0; - ret = check_cur_qunit(obd, qctxt, &qdata[i]); - if (ret > 0) { + rc = check_cur_qunit(obd, qctxt, &qdata[i]); + if (rc > 0) { int opc; /* need acquire or release */ - opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL; - ret = split_before_schedule_dqacq(obd, qctxt, &qdata[i], - opc, wait); - if (!rc) - rc = ret; + opc = rc == 1 ? 
QUOTA_DQACQ : QUOTA_DQREL; + rc = schedule_dqacq(obd, qctxt, &qdata[i], opc, + wait,oti); + if (rc < 0) + RETURN(rc); + } else if (wait == 1) { + /* when wait equates 1, that means mds_quota_acquire + * or filter_quota_acquire is calling it. */ + rc = qctxt_wait_pending_dqacq(qctxt, id[i], i, isblk); + if (rc < 0) + RETURN(rc); } } @@ -778,93 +1029,170 @@ qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id, unsigned short type, int isblk) { struct lustre_qunit *qunit = NULL; - struct qunit_waiter qw; struct qunit_data qdata; + struct timeval work_start; + struct timeval work_end; + long timediff; struct l_wait_info lwi = { 0 }; + int rc = 0; ENTRY; - CFS_INIT_LIST_HEAD(&qw.qw_entry); - init_waitqueue_head(&qw.qw_waitq); - qw.qw_rc = 0; - + do_gettimeofday(&work_start); qdata.qd_id = id; - qdata.qd_flags = 0; - qdata.qd_flags |= type; - qdata.qd_flags |= isblk ? QUOTA_IS_BLOCK : 0; + qdata.qd_flags = type; + if (isblk) + QDATA_SET_BLK(&qdata); qdata.qd_count = 0; spin_lock(&qunit_hash_lock); - qunit = dqacq_in_flight(qctxt, &qdata); - if (qunit) - list_add_tail(&qw.qw_entry, &qunit->lq_waiters); - spin_unlock(&qunit_hash_lock); if (qunit) { - struct qunit_data *p = &qdata; - QDATA_DEBUG(p, "wait for dqacq completion.\n"); - l_wait_event(qw.qw_waitq, got_qunit(&qw), &lwi); - QDATA_DEBUG(p, "wait dqacq done. (rc:%d)\n", qw.qw_rc); + struct qunit_data *p = &qunit->lq_data; + + QDATA_DEBUG(p, "qunit(%p) is waiting for dqacq.\n", qunit); + l_wait_event(qunit->lq_waitq, got_qunit(qunit), &lwi); + CDEBUG(D_QUOTA, "qunit(%p) finishes waiting. (rc:%d)\n", + qunit, qunit->lq_rc); + /* keep same as schedule_dqacq() b=17030 */ + spin_lock(&qunit->lq_lock); + if (qunit->lq_rc == 0) + rc = -EAGAIN; + else + rc = qunit->lq_rc; + spin_unlock(&qunit->lq_lock); + /* this is for dqacq_in_flight() */ + qunit_put(qunit); + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, + isblk ? LQUOTA_WAIT_PENDING_BLK_QUOTA : + LQUOTA_WAIT_PENDING_INO_QUOTA, + timediff); + } else { + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, + isblk ? 
LQUOTA_NOWAIT_PENDING_BLK_QUOTA : + LQUOTA_NOWAIT_PENDING_INO_QUOTA, + timediff); } - RETURN(0); + + RETURN(rc); } int -qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb, - dqacq_handler_t handler) +qctxt_init(struct obd_device *obd, dqacq_handler_t handler) { + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + struct obd_device_target *obt = &obd->u.obt; + struct super_block *sb = obt->obt_sb; int rc = 0; ENTRY; + LASSERT(qctxt); + rc = ptlrpcd_addref(); if (rc) RETURN(rc); + cfs_waitq_init(&qctxt->lqc_wait_for_qmaster); + spin_lock_init(&qctxt->lqc_lock); + spin_lock(&qctxt->lqc_lock); qctxt->lqc_handler = handler; qctxt->lqc_sb = sb; + qctxt->lqc_obt = obt; qctxt->lqc_import = NULL; qctxt->lqc_recovery = 0; - qctxt->lqc_atype = 0; - qctxt->lqc_status= 0; + qctxt->lqc_switch_qs = 1; /* Change qunit size in default setting */ + qctxt->lqc_valid = 1; + qctxt->lqc_cqs_boundary_factor = 4; + qctxt->lqc_cqs_least_bunit = PTLRPC_MAX_BRW_SIZE; + qctxt->lqc_cqs_least_iunit = 2; + qctxt->lqc_cqs_qs_factor = 2; + qctxt->lqc_flags = 0; + QUOTA_MASTER_UNREADY(qctxt); qctxt->lqc_bunit_sz = default_bunit_sz; qctxt->lqc_btune_sz = default_bunit_sz / 100 * default_btune_ratio; qctxt->lqc_iunit_sz = default_iunit_sz; qctxt->lqc_itune_sz = default_iunit_sz * default_itune_ratio / 100; + qctxt->lqc_switch_seconds = 300; /* enlarging will wait 5 minutes + * after the last shrinking */ + qctxt->lqc_sync_blk = 0; + spin_unlock(&qctxt->lqc_lock); + + qctxt->lqc_lqs_hash = lustre_hash_init("LQS_HASH", 7, 7, + &lqs_hash_ops, 0); + if (!qctxt->lqc_lqs_hash) { + CERROR("initialize hash lqs for %s error!\n", obd->obd_name); + RETURN(-ENOMEM); + } - RETURN(0); +#ifdef LPROCFS + rc = lquota_proc_setup(obd, is_master(qctxt)); + if (rc) + CERROR("initialize proc for %s error!\n", obd->obd_name); +#endif + + RETURN(rc); } void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force) { struct lustre_qunit *qunit, *tmp; - struct qunit_waiter *qw, *tmp2; + struct list_head tmp_list; + struct obd_device_target *obt = qctxt->lqc_obt; int i; ENTRY; - spin_lock(&qunit_hash_lock); + CFS_INIT_LIST_HEAD(&tmp_list); + + spin_lock(&qctxt->lqc_lock); + qctxt->lqc_valid = 0; + spin_unlock(&qctxt->lqc_lock); + spin_lock(&qunit_hash_lock); for (i = 0; i < NR_DQHASH; i++) { list_for_each_entry_safe(qunit, tmp, &qunit_hash[i], lq_hash) { if (qunit->lq_ctxt != qctxt) continue; - remove_qunit_nolock(qunit); - /* wake up all waiters */ - list_for_each_entry_safe(qw, tmp2, &qunit->lq_waiters, - qw_entry) { - list_del_init(&qw->qw_entry); - qw->qw_rc = 0; - wake_up(&qw->qw_waitq); - } - qunit_put(qunit); + list_add(&qunit->lq_hash, &tmp_list); } } - spin_unlock(&qunit_hash_lock); + list_for_each_entry_safe(qunit, tmp, &tmp_list, lq_hash) { + list_del_init(&qunit->lq_hash); + compute_lqs_after_removing_qunit(qunit); + + /* wake up all waiters */ + QUNIT_SET_STATE_AND_RC(qunit, QUNIT_FINISHED, 0); + wake_up_all(&qunit->lq_waitq); + qunit_put(qunit); + } + + down_write(&obt->obt_rwsem); + lustre_hash_exit(qctxt->lqc_lqs_hash); + qctxt->lqc_lqs_hash = NULL; + up_write(&obt->obt_rwsem); + + /* after qctxt_cleanup, qctxt might be freed, then check_qm() is + * unpredicted. 
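+ * (The sleepers in question are threads parked in schedule_dqacq() via
+ *
+ *        l_wait_event(qctxt->lqc_wait_for_qmaster, check_qm(qctxt), &lwi);
+ *
+ * lqc_valid was cleared at the top of this function, which makes
+ * check_qm() return true, so each signal below releases one sleeper.)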
So we must wait until lqc_wait_for_qmaster is empty */ + while (cfs_waitq_active(&qctxt->lqc_wait_for_qmaster)) { + cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster); + cfs_schedule_timeout(CFS_TASK_INTERRUPTIBLE, + cfs_time_seconds(1)); + } + ptlrpcd_decref(); +#ifdef LPROCFS + if (lquota_proc_cleanup(qctxt)) + CERROR("cleanup proc error!\n"); +#endif + EXIT; } @@ -907,7 +1235,7 @@ static int qslave_recovery_main(void *arg) LASSERT(dqopt->files[type] != NULL); CFS_INIT_LIST_HEAD(&id_list); -#ifndef KERNEL_SUPPORTS_QUOTA_READ +#ifndef KERNEL_SUPPORTS_QUOTA_READ rc = fsfilt_qids(obd, dqopt->files[type], NULL, type, &id_list); #else rc = fsfilt_qids(obd, NULL, dqopt->files[type], type, &id_list); @@ -919,24 +1247,27 @@ static int qslave_recovery_main(void *arg) list_for_each_entry_safe(dqid, tmp, &id_list, di_link) { list_del_init(&dqid->di_link); /* skip slave recovery on itself */ - if (is_master(obd, qctxt, dqid->di_id, type)) + if (is_master(qctxt)) goto free; if (rc && rc != -EBUSY) goto free; qdata.qd_id = dqid->di_id; - qdata.qd_flags = 0; - qdata.qd_flags |= type; - qdata.qd_flags |= QUOTA_IS_BLOCK; + qdata.qd_flags = type; + QDATA_SET_BLK(&qdata); qdata.qd_count = 0; ret = check_cur_qunit(obd, qctxt, &qdata); if (ret > 0) { int opc; opc = ret == 1 ? QUOTA_DQACQ : QUOTA_DQREL; - rc = split_before_schedule_dqacq(obd, qctxt, &qdata, opc, 0); - } else + rc = schedule_dqacq(obd, qctxt, &qdata, opc, + 0, NULL); + if (rc == -EDQUOT) + rc = 0; + } else { rc = 0; + } if (rc) CDEBUG(rc == -EBUSY ? D_QUOTA : D_ERROR, @@ -974,3 +1305,102 @@ qslave_start_recovery(struct obd_device *obd, struct lustre_quota_ctxt *qctxt) exit: EXIT; } + + +/** + * lqs<->qctxt hash operations + */ + +/** + * string hashing using djb2 hash algorithm + */ +static unsigned +lqs_hash(lustre_hash_t *lh, void *key, unsigned mask) +{ + struct quota_adjust_qunit *lqs_key; + unsigned hash; + ENTRY; + + LASSERT(key); + lqs_key = (struct quota_adjust_qunit *)key; + hash = (QAQ_IS_GRP(lqs_key) ? 5381 : 5387) * lqs_key->qaq_id; + + RETURN(hash & mask); +} + +static int +lqs_compare(void *key, struct hlist_node *hnode) +{ + struct quota_adjust_qunit *lqs_key; + struct lustre_qunit_size *q; + int rc; + ENTRY; + + LASSERT(key); + lqs_key = (struct quota_adjust_qunit *)key; + q = hlist_entry(hnode, struct lustre_qunit_size, lqs_hash); + + spin_lock(&q->lqs_lock); + rc = ((lqs_key->qaq_id == q->lqs_id) && + (QAQ_IS_GRP(lqs_key) == LQS_IS_GRP(q))); + spin_unlock(&q->lqs_lock); + + RETURN(rc); +} + +static void * +lqs_get(struct hlist_node *hnode) +{ + struct lustre_qunit_size *q = + hlist_entry(hnode, struct lustre_qunit_size, lqs_hash); + ENTRY; + + atomic_inc(&q->lqs_refcount); + CDEBUG(D_QUOTA, "lqs=%p refcount %d\n", + q, atomic_read(&q->lqs_refcount)); + + RETURN(q); +} + +static void * +lqs_put(struct hlist_node *hnode) +{ + struct lustre_qunit_size *q = + hlist_entry(hnode, struct lustre_qunit_size, lqs_hash); + ENTRY; + + LASSERT(atomic_read(&q->lqs_refcount) > 0); + atomic_dec(&q->lqs_refcount); + CDEBUG(D_QUOTA, "lqs=%p refcount %d\n", + q, atomic_read(&q->lqs_refcount)); + + RETURN(q); +} + +static void +lqs_exit(struct hlist_node *hnode) +{ + struct lustre_qunit_size *q; + ENTRY; + + q = hlist_entry(hnode, struct lustre_qunit_size, lqs_hash); + /* + * Nothing should be left. User of lqs put it and + * lqs also was deleted from table by this time + * so we should have 0 refs. 
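+ * (Every lookup must already be balanced by the time the table is torn
+ * down, along the lines of
+ *
+ *        quota_search_lqs(qdata, NULL, qctxt, &lqs);   - takes a ref
+ *        ...
+ *        lqs_putref(lqs);                              - drops it
+ *
+ * lustre_hash_exit() runs this callback for entries still present, and
+ * the LASSERTF below fires if a reference was leaked.)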
+ */ + LASSERTF(atomic_read(&q->lqs_refcount) == 0, + "Busy lqs %p with %d refs\n", q, + atomic_read(&q->lqs_refcount)); + OBD_FREE_PTR(q); + EXIT; +} + +static lustre_hash_ops_t lqs_hash_ops = { + .lh_hash = lqs_hash, + .lh_compare = lqs_compare, + .lh_get = lqs_get, + .lh_put = lqs_put, + .lh_exit = lqs_exit +}; +#endif /* HAVE_QUOTA_SUPPORT */ diff --git a/lustre/quota/quota_ctl.c b/lustre/quota/quota_ctl.c index 2cb9c9d..826e9e5 100644 --- a/lustre/quota/quota_ctl.c +++ b/lustre/quota/quota_ctl.c @@ -36,7 +36,7 @@ #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_MDS +#define DEBUG_SUBSYSTEM S_LQUOTA #ifdef __KERNEL__ # include @@ -44,7 +44,6 @@ # include # include # include -# include # include # include # include @@ -63,19 +62,25 @@ #include #include "quota_internal.h" +#ifdef HAVE_QUOTA_SUPPORT #ifdef __KERNEL__ -int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) +int mds_quota_ctl(struct obd_device *obd, struct obd_export *unused, + struct obd_quotactl *oqctl) { - struct obd_device *obd = exp->exp_obd; + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + struct timeval work_start; + struct timeval work_end; + long timediff; int rc = 0; ENTRY; + do_gettimeofday(&work_start); switch (oqctl->qc_cmd) { case Q_QUOTAON: rc = mds_quota_on(obd, oqctl); break; case Q_QUOTAOFF: - mds_quota_off(obd, oqctl); + rc = mds_quota_off(obd, oqctl); break; case Q_SETINFO: rc = mds_set_dqinfo(obd, oqctl); @@ -93,6 +98,12 @@ int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) case Q_GETOQUOTA: rc = mds_get_obd_quota(obd, oqctl); break; + case LUSTRE_Q_INVALIDATE: + rc = mds_quota_invalidate(obd, oqctl); + break; + case LUSTRE_Q_FINVALIDATE: + rc = mds_quota_finvalidate(obd, oqctl); + break; default: CERROR("%s: unsupported mds_quotactl command: %d\n", obd->obd_name, oqctl->qc_cmd); @@ -103,19 +114,29 @@ int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) CDEBUG(D_INFO, "mds_quotactl admin quota command %d, id %u, " "type %d, failed: rc = %d\n", oqctl->qc_cmd, oqctl->qc_id, oqctl->qc_type, rc); + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, LQUOTA_QUOTA_CTL, timediff); RETURN(rc); } -int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) +int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) { struct obd_device *obd = exp->exp_obd; struct obd_device_target *obt = &obd->u.obt; struct lvfs_run_ctxt saved; + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + struct timeval work_start; + struct timeval work_end; + long timediff; int rc = 0; ENTRY; + do_gettimeofday(&work_start); switch (oqctl->qc_cmd) { + case Q_FINVALIDATE: case Q_QUOTAON: case Q_QUOTAOFF: if (!atomic_dec_and_test(&obt->obt_quotachecking)) { @@ -124,6 +145,12 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) rc = -EBUSY; break; } + if (oqctl->qc_cmd == Q_FINVALIDATE && + (obt->obt_qctxt.lqc_flags & UGQUOTA2LQC(oqctl->qc_type))) { + rc = -EBUSY; + break; + } + oqctl->qc_id = obt->obt_qfmt; /* override qfmt version */ case Q_GETOINFO: case Q_GETOQUOTA: case Q_GETQUOTA: @@ -137,18 +164,21 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) 1); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); + rc = fsfilt_quotactl(obd, obt->obt_sb, oqctl); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - if 
(oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF) { - if (!rc) - obt->obt_qctxt.lqc_status = - (oqctl->qc_cmd == Q_QUOTAON) ? 1 : 0; + if (oqctl->qc_cmd == Q_QUOTAON || oqctl->qc_cmd == Q_QUOTAOFF || + oqctl->qc_cmd == Q_FINVALIDATE) { + if (!rc && oqctl->qc_cmd == Q_QUOTAON) + obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(oqctl->qc_type); + if (!rc && oqctl->qc_cmd == Q_QUOTAOFF) + obt->obt_qctxt.lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type); atomic_inc(&obt->obt_quotachecking); } break; case Q_SETQUOTA: - qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt, + /* currently, it is only used for nullifying the quota */ + qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt, oqctl->qc_id, oqctl->qc_type, 1); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); @@ -170,14 +200,14 @@ int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) LASSERT(oqctl->qc_dqblk.dqb_bsoftlimit == 0); /* There might be a pending dqacq/dqrel (which is going to - * clear stale limits on slave). we should wait for it's + * clear stale limits on slave). we should wait for it's * completion then initialize limits */ - qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt, + qctxt_wait_pending_dqacq(&obd->u.obt.obt_qctxt, oqctl->qc_id, oqctl->qc_type, 1); if (!oqctl->qc_dqblk.dqb_bhardlimit) goto adjust; - + LASSERT(oqctl->qc_dqblk.dqb_bhardlimit == MIN_QLIMIT); push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); @@ -200,8 +230,13 @@ adjust: else gid = oqctl->qc_id; - rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, - uid, gid, 1, 0); + rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, + uid, gid, 1, 0, NULL); + if (rc == -EDQUOT || rc == -EBUSY) { + CDEBUG(D_QUOTA, "rc: %d.\n", rc); + rc = 0; + } + break; } default: @@ -209,30 +244,37 @@ adjust: obd->obd_name, oqctl->qc_cmd); RETURN(-EFAULT); } + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, LQUOTA_QUOTA_CTL, timediff); RETURN(rc); } #endif /* __KERNEL__ */ +#endif -int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) +int client_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) { - struct ptlrpc_request *req; - struct obd_quotactl *oqc; - int ver, opc, rc; + struct ptlrpc_request *req; + struct obd_quotactl *oqc; + const struct req_format *rf; + int ver, opc, rc; ENTRY; if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_MDC_NAME)) { + rf = &RQF_MDS_QUOTACTL; ver = LUSTRE_MDS_VERSION, opc = MDS_QUOTACTL; } else if (!strcmp(exp->exp_obd->obd_type->typ_name, LUSTRE_OSC_NAME)) { + rf = &RQF_OST_QUOTACTL; ver = LUSTRE_OST_VERSION, opc = OST_QUOTACTL; } else { RETURN(-EINVAL); } - req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), - &RQF_MDS_QUOTACTL, ver, opc); + req = ptlrpc_request_alloc_pack(class_exp2cliimp(exp), rf, ver, opc); if (req == NULL) RETURN(-ENOMEM); @@ -242,30 +284,65 @@ int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) ptlrpc_request_set_replen(req); rc = ptlrpc_queue_wait(req); - if (!rc) { + if (rc) { + CERROR("ptlrpc_queue_wait failed, rc: %d\n", rc); + GOTO(out, rc); + } + + oqc = NULL; + if (req->rq_repmsg) oqc = req_capsule_server_get(&req->rq_pill, &RMF_OBD_QUOTACTL); - if (oqc == NULL) - GOTO(out, rc = -EPROTO); - *oqctl = *oqc; + if (oqc == NULL) { + CERROR ("Can't unpack obd_quotactl\n"); + GOTO(out, rc = -EPROTO); } + + *oqctl = *oqc; + EXIT; out: ptlrpc_req_finished(req); - RETURN (rc); + return rc; +} + +/** + * For lmv, only 
need to send request to master MDT, and the master MDT will + * process with other slave MDTs. + */ +int lmv_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) +{ + struct obd_device *obd = class_exp2obd(exp); + struct lmv_obd *lmv = &obd->u.lmv; + struct lmv_tgt_desc *tgt = &lmv->tgts[0]; + int rc; + ENTRY; + + if (!lmv->desc.ld_tgt_count || !tgt->ltd_active) { + CERROR("master lmv inactive\n"); + RETURN(-EIO); + } + + rc = obd_quotactl(tgt->ltd_exp, oqctl); + RETURN(rc); } -int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) +int lov_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl) { struct obd_device *obd = class_exp2obd(exp); struct lov_obd *lov = &obd->u.lov; __u64 curspace = 0; - __u32 bhardlimit = 0; + __u64 bhardlimit = 0; int i, rc = 0; ENTRY; - if (oqctl->qc_cmd != Q_QUOTAON && oqctl->qc_cmd != Q_QUOTAOFF && - oqctl->qc_cmd != Q_GETOQUOTA && oqctl->qc_cmd != Q_INITQUOTA && - oqctl->qc_cmd != Q_SETQUOTA) { + if (oqctl->qc_cmd != LUSTRE_Q_QUOTAON && + oqctl->qc_cmd != LUSTRE_Q_QUOTAOFF && + oqctl->qc_cmd != Q_GETOQUOTA && + oqctl->qc_cmd != Q_INITQUOTA && + oqctl->qc_cmd != LUSTRE_Q_SETQUOTA && + oqctl->qc_cmd != Q_FINVALIDATE) { CERROR("bad quota opc %x for lov obd", oqctl->qc_cmd); RETURN(-EFAULT); } @@ -277,11 +354,10 @@ int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl) if (oqctl->qc_cmd == Q_GETOQUOTA) { CERROR("ost %d is inactive\n", i); rc = -EIO; - break; } else { CDEBUG(D_HA, "ost %d is inactive\n", i); - continue; } + continue; } err = obd_quotactl(lov->lov_tgts[i]->ltd_exp, oqctl); diff --git a/lustre/quota/quota_interface.c b/lustre/quota/quota_interface.c index e035ceb..5e1c51b 100644 --- a/lustre/quota/quota_interface.c +++ b/lustre/quota/quota_interface.c @@ -37,7 +37,7 @@ #ifndef EXPORT_SYMTAB # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_MDS +#define DEBUG_SUBSYSTEM S_LQUOTA #ifdef __KERNEL__ # include @@ -45,11 +45,14 @@ # include # include # include -# include -# include -# include -# include -# include +# if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,5,0)) +# include +# include +# include +# include +# else +# include +# endif #else /* __KERNEL__ */ # include #endif @@ -64,245 +67,12 @@ #include #include "quota_internal.h" - #ifdef __KERNEL__ -/* quota proc file handling functions */ -#ifdef LPROCFS -int lprocfs_rd_bunit(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - LASSERT(obd != NULL); - - return snprintf(page, count, "%lu\n", - obd->u.obt.obt_qctxt.lqc_bunit_sz); -} -EXPORT_SYMBOL(lprocfs_rd_bunit); - -int lprocfs_rd_iunit(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - LASSERT(obd != NULL); - - return snprintf(page, count, "%lu\n", - obd->u.obt.obt_qctxt.lqc_iunit_sz); -} -EXPORT_SYMBOL(lprocfs_rd_iunit); - -int lprocfs_wr_bunit(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - int val, rc; - LASSERT(obd != NULL); - - rc = lprocfs_write_helper(buffer, count, &val); - - if (rc) - return rc; - - if (val % QUOTABLOCK_SIZE || - val <= obd->u.obt.obt_qctxt.lqc_btune_sz) - return -EINVAL; - - obd->u.obt.obt_qctxt.lqc_bunit_sz = val; - return count; -} -EXPORT_SYMBOL(lprocfs_wr_bunit); - -int lprocfs_wr_iunit(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct 
obd_device *obd = (struct obd_device *)data; - int val, rc; - LASSERT(obd != NULL); - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - if (val <= obd->u.obt.obt_qctxt.lqc_itune_sz) - return -EINVAL; - - obd->u.obt.obt_qctxt.lqc_iunit_sz = val; - return count; -} -EXPORT_SYMBOL(lprocfs_wr_iunit); - -int lprocfs_rd_btune(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - LASSERT(obd != NULL); - - return snprintf(page, count, "%lu\n", - obd->u.obt.obt_qctxt.lqc_btune_sz); -} -EXPORT_SYMBOL(lprocfs_rd_btune); - -int lprocfs_rd_itune(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - LASSERT(obd != NULL); - return snprintf(page, count, "%lu\n", - obd->u.obt.obt_qctxt.lqc_itune_sz); -} -EXPORT_SYMBOL(lprocfs_rd_itune); - -int lprocfs_wr_btune(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - int val, rc; - LASSERT(obd != NULL); - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - if (val <= QUOTABLOCK_SIZE * MIN_QLIMIT || val % QUOTABLOCK_SIZE || - val >= obd->u.obt.obt_qctxt.lqc_bunit_sz) - return -EINVAL; - - obd->u.obt.obt_qctxt.lqc_btune_sz = val; - return count; -} -EXPORT_SYMBOL(lprocfs_wr_btune); +#ifdef HAVE_QUOTA_SUPPORT -int lprocfs_wr_itune(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - int val, rc; - LASSERT(obd != NULL); - - rc = lprocfs_write_helper(buffer, count, &val); - if (rc) - return rc; - - if (val <= MIN_QLIMIT || - val >= obd->u.obt.obt_qctxt.lqc_iunit_sz) - return -EINVAL; - - obd->u.obt.obt_qctxt.lqc_itune_sz = val; - return count; -} -EXPORT_SYMBOL(lprocfs_wr_itune); - -#define USER_QUOTA 1 -#define GROUP_QUOTA 2 - -#define MAX_STYPE_SIZE 4 -int lprocfs_rd_type(char *page, char **start, off_t off, int count, - int *eof, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - char stype[MAX_STYPE_SIZE + 1] = ""; - int type = obd->u.obt.obt_qctxt.lqc_atype; - LASSERT(obd != NULL); - - if (type == 0) { - strcpy(stype, "off"); - } else { - if (type & USER_QUOTA) - strcat(stype, "u"); - if (type & GROUP_QUOTA) - strcat(stype, "g"); - } - - return snprintf(page, count, "%s\n", stype); -} -EXPORT_SYMBOL(lprocfs_rd_type); - -static int auto_quota_on(struct obd_device *obd, int type, - struct super_block *sb, int is_master) -{ - struct obd_quotactl *oqctl; - struct lvfs_run_ctxt saved; - int rc; - ENTRY; - - LASSERT(type == USRQUOTA || type == GRPQUOTA || type == UGQUOTA); - - /* quota already turned on */ - if (obd->u.obt.obt_qctxt.lqc_status) - RETURN(0); - - OBD_ALLOC_PTR(oqctl); - if (!oqctl) - RETURN(-ENOMEM); - - oqctl->qc_type = type; - oqctl->qc_cmd = Q_QUOTAON; - oqctl->qc_id = QFMT_LDISKFS; - - push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - - if (!is_master) - goto local_quota; - - /* turn on cluster wide quota */ - rc = mds_admin_quota_on(obd, oqctl); - if (rc) { - CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, - "auto-enable admin quota failed. rc=%d\n", rc); - GOTO(out_pop, rc); - } -local_quota: - /* turn on local quota */ - rc = fsfilt_quotactl(obd, sb, oqctl); - if (rc) { - CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, - "auto-enable local quota failed. 
rc=%d\n", rc); - if (is_master) - mds_quota_off(obd, oqctl); - } else { - obd->u.obt.obt_qctxt.lqc_status = 1; - } -out_pop: - pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); - - OBD_FREE_PTR(oqctl); - RETURN(rc); -} - - -int lprocfs_wr_type(struct file *file, const char *buffer, - unsigned long count, void *data) -{ - struct obd_device *obd = (struct obd_device *)data; - struct obd_device_target *obt = &obd->u.obt; - int type = 0; - char stype[MAX_STYPE_SIZE + 1] = ""; - LASSERT(obd != NULL); - - if (copy_from_user(stype, buffer, MAX_STYPE_SIZE)) - return -EFAULT; - - if (strchr(stype, 'u')) - type |= USER_QUOTA; - if (strchr(stype, 'g')) - type |= GROUP_QUOTA; - - obt->obt_qctxt.lqc_atype = type; - - if (type == 0) - return count; - - if (!strcmp(obd->obd_type->typ_name, LUSTRE_MDS_NAME)) - auto_quota_on(obd, type - 1, obt->obt_sb, 1); - else if (!strcmp(obd->obd_type->typ_name, LUSTRE_OST_NAME)) - auto_quota_on(obd, type - 1, obt->obt_sb, 0); - else - return -EFAULT; - - return count; -} -EXPORT_SYMBOL(lprocfs_wr_type); -#endif /* LPROCFS */ +static cfs_time_t last_print = 0; +static spinlock_t last_print_lock = SPIN_LOCK_UNLOCKED; static int filter_quota_setup(struct obd_device *obd) { @@ -310,41 +80,73 @@ static int filter_quota_setup(struct obd_device *obd) struct obd_device_target *obt = &obd->u.obt; ENTRY; + init_rwsem(&obt->obt_rwsem); + obt->obt_qfmt = LUSTRE_QUOTA_V2; atomic_set(&obt->obt_quotachecking, 1); - rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, NULL); - if (rc) { + rc = qctxt_init(obd, NULL); + if (rc) CERROR("initialize quota context failed! (rc:%d)\n", rc); - RETURN(rc); - } RETURN(rc); } static int filter_quota_cleanup(struct obd_device *obd) { + ENTRY; qctxt_cleanup(&obd->u.obt.obt_qctxt, 0); - return 0; + RETURN(0); } -static int filter_quota_setinfo(struct obd_export *exp, struct obd_device *obd) +static int filter_quota_setinfo(struct obd_device *obd, void *data) { + struct obd_export *exp = data; + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; struct obd_import *imp; + ENTRY; /* setup the quota context import */ - obd->u.obt.obt_qctxt.lqc_import = exp->exp_imp_reverse; + spin_lock(&qctxt->lqc_lock); + qctxt->lqc_import = exp->exp_imp_reverse; + spin_unlock(&qctxt->lqc_lock); + CDEBUG(D_QUOTA, "%s: lqc_import(%p) of obd(%p) is reactivated now, \n", + obd->obd_name,exp->exp_imp_reverse, obd); - /* make imp's connect flags equal relative exp's connect flags + /* make imp's connect flags equal relative exp's connect flags * adding it to avoid the scan export list */ - imp = exp->exp_imp_reverse; - if (imp) - imp->imp_connect_data.ocd_connect_flags |= - (exp->exp_connect_flags & OBD_CONNECT_QUOTA64); + imp = qctxt->lqc_import; + if (likely(imp)) + imp->imp_connect_data.ocd_connect_flags |= + (exp->exp_connect_flags & + (OBD_CONNECT_QUOTA64 | OBD_CONNECT_CHANGE_QS)); + cfs_waitq_signal(&qctxt->lqc_wait_for_qmaster); /* start quota slave recovery thread. 
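
filter_quota_setinfo() above now publishes the reverse import under lqc_lock instead of assigning it bare, and filter_quota_clearinfo() (next hunk) invalidates it the same way. A userspace sketch of that publish/invalidate pattern, with a pthread mutex standing in for the spinlock and every name invented:

    #include <pthread.h>
    #include <stddef.h>

    struct import;                     /* opaque stand-in for obd_import */

    struct qctx_model {                /* models lustre_quota_ctxt */
        pthread_mutex_t lock;          /* models lqc_lock */
        struct import *import;         /* models lqc_import */
    };

    /* setinfo side: publish the reverse import for the dqacq path */
    static void publish_import(struct qctx_model *c, struct import *imp)
    {
        pthread_mutex_lock(&c->lock);
        c->import = imp;
        pthread_mutex_unlock(&c->lock);
    }

    /* clearinfo side: clear only if it still points at this export's import */
    static void clear_import(struct qctx_model *c, struct import *imp)
    {
        pthread_mutex_lock(&c->lock);
        if (c->import == imp)
            c->import = NULL;
        pthread_mutex_unlock(&c->lock);
    }
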
(release high limits) */ - qslave_start_recovery(obd, &obd->u.obt.obt_qctxt); - return 0; + qslave_start_recovery(obd, qctxt); + RETURN(0); } + +static int filter_quota_clearinfo(struct obd_export *exp, struct obd_device *obd) +{ + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + ENTRY; + + /* lquota may be not set up before destroying export, b=14896 */ + if (!obd->obd_set_up) + RETURN(0); + + /* when exp->exp_imp_reverse is destroyed, the corresponding lqc_import + * should be invalid b=12374 */ + if (qctxt->lqc_import && qctxt->lqc_import == exp->exp_imp_reverse) { + spin_lock(&qctxt->lqc_lock); + qctxt->lqc_import = NULL; + spin_unlock(&qctxt->lqc_lock); + CDEBUG(D_QUOTA, "%s: lqc_import of obd(%p) is invalid now.\n", + obd->obd_name, obd); + } + RETURN(0); +} + static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore) { ENTRY; @@ -352,10 +154,12 @@ static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore) if (!sb_any_quota_enabled(obd->u.obt.obt_sb)) RETURN(0); - if (ignore) + if (ignore) { + CDEBUG(D_QUOTA, "blocks will be written with ignoring quota.\n"); cfs_cap_raise(CFS_CAP_SYS_RESOURCE); - else + } else { cfs_cap_lower(CFS_CAP_SYS_RESOURCE); + } RETURN(0); } @@ -363,6 +167,7 @@ static int filter_quota_enforce(struct obd_device *obd, unsigned int ignore) static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa) { struct obd_device_target *obt = &obd->u.obt; + struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt; int err, cnt, rc = 0; struct obd_quotactl *oqctl; ENTRY; @@ -370,15 +175,42 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa) if (!sb_any_quota_enabled(obt->obt_sb)) RETURN(0); - oa->o_flags &= ~(OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA); - OBD_ALLOC_PTR(oqctl); if (!oqctl) { CERROR("Not enough memory!"); RETURN(-ENOMEM); } + /* set over quota flags for a uid/gid */ + oa->o_valid |= OBD_MD_FLUSRQUOTA | OBD_MD_FLGRPQUOTA; + oa->o_flags &= ~(OBD_FL_NO_USRQUOTA | OBD_FL_NO_GRPQUOTA); + for (cnt = 0; cnt < MAXQUOTAS; cnt++) { + struct quota_adjust_qunit oqaq_tmp; + struct lustre_qunit_size *lqs = NULL; + + oqaq_tmp.qaq_flags = cnt; + oqaq_tmp.qaq_id = (cnt == USRQUOTA) ? oa->o_uid : oa->o_gid; + + quota_search_lqs(NULL, &oqaq_tmp, qctxt, &lqs); + if (lqs) { + spin_lock(&lqs->lqs_lock); + if (lqs->lqs_bunit_sz <= qctxt->lqc_sync_blk) { + oa->o_flags |= (cnt == USRQUOTA) ? + OBD_FL_NO_USRQUOTA : OBD_FL_NO_GRPQUOTA; + spin_unlock(&lqs->lqs_lock); + CDEBUG(D_QUOTA, "set sync flag: bunit(%lu), " + "sync_blk(%d)\n", lqs->lqs_bunit_sz, + qctxt->lqc_sync_blk); + /* this is for quota_search_lqs */ + lqs_putref(lqs); + continue; + } + spin_unlock(&lqs->lqs_lock); + /* this is for quota_search_lqs */ + lqs_putref(lqs); + } + memset(oqctl, 0, sizeof(*oqctl)); oqctl->qc_cmd = Q_GETQUOTA; @@ -388,14 +220,13 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa) if (err) { if (!rc) rc = err; + oa->o_valid &= ~((cnt == USRQUOTA) ? OBD_MD_FLUSRQUOTA : + OBD_MD_FLGRPQUOTA); continue; } - /* set over quota flags for a uid/gid */ - oa->o_valid |= (cnt == USRQUOTA) ? - OBD_MD_FLUSRQUOTA : OBD_MD_FLGRPQUOTA; if (oqctl->qc_dqblk.dqb_bhardlimit && - (toqb(oqctl->qc_dqblk.dqb_curspace) > + (toqb(oqctl->qc_dqblk.dqb_curspace) >= oqctl->qc_dqblk.dqb_bhardlimit)) oa->o_flags |= (cnt == USRQUOTA) ? 
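
filter_quota_getflag() above marks a uid/gid with OBD_FL_NO_*QUOTA once its charged space reaches the hard limit; note the comparison is widened from > to >=, so hitting the limit exactly now raises the flag as well. A standalone model of that test (toqb() rounds bytes up to 1 KiB quota blocks on Linux; the helper name is assumed):

    #include <stdbool.h>
    #include <stdint.h>

    /* curspace in bytes, bhardlimit in quota blocks, as in obd_dqblk */
    static bool over_hard_limit(uint64_t curspace, uint64_t bhardlimit)
    {
        uint64_t curblocks = (curspace + 1023) >> 10;   /* models toqb() */

        return bhardlimit && curblocks >= bhardlimit;
    }
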
OBD_FL_NO_USRQUOTA : OBD_FL_NO_GRPQUOTA; @@ -404,58 +235,288 @@ static int filter_quota_getflag(struct obd_device *obd, struct obdo *oa) RETURN(rc); } -static int filter_quota_acquire(struct obd_device *obd, unsigned int uid, - unsigned int gid) +/** + * check whether the left quota of certain uid and gid can satisfy a block_write + * or inode_create rpc. When need to acquire quota, return QUOTA_RET_ACQUOTA + */ +static int quota_check_common(struct obd_device *obd, unsigned int uid, + unsigned int gid, int count, int cycle, int isblk) { struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; - int rc; + int i; + __u32 id[MAXQUOTAS] = { uid, gid }; + struct qunit_data qdata[MAXQUOTAS]; + int rc = 0, rc2[2] = { 0, 0 }; ENTRY; - rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, 1, 1); - RETURN(rc == -EAGAIN); -} + CLASSERT(MAXQUOTAS < 4); + if (!sb_any_quota_enabled(qctxt->lqc_sb)) + RETURN(rc); -static int mds_quota_init(void) -{ - return lustre_dquot_init(); + spin_lock(&qctxt->lqc_lock); + if (!qctxt->lqc_valid){ + spin_unlock(&qctxt->lqc_lock); + RETURN(rc); + } + spin_unlock(&qctxt->lqc_lock); + + for (i = 0; i < MAXQUOTAS; i++) { + struct lustre_qunit_size *lqs = NULL; + + qdata[i].qd_id = id[i]; + qdata[i].qd_flags = i; + if (isblk) + QDATA_SET_BLK(&qdata[i]); + qdata[i].qd_count = 0; + + /* ignore root user */ + if (qdata[i].qd_id == 0 && !QDATA_IS_GRP(&qdata[i])) + continue; + + quota_search_lqs(&qdata[i], NULL, qctxt, &lqs); + if (!lqs) + continue; + + rc2[i] = compute_remquota(obd, qctxt, &qdata[i], isblk); + spin_lock(&lqs->lqs_lock); + if (!cycle) { + rc = QUOTA_RET_INC_PENDING; + if (isblk) + lqs->lqs_bwrite_pending += count; + else + lqs->lqs_iwrite_pending += count; + } + if (rc2[i] == QUOTA_RET_OK) { + if (isblk && qdata[i].qd_count < + lqs->lqs_bwrite_pending * CFS_PAGE_SIZE) + rc2[i] = QUOTA_RET_ACQUOTA; + if (!isblk && qdata[i].qd_count < + lqs->lqs_iwrite_pending) + rc2[i] = QUOTA_RET_ACQUOTA; + } + spin_unlock(&lqs->lqs_lock); + CDEBUG(D_QUOTA, "count: %d, write pending: %lu, qd_count: "LPU64 + ".\n", count, + isblk ? lqs->lqs_bwrite_pending : lqs->lqs_iwrite_pending, + qdata[i].qd_count); + + /* When cycle is zero, lqs_*_pending will be changed. 
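
quota_check_common() above admits `count` pages into lqs_bwrite_pending on the first pass (cycle == 0) and asks the master for more quota whenever the remaining allowance (qd_count, from compute_remquota) cannot cover everything already in flight. A compact model of the block branch, with PAGE_SIZE standing in for CFS_PAGE_SIZE and all names assumed:

    #include <stdbool.h>
    #include <stdint.h>

    #define PAGE_SIZE 4096ULL          /* stands in for CFS_PAGE_SIZE */

    static bool need_acquire_blk(uint64_t remaining,        /* qd_count, bytes */
                                 uint64_t *bwrite_pending,  /* pages in flight */
                                 unsigned count, bool first_pass)
    {
        if (first_pass)                /* cycle == 0: account the new writes */
            *bwrite_pending += count;
        /* acquire when the allowance cannot cover all pending pages */
        return remaining < *bwrite_pending * PAGE_SIZE;
    }
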
We will + * get reference of the lqs here and put reference of lqs in + * quota_pending_commit b=14784 */ + if (!cycle) + lqs_getref(lqs); + + /* this is for quota_search_lqs */ + lqs_putref(lqs); + } + + if (rc2[0] == QUOTA_RET_ACQUOTA || rc2[1] == QUOTA_RET_ACQUOTA) + RETURN(rc | QUOTA_RET_ACQUOTA); + else + RETURN(rc); } -static int mds_quota_exit(void) +static int quota_chk_acq_common(struct obd_device *obd, unsigned int uid, + unsigned int gid, int count, int *pending, + quota_acquire acquire, + struct obd_trans_info *oti, int isblk) { - lustre_dquot_exit(); - return 0; + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + struct timeval work_start; + struct timeval work_end; + long timediff; + struct l_wait_info lwi = { 0 }; + int rc = 0, cycle = 0, count_err = 1; + ENTRY; + + CDEBUG(D_QUOTA, "check quota for %s\n", obd->obd_name); + *pending = 0; + /* Unfortunately, if quota master is too busy to handle the + * pre-dqacq in time and quota hash on ost is used up, we + * have to wait for the completion of in flight dqacq/dqrel, + * in order to get enough quota for write b=12588 */ + do_gettimeofday(&work_start); + while ((rc = quota_check_common(obd, uid, gid, count, cycle, isblk)) & + QUOTA_RET_ACQUOTA) { + + spin_lock(&qctxt->lqc_lock); + if (!qctxt->lqc_import && oti) { + spin_unlock(&qctxt->lqc_lock); + + LASSERT(oti && oti->oti_thread && + oti->oti_thread->t_watchdog); + + lc_watchdog_disable(oti->oti_thread->t_watchdog); + CDEBUG(D_QUOTA, "sleep for quota master\n"); + l_wait_event(qctxt->lqc_wait_for_qmaster, check_qm(qctxt), + &lwi); + CDEBUG(D_QUOTA, "wake up when quota master is back\n"); + lc_watchdog_touch(oti->oti_thread->t_watchdog); + } else { + spin_unlock(&qctxt->lqc_lock); + } + + if (rc & QUOTA_RET_INC_PENDING) + *pending = 1; + + cycle++; + if (isblk) + OBD_FAIL_TIMEOUT(OBD_FAIL_OST_HOLD_WRITE_RPC, 90); + /* after acquire(), we should run quota_check_common again + * so that we confirm there are enough quota to finish write */ + rc = acquire(obd, uid, gid, oti, isblk); + + /* please reference to dqacq_completion for the below */ + /* a new request is finished, try again */ + if (rc == -EAGAIN) { + CDEBUG(D_QUOTA, "finish a quota req, try again\n"); + continue; + } + + /* it is out of quota already */ + if (rc == -EDQUOT) { + CDEBUG(D_QUOTA, "out of quota, return -EDQUOT\n"); + break; + } + + /* -EBUSY and others, wait a second and try again */ + if (rc < 0) { + cfs_waitq_t waitq; + struct l_wait_info lwi; + + if (oti && oti->oti_thread && oti->oti_thread->t_watchdog) + lc_watchdog_touch(oti->oti_thread->t_watchdog); + CDEBUG(D_QUOTA, "rc: %d, count_err: %d\n", rc, + count_err++); + + init_waitqueue_head(&waitq); + lwi = LWI_TIMEOUT(cfs_time_seconds(min(cycle, 10)), NULL, + NULL); + l_wait_event(waitq, 0, &lwi); + } + + if (rc < 0 || cycle % 10 == 2) { + spin_lock(&last_print_lock); + if (last_print == 0 || + cfs_time_before((last_print + cfs_time_seconds(30)), + cfs_time_current())) { + last_print = cfs_time_current(); + spin_unlock(&last_print_lock); + CWARN("still haven't managed to acquire quota " + "space from the quota master after %d " + "retries (err=%d, rc=%d)\n", + cycle, count_err - 1, rc); + } else { + spin_unlock(&last_print_lock); + } + } + + CDEBUG(D_QUOTA, "recheck quota with rc: %d, cycle: %d\n", rc, + cycle); + } + + if (!cycle && rc & QUOTA_RET_INC_PENDING) + *pending = 1; + + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, + isblk ? 
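
quota_chk_acq_common() above runs a re-check/acquire loop: -EAGAIN retries immediately because an in-flight dqacq completed, -EDQUOT gives up, and any other failure backs off for min(cycle, 10) seconds before the next attempt, warning periodically if the master stays unresponsive. The shape of that loop modelled in plain C; sleep() replaces the Lustre waitqueue machinery and all names are assumed:

    #include <errno.h>
    #include <unistd.h>

    static int acquire_with_retry(int (*check)(void *), int (*acquire)(void *),
                                  void *arg)
    {
        int cycle = 0, rc = 0;

        while (check(arg)) {           /* still QUOTA_RET_ACQUOTA: short */
            cycle++;
            rc = acquire(arg);
            if (rc == -EAGAIN)         /* an in-flight dqacq completed */
                continue;              /* re-check immediately */
            if (rc == -EDQUOT)         /* genuinely out of quota */
                break;
            if (rc < 0)                /* -EBUSY and friends: capped backoff */
                sleep(cycle < 10 ? (unsigned)cycle : 10);
        }
        return rc;
    }
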
LQUOTA_WAIT_FOR_CHK_BLK : + LQUOTA_WAIT_FOR_CHK_INO, + timediff); + + RETURN(rc); } -/* check whether the left quota of certain uid and uid can satisfy a write rpc - * when need to acquire quota, return QUOTA_RET_ACQUOTA */ -static int filter_quota_check(struct obd_device *obd, unsigned int uid, - unsigned int gid, int npage) +/** + * when a block_write or inode_create rpc is finished, adjust the record for + * pending blocks and inodes + */ +static int quota_pending_commit(struct obd_device *obd, unsigned int uid, + unsigned int gid, int count, int isblk) { struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + struct timeval work_start; + struct timeval work_end; + long timediff; int i; __u32 id[MAXQUOTAS] = { uid, gid }; struct qunit_data qdata[MAXQUOTAS]; - int rc; ENTRY; + CDEBUG(D_QUOTA, "commit pending quota for %s\n", obd->obd_name); CLASSERT(MAXQUOTAS < 4); if (!sb_any_quota_enabled(qctxt->lqc_sb)) RETURN(0); + do_gettimeofday(&work_start); for (i = 0; i < MAXQUOTAS; i++) { + struct lustre_qunit_size *lqs = NULL; + qdata[i].qd_id = id[i]; qdata[i].qd_flags = i; - qdata[i].qd_flags |= QUOTA_IS_BLOCK; + if (isblk) + QDATA_SET_BLK(&qdata[i]); qdata[i].qd_count = 0; - qctxt_wait_pending_dqacq(qctxt, id[i], i, 1); - rc = compute_remquota(obd, qctxt, &qdata[i]); - if (rc == QUOTA_RET_OK && - qdata[i].qd_count < npage * CFS_PAGE_SIZE) - RETURN(QUOTA_RET_ACQUOTA); + if (qdata[i].qd_id == 0 && !QDATA_IS_GRP(&qdata[i])) + continue; + + quota_search_lqs(&qdata[i], NULL, qctxt, &lqs); + if (lqs) { + int flag = 0; + CDEBUG(D_QUOTA, "pending: %lu, count: %d.\n", + isblk ? lqs->lqs_bwrite_pending : + lqs->lqs_iwrite_pending, count); + spin_lock(&lqs->lqs_lock); + if (isblk) { + if (lqs->lqs_bwrite_pending >= count) { + lqs->lqs_bwrite_pending -= count; + spin_unlock(&lqs->lqs_lock); + flag = 1; + } else { + spin_unlock(&lqs->lqs_lock); + CDEBUG(D_ERROR, + "there are too many blocks!\n"); + } + } else { + if (lqs->lqs_iwrite_pending >= count) { + lqs->lqs_iwrite_pending -= count; + spin_unlock(&lqs->lqs_lock); + flag = 1; + } else { + spin_unlock(&lqs->lqs_lock); + CDEBUG(D_ERROR, + "there are too many files!\n"); + } + } + + lqs_putref(lqs); + /* When lqs_*_pening is changed back, we'll putref lqs + * here b=14784 */ + if (flag) + lqs_putref(lqs); + } } + do_gettimeofday(&work_end); + timediff = cfs_timeval_sub(&work_end, &work_start, NULL); + lprocfs_counter_add(qctxt->lqc_stats, + isblk ? LQUOTA_WAIT_FOR_COMMIT_BLK : + LQUOTA_WAIT_FOR_COMMIT_INO, + timediff); - RETURN(rc); + RETURN(0); +} + +static int mds_quota_init(void) +{ + return lustre_dquot_init(); +} + +static int mds_quota_exit(void) +{ + lustre_dquot_exit(); + return 0; } static int mds_quota_setup(struct obd_device *obd) @@ -465,41 +526,83 @@ static int mds_quota_setup(struct obd_device *obd) int rc; ENTRY; + if (unlikely(mds->mds_quota)) { + CWARN("try to reinitialize quota context!\n"); + RETURN(0); + } + + init_rwsem(&obt->obt_rwsem); + obt->obt_qfmt = LUSTRE_QUOTA_V2; + mds->mds_quota_info.qi_version = LUSTRE_QUOTA_V2; atomic_set(&obt->obt_quotachecking, 1); /* initialize quota master and quota context */ sema_init(&mds->mds_qonoff_sem, 1); - rc = qctxt_init(&obt->obt_qctxt, obt->obt_sb, dqacq_handler); + rc = qctxt_init(obd, dqacq_handler); if (rc) { CERROR("initialize quota context failed! 
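
quota_pending_commit() above undoes the accounting from check time: the pending counter shrinks by the committed count, and the extra lqs reference pinned in quota_check_common() (b=14784) is dropped only when the counter really covered it. A minimal model of that balanced pair; struct and helper names are invented:

    #include <stdint.h>

    struct lqs_model { int refcount; uint64_t bwrite_pending; };

    /* check side: account the writes and pin the lqs until commit */
    static void pending_get(struct lqs_model *l, unsigned count)
    {
        l->bwrite_pending += count;
        l->refcount++;                 /* dropped again in pending_put() */
    }

    /* commit side: release the writes and, if consistent, the pin */
    static void pending_put(struct lqs_model *l, unsigned count)
    {
        if (l->bwrite_pending >= count) {
            l->bwrite_pending -= count;
            l->refcount--;             /* the reference from pending_get() */
        }                              /* else: accounting bug, keep the pin */
    }
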
(rc:%d)\n", rc); RETURN(rc); } - + mds->mds_quota = 1; RETURN(rc); } static int mds_quota_cleanup(struct obd_device *obd) { + ENTRY; + if (unlikely(!obd->u.mds.mds_quota)) + RETURN(0); + qctxt_cleanup(&obd->u.obt.obt_qctxt, 0); RETURN(0); } +static int mds_quota_setinfo(struct obd_device *obd, void *data) +{ + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + ENTRY; + + if (unlikely(!obd->u.mds.mds_quota)) + RETURN(0); + + if (data != NULL) + QUOTA_MASTER_READY(qctxt); + else + QUOTA_MASTER_UNREADY(qctxt); + RETURN(0); +} + static int mds_quota_fs_cleanup(struct obd_device *obd) { struct mds_obd *mds = &obd->u.mds; - int i; + struct obd_quotactl oqctl; ENTRY; - /* close admin quota files */ + if (unlikely(!mds->mds_quota)) + RETURN(0); + + mds->mds_quota = 0; + memset(&oqctl, 0, sizeof(oqctl)); + oqctl.qc_type = UGQUOTA; + down(&mds->mds_qonoff_sem); - for (i = 0; i < MAXQUOTAS; i++) { - if (mds->mds_quota_info.qi_files[i]) { - filp_close(mds->mds_quota_info.qi_files[i], 0); - mds->mds_quota_info.qi_files[i] = NULL; - } - } + mds_admin_quota_off(obd, &oqctl); up(&mds->mds_qonoff_sem); RETURN(0); } + +static int quota_acquire_common(struct obd_device *obd, unsigned int uid, + unsigned int gid, struct obd_trans_info *oti, + int isblk) +{ + struct lustre_quota_ctxt *qctxt = &obd->u.obt.obt_qctxt; + int rc; + ENTRY; + + rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, isblk, 1, oti); + RETURN(rc); +} + +#endif /* HAVE_QUOTA_SUPPORT */ #endif /* __KERNEL__ */ struct osc_quota_info { @@ -548,6 +651,7 @@ static inline struct osc_quota_info *find_qinfo(struct client_obd *cli, { unsigned int hashent = hashfn(cli, id, type); struct osc_quota_info *oqi; + ENTRY; LASSERT_SPIN_LOCKED(&qinfo_list_lock); list_for_each_entry(oqi, &qinfo_hash[hashent], oqi_hash) { @@ -555,7 +659,7 @@ static inline struct osc_quota_info *find_qinfo(struct client_obd *cli, oqi->oqi_id == id && oqi->oqi_type == type) return oqi; } - return NULL; + RETURN(NULL); } static struct osc_quota_info *alloc_qinfo(struct client_obd *cli, @@ -581,8 +685,7 @@ static void free_qinfo(struct osc_quota_info *oqi) OBD_SLAB_FREE(oqi, qinfo_cachep, sizeof(*oqi)); } -int osc_quota_chkdq(struct client_obd *cli, - unsigned int uid, unsigned int gid) +int osc_quota_chkdq(struct client_obd *cli, unsigned int uid, unsigned int gid) { unsigned int id; int cnt, rc = QUOTA_OK; @@ -604,8 +707,7 @@ int osc_quota_chkdq(struct client_obd *cli, RETURN(rc); } -int osc_quota_setdq(struct client_obd *cli, - unsigned int uid, unsigned int gid, +int osc_quota_setdq(struct client_obd *cli, unsigned int uid, unsigned int gid, obd_flag valid, obd_flag flags) { unsigned int id; @@ -713,6 +815,7 @@ int osc_quota_exit(void) } #ifdef __KERNEL__ +#ifdef HAVE_QUOTA_SUPPORT quota_interface_t mds_quota_interface = { .quota_init = mds_quota_init, .quota_exit = mds_quota_exit, @@ -720,9 +823,13 @@ quota_interface_t mds_quota_interface = { .quota_cleanup = mds_quota_cleanup, .quota_check = target_quota_check, .quota_ctl = mds_quota_ctl, - .quota_fs_cleanup =mds_quota_fs_cleanup, + .quota_setinfo = mds_quota_setinfo, + .quota_fs_cleanup = mds_quota_fs_cleanup, .quota_recovery = mds_quota_recovery, .quota_adjust = mds_quota_adjust, + .quota_chkquota = quota_chk_acq_common, + .quota_acquire = quota_acquire_common, + .quota_pending_commit = quota_pending_commit, }; quota_interface_t filter_quota_interface = { @@ -731,12 +838,16 @@ quota_interface_t filter_quota_interface = { .quota_check = target_quota_check, .quota_ctl = filter_quota_ctl, .quota_setinfo = 
filter_quota_setinfo, + .quota_clearinfo = filter_quota_clearinfo, .quota_enforce = filter_quota_enforce, .quota_getflag = filter_quota_getflag, - .quota_acquire = filter_quota_acquire, + .quota_acquire = quota_acquire_common, .quota_adjust = filter_quota_adjust, - .quota_chkquota = filter_quota_check, + .quota_chkquota = quota_chk_acq_common, + .quota_adjust_qunit = filter_quota_adjust_qunit, + .quota_pending_commit = quota_pending_commit, }; +#endif #endif /* __KERNEL__ */ quota_interface_t mdc_quota_interface = { @@ -745,6 +856,11 @@ quota_interface_t mdc_quota_interface = { .quota_poll_check = client_quota_poll_check, }; +quota_interface_t lmv_quota_interface = { + .quota_ctl = lmv_quota_ctl, + .quota_check = lmv_quota_check, +}; + quota_interface_t osc_quota_interface = { .quota_ctl = client_quota_ctl, .quota_check = client_quota_check, @@ -754,22 +870,42 @@ quota_interface_t osc_quota_interface = { .quota_chkdq = osc_quota_chkdq, .quota_setdq = osc_quota_setdq, .quota_cleanup = osc_quota_cleanup, + .quota_adjust_qunit = client_quota_adjust_qunit, }; quota_interface_t lov_quota_interface = { - .quota_check = lov_quota_check, .quota_ctl = lov_quota_ctl, + .quota_check = lov_quota_check, + .quota_adjust_qunit = lov_quota_adjust_qunit, }; #ifdef __KERNEL__ + +cfs_proc_dir_entry_t *lquota_type_proc_dir = NULL; + static int __init init_lustre_quota(void) { - int rc = qunit_cache_init(); +#ifdef HAVE_QUOTA_SUPPORT + int rc = 0; + + lquota_type_proc_dir = lprocfs_register(OBD_LQUOTA_DEVICENAME, + proc_lustre_root, + NULL, NULL); + if (IS_ERR(lquota_type_proc_dir)) { + CERROR("LProcFS failed in lquota-init\n"); + rc = PTR_ERR(lquota_type_proc_dir); + return rc; + } + + rc = qunit_cache_init(); if (rc) return rc; + PORTAL_SYMBOL_REGISTER(filter_quota_interface); PORTAL_SYMBOL_REGISTER(mds_quota_interface); +#endif PORTAL_SYMBOL_REGISTER(mdc_quota_interface); + PORTAL_SYMBOL_REGISTER(lmv_quota_interface); PORTAL_SYMBOL_REGISTER(osc_quota_interface); PORTAL_SYMBOL_REGISTER(lov_quota_interface); return 0; @@ -777,13 +913,19 @@ static int __init init_lustre_quota(void) static void /*__exit*/ exit_lustre_quota(void) { - PORTAL_SYMBOL_UNREGISTER(filter_quota_interface); - PORTAL_SYMBOL_UNREGISTER(mds_quota_interface); PORTAL_SYMBOL_UNREGISTER(mdc_quota_interface); + PORTAL_SYMBOL_UNREGISTER(lmv_quota_interface); PORTAL_SYMBOL_UNREGISTER(osc_quota_interface); PORTAL_SYMBOL_UNREGISTER(lov_quota_interface); +#ifdef HAVE_QUOTA_SUPPORT + PORTAL_SYMBOL_UNREGISTER(filter_quota_interface); + PORTAL_SYMBOL_UNREGISTER(mds_quota_interface); qunit_cache_cleanup(); + + if (lquota_type_proc_dir) + lprocfs_remove(&lquota_type_proc_dir); +#endif } MODULE_AUTHOR("Sun Microsystems, Inc. "); @@ -792,9 +934,12 @@ MODULE_LICENSE("GPL"); cfs_module(lquota, "1.0.0", init_lustre_quota, exit_lustre_quota); +#ifdef HAVE_QUOTA_SUPPORT EXPORT_SYMBOL(mds_quota_interface); EXPORT_SYMBOL(filter_quota_interface); +#endif EXPORT_SYMBOL(mdc_quota_interface); +EXPORT_SYMBOL(lmv_quota_interface); EXPORT_SYMBOL(osc_quota_interface); EXPORT_SYMBOL(lov_quota_interface); #endif /* __KERNEL */ diff --git a/lustre/quota/quota_internal.h b/lustre/quota/quota_internal.h index d896fa7..e9073be 100644 --- a/lustre/quota/quota_internal.h +++ b/lustre/quota/quota_internal.h @@ -39,19 +39,22 @@ #include +#ifdef HAVE_QUOTA_SUPPORT + /* QUSG covnert bytes to blocks when counting block quota */ #define QUSG(count, isblk) (isblk ? 
toqb(count) : count) -/* This flag is set in qc_stat to distinguish if the current getquota +/* This flag is set in qc_stat to distinguish if the current getquota * operation is for quota recovery */ #define QUOTA_RECOVERING 0x01 +#define OBD_LQUOTA_DEVICENAME "lquota" #ifdef __KERNEL__ #define DQUOT_DEBUG(dquot, fmt, arg...) \ CDEBUG(D_QUOTA, "refcnt(%u) id(%u) type(%u) off(%llu) flags(%lu) " \ - "bhardlimit(%u) curspace("LPX64") ihardlimit(%u) " \ - "curinodes(%u): " fmt, dquot->dq_refcnt, \ + "bhardlimit("LPU64") curspace("LPU64") ihardlimit("LPU64") " \ + "curinodes("LPU64"): " fmt, dquot->dq_refcnt, \ dquot->dq_id, dquot->dq_type, dquot->dq_off, dquot->dq_flags, \ dquot->dq_dqb.dqb_bhardlimit, dquot->dq_dqb.dqb_curspace, \ dquot->dq_dqb.dqb_ihardlimit, dquot->dq_dqb.dqb_curinodes, \ @@ -68,26 +71,48 @@ qinfo->qi_info[1].dqi_free_entry, ## arg); #define QDATA_DEBUG(qd, fmt, arg...) \ - CDEBUG(D_QUOTA, "id(%u) type(%lu) count("LPU64") isblk(%lu):" \ - fmt, qd->qd_id, qd->qd_flags & QUOTA_IS_GRP, qd->qd_count, \ - (qd->qd_flags & QUOTA_IS_BLOCK) >> 1, \ + CDEBUG(D_QUOTA, "id(%u) flag(%u) type(%c) isblk(%c) count("LPU64") " \ + "qd_qunit("LPU64"): " fmt, qd->qd_id, qd->qd_flags, \ + QDATA_IS_GRP(qd) ? 'g' : 'u', QDATA_IS_BLK(qd) ? 'b': 'i', \ + qd->qd_count, \ + (QDATA_IS_ADJBLK(qd) | QDATA_IS_ADJINO(qd)) ? qd->qd_qunit : 0,\ ## arg); +#define QAQ_DEBUG(qaq, fmt, arg...) \ + CDEBUG(D_QUOTA, "id(%u) flag(%u) type(%c) bunit("LPU64") " \ + "iunit("LPU64"): " fmt, qaq->qaq_id, qaq->qaq_flags, \ + QAQ_IS_GRP(qaq) ? 'g': 'u', qaq->qaq_bunit_sz, \ + qaq->qaq_iunit_sz, ## arg); + +#define LQS_DEBUG(lqs, fmt, arg...) \ + CDEBUG(D_QUOTA, "lqs(%p) id(%u) flag(%lu) type(%c) bunit(%lu) " \ + "btune(%lu) iunit(%lu) itune(%lu) lqs_bwrite_pending(%lu) " \ + "lqs_iwrite_pending(%lu) ino_rec("LPD64") blk_rec("LPD64" ) " \ + "refcount(%d): " \ + fmt, lqs, lqs->lqs_id, lqs->lqs_flags, \ + LQS_IS_GRP(lqs) ? 
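
The QDATA_*/QAQ_*/LQS_* helpers used throughout these debug macros test bits packed into the flags word; the deleted code hints at the layout (qd_flags = i with i in {USRQUOTA, GRPQUOTA}, and the old QUOTA_IS_BLOCK >> 1). A plausible standalone model follows; the bit positions are an assumption for illustration only, since the real definitions live in headers outside this hunk:

    #include <stdbool.h>
    #include <stdint.h>

    #define QF_GRP (1u << 0)           /* assumed: bit 0 = group (vs user) */
    #define QF_BLK (1u << 1)           /* assumed: bit 1 = block (vs inode) */

    struct qdata_model { uint32_t flags; uint64_t count; };

    static bool qdata_is_grp(const struct qdata_model *q) { return q->flags & QF_GRP; }
    static bool qdata_is_blk(const struct qdata_model *q) { return q->flags & QF_BLK; }
    static void qdata_set_blk(struct qdata_model *q)      { q->flags |= QF_BLK; }
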
'g' : 'u', \ + lqs->lqs_bunit_sz, lqs->lqs_btune_sz, lqs->lqs_iunit_sz, \ + lqs->lqs_itune_sz, lqs->lqs_bwrite_pending, \ + lqs->lqs_iwrite_pending, lqs->lqs_ino_rec, \ + lqs->lqs_blk_rec, atomic_read(&lqs->lqs_refcount), ## arg); + /* quota_context.c */ void qunit_cache_cleanup(void); int qunit_cache_init(void); int qctxt_adjust_qunit(struct obd_device *obd, struct lustre_quota_ctxt *qctxt, - uid_t uid, gid_t gid, __u32 isblk, int wait); + uid_t uid, gid_t gid, __u32 isblk, int wait, + struct obd_trans_info *oti); int qctxt_wait_pending_dqacq(struct lustre_quota_ctxt *qctxt, unsigned int id, unsigned short type, int isblk); -int qctxt_init(struct lustre_quota_ctxt *qctxt, struct super_block *sb, - dqacq_handler_t handler); +int qctxt_init(struct obd_device *obd, dqacq_handler_t handler); void qctxt_cleanup(struct lustre_quota_ctxt *qctxt, int force); -void qslave_start_recovery(struct obd_device *obd, +void qslave_start_recovery(struct obd_device *obd, struct lustre_quota_ctxt *qctxt); int compute_remquota(struct obd_device *obd, - struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata); + struct lustre_quota_ctxt *qctxt, struct qunit_data *qdata, + int isblk); +int check_qm(struct lustre_quota_ctxt *qctxt); /* quota_master.c */ int lustre_dquot_init(void); void lustre_dquot_exit(void); @@ -97,27 +122,89 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[], int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[], unsigned int qpids[], int rc, int opc); int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl); +int mds_quota_get_version(struct obd_device *obd, lustre_quota_version_t *ver); +int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl); +int mds_quota_finvalidate(struct obd_device *obd, struct obd_quotactl *oqctl); + int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl); +int mds_admin_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_set_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_get_dqinfo(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl); int mds_quota_recovery(struct obd_device *obd); int mds_get_obd_quota(struct obd_device *obd, struct obd_quotactl *oqctl); +int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt, struct lustre_dquot + *dquot, __u32 ost_num, __u32 mdt_num, int type, + struct quota_adjust_qunit *oqaq); #endif /* quota_ctl.c */ -int mds_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl); -int filter_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl); -int client_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl); -int lov_quota_ctl(struct obd_export *exp, struct obd_quotactl *oqctl); +int mds_quota_ctl(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl); +int filter_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); /* quota_chk.c */ -int target_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl); -int client_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl); -int lov_quota_check(struct obd_export *exp, struct obd_quotactl *oqctl); -int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk); +int 
target_quota_check(struct obd_device *obd, struct obd_export *exp, + struct obd_quotactl *oqctl); + +int quota_adjust_slave_lqs(struct quota_adjust_qunit *oqaq, struct + lustre_quota_ctxt *qctxt); +void qdata_to_oqaq(struct qunit_data *qdata, + struct quota_adjust_qunit *oqaq); +#ifdef __KERNEL__ +int quota_search_lqs(struct qunit_data *qdata, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt, + struct lustre_qunit_size **lqs_return); +int quota_create_lqs(struct qunit_data *qdata, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt, + struct lustre_qunit_size **lqs_return); +void quota_compute_lqs(struct qunit_data *qdata, struct lustre_qunit_size *lqs, + int is_chk, int is_acq); + +extern int quote_get_qdata(struct ptlrpc_request *req, struct qunit_data *qdata, + int is_req, int is_exp); +extern int quote_copy_qdata(struct ptlrpc_request *req, struct qunit_data *qdata, + int is_req, int is_exp); +int filter_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt); +int lquota_proc_setup(struct obd_device *obd, int is_master); +int lquota_proc_cleanup(struct lustre_quota_ctxt *qctxt); + +extern cfs_proc_dir_entry_t *lquota_type_proc_dir; +#endif + +#define LQS_BLK_DECREASE 1 +#define LQS_BLK_INCREASE 2 +#define LQS_INO_DECREASE 4 +#define LQS_INO_INCREASE 8 + + +#endif +int client_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt); +int lov_quota_adjust_qunit(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt); +int client_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int lmv_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int lov_quota_ctl(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int client_quota_check(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int lmv_quota_check(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int lov_quota_check(struct obd_device *unused, struct obd_export *exp, + struct obd_quotactl *oqctl); +int client_quota_poll_check(struct obd_export *exp, struct if_quotacheck *qchk); #endif diff --git a/lustre/quota/quota_master.c b/lustre/quota/quota_master.c index 5ffdf8d..20b91f9 100644 --- a/lustre/quota/quota_master.c +++ b/lustre/quota/quota_master.c @@ -44,7 +44,7 @@ # define EXPORT_SYMTAB #endif -#define DEBUG_SUBSYSTEM S_MDS +#define DEBUG_SUBSYSTEM S_LQUOTA #include #include @@ -62,8 +62,9 @@ #include "quota_internal.h" -/* lock ordering: - * mds->mds_qonoff_sem > dquot->dq_sem */ +#ifdef HAVE_QUOTA_SUPPORT + +/* lock ordering: mds->mds_qonoff_sem > dquot->dq_sem */ static struct list_head lustre_dquot_hash[NR_DQHASH]; static spinlock_t dquot_hash_lock = SPIN_LOCK_UNLOCKED; @@ -198,7 +199,7 @@ static struct lustre_dquot *lustre_dqget(struct obd_device *obd, if ((empty = alloc_dquot(lqi, id, type)) == NULL) RETURN(ERR_PTR(-ENOMEM)); - + spin_lock(&dquot_hash_lock); if ((dquot = find_dquot(hashent, lqi, id, type)) != NULL) { dquot->dq_refcnt++; @@ -226,24 +227,134 @@ static struct lustre_dquot *lustre_dqget(struct obd_device *obd, RETURN(dquot); } +static void init_oqaq(struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt, + qid_t id, int type) +{ + struct lustre_qunit_size *lqs = NULL; + + oqaq->qaq_id = id; + oqaq->qaq_flags = type; + quota_search_lqs(NULL, 
oqaq, qctxt, &lqs); + if (lqs) { + spin_lock(&lqs->lqs_lock); + oqaq->qaq_bunit_sz = lqs->lqs_bunit_sz; + oqaq->qaq_iunit_sz = lqs->lqs_iunit_sz; + oqaq->qaq_flags = lqs->lqs_flags; + spin_unlock(&lqs->lqs_lock); + lqs_putref(lqs); + } else { + CDEBUG(D_QUOTA, "Can't find the lustre qunit size!\n"); + oqaq->qaq_bunit_sz = qctxt->lqc_bunit_sz; + oqaq->qaq_iunit_sz = qctxt->lqc_iunit_sz; + } +} + +int dqacq_adjust_qunit_sz(struct obd_device *obd, qid_t id, int type, + __u32 is_blk) +{ + struct mds_obd *mds = &obd->u.mds; + struct lustre_quota_ctxt *qctxt = &mds->mds_obt.obt_qctxt; + struct obd_device *lov_mds_obd = class_exp2obd(mds->mds_osc_exp); + struct lov_obd *lov = &lov_mds_obd->u.lov; + __u32 ost_num = lov->desc.ld_tgt_count, mdt_num = 1; + struct quota_adjust_qunit *oqaq = NULL; + unsigned int uid = 0, gid = 0; + struct lustre_quota_info *info = &mds->mds_quota_info; + struct lustre_dquot *dquot = NULL; + int adjust_res = 0; + int rc = 0; + ENTRY; + + LASSERT(mds); + dquot = lustre_dqget(obd, info, id, type); + if (IS_ERR(dquot)) + RETURN(PTR_ERR(dquot)); + + OBD_ALLOC_PTR(oqaq); + if (!oqaq) + GOTO(out, rc = -ENOMEM); + + down(&dquot->dq_sem); + init_oqaq(oqaq, qctxt, id, type); + + rc = dquot_create_oqaq(qctxt, dquot, ost_num, mdt_num, + is_blk ? LQUOTA_FLAGS_ADJBLK : + LQUOTA_FLAGS_ADJINO, oqaq); + + if (rc < 0) { + CDEBUG(D_ERROR, "create oqaq failed! (rc:%d)\n", rc); + GOTO(out_sem, rc); + } + QAQ_DEBUG(oqaq, "show oqaq.\n") + + if (!QAQ_IS_ADJBLK(oqaq) && !QAQ_IS_ADJINO(oqaq)) + GOTO(out_sem, rc); + + /* adjust the mds slave qunit size */ + adjust_res = quota_adjust_slave_lqs(oqaq, qctxt); + if (adjust_res <= 0) { + if (adjust_res < 0) { + rc = adjust_res; + CDEBUG(D_ERROR, "adjust mds slave's qunit size failed! \ + (rc:%d)\n", rc); + } else { + CDEBUG(D_QUOTA, "qunit doesn't need to be adjusted.\n"); + } + GOTO(out_sem, rc); + } + + if (type) + gid = dquot->dq_id; + else + uid = dquot->dq_id; + + up(&dquot->dq_sem); + + rc = qctxt_adjust_qunit(obd, qctxt, uid, gid, is_blk, 0, NULL); + if (rc == -EDQUOT || rc == -EBUSY) { + CDEBUG(D_QUOTA, "rc: %d.\n", rc); + rc = 0; + } + if (rc) { + CDEBUG(D_ERROR, "mds fail to adjust file quota! 
\ + (rc:%d)\n", rc); + GOTO(out, rc); + } + + /* only when block qunit is reduced, boardcast to osts */ + if ((adjust_res & LQS_BLK_DECREASE) && QAQ_IS_ADJBLK(oqaq)) + rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq, qctxt); + +out: + lustre_dqput(dquot); + if (oqaq) + OBD_FREE_PTR(oqaq); + + RETURN(rc); +out_sem: + up(&dquot->dq_sem); + goto out; +} + int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc) { struct mds_obd *mds = &obd->u.mds; + struct lustre_quota_ctxt *qctxt = &mds->mds_obt.obt_qctxt; struct lustre_quota_info *info = &mds->mds_quota_info; struct lustre_dquot *dquot = NULL; __u64 *usage = NULL; - __u32 hlimit = 0, slimit = 0; - __u32 qdata_type = qdata->qd_flags & QUOTA_IS_GRP; - __u32 is_blk = (qdata->qd_flags & QUOTA_IS_BLOCK) >> 1; + __u64 hlimit = 0, slimit = 0; time_t *time = NULL; unsigned int grace = 0; + struct lustre_qunit_size *lqs = NULL; int rc = 0; ENTRY; if (OBD_FAIL_CHECK(OBD_FAIL_OBD_DQACQ)) RETURN(-EIO); - dquot = lustre_dqget(obd, info, qdata->qd_id, qdata_type); + dquot = lustre_dqget(obd, info, qdata->qd_id, QDATA_IS_GRP(qdata)); if (IS_ERR(dquot)) RETURN(PTR_ERR(dquot)); @@ -258,14 +369,14 @@ int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc) GOTO(out, rc = -EBUSY); } - if (is_blk) { - grace = info->qi_info[qdata_type].dqi_bgrace; + if (QDATA_IS_BLK(qdata)) { + grace = info->qi_info[QDATA_IS_GRP(qdata)].dqi_bgrace; usage = &dquot->dq_dqb.dqb_curspace; hlimit = dquot->dq_dqb.dqb_bhardlimit; slimit = dquot->dq_dqb.dqb_bsoftlimit; time = &dquot->dq_dqb.dqb_btime; } else { - grace = info->qi_info[qdata_type].dqi_igrace; + grace = info->qi_info[QDATA_IS_GRP(qdata)].dqi_igrace; usage = (__u64 *) & dquot->dq_dqb.dqb_curinodes; hlimit = dquot->dq_dqb.dqb_ihardlimit; slimit = dquot->dq_dqb.dqb_isoftlimit; @@ -281,12 +392,21 @@ int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc) switch (opc) { case QUOTA_DQACQ: - if (hlimit && - QUSG(*usage + qdata->qd_count, is_blk) > hlimit) - GOTO(out, rc = -EDQUOT); + if (hlimit && + QUSG(*usage + qdata->qd_count, QDATA_IS_BLK(qdata)) > hlimit) + { + if (QDATA_IS_CHANGE_QS(qdata) && + QUSG(*usage, QDATA_IS_BLK(qdata)) < hlimit) + qdata->qd_count = (hlimit - + QUSG(*usage, QDATA_IS_BLK(qdata))) + * (QDATA_IS_BLK(qdata) ? 
+ QUOTABLOCK_SIZE : 1); + else + GOTO(out, rc = -EDQUOT); + } if (slimit && - QUSG(*usage + qdata->qd_count, is_blk) > slimit) { + QUSG(*usage + qdata->qd_count, QDATA_IS_BLK(qdata)) > slimit) { if (*time && cfs_time_current_sec() >= *time) GOTO(out, rc = -EDQUOT); else if (!*time) @@ -304,7 +424,7 @@ int dqacq_handler(struct obd_device *obd, struct qunit_data *qdata, int opc) *usage -= qdata->qd_count; /* (usage <= soft limit) but not (usage < soft limit) */ - if (!slimit || QUSG(*usage, is_blk) <= slimit) + if (!slimit || QUSG(*usage, QDATA_IS_BLK(qdata)) <= slimit) *time = 0; break; default: @@ -317,6 +437,37 @@ out: up(&dquot->dq_sem); up(&mds->mds_qonoff_sem); lustre_dqput(dquot); + if (rc != -EDQUOT) + dqacq_adjust_qunit_sz(obd, qdata->qd_id, QDATA_IS_GRP(qdata), + QDATA_IS_BLK(qdata)); + + quota_search_lqs(qdata, NULL, qctxt, &lqs); + if (QDATA_IS_BLK(qdata)) { + if (!lqs) { + CDEBUG(D_INFO, "Can't find the lustre qunit size!\n"); + qdata->qd_qunit = qctxt->lqc_bunit_sz; + } else { + spin_lock(&lqs->lqs_lock); + qdata->qd_qunit = lqs->lqs_bunit_sz; + spin_unlock(&lqs->lqs_lock); + } + QDATA_SET_ADJBLK(qdata); + } else { + if (!lqs) { + CDEBUG(D_INFO, "Can't find the lustre qunit size!\n"); + qdata->qd_qunit = qctxt->lqc_iunit_sz; + } else { + spin_lock(&lqs->lqs_lock); + qdata->qd_qunit = lqs->lqs_iunit_sz; + spin_unlock(&lqs->lqs_lock); + } + QDATA_SET_ADJINO(qdata); + } + + QDATA_DEBUG(qdata, "alloc/release qunit in dqacq_handler\n"); + if (lqs) + lqs_putref(lqs); + return rc; } @@ -327,25 +478,73 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[], int rc2 = 0; ENTRY; - if (rc && rc != -EDQUOT) + if (rc && rc != -EDQUOT && rc != ENOLCK) RETURN(0); switch (opc) { - case FSFILT_OP_RENAME: - /* acquire/release block quota on owner of original parent */ - rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[2], qpids[3], 1, 0); - /* fall-through */ case FSFILT_OP_SETATTR: - /* acquire/release file quota on original owner */ - rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0); - /* fall-through */ - case FSFILT_OP_CREATE: + /* release file quota on original owner */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 0, 0, + NULL); + /* release block quota on original owner */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0, + NULL); + /* acquire file quota on current owner */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0, + NULL); + /* acquire block quota on current owner */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0, + NULL); + break; + case FSFILT_OP_UNLINK_PARTIAL_CHILD: + /* release file quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0, + NULL); + /* rlease block quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0, + NULL); + break; + case FSFILT_OP_CREATE_PARTIAL_CHILD: + /* acquire file quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0, + NULL); + /* acquire block quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0, + NULL); + break; + case FSFILT_OP_LINK: + /* acquire block quota on parent */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0, + NULL); + break; case FSFILT_OP_UNLINK: - /* acquire/release file/block quota on owner of child (or current owner) */ - rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0); - rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0); - /* acquire/release block quota on owner of parent (or 
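
In the QUOTA_DQACQ branch of dqacq_handler() above, a request that would cross the hard limit is no longer refused outright: when some room remains and the peer understands variable qunits (QDATA_IS_CHANGE_QS), the grant is clamped to exactly the remainder. A standalone model of that arithmetic; QUOTABLOCK_SIZE is 1024 bytes in the Linux quota format, and the function name is assumed:

    #include <stdint.h>

    #define QBLK 1024ULL               /* models QUOTABLOCK_SIZE */

    /* usage/hlimit in quota blocks, request in bytes; returns the number
     * of bytes to grant, 0 meaning -EDQUOT. */
    static uint64_t clamp_grant(uint64_t usage, uint64_t req_bytes,
                                uint64_t hlimit)
    {
        uint64_t req = (req_bytes + QBLK - 1) / QBLK;   /* models QUSG/toqb */

        if (!hlimit || usage + req <= hlimit)
            return req_bytes;          /* fits: grant in full */
        if (usage < hlimit)
            return (hlimit - usage) * QBLK;  /* partial room: clamp */
        return 0;                      /* already at the limit: -EDQUOT */
    }
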
original owner) */ - rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0); + /* release block quota on parent */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0, + NULL); + /* release file quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0, + NULL); + if (qpids[0] != qcids[0] || qpids[1] != qcids[1]) + /* release block quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], + qcids[1], 1, 0, NULL); + break; + case FSFILT_OP_UNLINK_PARTIAL_PARENT: + /* release block quota on parent */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0, + NULL); + break; + case FSFILT_OP_CREATE: + /* acquire block quota on parent */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0, + NULL); + /* acquire file quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 0, 0, + NULL); + if (qpids[0] != qcids[0] || qpids[1] != qcids[1]) + /* acquire block quota on child */ + rc2 |= qctxt_adjust_qunit(obd, qctxt, qcids[0], + qcids[1], 1, 0, NULL); break; default: LBUG(); @@ -353,7 +552,8 @@ int mds_quota_adjust(struct obd_device *obd, unsigned int qcids[], } if (rc2) - CERROR("mds adjust qunit failed! (opc:%d rc:%d)\n", opc, rc2); + CDEBUG(rc2 == -EAGAIN ? D_QUOTA: D_ERROR, + "mds adjust qunit failed! (opc:%d rc:%d)\n", opc, rc2); RETURN(0); } @@ -370,50 +570,122 @@ int filter_quota_adjust(struct obd_device *obd, unsigned int qcids[], switch (opc) { case FSFILT_OP_SETATTR: /* acquire/release block quota on original & current owner */ - rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0); - rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0); + rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0, + NULL); + rc2 = qctxt_adjust_qunit(obd, qctxt, qpids[0], qpids[1], 1, 0, + NULL); break; case FSFILT_OP_UNLINK: /* release block quota on this owner */ case FSFILT_OP_CREATE: /* XXX for write operation on obdfilter */ /* acquire block quota on this owner */ - rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0); + rc = qctxt_adjust_qunit(obd, qctxt, qcids[0], qcids[1], 1, 0, + NULL); break; default: LBUG(); break; } - if (rc || rc2) - CERROR("filter adjust qunit failed! (opc:%d rc%d)\n", - opc, rc ?: rc2); + if (rc || rc2) { + if (!rc) + rc = rc2; + CDEBUG(rc == -EAGAIN ? D_QUOTA: D_ERROR, + "filter adjust qunit failed! (opc:%d rc%d)\n", + opc, rc); + } + RETURN(0); } -#define LUSTRE_ADMIN_QUOTAFILES {\ - "admin_quotafile.usr", /* user admin quotafile */\ - "admin_quotafile.grp" /* group admin quotafile */\ -} static const char prefix[] = "OBJECTS/"; +int mds_quota_invalidate(struct obd_device *obd, struct obd_quotactl *oqctl) +{ + struct mds_obd *mds = &obd->u.mds; + struct lustre_quota_info *qinfo = &mds->mds_quota_info; + int rc = 0, i; + char *quotafile[] = LUSTRE_ADMIN_QUOTAFILES_V2; + char name[64]; + struct lvfs_run_ctxt saved; + + LASSERT(qinfo->qi_version == LUSTRE_QUOTA_V2); + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + down(&mds->mds_qonoff_sem); + + for (i = 0; i < MAXQUOTAS; i++) { + struct file *fp; + + if (!Q_TYPESET(oqctl, i)) + continue; + + /* quota file has been opened ? 
*/ + if (qinfo->qi_files[i]) { + rc = -EBUSY; + goto out; + } + + LASSERT(strlen(quotafile[i]) + sizeof(prefix) <= sizeof(name)); + sprintf(name, "%s%s", prefix, quotafile[i]); + + fp = filp_open(name, O_CREAT | O_TRUNC | O_RDWR, 0644); + if (IS_ERR(fp)) { + rc = PTR_ERR(fp); + CERROR("error invalidating admin quotafile %s (rc:%d)\n", + name, rc); + } + else + filp_close(fp, 0); + } + +out: + up(&mds->mds_qonoff_sem); + + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + return rc; +} + +int mds_quota_finvalidate(struct obd_device *obd, struct obd_quotactl *oqctl) +{ + struct mds_obd *mds = &obd->u.mds; + int rc; + struct lvfs_run_ctxt saved; + + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + down(&mds->mds_qonoff_sem); + + oqctl->qc_cmd = Q_FINVALIDATE; + oqctl->qc_id = obd->u.obt.obt_qfmt; + rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); + if (!rc) + rc = obd_quotactl(mds->mds_osc_exp, oqctl); + + up(&mds->mds_qonoff_sem); + pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + + return rc; +} + int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; struct lustre_quota_info *qinfo = &mds->mds_quota_info; - const char *quotafiles[] = LUSTRE_ADMIN_QUOTAFILES; + const char *quotafile[] = LUSTRE_ADMIN_QUOTAFILES_V2; struct lvfs_run_ctxt saved; char name[64]; int i, rc = 0; - struct dentry *dparent = mds->mds_objects_dir; - struct inode *iparent = dparent->d_inode; ENTRY; - LASSERT(iparent); + LASSERT(qinfo->qi_version == LUSTRE_QUOTA_V2); + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); down(&mds->mds_qonoff_sem); - for (i = 0; i < MAXQUOTAS; i++) { - struct dentry *de; + + for (i = 0; i < MAXQUOTAS && !rc; i++) { struct file *fp; if (!Q_TYPESET(oqctl, i)) @@ -426,33 +698,44 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) continue; } - /* lookup quota file */ - rc = 0; - LOCK_INODE_MUTEX(iparent); - de = lookup_one_len(quotafiles[i], dparent, - strlen(quotafiles[i])); - UNLOCK_INODE_MUTEX(iparent); - if (IS_ERR(de) || de->d_inode == NULL || - !S_ISREG(de->d_inode->i_mode)) - rc = IS_ERR(de) ? PTR_ERR(de) : -ENOENT; - if (!IS_ERR(de)) - dput(de); - - if (rc && rc != -ENOENT) { - CERROR("error lookup quotafile %s! (rc:%d)\n", + LASSERT(strlen(quotafile[i]) + sizeof(prefix) <= sizeof(name)); + sprintf(name, "%s%s", prefix, quotafile[i]); + + /* check if quota file exists and is correct */ + fp = filp_open(name, O_RDONLY, 0); + if (!IS_ERR(fp)) { + /* irregular file is not the right place for quota */ + if (!S_ISREG(fp->f_dentry->d_inode->i_mode)) { + CERROR("admin quota file %s is not " + "regular!", name); + filp_close(fp, 0); + rc = -EINVAL; + break; + } + qinfo->qi_files[i] = fp; + rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_CHK); + qinfo->qi_files[i] = 0; + filp_close(fp, 0); + } + else + rc = PTR_ERR(fp); + + if (!rc) + continue; + + /* -EINVAL may be returned by quotainfo for bad quota file */ + if (rc != -ENOENT && rc != -EINVAL) { + CERROR("error opening old quota file %s (%d)\n", name, rc); break; - } else if (!rc) { - continue; } - LASSERT(strlen(quotafiles[i]) + sizeof(prefix) <= sizeof(name)); - sprintf(name, "%s%s", prefix, quotafiles[i]); + CDEBUG(D_INFO, "%s new quota file %s\n", name, + rc == -ENOENT ? 
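
mds_quota_invalidate() above and the surrounding init_admin_quotafiles() both build the admin quota file name under the OBJECTS/ prefix and (re)create it with O_CREAT | O_TRUNC | O_RDWR. A userspace sketch of that step; the directory handling and helper name are invented for illustration:

    #include <fcntl.h>
    #include <stdio.h>
    #include <unistd.h>

    static int recreate_quotafile(const char *prefix, const char *file)
    {
        char name[64];
        int fd;

        /* mirrors the LASSERT that prefix + file fits in name[] */
        if (snprintf(name, sizeof(name), "%s%s", prefix, file) >=
            (int)sizeof(name))
            return -1;

        fd = open(name, O_CREAT | O_TRUNC | O_RDWR, 0644);
        if (fd < 0)
            return -1;                 /* the filp_open() error path */
        close(fd);
        return 0;
    }
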
"creating" : "overwriting"); - LASSERT(rc == -ENOENT); - /* create quota file */ - fp = filp_open(name, O_CREAT | O_EXCL, 0644); - if (IS_ERR(fp) || !S_ISREG(fp->f_dentry->d_inode->i_mode)) { + /* create quota file overwriting old if needed */ + fp = filp_open(name, O_CREAT | O_TRUNC | O_RDWR, 0644); + if (IS_ERR(fp)) { rc = PTR_ERR(fp); CERROR("error creating admin quotafile %s (rc:%d)\n", name, rc); @@ -460,15 +743,14 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) } qinfo->qi_files[i] = fp; - rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_INIT_INFO); - filp_close(fp, 0); - qinfo->qi_files[i] = NULL; - if (rc) { + rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_INIT_INFO); + if (rc) CERROR("error init %s admin quotafile! (rc:%d)\n", i == USRQUOTA ? "user" : "group", rc); - break; - } + + filp_close(fp, 0); + qinfo->qi_files[i] = NULL; } up(&mds->mds_qonoff_sem); @@ -476,7 +758,7 @@ int init_admin_quotafiles(struct obd_device *obd, struct obd_quotactl *oqctl) RETURN(rc); } -static int close_quota_files(struct obd_quotactl *oqctl, +static int close_quota_files(struct obd_quotactl *oqctl, struct lustre_quota_info *qinfo) { int i, rc = 0; @@ -499,13 +781,12 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; struct lustre_quota_info *qinfo = &mds->mds_quota_info; - const char *quotafiles[] = LUSTRE_ADMIN_QUOTAFILES; + const char *quotafile[] = LUSTRE_ADMIN_QUOTAFILES_V2; char name[64]; int i, rc = 0; - struct inode *iparent = mds->mds_objects_dir->d_inode; ENTRY; - LASSERT(iparent); + LASSERT(qinfo->qi_version == LUSTRE_QUOTA_V2); /* open admin quota files and read quotafile info */ for (i = 0; i < MAXQUOTAS; i++) { @@ -514,27 +795,33 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) if (!Q_TYPESET(oqctl, i)) continue; - LASSERT(strlen(quotafiles[i]) + sizeof(prefix) <= sizeof(name)); - sprintf(name, "%s%s", prefix, quotafiles[i]); + LASSERT(strlen(quotafile[i]) + + sizeof(prefix) <= sizeof(name)); + sprintf(name, "%s%s", prefix, quotafile[i]); if (qinfo->qi_files[i] != NULL) { rc = -EBUSY; break; } - fp = filp_open(name, O_RDWR | O_EXCL, 0644); + fp = filp_open(name, O_RDWR, 0); if (IS_ERR(fp) || !S_ISREG(fp->f_dentry->d_inode->i_mode)) { - rc = PTR_ERR(fp); - CDEBUG(rc == -ENOENT ? D_QUOTA : D_ERROR, - "open %s failed! (rc:%d)\n", name, rc); + rc = IS_ERR(fp) ? PTR_ERR(fp) : -EINVAL; + CERROR("error open/create %s! (rc:%d)\n", name, rc); break; } qinfo->qi_files[i] = fp; + rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_CHK); + if (rc) { + CERROR("invalid quota file %s! (rc:%d)\n", name, rc); + break; + } + rc = fsfilt_quotainfo(obd, qinfo, i, QFILE_RD_INFO); if (rc) { - CERROR("error read quotainfo of %s! (rc:%d)\n", - name, rc); + CERROR("error read quotainfo of %s! 
(rc:%d)\n", name, + rc); break; } } @@ -545,8 +832,8 @@ int mds_admin_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) RETURN(rc); } -static int mds_admin_quota_off(struct obd_device *obd, - struct obd_quotactl *oqctl) +int mds_admin_quota_off(struct obd_device *obd, + struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; struct lustre_quota_info *qinfo = &mds->mds_quota_info; @@ -584,7 +871,7 @@ int mds_quota_on(struct obd_device *obd, struct obd_quotactl *oqctl) rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); if (!rc) - obt->obt_qctxt.lqc_status = 1; + obt->obt_qctxt.lqc_flags |= UGQUOTA2LQC(oqctl->qc_type); out: pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); up(&mds->mds_qonoff_sem); @@ -614,7 +901,7 @@ int mds_quota_off(struct obd_device *obd, struct obd_quotactl *oqctl) rc = obd_quotactl(mds->mds_osc_exp, oqctl); rc2 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, oqctl); if (!rc2) - obt->obt_qctxt.lqc_status = 0; + obt->obt_qctxt.lqc_flags &= ~UGQUOTA2LQC(oqctl->qc_type); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); up(&mds->mds_qonoff_sem); @@ -671,10 +958,124 @@ out: RETURN(rc); } +int dquot_create_oqaq(struct lustre_quota_ctxt *qctxt, + struct lustre_dquot *dquot, __u32 ost_num, __u32 mdt_num, + int type, struct quota_adjust_qunit *oqaq) +{ + __u64 bunit_curr_o, iunit_curr_o; + unsigned long shrink_qunit_limit = qctxt->lqc_cqs_boundary_factor; + unsigned long cqs_factor = qctxt->lqc_cqs_qs_factor; + __u64 blimit = dquot->dq_dqb.dqb_bhardlimit ? + dquot->dq_dqb.dqb_bhardlimit : dquot->dq_dqb.dqb_bsoftlimit; + __u64 ilimit = dquot->dq_dqb.dqb_ihardlimit ? + dquot->dq_dqb.dqb_ihardlimit : dquot->dq_dqb.dqb_isoftlimit; + int rc = 0; + ENTRY; + + if (!dquot || !oqaq) + RETURN(-EINVAL); + LASSERT_SEM_LOCKED(&dquot->dq_sem); + LASSERT(oqaq->qaq_iunit_sz); + LASSERT(oqaq->qaq_bunit_sz); + + /* don't change qunit size */ + if (!qctxt->lqc_switch_qs) + RETURN(rc); + + bunit_curr_o = oqaq->qaq_bunit_sz; + iunit_curr_o = oqaq->qaq_iunit_sz; + + if (dquot->dq_type == GRPQUOTA) + QAQ_SET_GRP(oqaq); + + if ((type & LQUOTA_FLAGS_ADJBLK) && blimit) { + __u64 b_limitation = + oqaq->qaq_bunit_sz * ost_num * shrink_qunit_limit; + /* enlarge block qunit size */ + while (blimit > + QUSG(dquot->dq_dqb.dqb_curspace + 2 * b_limitation, 1)) { + oqaq->qaq_bunit_sz = + QUSG(oqaq->qaq_bunit_sz * cqs_factor, 1) + << QUOTABLOCK_BITS; + b_limitation = oqaq->qaq_bunit_sz * ost_num * + shrink_qunit_limit; + } + + if (oqaq->qaq_bunit_sz > qctxt->lqc_bunit_sz) + oqaq->qaq_bunit_sz = qctxt->lqc_bunit_sz; + + /* shrink block qunit size */ + while (blimit < + QUSG(dquot->dq_dqb.dqb_curspace + b_limitation, 1)) { + do_div(oqaq->qaq_bunit_sz , cqs_factor); + oqaq->qaq_bunit_sz = QUSG(oqaq->qaq_bunit_sz, 1) << + QUOTABLOCK_BITS; + b_limitation = oqaq->qaq_bunit_sz * ost_num * + shrink_qunit_limit; + if (oqaq->qaq_bunit_sz < qctxt->lqc_cqs_least_bunit) + break; + } + + if (oqaq->qaq_bunit_sz < qctxt->lqc_cqs_least_bunit) + oqaq->qaq_bunit_sz = qctxt->lqc_cqs_least_bunit; + + if (bunit_curr_o != oqaq->qaq_bunit_sz) + QAQ_SET_ADJBLK(oqaq); + + } + + if ((type & LQUOTA_FLAGS_ADJINO) && ilimit) { + __u64 i_limitation = + oqaq->qaq_iunit_sz * mdt_num * shrink_qunit_limit; + /* enlarge file qunit size */ + while (ilimit > dquot->dq_dqb.dqb_curinodes + + 2 * i_limitation) { + oqaq->qaq_iunit_sz = oqaq->qaq_iunit_sz * cqs_factor; + i_limitation = oqaq->qaq_iunit_sz * mdt_num * + shrink_qunit_limit; + } + + if (oqaq->qaq_iunit_sz > qctxt->lqc_iunit_sz) + oqaq->qaq_iunit_sz = qctxt->lqc_iunit_sz; + + /* shrink 
file qunit size */ + while (ilimit < dquot->dq_dqb.dqb_curinodes + + i_limitation) { + do_div(oqaq->qaq_iunit_sz, cqs_factor); + i_limitation = oqaq->qaq_iunit_sz * mdt_num * + shrink_qunit_limit; + if (oqaq->qaq_iunit_sz < qctxt->lqc_cqs_least_iunit) + break; + } + + if (oqaq->qaq_iunit_sz < qctxt->lqc_cqs_least_iunit) + oqaq->qaq_iunit_sz = qctxt->lqc_cqs_least_iunit; + + if (iunit_curr_o != oqaq->qaq_iunit_sz) + QAQ_SET_ADJINO(oqaq); + + } + + if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit && + !dquot->dq_dqb.dqb_ihardlimit && !dquot->dq_dqb.dqb_isoftlimit) { + oqaq->qaq_bunit_sz = 0; + oqaq->qaq_iunit_sz = 0; + QAQ_SET_ADJBLK(oqaq); + QAQ_SET_ADJINO(oqaq); + } + + QAQ_DEBUG(oqaq, "the oqaq computed\n"); + + RETURN(rc); +} + static int mds_init_slave_ilimits(struct obd_device *obd, - struct obd_quotactl *oqctl, int set) + struct obd_quotactl *oqctl, int set, + struct quota_adjust_qunit *oqaq) { /* XXX: for file limits only adjust local now */ + struct obd_device_target *obt = &obd->u.obt; + struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt; unsigned int uid = 0, gid = 0; struct obd_quotactl *ioqc = NULL; int flag; @@ -683,21 +1084,29 @@ static int mds_init_slave_ilimits(struct obd_device *obd, /* if we are going to set zero limit, needn't init slaves */ if (!oqctl->qc_dqblk.dqb_ihardlimit && !oqctl->qc_dqblk.dqb_isoftlimit && - set) + !set) RETURN(0); OBD_ALLOC_PTR(ioqc); if (!ioqc) RETURN(-ENOMEM); - - flag = oqctl->qc_dqblk.dqb_ihardlimit || - oqctl->qc_dqblk.dqb_isoftlimit || set; + + flag = oqctl->qc_dqblk.dqb_ihardlimit || + oqctl->qc_dqblk.dqb_isoftlimit || !set; ioqc->qc_cmd = flag ? Q_INITQUOTA : Q_SETQUOTA; ioqc->qc_id = oqctl->qc_id; ioqc->qc_type = oqctl->qc_type; ioqc->qc_dqblk.dqb_valid = QIF_ILIMITS; ioqc->qc_dqblk.dqb_ihardlimit = flag ? MIN_QLIMIT : 0; + if (QAQ_IS_ADJINO(oqaq)) { + /* adjust the mds slave's inode qunit size */ + rc = quota_adjust_slave_lqs(oqaq, qctxt); + if (rc < 0) + CDEBUG(D_ERROR, "adjust mds slave's inode qunit size \ + failed! (rc:%d)\n", rc); + } + /* set local limit to MIN_QLIMIT */ rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, ioqc); if (rc) @@ -709,9 +1118,15 @@ static int mds_init_slave_ilimits(struct obd_device *obd, else gid = oqctl->qc_id; - rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 0, 0); + rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 0, 0, + NULL); + if (rc == -EDQUOT || rc == -EBUSY) { + CDEBUG(D_QUOTA, "rc: %d.\n", rc); + rc = 0; + } if (rc) { - CERROR("error mds adjust local file quota! (rc:%d)\n", rc); + CDEBUG(D_QUOTA,"error mds adjust local file quota! (rc:%d)\n", + rc); GOTO(out, rc); } /* FIXME initialize all slaves in CMD */ @@ -723,31 +1138,41 @@ out: } static int mds_init_slave_blimits(struct obd_device *obd, - struct obd_quotactl *oqctl, int set) + struct obd_quotactl *oqctl, int set, + struct quota_adjust_qunit *oqaq) { + struct obd_device_target *obt = &obd->u.obt; + struct lustre_quota_ctxt *qctxt = &obt->obt_qctxt; struct mds_obd *mds = &obd->u.mds; struct obd_quotactl *ioqc; unsigned int uid = 0, gid = 0; + int rc, rc1 = 0; int flag; - int rc; ENTRY; /* if we are going to set zero limit, needn't init slaves */ if (!oqctl->qc_dqblk.dqb_bhardlimit && !oqctl->qc_dqblk.dqb_bsoftlimit && - set) + !set) RETURN(0); OBD_ALLOC_PTR(ioqc); if (!ioqc) RETURN(-ENOMEM); - flag = oqctl->qc_dqblk.dqb_bhardlimit || - oqctl->qc_dqblk.dqb_bsoftlimit || set; + flag = oqctl->qc_dqblk.dqb_bhardlimit || + oqctl->qc_dqblk.dqb_bsoftlimit || !set; ioqc->qc_cmd = flag ? 
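
dquot_create_oqaq() above retunes the per-slave qunit geometrically: enlarge by lqc_cqs_qs_factor while the limit dwarfs current usage, shrink as usage closes in on the limit, and clamp the result between the least allowed qunit and the global default. A standalone model of the block branch; all values are in quota blocks and every name is an assumption:

    #include <stdint.h>

    /* Preconditions: factor > 1, qunit > 0, least > 0. */
    static uint64_t tune_qunit(uint64_t limit, uint64_t usage, uint64_t qunit,
                               uint64_t least, uint64_t global,
                               unsigned slaves, unsigned factor,
                               unsigned boundary)
    {
        uint64_t window = qunit * slaves * boundary;

        while (limit > usage + 2 * window) {   /* lots of headroom: enlarge */
            qunit *= factor;
            window = qunit * slaves * boundary;
        }
        if (qunit > global)
            qunit = global;

        while (limit < usage + window && qunit > least) {   /* tight: shrink */
            qunit /= factor;
            window = qunit * slaves * boundary;
        }
        if (qunit < least)
            qunit = least;

        return qunit;
    }
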
Q_INITQUOTA : Q_SETQUOTA; ioqc->qc_id = oqctl->qc_id; ioqc->qc_type = oqctl->qc_type; ioqc->qc_dqblk.dqb_valid = QIF_BLIMITS; ioqc->qc_dqblk.dqb_bhardlimit = flag ? MIN_QLIMIT : 0; + if (QAQ_IS_ADJBLK(oqaq)) { + /* adjust the mds slave's block qunit size */ + rc1 = quota_adjust_slave_lqs(oqaq, qctxt); + if (rc1 < 0) + CERROR("adjust mds slave's block qunit size failed!" + "(rc:%d)\n", rc1); + } rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, ioqc); if (rc) @@ -759,14 +1184,26 @@ static int mds_init_slave_blimits(struct obd_device *obd, else gid = oqctl->qc_id; - rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 1, 0); + /* initialize all slave's limit */ + rc = obd_quotactl(mds->mds_osc_exp, ioqc); + + rc = qctxt_adjust_qunit(obd, &obd->u.obt.obt_qctxt, uid, gid, 1, 0, + NULL); + if (rc == -EDQUOT || rc == -EBUSY) { + CDEBUG(D_QUOTA, "rc: %d.\n", rc); + rc = 0; + } if (rc) { CERROR("error mds adjust local block quota! (rc:%d)\n", rc); GOTO(out, rc); } - /* initialize all slave's limit */ - rc = obd_quotactl(mds->mds_osc_exp, ioqc); + /* adjust all slave's qunit size when setting quota + * this is will create a lqs for every ost, which will present + * certain uid/gid is set quota or not */ + QAQ_SET_ADJBLK(oqaq); + rc = obd_quota_adjust_qunit(mds->mds_osc_exp, oqaq, qctxt); + EXIT; out: OBD_FREE_PTR(ioqc); @@ -776,15 +1213,27 @@ out: int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) { struct mds_obd *mds = &obd->u.mds; + struct lustre_quota_ctxt *qctxt = &mds->mds_obt.obt_qctxt; + struct obd_device *lov_obd = class_exp2obd(mds->mds_osc_exp); + struct lov_obd *lov = &lov_obd->u.lov; + struct quota_adjust_qunit *oqaq = NULL; struct lustre_quota_info *qinfo = &mds->mds_quota_info; - __u32 ihardlimit, isoftlimit, bhardlimit, bsoftlimit; + __u64 ihardlimit, isoftlimit, bhardlimit, bsoftlimit; time_t btime, itime; struct lustre_dquot *dquot; struct obd_dqblk *dqblk = &oqctl->qc_dqblk; - int set, rc; + /* orig_set means if quota was set before; now_set means we are + * setting/cancelling quota */ + int orig_set, now_set; + int rc, rc2 = 0, flag = 0; ENTRY; + OBD_ALLOC_PTR(oqaq); + if (!oqaq) + RETURN(-ENOMEM); down(&mds->mds_qonoff_sem); + init_oqaq(oqaq, qctxt, oqctl->qc_id, oqctl->qc_type); + if (qinfo->qi_files[oqctl->qc_type] == NULL) GOTO(out_sem, rc = -ESRCH); @@ -819,18 +1268,20 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) dquot->dq_dqb.dqb_bhardlimit = dqblk->dqb_bhardlimit; dquot->dq_dqb.dqb_bsoftlimit = dqblk->dqb_bsoftlimit; /* clear usage (limit pool) */ - if (!dquot->dq_dqb.dqb_bhardlimit && + if (!dquot->dq_dqb.dqb_bhardlimit && !dquot->dq_dqb.dqb_bsoftlimit) dquot->dq_dqb.dqb_curspace = 0; /* clear grace time */ - if (!dqblk->dqb_bsoftlimit || + if (!dqblk->dqb_bsoftlimit || toqb(dquot->dq_dqb.dqb_curspace) <= dqblk->dqb_bsoftlimit) dquot->dq_dqb.dqb_btime = 0; /* set grace only if user hasn't provided his own */ else if (!(dqblk->dqb_valid & QIF_BTIME)) - dquot->dq_dqb.dqb_btime = cfs_time_current_sec() + + dquot->dq_dqb.dqb_btime = cfs_time_current_sec() + qinfo->qi_info[dquot->dq_type].dqi_bgrace; + + flag |= LQUOTA_FLAGS_ADJBLK; } if (dqblk->dqb_valid & QIF_ILIMITS) { @@ -847,7 +1298,16 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) else if (!(dqblk->dqb_valid & QIF_ITIME)) dquot->dq_dqb.dqb_itime = cfs_time_current_sec() + qinfo->qi_info[dquot->dq_type].dqi_igrace; + + flag |= LQUOTA_FLAGS_ADJINO; } + QAQ_DEBUG(oqaq, "before dquot_create_oqaq\n"); + rc = dquot_create_oqaq(qctxt, dquot, 
lov->desc.ld_tgt_count, 1, + flag, oqaq); + QAQ_DEBUG(oqaq, "after dquot_create_oqaq\n"); + if (rc < 0) + CDEBUG(D_QUOTA, "adjust qunit size failed! (rc:%d)\n", rc); + rc = fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT); @@ -859,38 +1319,47 @@ int mds_set_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) } up(&mds->mds_qonoff_sem); - if (dqblk->dqb_valid & QIF_ILIMITS) { - set = !(ihardlimit || isoftlimit); - rc = mds_init_slave_ilimits(obd, oqctl, set); + orig_set = ihardlimit || isoftlimit; + now_set = dqblk->dqb_ihardlimit || dqblk->dqb_isoftlimit; + if (dqblk->dqb_valid & QIF_ILIMITS && orig_set != now_set) { + down(&dquot->dq_sem); + dquot->dq_dqb.dqb_curinodes = 0; + up(&dquot->dq_sem); + rc = mds_init_slave_ilimits(obd, oqctl, orig_set, oqaq); if (rc) { CERROR("init slave ilimits failed! (rc:%d)\n", rc); goto revoke_out; } } - if (dqblk->dqb_valid & QIF_BLIMITS) { - set = !(bhardlimit || bsoftlimit); - rc = mds_init_slave_blimits(obd, oqctl, set); + orig_set = bhardlimit || bsoftlimit; + now_set = dqblk->dqb_bhardlimit || dqblk->dqb_bsoftlimit; + if (dqblk->dqb_valid & QIF_BLIMITS && orig_set != now_set) { + down(&dquot->dq_sem); + dquot->dq_dqb.dqb_curspace = 0; + up(&dquot->dq_sem); + rc = mds_init_slave_blimits(obd, oqctl, orig_set, oqaq); if (rc) { CERROR("init slave blimits failed! (rc:%d)\n", rc); goto revoke_out; } } - down(&mds->mds_qonoff_sem); revoke_out: + down(&mds->mds_qonoff_sem); + down(&dquot->dq_sem); if (rc) { /* cancel previous setting */ - down(&dquot->dq_sem); dquot->dq_dqb.dqb_ihardlimit = ihardlimit; dquot->dq_dqb.dqb_isoftlimit = isoftlimit; dquot->dq_dqb.dqb_bhardlimit = bhardlimit; dquot->dq_dqb.dqb_bsoftlimit = bsoftlimit; dquot->dq_dqb.dqb_btime = btime; dquot->dq_dqb.dqb_itime = itime; - fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT); - up(&dquot->dq_sem); } + rc2 = fsfilt_dquot(obd, dquot, QFILE_WR_DQUOT); + up(&dquot->dq_sem); + out: down(&dquot->dq_sem); dquot->dq_status &= ~DQ_STATUS_SET; @@ -899,14 +1368,18 @@ out: EXIT; out_sem: up(&mds->mds_qonoff_sem); - return rc; + + if (oqaq) + OBD_FREE_PTR(oqaq); + + return rc ? rc : rc2; } static int mds_get_space(struct obd_device *obd, struct obd_quotactl *oqctl) { struct obd_quotactl *soqc; struct lvfs_run_ctxt saved; - int rc; + int rc, rc1; ENTRY; OBD_ALLOC_PTR(soqc); @@ -917,26 +1390,29 @@ static int mds_get_space(struct obd_device *obd, struct obd_quotactl *oqctl) soqc->qc_id = oqctl->qc_id; soqc->qc_type = oqctl->qc_type; + /* get block usage from OSS */ + soqc->qc_dqblk.dqb_curspace = 0; rc = obd_quotactl(obd->u.mds.mds_osc_exp, soqc); - if (rc) - GOTO(out, rc); - - oqctl->qc_dqblk.dqb_curspace = soqc->qc_dqblk.dqb_curspace; + if (!rc) { + oqctl->qc_dqblk.dqb_curspace = soqc->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_valid |= QIF_SPACE; + } - push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + /* get block/inode usage from MDS */ soqc->qc_dqblk.dqb_curspace = 0; - rc = fsfilt_quotactl(obd, obd->u.obt.obt_sb, soqc); + soqc->qc_dqblk.dqb_curinodes = 0; + push_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + rc1 = fsfilt_quotactl(obd, obd->u.obt.obt_sb, soqc); pop_ctxt(&saved, &obd->obd_lvfs_ctxt, NULL); + if (!rc1) { + oqctl->qc_dqblk.dqb_curspace += soqc->qc_dqblk.dqb_curspace; + oqctl->qc_dqblk.dqb_curinodes = soqc->qc_dqblk.dqb_curinodes; + oqctl->qc_dqblk.dqb_valid |= QIF_INODES; + } - if (rc) - GOTO(out, rc); - - oqctl->qc_dqblk.dqb_curinodes += soqc->qc_dqblk.dqb_curinodes; - oqctl->qc_dqblk.dqb_curspace += soqc->qc_dqblk.dqb_curspace; - EXIT; -out: OBD_FREE_PTR(soqc); - return rc; + + RETURN(rc ? 
: rc1); } int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) @@ -949,6 +1425,7 @@ int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) ENTRY; down(&mds->mds_qonoff_sem); + dqblk->dqb_valid = 0; if (qinfo->qi_files[oqctl->qc_type] == NULL) GOTO(out, rc = -ESRCH); @@ -963,6 +1440,7 @@ int mds_get_dqblk(struct obd_device *obd, struct obd_quotactl *oqctl) dqblk->dqb_bsoftlimit = dquot->dq_dqb.dqb_bsoftlimit; dqblk->dqb_btime = dquot->dq_dqb.dqb_btime; dqblk->dqb_itime = dquot->dq_dqb.dqb_itime; + dqblk->dqb_valid |= QIF_LIMITS | QIF_TIMES; up(&dquot->dq_sem); lustre_dqput(dquot); @@ -997,7 +1475,7 @@ static int dquot_recovery(struct obd_device *obd, unsigned int id, unsigned short type) { struct mds_obd *mds = &obd->u.mds; - struct lustre_quota_info *qinfo= &obd->u.mds.mds_quota_info; + struct lustre_quota_info *qinfo= &mds->mds_quota_info; struct lustre_dquot *dquot; struct obd_quotactl *qctl; __u64 total_limits = 0; @@ -1030,7 +1508,7 @@ dquot_recovery(struct obd_device *obd, unsigned int id, unsigned short type) qctl->qc_type = type; qctl->qc_id = id; qctl->qc_stat = QUOTA_RECOVERING; - rc = obd_quotactl(obd->u.mds.mds_osc_exp, qctl); + rc = obd_quotactl(mds->mds_osc_exp, qctl); if (rc) GOTO(out, rc); total_limits = qctl->qc_dqblk.dqb_bhardlimit; @@ -1094,7 +1572,7 @@ static int qmaster_recovery_main(void *arg) continue; } CFS_INIT_LIST_HEAD(&id_list); - rc = fsfilt_qids(obd, qinfo->qi_files[type], NULL, type, + rc = fsfilt_qids(obd, qinfo->qi_files[type], NULL, type, &id_list); up(&mds->mds_qonoff_sem); @@ -1119,11 +1597,15 @@ free: int mds_quota_recovery(struct obd_device *obd) { - struct lov_obd *lov = &obd->u.mds.mds_osc_obd->u.lov; + struct mds_obd *mds = &obd->u.mds; + struct lov_obd *lov = &mds->mds_osc_obd->u.lov; struct qmaster_recov_thread_data data; int rc = 0; ENTRY; + if (unlikely(!mds->mds_quota)) + RETURN(rc); + mutex_down(&lov->lov_lock); if (lov->desc.ld_tgt_count != lov->desc.ld_active_tgt_count) { CWARN("Not all osts are active, abort quota recovery\n"); @@ -1142,3 +1624,5 @@ int mds_quota_recovery(struct obd_device *obd) wait_for_completion(&data.comp); RETURN(rc); } + +#endif /* HAVE_QUOTA_SUPPORT */ diff --git a/lustre/tests/Makefile.am b/lustre/tests/Makefile.am index 944bc9b..ebda53f 100644 --- a/lustre/tests/Makefile.am +++ b/lustre/tests/Makefile.am @@ -11,6 +11,7 @@ noinst_SCRIPTS += sanity.sh rundbench acceptance-small.sh compile.sh noinst_SCRIPTS += conf-sanity.sh insanity.sh lfscktest.sh oos.sh oos2.sh noinst_SCRIPTS += llog-test.sh recovery-small.sh replay-dual.sh sanity-quota.sh noinst_SCRIPTS += replay-ost-single.sh replay-single.sh run-llog.sh sanityN.sh +noinst_SCRIPTS += runracer noinst_SCRIPTS += performance-sanity.sh mdsrate-create-small.sh noinst_SCRIPTS += mdsrate-create-large.sh mdsrate-lookup-1dir.sh noinst_SCRIPTS += mdsrate-stat-small.sh mdsrate-stat-large.sh diff --git a/lustre/tests/acceptance-small.sh b/lustre/tests/acceptance-small.sh index ca92df4..9d14d74 100755 --- a/lustre/tests/acceptance-small.sh +++ b/lustre/tests/acceptance-small.sh @@ -23,7 +23,7 @@ fi [ "$DEBUG_OFF" ] || DEBUG_OFF="eval lctl set_param debug=\"$DEBUG_LVL\"" [ "$DEBUG_ON" ] || DEBUG_ON="eval lctl set_param debug=0x33f0484" -export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY" +export TESTSUITE_LIST="RUNTESTS SANITY DBENCH BONNIE IOZONE FSX SANITYN LFSCK LIBLUSTRE 
RACER REPLAY_SINGLE CONF_SANITY RECOVERY_SMALL REPLAY_OST_SINGLE REPLAY_DUAL INSANITY SANITY_QUOTA SANITY_SEC SANITY_GSS PERFORMANCE_SANITY" if [ "$ACC_SM_ONLY" ]; then for O in $TESTSUITE_LIST; do @@ -36,7 +36,6 @@ if [ "$ACC_SM_ONLY" ]; then done fi LFSCK="no" # bug 13698 -SANITY_QUOTA="no" # bug 13058 LIBLUSTRETESTS=${LIBLUSTRETESTS:-../liblustre/tests} @@ -59,8 +58,21 @@ FORMAT=${FORMAT:-formatall} CLEANUP=${CLEANUP:-stopall} setup_if_needed() { - mount | grep $MOUNT && return - $FORMAT && $SETUP + local MOUNTED=$(mounted_lustre_filesystems) + if $(echo $MOUNTED | grep -w -q $MOUNT); then + check_config $MOUNT + return + fi + + echo "Lustre is not mounted, trying to do setup SETUP=$SETUP ... " + [ "$REFORMAT" ] && $FORMAT + $SETUP + + MOUNTED=$(mounted_lustre_filesystems) + if ! $(echo $MOUNTED | grep -w -q $MOUNT); then + echo "Lustre is not mounted after setup! SETUP=$SETUP" + exit 1 + fi } title() { @@ -281,7 +293,7 @@ for NAME in $CONFIGS; do mount_client $MOUNT2 #echo "can't mount2 for '$NAME', skipping sanityN.sh" START=: CLEAN=: bash sanityN.sh - umount $MOUNT2 + [ "$(mount | grep $MOUNT2)" ] && umount $MOUNT2 $DEBUG_ON $CLEANUP @@ -321,7 +333,20 @@ for NAME in $CONFIGS; do LIBLUSTRE="done" fi - $CLEANUP + [ "$RACER" != "no" ] && [ -n "$CLIENTS" -a "$PDSH" = "no_dsh" ] && log "Remote client with no_dsh" && RACER=no + if [ "$RACER" != "no" ]; then + title racer + setup_if_needed + DURATION=${DURATION:-900} + [ "$SLOW" = "no" ] && DURATION=300 + RACERCLIENTS=$HOSTNAME + [ ! -z ${CLIENTS} ] && RACERCLIENTS=$CLIENTS + log "racer on clients: $RACERCLIENTS DURATION=$DURATION" + CLIENTS=${RACERCLIENTS} DURATION=$DURATION bash runracer + $CLEANUP + $SETUP + RACER="done" + fi done [ "$REPLAY_SINGLE" != "no" ] && skip_remmds replay-single && REPLAY_SINGLE=no && MSKIPPED=1 diff --git a/lustre/tests/cfg/insanity-lmv.sh b/lustre/tests/cfg/insanity-lmv.sh index 99a3ccb..9ef06ad 100644 --- a/lustre/tests/cfg/insanity-lmv.sh +++ b/lustre/tests/cfg/insanity-lmv.sh @@ -59,6 +59,10 @@ MOUNTOPT="" MKFSOPT=$MKFSOPT" -i $MDSISIZE" [ "x$MKFSOPT" != "x" ] && MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$SECLEVEL" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param mdt.sec_level=$SECLEVEL" +[ "x$MDSCAPA" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param mdt.capa=$MDSCAPA" [ "x$mdsfailover_HOST" != "x" ] && MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`" [ "x$STRIPE_BYTES" != "x" ] && @@ -76,6 +80,10 @@ MOUNTOPT="" MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE" [ "x$MKFSOPT" != "x" ] && MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$SECLEVEL" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param ost.sec_level=$SECLEVEL" +[ "x$OSSCAPA" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param ost.capa=$OSSCAPA" [ "x$ostfailover_HOST" != "x" ] && MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`" OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $OSTOPT" diff --git a/lustre/tests/cfg/lmv.sh b/lustre/tests/cfg/lmv.sh index 10ba95f..3b573bc 100644 --- a/lustre/tests/cfg/lmv.sh +++ b/lustre/tests/cfg/lmv.sh @@ -29,6 +29,7 @@ TMP=${TMP:-/tmp} MDSDEV=${MDSDEV:-$TMP/${FSNAME}-mdt1} MDSCOUNT=${MDSCOUNT:-3} test $MDSCOUNT -gt 4 && MDSCOUNT=4 +MDSCOUNT=1 MDSDEVBASE=${MDSDEVBASE:-$TMP/${FSNAME}-mdt} MDSSIZE=${MDSSIZE:-100000} @@ -69,8 +70,10 @@ MOUNTOPT="" MKFSOPT=$MKFSOPT" -i $MDSISIZE" [ "x$MKFSOPT" != "x" ] && MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$SECLEVEL" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param mdt.sec_level=$SECLEVEL" [ "x$MDSCAPA" != "x" ] && - 
MKFSOPT="--param mdt.capa=$MDSCAPA" + MOUNTOPT=$MOUNTOPT" --param mdt.capa=$MDSCAPA" [ "x$mdsfailover_HOST" != "x" ] && MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`" [ "x$STRIPE_BYTES" != "x" ] && @@ -88,8 +91,10 @@ MOUNTOPT="" MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE" [ "x$MKFSOPT" != "x" ] && MKFSOPT="--mkfsoptions=\"$MKFSOPT\"" +[ "x$SECLEVEL" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param ost.sec_level=$SECLEVEL" [ "x$OSSCAPA" != "x" ] && - MKFSOPT="--param ost.capa=$OSSCAPA" + MOUNTOPT=$MOUNTOPT" --param ost.capa=$OSSCAPA" [ "x$ostfailover_HOST" != "x" ] && MOUNTOPT=$MOUNTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`" OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $MOUNTOPT $OSTOPT" diff --git a/lustre/tests/cfg/local.sh b/lustre/tests/cfg/local.sh index f958d58..6422b79 100644 --- a/lustre/tests/cfg/local.sh +++ b/lustre/tests/cfg/local.sh @@ -50,8 +50,10 @@ MKFSOPT="" MKFSOPT=$MKFSOPT" -i $MDSISIZE" [ "x$MKFSOPT" != "x" ] && MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\"" +[ "x$SECLEVEL" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param mdt.sec_level=$SECLEVEL" [ "x$MDSCAPA" != "x" ] && - MKFSOPT="--param mdt.capa=$MDSCAPA" + MOUNTOPT=$MOUNTOPT" --param mdt.capa=$MDSCAPA" [ "x$mdsfailover_HOST" != "x" ] && MDSOPT=$MDSOPT" --failnode=`h2$NETTYPE $mdsfailover_HOST`" [ "x$STRIPE_BYTES" != "x" ] && @@ -69,8 +71,10 @@ MKFSOPT="" MKFSOPT=$MKFSOPT" -J size=$OSTJOURNALSIZE" [ "x$MKFSOPT" != "x" ] && MKFSOPT="--mkfsoptions=\\\"$MKFSOPT\\\"" +[ "x$SECLEVEL" != "x" ] && + MOUNTOPT=$MOUNTOPT" --param ost.sec_level=$SECLEVEL" [ "x$OSSCAPA" != "x" ] && - MKFSOPT="--param ost.capa=$OSSCAPA" + MOUNTOPT=$MOUNTOPT" --param ost.capa=$OSSCAPA" [ "x$ostfailover_HOST" != "x" ] && OSTOPT=$OSTOPT" --failnode=`h2$NETTYPE $ostfailover_HOST`" OST_MKFS_OPTS="--ost --fsname=$FSNAME --device-size=$OSTSIZE --mgsnode=$MGSNID --param sys.timeout=$TIMEOUT $MKFSOPT $OSTOPT $OST_MKFS_OPTS" @@ -79,6 +83,7 @@ MDS_MOUNT_OPTS=${MDS_MOUNT_OPTS:-"-o loop,user_xattr,acl"} OST_MOUNT_OPTS=${OST_MOUNT_OPTS:-"-o loop"} #client +MOUNTOPT="" MOUNT=${MOUNT:-/mnt/${FSNAME}} MOUNT1=${MOUNT1:-$MOUNT} MOUNT2=${MOUNT2:-${MOUNT}2} diff --git a/lustre/tests/conf-sanity.sh b/lustre/tests/conf-sanity.sh index d6ddc29..913f695 100644 --- a/lustre/tests/conf-sanity.sh +++ b/lustre/tests/conf-sanity.sh @@ -11,8 +11,8 @@ set -e ONLY=${ONLY:-"$*"} -# bug number for skipped test: 13739 -HEAD_EXCEPT=" 32a 32b " +# bug number for skipped test: 13739 +HEAD_EXCEPT=" 32a 32b" # bug number for skipped test: ALWAYS_EXCEPT=" $CONF_SANITY_EXCEPT $HEAD_EXCEPT" @@ -41,7 +41,7 @@ remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 # -[ "$SLOW" = "no" ] && EXCEPT_SLOW="0 1 2 3 6 7 15 18 24b 25 30 31 32 33 34a " +[ "$SLOW" = "no" ] && EXCEPT_SLOW="0 1 2 3 6 7 15 18 24b 25 30 31 32 33 34a 45" assert_DIR @@ -129,8 +129,12 @@ umount_client() { } manual_umount_client(){ + local rc + local FORCE=$1 echo "manual umount lustre on ${MOUNT}...." - do_facet client "umount -d $MOUNT" + do_facet client "umount -d ${FORCE} $MOUNT" + rc=$? 
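+ # capture umount's exit status immediately so the function can return it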
+ return $rc } setup() { @@ -179,12 +183,12 @@ if [ "$ONLY" == "cleanup" ]; then exit fi +init_gss + #create single point mountpoint gen_config -init_gss - test_0() { setup check_mount || return 41 @@ -370,7 +374,9 @@ test_9() { do_facet ost1 lctl set_param subsystem_debug=\'mds ost\' || return 1 CHECK_PTLDEBUG="`do_facet ost1 lctl get_param -n debug`" - if [ "$CHECK_PTLDEBUG" ] && [ "$CHECK_PTLDEBUG" = "trace inode" ];then + if [ "$CHECK_PTLDEBUG" ] && { \ + [ "$CHECK_PTLDEBUG" = "trace inode warning error emerg console" ] || + [ "$CHECK_PTLDEBUG" = "trace inode" ]; }; then echo "lnet.debug success" else echo "lnet.debug: want 'trace inode', have '$CHECK_PTLDEBUG'" @@ -559,17 +565,13 @@ test_21c() { stop_ost stop_ost2 stop_mds + #writeconf to remove all ost2 traces for subsequent tests + writeconf } run_test 21c "start mds between two osts, stop mds last" test_22() { - #reformat to remove all logs - reformat start_mds - echo Client mount before any osts are in the logs - mount_client $MOUNT - check_mount && return 41 - pass echo Client mount with ost in logs, but none running start_ost @@ -901,6 +903,7 @@ test_29() { writeconf start_mds start_ost + sleep 5 cleanup } run_test 29 "permanently remove an OST" @@ -1016,26 +1019,27 @@ test_32b() { [ -z "$TUNEFS" ] && skip "No tunefs" && return local DISK1_8=$LUSTRE/tests/disk1_8.tgz [ ! -r $DISK1_8 ] && skip "Cannot find $DISK1_8" && return 0 - mkdir -p $TMP/$tdir - tar xjvf $DISK1_8 -C $TMP/$tdir || \ + local tmpdir=$TMP/$tdir + mkdir -p $tmpdir + tar xjvf $DISK1_8 -C $tmpdir || \ { skip "Cannot untar $DISK1_8" && return ; } load_modules lctl set_param debug=$PTLDEBUG - NEWNAME=sofia + NEWNAME=lustre # writeconf will cause servers to register with their current nids $TUNEFS --writeconf --fsname=$NEWNAME $tmpdir/mds || error "tunefs failed" - start mds $tmpdir/mds "-o loop" || return 3 + start mds1 $tmpdir/mds "-o loop" || return 3 local UUID=$(lctl get_param -n mdt.${NEWNAME}-MDT0000.uuid) echo MDS uuid $UUID - [ "$UUID" == "mdsA_UUID" ] || error "UUID is wrong: $UUID" + [ "$UUID" == "${NEWNAME}-MDT0000_UUID" ] || error "UUID is wrong: $UUID" - $TUNEFS --mgsnode=`hostname` --fsname=$NEWNAME --writeconf $tmpdir/ost1 || error "tunefs failed" + $TUNEFS --mgsnode=`hostname` --writeconf --fsname=$NEWNAME $tmpdir/ost1 || error "tunefs failed" start ost1 $tmpdir/ost1 "-o loop" || return 5 UUID=$(lctl get_param -n obdfilter.${NEWNAME}-OST0000.uuid) echo OST uuid $UUID - [ "$UUID" == "ost1_UUID" ] || error "UUID is wrong: $UUID" + [ "$UUID" == "${NEWNAME}-OST0000_UUID" ] || error "UUID is wrong: $UUID" echo "OSC changes should succeed:" $LCTL conf_param ${NEWNAME}-OST0000.osc.max_dirty_mb=15 || return 7 @@ -1053,7 +1057,7 @@ test_32b() { mount_client $MOUNT FSNAME=$OLDFS set_and_check client "lctl get_param -n mdc.*.max_rpcs_in_flight" "${NEWNAME}-MDT0000.mdc.max_rpcs_in_flight" || return 11 - [ "$(cksum $MOUNT/passwd | cut -d' ' -f 1,2)" == "2479747619 779" ] || return 12 + [ "$(cksum $MOUNT/passwd | cut -d' ' -f 1,2)" == "94306271 1478" ] || return 12 echo "ok." 
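# (the cksum/size pair checked above matches the passwd file shipped in the
# disk1_8.tgz test image mounted by this test)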
cleanup @@ -1415,7 +1419,27 @@ run_test 42 "invalid config param should not prevent client from mounting" umount_client $MOUNT cleanup_nocli -cleanup_gss +test_45() { #17310 + setup + check_mount || return 2 + stop_mds + df -h $MOUNT & + log "sleep 60 sec" + sleep 60 +#define OBD_FAIL_PTLRPC_LONG_UNLINK 0x50f + do_facet client "lctl set_param fail_loc=0x50f" + log "sleep 10 sec" + sleep 10 + manual_umount_client --force || return 3 + do_facet client "lctl set_param fail_loc=0x0" + start_mds + mount_client $MOUNT || return 4 + cleanup + return 0 +} +run_test 45 "long unlink handling in ptlrpcd" + +cleanup_gss equals_msg `basename $0`: test complete -[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true +[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/tests/createmany.c b/lustre/tests/createmany.c index 3ae06cb..a937fc5 100644 --- a/lustre/tests/createmany.c +++ b/lustre/tests/createmany.c @@ -43,69 +43,103 @@ #include #include #include +#include -void usage(char *prog) +static void usage(char *prog) { - printf("usage: %s {-o|-m|-d|-l} filenamefmt count\n", prog); - printf(" %s {-o|-m|-d|-l} filenamefmt -seconds\n", prog); - printf(" %s {-o|-m|-d|-l} filenamefmt start count\n", prog); + printf("usage: %s {-o|-m|-d|-l} [-r altpath ] filenamefmt count\n", prog); + printf(" %s {-o|-m|-d|-l} [-r altpath ] filenamefmt ] -seconds\n", prog); + printf(" %s {-o|-m|-d|-l} [-r altpath ] filenamefmt start count\n", prog); + exit(EXIT_FAILURE); } -int main(int argc, char ** argv) +static char *get_file_name(const char *fmt, long n, int has_fmt_spec) { - int i, rc = 0, do_open = 0, do_link = 0, do_mkdir = 0; - char format[4096], *fmt, *tgt = NULL; - char filename[4096]; - long start, last, end; - long begin = 0, count; + static char filename[4096]; + int bytes; - if (argc < 4 || argc > 5) { - usage(argv[0]); - return 1; + bytes = has_fmt_spec ? 
snprintf(filename, 4095, fmt, n) : + snprintf(filename, 4095, "%s%ld", fmt, n); + if (bytes >= 4095) { + printf("file name too long\n"); + exit(EXIT_FAILURE); } + return filename; +} - if (strcmp(argv[1], "-d") == 0) { - do_mkdir = 1; - } else if (strcmp(argv[1], "-o") == 0) { - do_open = 1; - } else if (strncmp(argv[1], "-l", 2) == 0 && argv[1][2]) { - tgt = argv[1] + 2; - do_link = 1; - } else if (strcmp(argv[1], "-m") != 0) { - usage(argv[0]); - return 1; +int main(int argc, char ** argv) +{ + long i; + int rc = 0, do_open = 0, do_link = 0, do_mkdir = 0; + int do_unlink = 0, do_mknod = 0; + char *filename; + char *fmt = NULL, *fmt_unlink = NULL, *tgt = NULL; + long start, last, end = ~0UL >> 1; + long begin = 0, count = ~0UL >> 1; + int c, has_fmt_spec = 0, unlink_has_fmt_spec = 0; + + /* Handle the last argument in form of "-seconds" */ + if (argc > 1 && argv[argc - 1][0] == '-') { + char *endp; + + argc--; + end = strtol(argv[argc] + 1, &endp, 0); + if (end <= 0 || *endp != '\0') + usage(argv[0]); + end = end + time(NULL); } - if (strlen(argv[2]) > 4080) { - printf("name too long\n"); - return 1; + while ((c = getopt(argc, argv, "omdl:r:")) != -1) { + switch(c) { + case 'o': + do_open++; + break; + case 'm': + do_mknod++; + break; + case 'd': + do_mkdir++; + break; + case 'l': + do_link++; + tgt = optarg; + break; + case 'r': + do_unlink++; + fmt_unlink = optarg; + break; + case '?': + printf("Unknown option '%c'\n", optopt); + usage(argv[0]); + } } - start = last = time(0); + if (do_open + do_mkdir + do_link + do_mknod != 1 || + do_unlink > 1) + usage(argv[0]); - if (argc == 4) { - end = strtol(argv[3], NULL, 0); - } else { - begin = strtol(argv[3], NULL, 0); - end = strtol(argv[4], NULL, 0); + switch (argc - optind) { + case 3: + begin = strtol(argv[argc - 2], NULL, 0); + case 2: + count = strtol(argv[argc - 1], NULL, 0); + if (end != ~0UL >> 1) + usage(argv[0]); + case 1: + fmt = argv[optind]; + break; + default: + usage(argv[0]); } - if (end > 0) { - count = end; - end = -1UL >> 1; - } else { - end = start - end; - count = -1UL >> 1; - } + start = last = time(NULL); - if (strchr(argv[2], '%')) - fmt = argv[2]; - else { - sprintf(format, "%s%%d", argv[2]); - fmt = format; - } - for (i = 0; i < count && time(0) < end; i++, begin++) { - sprintf(filename, fmt, begin); + has_fmt_spec = strchr(fmt, '%') != NULL; + if (do_unlink) + unlink_has_fmt_spec = strchr(fmt_unlink, '%') != NULL; + + for (i = 0; i < count && time(NULL) < end; i++, begin++) { + filename = get_file_name(fmt, begin, has_fmt_spec); if (do_open) { int fd = open(filename, O_CREAT|O_RDWR, 0644); if (fd < 0) { @@ -140,14 +174,27 @@ int main(int argc, char ** argv) break; } } + if (do_unlink) { + filename = get_file_name(fmt_unlink, begin, + unlink_has_fmt_spec); + rc = do_mkdir ? rmdir(filename) : unlink(filename); + if (rc) { + printf("unlink(%s) error: %s\n", + filename, strerror(errno)); + rc = errno; + break; + } + } + if ((i % 10000) == 0) { - printf(" - created %d (time %ld total %ld last %ld)\n", + printf(" - created %ld (time %ld total %ld last %ld)\n", i, time(0), time(0) - start, time(0) - last); - last = time(0); + last = time(NULL); } } - printf("total: %d creates in %ld seconds: %f creates/second\n", i, - time(0) - start, ((float)i / (time(0) - start))); + printf("total: %ld creates%s in %ld seconds: %f creates/second\n", i, + do_unlink ? 
"/deletions" : "", + time(NULL) - start, ((float)i / (time(0) - start))); return rc; } diff --git a/lustre/tests/createtest.c b/lustre/tests/createtest.c index 6f7ec0d..1e0c112 100644 --- a/lustre/tests/createtest.c +++ b/lustre/tests/createtest.c @@ -67,7 +67,7 @@ int main(int argc, char *argv[]) int mode = i | 0644; int rc; - sprintf(name, "%s-mknod%06o", argv[1], mode); + sprintf(name, "%s-mknod%07o", argv[1], mode); rc = mknod(name, mode, 0x1234); switch (i) { case 0: @@ -116,7 +116,7 @@ int main(int argc, char *argv[]) int rc; mode = i | 0644; - sprintf(name, "%s-creat%06o", argv[1], mode); + sprintf(name, "%s-creat%07o", argv[1], mode); fd = open(name, O_CREAT|O_RDONLY, mode); if (fd < 0) { fprintf(stderr, "%s: ERROR creat %s: %s\n", diff --git a/lustre/tests/disk1_8.tgz b/lustre/tests/disk1_8.tgz new file mode 100644 index 0000000000000000000000000000000000000000..1657c1e948c54abc39259d9f2a1f34ddbfa62c77 GIT binary patch literal 10506 zcma)iWn5d$_AV{7g%aGgg+P$v1lQv379e;i65Oo?iaWvG-6^icy~W*%J4K2V+TQd% z_kYg0U+%ptlbL5`p7pFXvuA(Vzmd?l65xCb0ck=$-2*e9c@zKnf7&Kk%=dsaeOUYH z*A`NyrG|M_MlLLd`6U#TCvxRw%;k)A;%amUB(VWQwNK(uP|#Q-ym%l4-~cr&0E)I< zVw_r>0SqmiNE{dADI+2Yg^_^NVp3ZH5fWND(g>cU&YdJj`HxK^;qX6Q)a6|8NO4p; z82mYnjQYd5w)R_kpnCD!JPuG);(BqMw}^Uzq|!%a6mnE?@QjGsviRhTq|I}G95pV~ zx5Y~0SXd}iabOgbNh=gonSi+Gff_|(@28WPt*oq%{swrGmy3e(+#iXokpF)h*Iz`Q zgg(qk{ydsApg`OL5Tf+jiVF}Dt7e7;#ySe^)Vk&Ypw9r&KOE(Ii*u2_HY2yd0=yo! zV<;;CMTuKkSXj-VJY%AxMM=Oy>+j{3FD@xiMitNYPq(^eIPwQ$gY=N(zh3`eQXy(*PG%6e`vMDDkhv6z7kTUJ%2nDv^tXQVi~KNjfJ7&zi^4R0fvB;q=@h%n zf%0}-2(urG+1O$S7io-KTDRCE+!Hn!{q?o>#O_S5Jz}howW1W;dn~thwMoB%m>F6@ zq!U$|)fVy$Gsby@$_Z$)mEPP|sl8SEDYKO`qFkHft10NxOeHOR!YM97OVzEb%?>sM zY&vXRzMFoZvorQ~3ZcOyUgkYH`MGwOt5;y)})Y&K>iV>;7De+I+vbKs#ZrT4cQ2`G!Gq?67tI!by^BJ zI7L(@GsQ0sGf51=+5U=`$yrH4>8G>5S2p?v@5L5ktEkbkDTdy(%Z|u9N&kwxWWJs9 zxz6l2ej*EQl?7%Er;t#*pllf!(j~vVLQrZn4sU&{R+WD{Sx?oDsiQJE#==TC>c0I( zJ5-0O@M6zrg(4uaPDj}Bv%W^0YO>aqU(>Fg0fvB4*>#O3i#Pv;#`#RnN3OC2+nM|J zZ<+LSduLFPi5n&f|FHW!A-_TZ6-=^F>#Oo~$KeAMT=}N-&=|^Vtj{x;ahYM#3@Ed& zr+4|zj+#&r856cldvKsB@llUZR@R6uIN8@hqQYT3$D00*x!GJb|{9M6Y<#Iogte(uK$ZqbirEL_rl7Z^eL<$Fl&`r@(%&qT6~^K)Qf*jn)z&q8PX>!y zl-n_W>2}m86YNmx6Rxgd3c_Y6=yF}Ygq~B_e-}bf;$yySk%)dBe|K+M1?O zUJ#FvDybTIDGbukRr$)l>Bm{jtuuP`Nt@7ly@zTJM#_|{q(#RRZRPA8`3 ze)=6Uz<~GP|o* zslFZtDhl-Ov26R4pEFf<=Tj8SuWVxttnmd=Y;RKZV9is_v)T5q{8;P~-(YyUN;k$n67hDYDdzB$tNrDL07aBGFFvi99l zrQNK##2FCh6Fk)}=Fi6u#yO!$j>GcJ#L@7}w%iB_mBg7si4$nJHU3O;WqsRum5Dp& z3gxZYr|f2{fganUz(Z&XY{?y(I>TZCn92}B<$tGiF-uZn<}NGY{tm@)Pi;627|n~g z?qpZ)D1z_W%-O8WmdN8vjo3F`|cd%JmpGn;=^p5;$`@ummO%;30~X)8!y(@c&p|`(5Op= zxmtM8#ssu1ugrzg1j~UgGWXE>V}%&V$i9*>cz_+hp0Vo^K-QJhDFpZN6?P^L*HtdP z@y5ML&~W*hy}0HZPDnYUclU|SDdqQ9rL&sqij#}V=WOM5%4-`06mcN|z5V+_gmYnL zpdYSfg$myjh_+nxbd9Qb*u|FjR^D)Z0apko=u|6|yHRx7x_t!jkMs^U${Tr4PnWpv zEX!)7G3gvH=qE*{CX~9~dN?$%VPgADwejv%YnNavkm-yuLtu?bBhaKh)?1jvPMR)e z@b1~tOjbg$`&_0feMR-Id*%1&Ap)A@R6tH# zPMnkw>bGVp#DtZ@o4uQx-)c*q`RDwS8=>1wua^ADMJ)>yg*B zs9wjR=Z^1QU@<$audi=($~{d^EXrjJYfWq@V*N}gof)eR)Gai&92^`RpP;j8YHPGv z;O>ksjwS~We(aRi2kS(nzfstBU9D*H4$^Ky+TM$orq&7uy{Y;dDHSFTS3cbOV5wd)-}u8${_Zfg6V|f&Tgww1kE`2`tNRg39DSI7QQOILQqvH?|MdQu{X6`xa$*`n3g=FS zz!K1Cp5q45pi3Yt3}mYm#nJl@r5zth`CEcg?P<{A7RV{&wB)F@S)7 z9QFnW`;iJRSi@m)$QE6Kg01au5F`hw*G}+H)B5xucLHQMq@MUcJOX6sfK+7oVF}#D zrN6q1eE<&RJ|OFM77k?H2Vnar>pEmI;dH5{Atb!KymU0^auT?I{r;+$1}{L}k=c^` zlh|K@@g8(Z^M9G?rnr=p(-S-bmZk+nj=@oMGMyo zwJIeak)zTGQ74U4`Z&jxZzKPnaJWL9<+bmXhy2D*F~v0x=+_Go2J_7}TUh2;y7aVG z{ozR?g!%BfL`-^GKwP*Nt%m8*&jR7ovkyMkzEz*@I`XIJGIr6mV8jr;VZrqjGRx4w zB?EZtPQ^6IWJ7#jpdys{ACTTxqJ>*T@K)C82pyS} 
zu#8r)ZE@OnNPW%6w=^@Z2_lNCxnhA`$YINz3zw>2j8O20`bhbEl<%R!4xY(8t{yUD-$y!rja3O-lwO5-5H>M? zuMV!J)(dm9sW`*!_)ln*T7OYQNyCSN+CM^OZ!au72D(5S_Zz zZ6^`O@i|}8u3YLY!)ZW}Pimzw4LqW0iS70&b6?<#?rUdShCo^cso$zWQc_aV7%!#N zsl%_Ysndd~tKqXV73C3Rf*6Op)ZH)=+b=mqvjyuisneERnq(KZ%P55?;wS|EbpH7M z7~fy<`{n%z*|dLA@_>~*cFJqO-txejvIcK5_N!m=yKRSeQ(W>IC)Gcmrl8-(V11{= zycILJ<7YtIG^7arKyvCwa_JaD*4Ld8_CfZC6Z^^U^Y2?ZySJ1EQBL+eL;Sa|w{s4M zTNxhgUtNr}$};|`d0cH)9wSX%$OF3J-%OvcVAuCUy=Vpt4yc0$>WUFsCAaVxQwf0{)_ON)`ZIwTHI$-UBt`mG* zq!KbgO9E~3bP4tddAh&F4ecMWF#WIY-bAKH!99x(y^RVWaRLpZ|0E z-#llgzZzK8Z(Oe3Y9{Bcre;^j`5%cb(!zc{R&aa*a+)){6)U8isRvxV8;)jDs&Cax z*yTG{F3>csU0ZELI)kqF!kd&&k*L|_`d^{4ME)IZ8UNp*{y%|o?|+E=h5tKud)Eo@ z8t+fu44FO^>8gT)79_bS?JuVbt~ zf_#QAbNs(-kB#})tD1YIkw&9Fl2;cCP)Qiq zjUS+4`$;3in?D$78t!`ua*tHMFHvT$)*d4O$=9H%Pj79iU}cREf>19<#pk!It6oA*VV^j^21g+#iL^N`6KOLKX`ZyZLs?eNITVb zMvYt&yx2f&ZH!#3i^XnxF5BcJ<{Yb-8TS7~?Lcfnsr7QUz>D=AQ}4|tBCUQ`nmyR* z4Kxa(Re>UA+{c(B2WvK_%-wKZ{EGkB46|Q?x!Lb+? zo-b!@^;ArGnr2}4p+udG&9MI$zJhm^D4MBb%PwU^Kp0bmf1R0Y^@RB*)tG%<`>%D$ zJ>C*}2Yq!_rBVH!@%BKADN~%yN?GAzcReXb!~{se%!YaR7MZ;yph+BO)Hu? zh+iao!V`_|DLt8=uYdRw`I1;wv)NWoI|mC1>i{AK@gc#oQLXN01tNB&KE*sTfXX_H zQMQt-&6+AyqQ|w+Jb_y@HKlcl&m5G*oy*^-u>7(HQAKY)b`RYNQW3t0jTo|~Z(&!-~Y)9HnPahR=tvkDnis9{0`53WoA!Y>8=7J&ZhxF;EKYx+~^=K0Q&@ z6}$YEfB7EmF*tOJX^miHwO-Xzku`I8XPO^3?JEQB4ftpRyQX5gVq6gV!bK7*N4{{+ zB*k8j7v5({cXb)+4-ld#u&qfZi)FJ~>(6=ayVdvM1B5psr*wI9HWnn<8hT*LR|I^= zt@xW*mOHIRMPPv-rLB`*FT08l!f6@m-J+jyboawMWzt>Db-M-o+84~jKuf(4N z@?yn1@p2`fm=kL@lYSuHr4YDQQlcu1d|fGrE-=ufR+;-+g?;z%E{ z7Rclp2h}Uo=jRwY@vnv)oz(pnrM7=ujjx5X@mpj-BAiZ=)fJf36%xx7GTk656LVwL zD~6bSq&`)*TMiRp^U8|RdP&-(2*(z_l0=($N&=brL5k`xgGb;5iLIHEj7Sx_gq!?J#x4?8ge@`Ot|SX;jNXQ@9F{aEQiU- zXwf_#{5Grhf>qIRQ{WWBhJ@fAR^gK|K26&-GkA#Rn}RHMx`Qvhmi#66sC{(cRQsl8 zny}uTUFYVKPtLGx8z<0$IZhna<%=A?Liei3vF>d@{i~7_MxP9wLspqen<3x#lYP(U zdhLSCQ=WtP9qp9U_2$5J;JL=)ktiiIb0DeSNL1=b2x%XwISVW*b&xY^&>R4PB}2Fd z^~ORH{CeQLqI_i;{ zG+q%l3fS~MCBnb?$@}3im>T9EH5z5g?9#+;azZI4$oWeJo!TH~q3Le4u{1)Ce)N+q zMAF|89~uUUjjedY-}gii@l~oi`(294*LnA$rl-2AVoQvOQf5vb2Y>r>?s^3qoQ{3Z zq9`q@RIzSh6n7QUyt3|o>teC&mTGM@m0zo*@>~07=$gS;`dRcN!5~%KsigR?AFlxo3LGSDD#Ic^6Ijxnjj0y+O{v-)$Dv zt?wEks_B;MNy(<(Ay#fr;%+fQSZaR?eXE*T+4A%J6Cuj+AX3<`B}~>T5_A;L62st| z-pXcj<8tUjwtn>jq5iJEJ!5$_wELFNzM`AQb7o<-=u0!1X)+lppKpx58|y9ecll+u zESK{3C$rk~I7=$64*>m+*L!14Q86}pr_b1=4&+Y}ht1VpYnU4JLn_Y-)#M(lBRy(C z|ecVrcrV6M_xc0qwBbpS>kI6}ef1 z-yVQR>)5BsA<`*$vll;si)<3V=QV?OK-IRh2Ub0!5Yi7vUJZAl3Zk$9?WqzHxBzC) znsPEr2T-T7-W;LdXb#eIwwE`=|L|7QZ^S^>-XE~oC z1F>WRmfi=h4is4{_rr`;Oiq7RjLU{4af(dLDeh6W#z2 zU5!H{h5NN``qz|e=;x7cA3^gd{t=#64%5`)UJBbf_pyqEsbPdm$ZiOMy|XpXR}53iwOO!W<830 z(Knkb?YrQsz=R2gXEDTdi7ZkvM58Kb`=Em(@60H!NJ$ILA%vO8JE)*@f?h&wIX-wP zPq6=|i9^imeeGcul}3&WD$e_AXQqz#P+;J0B|~VfaM~C-J6@6?uyNHvPta0Rt86GS zd<^S!gUvfHjbZz;MRRDIaso@5?mPy*d%+NBNuD=Zd#F0Vf^dK=hr4D;xD0v|XDKV| z=yL(N!~s2sURS&+Kqnpk<(zwQI|1O z1=PPxU$j7bA-Kt)Wtyx3QwY}sakMGP=~~`E#xS@-ui;z#UL!V_kwpZd^I6ZyLC2%yHKTUmKJ)XI>O5-o<#h;&L_!M$#UIhV*PZ$HD+tOXST7ktHJcTqoRQ#z_#+Xkd6qEyJoIv2p{_$UGpy8BD?jpl< z)sA$jJ^{PJDWcRg%k8H*ug8_DIdYiZ9gr~$gH5awPmepm^l572UU;PR`$ZrvOu!&U z@W9q3izwPbNS{p;82lUF6CNv3lapF&QD5J^0OwQwQLz?6lF96GrPI^a@aWx%YZ;p9 ziM~8QCh2J6P<8t-_Z>D;!93r`JSWMeKV3Wr*C}6Z*8W4Q*O}i_o(tF|j`Un-opn-`sL`tq`gvc!2)Bb&b@e3+*hJHs zkWAAbD={V~lF)ClMUIdUv5BY^mTK#wWzZapyuPXT`|)x3`rP`J$oiJ^*`MVXpPE*( zk}cWfTFZt+4Lcy6Ha~)EyZ;5(g=k^>GoACGsDRl(m2_K zWUyCoHb64e%bq3FF8yO|3i-H0P1kV)X>*ZQu8G)p@Oa8cTMotBf_)d08i9KQ&y9*_g4^{!Ux%7n3nP z?we3KumJ--AAA9lC)1%l(o#8~kNFvBA`aF;l(DBz~E{p-a6z|2Khls?{Kd8 zx_EFvbkMXVS~;E50E;3dsR-_>J^#j!9HOa?F#XX}MZ4U18@S9HURPwTAIj0|JybTQ 
z;shbcZj0@&%e_fH;-I>|H7H%vtuNcVfMu{T-YNQB67M)SDTT&GE$JuE*49`XncFO0 zl>FpNqGwoOcu&qyrM4QUzKtUz{4HP&P~i8T{bzss?E^aH3yUT69PV{*j9@{&m0`={GfxIOO9TNS~|J#Ep$d)n#~Nt(Z2a`!I^ zE&mu0k+@4qZTrbvWgKPg^xl$rVL1wo9!tcG;iouJxs@nu-V3(}U z-IsxHkH)^g=&OAj+%Dz^R$`x>GxTR=-o~DfTyy}xtKWp(T!N*Ky|yxOZp4BfN_xH6 zzjpz@wBZOJiu42tHn73v6Qv3mN8#GD)fh06oPB^Wftn50;uPjGQBPoZ#mnBV6oz`2 z5SFc|vS|K3oi3hRR)1-I2`2QP4jjdV#ZI8q8M?-*U3vv&|61ZFbNBv+0nQ*HX>~1~ z*XPjV(gW^}&iy9elKpPGKRSP$A83r&t{T}i?+Q{dg!lcvnd`?Q?v1bcEL3i1^NZhdu7w2J+^oj;nF0hn(z5)YzF*WS3+@XL(B`A1abe%+3L ztcyd4+y5!ITx{0#FbX(6gY{RFZtL33LfLFV@BWOd>BuL}!}oBH#PK$a|9to%K*W#o z=Qk$AV~YLZ&mAr<3bLR)MWEoKa7YlM9M7ICQa~4N0_EaBn!TwYB|H)F2_PuSA7wea zIGPM>G0equ$Pke|hvK90KbMYBm4Xy<@?|n6pNa512|_^u1G3||$g!SGkb}X!Pm0lI z#L2J#;^dsEr^B>Hz>;|R61UQR}SL?`?Pd5!W`3$q0p7Z`v-^5kh?ey;0Rok{;+y;$D3 zT)1Z9@7N@hUx~|pF4V&UU@E?i;T3u+sq3L;i-#HC-I=$u6j{|R$g**`QGYw6QVq;e zyjF6X*j|XkCgCdJv4o)sakIMwy*|o~Wg?AA2sc4VcGf@y&3w~8$X1+IPY2P*gd?oD zh%Mf!Gkr|kd?xVptw|o?M+w0!w9MN0QdWO)8=X7`RL#l66J2!0vf|@fl4(J<*V6e$ z6v3k2@Q(t7j`SJIPqlq+8;m5=lWXLEq8qT;v$c~rzk7%jN~Z%+m{5q9>#q!OKTEsO$WinT^bS?blC3G z(=RQ4jeE;FjjRjejo5OKcf5IR&%#Pj8(WedyuqNdbjTLsD0{@o>wta0>l9*np3-Gula0J`FxW4!SCRR0n#gd}S5{!S z1V+)D+$FN@M3{|536qm>ZY|UmEVb&B3Wp9x)8hdo<8X@;pW#}Ml;VT-GJejqZ8D22 z+2LfMg1L*yaf@3&0}S!0(Uynf5N#2xhS3BXm--QCtBQF_(y5hL=oP41FDt;W_|WC( z10gpq4C-N%%I)dndsBgH{hvxYy%87;O=dBBwFYTY{`ASrNhw-jaO0bPGXH| b4bApKQa1(SL2(Gw-^dgl^Jq$EF<$>KU6Ay3 literal 0 HcmV?d00001 diff --git a/lustre/tests/fsx.c b/lustre/tests/fsx.c index dc97fd9..af93599 100644 --- a/lustre/tests/fsx.c +++ b/lustre/tests/fsx.c @@ -69,7 +69,7 @@ struct log_entry { int args[3]; }; -#define LOGSIZE 1000 +#define LOGSIZE 100000 struct log_entry oplog[LOGSIZE]; /* the log */ int logptr = 0; /* current position in log */ @@ -299,7 +299,7 @@ save_buffer(char *buffer, off_t bufferlength, int fd) prterr("save_buffer: lseek eof"); else if (bufferlength > size_by_seek) { warn("save_buffer: .fsxgood file too short... will" - "save 0x%llx bytes instead of 0x%llx\n", + "save 0x%llx bytes instead of 0x%llx\n", (unsigned long long)size_by_seek, (unsigned long long)bufferlength); bufferlength = size_by_seek; @@ -402,7 +402,7 @@ enum fd_iteration_policy { int fd_policy = FD_RANDOM; int fd_last = 0; -struct test_file * +struct test_file * get_tf(void) { unsigned index = 0; @@ -471,7 +471,7 @@ open_test_files(char **argv, int argc) for (i = 0, tf = test_files; i < num_test_files; i++, tf++) { tf->path = argv[i]; - tf->fd = open(tf->path, O_RDWR|(lite ? 0 : O_CREAT|O_TRUNC), + tf->fd = open(tf->path, O_RDWR|(lite ? 0 : O_CREAT|O_TRUNC), 0666); if (tf->fd < 0) { prterr(tf->path); @@ -575,7 +575,7 @@ alloc_tf_buf(void) } } -char * +char * fill_tf_buf(struct test_file *tf) { if (tf_buf == NULL) @@ -586,7 +586,7 @@ fill_tf_buf(struct test_file *tf) } void -output_line(struct test_file *tf, int op, unsigned offset, +output_line(struct test_file *tf, int op, unsigned offset, unsigned size, struct timeval *tv) { char *tf_num = ""; @@ -613,7 +613,7 @@ output_line(struct test_file *tf, int op, unsigned offset, prt("%06lu %lu.%06lu %.*s%-10s %#08x %s %#08x\t(0x%x bytes)\n", testcalls, tv->tv_sec, tv->tv_usec, max_tf_len, - tf_num, ops[op], + tf_num, ops[op], offset, op == OP_TRUNCATE ? 
" to " : "thru", offset + size - 1, size); } @@ -972,7 +972,7 @@ writefileimage() prterr("writefileimage: write"); else prt("short write: 0x%lx bytes instead of 0x%llx\n", - (unsigned long)iret, + (unsigned long)iret, (unsigned long long)file_size); report_failure(172); } @@ -1185,7 +1185,7 @@ main(int argc, char **argv) setvbuf(stdout, (char *)0, _IOLBF, 0); /* line buffered stdout */ - while ((ch = getopt(argc, argv, + while ((ch = getopt(argc, argv, "b:c:dl:m:no:p:qr:s:t:w:D:I:LN:OP:RS:W")) != EOF) switch (ch) { diff --git a/lustre/tests/insanity.sh b/lustre/tests/insanity.sh index 2704a3e..ebf5cb8 100755 --- a/lustre/tests/insanity.sh +++ b/lustre/tests/insanity.sh @@ -73,13 +73,6 @@ shutdown_client() { fi } -reboot_node() { - NODE=$1 - if [ "$FAILURE_MODE" = HARD ]; then - $POWER_UP $NODE - fi -} - fail_clients() { num=$1 @@ -105,7 +98,7 @@ fail_clients() { echo "down clients: $DOWN_CLIENTS" for client in $DOWN_CLIENTS; do - reboot_node $client + boot_node $client done DOWN_NUM=`echo $DOWN_CLIENTS | wc -w` client_rmdirs @@ -162,7 +155,7 @@ clients_recover_osts() { # do_node $CLIENTS "$LCTL "'--device %OSC_`hostname`_'"${facet}_svc_MNT_client_facet recover" } -cleanup_and_setup_lustre +check_and_setup_lustre # 9 Different Failure Modes Combinations echo "Starting Test 17 at `date`" @@ -585,4 +578,4 @@ run_test 10 "Running Availability for 6 hours..." equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre -[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true +[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/tests/it_test.c b/lustre/tests/it_test.c index 0b394a0..714828a 100644 --- a/lustre/tests/it_test.c +++ b/lustre/tests/it_test.c @@ -90,7 +90,7 @@ static enum interval_iter cb(struct interval_node *n, void *args) error("duplicate node accessing found\n"); return INTERVAL_ITER_STOP; } - + if (node->valid == 0) { error("A deleted node "__S" being accessed\n", __F(&n->in_extent)); @@ -128,23 +128,23 @@ static int it_test_search(struct interval_node *root) interval_search(root, &ext, cb, NULL); dprintf("\nverifing ..."); - + /* verify */ for (i = 0; i < it_count; i++) { n = &it_array[i]; if (n->valid == 0) continue; - if (extent_overlapped(&ext, &n->node.in_extent) && + if (extent_overlapped(&ext, &n->node.in_extent) && n->hit == 0) error("node "__S" overlaps" __S"," - "but never to be hit.\n", + "but never to be hit.\n", __F(&n->node.in_extent), __F(&ext)); - if (!extent_overlapped(&ext, &n->node.in_extent) && + if (!extent_overlapped(&ext, &n->node.in_extent) && n->hit) - error("node "__S" overlaps" __S", but hit.\n", + error("node "__S" overlaps" __S", but hit.\n", __F(&n->node.in_extent), __F(&ext)); } @@ -285,7 +285,7 @@ err: } if (nr) error("wrong tree, unbalanced!\n"); - + return 0; } @@ -341,7 +341,7 @@ static int it_test_search_hole(struct interval_node *root) return 0; } -static int contended_count = 0; +static int contended_count = 0; #define LOOP_COUNT 1000 static enum interval_iter perf_cb(struct interval_node *n, void *args) { @@ -356,7 +356,7 @@ static inline long tv_delta(struct timeval *s, struct timeval *e) long c = e->tv_sec - s->tv_sec; c *= 1000; c += (long int)(e->tv_usec - s->tv_usec) / 1000; - dprintf("\tStart: %lu:%lu -> End: %lu:%lu\n", + dprintf("\tStart: %lu:%lu -> End: %lu:%lu\n", s->tv_sec, s->tv_usec, e->tv_sec, e->tv_usec); return c; } @@ -368,7 +368,7 @@ static int it_test_performance(struct interval_node *root, unsigned long len) struct it_node *n; struct timeval start, 
end; unsigned long count; - + ext.start = (random() % (max_count - len)) & ALIGN_MASK; ext.end = (ext.start + len) & ALIGN_MASK; if (have_wide_lock) { @@ -422,7 +422,7 @@ static struct interval_node *it_test_helper(struct interval_node *root) if (n->valid) { if (!interval_find(root, &n->node.in_extent)) error("Cannot find an existent node\n"); - dprintf("Erasing a node "__S"\n", + dprintf("Erasing a node "__S"\n", __F(&n->node.in_extent)); interval_erase(&n->node, &root); n->valid = 0; @@ -436,7 +436,7 @@ static struct interval_node *it_test_helper(struct interval_node *root) interval_set(&n->node, low, high); while (interval_insert(&n->node, &root)) interval_set(&n->node, low, ++high); - dprintf("Adding a node "__S"\n", + dprintf("Adding a node "__S"\n", __F(&n->node.in_extent)); n->valid = 1; list_add(&n->list, &header); diff --git a/lustre/tests/kbuild b/lustre/tests/kbuild new file mode 100755 index 0000000..4630d82 --- /dev/null +++ b/lustre/tests/kbuild @@ -0,0 +1,312 @@ +#! /bin/sh + +# +# lustre/lustre/tests/kbuild +# +# Copyright (C) 2005 Cluster File Systems, Inc. +# +# Author: Nikita Danilov +# +# This file is part of Lustre, http://www.lustre.org. +# +# Lustre is free software; you can redistribute it and/or modify it +# under the terms of version 2 of the GNU General Public License as +# published by the Free Software Foundation. +# +# Lustre is distributed in the hope that it will be useful, but +# WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU +# General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with Lustre; if not, write to the Free Software Foundation, +# Inc., 675 Mass Ave, Cambridge, MA 02139, USA. +# +# +# kbuild is a swiss-army linux kernel build script. Its purpose is to run +# automated kernel builds on given target file system (presumably Lustre) to +# measure file system performance and, occasionally, correctness. +# +# Usual kernel build doesn't not stress file system, because the bottleneck +# is CPU consumption by the user level (compiler). To work around this, +# kbuild uses ccache(1) that eliminates most of CPU load by the compiler, +# once the cache is primed. +# +# Options: + +function usage() +{ + cat <] \\ + [-t ] \\ + [-m ] \\ + [-i ] \\ + [-v ] \\ + [-c ] \\ + [-S] \\ + [-C ] + + -s source of kernel to build. This can be: + + . path to directory; + + . tar.gz, .tgz, or .tar.bz2 archive; + + . ftp or http URL to the source archive; + + defaults to "$src". + + -t target directory, where build process takes place. + Defaults to "$tgt". + + -m additional options supplied to each make invocation. + Defaults to "$mopt" + + -c kernel makefile target to invoke to configure kernel + (defconfig, allyesconfig, allmodconfig, etc.). This + option conflicts with -C . Defaults to + "$mconfig". + + -C use given .config file as kernel configuration. Not + used by default. + + -S skip kernel copying: kernel source is already unpacked + in $target. Defaults to false. + + -v increase verbosity level. + +Examples: + + $pname -s /usr/src/linux-2.6.10-base.tar.gz -t /mnt/lustre2 \\ + -m -j4 -C /usr/src/.config.fc3 + + $pname -s ftp://ftp.clusterfs.com/pub/kernels/fc3-2.6/linux-2.6.10-base.tgz \\ + -m -j4 -c defconfig -vvv + +EOF + exit 1 +} + +# +# Results: +# +# The output of kbuild are times as reported by time. First line is for build +# that fills the ccache cache (that is also located on the target file +# system). 
Consecutive times are repeated builds that reuse ccache +# cache. Number of iteration is set through -i option. Example output: +# +# R 783.757 S 319.615 U 281.720 +# R 540.823 S 277.387 U 54.168 +# R 557.762 S 263.566 U 53.222 +# R 543.877 S 278.569 U 54.412 +# R 544.455 S 279.096 U 53.697 +# R 545.445 S 280.546 U 53.943 +# +# Notes: +# +# Kernel builds can be quite slow as example output above shows. Create your +# own .config file to build smaller kernel. +# +# + +OPTVAL=`getopt -o s:m:i:t:vc:SC:h -n 'kbuild' -- "$@"` || usage + +# Note the quotes around `$OPTVAL': they are essential! +eval set -- "$OPTVAL" + +LOG_CRIT=0 +LOG_ERROR=1 +LOG_WARN=2 +LOG_INFO=3 +LOG_PROGRESS=4 +LOG_TRACE=5 +LOG_ALL=6 +LOG_DEBUG=7 + +src=/usr/src/linux +tgt=/mnt/lustre +verbose=$LOG_CRIT + +pname=$(basename $0) + +mopt="" +mconfig=allyesconfig +it=3 +lfile=/tmp/$pname-tmp-log.$$ +skip_copy=0 +conf_file="" + +while : ;do + case "$1" in + -s) + src="$2" + shift 2 + ;; + -t) + tgt="$2" + shift 2 + ;; + -m) + mopt="$2" + shift 2 + ;; + -C) + conf_file="$2" + shift 2 + ;; + -i) + it="$2" + shift 2 + ;; + -c) + mconfig="$2" + shift 2 + ;; + -S) + skip_copy=1 + shift + ;; + -v) + verbose=$(($verbose + 1)) + shift + ;; + -h) + usage + ;; + --) + shift + break + ;; + *) + echo "Internal error!" + usage + ;; + esac +done + +[ $verbose -ge $LOG_ALL ] && set -x + + +function warning() +{ + echo WARNING $pname: $* +} + +function fail() +{ + local rc + + rc=$1 + shift + warning $* ... failing. + exit $rc +} + +function log() +{ + local level + + level=$1 + shift + if [ $verbose -ge $level ] ;then + echo $* + fi +} + +function doquiet() +{ + local cmd + + cmd="$*" + echo >> $lfile + echo ---- start: $(date +"%Y-%m-%d %H:%M:%S") ---- >> $lfile + for i in $cmd ;do + echo "ARG: $i" >> $lfile + done + log $LOG_PROGRESS "Running '$cmd'..." + $cmd >>$lfile 2>&1 || \ + fail 1 "Errors while running '$cmd'. See $lfile for transcript" + log $LOG_PROGRESS "Finished '$cmd'." + echo ---- done: $(date +"%Y-%m-%d %H:%M:%S") ---- >> $lfile +} + +function dotime() +{ + local cmd + + cmd="$*" + export TIMEFORMAT="R %3R S %3S U %3U" + time $cmd +} + +ccache_dir=$tgt/ccache_dir +cc_script=$tgt/cc_script + +which ccache >/dev/null || fail 2 "No ccache found" +mkdir -p $ccache_dir || fail 3 "Cannot create $ccache_dir" + +export CCACHE_DIR=$ccache_dir + +# start the stuff + +cd $tgt || fail 4 "Cannot cd into $tgt" + +echo '#! /bin/sh' > $cc_script || fail 5 "Cannot write into $cc_script" +echo 'ccache cc $*' >> $cc_script || fail 6 "Cannot append to $cc_script" +chmod u+rx $cc_script || fail 7 "Cannot chmod u+rx $cc_script" + +cc_opt="CC=$cc_script" + +[ $verbose -ge $LOG_TRACE ] && vopt=-v + +if [ $skip_copy -eq 0 ] ;then + case "$src" in + ftp://*|http://*) + wget -c $src + src=$(basename $src) + ;; + esac + + case "$src" in + */) + log $LOG_PROGRESS "Copying directory $src into $tgt" + cp -a$vopt "$src" . + ;; + *.tar.gz|*.tgz) + tar xzf "$src" $vopt + ;; + *.tar.bz2) + tar xjf "$src" $vopt + ;; + *) + fail 10 "No $src" + ;; + esac +fi + +cd linux-* || fail 20 "Cannot change to linux-* from $PWD" + +function dokernel() +{ + doquiet make $mopt mrproper + if [ x$conf_file = x ] ;then + doquiet make $mopt $mconfig + else + cp $conf_file .config || fail 8 "Cannot copy $conf_file" + ls -l .config + doquiet make $mopt oldconfig + fi + + dotime doquiet make $mopt $cc_opt bzImage modules +} + +log $LOG_PROGRESS Fill the cache... + +dokernel + +for i in $(seq 1 $it) ;do + log $LOG_PROGRESS Iteration $i... 
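+ # iterations after the priming build reuse the ccache cache, so user CPU
+ # time should drop sharply (compare the U columns in the sample output)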
+ dokernel +done diff --git a/lustre/tests/lockorder.sh b/lustre/tests/lockorder.sh index 4f1ca4b..4d4e7e1 100644 --- a/lustre/tests/lockorder.sh +++ b/lustre/tests/lockorder.sh @@ -42,7 +42,7 @@ while [ $MINRES -gt $MAXRES ]; do MAXDIR=$DIRTMP MAXRES=$DIRRES fi - if [ $FILERES -lt $MINRES ]; then + if [ $FILERES -lt $MINRES -o -z "$MINFILE" ]; then [ -f "$MINFILE" ] && rm $MINFILE MINFILE=$FILETMP MINRES=$FILERES diff --git a/lustre/tests/multifstat.c b/lustre/tests/multifstat.c index c305acc..91c0d73 100644 --- a/lustre/tests/multifstat.c +++ b/lustre/tests/multifstat.c @@ -84,7 +84,7 @@ int main(int argc, char **argv) if ( st1.st_size != st2.st_size ) { printf("Sizes don't match %lu, %lu\n", - (unsigned long)st1.st_size, + (unsigned long)st1.st_size, (unsigned long)st2.st_size); return 1; } diff --git a/lustre/tests/performance-sanity.sh b/lustre/tests/performance-sanity.sh index 0139f0c..ce3d2f9 100644 --- a/lustre/tests/performance-sanity.sh +++ b/lustre/tests/performance-sanity.sh @@ -79,3 +79,4 @@ run_test 8 "getattr large files ======" equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre [ -f "$LOG" ] && cat $LOG || true +[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/tests/racer/racer.sh b/lustre/tests/racer/racer.sh index c1f8b99..645e349 100755 --- a/lustre/tests/racer/racer.sh +++ b/lustre/tests/racer/racer.sh @@ -3,16 +3,12 @@ MAX_FILES=${MAX_FILES:-20} DIR=${DIR:-$1} DIR=${DIR:-"/mnt/lustre/racer"} -if ! [ -d "$DIR" -o -d "`basename $DIR`" ]; then - echo "$0: '$DIR' and '`basename $DIR`' are not directories" - exit 1 -fi DURATION=${DURATION:-$((60*5))} NUM_THREADS=${NUM_THREADS:-$2} NUM_THREADS=${NUM_THREADS:-3} -[ -e $DIR ] || mkdir $DIR +mkdir -p $DIR racer_cleanup() { @@ -32,7 +28,7 @@ trap " echo \"Cleaning up\" racer_cleanup exit 0 -" 2 +" 2 15 cd `dirname $0` for N in `seq 1 $NUM_THREADS`; do diff --git a/lustre/tests/recovery-small.sh b/lustre/tests/recovery-small.sh index c02fd18..bae9793d 100755 --- a/lustre/tests/recovery-small.sh +++ b/lustre/tests/recovery-small.sh @@ -31,7 +31,7 @@ build_test_filter SETUP=${SETUP:-""} CLEANUP=${CLEANUP:-""} -cleanup_and_setup_lustre +check_and_setup_lustre assert_DIR rm -rf $DIR/[df][0-9]* @@ -255,6 +255,11 @@ test_18a() { # 1 stripe on ost2 lfs setstripe $f -s $((128 * 1024)) -i 1 -c 1 + get_stripe_info client $f + if [ $stripe_index -ne 1 ]; then + lfs getstripe $f + error "$f: different stripe offset ($stripe_index)" && return + fi do_facet client cp $SAMPLE_FILE $f sync @@ -275,14 +280,17 @@ test_18b() { do_facet client mkdir -p $DIR/$tdir f=$DIR/$tdir/$tfile - f2=$DIR/$tdir/${tfile}-2 cancel_lru_locks osc pgcache_empty || return 1 # shouldn't have to set stripe size of count==1 lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1 - lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1 + get_stripe_info client $f + if [ $stripe_index -ne 0 ]; then + lfs getstripe $f + error "$f: different stripe offset ($stripe_index)" && return + fi do_facet client cp $SAMPLE_FILE $f sync @@ -293,7 +301,7 @@ test_18b() { # cache after the client reconnects? 
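# pgcache_empty below verifies no pages were left behind by the evicted client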
rc=0 pgcache_empty || rc=2 - rm -f $f $f2 + rm -f $f return $rc } run_test 18b "eviction and reconnect clears page cache (2766)" @@ -303,14 +311,17 @@ test_18c() { do_facet client mkdir -p $DIR/$tdir f=$DIR/$tdir/$tfile - f2=$DIR/$tdir/${tfile}-2 cancel_lru_locks osc pgcache_empty || return 1 # shouldn't have to set stripe size of count==1 lfs setstripe $f -s $((128 * 1024)) -i 0 -c 1 - lfs setstripe $f2 -s $((128 * 1024)) -i 0 -c 1 + get_stripe_info client $f + if [ $stripe_index -ne 0 ]; then + lfs getstripe $f + error "$f: different stripe offset ($stripe_index)" && return + fi do_facet client cp $SAMPLE_FILE $f sync @@ -326,7 +337,7 @@ test_18c() { # cache after the client reconnects? rc=0 pgcache_empty || rc=2 - rm -f $f $f2 + rm -f $f return $rc } run_test 18c "Dropped connect reply after eviction handing (14755)" @@ -841,7 +852,7 @@ test_55() { mkdir -p $DIR/$tdir # first dd should be finished quickly - lfs setstripe DIR/$tdir/$tfile-1 -c 1 -i 0 + lfs setstripe $DIR/$tdir/$tfile-1 -c 1 -i 0 dd if=/dev/zero of=$DIR/$tdir/$tfile-1 bs=32M count=4 & DDPID=$! count=0 @@ -856,7 +867,7 @@ test_55() { done echo "(dd_pid=$DDPID, time=$count)successful" - lfs setstripe DIR/$tdir/$tfile-2 -c 1 -i 0 + lfs setstripe $DIR/$tdir/$tfile-2 -c 1 -i 0 #define OBD_FAIL_OST_DROP_REQ 0x21d do_facet ost1 lctl set_param fail_loc=0x0000021d # second dd will be never finished @@ -962,4 +973,4 @@ run_test 59 "Read cancel race on client eviction" equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre -[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true +[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/tests/replay-dual.sh b/lustre/tests/replay-dual.sh index dc31f90..3eaea5f 100755 --- a/lustre/tests/replay-dual.sh +++ b/lustre/tests/replay-dual.sh @@ -30,7 +30,13 @@ remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 build_test_filter -cleanup_and_setup_lustre +check_and_setup_lustre +MOUNTED=$(mounted_lustre_filesystems) +if ! 
$(echo $MOUNTED | grep -w -q $MOUNT2); then + zconf_mount $HOSTNAME $MOUNT2 + MOUNTED2=yes +fi + assert_DIR rm -rf $DIR/[df][0-9]* @@ -403,9 +409,142 @@ test_20() { #16389 } run_test 20 "recovery time is not increasing" +# commit on sharing tests +test_21a() { + local param_file=$TMP/$tfile-params + + save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file + do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=1 + touch $MOUNT1/$tfile-1 + mv $MOUNT2/$tfile-1 $MOUNT2/$tfile-2 + mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3 + replay_barrier_nosync $SINGLEMDS + umount $MOUNT2 + + facet_failover $SINGLEMDS + + # all renames are replayed + unlink $MOUNT1/$tfile-3 || return 2 + + zconf_mount `hostname` $MOUNT2 || error "mount $MOUNT2 fail" + + do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=0 + rm -rf $MOUNT1/$tfile-* + restore_lustre_params < $param_file + rm -f $param_file + return 0 +} +run_test 21a "commit on sharing" + +shutdown_client() { + local client=$1 + local mnt=$2 + + if [ "$FAILURE_MODE" = HARD ]; then + $POWER_DOWN $client + while ping -w 3 -c 1 $client > /dev/null 2>&1; do + echo "waiting for node $client to fail" + sleep 1 + done + else + zconf_umount_clients $client $mnt -f + fi +} + +# CMD: determine mds index where directory inode presents +get_mds_dir () { + local dir=$1 + local file=$dir/$tfile + + rm -f $file + local iused=$(lfs df -i $dir | grep MDT | awk '{print $3}') + local oldused=($iused) + + touch $file + sleep 1 + iused=$(lfs df -i $dir | grep MDT | awk '{print $3}') + local newused=($iused) + + local num=0 + for ((i=0; i<${#newused[@]}; i++)); do + if [ ${oldused[$i]} -lt ${newused[$i]} ]; then + echo $(( i + 1 )) + rm -f $dir/$tfile + return 0 + fi + done + error "mdt-s : inodes count OLD ${oldused[@]} NEW ${newused[@]}" +} + +test_21b_sub () { + local mds=$1 + do_node $CLIENT1 rm -f $MOUNT1/$tfile-* + + do_facet $mds sync + do_node $CLIENT1 touch $MOUNT1/$tfile-1 + do_node $CLIENT2 mv $MOUNT1/$tfile-1 $MOUNT1/$tfile-2 + do_node $CLIENT1 mv $MOUNT1/$tfile-2 $MOUNT1/$tfile-3 + + replay_barrier_nosync $mds + shutdown_client $CLIENT2 $MOUNT1 + + facet_failover $mds + + # were renames replayed? + local rc=0 + echo UNLINK $MOUNT1/$tfile-3 + do_node $CLIENT1 unlink $MOUNT1/$tfile-3 || \ + { echo "unlink $tfile-3 fail!" && rc=1; } + + boot_node $CLIENT2 + zconf_mount_clients $CLIENT2 $MOUNT1 || error "mount $CLIENT2 $MOUNT1 fail" + + return $rc +} + +test_21b() { + [ -z "$CLIENTS" ] && skip "Need two or more clients." && return + [ $CLIENTCOUNT -lt 2 ] && \ + { skip "Need two or more clients, have $CLIENTCOUNT" && return; } + + if [ "$FAILURE_MODE" = "HARD" ] && mixed_mdt_devs; then + skip "Several mdt services on one mds node are used with FAILURE_MODE=$FAILURE_MODE. " + return 0 + fi + + + zconf_umount_clients $CLIENTS $MOUNT2 + zconf_mount_clients $CLIENTS $MOUNT1 + + local param_file=$TMP/$tfile-params + + local num=$(get_mds_dir $MOUNT1) + + save_lustre_params $(facet_active_host mds$num) "mdt.*.commit_on_sharing" > $param_file + + # COS enabled + local COS=1 + do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS + + test_21b_sub mds$num || error "Not all renames are replayed. COS=$COS" + + # COS disabled (should fail) + COS=0 + do_facet mds$num lctl set_param mdt.*.commit_on_sharing=$COS + + test_21b_sub mds$num && error "Not all renames are replayed. 
COS=$COS" + + restore_lustre_params < $param_file + rm -f $param_file + return 0 +} +run_test 21b "commit on sharing, two clients" + +# end commit on sharing tests + equals_msg `basename $0`: test complete, cleaning up SLEEP=$((`date +%s` - $NOW)) [ $SLEEP -lt $TIMEOUT ] && sleep $SLEEP +[ "$MOUNTED2" = yes ] && zconf_umount $HOSTNAME $MOUNT2 || true check_and_cleanup_lustre -[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true - +[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/tests/replay-ost-single.sh b/lustre/tests/replay-ost-single.sh index c2e493b..5c8d9d3 100755 --- a/lustre/tests/replay-ost-single.sh +++ b/lustre/tests/replay-ost-single.sh @@ -10,11 +10,18 @@ CLEANUP=${CLEANUP:-""} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} +# While we do not use OSTCOUNT=1 setup anymore, +# ost1failover_HOST is used +#ostfailover_HOST=${ostfailover_HOST:-$ost_HOST} #failover= must be defined in OST_MKFS_OPTIONS if ostfailover_HOST != ost_HOST remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 +if [ "$FAILURE_MODE" = "HARD" ] && mixed_ost_devs; then + skip "$0: Several ost services on one ost node are used with FAILURE_MODE=$FAILURE_MODE. " + exit 0 +fi + # Tests that fail on uml CPU=`awk '/model/ {print $4}' /proc/cpuinfo` [ "$CPU" = "UML" ] && EXCEPT="$EXCEPT 6" @@ -26,87 +33,89 @@ ALWAYS_EXCEPT="$REPLAY_OST_SINGLE_EXCEPT" # [ "$SLOW" = "no" ] && EXCEPT_SLOW="5" -# It is replay-ost-single, after all -OSTCOUNT=1 - build_test_filter -REFORMAT=--reformat cleanup_and_setup_lustre +check_and_setup_lustre assert_DIR rm -rf $DIR/[df][0-9]* +TDIR=$DIR/d0.${TESTSUITE} +mkdir -p $TDIR +$LFS setstripe $TDIR -i 0 -c 1 +$LFS getstripe $TDIR + test_0a() { zconf_umount `hostname` $MOUNT -f # needs to run during initial client->OST connection #define OBD_FAIL_OST_ALL_REPLY_NET 0x211 - do_facet ost "lctl set_param fail_loc=0x80000211" + do_facet ost1 "lctl set_param fail_loc=0x80000211" zconf_mount `hostname` $MOUNT && df $MOUNT || error "0a mount fail" } run_test 0a "target handle mismatch (bug 5317) `date +%H:%M:%S`" test_0b() { fail ost1 - cp /etc/profile $DIR/$tfile + cp /etc/profile $TDIR/$tfile sync - diff /etc/profile $DIR/$tfile - rm -f $DIR/$tfile + diff /etc/profile $TDIR/$tfile + rm -f $TDIR/$tfile } run_test 0b "empty replay" test_1() { - date > $DIR/$tfile || error "error creating $DIR/$tfile" + date > $TDIR/$tfile || error "error creating $TDIR/$tfile" fail ost1 - $CHECKSTAT -t file $DIR/$tfile || return 1 - rm -f $DIR/$tfile + $CHECKSTAT -t file $TDIR/$tfile || return 1 + rm -f $TDIR/$tfile } run_test 1 "touch" test_2() { for i in `seq 10`; do - echo "tag-$i" > $DIR/$tfile-$i || error "create $DIR/$tfile-$i" + echo "tag-$i" > $TDIR/$tfile-$i || error "create $TDIR/$tfile-$i" done fail ost1 for i in `seq 10`; do - grep -q "tag-$i" $DIR/$tfile-$i || error "grep $DIR/$tfile-$i" + grep -q "tag-$i" $TDIR/$tfile-$i || error "grep $TDIR/$tfile-$i" done - rm -f $DIR/$tfile-* + rm -f $TDIR/$tfile-* } run_test 2 "|x| 10 open(O_CREAT)s" test_3() { verify=$ROOT/tmp/verify-$$ - dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $DIR/$tfile & + dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $TDIR/$tfile & ddpid=$! 
sync & fail ost1 wait $ddpid || return 1 - cmp $verify $DIR/$tfile || return 2 - rm -f $verify $DIR/$tfile + cmp $verify $TDIR/$tfile || return 2 + rm -f $verify $TDIR/$tfile } run_test 3 "Fail OST during write, with verification" test_4() { verify=$ROOT/tmp/verify-$$ - dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $DIR/$tfile + dd if=/dev/urandom bs=4096 count=1280 | tee $verify > $TDIR/$tfile # invalidate cache, so that we're reading over the wire cancel_lru_locks osc - cmp $verify $DIR/$tfile & + cmp $verify $TDIR/$tfile & cmppid=$! fail ost1 wait $cmppid || return 1 - rm -f $verify $DIR/$tfile + rm -f $verify $TDIR/$tfile } run_test 4 "Fail OST during read, with verification" test_5() { [ -z "`which iozone 2> /dev/null`" ] && skip "iozone missing" && return 0 - FREE=`df -P $DIR | tail -n 1 | awk '{ print $4/2 }'` + FREE=`df -P $TDIR | tail -n 1 | awk '{ print $4/2 }'` GB=1048576 # 1048576KB == 1GB if (( FREE > GB )); then FREE=$GB fi IOZONE_OPTS="-i 0 -i 1 -i 2 -+d -r 4 -s $FREE" - iozone $IOZONE_OPTS -f $DIR/$tfile & + iozone $IOZONE_OPTS -f $TDIR/$tfile & PID=$! sleep 8 @@ -114,7 +123,7 @@ test_5() { wait $PID RC=$? log "iozone rc=$RC" - rm -f $DIR/$tfile + rm -f $TDIR/$tfile [ $RC -ne 0 ] && return $RC || true } run_test 5 "Fail OST during iozone" @@ -126,7 +135,7 @@ kbytesfree() { test_6() { remote_mds_nodsh && skip "remote MDS with nodsh" && return 0 - f=$DIR/$tfile + f=$TDIR/$tfile rm -f $f sync && sleep 2 && sync # wait for delete thread before=`kbytesfree` @@ -153,7 +162,7 @@ test_6() { run_test 6 "Fail OST before obd_destroy" test_7() { - f=$DIR/$tfile + f=$TDIR/$tfile rm -f $f sync && sleep 5 && sync # wait for delete thread before=`kbytesfree` @@ -179,4 +188,4 @@ run_test 7 "Fail OST before obd_destroy" equals_msg `basename $0`: test complete, cleaning up check_and_cleanup_lustre -[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true +[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true diff --git a/lustre/tests/replay-single.sh b/lustre/tests/replay-single.sh index c3a2fd0..2a2f26f 100755 --- a/lustre/tests/replay-single.sh +++ b/lustre/tests/replay-single.sh @@ -34,7 +34,7 @@ fi build_test_filter -cleanup_and_setup_lustre +check_and_setup_lustre mkdir -p $DIR @@ -1429,16 +1429,28 @@ run_test 59 "test log_commit_thread vs filter_destroy race" # bug 17323 test_59b() { + do_facet $SINGLEMDS "lctl set_param debug=+rpctrace" mkdir -p $DIR/$tdir createmany -o $DIR/$tdir/$tfile-%d 2000 sync #define OBD_FAIL_OBD_LOG_CANCEL_REP 0x606 do_facet $SINGLEMDS "lctl set_param fail_loc=0x606" unlinkmany $DIR/$tdir/$tfile-%d 2000 - sleep 60 + + # make sure that all llcds left ost and nothing left cached + sync + sleep 10 do_facet $SINGLEMDS "lctl set_param fail_loc=0x0" - do_facet $SINGLEMDS $LCTL dk | grep -q "RESENT cancel req" || return 1 + + # sleep 2 obd_timeouts from ost to make sure that we get resents. + local timeout=$(do_facet ost1 lctl get_param -n timeout) + timeout=$((timeout * 2)) + log "Sleep $timeout" + sleep $timeout + do_facet $SINGLEMDS $LCTL dk | grep -q "RESENT cancel req" + local res=$? 
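+	# res is 0 only if the debug log recorded a resent llog cancel after
+	# the 0x606-injected reply failure; cleanup below runs either way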
 	rmdir $DIR/$tdir
+	return $res
 }
 run_test 59b "resent handle in llog_origin_handle_cancel"
 
@@ -1896,4 +1908,4 @@ run_test 82b "CMD: mkdir cross-node dir (fail mds with name)"
 equals_msg `basename $0`: test complete, cleaning up
 check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/runracer b/lustre/tests/runracer
new file mode 100644
index 0000000..fcc26ed
--- /dev/null
+++ b/lustre/tests/runracer
@@ -0,0 +1,113 @@
+#!/bin/bash
+#set -vx
+set -e
+
+LUSTRE=${LUSTRE:-$(cd $(dirname $0)/..; echo $PWD)}
+. $LUSTRE/tests/test-framework.sh
+init_test_env $@
+. ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh}
+
+racer=`which racer.sh`
+[ -z "$racer" ] && echo racer is not installed && exit 1
+
+CLIENTS=${CLIENTS:-$HOSTNAME}
+RDIR=$DIR/racer
+mkdir -p $RDIR
+DURATION=${DURATION:-120}
+
+assert_env CLIENTS
+
+timer_on () {
+    sleep $1 && kill -s ALRM $$ &
+    TIMERPID=$!
+    echo TIMERPID=$TIMERPID
+}
+
+do_racer_cleanup () {
+    trap 0
+
+    local WAIT=0
+    local INTERVAL=5
+    local pids
+    local rc=0
+
+    echo "DOING RACER CLEANUP ... "
+
+    # Check if all processes are killed
+
+    local clients=$CLIENTS
+
+    # 1. Give racer a chance to kill all of its processes.
+    # FIXME: not sure how long it takes racer to kill all of its processes;
+    # 80 sec is sometimes enough for 2 clients, sometimes it takes more than 150 sec
+    while [ $WAIT -lt 90 ]; do
+        running=$(do_nodes $clients "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|pdsh|bash)" || true)
+        [ -z "$running" ] && rc=0 && break
+        echo "clients $clients are still running the racer processes. Waited $WAIT secs"
+        echo $running
+        rc=1
+        [ $INTERVAL -lt 40 ] && INTERVAL=$((INTERVAL + INTERVAL))
+        sleep $INTERVAL
+        WAIT=$((WAIT + INTERVAL))
+    done
+
+    # 2. Kill the remaining processes
+    if [ $rc -ne 0 ]; then
+        for C in ${clients//,/ } ; do
+            pids=$(do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)" | awk '{print $2}' || true)
+            if [ ! -z "$pids" ]; then
+                echo "client $C still running racer processes after $WAIT seconds. Killing $pids"
+                do_node $C "ps uax | grep $RDIR " | egrep -v "(acceptance|grep|PATH)"
+                do_node $C kill -TERM $pids || true
+                # give the processes time to die
+                sleep 2
+                # 3. Check if the processes were killed;
+                # exit with an error if they still exist
+                for pid in $pids; do
+                    do_node $C "ps -P $pid" && RC=1 || true
+                done
+            else
+                echo "All processes on client $C exited after $WAIT seconds. OK."
+            fi
+        done
+    else
+        echo "No racer processes running after $WAIT seconds. OK."
+        wait_remote_prog $racer 10
+    fi
+}
+
+racer_cleanup () {
+    if [ "$timeout" == "timeout" ]; then
+        echo $timeout killing RACERPID=$RACERPID
+        kill $RACERPID || true
+        sleep 2 # give racer a chance to kill its processes
+        do_racer_cleanup
+    else
+        echo "Racer completed before DURATION=$DURATION expired. Cleaning up..."
+        kill $TIMERPID
+        do_racer_cleanup
+    fi
+}
+
+racer_timeout () {
+    timeout="timeout"
+    racer_cleanup
+    echo "$0: completed $RC"
+    exit $RC
+}
+
+# run racer
+log "Start racer on clients: $CLIENTS DURATION=$DURATION"
+RC=0
+
+trap racer_timeout ALRM
+
+timer_on $((DURATION + 5))
+
+do_nodes $CLIENTS "DURATION=$DURATION $racer $RDIR" &
+RACERPID=$!
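+# Both the ALRM timer (DURATION + 5 sec) and the racer workload now run in
+# the background; whichever finishes first decides, via the trap above,
+# which branch racer_cleanup takes.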
+echo RACERPID=$RACERPID +wait $RACERPID || RC=2 +racer_cleanup +echo "$0: completed $RC" +exit $RC diff --git a/lustre/tests/sanity-gss.sh b/lustre/tests/sanity-gss.sh index a6c8166..018c242 100644 --- a/lustre/tests/sanity-gss.sh +++ b/lustre/tests/sanity-gss.sh @@ -59,9 +59,7 @@ cnt_all2ost=0 cnt_all2mdt=0 cnt_all2all=0 DBENCH_PID=0 -PROC_CLI="srpc.info" -# Escape "." to use lctl -PROC_CLI=${PROC_CLI//\./\*} +PROC_CLI="srpc_info" # set manually GSS=true @@ -81,7 +79,7 @@ prepare_krb5_creds # we want double mount MOUNT_2=${MOUNT_2:-"yes"} -cleanup_and_setup_lustre +check_and_setup_lustre rm -rf $DIR/[df][0-9]* @@ -230,6 +228,14 @@ flvr_cnt_mdt2ost() echo $cnt; } +flvr_cnt_mgc2mgs() +{ + local flavor=$1 + + output=`do_facet client lctl get_param -n mgc.*.$PROC_CLI 2>/dev/null` + count_flvr "$output" $flavor +} + do_check_flavor() { local dir=$1 # from to @@ -452,6 +458,7 @@ test_1() { chmod 0777 $DIR || error "chmod $DIR failed" # access w/o cred $RUNAS kdestroy + $RUNAS $LFS flushctx || error "can't flush ctx" $RUNAS touch $file && error "unexpected success" # access w/ cred @@ -624,7 +631,7 @@ test_7() { [ $num_osts -lt 2 ] && echo "skipping $TESTNAME (must have >= 2 OSTs)" && return mkdir $tdir || error - $LFS setstripe $tdir 0 -1 -1 || error + $LFS setstripe -c $num_osts $tdir || error echo "creating..." for ((i=0;i<20;i++)); do @@ -640,14 +647,15 @@ run_test 7 "exercise enlarge_reqbuf()" test_8() { - debugsave - sysctl -w lnet.debug="other" + sleep $TIMEOUT $LCTL dk > /dev/null + debugsave + sysctl -w lnet.debug="+other" # sleep sometime in ctx handle - do_facet mds sysctl -w lustre.fail_val=60 + do_facet mds lctl set_param fail_val=30 #define OBD_FAIL_SEC_CTX_HDL_PAUSE 0x1204 - do_facet mds sysctl -w lustre.fail_loc=0x1204 + do_facet mds lctl set_param fail_loc=0x1204 $RUNAS $LFS flushctx || error "can't flush ctx" @@ -780,6 +788,8 @@ test_90() { $LFS flushctx done check_dbench + #sleep to let ctxs be re-established + sleep 10 stop_dbench } run_test 90 "recoverable from losing contexts under load" @@ -787,7 +797,7 @@ run_test 90 "recoverable from losing contexts under load" test_99() { local nrule_old=0 local nrule_new=0 - local max=32 + local max=64 # # general rules @@ -799,7 +809,6 @@ test_99() { for ((i = $nrule_old; i < $max; i++)); do set_rule $FSNAME elan$i any krb5n || error "set rule $i" done - set_rule $FSNAME elan100 any krb5n && error "set $max rule should fail" for ((i = $nrule_old; i < $max; i++)); do set_rule $FSNAME elan$i any || error "remove rule $i" done @@ -820,7 +829,6 @@ test_99() { for ((i = $nrule_old; i < $max; i++)); do set_rule $FSNAME-MDT0000 elan$i any krb5i || error "set rule $i" done - set_rule $FSNAME-MDT0000 elan100 any krb5i && error "set $max rule should fail" for ((i = $nrule_old; i < $max; i++)); do set_rule $FSNAME-MDT0000 elan$i any || error "remove rule $i" done @@ -831,7 +839,7 @@ test_99() { error "general rule: $nrule_new != $nrule_old" fi } -run_test 99 "maximum sptlrpc rules limitation" +run_test 99 "set large number of sptlrpc rules" error_dbench() { @@ -1056,6 +1064,70 @@ test_102() { } run_test 102 "survive from insanely fast flavor switch" +test_150() { + local save_opts + + # started from default flavors + restore_to_default_flavor + + # at this time no rules has been set on mgs; mgc use null + # flavor connect to mgs. 
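+	# (flvr_cnt_mgc2mgs above greps the client's mgc srpc_info for the
+	# given flavor, so exactly one null-flavor connection is expected)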
+	count=`flvr_cnt_mgc2mgs null`
+	[ $count -eq 1 ] || error "$count mgc connections use the null flavor"
+
+	# umount both clients
+	zconf_umount $HOSTNAME $MOUNT || return 1
+	zconf_umount $HOSTNAME $MOUNT2 || return 2
+
+	# mount client with default flavor - should succeed
+	zconf_mount $HOSTNAME $MOUNT || error "mount with default flavor should have succeeded"
+	zconf_umount $HOSTNAME $MOUNT || return 5
+
+	# mount client with conflict flavor - should fail
+	save_opts=$MOUNTOPT
+	MOUNTOPT="$MOUNTOPT,mgssec=krb5p"
+	zconf_mount $HOSTNAME $MOUNT && error "mount with conflict flavor should have failed"
+	MOUNTOPT=$save_opts
+
+	# mount client with same flavor - should succeed
+	save_opts=$MOUNTOPT
+	MOUNTOPT="$MOUNTOPT,mgssec=null"
+	zconf_mount $HOSTNAME $MOUNT || error "mount with same flavor should have succeeded"
+	zconf_umount $HOSTNAME $MOUNT || return 6
+	MOUNTOPT=$save_opts
+}
+run_test 150 "secure mgs connection: client flavor setting"
+
+test_151() {
+	local save_opts
+
+	# make the mgs accept only krb5p
+	set_rule _mgs any any krb5p
+
+	# umount everything, modules still loaded
+	stopall
+
+	# mount the mgs with the default flavor; in the current framework that
+	# means mgs+mdt1, and the mdt1 mgc connection to the mgs is expected to fail.
+	DEVNAME=$(mdsdevname 1)
+	start mds1 $DEVNAME $MDS_MOUNT_OPTS && error "mount with default flavor should have failed"
+
+	# mount with unauthorized flavor should fail
+	save_opts=$MDS_MOUNT_OPTS
+	MDS_MOUNT_OPTS="$MDS_MOUNT_OPTS,mgssec=null"
+	start mds1 $DEVNAME $MDS_MOUNT_OPTS && error "mount with unauthorized flavor should have failed"
+	MDS_MOUNT_OPTS=$save_opts
+
+	# mount with designated flavor should succeed
+	save_opts=$MDS_MOUNT_OPTS
+	MDS_MOUNT_OPTS="$MDS_MOUNT_OPTS,mgssec=krb5p"
+	start mds1 $DEVNAME $MDS_MOUNT_OPTS || error "mount with designated flavor should have succeeded"
+	MDS_MOUNT_OPTS=$save_opts
+
+	stop mds1 -f
+}
+run_test 151 "secure mgs connection: server flavor control"
+
 equals_msg `basename $0`: test complete, cleaning up
 check_and_cleanup_lustre
-[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG || true
+[ -f "$TESTSUITELOG" ] && cat $TESTSUITELOG && grep -q FAIL $TESTSUITELOG && exit 1 || true
diff --git a/lustre/tests/sanity-nano.sh b/lustre/tests/sanity-nano.sh
new file mode 100755
index 0000000..2005b0b
--- /dev/null
+++ b/lustre/tests/sanity-nano.sh
@@ -0,0 +1,29 @@
+#! /bin/sh
+#
+# Extremely minimal regression test set for clio.
+#
+
+MOUNT=${MOUNT:-"/mnt/lustre"}
+
+function cmpcheck() {
+	find /etc/ -type f | while read; do
+		f=$REPLY
+		echo -n .
+		cmp $f $MOUNT/$f
+	done
+}
+
+cp -vax /etc $MOUNT || exit 1
+cmpcheck
+
+export OSTCOUNT=2
+#export LOV="27c 27d 27e 27f 27g 27j 27k 27l 27m 27s 27t 27w 34f 51d 56 56g 56h"
+#export JOIN="75a 75b 57c 75d 75e 75f 75g"
+#export CHKSUM="77a 77d 77e 77f"
+#export DIO="69 77d 77e 77f 78 119a 119b 119c"
+#export EXCEPT="69 78 118a 129 $JOIN $CHKSUM $DIO"
+#export EXCEPT="77f"
+export SLOW="yes"
+
+sh sanity.sh
+#umount $MOUNT || exit 2
diff --git a/lustre/tests/sanity-quota.sh b/lustre/tests/sanity-quota.sh
index 6298d3d..634ecf1 100644
--- a/lustre/tests/sanity-quota.sh
+++ b/lustre/tests/sanity-quota.sh
@@ -17,7 +17,8 @@ SRCDIR=`dirname $0`
 export PATH=$PWD/$SRCDIR:$SRCDIR:$PWD/$SRCDIR/../utils:$PATH:/sbin
 
 ONLY=${ONLY:-"$*"}
-ALWAYS_EXCEPT="$SANITY_QUOTA_EXCEPT"
+# test 10 is obsolete (32-bit qd_count); enable test_23 after bug 16542 is fixed.
+ALWAYS_EXCEPT="10 23 $SANITY_QUOTA_EXCEPT"
 # UPDATE THE COMMENT ABOVE WITH BUG NUMBERS WHEN CHANGING ALWAYS_EXCEPT!
case `uname -r` in @@ -36,10 +37,8 @@ TSTID2=${TSTID2:-60001} TSTUSR=${TSTUSR:-"quota_usr"} TSTUSR2=${TSTUSR2:-"quota_2usr"} BLK_SZ=1024 -BUNIT_SZ=${BUNIT_SZ:-1000} # default 1000 quota blocks -BTUNE_SZ=${BTUNE_SZ:-500} # default 50% of BUNIT_SZ -IUNIT_SZ=${IUNIT_SZ:-10} # default 10 files -ITUNE_SZ=${ITUNE_SZ:-5} # default 50% of IUNIT_SZ +BUNIT_SZ=${BUNIT_SZ:-1024} # min block quota unit(kB) +IUNIT_SZ=${IUNIT_SZ:-10} # min inode quota unit MAX_DQ_TIME=604800 MAX_IQ_TIME=604800 @@ -48,11 +47,13 @@ LUSTRE=${LUSTRE:-`dirname $0`/..} . $LUSTRE/tests/test-framework.sh init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} +DIRECTIO=${DIRECTIO:-$LUSTRE/tests/directio} +[ $MDSCOUNT -gt 1 ] && skip "CMD case" && exit 0 remote_mds_nodsh && skip "remote MDS with nodsh" && exit 0 remote_ost_nodsh && skip "remote OST with nodsh" && exit 0 -[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="9 10 11 18b 21" QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log} @@ -61,13 +62,13 @@ QUOTALOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log} DIR=${DIR:-$MOUNT} DIR2=${DIR2:-$MOUNT2} -cleanup_and_setup_lustre +check_and_setup_lustre -LOVNAME=`cat $LPROC/llite/*/lov/common_name | tail -n 1` -OSTCOUNT=`cat $LPROC/lov/$LOVNAME/numobd` +LOVNAME=`lctl get_param -n llite.*.lov.common_name | tail -n 1` +OSTCOUNT=`lctl get_param -n lov.$LOVNAME.numobd` -SHOW_QUOTA_USER="$LFS quota -u $TSTUSR $DIR" -SHOW_QUOTA_GROUP="$LFS quota -g $TSTUSR $DIR" +SHOW_QUOTA_USER="$LFS quota -v -u $TSTUSR $DIR" +SHOW_QUOTA_GROUP="$LFS quota -v -g $TSTUSR $DIR" SHOW_QUOTA_INFO="$LFS quota -t $DIR" # control the time of tests @@ -81,74 +82,59 @@ eval ONLY_99=true # set_blk_tunables(btune_sz) set_blk_tunesz() { + local btune=$(($1 * BLK_SZ)) # set btune size on all obdfilters - do_facet ost1 "set -x; for i in /proc/fs/lustre/obdfilter/*/quota_btune_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" + do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_btune_sz=$btune" # set btune size on mds - do_facet $SINGLEMDS "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_btune_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" + do_facet $SINGLEMDS "lctl set_param lquota.mdd_obd-${FSNAME}-MDT*.quota_btune_sz=$btune" } # set_blk_unitsz(bunit_sz) set_blk_unitsz() { - do_facet ost1 "for i in /proc/fs/lustre/obdfilter/*/quota_bunit_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" - do_facet $SINGLEMDS "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_bunit_sz; do - echo $(($1 * BLK_SZ)) >> \\\$i; - done" + local bunit=$(($1 * BLK_SZ)) + # set bunit size on all obdfilters + do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_bunit_sz=$bunit" + # set bunit size on mds + do_facet $SINGLEMDS "lctl set_param lquota.mdd_obd-${FSNAME}-MDT*.quota_bunit_sz=$bunit" } # set_file_tunesz(itune_sz) set_file_tunesz() { - # set iunit and itune size on all obdfilters - do_facet ost1 "for i in /proc/fs/lustre/obdfilter/*/quota_itune_sz; do - echo $1 >> \\\$i; - done" - # set iunit and itune size on mds - do_facet $SINGLEMDS "for i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_itune_sz; do - echo $1 >> \\\$i; - done" + local itune=$1 + # set itune size on all obdfilters + do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_itune_sz=$itune" + # set itune size on mds + do_facet $SINGLEMDS "lctl set_param lquota.mdd_obd-${FSNAME}-MDT*.quota_itune_sz=$itune" } # set_file_unitsz(iunit_sz) set_file_unitsz() { - do_facet ost1 "for i in /proc/fs/lustre/obdfilter/*/quota_iunit_sz; do - echo $1 >> \\\$i; - done" - do_facet $SINGLEMDS "for 
i in /proc/fs/lustre/mds/${FSNAME}-MDT*/quota_iunit_sz; do - echo $1 >> \\\$i; - done" + local iunit=$1 + # set iunit size on all obdfilters + do_facet ost1 "lctl set_param lquota.${FSNAME}-OST*.quota_iunit_sz=$iunit" + # set iunit size on mds + do_facet $SINGLEMDS "lctl set_param lquota.mdd_obd-${FSNAME}-MDT*.quota_iunit_sz=$iunit" } -# These are for test on local machine,if run sanity-quota.sh on -# real cluster, ltest should have setup the test environment: -# -# - create test user/group on all servers with same id. -# - set unit size/tune on all servers size to reasonable value. -pre_test() { - if [ -z "$NOSETUP" ]; then - # set block tunables - set_blk_tunesz $BTUNE_SZ - set_blk_unitsz $BUNIT_SZ - # set file tunables - set_file_tunesz $ITUNE_SZ - set_file_unitsz $IUNIT_SZ - fi -} -pre_test - -post_test() { - if [ -z "$NOSETUP" ]; then - # restore block tunables to default size - set_blk_unitsz $((1024 * 100)) - set_blk_tunesz $((1024 * 50)) - # restore file tunables to default size - set_file_unitsz 5000 - set_file_tunesz 2500 - fi +lustre_fail() { + local fail_node=$1 + local fail_loc=$2 + + case $fail_node in + "mds" ) + do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" ;; + "ost" ) + for num in `seq $OSTCOUNT`; do + do_facet ost$num "lctl set_param fail_loc=$fail_loc" + done ;; + "mds_ost" ) + do_facet $SINGLEMDS "lctl set_param fail_loc=$fail_loc" ; + for num in `seq $OSTCOUNT`; do + do_facet ost$num "lctl set_param fail_loc=$fail_loc" + done ;; + * ) echo "usage: lustre_fail fail_node fail_loc" ; + return 1 ;; + esac } RUNAS="runas -u $TSTID" @@ -158,126 +144,232 @@ FAIL_ON_ERROR=true check_runas_id $TSTID2 $RUNAS2 FAIL_ON_ERROR=false +run_test_with_stat() { + (($# != 2)) && error "the number of arguments is wrong" + + do_facet $SINGLEMDS "lctl set_param lquota.mdd_obd-${FSNAME}-MDT*.stats=0" > /dev/null + for j in `seq $OSTCOUNT`; do + do_facet ost$j "lctl set_param lquota.${FSNAME}-OST*.stats=0" > /dev/null + done + run_test "$@" + if [ ${STAT:-"yes"} != "no" -a -z "$LAST_SKIPPED" ]; then + echo "statistics info begin ***************************************" + do_facet $SINGLEMDS "lctl get_param lquota.mdd_obd-${FSNAME}-MDT*.stats" + for j in `seq $OSTCOUNT`; do + do_facet ost$j "lctl get_param lquota.${FSNAME}-OST*.stats" + done + echo "statistics info end ***************************************" + fi +} + # set quota test_0() { $LFS quotaoff -ug $DIR $LFS quotacheck -ug $DIR - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + + lctl set_param debug="+quota" + do_facet $SINGLEMDS "lctl set_param debug=+quota" + for num in `seq $OSTCOUNT`; do + do_facet ost$num "lctl set_param debug=+quota" + done } -run_test 0 "Set quota =============================" +run_test_with_stat 0 "Set quota =============================" -# block hard limit (normal use and out of quota) -test_1() { +# test for specific quota limitation, qunit, qtune $1=block_quota_limit +test_1_sub() { + LIMIT=$1 mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir + TESTFILE="$DIR/$tdir/$tfile-0" - LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 5)) # 5 bunits each sever - TESTFILE=$DIR/$tdir/$tfile-0 - - echo " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + wait_delete_completed + + # test for user + log " User quota (limit: $LIMIT kbytes)" + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + sleep 3 $SHOW_QUOTA_USER - + $LFS setstripe $TESTFILE -c 1 
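 	# stripe count 1 confines the file to a single ost, so the per-ost
 	# "lfs quota -o" check below can see its usage drop back to zero
 	# after the file is removed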
chown $TSTUSR.$TSTUSR $TESTFILE - echo " Write ..." + log " Write ..." $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) || error "(usr) write failure, but expect success" - echo " Done" - echo " Write out of block quota ..." + log " Done" + log " Write out of block quota ..." # this time maybe cache write, ignore it's failure $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) seek=$(($LIMIT/2)) || true # flush cache, ensure noquota flag is setted on client - sync; sleep 1; sync; + cancel_lru_locks osc $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$LIMIT && error "(usr) write success, but expect EDQUOT" rm -f $TESTFILE - - echo " Group quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit - $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $DIR + sync; sleep 1; sync; + OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'` + OST0_QUOTA_USED=`$LFS quota -o $OST0_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $1 }'` + echo $OST0_QUOTA_USED + [ $OST0_QUOTA_USED -ne 0 ] && \ + ($SHOW_QUOTA_USER; error "quota deleted isn't released") + $SHOW_QUOTA_USER + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + + # test for group + log "--------------------------------------" + log " Group quota (limit: $LIMIT kbytes)" + $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + sleep 3 $SHOW_QUOTA_GROUP - TESTFILE=$DIR/$tdir/$tfile-1 + TESTFILE="$DIR/$tdir/$tfile-1" $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - echo " Write ..." + log " Write ..." $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) || error "(grp) write failure, but expect success" - echo " Done" - echo " Write out of block quota ..." + log " Done" + log " Write out of block quota ..." 
 	# this time maybe cache write, ignore it's failure
 	$RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT/2)) seek=$(($LIMIT/2)) || true
-	sync; sleep 1; sync;
+	cancel_lru_locks osc
 	$RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$LIMIT && error "(grp) write success, but expect EDQUOT"
 
 	# cleanup
 	rm -f $TESTFILE
-	$LFS setquota -g $TSTUSR 0 0 0 0 $DIR
+	sync; sleep 1; sync;
+	OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'`
+	OST0_QUOTA_USED=`$LFS quota -o $OST0_UUID -g $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $1 }'`
+	echo $OST0_QUOTA_USED
+	[ $OST0_QUOTA_USED -ne 0 ] && \
+	    ($SHOW_QUOTA_GROUP; error "quota deleted isn't released")
+	$SHOW_QUOTA_GROUP
+	$LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
 }
-run_test 1 "Block hard limit (normal use and out of quota) ==="
 
-# file hard limit (normal use and out of quota)
-test_2() {
+# block hard limit (normal use and out of quota)
+test_1() {
+	for i in `seq 1 $cycle`; do
+	    # blk_qunit is between 1M and 4M
+	    blk_qunit=$(( $RANDOM % 3072 + 1024 ))
+	    blk_qtune=$(( $RANDOM % $blk_qunit ))
+	    # the other osts and the mds will each occupy about 1M of block quota
+	    b_limit=$(( ($RANDOM - 16384) / 8 + $OSTCOUNT * $blk_qunit * 4 ))
+	    set_blk_tunesz $blk_qtune
+	    set_blk_unitsz $blk_qunit
+	    echo "cycle: $i(total $cycle) bunit:$blk_qunit, btune:$blk_qtune, blimit:$b_limit"
+	    test_1_sub $b_limit
+	    echo "=================================================="
+	    set_blk_unitsz $((128 * 1024))
+	    set_blk_tunesz $((128 * 1024 / 2))
+	done
+}
+run_test_with_stat 1 "Block hard limit (normal use and out of quota) ==="
 
+# test for specific quota limitation, qunit, qtune $1=file_quota_limit
+test_2_sub() {
+	LIMIT=$1
 	mkdir -p $DIR/$tdir
 	chmod 0777 $DIR/$tdir
+	TESTFILE="$DIR/$tdir/$tfile-0"
 
-	LIMIT=$(($IUNIT_SZ * 10)) # 10 iunits on mds
-	TESTFILE=$DIR/$tdir/$tfile-0
+	wait_delete_completed
 
-	echo "  User quota (limit: $LIMIT files)"
-	$LFS setquota -u $TSTUSR 0 0 0 $LIMIT $DIR
+	# test for user
+	log "  User quota (limit: $LIMIT files)"
+	$LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I $LIMIT $DIR
+	sleep 3
 	$SHOW_QUOTA_USER
 
-	echo "    Create $LIMIT files ..."
+	log "    Create $LIMIT files ..."
 	$RUNAS createmany -m ${TESTFILE} $LIMIT || \
-		error "(usr) create failure, but expect success"
-	echo "    Done"
-	echo "    Create out of file quota ..."
+	    error "(usr) create failure, but expect success"
+	log "    Done"
+	log "    Create out of file quota ..."
 	$RUNAS touch ${TESTFILE}_xxx && \
-		error "(usr) touch success, but expect EDQUOT"
+	    error "(usr) touch success, but expect EDQUOT"
 
 	unlinkmany ${TESTFILE} $LIMIT
-	rm ${TESTFILE}_xxx
+	rm -f ${TESTFILE}_xxx
+	sync; sleep 1; sync;
+
+	MDS_UUID=`do_facet $SINGLEMDS $LCTL dl | grep -m1 " mdt " | awk '{print $((NF-1))}'`
+	MDS_QUOTA_USED=`$LFS quota -o $MDS_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $1 }'`
+	echo $MDS_QUOTA_USED
+	[ $MDS_QUOTA_USED -ne 0 ] && \
+	    ($SHOW_QUOTA_USER; error "quota deleted isn't released")
+	$SHOW_QUOTA_USER
+	$LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
 
-	echo "  Group quota (limit: $LIMIT files)"
-	$LFS setquota -u $TSTUSR 0 0 0 0 $DIR		# clear user limit
-	$LFS setquota -g $TSTUSR 0 0 0 $LIMIT $DIR
+	# test for group
+	log "--------------------------------------"
+	log "  Group quota (limit: $LIMIT files)"
+	$LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I $LIMIT $DIR
+	sleep 3
 	$SHOW_QUOTA_GROUP
 	TESTFILE=$DIR/$tdir/$tfile-1
 
-	echo "    Create $LIMIT files ..."
+	log "    Create $LIMIT files ..."
 	$RUNAS createmany -m ${TESTFILE} $LIMIT || \
-		error "(grp) create failure, but expect success"
+	    error "(grp) create failure, but expect success"
+	log "    Done"
+	log "    Create out of file quota ..."
+	$RUNAS touch ${TESTFILE}_xxx && \
+	    error "(grp) touch success, but expect EDQUOT"
 
-	echo "    Done"
-	echo "    Create out of file quota ..."
-	$RUNAS touch ${TESTFILE}_xxx && \
-		error "(grp) touch success, but expect EDQUOT"
+	unlinkmany ${TESTFILE} $LIMIT
+	rm -f ${TESTFILE}_xxx
+	sync; sleep 1; sync;
 
-	$RUNAS touch ${TESTFILE}_xxx > /dev/null 2>&1 && error "(grp) touch success, but expect EDQUOT"
+	MDS_UUID=`do_facet $SINGLEMDS $LCTL dl | grep -m1 " mdt " | awk '{print $((NF-1))}'`
+	MDS_QUOTA_USED=`$LFS quota -o $MDS_UUID -g $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $1 }'`
+	echo $MDS_QUOTA_USED
+	[ $MDS_QUOTA_USED -ne 0 ] && \
+	    ($SHOW_QUOTA_GROUP; error "quota deleted isn't released")
+	$SHOW_QUOTA_GROUP
+	$LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
+}
 
-	# cleanup
-	unlinkmany ${TESTFILE} $LIMIT
-	rm ${TESTFILE}_xxx
+# file hard limit (normal use and out of quota)
+test_2() {
+	for i in `seq 1 $cycle`; do
+	    if [ $i -eq 1 ]; then
+		ino_qunit=52
+		ino_qtune=41
+		i_limit=11
+	    else
+		# ino_qunit is between 10 and 100
+		ino_qunit=$(( $RANDOM % 90 + 10 ))
+		ino_qtune=$(( $RANDOM % $ino_qunit ))
+		# RANDOM's maximum is 32767
+		i_limit=$(( $RANDOM % 990 + 10 ))
+	    fi
 
-	$LFS setquota -g $TSTUSR 0 0 0 0 $DIR
+	    set_file_tunesz $ino_qtune
+	    set_file_unitsz $ino_qunit
+	    echo "cycle: $i(total $cycle) iunit:$ino_qunit, itune:$ino_qtune, ilimit:$i_limit"
+	    test_2_sub $i_limit
+	    echo "=================================================="
+	    set_file_unitsz 5120
+	    set_file_tunesz 2560
+	done
 }
-run_test 2 "File hard limit (normal use and out of quota) ==="
+run_test_with_stat 2 "File hard limit (normal use and out of quota) ==="
 
 test_block_soft() {
 	TESTFILE=$1
 	TIMER=$(($2 * 3 / 2))
 	OFFSET=0
 
+	wait_delete_completed
+
 	echo "  Write to exceed soft limit"
 	RUNDD="$RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ"
 	$RUNDD count=$((BUNIT_SZ+1)) || \
-		error "write failure, but expect success"
+	    error "write failure, but expect success"
 	OFFSET=$((OFFSET + BUNIT_SZ + 1))
-	sync; sleep 1; sync;
+	cancel_lru_locks osc
 	$SHOW_QUOTA_USER
 	$SHOW_QUOTA_GROUP
@@ -285,38 +377,38 @@ test_block_soft() {
 	echo "    Write before timer goes off"
 	$RUNDD count=$BUNIT_SZ seek=$OFFSET || \
-		error "write failure, but expect success"
+	    error "write failure, but expect success"
 	OFFSET=$((OFFSET + BUNIT_SZ))
-	sync; sleep 1; sync;
+	cancel_lru_locks osc
 	echo "    Done"
-
-	echo "    Sleep $TIMER seconds ..."
-	sleep $TIMER
-	$SHOW_QUOTA_USER
-	$SHOW_QUOTA_GROUP
-	$SHOW_QUOTA_INFO
+	echo "    Sleep $TIMER seconds ..."
+	sleep $TIMER
+
+	$SHOW_QUOTA_USER
+	$SHOW_QUOTA_GROUP
+	$SHOW_QUOTA_INFO
 	echo "    Write after timer goes off"
 	# maybe cache write, ignore.
-	sync; sleep 1; sync;
 	$RUNDD count=$BUNIT_SZ seek=$OFFSET || true
 	OFFSET=$((OFFSET + BUNIT_SZ))
-	sync; sleep 1; sync;
+	cancel_lru_locks osc
 	$RUNDD count=$BUNIT_SZ seek=$OFFSET && \
-		error "write success, but expect EDQUOT"
+	    error "write success, but expect EDQUOT"
 
-	$SHOW_QUOTA_USER
-	$SHOW_QUOTA_GROUP
-	$SHOW_QUOTA_INFO
+	$SHOW_QUOTA_USER
+	$SHOW_QUOTA_GROUP
+	$SHOW_QUOTA_INFO
 	echo "    Unlink file to stop timer"
 	rm -f $TESTFILE
+	sync; sleep 1; sync
 	echo "    Done"
-	$SHOW_QUOTA_USER
-	$SHOW_QUOTA_GROUP
-	$SHOW_QUOTA_INFO
+	$SHOW_QUOTA_USER
+	$SHOW_QUOTA_GROUP
+	$SHOW_QUOTA_INFO
 	echo "    Write ..."
$RUNDD count=$BUNIT_SZ || error "write failure, but expect success" @@ -324,6 +416,7 @@ test_block_soft() { # cleanup rm -f $TESTFILE + sync; sleep 3; sync; } # block soft limit (start timer, timer goes off, stop timer) @@ -331,7 +424,8 @@ test_3() { mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir - LIMIT=$(( $BUNIT_SZ * 2 )) # 1 bunit on mds and 1 bunit on the ost + # 1 bunit on mds and 1 bunit on every ost + LIMIT=$(( $BUNIT_SZ * ($OSTCOUNT + 1) )) GRACE=10 echo " User quota (soft limit: $LIMIT kbytes grace: $GRACE seconds)" @@ -340,11 +434,11 @@ test_3() { $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - $LFS setquota -t -u $GRACE $MAX_IQ_TIME $DIR - $LFS setquota -u $TSTUSR $LIMIT 0 0 0 $DIR + $LFS setquota -t -u --block-grace $GRACE --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -u $TSTUSR -b $LIMIT -B 0 -i 0 -I 0 $DIR test_block_soft $TESTFILE $GRACE - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR echo " Group quota (soft limit: $LIMIT kbytes grace: $GRACE seconds)" TESTFILE=$DIR/$tdir/$tfile-1 @@ -352,19 +446,21 @@ test_3() { $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - $LFS setquota -t -g $GRACE $MAX_IQ_TIME $DIR - $LFS setquota -g $TSTUSR $LIMIT 0 0 0 $DIR + $LFS setquota -t -g --block-grace $GRACE --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -g $TSTUSR -b $LIMIT -B 0 -i 0 -I 0 $DIR test_block_soft $TESTFILE $GRACE - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } -run_test 3 "Block soft limit (start timer, timer goes off, stop timer) ===" +run_test_with_stat 3 "Block soft limit (start timer, timer goes off, stop timer) ===" test_file_soft() { TESTFILE=$1 LIMIT=$2 TIMER=$(($3 * 3 / 2)) + wait_delete_completed + echo " Create files to exceed soft limit" $RUNAS createmany -m ${TESTFILE}_ $((LIMIT + 1)) || \ error "create failure, but expect success" @@ -379,23 +475,24 @@ test_file_soft() { echo " Sleep $TIMER seconds ..." 
sleep $TIMER - + $SHOW_QUOTA_USER $SHOW_QUOTA_GROUP $SHOW_QUOTA_INFO - + echo " Create file after timer goes off" - $RUNAS createmany -m ${TESTFILE}_after_ $((IUNIT_SZ - 2)) || \ - error "create ${TESTFILE}_after failure, but expect success" + # the least of inode qunit is 2, so there are at most 3(qunit:2+qtune:1) + # inode quota left here + $RUNAS touch ${TESTFILE}_after ${TESTFILE}_after1 ${TESTFILE}_after2 || true sync; sleep 1; sync - $RUNAS touch ${TESTFILE}_after && \ + $RUNAS touch ${TESTFILE}_after3 && \ error "create after timer expired, but expect EDQUOT" sync; sleep 1; sync $SHOW_QUOTA_USER $SHOW_QUOTA_GROUP $SHOW_QUOTA_INFO - + echo " Unlink files to stop timer" find `dirname $TESTFILE` -name "`basename ${TESTFILE}`*" | xargs rm -f echo " Done" @@ -408,6 +505,7 @@ test_file_soft() { # cleanup rm -f ${TESTFILE}_xxx + sync; sleep 3; sync; } # file soft limit (start timer, timer goes off, stop timer) @@ -420,66 +518,70 @@ test_4a() { # was test_4 GRACE=5 echo " User quota (soft limit: $LIMIT files grace: $GRACE seconds)" - $LFS setquota -t -u $MAX_DQ_TIME $GRACE $DIR - $LFS setquota -u $TSTUSR 0 0 $LIMIT 0 $DIR + $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $GRACE $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i $LIMIT -I 0 $DIR $SHOW_QUOTA_USER test_file_soft $TESTFILE $LIMIT $GRACE - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR echo " Group quota (soft limit: $LIMIT files grace: $GRACE seconds)" - $LFS setquota -t -g $MAX_DQ_TIME $GRACE $DIR - $LFS setquota -g $TSTUSR 0 0 $LIMIT 0 $DIR + $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $GRACE $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i $LIMIT -I 0 $DIR $SHOW_QUOTA_GROUP TESTFILE=$DIR/$tdir/$tfile-1 test_file_soft $TESTFILE $LIMIT $GRACE - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR # cleanup - $LFS setquota -t -u $MAX_DQ_TIME $MAX_IQ_TIME $DIR - $LFS setquota -t -g $MAX_DQ_TIME $MAX_IQ_TIME $DIR + $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR } -run_test 4a "File soft limit (start timer, timer goes off, stop timer) ===" +run_test_with_stat 4a "File soft limit (start timer, timer goes off, stop timer) ===" test_4b() { # was test_4a - GR_STR1="1w3d" - GR_STR2="1000s" - GR_STR3="5s" - GR_STR4="1w2d3h4m5s" - GR_STR5="5c" - GR_STR6="1111111111111111" - - # test of valid grace strings handling - echo " Valid grace strings test" - $LFS setquota -t -u $GR_STR1 $GR_STR2 $DIR - $LFS quota -u -t $DIR | grep "Block grace time: $GR_STR1" - $LFS setquota -t -g $GR_STR3 $GR_STR4 $DIR - $LFS quota -g -t $DIR | grep "Inode grace time: $GR_STR4" - - # test of invalid grace strings handling - echo " Invalid grace strings test" - ! $LFS setquota -t -u $GR_STR4 $GR_STR5 $DIR - ! 
$LFS setquota -t -g $GR_STR4 $GR_STR6 $DIR - - # cleanup - $LFS setquota -t -u $MAX_DQ_TIME $MAX_IQ_TIME $DIR - $LFS setquota -t -g $MAX_DQ_TIME $MAX_IQ_TIME $DIR + GR_STR1="1w3d" + GR_STR2="1000s" + GR_STR3="5s" + GR_STR4="1w2d3h4m5s" + GR_STR5="5c" + GR_STR6="1111111111111111" + + wait_delete_completed + + # test of valid grace strings handling + echo " Valid grace strings test" + $LFS setquota -t -u --block-grace $GR_STR1 --inode-grace $GR_STR2 $DIR + $LFS quota -u -t $DIR | grep "Block grace time: $GR_STR1" + $LFS setquota -t -g --block-grace $GR_STR3 --inode-grace $GR_STR4 $DIR + $LFS quota -g -t $DIR | grep "Inode grace time: $GR_STR4" + + # test of invalid grace strings handling + echo " Invalid grace strings test" + ! $LFS setquota -t -u --block-grace $GR_STR4 --inode-grace $GR_STR5 $DIR + ! $LFS setquota -t -g --block-grace $GR_STR4 --inode-grace $GR_STR6 $DIR + + # cleanup + $LFS setquota -t -u --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR + $LFS setquota -t -g --block-grace $MAX_DQ_TIME --inode-grace $MAX_IQ_TIME $DIR } -run_test 4b "Grace time strings handling ===" +run_test_with_stat 4b "Grace time strings handling ===" # chown & chgrp (chown & chgrp successfully even out of block/file quota) test_5() { mkdir -p $DIR/$tdir BLIMIT=$(( $BUNIT_SZ * $((OSTCOUNT + 1)) * 10)) # 10 bunits on each server ILIMIT=$(( $IUNIT_SZ * 10 )) # 10 iunits on mds - + + wait_delete_completed + echo " Set quota limit (0 $BLIMIT 0 $ILIMIT) for $TSTUSR.$TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLIMIT 0 $ILIMIT $DIR - $LFS setquota -g $TSTUSR 0 $BLIMIT 0 $ILIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLIMIT -i 0 -I $ILIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLIMIT -i 0 -I $ILIMIT $DIR $SHOW_QUOTA_USER $SHOW_QUOTA_GROUP - + echo " Create more than $ILIMIT files and more than $BLIMIT kbytes ..." createmany -m $DIR/$tdir/$tfile-0_ $((ILIMIT + 1)) || \ error "touch failure, expect success" @@ -493,11 +595,12 @@ test_5() { # cleanup unlinkmany $DIR/$tdir/$tfile-0_ $((ILIMIT + 1)) + sync; sleep 3; sync; - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } -run_test 5 "Chown & chgrp successfully even out of block/file quota ===" +run_test_with_stat 5 "Chown & chgrp successfully even out of block/file quota ===" # block quota acquire & release test_6() { @@ -506,16 +609,18 @@ test_6() { return 0; fi + wait_delete_completed + mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir LIMIT=$((BUNIT_SZ * (OSTCOUNT + 1) * 5)) # 5 bunits per server FILEA="$DIR/$tdir/$tfile-0_a" FILEB="$DIR/$tdir/$tfile-0_b" - + echo " Set block limit $LIMIT kbytes to $TSTUSR.$TSTUSR" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR - $LFS setquota -g $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $SHOW_QUOTA_USER $SHOW_QUOTA_GROUP @@ -526,42 +631,44 @@ test_6() { chown $TSTUSR.$TSTUSR $FILEB echo " Exceed quota limit ..." 
- RUNDD="$RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ" - $RUNDD count=$((LIMIT - BUNIT_SZ * OSTCOUNT)) || \ - error "write fileb failure, but expect success" + RUNDD="$RUNAS dd if=/dev/zero of=$FILEB bs=$BLK_SZ" + $RUNDD count=$((LIMIT - BUNIT_SZ * OSTCOUNT)) || \ + error "write fileb failure, but expect success" - sync; sleep 1; sync; - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP - $RUNDD seek=$LIMIT count=$((BUNIT_SZ * OSTCOUNT)) && \ - error "write fileb success, but expect EDQUOT" - sync; sleep 1; sync; + cancel_lru_locks osc + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP + $RUNDD seek=$LIMIT count=$((BUNIT_SZ * OSTCOUNT)) && \ + error "write fileb success, but expect EDQUOT" + cancel_lru_locks osc echo " Write to OST0 return EDQUOT" # this write maybe cache write, ignore it's failure - RUNDD="$RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ" - $RUNDD count=$(($BUNIT_SZ * 2)) || true - sync; sleep 1; sync; - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP - $RUNDD count=$((BUNIT_SZ * 2)) seek=$((BUNIT_SZ *2)) && \ - error "write filea success, but expect EDQUOT" + RUNDD="$RUNAS dd if=/dev/zero of=$FILEA bs=$BLK_SZ" + $RUNDD count=$(($BUNIT_SZ * 2)) || true + cancel_lru_locks osc + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP + $RUNDD count=$((BUNIT_SZ * 2)) seek=$((BUNIT_SZ *2)) && \ + error "write filea success, but expect EDQUOT" echo " Remove fileb to let OST1 release quota" rm -f $FILEB - sync; sleep 10; sync; # need to allow journal commit for small fs + sync; sleep 10; sync; # need to allow journal commit for small fs echo " Write to OST0" $RUNDD count=$((LIMIT - BUNIT_SZ * OSTCOUNT)) || \ - error "write filea failure, expect success" + error "write filea failure, expect success" echo " Done" # cleanup rm -f $FILEA - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR - $LFS setquota -g $TSTUSR 0 0 0 0 $DIR + sync; sleep 3; sync; + + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR return 0 } -run_test 6 "Block quota acquire & release =========" +run_test_with_stat 6 "Block quota acquire & release =========" # quota recovery (block quota only by now) test_7() @@ -569,23 +676,25 @@ test_7() mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir - LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits each sever + wait_delete_completed + + LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) )) TESTFILE="$DIR/$tdir/$tfile-0" - - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR - + + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + $LFS setstripe $TESTFILE -c 1 chown $TSTUSR.$TSTUSR $TESTFILE echo " Write to OST0..." $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ || \ error "write failure, but expect success" - - #define OBD_FAIL_OBD_DQACQ 0x604 - echo 0x604 > /proc/sys/lustre/fail_loc + + #define OBD_FAIL_OBD_DQACQ 0x604 + lustre_fail mds 0x604 echo " Remove files on OST0" rm -f $TESTFILE - echo 0 > /proc/sys/lustre/fail_loc + lustre_fail mds 0 echo " Trigger recovery..." OSC0_UUID="`$LCTL dl | awk '$3 ~ /osc/ { print $1 }'`" @@ -598,20 +707,20 @@ test_7() # check limits PATTERN="`echo $DIR | sed 's/\//\\\\\//g'`" - TOTAL_LIMIT="`$LFS quota -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" + TOTAL_LIMIT="`$LFS quota -v -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" [ $TOTAL_LIMIT -eq $LIMIT ] || error "total limits not recovery!" 
echo " total limits = $TOTAL_LIMIT" - - OST0_UUID=`do_facet ost1 "$LCTL dl | grep -m1 obdfilter" | awk '{print $((NF-1))}'` - [ -z "$OST0_UUID" ] && OST0_UUID=`do_facet ost1 "$LCTL dl | grep -m1 obdfilter" | awk '{print $((NF-1))}'` - OST0_LIMIT="`$LFS quota -o $OST0_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $3 }'`" + + OST0_UUID=`do_facet ost1 "$LCTL dl | grep -m1 obdfilter" | awk '{print $((NF-1))}'` + [ -z "$OST0_UUID" ] && OST0_UUID=`do_facet ost1 "$LCTL dl | grep -m1 obdfilter" | awk '{print $((NF-1))}'` + OST0_LIMIT="`$LFS quota -o $OST0_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $2 }'`" [ $OST0_LIMIT -eq $BUNIT_SZ ] || error "high limits not released!" echo " limits on $OST0_UUID = $OST0_LIMIT" # cleanup - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } -run_test 7 "Quota recovery (only block limit) ======" +run_test_with_stat 7 "Quota recovery (only block limit) ======" # run dbench with quota enabled test_8() { @@ -622,9 +731,9 @@ test_8() { wait_delete_completed echo " Set enough high limit for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR echo " Set enough high limit for group: $TSTUSR" - $LFS setquota -g $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR chmod 0777 $DIR/$tdir local duration="" @@ -633,162 +742,140 @@ test_8() { sync; sleep 3; sync; - return 0 + return 0 } -run_test 8 "Run dbench with quota enabled ===========" +run_test_with_stat 8 "Run dbench with quota enabled ===========" # run for fixing bug10707, it needs a big room. test for 64bit KB=1024 GB=$((KB * 1024 * 1024)) -FSIZE=$((OSTCOUNT * 9 / 2)) # Use this as dd bs to decrease time # inode->i_blkbits = min(PTLRPC_MAX_BRW_BITS+1, LL_MAX_BLKSIZE_BITS); blksize=$((1 << 21)) # 2Mb +size_file=$((GB * 9 / 2)) +# this check is just for test9 and test10 +OST0_MIN=4900000 #4.67G +check_whether_skip () { + OST0_SIZE=`$LFS df $DIR | awk '/\[OST:0\]/ {print $4}'` + log "OST0_SIZE: $OST0_SIZE required: $OST0_MIN" + if [ $OST0_SIZE -lt $OST0_MIN ]; then + echo "WARN: OST0 has less than $OST0_MIN free, skip this test." 
+ return 0 + else + return 1 + fi +} test_9() { - chmod 0777 $DIR/$tdir - lustrefs_size=`(echo 0; df -t lustre -P | awk '{print $4}') | tail -n 1` - size_file=$((FSIZE * GB)) - echo "lustrefs_size:$lustrefs_size size_file:$((size_file / KB))" - if [ $((lustrefs_size * KB)) -lt $size_file ]; then - skip "less than $size_file bytes free" - return 0; - fi - - set_blk_unitsz $((1024 * 100)) - set_blk_tunesz $((1024 * 50)) + check_whether_skip && return 0 - # set the D_QUOTA flag - debugsave - sysctl -w lnet.debug="+quota" - - TESTFILE="$DIR/$tdir/$tfile-0" + wait_delete_completed - BLK_LIMIT=$((100 * KB * KB)) # 100G - FILE_LIMIT=1000000 + set_blk_tunesz 512 + set_blk_unitsz 1024 - echo " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR - echo " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" - $LFS setquota -g $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR + mkdir -p $DIR/$tdir + chmod 0777 $DIR/$tdir + TESTFILE="$DIR/$tdir/$tfile-0" - echo " Set stripe" - [ $OSTCOUNT -ge 2 ] && $LFS setstripe $TESTFILE -c $OSTCOUNT - touch $TESTFILE - chown $TSTUSR.$TSTUSR $TESTFILE + BLK_LIMIT=$((100 * KB * KB)) # 100G + FILE_LIMIT=1000000 + echo " Set block limit $BLK_LIMIT kbytes to $TSTUSR.$TSTUSR" - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP + log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR + log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR - echo " Write the big file of $FSIZE G ..." - $RUNAS dd if=/dev/zero of=$TESTFILE bs=$blksize count=$((size_file / blksize)) || \ - error "(usr) write $FSIZE G file failure, but expect success" + echo " Set stripe" + $LFS setstripe $TESTFILE -c 1 + touch $TESTFILE + chown $TSTUSR.$TSTUSR $TESTFILE - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP - echo " delete the big file of $FSIZE G..." - $RUNAS rm -f $TESTFILE + log " Write the big file of 4.5G ..." + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$blksize count=$((size_file / blksize)) || \ + error "(usr) write 4.5G file failure, but expect success" - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP - echo " write the big file of 2 G..." - $RUNAS dd if=/dev/zero of=$TESTFILE bs=$blksize count=$((2 * GB / blksize)) || \ - error "(usr) write 2 G file failure, but expect seccess" + log " delete the big file of 4.5G..." + $RUNAS rm -f $TESTFILE + sync; sleep 3; sync; - echo " delete the big file of 2 G..." - $RUNAS rm -f $TESTFILE - RC=$? + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP - set_blk_tunesz $BTUNE_SZ - set_blk_unitsz $BUNIT_SZ + RC=$? - debugrestore - wait_delete_completed + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) - return $RC + return $RC } -run_test 9 "run for fixing bug10707(64bit) ===========" +run_test_with_stat 9 "run for fixing bug10707(64bit) ===========" # run for fixing bug10707, it need a big room. test for 32bit +# 2.0 version does not support 32 bit qd_count, so such test is obsolete. 
test_10() { mkdir -p $DIR/$tdir chmod 0777 $DIR/$tdir - lustrefs_size=`(echo 0; df -t lustre -P | awk '{print $4}') | tail -n 1` - size_file=$((FSIZE * GB)) - echo "lustrefs_size:$lustrefs_size size_file:$((size_file / KB))" - if [ $((lustrefs_size * KB)) -lt $size_file ]; then - skip "less than $size_file bytes free" - return 0; - fi + check_whether_skip && return 0 - sync; sleep 10; sync; + wait_delete_completed - set_blk_unitsz $((1024 * 100)) - set_blk_tunesz $((1024 * 50)) + set_blk_tunesz 512 + set_blk_unitsz 1024 - # set the D_QUOTA flag - debugsave - sysctl -w lnet.debug="+quota" - # make qd_count 32 bit - sysctl -w lustre.fail_loc=0xA00 + lustre_fail mds_ost 0xA00 TESTFILE="$DIR/$tdir/$tfile-0" BLK_LIMIT=$((100 * KB * KB)) # 100G FILE_LIMIT=1000000 - echo " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" - $LFS setquota -u $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR - echo " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" - $LFS setquota -g $TSTUSR 0 $BLK_LIMIT 0 $FILE_LIMIT $DIR - + log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR" + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR + log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR" + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $DIR + echo " Set stripe" - [ $OSTCOUNT -ge 2 ] && $LFS setstripe $TESTFILE -c $OSTCOUNT + $LFS setstripe $TESTFILE -c 1 touch $TESTFILE chown $TSTUSR.$TSTUSR $TESTFILE - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP - - echo " Write the big file of $FSIZE G ..." - $RUNAS dd if=/dev/zero of=$TESTFILE bs=$blksize count=$((size_file / blksize)) || \ - error "(usr) write $FSIZE G file failure, but expect success" - - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP - echo " delete the big file of $FSIZE G..." - $RUNAS rm -f $TESTFILE + log " Write the big file of 4.5 G ..." + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$blksize count=$((size_file / blksize)) || \ + error "(usr) write 4.5 G file failure, but expect success" - $SHOW_QUOTA_USER - $SHOW_QUOTA_GROUP + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP - echo " write the big file of 2 G..." - $RUNAS dd if=/dev/zero of=$TESTFILE bs=$blksize count=$((2 * GB / blkzise)) || \ - error "(usr) write 2 G file failure, but expect success" + log " delete the big file of 4.5 G..." + $RUNAS rm -f $TESTFILE + sync; sleep 3; sync; - echo " delete the big file of 2 G..." - $RUNAS rm -f $TESTFILE + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP RC=$? 
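 	# (lustre_fail, defined earlier in this script, wraps
 	# "lctl set_param fail_loc=..." on the mds, every ost, or both;
 	# e.g. "lustre_fail mds_ost 0xA00" above forces a 32-bit qd_count,
 	# and passing 0 clears the fail point again)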
- # clear the flage - debugrestore - # make qd_count 64 bit - sysctl -w lustre.fail_loc=0 + lustre_fail mds_ost 0 - set_blk_tunesz $BTUNE_SZ - set_blk_unitsz $BUNIT_SZ - - wait_delete_completed + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) return $RC } -run_test 10 "run for fixing bug10707(32bit) ===========" +#run_test_with_stat 10 "run for fixing bug10707(32bit) ===========" test_11() { wait_delete_completed @@ -796,14 +883,14 @@ test_11() { #prepare the test block_limit=`(echo 0; df -t lustre -P | awk '{print $(NF - 4)}') | tail -n 1` echo $block_limit - orig_dbr=`cat /proc/sys/vm/dirty_background_ratio` - orig_dec=`cat /proc/sys/vm/dirty_expire_centisecs` - orig_dr=`cat /proc/sys/vm/dirty_ratio` - orig_dwc=`cat /proc/sys/vm/dirty_writeback_centisecs` - echo 1 > /proc/sys/vm/dirty_background_ratio - echo 30 > /proc/sys/vm/dirty_expire_centisecs - echo 1 > /proc/sys/vm/dirty_ratio - echo 50 > /proc/sys/vm/dirty_writeback_centisecs + orig_dbr=`sysctl -n vm.dirty_background_ratio` + orig_dec=`sysctl -n vm.dirty_expire_centisecs` + orig_dr=`sysctl -n vm.dirty_ratio` + orig_dwc=`sysctl -n vm.dirty_writeback_centisecs` + sysctl -w vm.dirty_background_ratio=1 + sysctl -w vm.dirty_expire_centisecs=30 + sysctl -w vm.dirty_ratio=1 + sysctl -w vm.dirty_writeback_centisecs=50 TESTDIR="$DIR/$tdir" local RV=0 @@ -819,7 +906,7 @@ test_11() { echo -n " create a file for uid " for j in `seq 1 30`; do echo -n "$j " - # 30MB per dd for a total of 900MB (if space even permits) + # 30MB per dd for a total of 900MB (if space even permits) runas -u $j dd if=/dev/zero of=$TESTDIR/$tfile bs=$blksize count=15 > /dev/null 2>&1 & done echo "" @@ -838,7 +925,7 @@ test_11() { RV=2 break fi - LAST_USED=$USED + LAST_USED=$USED done echo " removing the test files..." rm -f $TESTDIR/$tfile @@ -848,16 +935,16 @@ test_11() { echo "Test took $SECS sec" #clean - echo $orig_dbr > /proc/sys/vm/dirty_background_ratio - echo $orig_dec > /proc/sys/vm/dirty_expire_centisecs - echo $orig_dr > /proc/sys/vm/dirty_ratio - echo $orig_dwc > /proc/sys/vm/dirty_writeback_centisecs + sysctl -w vm.dirty_background_ratio=$orig_dbr + sysctl -w vm.dirty_expire_centisecs=$orig_dec + sysctl -w vm.dirty_ratio=$orig_dr + sysctl -w vm.dirty_writeback_centisecs=$orig_dwc if [ $RV -ne 0 ]; then - error "Nothing was written for $SECS sec ... aborting" + error "Nothing was written for $SECS sec ... aborting" fi return $RV } -run_test 11 "run for fixing bug10912 ===========" +run_test_with_stat 11 "run for fixing bug10912 ===========" # test a deadlock between quota and journal b=11693 @@ -868,106 +955,115 @@ test_12() { [ "$(grep $DIR2 /proc/mounts)" ] || mount_client $DIR2 || \ { skip "Need lustre mounted on $MOUNT2 " && retutn 0; } + if [ $OSTCOUNT -lt 2 ]; then + skip "$OSTCOUNT < 2, too few osts" + return 0; + fi + LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 10)) # 10 bunits each sever TESTFILE="$DIR/$tdir/$tfile-0" TESTFILE2="$DIR2/$tdir/$tfile-1" - + + wait_delete_completed + echo " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR - $LFS setstripe $TESTFILE -i 0 -c 1 + $LFS setstripe $TESTFILE -i 0 -c 1 chown $TSTUSR.$TSTUSR $TESTFILE - $LFS setstripe $TESTFILE2 -i 0 -c 1 - chown $TSTUSR2.$TSTUSR2 $TESTFILE2 + $LFS setstripe $TESTFILE2 -i 1 -c 1 + chown $TSTUSR2.$TSTUSR2 $TESTFILE2 #define OBD_FAIL_OST_HOLD_WRITE_RPC 0x21f - sysctl -w lustre.fail_loc=0x0000021f + lustre_fail ost 0x0000021f echo " step1: write out of block quota ..." 
- $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT*2)) & - DDPID=$! - sleep 5 - $RUNAS2 dd if=/dev/zero of=$TESTFILE2 bs=$BLK_SZ count=102400 & + $RUNAS2 dd if=/dev/zero of=$TESTFILE2 bs=$BLK_SZ count=102400 & DDPID1=$! + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(($LIMIT*2)) & + DDPID=$! echo " step2: testing ......" count=0 while [ true ]; do - if [ -z `ps -ef | awk '$2 == '${DDPID1}' { print $8 }'` ]; then break; fi + if ! ps -p ${DDPID1} > /dev/null 2>&1; then break; fi count=$[count+1] if [ $count -gt 64 ]; then - sysctl -w lustre.fail_loc=0 + lustre_fail ost 0 error "dd should be finished!" fi sleep 1 - done + done echo "(dd_pid=$DDPID1, time=$count)successful" #Recover fail_loc and dd will finish soon - sysctl -w lustre.fail_loc=0 + lustre_fail ost 0 echo " step3: testing ......" count=0 while [ true ]; do - if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi + if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi count=$[count+1] - if [ $count -gt 100 ]; then + if [ $count -gt 150 ]; then error "dd should be finished!" fi sleep 1 - done + done echo "(dd_pid=$DDPID, time=$count)successful" rm -f $TESTFILE $TESTFILE2 - - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit + sync; sleep 3; sync; + + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } -run_test 12 "test a deadlock between quota and journal ===" +run_test_with_stat 12 "test a deadlock between quota and journal ===" # test multiple clients write block quota b=11693 test_13() { + mkdir -p $DIR/$tdir + wait_delete_completed + # one OST * 10 + (mds + other OSTs) LIMIT=$((BUNIT_SZ * 10 + (BUNIT_SZ * OSTCOUNT))) TESTFILE="$DIR/$tdir/$tfile" - mkdir -p $DIR/$tdir echo " User quota (limit: $LIMIT kbytes)" - $LFS setquota -u $TSTUSR 0 $LIMIT 0 0 $DIR + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR $SHOW_QUOTA_USER - + $LFS setstripe $TESTFILE -i 0 -c 1 chown $TSTUSR.$TSTUSR $TESTFILE $LFS setstripe $TESTFILE.2 -i 0 -c 1 - chown $TSTUSR.$TSTUSR $TESTFILE.2 + chown $TSTUSR.$TSTUSR $TESTFILE.2 echo " step1: write out of block quota ..." # one bunit will give mds - $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$[($LIMIT - $BUNIT_SZ) / 2] & + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$[($LIMIT - $BUNIT_SZ) / 2] & DDPID=$! - $RUNAS dd if=/dev/zero of=$TESTFILE.2 bs=$BLK_SZ count=$[($LIMIT - $BUNIT_SZ) / 2] & + $RUNAS dd if=/dev/zero of=$TESTFILE.2 bs=$BLK_SZ count=$[($LIMIT - $BUNIT_SZ) / 2] & DDPID1=$! echo " step2: testing ......" count=0 while [ true ]; do - if [ -z `ps -ef | awk '$2 == '${DDPID}' { print $8 }'` ]; then break; fi + if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi count=$[count+1] if [ $count -gt 64 ]; then error "dd should be finished!" fi sleep 1 - done + done echo "(dd_pid=$DDPID, time=$count)successful" count=0 while [ true ]; do - if [ -z `ps -ef | awk '$2 == '${DDPID1}' { print $8 }'` ]; then break; fi + if ! ps -p ${DDPID1} > /dev/null 2>&1 ; then break; fi count=$[count+1] if [ $count -gt 64 ]; then error "dd should be finished!" 
fi sleep 1 - done + done echo "(dd_pid=$DDPID1, time=$count)successful" sync; sleep 5; sync; @@ -980,13 +1076,14 @@ test_13() { error "files too small $fz + $fz2 < $((BUNIT_SZ * BLK_SZ * 10))" rm -f $TESTFILE $TESTFILE.2 - - $LFS setquota -u $TSTUSR 0 0 0 0 $DIR # clear user limit + sync; sleep 3; sync; + + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR } -run_test 13 "test multiple clients write block quota ===" +run_test_with_stat 13 "test multiple clients write block quota ===" check_if_quota_zero(){ - line=`$LFS quota -$1 $2 $DIR | wc -l` + line=`$LFS quota -v -$1 $2 $DIR | wc -l` for i in `seq 3 $line`; do if [ $i -eq 3 ]; then field="3 4 6 7" @@ -994,67 +1091,673 @@ check_if_quota_zero(){ field="3 5" fi for j in $field; do - tmp=`$LFS quota -$1 $2 $DIR | sed -n ${i}p | - awk '{print $'"$j"'}'` - [ -n "$tmp" ] && [ $tmp -ne 0 ] && $LFS quota -$1 $2 $DIR && \ + tmp=`$LFS quota -v -$1 $2 $DIR | sed -n ${i}p | + awk '{print $'"$j"'}'` + [ -n "$tmp" ] && [ $tmp -ne 0 ] && $LFS quota -v -$1 $2 $DIR && \ error "quota on $2 isn't clean" done done echo "pass check_if_quota_zero" } -pre_test_14 () { - # reboot the lustre - cd $T_PWD; sh llmountcleanup.sh || error "llmountcleanup failed" - sh llmount.sh - pre_test - run_test 0 "reboot lustre" -} - -pre_test_14 - test_14a() { # was test_14 b=12223 -- setting quota on root TESTFILE="$DIR/$tdir/$tfile" + + # reboot the lustre + sync; sleep 5; sync + cleanup_and_setup_lustre + test_0 + mkdir -p $DIR/$tdir # out of root's file and block quota - $LFS setquota -u root 10 10 10 10 $DIR + $LFS setquota -u root -b 10 -B 10 -i 10 -I 10 $DIR createmany -m ${TESTFILE} 20 || \ error "unexpected: user(root) create files failly!" dd if=/dev/zero of=$TESTFILE bs=4k count=4096 || \ error "unexpected: user(root) write files failly!" chmod 666 $TESTFILE $RUNAS dd if=/dev/zero of=${TESTFILE} seek=4096 bs=4k count=4096 && \ - error "unexpected: user(quota_usr) write a file successfully!" + error "unexpected: user(quota_usr) write a file successfully!" # trigger the llog chmod 777 $DIR - for i in `seq 1 10`; do $RUNAS touch ${TESTFILE}a_$i; done - for i in `seq 1 10`; do $RUNAS rm -f ${TESTFILE}a_$i; done + for i in `seq 1 10`; do $RUNAS touch ${TESTFILE}a_$i; done + for i in `seq 1 10`; do $RUNAS rm -f ${TESTFILE}a_$i; done # do the check - dmesg | tail | grep "\-122" |grep llog_obd_origin_add && error "err -122 not found in dmesg" - $LFS setquota -u root 0 0 0 0 $DIR + dmesg | tail | grep "\-122" |grep llog_obd_origin_add && error "err -122 not found in dmesg" + $LFS setquota -u root -b 0 -B 0 -i 0 -I 0 $DIR #check_if_quota_zero u root - # clean + # clean unlinkmany ${TESTFILE} 15 rm -f $TESTFILE + sync; sleep 3; sync; +} +run_test_with_stat 14a "test setting quota on root ===" + +# save quota version (both administrative and operational quotas) +quota_save_version() { + do_facet mgs "lctl conf_param ${FSNAME}-MDT*.mdd.quota_type=$1" + do_facet mgs "lctl conf_param ${FSNAME}-OST*.ost.quota_type=$1" + sleep 5 +} + +test_15(){ + LIMIT=$((24 * 1024 * 1024 * 1024 * 1024)) # 24 TB + PATTERN="`echo $DIR | sed 's/\//\\\\\//g'`" + + wait_delete_completed + + # test for user + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR + TOTAL_LIMIT="`$LFS quota -v -u $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`" + [ $TOTAL_LIMIT -eq $LIMIT ] || error " (user)total limits = $TOTAL_LIMIT; limit = $LIMIT, failed!" + echo " (user)total limits = $TOTAL_LIMIT; limit = $LIMIT, successful!" 
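+	# the limit is far beyond the 32-bit range (2^32 KB = 4TB), so a
+	# matching readback confirms the 64-bit value survived the round trip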
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
+
+        # test for group
+        $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
+        TOTAL_LIMIT="`$LFS quota -v -g $TSTUSR $DIR | awk '/^.*'$PATTERN'.*[[:digit:]+][[:space:]+]/ { print $4 }'`"
+        [ $TOTAL_LIMIT -eq $LIMIT ] || error " (group)total limits = $TOTAL_LIMIT; limit = $LIMIT, failed!"
+        echo " (group)total limits = $TOTAL_LIMIT; limit = $LIMIT, successful!"
+        $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
+        $LFS quotaoff -ug $DIR
+        do_facet $SINGLEMDS "lctl set_param lquota.mdd_obd-${FSNAME}-MDT*.quota_type=ug" | grep "error writing" && \
+                error "fail to set version for $SINGLEMDS"
+        for j in `seq $OSTCOUNT`; do
+                do_facet ost$j "lctl set_param lquota.${FSNAME}-OST*.quota_type=ug" | grep "error writing" && \
+                        error "fail to set version for ost$j"
+        done
+
+        echo "invalidating quota files"
+        $LFS quotainv -ug $DIR
+        $LFS quotainv -ugf $DIR
+        $LFS quotacheck -ug $DIR
+}
+run_test_with_stat 15 "set block quota more than 4T ==="
+
+# $1=u/g $2=with qunit adjust or not
+test_16_tub() {
+        LIMIT=$(( $BUNIT_SZ * $(($OSTCOUNT + 1)) * 4))
+        TESTFILE="$DIR/$tdir/$tfile"
+        mkdir -p $DIR/$tdir
+
+        wait_delete_completed
+
+        echo " User quota (limit: $LIMIT kbytes)"
+        if [ $1 == "u" ]; then
+                $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
+                $SHOW_QUOTA_USER
+        else
+                $LFS setquota -g $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
+                $SHOW_QUOTA_GROUP
+        fi
+
+        $LFS setstripe $TESTFILE -c 1
+        chown $TSTUSR.$TSTUSR $TESTFILE
+
+        echo " Write ..."
+        $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$((BUNIT_SZ * 4)) || \
+                error "(usr) write failure, but expect success"
+        echo " Done"
+        echo " Write out of block quota ..."
+        # this may be a cached write, so ignore its failure
+        $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$((BUNIT_SZ * 4)) || true
+        # flush cache, ensure the noquota flag is set on the client
+        cancel_lru_locks osc
+        if [ $2 -eq 1 ]; then
+                $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$((BUNIT_SZ * 4)) || \
+                        error "write failure, but expect success"
+        else
+                $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$BUNIT_SZ seek=$((BUNIT_SZ * 4)) && \
+                        error "write success, but expect EDQUOT"
+        fi
+
+        rm -f $TESTFILE
+        sync; sleep 3; sync;
+        $LFS setquota -$1 $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR
+}
+
+# test without adjusting qunit
+# the 2.0 version does not support WITHOUT_CHANGE_QS, so this test is obsolete
+test_16 () {
+        set_blk_tunesz $((BUNIT_SZ * 2))
+        set_blk_unitsz $((BUNIT_SZ * 4))
+        for i in u g; do
+                for j in 0 1; do
+                        # define OBD_FAIL_QUOTA_WITHOUT_CHANGE_QS 0xA01
+                        echo " grp/usr: $i, adjust qunit: $j"
+                        echo "-------------------------------"
+                        [ $j -eq 1 ] && lustre_fail mds_ost 0
+                        [ $j -eq 0 ] && lustre_fail mds_ost 0xA01
+                        test_16_tub $i $j
+                done
+        done
+        set_blk_unitsz $((128 * 1024))
+        set_blk_tunesz $((128 * 1024 / 2))
 }
-run_test 14a "test setting quota on root ==="
+#run_test_with_stat 16 "test without adjusting qunit"
+
+# run for fixing bug14526: quota reqs that return failure shouldn't ruin lustre.
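# Both the obsolete test_16 above and test_17 below drive quota error paths
# with fail_loc fault injection.  lustre_fail comes from the shared test
# framework; a minimal sketch of the behavior assumed here:
#
#       lustre_fail() {         # lustre_fail <mds|ost|mds_ost> <fail_loc>
#               local where=$1 loc=$2
#               [ "$where" != "ost" ] && \
#                       do_facet $SINGLEMDS "lctl set_param fail_loc=$loc"
#               [ "$where" != "mds" ] && \
#                       for i in `seq $OSTCOUNT`; do
#                               do_facet ost$i "lctl set_param fail_loc=$loc"
#                       done
#       }
#
# Values or'ed with OBD_FAIL_ONCE (0x80000000, as in 0x80000A02 below) fire
# once and clear themselves; plain values stay armed until reset with `... 0`.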
+test_17() { + set_blk_tunesz 512 + set_blk_unitsz 1024 + + wait_delete_completed + + #define OBD_FAIL_QUOTA_RET_QDATA | OBD_FAIL_ONCE + lustre_fail ost 0x80000A02 + + TESTFILE="$DIR/$tdir/$tfile-a" + TESTFILE2="$DIR/$tdir/$tfile-b" + mkdir -p $DIR/$tdir + + BLK_LIMIT=$((100 * 1024)) # 100M + + log " Set enough high limit(block:$BLK_LIMIT) for user: $TSTUSR" + $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR + log " Set enough high limit(block:$BLK_LIMIT) for group: $TSTUSR" + $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I 0 $DIR + + touch $TESTFILE + chown $TSTUSR.$TSTUSR $TESTFILE + touch $TESTFILE2 + chown $TSTUSR.$TSTUSR $TESTFILE2 + + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP + + log " Write the test file1 ..." + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$(( 10 * 1024 )) \ + || echo "write 10M file failure" + + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP + + log " write the test file2 ..." + $RUNAS dd if=/dev/zero of=$TESTFILE2 bs=$BLK_SZ count=$(( 10 * 1024 )) \ + || error "write 10M file failure" + + $SHOW_QUOTA_USER + $SHOW_QUOTA_GROUP + + rm -f $TESTFILE $TESTFILE2 + RC=$? + sync; sleep 3; sync; + + # make qd_count 64 bit + lustre_fail ost 0 + + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) + + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + + return $RC +} +run_test_with_stat 17 "run for fixing bug14526 ===========" + +# test when mds takes a long time to handle a quota req so that +# the ost has dropped it, the ost still could work well b=14840 +test_18() { + LIMIT=$((100 * 1024 * 1024)) # 100G + TESTFILE="$DIR/$tdir/$tfile" + mkdir -p $DIR/$tdir + + wait_delete_completed + + set_blk_tunesz 512 + set_blk_unitsz 1024 + + log " User quota (limit: $LIMIT kbytes)" + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT + $SHOW_QUOTA_USER + + $LFS setstripe $TESTFILE -i 0 -c 1 + chown $TSTUSR.$TSTUSR $TESTFILE + + #define OBD_FAIL_MDS_BLOCK_QUOTA_REQ 0x142 + lustre_fail mds 0x142 + + log " step1: write 100M block ..." + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$((1024 * 100)) & + DDPID=$! + + sleep 5 + lustre_fail mds 0 + + echo " step2: testing ......" + count=0 + timeout=$(lctl get_param -n timeout) + while [ true ]; do + if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi + count=$[count+1] + if [ $count -gt $((4 * $timeout)) ]; then + error "count=$count dd should be finished!" + fi + sleep 1 + done + log "(dd_pid=$DDPID, time=$count, timeout=$timeout)" + + testfile_size=$(stat -c %s $TESTFILE) + [ $testfile_size -ne $((BLK_SZ * 1024 * 100)) ] && \ + error "verifying file failed!" + rm -f $TESTFILE + sync; sleep 3; sync; + + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) +} +run_test_with_stat 18 "run for fixing bug14840 ===========" + +# test when mds drops a quota req, the ost still could work well b=14840 +test_18a() { + LIMIT=$((100 * 1024 * 1024)) # 100G + TESTFILE="$DIR/$tdir/$tfile-a" + mkdir -p $DIR/$tdir + + wait_delete_completed + + set_blk_tunesz 512 + set_blk_unitsz 1024 + + log " User quota (limit: $LIMIT kbytes)" + $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT + $SHOW_QUOTA_USER + + $LFS setstripe $TESTFILE -i 0 -c 1 + chown $TSTUSR.$TSTUSR $TESTFILE + + #define OBD_FAIL_MDS_DROP_QUOTA_REQ | OBD_FAIL_ONCE 0x80000143 + lustre_fail mds 0x80000143 + + log " step1: write 100M block ..." + $RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ count=$((1024 * 100)) & + DDPID=$! 
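        # The loop below is the poll-until-exit idiom used throughout this
        # script.  A reusable form (sketch only; wait_dd_finished is a
        # hypothetical name, not a test-framework helper):
        #
        #       wait_dd_finished() {    # wait_dd_finished <pid> <max_sec>
        #               local pid=$1 max=$2 waited=0
        #               while ps -p $pid > /dev/null 2>&1; do
        #                       [ $waited -ge $max ] && return 1
        #                       sleep 1
        #                       waited=$((waited + 1))
        #               done
        #               return 0
        #       }
        #
        # e.g. wait_dd_finished $DDPID $((6 * timeout)) || error "dd stuck"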
+
+        echo " step2: testing ......"
+        count=0
+        timeout=$(lctl get_param -n timeout)
+        while [ true ]; do
+                if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
+                count=$[count+1]
+                if [ $count -gt $((6 * $timeout)) ]; then
+                        lustre_fail mds 0
+                        error "count=$count dd should be finished!"
+                fi
+                sleep 1
+        done
+        log "(dd_pid=$DDPID, time=$count, timeout=$timeout)"
+
+        lustre_fail mds 0
+
+        rm -f $TESTFILE
+        sync; sleep 3; sync;
+
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+
+        set_blk_unitsz $((128 * 1024))
+        set_blk_tunesz $((128 * 1024 / 2))
+}
+run_test_with_stat 18a "run for fixing bug14840 ==========="
+
+# test that when the mds fails over, the ost still works well without
+# triggering the watchdog b=14840
+test_18bc_sub() {
+        type=$1
+
+        LIMIT=$((110 * 1024 )) # 110M
+        TESTFILE="$DIR/$tdir/$tfile"
+        mkdir -p $DIR/$tdir
+
+        wait_delete_completed
+
+        set_blk_tunesz 512
+        set_blk_unitsz 1024
+
+        log " User quota (limit: $LIMIT kbytes)"
+        $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT
+        $SHOW_QUOTA_USER
+
+        $LFS setstripe $TESTFILE -i 0 -c 1
+        chown $TSTUSR.$TSTUSR $TESTFILE
+
+        timeout=$(sysctl -n lustre.timeout)
+
+        if [ $type = "directio" ]; then
+                log " write 100M block(directio) ..."
+                $RUNAS $DIRECTIO write $TESTFILE 0 100 $((BLK_SZ * 1024)) &
+        else
+                log " write 100M block(normal) ..."
+                $RUNAS dd if=/dev/zero of=$TESTFILE bs=$((BLK_SZ * 1024)) count=100 &
+        fi
+
+        DDPID=$!
+        do_facet $SINGLEMDS "$LCTL conf_param ${FSNAME}-MDT*.mdd.quota_type=ug"
+
+        log "failing mds for $((2 * timeout)) seconds"
+        fail $SINGLEMDS $((2 * timeout))
+
+        # check if quotaon was successful
+        $LFS quota -u $TSTUSR $MOUNT 2>&1 | grep -q "quotas are not enabled"
+        if [ $? -eq 0 ]; then
+                error "quotaon failed!"
+                rm -rf $TESTFILE
+                return
+        fi
+
+        count=0
+        while [ true ]; do
+                if ! ps -p ${DDPID} > /dev/null 2>&1; then break; fi
+                if [ $((++count % (2 * timeout) )) -eq 0 ]; then
+                        log "it took $count seconds"
+                fi
+                sleep 1
+        done
+        log "(dd_pid=$DDPID, time=$count, timeout=$timeout)"
+        sync; sleep 1; sync
+
+        testfile_size=$(stat -c %s $TESTFILE)
+        [ $testfile_size -ne $((BLK_SZ * 1024 * 100)) ] && \
+                error "verifying file failed!"
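        # At this point the MDS has failed over while the writer was busy:
        # quotaon must survive recovery (the `lfs quota` grep above) and
        # the full 100M must have reached the OST despite the failover,
        # hence the exact size check before cleanup.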
+        $SHOW_QUOTA_USER
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+        rm -rf $TESTFILE
+        sync; sleep 1; sync
+}
+
+# test that when the mds fails over, the ost still works well;
+# this test shouldn't trigger the watchdog b=14840
+test_18b() {
+        test_18bc_sub normal
+        test_18bc_sub directio
+        # check if watchdog is triggered
+        MSG="test 18b: run for fixing bug14840"
+        do_facet ost1 "dmesg > $TMP/lustre-log-${TESTNAME}.log"
+        do_facet client cat > $TMP/lustre-log-${TESTNAME}.awk <<-EOF
+        /$MSG/ {
+                start = 1;
+        }
+        /Watchdog triggered/ {
+                if (start) {
+                        print \$0;
+                }
+        }
+        EOF
+        watchdog=`do_facet ost1 awk -f $TMP/lustre-log-${TESTNAME}.awk $TMP/lustre-log-${TESTNAME}.log`
+        if [ -n "$watchdog" ]; then error "$watchdog"; fi
+}
+run_test_with_stat 18b "run for fixing bug14840(mds failover, no watchdog) ==========="
+
+# test that when the mds fails over, the ost still works well;
+# this test prevents OST_DISCONNECT from happening b=14840
+test_18c() {
+        # define OBD_FAIL_OST_DISCONNECT_NET 0x202(disable ost_disconnect for osts)
+        lustre_fail ost 0x202
+        test_18bc_sub normal
+        test_18bc_sub directio
+        lustre_fail ost 0
+}
+run_test_with_stat 18c "run for fixing bug14840(mds failover, OST_DISCONNECT is disabled) ==========="
+
+run_to_block_limit() {
+        local LIMIT=$((($OSTCOUNT + 1) * $BUNIT_SZ))
+        local TESTFILE=$1
+        wait_delete_completed
+
+        # set 1 Mb quota unit size
+        set_blk_tunesz 512
+        set_blk_unitsz 1024
+
+        # bind file to a single OST
+        $LFS setstripe -c 1 $TESTFILE
+        chown $TSTUSR.$TSTUSR $TESTFILE
+
+        echo " User quota (limit: $LIMIT kbytes)"
+        $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT
+        $SHOW_QUOTA_USER
+        echo " Updating quota limits"
+        $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $MOUNT
+        $SHOW_QUOTA_USER
+
+        RUNDD="$RUNAS dd if=/dev/zero of=$TESTFILE bs=$BLK_SZ"
+        $RUNDD count=$BUNIT_SZ || error "(usr) write failure, but expect success"
+        # the page cache of TESTFILE may still be dirty at this point;
+        # push it to the corresponding OST, which also caches NOQUOTA
+        # on the client from the OST's reply
+        cancel_lru_locks osc
+        $RUNDD seek=$BUNIT_SZ && error "(usr) write success, should be EDQUOT"
+}
+
+test_19() {
+        # 1 Mb bunit for each MDS/OSS
+        local TESTFILE="$DIR/$tdir/$tfile"
+        mkdir -p $DIR/$tdir
+
+        run_to_block_limit $TESTFILE
+        $SHOW_QUOTA_USER
+
+        # cleanup
+        rm -f $TESTFILE
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+
+        set_blk_unitsz $((128 * 1024))
+        set_blk_tunesz $((128 * 1024 / 2))
+
+}
+run_test_with_stat 19 "test if administrative limit updates do not zero operational limits (14790) ==="
+
+test_20()
+{
+        LSTR=(1t 2g 3m 4k) # limits strings
+        LVAL=($[1*1024*1024*1024] $[2*1024*1024] $[3*1024*1024] $[4*1024]) # limits values
+
+        $LFS setquota -u $TSTUSR --block-softlimit ${LSTR[0]} \
+                $MOUNT || error "could not set quota limits"
+
+        $LFS setquota -u $TSTUSR --block-hardlimit ${LSTR[1]} \
+                --inode-softlimit ${LSTR[2]} \
+                --inode-hardlimit ${LSTR[3]} \
+                $MOUNT || error "could not set quota limits"
+
+        ($LFS quota -v -u $TSTUSR $MOUNT | \
+            grep -E '^ *'$MOUNT' *[0-9]+\** *'${LVAL[0]}' *'${LVAL[1]}' *[0-9]+\** *'${LVAL[2]}' *'${LVAL[3]}) \
+                || error "lfs quota output is unexpected"
+
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 \
+                $MOUNT || error "could not reset quota limits"
+
+}
+run_test_with_stat 20 "test if setquota specifiers work properly (15754)"
+
+test_21_sub() {
+        local testfile=$1
+        local blk_number=$2
+        local seconds=$3
+
+        time=$(($(date +%s) + seconds))
+        while [ $(date +%s) -lt $time ]; do
+                $RUNAS dd if=/dev/zero of=$testfile bs=$BLK_SZ count=$blk_number > /dev/null 2>&1
+                rm -f $testfile
+        done
+}
+
+# run for fixing bug16053: setquota shouldn't fail while writing and
+# deleting are happening
+test_21() {
+        set_blk_tunesz 512
+        set_blk_unitsz 1024
+
+        wait_delete_completed
+
+        TESTFILE="$DIR/$tdir/$tfile"
+
+        BLK_LIMIT=$((10 * 1024 * 1024)) # 10G
+        FILE_LIMIT=1000000
+
+        log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for user: $TSTUSR"
+        $LFS setquota -u $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $MOUNT
+        log " Set enough high limit(block:$BLK_LIMIT; file: $FILE_LIMIT) for group: $TSTUSR"
+        $LFS setquota -g $TSTUSR -b 0 -B $BLK_LIMIT -i 0 -I $FILE_LIMIT $MOUNT
+
+        # repeat writing on a 1M file
+        test_21_sub ${TESTFILE}_1 1024 30 &
+        DDPID1=$!
+        # repeat writing on a 128M file
+        test_21_sub ${TESTFILE}_2 $((1024 * 128)) 30 &
+        DDPID2=$!
+
+        time=$(($(date +%s) + 30))
+        i=1
+        while [ $(date +%s) -lt $time ]; do
+                log " Set quota for $i times"
+                $LFS setquota -u $TSTUSR -b 0 -B $((BLK_LIMIT + 1024 * i)) -i 0 -I $((FILE_LIMIT + i)) $MOUNT
+                $LFS setquota -g $TSTUSR -b 0 -B $((BLK_LIMIT + 1024 * i)) -i 0 -I $((FILE_LIMIT + i)) $MOUNT
+                i=$((i+1))
+                sleep 1
+        done
+
+        count=0
+        while [ true ]; do
+                if ! ps -p ${DDPID1} > /dev/null 2>&1; then break; fi
+                count=$[count+1]
+                if [ $count -gt 60 ]; then
+                        error "dd should be finished!"
+                fi
+                sleep 1
+        done
+        echo "(dd_pid=$DDPID1, time=$count)successful"
+
+        count=0
+        while [ true ]; do
+                if ! ps -p ${DDPID2} > /dev/null 2>&1; then break; fi
+                count=$[count+1]
+                if [ $count -gt 60 ]; then
+                        error "dd should be finished!"
+                fi
+                sleep 1
+        done
+        echo "(dd_pid=$DDPID2, time=$count)successful"
+
+        set_blk_unitsz $((128 * 1024))
+        set_blk_tunesz $((128 * 1024 / 2))
+        $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+        $LFS setquota -g $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT
+
+        return 0
+}
+run_test_with_stat 21 "run for fixing bug16053 ==========="
+
+test_22() {
+        local SAVEREFORMAT
+
+        SAVEREFORMAT=$REFORMAT
+        $LFS quotaoff -ug $DIR || error "could not turn quotas off"
+
+        quota_save_version "ug"
+
+        REFORMAT="reformat"
+        stopall
+        mount
+        setupall
+        REFORMAT=$SAVEREFORMAT
+
+        echo "checking parameters"
+
+        do_facet $SINGLEMDS "lctl get_param mdd.${FSNAME}-MDT*.quota_type" | grep "ug" || error "admin failure"
+        do_facet ost1 "lctl get_param obdfilter.*.quota_type" | grep "ug" || error "op failure"
+
+        run_test 0 "reboot lustre"
+}
+run_test_with_stat 22 "test if quota_type is saved as a permanent parameter ===="
+
+test_23_sub() {
+        mkdir -p $DIR/$tdir
+        chmod 0777 $DIR/$tdir
+        TESTFILE="$DIR/$tdir/$tfile-0"
+        rm -f $TESTFILE
+        local bs_unit=$((1024*1024))
+        LIMIT=$1
+
+        wait_delete_completed
+
+        # test for user
+        log " User quota (limit: $LIMIT kbytes)"
+        $LFS setquota -u $TSTUSR -b 0 -B $LIMIT -i 0 -I 0 $DIR
+        sleep 3
+        $SHOW_QUOTA_USER
+
+        $LFS setstripe $TESTFILE -c 1
+        chown $TSTUSR.$TSTUSR $TESTFILE
+
+        log " Step1: trigger quota with O_DIRECT"
+        log " Write half of file"
+        $RUNAS $DIRECTIO write $TESTFILE 0 $(($LIMIT/1024/2)) $bs_unit || error "(1) write failure, but expect success: $LIMIT"
+        log " Write out of block quota ..."
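        # $DIRECTIO is the directio helper binary from lustre/tests; the
        # calling convention assumed here is `directio write <file> <offset>
        # <count> <blocksize>` with offset and count in units of <blocksize>
        # (1 MiB above).  Writing the second half is expected to push usage
        # to the hard limit and fail with EDQUOT; O_DIRECT bypasses the page
        # cache, so the failure comes back synchronously.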
+ $RUNAS $DIRECTIO write $TESTFILE $(($LIMIT/1024/2)) $(($LIMIT/1024/2)) $bs_unit && error "(2) write success, but expect EDQUOT: $LIMIT" + log " Step1: done" + + log " Step2: rewrite should succeed" + $RUNAS $DIRECTIO write $TESTFILE $(($LIMIT/1024/2)) 1 $bs_unit || error "(3) write failure, but expect success: $LIMIT" + log " Step2: done" + + rm -f $TESTFILE + wait_delete_completed + OST0_UUID=`do_facet ost1 $LCTL dl | grep -m1 obdfilter | awk '{print $((NF-1))}'` + OST0_QUOTA_USED=`$LFS quota -o $OST0_UUID -u $TSTUSR $DIR | awk '/^.*[[:digit:]+][[:space:]+]/ { print $1 }'` + echo $OST0_QUOTA_USED + [ $OST0_QUOTA_USED -ne 0 ] && \ + ($SHOW_QUOTA_USER; error "quota deleted isn't released") + $SHOW_QUOTA_USER + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $DIR +} + +test_23() { + log "run for $((OSTCOUNT * 4))MB test file" + test_23_sub $((OSTCOUNT * 4 * 1024)) + + OST0_MIN=120000 + check_whether_skip && return 0 + log "run for $((OSTCOUNT * 40))MB test file" + test_23_sub $((OSTCOUNT * 40 * 1024)) +} +run_test_with_stat 23 "run for fixing bug16125 ===========" + +test_24() { + local TESTFILE="$DIR/$tdir/$tfile" + mkdir -p $DIR/$tdir + + run_to_block_limit $TESTFILE + $SHOW_QUOTA_USER | grep '*' || error "no matching *" + + # cleanup + rm -f $TESTFILE + $LFS setquota -u $TSTUSR -b 0 -B 0 -i 0 -I 0 $MOUNT + + set_blk_unitsz $((128 * 1024)) + set_blk_tunesz $((128 * 1024 / 2)) + +} +run_test_with_stat 24 "test if lfs draws an asterix when limit is reached (16646) ===========" # turn off quota test_99() { $LFS quotaoff $DIR + lctl set_param debug="-quota" + return 0 } -run_test 99 "Quota off ===============================" +run_test_with_stat 99 "Quota off ===============================" log "cleanup: ======================================================" cd $ORIG_PWD -post_test check_and_cleanup_lustre echo '=========================== finished ===============================' [ -f "$QUOTALOG" ] && cat $QUOTALOG && grep -q FAIL $QUOTALOG && exit 1 || true diff --git a/lustre/tests/sanity-sec.sh b/lustre/tests/sanity-sec.sh index 0bfb2f9..b65f722 100644 --- a/lustre/tests/sanity-sec.sh +++ b/lustre/tests/sanity-sec.sh @@ -64,10 +64,14 @@ fi MDT="`do_facet $SINGLEMDS "lctl get_param -N mdt.\*MDT\*/stats 2>/dev/null | cut -d"." -f2" || true`" if [ ! -z "$MDT" ]; then - do_facet $SINGLEMDS "mkdir -p $CONFDIR" + do_facet $SINGLEMDS "mkdir -p $CONFDIR" IDENTITY_FLUSH=mdt.$MDT.identity_flush MDSCAPA=mdt.$MDT.capa CAPA_TIMEOUT=mdt.$MDT.capa_timeout + MDSSECLEVEL=mdt.$MDT.sec_level + LOCALMDT=$MDT +else + LOCALMDT="" fi # for CLIENT_TYPE @@ -121,25 +125,41 @@ sec_setup # run as different user test_0() { - umask 0022 + umask 0022 - chmod 0755 $DIR || error "chmod (1)" - rm -rf $DIR/$tdir || error "rm (1)" + chmod 0755 $DIR || error "chmod (1)" + rm -rf $DIR/* || error "rm (1)" mkdir -p $DIR/$tdir || error "mkdir (1)" - chown $USER0 $DIR/$tdir || error "chown (1)" + + if [ "$CLIENT_TYPE" = "remote" ]; then + [ -z "$MDT" ] && skip "do not support do_facet operations." 
&& return + do_facet $SINGLEMDS "echo '* 0 normtown' > $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + chown $USER0 $DIR/$tdir && error "chown (1)" + do_facet $SINGLEMDS "echo '* 0 rmtown' > $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + else + chown $USER0 $DIR/$tdir || error "chown (2)" + fi + $RUNAS -u $ID0 ls $DIR || error "ls (1)" rm -f $DIR/f0 || error "rm (2)" $RUNAS -u $ID0 touch $DIR/f0 && error "touch (1)" $RUNAS -u $ID0 touch $DIR/$tdir/f1 || error "touch (2)" $RUNAS -u $ID1 touch $DIR/$tdir/f2 && error "touch (3)" touch $DIR/$tdir/f3 || error "touch (4)" - chown root $DIR/$tdir || error "chown (2)" + chown root $DIR/$tdir || error "chown (3)" chgrp $USER0 $DIR/$tdir || error "chgrp (1)" chmod 0775 $DIR/$tdir || error "chmod (2)" $RUNAS -u $ID0 touch $DIR/$tdir/f4 || error "touch (5)" $RUNAS -u $ID1 touch $DIR/$tdir/f5 && error "touch (6)" touch $DIR/$tdir/f6 || error "touch (7)" - rm -rf $DIR/$tdir || error "rm (3)" + rm -rf $DIR/* || error "rm (3)" + + if [ "$CLIENT_TYPE" = "remote" ]; then + do_facet $SINGLEMDS "rm -f $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + fi } run_test 0 "uid permission =============================" @@ -147,11 +167,11 @@ run_test 0 "uid permission =============================" test_1() { [ $GSS_SUP = 0 ] && skip "without GSS support." && return [ -z "$MDT" ] && skip "do not support do_facet operations." && return - [ "$CLIENT_TYPE" = "remote" ] && \ - skip "test_1 for local client only" && return - do_facet $SINGLEMDS "rm -f $PERM_CONF" - do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + if [ "$CLIENT_TYPE" = "remote" ]; then + do_facet $SINGLEMDS "echo '* 0 rmtown' > $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + fi rm -rf $DIR/$tdir mkdir -p $DIR/$tdir @@ -159,7 +179,7 @@ test_1() { chown $USER0 $DIR/$tdir || error "chown (1)" $RUNAS -u $ID1 -v $ID0 touch $DIR/$tdir/f0 && error "touch (2)" echo "enable uid $ID1 setuid" - do_facet $SINGLEMDS "echo '* $ID1 setuid' > $PERM_CONF" + do_facet $SINGLEMDS "echo '* $ID1 setuid' >> $PERM_CONF" do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" $RUNAS -u $ID1 -v $ID0 touch $DIR/$tdir/f1 || error "touch (3)" @@ -196,6 +216,10 @@ test_2 () { [ -z "$(which setfacl 2>/dev/null)" ] && \ skip "could not find setfacl" && return [ "$UID" != 0 ] && skip "must run as root" && return + [ -z "$MDT" ] && skip "do not support do_facet operations." && return + + do_facet $SINGLEMDS "echo '* 0 rmtacl,rmtown' > $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" sec_login root root sec_login bin bin @@ -206,17 +230,8 @@ test_2 () { umask 0022 cd $DIR - if [ ! -z "$MDT" ]; then - do_facet $SINGLEMDS "echo '* 0 rmtacl' > $PERM_CONF" - do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" - fi - - if lfs rgetfacl $DIR; then - echo "performing cp ..." - run_rmtacl_subtest cp || error "cp" - else - echo "server doesn't permit current user 'lfs r{s,g}etfacl', skip cp test." - fi + echo "performing cp ..." + run_rmtacl_subtest cp || error "cp" echo "performing getfacl-noacl..." run_rmtacl_subtest getfacl-noacl || error "getfacl-noacl" echo "performing misc..." @@ -233,13 +248,11 @@ test_2 () { run_rmtacl_subtest inheritance || error "inheritance" rm -f make-tree - if [ ! 
-z "$MDT" ]; then - do_facet $SINGLEMDS "rm -f $PERM_CONF" - do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" - fi - cd $SAVE_PWD umask $SAVE_UMASK + + do_facet $SINGLEMDS "rm -f $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" } run_test 2 "rmtacl =============================" @@ -255,22 +268,31 @@ run_test 3 "rootsquash =============================" # as for remote client, the groups of the specified uid on MDT # will be obtained by upcall /sbin/l_getidentity and used. test_4() { + if [ "$CLIENT_TYPE" = "remote" ]; then + [ -z "$MDT" ] && skip "do not support do_facet operations." && return + do_facet $SINGLEMDS "echo '* 0 rmtown' > $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + fi + rm -rf $DIR/$tdir mkdir -p $DIR/$tdir chmod 0771 $DIR/$tdir chgrp $ID0 $DIR/$tdir $RUNAS -u $ID0 ls $DIR/$tdir || error "setgroups (1)" - if [ "$CLIENT_TYPE" != "remote" ]; then + if [ "$CLIENT_TYPE" = "local" ]; then if [ ! -z "$MDT" ]; then do_facet $SINGLEMDS "echo '* $ID1 setgrp' > $PERM_CONF" do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" - $RUNAS -u $ID1 -G1,2,$ID0 ls $DIR/$tdir || error "setgroups (2)" - do_facet $SINGLEMDS "rm -f $PERM_CONF" - do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + $RUNAS -u $ID1 -G1,2,$ID0 ls $DIR/$tdir || error "setgroups (2)" fi fi $RUNAS -u $ID1 -G1,2 ls $DIR/$tdir && error "setgroups (3)" rm -rf $DIR/$tdir + + if [ ! -z "$MDT" ]; then + do_facet $SINGLEMDS "rm -f $PERM_CONF" + do_facet $SINGLEMDS "lctl set_param -n $IDENTITY_FLUSH=-1" + fi } run_test 4 "set supplementary group ===============" @@ -282,6 +304,39 @@ mds_capability_timeout() { return 0 } +mds_sec_level_switch() { + [ $# -lt 1 ] && echo "Miss mds sec level switch value" && return 1 + + case $1 in + 0) echo "Disable capa for all clients";; + 1) echo "Enable capa for remote client";; + 3) echo "Enable capa for all clients";; + *) echo "Invalid mds sec level switch value" && return 2;; + esac + + do_facet $SINGLEMDS "lctl set_param -n $MDSSECLEVEL=$1" + return 0 +} + +oss_sec_level_switch() { + [ $# -lt 1 ] && echo "Miss oss sec level switch value" && return 1 + + case $1 in + 0) echo "Disable capa for all clients";; + 1) echo "Enable capa for remote client";; + 3) echo "Enable capa for all clients";; + *) echo "Invalid oss sec level switch value" && return 2;; + esac + + for i in `seq $OSTCOUNT`; do + local j=`expr $i - 1` + local OST="`do_facet ost$i "lctl get_param -N obdfilter.\*OST\*$j/stats 2>/dev/null | cut -d"." -f2" || true`" + [ -z "$OST" ] && return 3 + do_facet ost$i "lctl set_param -n obdfilter.$OST.sec_level=$1" + done + return 0 +} + mds_capability_switch() { [ $# -lt 1 ] && echo "Miss mds capability switch value" && return 1 @@ -306,12 +361,25 @@ oss_capability_switch() { for i in `seq $OSTCOUNT`; do local j=`expr $i - 1` - local OST="`do_facet ost$i "lctl get_param -N obdfilter.\*OST\*$j/stats | cut -d"." -f2" || true`" + local OST="`do_facet ost$i "lctl get_param -N obdfilter.\*OST\*$j/stats 2>/dev/null | cut -d"." 
-f2" || true`" + [ -z "$OST" ] && return 3 do_facet ost$i "lctl set_param -n obdfilter.$OST.capa=$1" done return 0 } +turn_mds_capa_on() { + mds_capability_switch 3 || return 1 + mds_sec_level_switch 3 || return 2 + return 0 +} + +turn_oss_capa_on() { + oss_capability_switch 1 || return 1 + oss_sec_level_switch 3 || return 2 + return 0 +} + turn_capability_on() { local capa_timeout=${1:-"1800"} @@ -320,13 +388,22 @@ turn_capability_on() { # is turned on on all MDS/OSS servers before # client mount. - umount $MOUNT || return 1 + turn_mds_capa_on || return 1 + turn_oss_capa_on || return 2 + mds_capability_timeout $capa_timeout || return 3 + remount_client $MOUNT || return 4 + return 0 +} - mds_capability_switch 3 || return 2 - oss_capability_switch 1 || return 3 - mds_capability_timeout $capa_timeout || return 4 +turn_mds_capa_off() { + mds_sec_level_switch 0 || return 1 + mds_capability_switch 0 || return 2 + return 0 +} - mount_client $MOUNT || return 5 +turn_oss_capa_off() { + oss_sec_level_switch 0 || return 1 + oss_capability_switch 0 || return 2 return 0 } @@ -335,8 +412,8 @@ turn_capability_off() { # it in a live system. But, please turn off # capability of all OSS servers before MDS servers. - oss_capability_switch 0 || return 1 - mds_capability_switch 0 || return 2 + turn_oss_capa_off || return 1 + turn_mds_capa_off || return 2 return 0 } @@ -347,24 +424,29 @@ turn_capability_off() { test_5() { local file=$DIR/f5 + [ $GSS_SUP = 0 ] && skip "without GSS support." && return [ -z "$MDT" ] && skip "do not support do_facet operations." && return + [ ! -z "$LOCALMDT" ] && skip "client should be separated from server." && return + rm -f $file + turn_capability_off if [ $? != 0 ]; then error "turn_capability_off" return 1 fi - rm -f $file - # Disable proc variable - mds_capability_switch 0 + turn_oss_capa_on if [ $? != 0 ]; then - error "mds_capability_switch 0" + error "turn_oss_capa_on" return 2 fi - oss_capability_switch 1 - if [ $? != 0 ]; then - error "oss_capability_switch 1" - return 3 + + if [ "$CLIENT_TYPE" = "remote" ]; then + remount_client $MOUNT && return 3 + turn_oss_capa_off + return 0 + else + remount_client $MOUNT || return 4 fi # proc variable disabled -- access to the objects in the filesystem @@ -374,14 +456,15 @@ test_5() { $WTL $file 30 if [ $? == 0 ]; then error "Write worked well even though secrets not supplied." - return 4 + return 5 fi turn_capability_on if [ $? != 0 ]; then error "turn_capability_on" - return 4 + return 6 fi + sleep 5 # proc variable enabled, secrets supplied -- write should work now @@ -390,13 +473,13 @@ test_5() { $WTL $file 30 if [ $? != 0 ]; then error "Write failed even though secrets supplied." - return 5 + return 7 fi turn_capability_off if [ $? != 0 ]; then error "turn_capability_off" - return 7 + return 8 fi rm -f $file } @@ -409,12 +492,16 @@ run_test 5 "capa secrets =========================" test_6() { local file=$DIR/f6 + [ $GSS_SUP = 0 ] && skip "without GSS support." && return [ -z "$MDT" ] && skip "do not support do_facet operations." && return + [ ! -z "$LOCALMDT" ] && skip "client should be separated from server." && return + turn_capability_off if [ $? != 0 ]; then error "turn_capability_off" return 1 fi + rm -f $file turn_capability_on 30 @@ -422,6 +509,7 @@ test_6() { error "turn_capability_on 30" return 2 fi + # Token expiry $WTL $file 60 if [ $? != 0 ]; then @@ -435,14 +523,15 @@ test_6() { error "mds_capability_timeout 30" return 4 fi + $WTL $file 60 & local PID=$! 
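        # The 60s writer just started will outlive the 30s capability
        # timeout, so it can only complete if the client keeps renewing its
        # OSS capabilities; disabling capa on the MDS below cuts off renewal
        # and the in-flight write is expected to fail once the capa expires.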
sleep 5 # To disable automatic renew, only need turn capa off on MDS. - mds_capability_switch 0 + turn_mds_capa_off if [ $? != 0 ]; then - error "mds_capability_switch 0" + error "turn_mds_capa_off" return 5 fi diff --git a/lustre/tests/sanity.sh b/lustre/tests/sanity.sh index 4c46248..4ad3f14 100644 --- a/lustre/tests/sanity.sh +++ b/lustre/tests/sanity.sh @@ -464,6 +464,14 @@ test_17f() { } run_test 17f "symlinks: long and very long symlink name ========================" +test_17g() { + mkdir -p $DIR/$tdir + LONGSYMLINK="$(dd if=/dev/zero bs=4095 count=1 | tr '\0' 'x')" + ln -s $LONGSYMLINK $DIR/$tdir/$tfile + ls -l $DIR/$tdir +} +run_test 17g "symlinks: really long symlink name ===============================" + test_18() { touch $DIR/f ls $DIR || error @@ -958,7 +966,7 @@ exhaust_all_precreations() { test_27n() { [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_ost_nodsh && skip "remote OST with nodsh" && return + remote_ost_nodsh && skip "remote OST with nodsh" && return reset_enospc rm -f $DIR/d27/f27n @@ -973,7 +981,7 @@ run_test 27n "create file with some full OSTs ==================" test_27o() { [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_ost_nodsh && skip "remote OST with nodsh" && return + remote_ost_nodsh && skip "remote OST with nodsh" && return reset_enospc rm -f $DIR/d27/f27o @@ -990,18 +998,18 @@ run_test 27o "create file with all full OSTs (should error) ====" test_27p() { [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_ost_nodsh && skip "remote OST with nodsh" && return + remote_ost_nodsh && skip "remote OST with nodsh" && return reset_enospc rm -f $DIR/d27/f27p - $MCREATE $DIR/d27/f27p || error - $TRUNCATE $DIR/d27/f27p 80000000 || error - $CHECKSTAT -s 80000000 $DIR/d27/f27p || error + $MCREATE $DIR/d27/f27p || error "mcreate failed" + $TRUNCATE $DIR/d27/f27p 80000000 || error "truncate failed" + $CHECKSTAT -s 80000000 $DIR/d27/f27p || error "checkstat failed" exhaust_precreations 0 0x80000215 - echo foo >> $DIR/d27/f27p || error - $CHECKSTAT -s 80000004 $DIR/d27/f27p || error + echo foo >> $DIR/d27/f27p || error "append failed" + $CHECKSTAT -s 80000004 $DIR/d27/f27p || error "checkstat failed" reset_enospc } @@ -1010,7 +1018,7 @@ run_test 27p "append to a truncated file with some full OSTs ===" test_27q() { [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_ost_nodsh && skip "remote OST with nodsh" && return + remote_ost_nodsh && skip "remote OST with nodsh" && return reset_enospc rm -f $DIR/d27/f27q @@ -1031,7 +1039,7 @@ run_test 27q "append to truncated file with all OSTs full (should error) ===" test_27r() { [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_ost_nodsh && skip "remote OST with nodsh" && return + remote_ost_nodsh && skip "remote OST with nodsh" && return reset_enospc rm -f $DIR/d27/f27r @@ -1083,7 +1091,7 @@ run_test 27u "skip object creation on OSC w/o objects ==========" test_27v() { # bug 4900 [ "$OSTCOUNT" -lt "2" ] && skip "too few OSTs" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return - remote_ost_nodsh && skip "remote OST with nodsh" && return + remote_ost_nodsh && skip "remote OST with nodsh" && return exhaust_all_precreations @@ -2213,7 +2221,7 @@ 
test_51b() { run_test 51b "mkdir .../t-0 --- .../t-$NUMTEST ====================" test_51bb() { - [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return + [ $MDSCOUNT -lt 2 ] && skip "needs >= 2 MDTs" && return local ndirs=${TEST51BB_NDIRS:-10} local nfiles=${TEST51BB_NFILES:-100} @@ -2235,7 +2243,7 @@ test_51bb() { declare -a dirs for ((i=0; i < $ndirs; i++)); do dirs[i]=$dir/$RANDOM - echo Creating directory ${dirs[i]} + echo Creating directory ${dirs[i]} mkdir -p ${dirs[i]} ls $dir echo Creating $nfiles in dir ${dirs[i]} ... @@ -2982,6 +2990,21 @@ test_65i() { # bug6367 } run_test 65i "set non-default striping on root directory (bug 6367)=" +test_65ia() { # bug12836 + $LFS getstripe $MOUNT || error "getstripe $MOUNT failed" +} +run_test 65ia "getstripe on -1 default directory striping" + +test_65ib() { # bug12836 + $LFS getstripe -v $MOUNT || error "getstripe -v $MOUNT failed" +} +run_test 65ib "getstripe -v on -1 default directory striping" + +test_65ic() { # bug12836 + $LFS find -mtime -1 $MOUNT || error "find $MOUNT failed" +} +run_test 65ic "new find on -1 default directory striping" + test_65j() { # bug6367 sync; sleep 1 # if we aren't already remounting for each test, do so for this test @@ -3155,6 +3178,7 @@ test_72() { # bug 5695 - Test that on 2.6 remove_suid works properly cancel_lru_locks mdc test -u $DIR/f72 -o -g $DIR/f72 && error "S/gid is not dropped on MDS" true + rm -f $DIR/f72 } run_test 72 "Test that remove suid works properly (bug5695) ====" @@ -3197,6 +3221,7 @@ test_74a() { # bug 6149, 6184 ls $DIR/f74a lctl set_param fail_loc=0 true + rm -f $DIR/f74a } run_test 74a "ldlm_enqueue freed-export error path, ls (shouldn't LBUG)" @@ -3210,6 +3235,7 @@ test_74b() { # bug 13310 touch $DIR/f74b lctl set_param fail_loc=0 true + rm -f $DIR/f74b } run_test 74b "ldlm_enqueue freed-export error path, touch (shouldn't LBUG)" @@ -3230,9 +3256,6 @@ test75_prep() { } test_75a() { -# skipped temporarily: we do not have join file currently -# please remove this when ready - huanghua - return test75_prep cp -p ${F128k} ${FHEAD} @@ -3250,9 +3273,6 @@ test_75a() { run_test 75a "TEST join file ====================================" test_75b() { -# skipped temporarily: we do not have join file currently -# please remove this when ready - huanghua - return test75_prep cp -p ${F128k} ${FTAIL} @@ -3266,9 +3286,6 @@ test_75b() { run_test 75b "TEST join file 2 ==================================" test_75c() { -# skipped temporarily: we do not have join file currently -# please remove this when ready - huanghua - return test75_prep cp -p ${F128k} ${FTAIL} @@ -3282,9 +3299,6 @@ test_75c() { run_test 75c "TEST join file 3 ==================================" test_75d() { -# skipped temporarily: we do not have join file currently -# please remove this when ready - huanghua - return test75_prep cp -p ${F128k} ${FHEAD} @@ -3299,9 +3313,6 @@ test_75d() { run_test 75d "TEST join file 4 ==================================" test_75e() { -# skipped temporarily: we do not have join file currently -# please remove this when ready - huanghua - return test75_prep rm -rf ${FHEAD} || "delete join file error" @@ -3309,9 +3320,6 @@ test_75e() { run_test 75e "TEST join file 5 (remove joined file) =============" test_75f() { -# skipped temporarily: we do not have join file currently -# please remove this when ready - huanghua - return test75_prep cp -p ${F128k} ${F75}_join_10_compare @@ -3329,9 +3337,6 @@ test_75f() { run_test 75f "TEST join file 6 (join 10 files) ==================" test_75g() { -# skipped 
temporarily: we do not have join file currently -# please remove this when ready - huanghua - return [ ! -f ${F75}_join_10 ] && echo "${F75}_join_10 missing" && return $LFS getstripe ${F75}_join_10 @@ -3402,6 +3407,7 @@ test_77a() { # bug 10889 set_checksums 1 dd if=$F77_TMP of=$DIR/$tfile bs=1M count=$F77SZ || error "dd error" set_checksums 0 + rm -f $DIR/$tfile } run_test 77a "normal checksum read/write operation =============" @@ -3414,6 +3420,7 @@ test_77b() { # bug 10889 error "dd error: $?" lctl set_param fail_loc=0 set_checksums 0 + rm -f $DIR/f77b } run_test 77b "checksum error on client write ====================" @@ -3544,7 +3551,12 @@ test_78() { # bug 10901 echo "MemTotal: $MEMTOTAL" # reserve 256MB of memory for the kernel and other running processes, # and then take 1/2 of the remaining memory for the read/write buffers. - MEMTOTAL=$(((MEMTOTAL - 256 ) / 2)) + if [ $MEMTOTAL -gt 512 ] ;then + MEMTOTAL=$(((MEMTOTAL - 256 ) / 2)) + else + # for those poor memory-starved high-end clusters... + MEMTOTAL=$((MEMTOTAL / 2)) + fi echo "Mem to use for directio: $MEMTOTAL" [ $F78SIZE -gt $MEMTOTAL ] && F78SIZE=$MEMTOTAL [ $F78SIZE -gt 512 ] && F78SIZE=512 @@ -3554,11 +3566,12 @@ test_78() { # bug 10901 [ $SMALLESTOST -lt 10240 ] && \ skip "too small OSTSIZE, useless to run large O_DIRECT test" && return 0 - [ $F78SIZE -gt $((SMALLESTOST * $OSTCOUNT / 1024 - 5)) ] && \ - F78SIZE=$((SMALLESTOST * $OSTCOUNT / 1024 - 5)) + [ $F78SIZE -gt $((SMALLESTOST * $OSTCOUNT / 1024 - 80)) ] && \ + F78SIZE=$((SMALLESTOST * $OSTCOUNT / 1024 - 80)) + [ "$SLOW" = "no" ] && NSEQ=1 && [ $F78SIZE -gt 32 ] && F78SIZE=32 echo "File size: $F78SIZE" - $SETSTRIPE $DIR/$tfile -c -1 || error "setstripe failed" + $SETSTRIPE $DIR/$tfile -c $OSTCOUNT || error "setstripe failed" for i in `seq 1 $NSEQ` do FSIZE=$(($F78SIZE / ($NSEQ - $i + 1))) @@ -3604,13 +3617,14 @@ test_80() { # bug 10718 dd if=/dev/zero of=$DIR/$tfile bs=1M count=1 seek=1M sync; sleep 1; sync BEFORE=`date +%s` - cancel_lru_locks OSC + cancel_lru_locks osc AFTER=`date +%s` DIFF=$((AFTER-BEFORE)) if [ $DIFF -gt 1 ] ; then error "elapsed for 1M@1T = $DIFF" fi true + rm -f $DIR/$tfile } run_test 80 "Page eviction is equally fast at high offsets too ====" @@ -3677,17 +3691,22 @@ test_99f() { [ ! 
-d $DIR/d99cvsroot ] && test_99d cd $DIR/d99reposname $RUNAS cvs commit -m 'nomsg' foo99 + rm -fr $DIR/d99cvsroot } run_test 99f "cvs commit =======================================" test_100() { + [ "$NETTYPE" = tcp ] || \ + { skip "TCP secure port test, not useful for NETTYPE=$NETTYPE" && \ + return ; } + remote_ost_nodsh && skip "remote OST with nodsh" && return remote_mds_nodsh && skip "remote MDS with nodsh" && return remote_servers || \ { skip "useless for local single node setup" && return; } netstat -tna | ( rc=1; while read PROT SND RCV LOCAL REMOTE STAT; do - [ "$PROT" != "$NETTYPE" ] && continue + [ "$PROT" != "tcp" ] && continue RPORT=$(echo $REMOTE | cut -d: -f2) [ "$RPORT" != "$ACCEPTOR_PORT" ] && continue @@ -3699,7 +3718,7 @@ test_100() { error "local: $LPORT > 1024, remote: $RPORT" fi done - [ "$rc" = 0 ] || error "privileged port not found" ) + [ "$rc" = 0 ] || error "privileged port not found" ) } run_test 100 "check local port using privileged port ===========" @@ -3779,6 +3798,7 @@ cleanup_test101() { [ "$SETUP_TEST101" = "yes" ] || return trap 0 rm -rf $DIR/$tdir + rm -f $DIR/$tfile SETUP_TEST101=no } @@ -3940,6 +3960,7 @@ test_102b() { local stripe_count=`grep "count" $tmp_file| awk '{print $2}'` [ "$stripe_size" -eq 65536 ] || error "stripe size $stripe_size != 65536" [ "$stripe_count" -eq 2 ] || error "stripe count $stripe_count != 2" + rm -f $DIR/$tfile } run_test 102b "getfattr/setfattr for trusted.lov EAs ============" @@ -3969,17 +3990,6 @@ test_102c() { } run_test 102c "non-root getfattr/setfattr for lustre.lov EAs ===========" -get_stripe_info() { - stripe_size=0 - stripe_count=0 - stripe_offset=0 - local lines=`sed -n '/obdidx/=' $1` - stripe_size=`awk '{if($1~/size/) print $2}' $1` - stripe_count=`awk '{if($1~/count/) print $2}' $1` - lines=`expr $lines + 1` - stripe_offset=`sed -n ${lines}p $1 |awk '{print $1}'` -} - compare_stripe_info1() { for num in 1 2 3 4 do @@ -3989,22 +3999,16 @@ compare_stripe_info1() { do local size=`expr $STRIPE_SIZE \* $num` local file=file"$num-$offset-$count" - local tmp_file=out - $GETSTRIPE -v $file > $tmp_file - get_stripe_info $tmp_file - if test $stripe_size -ne $size - then + get_stripe_info client $file + if [ $stripe_size -ne $size ]; then error "$file: different stripe size" && return fi - if test $stripe_count -ne $count - then + if [ $stripe_count -ne $count ]; then error "$file: different stripe count" && return fi - if test $stripe_offset -ne 0 - then + if [ $stripe_index -ne 0 ]; then error "$file: different stripe offset" && return fi - rm -f $tmp_file done done done @@ -4019,22 +4023,16 @@ compare_stripe_info2() { do local size=`expr $STRIPE_SIZE \* $num` local file=file"$num-$offset-$count" - local tmp_file=out - $GETSTRIPE -v $file > $tmp_file - get_stripe_info $tmp_file - if test $stripe_size -ne $size - then + get_stripe_info client $file + if [ $stripe_size -ne $size ]; then error "$file: different stripe size" && return fi - if test $stripe_count -ne $count - then + if [ $stripe_count -ne $count ]; then error "$file: different stripe count" && return fi - if test $stripe_offset -ne $offset - then + if [ $stripe_index -ne $offset ]; then error "$file: different stripe offset" && return fi - rm -f $tmp_file done done done @@ -4140,6 +4138,7 @@ test_102h() { # bug 15777 error "$XBIG different after growing $XSML" fi log "$XBIG still valid after growing $XSML" + rm -f $file } run_test 102h "grow xattr from inside inode to external block" @@ -4217,6 +4216,7 @@ test_104() { lfs df || error "lfs df with 
deactivated OSC failed" lctl --device %$OSC recover lfs df || error "lfs df with reactivated OSC failed" + rm -f $DIR/$tfile } run_test 104 "lfs df [-ih] [path] test =========================" @@ -4229,6 +4229,7 @@ test_105a() { else flocks_test 1 off -f $DIR/$tfile || error "fail flock off" fi + rm -f $DIR/$tfile } run_test 105a "flock when mounted without -o flock test ========" @@ -4240,6 +4241,7 @@ test_105b() { else flocks_test 1 off -c $DIR/$tfile || error "fail flock off" fi + rm -f $DIR/$tfile } run_test 105b "fcntl when mounted without -o flock test ========" @@ -4251,6 +4253,7 @@ test_105c() { else flocks_test 1 off -l $DIR/$tfile || error "fail flock off" fi + rm -f $DIR/$tfile } run_test 105c "lockf when mounted without -o flock test ========" @@ -4312,6 +4315,7 @@ test_110() { touch $DIR/d110/yyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyyy && error ""create with 256 char should fail, but not ls -l $DIR/d110 + rm -fr $DIR/d110 } run_test 110 "filename length checking" @@ -4454,6 +4458,7 @@ test_117() # bug 10891 > $DIR/$tfile || error "truncate failed" lctl set_param fail_loc=0 echo "Truncate succeeded." + rm -f $DIR/$tfile } run_test 117 "verify fsfilt_extend ==========" @@ -4490,6 +4495,7 @@ test_118a() #bug 11710 error "Dirty pages not flushed to disk, dirty=$DIRTY, writeback=$WRITEBACK" return 1; fi + rm -f $DIR/$tfile } run_test 118a "verify O_SYNC works ==========" @@ -5076,8 +5082,10 @@ test_123a() { # was test 123, statahead(bug 11401) SLOWOK=1 fi - remount_client $MOUNT mkdir -p $DIR/$tdir + rm -rf $DIR/$tdir/* + cancel_lru_locks mdc + cancel_lru_locks osc error=0 NUMFREE=`df -i -P $DIR | tail -n 1 | awk '{ print $4 }'` [ $NUMFREE -gt 100000 ] && NUMFREE=100000 || NUMFREE=$((NUMFREE-1000)) @@ -5155,10 +5163,9 @@ run_test 123b "not panic with network error in statahead enqueue (bug 15027)" test_124a() { [ -z "`lctl get_param -n mdc.*.connect_flags | grep lru_resize`" ] && \ skip "no lru resize on server" && return 0 - NR=2000 + local NR=2000 mkdir -p $DIR/$tdir || error "failed to create $DIR/$tdir" - # use touch to produce $NR new locks log "create $NR files at $DIR/$tdir" createmany -o $DIR/$tdir/f $NR || error "failed to create $NR files in $DIR/$tdir" @@ -5166,14 +5173,14 @@ test_124a() { cancel_lru_locks mdc ls -l $DIR/$tdir > /dev/null - NSDIR="" - LRU_SIZE=0 + local NSDIR="" + local LRU_SIZE=0 for VALUE in `lctl get_param ldlm.namespaces.*mdc-*.lru_size`; do - PARAM=`echo ${VALUE[0]} | cut -d "=" -f1` + local PARAM=`echo ${VALUE[0]} | cut -d "=" -f1` LRU_SIZE=$(lctl get_param -n $PARAM) if [ $LRU_SIZE -gt $(default_lru_size) ]; then NSDIR=$(echo $PARAM | cut -d "." -f1-3) - log "using $(basename $NSDIR) namespace" + log "NS=$(basename $NSDIR)" break fi done @@ -5182,40 +5189,53 @@ test_124a() { skip "Not enough cached locks created!" return 0 fi - log "created $LRU_SIZE lock(s)" - - # we want to sleep 30s to not make test too long - SLEEP=30 - SLEEP_ADD=2 - - # we know that lru resize allows one client to hold $LIMIT locks for 10h - MAX_HRS=10 - - # get the pool limit - LIMIT=`lctl get_param -n $NSDIR.pool.limit` - - # calculate lock volume factor taking into account data set size and the - # rule that number of locks will be getting smaller durring sleep interval - # and we need to additionally enforce LVF to take this into account. 
-        # Use $LRU_SIZE_B here to take into account real number of locks created
-        # in the case of CMD, LRU_SIZE_B != $NR in most of cases
-        LVF=$(($MAX_HRS * 60 * 60 * $LIMIT / $SLEEP))
-        LRU_SIZE_B=$LRU_SIZE
-        log "make client drop locks $LVF times faster so that ${SLEEP}s is enough to cancel $LRU_SIZE lock(s)"
-        OLD_LVF=`lctl get_param -n $NSDIR.pool.lock_volume_factor`
+        log "LRU=$LRU_SIZE"
+
+        local SLEEP=30
+
+        # We know that lru resize allows one client to hold $LIMIT locks
+        # for 10h. After that, locks begin to be killed by the client.
+        local MAX_HRS=10
+        local LIMIT=`lctl get_param -n $NSDIR.pool.limit`
+
+        # Make LVF high enough that sleeping for $SLEEP is enough to _start_
+        # killing locks. Some time was spent creating the locks. This means
+        # that by the time the sleep finishes we must have killed some of
+        # them (10-100 locks). This depends on how fast they were created.
+        # Many of them were touched in almost the same moment and thus will
+        # be killed in groups.
+        local LVF=$(($MAX_HRS * 60 * 60 / $SLEEP))
+
+        # Use $LRU_SIZE_B here to take into account the real number of locks
+        # created; in the case of CMD, LRU_SIZE_B != $NR in most cases
+        local LRU_SIZE_B=$LRU_SIZE
+        log "LVF=$LVF"
+        local OLD_LVF=`lctl get_param -n $NSDIR.pool.lock_volume_factor`
         lctl set_param -n $NSDIR.pool.lock_volume_factor $LVF
-        log "sleep for $((SLEEP+SLEEP_ADD))s"
-        sleep $((SLEEP+SLEEP_ADD))
+
+        # Let's make sure that we really have some margin. The client checks
+        # cached locks every 10 sec.
+        SLEEP=$((SLEEP+20))
+        log "Sleep ${SLEEP} sec"
+        local SEC=0
+        while ((SEC<$SLEEP)); do
+                echo -n "..."
+                sleep 5
+                SEC=$((SEC+5))
+                LRU_SIZE=`lctl get_param -n $NSDIR/lru_size`
+                echo -n "$LRU_SIZE"
+        done
+        echo ""
         lctl set_param -n $NSDIR.pool.lock_volume_factor $OLD_LVF
-        LRU_SIZE_A=`lctl get_param -n $NSDIR.lru_size`
+        local LRU_SIZE_A=`lctl get_param -n $NSDIR/lru_size`
         [ $LRU_SIZE_B -gt $LRU_SIZE_A ] || {
-                error "No locks dropped in "$((SLEEP+SLEEP_ADD))"s. LRU size: $LRU_SIZE_A"
+                error "No locks dropped in ${SLEEP}s.
LRU size: $LRU_SIZE_A" unlinkmany $DIR/$tdir/f $NR return } - log "Dropped "$((LRU_SIZE_B-LRU_SIZE_A))" locks in "$((SLEEP+SLEEP_ADD))"s" + log "Dropped "$((LRU_SIZE_B-LRU_SIZE_A))" locks in ${SLEEP}s" log "unlink $NR files at $DIR/$tdir" unlinkmany $DIR/$tdir/f $NR } @@ -5315,6 +5335,8 @@ run_test 125 "don't return EPROTO when a dir has a non-default striping and ACLs test_126() { # bug 12829/13455 [ -z "$(lctl get_param -n llite.*.client_type | grep local)" ] && skip "must run as local client" && return [ "$UID" != 0 ] && echo "skipping $TESTNAME (must run as root)" && return + $GSS && skip "must run as gss disabled" && return + $RUNAS -u 0 -g 1 touch $DIR/$tfile || error "touch failed" gid=`ls -n $DIR/$tfile | awk '{print $4}'` rm -f $DIR/$tfile @@ -5638,6 +5660,49 @@ test_130e() { } run_test 130e "FIEMAP (test continuation FIEMAP calls)" +# Test for writev/readv +test_131a() { + rwv -f $DIR/$tfile -w -n 3 524288 1048576 1572864 || \ + error "writev test failed" + rwv -f $DIR/$tfile -r -v -n 2 1572864 1048576 || \ + error "readv failed" + rm -f $DIR/$tfile +} +run_test 131a "test iov's crossing stripe boundary for writev/readv" + +test_131b() { + rwv -f $DIR/$tfile -w -a -n 3 524288 1048576 1572864 || \ + error "append writev test failed" + rwv -f $DIR/$tfile -w -a -n 2 1572864 1048576 || \ + error "append writev test failed" + rm -f $DIR/$tfile +} +run_test 131b "test append writev" + +test_131c() { + rwv -f $DIR/$tfile -w -d -n 1 1048576 || return 0 + error "NOT PASS" +} +run_test 131c "test read/write on file w/o objects" + +test_131d() { + rwv -f $DIR/$tfile -w -n 1 1572864 + NOB=`rwv -f $DIR/$tfile -r -n 3 524288 524288 1048576 | awk '/error/ {print $6}'` + if [ "$NOB" != 1572864 ]; then + error "Short read filed: read $NOB bytes instead of 1572864" + fi + rm -f $DIR/$tfile +} +run_test 131d "test short read" + +test_131e() { + rwv -f $DIR/$tfile -w -s 1048576 -n 1 1048576 + rwv -f $DIR/$tfile -r -z -s 0 -n 1 524288 || \ + error "read hitting hole failed" + rm -f $DIR/$tfile +} +run_test 131e "test read hitting hole" + test_140() { #bug-17379 mkdir -p $DIR/$tdir || error "Creating dir $DIR/$tdir" cd $DIR/$tdir || error "Changing to $DIR/$tdir" @@ -5668,6 +5733,19 @@ test_140() { #bug-17379 } run_test 140 "Check reasonable stack depth (shouldn't LBUG) ====" +test_141() { + local ls + #define OBD_FAIL_MGC_PAUSE_PROCESS_LOG 0x903 + $LCTL set_param fail_loc=0x903 + # cancel_lru_locks mgc - does not work due to lctl set_param syntax + for ls in /proc/fs/lustre/ldlm/namespaces/MGC*/lru_size; do + echo "clear" > $ls + done + FAIL_ON_ERROR=true cleanup + FAIL_ON_ERROR=true setup +} +run_test 141 "umount should not race with any mgc requeue thread" + test_150() { local TF="$TMP/$tfile" @@ -5729,7 +5807,7 @@ test_151() { $LCTL set_param -n obdfilter.*.writethrough_cache_enable 1 - # pages should be in the case right after write + # pages should be in the case right after write dd if=/dev/urandom of=$DIR/$tfile bs=4k count=$CPAGES || error "dd failed" BEFORE=`roc_hit` cancel_lru_locks osc @@ -5749,7 +5827,7 @@ test_151() { cancel_lru_locks osc cat $DIR/$tfile >/dev/null AFTER=`roc_hit` - if ! 
let "AFTER - BEFORE == CPAGES"; then + if let "AFTER - BEFORE != 0"; then error "IN CACHE: before: $BEFORE, after: $AFTER" fi @@ -5907,6 +5985,15 @@ test_200i() { } run_test 200i "Remove a pool ============================================" +test_212() { + size=`date +%s` + size=$((size % 8192 + 1)) + dd if=/dev/urandom of=$DIR/f212 bs=1k count=$size + sendfile $DIR/f212 $DIR/f212.xyz || error "sendfile wrong" + rm -f $DIR/f212 $DIR/f212.xyz +} +run_test 212 "Sendfile test ============================================" + TMPDIR=$OLDTMPDIR TMP=$OLDTMP HOME=$OLDHOME diff --git a/lustre/tests/sanityN.sh b/lustre/tests/sanityN.sh index 3341f43..b20ff18 100644 --- a/lustre/tests/sanityN.sh +++ b/lustre/tests/sanityN.sh @@ -46,7 +46,7 @@ SETUP=${SETUP:-:} init_test_env $@ . ${CONFIG:=$LUSTRE/tests/cfg/$NAME.sh} -[ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16" +[ "$SLOW" = "no" ] && EXCEPT_SLOW="12 16 33a" SANITYLOG=${TESTSUITELOG:-$TMP/$(basename $0 .sh).log} FAIL_ON_ERROR=false @@ -598,7 +598,7 @@ test_30() { #bug #11110 run_test 30 "recreate file race =========" -test_31() { +test_31a() { mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir" writes=`LANG=C dd if=/dev/zero of=$DIR/$tdir/$tfile count=1 2>&1 | awk 'BEGIN { FS="+" } /out/ {print $1}'` @@ -608,39 +608,56 @@ test_31() { awk 'BEGIN { FS="+" } /in/ {print $1}'` [ $reads -eq $writes ] || error "read" $reads "blocks, must be" $writes } -run_test 31 "voluntary cancel / blocking ast race==============" +run_test 31a "voluntary cancel / blocking ast race==============" + +test_31b() { + remote_ost || { skip "local OST" && return 0; } + remote_ost_nodsh && skip "remote OST w/o dsh" && return 0 + mkdir -p $DIR1/$tdir || error "Creating dir $DIR1/$tdir" + lfs setstripe $DIR/$tdir/$tfile -i 0 -c 1 + cp /etc/hosts $DIR/$tdir/$tfile + #define OBD_FAIL_LDLM_CANCEL_BL_CB_RACE 0x314 + lctl set_param fail_loc=0x314 + #define OBD_FAIL_LDLM_OST_FAIL_RACE 0x316 + do_facet ost1 lctl set_param fail_loc=0x316 + # Don't crash kernel + cat $DIR2/$tdir/$tfile > /dev/null 2>&1 + lctl set_param fail_loc=0 + do_facet ost1 lctl set_param fail_loc=0 +} +run_test 31b "voluntary OST cancel / blocking ast race==============" # enable/disable lockless truncate feature, depending on the arg 0/1 enable_lockless_truncate() { - lctl set_param -n llite.*.lockless_truncate $1 + lctl set_param -n osc.*.lockless_truncate $1 } test_32a() { # bug 11270 local p="$TMP/sanityN-$TESTNAME.parameters" - save_lustre_params $HOSTNAME llite.*.lockless_truncate > $p + save_lustre_params $HOSTNAME osc.*.lockless_truncate > $p cancel_lru_locks osc - clear_llite_stats + clear_osc_stats enable_lockless_truncate 1 dd if=/dev/zero of=$DIR1/$tfile count=10 bs=1M > /dev/null 2>&1 log "checking cached lockless truncate" $TRUNCATE $DIR1/$tfile 8000000 $CHECKSTAT -s 8000000 $DIR2/$tfile || error "wrong file size" - [ $(calc_llite_stats lockless_truncate) -eq 0 ] || + [ $(calc_osc_stats lockless_truncate) -eq 0 ] || error "lockless truncate doesn't use cached locks" log "checking not cached lockless truncate" $TRUNCATE $DIR2/$tfile 5000000 $CHECKSTAT -s 5000000 $DIR1/$tfile || error "wrong file size" - [ $(calc_llite_stats lockless_truncate) -ne 0 ] || + [ $(calc_osc_stats lockless_truncate) -ne 0 ] || error "not cached trancate isn't lockless" log "disabled lockless truncate" enable_lockless_truncate 0 - clear_llite_stats + clear_osc_stats $TRUNCATE $DIR2/$tfile 3000000 $CHECKSTAT -s 3000000 $DIR1/$tfile || error "wrong file size" - [ $(calc_llite_stats lockless_truncate) -eq 0 ] || + [ 
$(calc_osc_stats lockless_truncate) -eq 0 ] || error "lockless truncate disabling failed" rm $DIR1/$tfile # restore lockless_truncate default values @@ -654,36 +671,36 @@ test_32b() { # bug 11270 local node local p="$TMP/sanityN-$TESTNAME.parameters" - save_lustre_params $HOSTNAME "llite.*.contention_seconds" > $p + save_lustre_params $HOSTNAME "osc.*.contention_seconds" > $p for node in $(osts_nodes); do save_lustre_params $node "ldlm.namespaces.filter-*.max_nolock_bytes" >> $p save_lustre_params $node "ldlm.namespaces.filter-*.contended_locks" >> $p save_lustre_params $node "ldlm.namespaces.filter-*.contention_seconds" >> $p done - clear_llite_stats + clear_osc_stats # agressive lockless i/o settings for node in $(osts_nodes); do do_node $node 'lctl set_param -n ldlm.namespaces.filter-*.max_nolock_bytes 2000000; lctl set_param -n ldlm.namespaces.filter-*.contended_locks 0; lctl set_param -n ldlm.namespaces.filter-*.contention_seconds 60' done - lctl set_param -n llite.*.contention_seconds 60 + lctl set_param -n osc.*.contention_seconds 60 for i in $(seq 5); do dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 done - [ $(calc_llite_stats lockless_write_bytes) -ne 0 ] || error "lockless i/o was not triggered" + [ $(calc_osc_stats lockless_write_bytes) -ne 0 ] || error "lockless i/o was not triggered" # disable lockless i/o (it is disabled by default) for node in $(osts_nodes); do do_node $node 'lctl set_param -n ldlm.namespaces.filter-*.max_nolock_bytes 0; lctl set_param -n ldlm.namespaces.filter-*.contended_locks 32; lctl set_param -n ldlm.namespaces.filter-*.contention_seconds 0' done # set contention_seconds to 0 at client too, otherwise Lustre still # remembers lock contention - lctl set_param -n llite.*.contention_seconds 0 - clear_llite_stats - for i in $(seq 5); do + lctl set_param -n osc.*.contention_seconds 0 + clear_osc_stats + for i in $(seq 1); do dd if=/dev/zero of=$DIR1/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 dd if=/dev/zero of=$DIR2/$tfile bs=4k count=1 conv=notrunc > /dev/null 2>&1 done - [ $(calc_llite_stats lockless_write_bytes) -eq 0 ] || + [ $(calc_osc_stats lockless_write_bytes) -eq 0 ] || error "lockless i/o works when disabled" rm -f $DIR1/$tfile restore_lustre_params <$p @@ -691,6 +708,127 @@ test_32b() { # bug 11270 } run_test 32b "lockless i/o" +print_jbd_stat () { + local dev=$(basename $(do_facet $SINGLEMDS lctl get_param -n osd.*MDT*.mntdev)) + do_facet $SINGLEMDS cat /proc/fs/jbd/$dev/info | head -1 +} + +do_and_time () { + local cmd=$1 + + local start_ts=`date +%s` + + $cmd + + current_ts=`date +%s` + ELAPSED=`expr $current_ts - $start_ts` +} + +# commit on sharing tests +test_33a() { + remote_mds_nodsh && skip "remote MDS with nodsh" && return + + [ -n "$CLIENTS" ] || { skip "Need two or more clients" && return 0; } + [ $CLIENTCOUNT -ge 2 ] || \ + { skip "Need two or more clients, have $CLIENTCOUNT" && return 0; } + + zconf_mount_clients $CLIENT1,$CLIENT2 $DIR1 + zconf_mount_clients $CLIENT1,$CLIENT2 $DIR2 + + local nfiles=${TEST33_NFILES:-10000} + local param_file=$TMP/$tfile-params + + save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file + + local COS + local jbdold + local jbdnew + local jbd + + for COS in 0 1; do + do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=$COS + avgjbd=0 + avgtime=0 + for i in 1 2 3; do + + do_nodes $CLIENT1,$CLIENT2 "mkdir -p $DIR1/$tdir-\\\$(hostname)-$i" + + 
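                        # Measurement pattern: sample the JBD transaction
                        # counter before and after each cross-client
                        # createmany run, average over the three runs, then
                        # repeat with commit-on-sharing disabled (COS=0) and
                        # enabled (COS=1); the summary at the end reports the
                        # COS overhead in journal transactions and wall time.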
+print_jbd_stat () {
+    local dev=$(basename $(do_facet $SINGLEMDS lctl get_param -n osd.*MDT*.mntdev))
+    do_facet $SINGLEMDS cat /proc/fs/jbd/$dev/info | head -1
+}
+
+do_and_time () {
+    local cmd=$1
+
+    local start_ts=`date +%s`
+
+    $cmd
+
+    current_ts=`date +%s`
+    ELAPSED=`expr $current_ts - $start_ts`
+}
+
+# commit on sharing tests
+test_33a() {
+    remote_mds_nodsh && skip "remote MDS with nodsh" && return
+
+    [ -n "$CLIENTS" ] || { skip "Need two or more clients" && return 0; }
+    [ $CLIENTCOUNT -ge 2 ] || \
+        { skip "Need two or more clients, have $CLIENTCOUNT" && return 0; }
+
+    zconf_mount_clients $CLIENT1,$CLIENT2 $DIR1
+    zconf_mount_clients $CLIENT1,$CLIENT2 $DIR2
+
+    local nfiles=${TEST33_NFILES:-10000}
+    local param_file=$TMP/$tfile-params
+
+    save_lustre_params $(facet_active_host $SINGLEMDS) "mdt.*.commit_on_sharing" > $param_file
+
+    local COS
+    local jbdold
+    local jbdnew
+    local jbd
+
+    for COS in 0 1; do
+        do_facet $SINGLEMDS lctl set_param mdt.*.commit_on_sharing=$COS
+        avgjbd=0
+        avgtime=0
+        for i in 1 2 3; do
+
+            do_nodes $CLIENT1,$CLIENT2 "mkdir -p $DIR1/$tdir-\\\$(hostname)-$i"
+
+            jbdold=$(print_jbd_stat)
+            echo "=== START createmany $jbdold"
+            do_and_time "do_nodes $CLIENT1,$CLIENT2 createmany -o $DIR1/$tdir-\\\$(hostname)-$i/f- -r $DIR2/$tdir-\\\$(hostname)-$i/f- $nfiles"
+            jbdnew=$(print_jbd_stat)
+            jbd=$((`echo $jbdnew | cut -d" " -f1` - `echo $jbdold | cut -d" " -f1`))
+            echo "=== END createmany $jbdnew : $jbd transactions nfiles $nfiles time $ELAPSED COS=$COS"
+            avgjbd=$(( avgjbd + jbd ))
+            avgtime=$(( avgtime + ELAPSED ))
+        done
+        eval cos${COS}_jbd=$((avgjbd / 3))
+        eval cos${COS}_time=$((avgtime / 3))
+    done
+
+    echo "COS=0 transactions (avg): $cos0_jbd time (avg): $cos0_time"
+    echo "COS=1 transactions (avg): $cos1_jbd time (avg): $cos1_time"
+    [ "$cos0_jbd" != 0 ] && echo "COS=1 vs COS=0 jbd: $((((cos1_jbd/cos0_jbd - 1)) * 100 )) %"
+    [ "$cos0_time" != 0 ] && echo "COS=1 vs COS=0 time: $((((cos1_time/cos0_time - 1)) * 100 )) %"
+
+    restore_lustre_params < $param_file
+    rm -f $param_file
+    return 0
+}
+run_test 33a "commit on sharing, cross create/delete, 2 clients, benchmark"
+
+# End commit on sharing tests
+
+test_34() { #16129
+    for OPER in notimeout timeout ; do
+        rm $DIR1/$tfile 2>/dev/null
+        lock_in=0;
+        for f in `lctl get_param -n ldlm/namespaces/*/lock_timeouts`; do
+            lock_in=$(($lock_in + $f))
+        done
+        if [ $OPER == "timeout" ] ; then
+            for j in `seq $OSTCOUNT`; do
+                #define OBD_FAIL_PTLRPC_HPREQ_TIMEOUT 0x511
+                do_facet ost$j lctl set_param fail_loc=0x511
+            done
+            echo lock should expire
+        else
+            for j in `seq $OSTCOUNT`; do
+                #define OBD_FAIL_PTLRPC_HPREQ_NOTIMEOUT 0x512
+                do_facet ost$j lctl set_param fail_loc=0x512
+            done
+            echo lock should not expire
+        fi
+        echo writing on client1
+        dd if=/dev/zero of=$DIR1/$tfile count=100 conv=notrunc > /dev/null 2>&1
+        sync &
+        # wait for the flush
+        sleep 1
+        echo reading on client2
+        dd of=/dev/null if=$DIR2/$tfile > /dev/null 2>&1
+        # wait for a lock timeout
+        sleep 4
+        lock_out=0
+        for f in `lctl get_param -n ldlm/namespaces/*/lock_timeouts`; do
+            lock_out=$(($lock_out + $f))
+        done
+        if [ $OPER == "timeout" ] ; then
+            if [ $lock_in == $lock_out ]; then
+                error "no lock timeout happened"
+            else
+                echo "success"
+            fi
+        else
+            if [ $lock_in != $lock_out ]; then
+                error "lock timeout happened"
+            else
+                echo "success"
+            fi
+        fi
+    done
+}
+run_test 34 "no lock timeout under IO"
+
 log "cleanup: ======================================================"
 check_and_cleanup_lustre
diff --git a/lustre/tests/sendfile.c b/lustre/tests/sendfile.c
index 21ae58a..5cfa110 100644
--- a/lustre/tests/sendfile.c
+++ b/lustre/tests/sendfile.c
@@ -1,3 +1,38 @@
+/* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*-
+ * vim:expandtab:shiftwidth=8:tabstop=8:
+ *
+ * GPL HEADER START
+ *
+ * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER.
+ *
+ * This program is free software; you can redistribute it and/or modify
+ * it under the terms of the GNU General Public License version 2 only,
+ * as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope that it will be useful, but
+ * WITHOUT ANY WARRANTY; without even the implied warranty of
+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
+ * General Public License version 2 for more details (a copy is included
+ * in the LICENSE file that accompanied this code).
+ * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. + */ #include #include @@ -20,85 +55,99 @@ int main(int argc, char *argv[]) { - char *sfile, *tfile; - struct stat stbuf; - int size; - int infd, outfd; - int sd[2]; - int rc; - char *buf; - char cmd[1024]; - int page_size = sysconf(_SC_PAGESIZE); - loff_t pos; - - if (argc < 3) { - fprintf(stderr, "%s \n", argv[0]); - exit(-1); - } - - sfile = argv[1]; - tfile = argv[2]; - - if (stat(sfile, &stbuf) < 0) { - if (errno == ENOENT) { - /* assume doing non-object file testing */ - infd = open(sfile, O_LOV_DELAY_CREATE|O_CREAT|O_RDWR, - 0644); - if (infd < 0) - syserr("open source file:"); - - size = random() % (1 * 1024 * 1024) + 1024; - if (ftruncate(infd, (off_t)size) < 0) - syserr("truncate file error:"); - } else { - syserr("stat file: "); - } - } else if (S_ISREG(stbuf.st_mode)) { - size = (int)stbuf.st_size; - infd = open(sfile, O_RDONLY, 0644); - if (infd < 0) - syserr("Open an existing file error:"); - } else { - fprintf(stderr, "%s is not a regular file\n", sfile); - exit(-1); - } - - outfd = open(tfile, O_WRONLY|O_TRUNC|O_CREAT, 0666); - if (outfd < 0) - syserr("open dest file:"); - - rc = socketpair(AF_LOCAL, SOCK_STREAM, 0, sd); - if (rc < 0) - syserr("socketpair"); - - pos = 0; - while (size > 0) { - int rc2; - size_t seg_size; - - seg_size = (size < page_size) ? 
size : (random() % size + 1); - if (seg_size > 4 * page_size) - seg_size = 4 * page_size; - rc = sendfile(sd[0], infd, &pos, seg_size); - if (rc < 0) - syserr("sendfile:"); - - size -= seg_size; - if (size == 0) - close(sd[0]); - - buf = malloc(seg_size); - rc = read(sd[1], buf, seg_size); - if (rc != seg_size) - syserr("read from socket:"); - - rc2 = write(outfd, buf, rc); - if (rc2 != rc) - syserr("write dest file error:"); - free(buf); - } - close(sd[1]), close(infd), close(outfd); - - sprintf(cmd, "cmp %s %s\n", sfile, tfile); - return system(cmd); + char *sfile, *tfile; + struct stat stbuf; + int size; + unsigned long bufsize = 1024 * 1024; + int infd, outfd; + int sd[2]; + int rc; + char *buf; + char cmd[1024]; + loff_t pos; + + if (argc < 3) { + fprintf(stderr, "%s \n", argv[0]); + exit(-1); + } + + sfile = argv[1]; + tfile = argv[2]; + + if (stat(sfile, &stbuf) < 0) { + if (errno == ENOENT) { + /* assume doing non-object file testing */ + infd = open(sfile, O_LOV_DELAY_CREATE|O_CREAT|O_RDWR, + 0644); + if (infd < 0) + syserr("open source file:"); + + size = random() % (1 * 1024 * 1024) + 1024; + if (ftruncate(infd, (off_t)size) < 0) + syserr("truncate file error:"); + } else { + syserr("stat file: "); + } + } else if (S_ISREG(stbuf.st_mode)) { + size = (int)stbuf.st_size; + infd = open(sfile, O_RDONLY, 0644); + if (infd < 0) + syserr("Open an existing file error:"); + } else { + fprintf(stderr, "%s is not a regular file\n", sfile); + exit(-1); + } + + outfd = open(tfile, O_WRONLY|O_TRUNC|O_CREAT, 0666); + if (outfd < 0) + syserr("open dest file:"); + + rc = socketpair(AF_LOCAL, SOCK_STREAM, 0, sd); + if (rc < 0) + syserr("socketpair"); + + rc = fcntl(sd[0], F_SETFL, O_NONBLOCK); + if (rc < 0) + syserr("fcntl"); + + rc = setsockopt(sd[0], SOL_SOCKET, SO_SNDBUF, + &bufsize, sizeof(bufsize)); + if (rc) + syserr("setsockopt"); + + srandom(time(NULL)); + + pos = 0; + while (size > 0) { + int rc2; + size_t seg_size; + + seg_size = random() % bufsize + 1; + if (seg_size > size) + seg_size = size; + + while (seg_size) { + rc = sendfile(sd[0], infd, &pos, seg_size); + if (rc < 0) + syserr("sendfile:"); + + seg_size -= rc; + size -= rc; + if (size == 0) + close(sd[0]); + + buf = malloc(rc); + if (read(sd[1], buf, rc) < 0) + syserr("read from socket:"); + + rc2 = write(outfd, buf, rc); + if (rc2 != rc) + syserr("write dest file error:"); + free(buf); + } + } + close(sd[1]), close(infd), close(outfd); + + sprintf(cmd, "cmp %s %s\n", sfile, tfile); + return system(cmd); } diff --git a/lustre/tests/test-framework.sh b/lustre/tests/test-framework.sh index 19cc076..617f2bc 100644 --- a/lustre/tests/test-framework.sh +++ b/lustre/tests/test-framework.sh @@ -15,6 +15,7 @@ export GSS=false export GSS_KRB5=false export GSS_PIPEFS=false export IDENTITY_UPCALL=default + #export PDSH="pdsh -S -Rssh -w" # eg, assert_env LUSTRE MDSNODES OSTNODES CLIENTS @@ -31,12 +32,12 @@ assert_env() { assert_DIR () { local failed="" - [ -z "`echo :$DIR: | grep :$MOUNT:`" ] && \ - failed=1 && echo "DIR not in $MOUNT. Aborting." - [ -z "`echo :$DIR1: | grep :$MOUNT1:`" ] && \ - failed=1 && echo "DIR1 not in $MOUNT1. Aborting." - [ -z "`echo :$DIR2: | grep :$MOUNT2:`" ] && \ - failed=1 && echo "DIR2 not in $MOUNT2. Aborting" + [[ $DIR/ = $MOUNT/* ]] || \ + { failed=1 && echo "DIR=$DIR not in $MOUNT. Aborting."; } + [[ $DIR1/ = $MOUNT1/* ]] || \ + { failed=1 && echo "DIR1=$DIR1 not in $MOUNT1. Aborting."; } + [[ $DIR2/ = $MOUNT2/* ]] || \ + { failed=1 && echo "DIR2=$DIR2 not in $MOUNT2. 
Aborting"; } [ -n "$failed" ] && exit 99 || true } @@ -98,7 +99,10 @@ init_test_env() { export PATH=$PATH:$LUSTRE/tests fi export MDSRATE=${MDSRATE:-"$LUSTRE/tests/mdsrate"} - [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate) + [ ! -f "$MDSRATE" ] && export MDSRATE=$(which mdsrate 2> /dev/null) + if ! echo $PATH | grep -q $LUSTRE/test/racer; then + export PATH=$PATH:$LUSTRE/tests/racer + fi export LCTL=${LCTL:-"$LUSTRE/utils/lctl"} [ ! -f "$LCTL" ] && export LCTL=$(which lctl) export LFS=${LFS:-"$LUSTRE/utils/lfs"} @@ -176,6 +180,11 @@ init_test_env() { } +case `uname -r` in +2.4.*) EXT=".o"; USE_QUOTA=no; [ ! "$CLIENTONLY" ] && FSTYPE=ext3;; + *) EXT=".ko"; USE_QUOTA=yes;; +esac + load_module() { EXT=".ko" module=$1 @@ -210,9 +219,10 @@ load_modules() { load_module ../libcfs/libcfs/libcfs [ "$PTLDEBUG" ] && lctl set_param debug=$PTLDEBUG [ "$SUBSYSTEM" ] && lctl set_param subsystem_debug=${SUBSYSTEM# } + local MODPROBECONF= [ -f /etc/modprobe.conf ] && MODPROBECONF=/etc/modprobe.conf - [ -f /etc/modprobe.d/Lustre ] && MODPROBECONF=/etc/modprobe.d/Lustre - [ -z "$LNETOPTS" -a -n "$MODPROBECONF" ] && \ + [ ! "$MODPROBECONF" -a -d /etc/modprobe.d ] && MODPROBECONF=/etc/modprobe.d/Lustre + [ -z "$LNETOPTS" -a "$MODPROBECONF" ] && \ LNETOPTS=$(awk '/^options lnet/ { print $0}' $MODPROBECONF | sed 's/^options lnet //g') echo $LNETOPTS | grep -q "accept=all" || LNETOPTS="$LNETOPTS accept=all"; echo "lnet options: '$LNETOPTS'" @@ -224,12 +234,9 @@ load_modules() { load_module obdclass/obdclass load_module ptlrpc/ptlrpc load_module ptlrpc/gss/ptlrpc_gss - # Now, some modules depend on lquota without USE_QUOTA check, - # will fix later. Disable check "$USE_QUOTA" = "yes" temporary. - #[ "$USE_QUOTA" = "yes" ] && load_module quota/lquota - load_module quota/lquota - load_module fid/fid + [ "$USE_QUOTA" = "yes" -a "$LQUOTA" != "no" ] && load_module quota/lquota load_module fld/fld + load_module fid/fid load_module lmv/lmv load_module mdc/mdc load_module osc/osc @@ -251,8 +258,8 @@ load_modules() { load_module llite/lustre load_module llite/llite_lloop - rm -f $TMP/ogdb-$HOSTNAME - OGDB=$TMP + OGDB=${OGDB:-$TMP} + rm -f $OGDB/ogdb-$HOSTNAME [ -d /r ] && OGDB="/r/tmp" $LCTL modules > $OGDB/ogdb-$HOSTNAME @@ -307,7 +314,7 @@ check_mem_leak () { echo "$LEAK_LUSTRE" 1>&2 echo "$LEAK_PORTALS" 1>&2 mv $TMP/debug $TMP/debug-leak.`date +%s` || true - log "Memory leaks detected" + echo "Memory leaks detected" [ -n "$IGNORE_LEAK" ] && { echo "ignoring leaks" && return 0; } || true return 1 fi @@ -590,6 +597,13 @@ reboot_facet() { fi } +boot_node() { + local node=$1 + if [ "$FAILURE_MODE" = HARD ]; then + $POWER_UP $node + fi +} + # verify that lustre actually cleaned up properly cleanup_check() { [ -f $CATASTROPHE ] && [ `cat $CATASTROPHE` -ne 0 ] && \ @@ -690,8 +704,8 @@ wait_remote_prog () { [ "$PDSH" = "no_dsh" ] && return 0 while [ $WAIT -lt $2 ]; do - running=$(ps uax | grep "$PDSH.*$prog.*$MOUNT" | grep -v grep) - [ -z "${running}" ] && return 0 + running=$(ps uax | grep "$PDSH.*$prog.*$MOUNT" | grep -v grep) || true + [ -z "${running}" ] && return 0 || true echo "waited $WAIT for: " echo "$running" [ $INTERVAL -lt 60 ] && INTERVAL=$((INTERVAL + INTERVAL)) @@ -737,8 +751,10 @@ client_reconnect() { facet_failover() { facet=$1 + sleep_time=$2 echo "Failing $facet on node `facet_active_host $facet`" shutdown_facet $facet + [ -n "$sleep_time" ] && sleep $sleep_time reboot_facet $facet client_df & DFPID=$! 
@@ -776,6 +792,16 @@ replay_barrier_nodf() {
     $LCTL mark "local REPLAY BARRIER on ${!svc}"
 }

+replay_barrier_nosync() {
+    local facet=$1    echo running=${running}
+    local svc=${facet}_svc
+    echo Replay barrier on ${!svc}
+    do_facet $facet $LCTL --device %${!svc} readonly
+    do_facet $facet $LCTL --device %${!svc} notransno
+    do_facet $facet $LCTL mark "$facet REPLAY BARRIER on ${!svc}"
+    $LCTL mark "local REPLAY BARRIER on ${!svc}"
+}
+
 mds_evict_client() {
     UUID=`lctl get_param -n mdc.${mds1_svc}-mdc-*.uuid`
     do_facet mds1 "lctl set_param -n mdt.${mds1_svc}.evict_client $UUID"
@@ -1056,6 +1082,11 @@ mdsmkfsopts()
 }

 formatall() {
+    if [ "$IAMDIR" == "yes" ]; then
+        MDS_MKFS_OPTS="$MDS_MKFS_OPTS --iam-dir"
+        MDSn_MKFS_OPTS="$MDSn_MKFS_OPTS --iam-dir"
+    fi
+
     [ "$FSTYPE" ] && FSTYPE_OPT="--backfstype $FSTYPE"

     if [ ! -z $SEC ]; then
@@ -1104,7 +1135,7 @@ switch_identity() {
     local num=$1
     local switch=$2
     local j=`expr $num - 1`
-    local MDT="`do_facet mds$num lctl get_param -N mdt.*MDT*$j | cut -d"." -f2 2>/dev/null || true`"
+    local MDT="`(do_facet mds$num lctl get_param -N mdt.*MDT*$j 2>/dev/null | cut -d"." -f2 2>/dev/null) || true`"

     if [ -z "$MDT" ]; then
         return 2
@@ -1199,6 +1230,10 @@ setupall() {
         done
     fi

+    # wait a while to allow sptlrpc configuration to be propagated to targets,
+    # only needed when mounting new target devices.
+    $GSS && sleep 10
+
     [ "$DAEMONFILE" ] && $LCTL debug_daemon start $DAEMONFILE $DAEMONSIZE
     mount_client $MOUNT
     [ -n "$CLIENTS" ] && zconf_mount_clients $CLIENTS $MOUNT
@@ -1260,15 +1295,29 @@ init_facets_vars () {
     done
 }

+check_config () {
+    local mntpt=$1
+
+    echo Checking config lustre mounted on $mntpt
+    local mgshost=$(mount | grep " $mntpt " | awk -F@ '{print $1}')
+    mgshost=$(echo $mgshost | awk -F: '{print $1}')
+    if [ "$mgshost" != "$mgs_HOST" ]; then
+        FAIL_ON_ERROR=true \
+            error "Bad config file: lustre is mounted with mgs $mgshost, but mgs_HOST=$mgs_HOST
+                   Please use correct config or set mgs_HOST correctly!"
+    fi
+}
+
 check_and_setup_lustre() {
-    MOUNTED="`mounted_lustre_filesystems`"
-    if [ -z "$MOUNTED" ]; then
+    local MOUNTED=$(mounted_lustre_filesystems)
+    if [ -z "$MOUNTED" ] || ! $(echo $MOUNTED | grep -w -q $MOUNT); then
         [ "$REFORMAT" ] && formatall
         setupall
-        MOUNTED="`mounted_lustre_filesystems`"
+        MOUNTED=$(mounted_lustre_filesystems | head -1)
         [ -z "$MOUNTED" ] && error "NAME=$NAME not mounted"
         export I_MOUNTED=yes
     else
+        check_config $MOUNT
         init_facets_vars
     fi
     if [ "$ONLY" == "setup" ]; then
@@ -1602,6 +1651,8 @@ basetest() {
     IFS=abcdefghijklmnopqrstuvwxyz _basetest $1
 }

+# print a newline if the last test was skipped
+export LAST_SKIPPED=
 run_test() {
     assert_DIR

@@ -1609,38 +1660,46 @@ run_test() {
     if [ ! -z "$ONLY" ]; then
         testname=ONLY_$1
         if [ ${!testname}x != x ]; then
+            [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED=
             run_one $1 "$2"
             return $?
         fi
         testname=ONLY_$base
         if [ ${!testname}x != x ]; then
+            [ "$LAST_SKIPPED" ] && echo "" && LAST_SKIPPED=
             run_one $1 "$2"
             return $?
         fi
+        LAST_SKIPPED="y"
         echo -n "."
return 0 fi testname=EXCEPT_$1 if [ ${!testname}x != x ]; then + LAST_SKIPPED="y" TESTNAME=test_$1 skip "skipping excluded test $1" return 0 fi testname=EXCEPT_$base if [ ${!testname}x != x ]; then + LAST_SKIPPED="y" TESTNAME=test_$1 skip "skipping excluded test $1 (base $base)" return 0 fi testname=EXCEPT_SLOW_$1 if [ ${!testname}x != x ]; then + LAST_SKIPPED="y" TESTNAME=test_$1 skip "skipping SLOW test $1" return 0 fi testname=EXCEPT_SLOW_$base if [ ${!testname}x != x ]; then + LAST_SKIPPED="y" TESTNAME=test_$1 skip "skipping SLOW test $1 (base $base)" return 0 fi + LAST_SKIPPED= run_one $1 "$2" return $? @@ -1793,10 +1852,18 @@ osc_to_ost() echo $ost } +remote_node () { + local node=$1 + [ "$node" != "$(hostname)" ] +} + remote_mds () { - local var=${SINGLEMDS}_HOST - [ "${!var}" != "$(hostname)" ] + local node + for node in $(mdts_nodes); do + remote_node $node && return 0 + done + return 1 } remote_mds_nodsh() @@ -1806,7 +1873,11 @@ remote_mds_nodsh() remote_ost () { - [ "$ost_HOST" != "$(hostname)" ] + local node + for node in $(osts_nodes) ; do + remote_node $node && return 0 + done + return 1 } remote_ost_nodsh() @@ -1904,6 +1975,12 @@ mixed_ost_devs () { [ ! "$OSTCOUNT" = "$osscount" ] } +mixed_mdt_devs () { + local nodes=$(mdts_nodes) + local mdtcount=$(get_node_count "$nodes") + [ ! "$MDSCOUNT" = "$mdtcount" ] +} + generate_machine_file() { local nodes=${1//,/ } local machinefile=$2 @@ -2012,6 +2089,18 @@ calc_llite_stats() { echo $res } +# reset osc stat counters +clear_osc_stats(){ + lctl set_param -n osc.*.osc_stats 0 +} + +# sum osc stat items +calc_osc_stats() { + local res=$(lctl get_param -n osc.*.osc_stats | + awk 'BEGIN {s = 0} END {print s} /^'"$1"'/ {s += $2}') + echo $res +} + calc_sum () { awk 'BEGIN {s = 0}; {s += $1}; END {print s}' } @@ -2046,3 +2135,20 @@ check_catastrophe () { fi } +# $1 node +# $2 file +get_stripe_info() { + local tmp_file + + stripe_size=0 + stripe_count=0 + stripe_index=0 + tmp_file=$(mktemp) + + do_facet $1 lfs getstripe -v $2 > $tmp_file + + stripe_size=`awk '$1 ~ /size/ {print $2}' $tmp_file` + stripe_count=`awk '$1 ~ /count/ {print $2}' $tmp_file` + stripe_index=`awk '/obdidx/ {start = 1; getline; print $1; exit}' $tmp_file` + rm -f $tmp_file +} diff --git a/lustre/utils/gss/gss_util.c b/lustre/utils/gss/gss_util.c index b08f2f5..fa4838b 100644 --- a/lustre/utils/gss/gss_util.c +++ b/lustre/utils/gss/gss_util.c @@ -90,11 +90,14 @@ #include "lsupport.h" /* Global gssd_credentials handle */ +gss_cred_id_t gssd_cred_mgs; gss_cred_id_t gssd_cred_mds; gss_cred_id_t gssd_cred_oss; +int gssd_cred_mgs_valid = 0; int gssd_cred_mds_valid = 0; int gssd_cred_oss_valid = 0; +char *mgs_local_realm = NULL; char *mds_local_realm = NULL; char *oss_local_realm = NULL; @@ -284,8 +287,14 @@ int gssd_acquire_cred(char *server_name, gss_cred_id_t *cred, return 0; } -int gssd_prepare_creds(int must_srv_mds, int must_srv_oss) +int gssd_prepare_creds(int must_srv_mgs, int must_srv_mds, int must_srv_oss) { + if (gssd_acquire_cred(GSSD_SERVICE_MGS, &gssd_cred_mgs, + &mgs_local_realm, &gssd_cred_mgs_valid)) { + if (must_srv_mgs) + return -1; + } + if (gssd_acquire_cred(GSSD_SERVICE_MDS, &gssd_cred_mds, &mds_local_realm, &gssd_cred_mds_valid)) { if (must_srv_mds) @@ -298,11 +307,16 @@ int gssd_prepare_creds(int must_srv_mds, int must_srv_oss) return -1; } - if (!gssd_cred_mds_valid && !gssd_cred_oss_valid) { - printerr(0, "can't obtain both mds & oss creds, exit\n"); + if (!gssd_cred_mgs_valid && + !gssd_cred_mds_valid && + !gssd_cred_oss_valid) { + printerr(0, 
"can't obtain any service creds, exit\n"); return -1; } + if (gssd_cred_mgs_valid) + printerr(0, "Ready to serve Lustre MGS in realm %s\n", + mgs_local_realm ? mgs_local_realm : "N/A"); if (gssd_cred_mds_valid) printerr(0, "Ready to serve Lustre MDS in realm %s\n", mds_local_realm ? mds_local_realm : "N/A"); @@ -316,6 +330,12 @@ int gssd_prepare_creds(int must_srv_mds, int must_srv_oss) gss_cred_id_t gssd_select_svc_cred(int lustre_svc) { switch (lustre_svc) { + case LUSTRE_GSS_SVC_MGS: + if (!gssd_cred_mgs_valid) { + printerr(0, "ERROR: service cred for mgs not ready\n"); + return NULL; + } + return gssd_cred_mgs; case LUSTRE_GSS_SVC_MDS: if (!gssd_cred_mds_valid) { printerr(0, "ERROR: service cred for mds not ready\n"); diff --git a/lustre/utils/gss/gssd.h b/lustre/utils/gss/gssd.h index 5f0006e..5d1e8cb 100644 --- a/lustre/utils/gss/gssd.h +++ b/lustre/utils/gss/gssd.h @@ -48,6 +48,7 @@ #define GSSD_DEFAULT_CRED_PREFIX "krb5cc_" #define GSSD_DEFAULT_MACHINE_CRED_SUFFIX "machine" #define GSSD_DEFAULT_KEYTAB_FILE "/etc/krb5.keytab" +#define GSSD_SERVICE_MGS "lustre_mgs" #define GSSD_SERVICE_MDS "lustre_mds" #define GSSD_SERVICE_OSS "lustre_oss" #define GSSD_SERVICE_MDS_NAMELEN 10 diff --git a/lustre/utils/gss/lgss_utils.c b/lustre/utils/gss/lgss_utils.c index db3152e..e665d85 100644 --- a/lustre/utils/gss/lgss_utils.c +++ b/lustre/utils/gss/lgss_utils.c @@ -102,9 +102,9 @@ #include "lgss_krb5_utils.h" const char *lgss_svc_str[LGSS_SVC_MAX] = { + [LGSS_SVC_MGS] = LGSS_SVC_MGS_STR, [LGSS_SVC_MDS] = LGSS_SVC_MDS_STR, [LGSS_SVC_OSS] = LGSS_SVC_OST_STR, - [LGSS_SVC_MGS] = LGSS_SVC_MGS_STR, }; /**************************************** diff --git a/lustre/utils/gss/lgss_utils.h b/lustre/utils/gss/lgss_utils.h index 5553591..bd2fa93 100644 --- a/lustre/utils/gss/lgss_utils.h +++ b/lustre/utils/gss/lgss_utils.h @@ -47,15 +47,15 @@ #include +#define LGSS_SVC_MGS_STR "lustre_mgs" #define LGSS_SVC_MDS_STR "lustre_mds" #define LGSS_SVC_OST_STR "lustre_oss" -#define LGSS_SVC_MGS_STR "lustre_mgs" #define LGSS_USR_ROOT_STR "lustre_root" typedef enum { - LGSS_SVC_MDS = 0, - LGSS_SVC_OSS = 1, - LGSS_SVC_MGS = 2, + LGSS_SVC_MGS = 0, + LGSS_SVC_MDS = 1, + LGSS_SVC_OSS = 2, LGSS_SVC_MAX } lgss_svc_t; diff --git a/lustre/utils/gss/lsupport.c b/lustre/utils/gss/lsupport.c index ca964ff..cbd7a56 100644 --- a/lustre/utils/gss/lsupport.c +++ b/lustre/utils/gss/lsupport.c @@ -71,8 +71,9 @@ #endif #include "lsupport.h" -const char * lustre_svc_name[] = +const char * lustre_svc_name[] = { + [LUSTRE_GSS_SVC_MGS] = "MGS", [LUSTRE_GSS_SVC_MDS] = "MDS", [LUSTRE_GSS_SVC_OSS] = "OSS", }; diff --git a/lustre/utils/gss/lsupport.h b/lustre/utils/gss/lsupport.h index 2172e9c..fece9c4 100644 --- a/lustre/utils/gss/lsupport.h +++ b/lustre/utils/gss/lsupport.h @@ -20,8 +20,9 @@ void gssd_exit_unique(int type); * copied from lustre source */ -#define LUSTRE_GSS_SVC_MDS 0 -#define LUSTRE_GSS_SVC_OSS 1 +#define LUSTRE_GSS_SVC_MGS 0 +#define LUSTRE_GSS_SVC_MDS 1 +#define LUSTRE_GSS_SVC_OSS 2 extern const char * lustre_svc_name[]; diff --git a/lustre/utils/gss/svcgssd.c b/lustre/utils/gss/svcgssd.c index 3ab7ad2..cebd852 100644 --- a/lustre/utils/gss/svcgssd.c +++ b/lustre/utils/gss/svcgssd.c @@ -177,7 +177,7 @@ sig_hup(int signal) static void usage(char *progname) { - fprintf(stderr, "usage: %s [-n] [-f] [-v] [-r] [-m] [-o]\n", + fprintf(stderr, "usage: %s [-n] [-f] [-v] [-r] [-m] [-o] [-g]\n", progname); exit(1); } @@ -189,11 +189,11 @@ main(int argc, char *argv[]) int fg = 0; int verbosity = 0; int opt; - int must_srv_mds = 0, 
must_srv_oss = 0; + int must_srv_mds = 0, must_srv_oss = 0, must_srv_mgs = 0; extern char *optarg; char *progname; - while ((opt = getopt(argc, argv, "fivrnp:")) != -1) { + while ((opt = getopt(argc, argv, "fvrnmog:")) != -1) { switch (opt) { case 'f': fg = 1; @@ -212,6 +212,10 @@ main(int argc, char *argv[]) get_creds = 1; must_srv_oss = 1; break; + case 'g': + get_creds = 1; + must_srv_mgs = 1; + break; default: usage(argv[0]); break; @@ -235,10 +239,11 @@ main(int argc, char *argv[]) exit(1); } - if (get_creds && gssd_prepare_creds(must_srv_mds, must_srv_oss)) { + if (get_creds && + gssd_prepare_creds(must_srv_mgs, must_srv_mds, must_srv_oss)) { printerr(0, "unable to obtain root (machine) credentials\n"); printerr(0, "do you have a keytab entry for " - "nfs/@ in " + "/@ in " "/etc/krb5.keytab?\n"); exit(1); } diff --git a/lustre/utils/gss/svcgssd.h b/lustre/utils/gss/svcgssd.h index 5283c95..a2eece6 100644 --- a/lustre/utils/gss/svcgssd.h +++ b/lustre/utils/gss/svcgssd.h @@ -37,7 +37,7 @@ int handle_nullreq(FILE *f); void svcgssd_run(void); -int gssd_prepare_creds(int must_srv_mds, int must_srv_oss); +int gssd_prepare_creds(int must_srv_mgs, int must_srv_mds, int must_srv_oss); gss_cred_id_t gssd_select_svc_cred(int lustre_svc); extern char *mds_local_realm; @@ -46,6 +46,7 @@ extern char *oss_local_realm; #define GSSD_SERVICE_NAME "lustre" /* XXX */ +#define GSSD_SERVICE_MGS "lustre_mgs" #define GSSD_SERVICE_MDS "lustre_mds" #define GSSD_SERVICE_OSS "lustre_oss" #define LUSTRE_ROOT_NAME "lustre_root" diff --git a/lustre/utils/gss/svcgssd_proc.c b/lustre/utils/gss/svcgssd_proc.c index 2ba8e37..5074a0e 100644 --- a/lustre/utils/gss/svcgssd_proc.c +++ b/lustre/utils/gss/svcgssd_proc.c @@ -344,8 +344,9 @@ get_ids(gss_name_t client_name, gss_OID mech, struct svc_cred *cred, if (host) *host++ = '\0'; - if (strcmp(sname, GSSD_SERVICE_OSS) == 0) { - printerr(0, "forbid "GSSD_SERVICE_OSS" as user name\n"); + if (strcmp(sname, GSSD_SERVICE_OSS) == 0 || + strcmp(sname, GSSD_SERVICE_MGS) == 0) { + printerr(0, "forbid %s as user name\n", sname); goto out_free; } diff --git a/lustre/utils/l_getidentity.c b/lustre/utils/l_getidentity.c index f45a8ae..ae4c437 100644 --- a/lustre/utils/l_getidentity.c +++ b/lustre/utils/l_getidentity.c @@ -194,6 +194,7 @@ static perm_type_t perm_types[] = { { "setgid", CFS_SETGID_PERM }, { "setgrp", CFS_SETGRP_PERM }, { "rmtacl", CFS_RMTACL_PERM }, + { "rmtown", CFS_RMTOWN_PERM }, { 0 } }; @@ -202,6 +203,7 @@ static perm_type_t noperm_types[] = { { "nosetgid", CFS_SETGID_PERM }, { "nosetgrp", CFS_SETGRP_PERM }, { "normtacl", CFS_RMTACL_PERM }, + { "normtown", CFS_RMTOWN_PERM }, { 0 } }; diff --git a/lustre/utils/lfs.c b/lustre/utils/lfs.c index f0d07ff..6a3bf18 100644 --- a/lustre/utils/lfs.c +++ b/lustre/utils/lfs.c @@ -59,6 +59,10 @@ #include #include #include +#ifdef HAVE_SYS_QUOTA_H +# include +#endif + /* For dirname() */ #include @@ -83,13 +87,14 @@ static int lfs_osts(int argc, char **argv); static int lfs_df(int argc, char **argv); static int lfs_check(int argc, char **argv); static int lfs_catinfo(int argc, char **argv); -#ifdef HAVE_QUOTA_SUPPORT +#ifdef HAVE_SYS_QUOTA_H static int lfs_quotachown(int argc, char **argv); static int lfs_quotacheck(int argc, char **argv); static int lfs_quotaon(int argc, char **argv); static int lfs_quotaoff(int argc, char **argv); static int lfs_setquota(int argc, char **argv); static int lfs_quota(int argc, char **argv); +static int lfs_quotainv(int argc, char **argv); #endif static int lfs_flushctx(int argc, char **argv); 
static int lfs_join(int argc, char **argv); @@ -156,7 +161,7 @@ command_t cmdlist[] = { "report filesystem disk space usage or inodes usage" "of each MDS/OSD.\n" "Usage: df [-i] [-h] [path]"}, -#ifdef HAVE_QUOTA_SUPPORT +#ifdef HAVE_SYS_QUOTA_H {"quotachown",lfs_quotachown, 0, "Change files' owner or group on the specified filesystem.\n" "usage: quotachown [-i] \n" @@ -170,10 +175,24 @@ command_t cmdlist[] = { {"quotaoff", lfs_quotaoff, 0, "Turn filesystem quotas off.\n" "usage: quotaoff [ -ug ] "}, {"setquota", lfs_setquota, 0, "Set filesystem quotas.\n" - "usage: setquota [ -u | -g ] \n" - " setquota -t [ -u | -g ] "}, + "usage: setquota [ -u | -g ] -b -B -i -I \n" + " setquota -t [ -u | -g ] \n" + " setquota [ -u | --user | -g | --group ] \n" + " [--block-softlimit ]\n" + " [--block-hardlimit ]\n" + " [--inode-softlimit ]\n" + " [--inode-hardlimit ] \n" + " setquota [-t] [ -u | --user | -g | --group ]\n" + " [--block-grace ]\n" + " [--inode-grace ] \n" + " -b can be used instead of --block-softlimit/--block-grace\n" + " -B can be used instead of --block-hardlimit\n" + " -i can be used instead of --inode-softlimit/--inode-grace\n" + " -I can be used instead of --inode-hardlimit"}, {"quota", lfs_quota, 0, "Display disk usage and limits.\n" - "usage: quota [ -o obd_uuid ] [{-u|-g }|-t] "}, + "usage: quota [-v] [-o obd_uuid|-i mdt_idx|-I ost_idx] [{-u|-g }|-t] "}, + {"quotainv", lfs_quotainv, 0, "Invalidate quota data.\n" + "usage: quotainv [-u|-g] "}, #endif {"flushctx", lfs_flushctx, 0, "Flush security context for current user.\n" "usage: flushctx [-k] [mountpoint...]"}, @@ -267,7 +286,7 @@ static int lfs_setstripe(int argc, char **argv) { optind = 0; while ((c = getopt_long(argc, argv, "c:di:o:s:p:", - long_opts, NULL)) >= 0) { + long_opts, NULL)) >= 0) { switch (c) { case 0: /* Long options. */ @@ -313,13 +332,13 @@ static int lfs_setstripe(int argc, char **argv) if (optind == argc) { fprintf(stderr, "error: %s: missing filename|dirname\n", - argv[0]); + argv[0]); return CMD_HELP; } /* get the stripe size */ if (stripe_size_arg != NULL) { - result = parse_size(stripe_size_arg, &st_size, &size_units); + result = parse_size(stripe_size_arg, &st_size, &size_units, 0); if (result) { fprintf(stderr, "error: %s: bad size '%s'\n", argv[0], stripe_size_arg); @@ -392,9 +411,12 @@ static int set_time(time_t *time, time_t *set, char *str) return res; } +#define USER 0 +#define GROUP 1 + static int name2id(unsigned int *id, char *name, int type) { - if (type == USRQUOTA) { + if (type == USER) { struct passwd *entry; if (!(entry = getpwnam(name))) { @@ -421,7 +443,7 @@ static int name2id(unsigned int *id, char *name, int type) static int id2name(char **name, unsigned int id, int type) { - if (type == USRQUOTA) { + if (type == USER) { struct passwd *entry; if (!(entry = getpwuid(id))) { @@ -491,8 +513,8 @@ static int lfs_find(int argc, char **argv) time(&t); optind = 0; - /* when getopt_long_only() hits '!' it returns 1 and puts "!" in optarg */ - while ((c = getopt_long_only(argc, argv, "-A:C:D:g:G:M:n:PpO:qrs:t:u:U:v", + /* when getopt_long_only() hits '!' it returns 1, puts "!" 
in optarg */ + while ((c = getopt_long_only(argc,argv,"-A:C:D:g:G:M:n:PpO:qrs:t:u:U:v", long_opts, NULL)) >= 0) { xtime = NULL; xsign = NULL; @@ -582,7 +604,7 @@ static int lfs_find(int argc, char **argv) new_fashion = 1; param.gid = strtol(optarg, &endptr, 10); if (optarg == endptr) { - ret = name2id(¶m.gid, optarg, GRPQUOTA); + ret = name2id(¶m.gid, optarg, GROUP); if (ret != 0) { fprintf(stderr, "Group/GID: %s cannot " "be found.\n", optarg); @@ -606,7 +628,7 @@ static int lfs_find(int argc, char **argv) new_fashion = 1; param.uid = strtol(optarg, &endptr, 10); if (optarg == endptr) { - ret = name2id(¶m.uid, optarg, USRQUOTA); + ret = name2id(¶m.uid, optarg, USER); if (ret != 0) { fprintf(stderr, "User/UID: %s cannot " "be found.\n", optarg); @@ -723,7 +745,8 @@ static int lfs_find(int argc, char **argv) if (param.size_sign) optarg++; - ret = parse_size(optarg, ¶m.size,¶m.size_units); + ret = parse_size(optarg, ¶m.size, + ¶m.size_units, 0); if (ret) { fprintf(stderr,"error: bad size '%s'\n", optarg); @@ -1066,12 +1089,12 @@ static int mntdf(char *mntdir, int ishow, int cooked) if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO || rc == -ENODATA || rc == 0) { - showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked, - "MDT", index, rc); + showdf(mntdir, &stat_buf, obd_uuid2str(&uuid_buf), + ishow, cooked, "MDT", index, rc); } else { fprintf(stderr, "error: llapi_obd_statfs(%s): %s (%d)\n", - uuid_buf.uuid, strerror(-rc), rc); + obd_uuid2str(&uuid_buf), strerror(-rc), rc); return rc; } if (rc == 0) { @@ -1093,8 +1116,8 @@ static int mntdf(char *mntdir, int ishow, int cooked) if (rc == -ENOTCONN || rc == -ETIMEDOUT || rc == -EIO || rc == -ENODATA || rc == 0) { - showdf(mntdir, &stat_buf, uuid_buf.uuid, ishow, cooked, - "OST", index, rc); + showdf(mntdir, &stat_buf, obd_uuid2str(&uuid_buf), + ishow, cooked, "OST", index, rc); } else { fprintf(stderr, "error: llapi_obd_statfs failed: %s (%d)\n", @@ -1314,7 +1337,7 @@ out: return rc; } -#ifdef HAVE_QUOTA_SUPPORT +#ifdef HAVE_SYS_QUOTA_H static int lfs_quotachown(int argc, char **argv) { @@ -1341,15 +1364,13 @@ static int lfs_quotachown(int argc, char **argv) return rc; } - static int lfs_quotacheck(int argc, char **argv) { int c, check_type = 0; char *mnt; struct if_quotacheck qchk; struct if_quotactl qctl; - char *obd_type = qchk.obd_type; - char *obd_uuid = qchk.obd_uuid.uuid; + char *obd_type = (char *)qchk.obd_type; int rc; memset(&qchk, 0, sizeof(qchk)); @@ -1382,7 +1403,6 @@ static int lfs_quotacheck(int argc, char **argv) memset(&qctl, 0, sizeof(qctl)); qctl.qc_cmd = LUSTRE_Q_QUOTAOFF; - qctl.qc_id = QFMT_LDISKFS; qctl.qc_type = check_type; rc = llapi_quotactl(mnt, &qctl); if (rc) { @@ -1399,20 +1419,20 @@ static int lfs_quotacheck(int argc, char **argv) rc = llapi_poll_quotacheck(mnt, &qchk); if (rc) { if (*obd_type) - fprintf(stderr, "%s %s ", obd_type, obd_uuid); + fprintf(stderr, "%s %s ", obd_type, + obd_uuid2str(&qchk.obd_uuid)); fprintf(stderr, "quota check failed: %s\n", strerror(errno)); return rc; } memset(&qctl, 0, sizeof(qctl)); qctl.qc_cmd = LUSTRE_Q_QUOTAON; - qctl.qc_id = QFMT_LDISKFS; qctl.qc_type = check_type; rc = llapi_quotactl(mnt, &qctl); if (rc) { if (*obd_type) - fprintf(stderr, "%s %s ", - qctl.obd_type, qctl.obd_uuid.uuid); + fprintf(stderr, "%s %s ", (char *)qctl.obd_type, + obd_uuid2str(&qctl.obd_uuid)); fprintf(stderr, "%s turn on quota failed: %s\n", argv[0], strerror(errno)); return rc; @@ -1426,13 +1446,11 @@ static int lfs_quotaon(int argc, char **argv) int c; char *mnt; struct if_quotactl qctl; - 
char *obd_type = qctl.obd_type; - char *obd_uuid = qctl.obd_uuid.uuid; + char *obd_type = (char *)qctl.obd_type; int rc; memset(&qctl, 0, sizeof(qctl)); qctl.qc_cmd = LUSTRE_Q_QUOTAON; - qctl.qc_id = QFMT_LDISKFS; optind = 0; while ((c = getopt(argc, argv, "ugf")) != -1) { @@ -1466,7 +1484,8 @@ static int lfs_quotaon(int argc, char **argv) rc = llapi_quotactl(mnt, &qctl); if (rc) { if (*obd_type) - fprintf(stderr, "%s %s ", obd_type, obd_uuid); + fprintf(stderr, "%s %s ", obd_type, + obd_uuid2str(&qctl.obd_uuid)); fprintf(stderr, "%s failed: %s\n", argv[0], strerror(errno)); return rc; } @@ -1479,8 +1498,7 @@ static int lfs_quotaoff(int argc, char **argv) int c; char *mnt; struct if_quotactl qctl; - char *obd_type = qctl.obd_type; - char *obd_uuid = qctl.obd_uuid.uuid; + char *obd_type = (char *)qctl.obd_type; int rc; memset(&qctl, 0, sizeof(qctl)); @@ -1513,9 +1531,15 @@ static int lfs_quotaoff(int argc, char **argv) mnt = argv[optind]; rc = llapi_quotactl(mnt, &qctl); + if (rc == -1 && errno == ESRCH) { + fprintf(stderr, "\n%s quotas are not enabled.\n", + qctl.qc_type == 0x00 ? "user" : "group"); + return 0; + } if (rc) { if (*obd_type) - fprintf(stderr, "%s %s ", obd_type, obd_uuid); + fprintf(stderr, "%s %s ", obd_type, + obd_uuid2str(&qctl.obd_uuid)); fprintf(stderr, "quotaoff failed: %s\n", strerror(errno)); return rc; } @@ -1523,6 +1547,54 @@ static int lfs_quotaoff(int argc, char **argv) return 0; } +static int lfs_quotainv(int argc, char **argv) +{ + int c; + char *mnt; + struct if_quotactl qctl; + int rc; + + memset(&qctl, 0, sizeof(qctl)); + qctl.qc_cmd = LUSTRE_Q_INVALIDATE; + + optind = 0; + while ((c = getopt(argc, argv, "ugf")) != -1) { + switch (c) { + case 'u': + qctl.qc_type |= 0x01; + break; + case 'g': + qctl.qc_type |= 0x02; + break; + case 'f': + qctl.qc_cmd = LUSTRE_Q_FINVALIDATE; + break; + default: + fprintf(stderr, "error: %s: option '-%c' " + "unrecognized\n", argv[0], c); + return CMD_HELP; + } + } + + if (qctl.qc_type) + qctl.qc_type--; + else /* by default, invalidate quota for both user & group */ + qctl.qc_type = 0x02; + + if (argc == optind) + return CMD_HELP; + + mnt = argv[optind]; + + rc = llapi_quotactl(mnt, &qctl); + if (rc) { + fprintf(stderr, "quotainv failed: %s\n", strerror(errno)); + return rc; + } + + return 0; +} + #define ARG2INT(nr, str, msg) \ do { \ char *endp; \ @@ -1592,87 +1664,224 @@ error: return ULONG_MAX; } -int lfs_setquota(int argc, char **argv) +#define ARG2ULL(nr, str, defscale) \ +do { \ + unsigned long long limit, units = 0; \ + int rc; \ + \ + rc = parse_size(str, &limit, &units, 1); \ + if (rc < 0) { \ + fprintf(stderr, "error: bad limit value %s\n", str); \ + return CMD_HELP; \ + } \ + nr = ((units == 0) ? 
(defscale) : 1) * limit; \ +} while (0) + +static inline int has_times_option(int argc, char **argv) { - int c; - char *mnt; + int i; + + for (i = 1; i < argc; i++) + if (!strcmp(argv[i], "-t")) + return 1; + + return 0; +} + +int lfs_setquota_times(int argc, char **argv) +{ + int c, rc; struct if_quotactl qctl; - char *obd_type = qctl.obd_type; - char *obd_uuid = qctl.obd_uuid.uuid; - int rc; + char *mnt, *obd_type = (char *)qctl.obd_type; + struct obd_dqblk *dqb = &qctl.qc_dqblk; + struct obd_dqinfo *dqi = &qctl.qc_dqinfo; + struct option long_opts[] = { + {"user", no_argument, 0, 'u'}, + {"group", no_argument, 0, 'g'}, + {"block-grace", required_argument, 0, 'b'}, + {"inode-grace", required_argument, 0, 'i'}, + {"times", no_argument, 0, 't'}, + {0, 0, 0, 0} + }; memset(&qctl, 0, sizeof(qctl)); - qctl.qc_cmd = LUSTRE_Q_SETQUOTA; + qctl.qc_cmd = LUSTRE_Q_SETINFO; + qctl.qc_type = UGQUOTA; optind = 0; - while ((c = getopt(argc, argv, "ugt")) != -1) { + while ((c = getopt_long(argc, argv, "ugb:i:t", long_opts, NULL)) != -1) { switch (c) { case 'u': - qctl.qc_type |= 0x01; - break; case 'g': - qctl.qc_type |= 0x02; + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: -u and -g can't be used " + "more than once\n"); + return CMD_HELP; + } + qctl.qc_type = (c == 'u') ? USRQUOTA : GRPQUOTA; break; - case 't': - qctl.qc_cmd = LUSTRE_Q_SETINFO; + case 'b': + if ((dqi->dqi_bgrace = str2sec(optarg)) == ULONG_MAX) { + fprintf(stderr, "error: bad block-grace: %s\n", + optarg); + return CMD_HELP; + } + dqb->dqb_valid |= QIF_BTIME; break; - default: - fprintf(stderr, "error: %s: option '-%c' " - "unrecognized\n", argv[0], c); + case 'i': + if ((dqi->dqi_igrace = str2sec(optarg)) == ULONG_MAX) { + fprintf(stderr, "error: bad inode-grace: %s\n", + optarg); + return CMD_HELP; + } + dqb->dqb_valid |= QIF_ITIME; + break; + case 't': /* Yes, of course! 
*/ + break; + default: /* getopt prints error message for us when opterr != 0 */ return CMD_HELP; } } - if (qctl.qc_type) - qctl.qc_type--; - if (qctl.qc_type == UGQUOTA) { - fprintf(stderr, "error: user and group quotas can't be set " - "both\n"); + fprintf(stderr, "error: neither -u nor -g specified\n"); return CMD_HELP; } - if (qctl.qc_cmd == LUSTRE_Q_SETQUOTA) { - struct obd_dqblk *dqb = &qctl.qc_dqblk; + if (optind != argc - 1) { + fprintf(stderr, "error: unexpected parameters encountered\n"); + return CMD_HELP; + } - if (optind + 6 != argc) - return CMD_HELP; + mnt = argv[optind]; + rc = llapi_quotactl(mnt, &qctl); + if (rc) { + if (*obd_type) + fprintf(stderr, "%s %s ", obd_type, + obd_uuid2str(&qctl.obd_uuid)); + fprintf(stderr, "setquota failed: %s\n", strerror(errno)); + return rc; + } - rc = name2id(&qctl.qc_id, argv[optind++], qctl.qc_type); - if (rc) { - fprintf(stderr, "error: find id for name %s failed: %s\n", - argv[optind - 1], strerror(errno)); - return CMD_HELP; - } + return 0; +} - ARG2INT(dqb->dqb_bsoftlimit, argv[optind++], "block-softlimit"); - ARG2INT(dqb->dqb_bhardlimit, argv[optind++], "block-hardlimit"); - ARG2INT(dqb->dqb_isoftlimit, argv[optind++], "inode-softlimit"); - ARG2INT(dqb->dqb_ihardlimit, argv[optind++], "inode-hardlimit"); +#define BSLIMIT (1 << 0) +#define BHLIMIT (1 << 1) +#define ISLIMIT (1 << 2) +#define IHLIMIT (1 << 3) - dqb->dqb_valid = QIF_LIMITS; - } else { - struct obd_dqinfo *dqi = &qctl.qc_dqinfo; +int lfs_setquota(int argc, char **argv) +{ + int c, rc; + struct if_quotactl qctl; + char *mnt, *obd_type = (char *)qctl.obd_type; + struct obd_dqblk *dqb = &qctl.qc_dqblk; + struct option long_opts[] = { + {"user", required_argument, 0, 'u'}, + {"group", required_argument, 0, 'g'}, + {"block-softlimit", required_argument, 0, 'b'}, + {"block-hardlimit", required_argument, 0, 'B'}, + {"inode-softlimit", required_argument, 0, 'i'}, + {"inode-hardlimit", required_argument, 0, 'I'}, + {0, 0, 0, 0} + }; + unsigned limit_mask = 0; - if (optind + 3 != argc) - return CMD_HELP; + if (has_times_option(argc, argv)) + return lfs_setquota_times(argc, argv); + + memset(&qctl, 0, sizeof(qctl)); + qctl.qc_cmd = LUSTRE_Q_SETQUOTA; + qctl.qc_type = UGQUOTA; /* UGQUOTA makes no sense for setquota, + * so it can be used as a marker that qc_type + * isn't reinitialized from command line */ - if ((dqi->dqi_bgrace = str2sec(argv[optind++])) == ULONG_MAX) { - fprintf(stderr, "error: bad %s: %s\n", "block-grace", argv[optind - 1]); + optind = 0; + while ((c = getopt_long(argc, argv, "u:g:b:B:i:I:", long_opts, NULL)) != -1) { + switch (c) { + case 'u': + case 'g': + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: -u and -g can't be used" + " more than once\n"); + return CMD_HELP; + } + qctl.qc_type = (c == 'u') ? USRQUOTA : GRPQUOTA; + rc = name2id(&qctl.qc_id, optarg, + (qctl.qc_type == USRQUOTA) ? 
USER : GROUP); + if (rc) { + fprintf(stderr, "error: unknown id %s\n", + optarg); + return CMD_HELP; + } + break; + case 'b': + ARG2ULL(dqb->dqb_bsoftlimit, optarg, 1024); + dqb->dqb_bsoftlimit >>= 10; + limit_mask |= BSLIMIT; + break; + case 'B': + ARG2ULL(dqb->dqb_bhardlimit, optarg, 1024); + dqb->dqb_bhardlimit >>= 10; + limit_mask |= BHLIMIT; + break; + case 'i': + ARG2ULL(dqb->dqb_isoftlimit, optarg, 1); + limit_mask |= ISLIMIT; + break; + case 'I': + ARG2ULL(dqb->dqb_ihardlimit, optarg, 1); + limit_mask |= IHLIMIT; + break; + default: /* getopt prints error message for us when opterr != 0 */ return CMD_HELP; } - if ((dqi->dqi_igrace = str2sec(argv[optind++])) == ULONG_MAX) { - fprintf(stderr, "error: bad %s: %s\n", "inode-grace", argv[optind - 1]); + } + + if (qctl.qc_type == UGQUOTA) { + fprintf(stderr, "error: neither -u nor -g are specified\n"); + return CMD_HELP; + } + + if (optind != argc - 1) { + fprintf(stderr, "error: unexpected parameters encountered\n"); + return CMD_HELP; + } + + mnt = argv[optind]; + + if ((!(limit_mask & BHLIMIT) ^ !(limit_mask & BSLIMIT)) || + (!(limit_mask & IHLIMIT) ^ !(limit_mask & ISLIMIT))) { + /* sigh, we can't just set blimits/ilimits */ + struct if_quotactl tmp_qctl = {.qc_cmd = LUSTRE_Q_GETQUOTA, + .qc_type = qctl.qc_type, + .qc_id = qctl.qc_id}; + + rc = llapi_quotactl(mnt, &tmp_qctl); + if (rc < 0) { + fprintf(stderr, "error: getquota failed\n"); return CMD_HELP; } + + if (!(limit_mask & BHLIMIT)) + dqb->dqb_bhardlimit = tmp_qctl.qc_dqblk.dqb_bhardlimit; + if (!(limit_mask & BSLIMIT)) + dqb->dqb_bsoftlimit = tmp_qctl.qc_dqblk.dqb_bsoftlimit; + if (!(limit_mask & IHLIMIT)) + dqb->dqb_ihardlimit = tmp_qctl.qc_dqblk.dqb_ihardlimit; + if (!(limit_mask & ISLIMIT)) + dqb->dqb_isoftlimit = tmp_qctl.qc_dqblk.dqb_isoftlimit; } - mnt = argv[optind]; + dqb->dqb_valid |= (limit_mask & (BHLIMIT | BSLIMIT)) ? QIF_BLIMITS : 0; + dqb->dqb_valid |= (limit_mask & (IHLIMIT | ISLIMIT)) ? 
QIF_ILIMITS : 0; rc = llapi_quotactl(mnt, &qctl); if (rc) { if (*obd_type) - fprintf(stderr, "%s %s ", obd_type, obd_uuid); + fprintf(stderr, "%s %s ", obd_type, + obd_uuid2str(&qctl.obd_uuid)); fprintf(stderr, "setquota failed: %s\n", strerror(errno)); return rc; } @@ -1741,7 +1950,7 @@ static void print_quota_title(char *name, struct if_quotactl *qctl) "files", "quota", "limit", "grace"); } -static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) +static void print_quota(char *mnt, struct if_quotactl *qctl) { time_t now; @@ -1752,10 +1961,10 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) struct obd_dqblk *dqb = &qctl->qc_dqblk; if (dqb->dqb_bhardlimit && - toqb(dqb->dqb_curspace) > dqb->dqb_bhardlimit) { + toqb(dqb->dqb_curspace) >= dqb->dqb_bhardlimit) { bover = 1; } else if (dqb->dqb_bsoftlimit && - toqb(dqb->dqb_curspace) > dqb->dqb_bsoftlimit) { + toqb(dqb->dqb_curspace) >= dqb->dqb_bsoftlimit) { if (dqb->dqb_btime > now) { bover = 2; } else { @@ -1764,10 +1973,10 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) } if (dqb->dqb_ihardlimit && - dqb->dqb_curinodes > dqb->dqb_ihardlimit) { + dqb->dqb_curinodes >= dqb->dqb_ihardlimit) { iover = 1; } else if (dqb->dqb_isoftlimit && - dqb->dqb_curinodes > dqb->dqb_isoftlimit) { + dqb->dqb_curinodes >= dqb->dqb_isoftlimit) { if (dqb->dqb_btime > now) { iover = 2; } else { @@ -1789,10 +1998,16 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) if (bover) diff2str(dqb->dqb_btime, timebuf, now); - - sprintf(numbuf[0], LPU64, toqb(dqb->dqb_curspace)); - sprintf(numbuf[1], LPU64, dqb->dqb_bsoftlimit); - sprintf(numbuf[2], LPU64, dqb->dqb_bhardlimit); + sprintf(numbuf[0], (dqb->dqb_valid & QIF_SPACE) ? + LPU64 : "["LPU64"]", toqb(dqb->dqb_curspace)); + if (qctl->qc_valid == QC_GENERAL) + sprintf(numbuf[1], (dqb->dqb_valid & QIF_BLIMITS) + ? LPU64 : "["LPU64"]", + dqb->dqb_bsoftlimit); + else + sprintf(numbuf[1], "%s", ""); + sprintf(numbuf[2], (dqb->dqb_valid & QIF_BLIMITS) + ? LPU64 : "["LPU64"]", dqb->dqb_bhardlimit); printf(" %7s%c %6s %7s %7s", numbuf[0], bover ? '*' : ' ', numbuf[1], numbuf[2], bover > 1 ? timebuf : ""); @@ -1800,10 +2015,17 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) if (iover) diff2str(dqb->dqb_itime, timebuf, now); - sprintf(numbuf[0], LPU64, dqb->dqb_curinodes); - sprintf(numbuf[1], LPU64, dqb->dqb_isoftlimit); - sprintf(numbuf[2], LPU64, dqb->dqb_ihardlimit); - if (!ost_only) + sprintf(numbuf[0], (dqb->dqb_valid & QIF_INODES) ? + LPU64 : "["LPU64"]", dqb->dqb_curinodes); + if (qctl->qc_valid == QC_GENERAL) + sprintf(numbuf[1], (dqb->dqb_valid & QIF_ILIMITS) + ? LPU64 : "["LPU64"]", + dqb->dqb_isoftlimit); + else + sprintf(numbuf[1], "%s", ""); + sprintf(numbuf[2], (dqb->dqb_valid & QIF_ILIMITS) ? + LPU64 : "["LPU64"]", dqb->dqb_ihardlimit); + if (qctl->qc_valid != QC_OSTIDX) printf(" %7s%c %6s %7s %7s", numbuf[0], iover ? '*' : ' ', numbuf[1], numbuf[2], iover > 1 ? 
timebuf : ""); @@ -1821,103 +2043,89 @@ static void print_quota(char *mnt, struct if_quotactl *qctl, int ost_only) } } -static void print_mds_quota(char *mnt, struct if_quotactl *qctl) +static int print_obd_quota(char *mnt, struct if_quotactl *qctl, int is_mdt) { - int rc; + int rc = 0, rc1 = 0, count = 0; + __u32 valid = qctl->qc_valid; - /* XXX: this is a flag to mark that only mds quota is wanted */ - qctl->qc_dqblk.dqb_valid = 1; - rc = llapi_quotactl(mnt, qctl); + rc = llapi_get_obd_count(mnt, &count, is_mdt); if (rc) { - fprintf(stderr, "quotactl failed: %s\n", strerror(errno)); - return; - } - qctl->qc_dqblk.dqb_valid = 0; - - print_quota(qctl->obd_uuid.uuid, qctl, 0); -} - -static void print_lov_quota(char *mnt, struct if_quotactl *qctl) -{ - DIR *dir; - struct obd_uuid *uuids = NULL, *uuidp; - int obdcount = 1024; - int i, rc; - - dir = opendir(mnt); - if (!dir) { - fprintf(stderr, "open %s failed: %s\n", mnt, strerror(errno)); - return; - } - - uuids = (struct obd_uuid *)malloc(INIT_ALLOC_NUM_OSTS * - sizeof(struct obd_uuid)); - if (uuids == NULL) - goto out; - -retry_get_uuids: - rc = llapi_lov_get_uuids(dirfd(dir), uuids, &obdcount); - if (rc != 0) { - struct obd_uuid *uuids_temp; - - if (rc == -EOVERFLOW) { - uuids_temp = realloc(uuids, obdcount * - sizeof(struct obd_uuid)); - if (uuids_temp != NULL) - goto retry_get_uuids; - else - rc = -ENOMEM; - } - - fprintf(stderr, "get ost uuid failed: %s\n", strerror(rc)); - goto out; + fprintf(stderr, "can not get %s count: %s\n", + is_mdt ? "mdt": "ost", strerror(errno)); + return rc; } - for (i = 0, uuidp = uuids; i < obdcount; i++, uuidp++) { - memcpy(&qctl->obd_uuid, uuidp, sizeof(*uuidp)); - - /* XXX clear this flag to get quota from osts */ - qctl->qc_dqblk.dqb_valid = 0; + for (qctl->qc_idx = 0; qctl->qc_idx < count; qctl->qc_idx++) { + qctl->qc_valid = is_mdt ? QC_MDTIDX : QC_OSTIDX; rc = llapi_quotactl(mnt, qctl); if (rc) { - fprintf(stderr, "%s quotactl failed: %s\n", - uuidp->uuid, strerror(errno)); + /* It is remote client case. */ + if (errno == EOPNOTSUPP) { + rc = 0; + goto out; + } + + if (!rc1) + rc1 = rc; + fprintf(stderr, "quotactl %s%d failed.\n", + is_mdt ? "mdt": "ost", qctl->qc_idx); continue; } - print_quota(uuidp->uuid, qctl, 1); + print_quota(obd_uuid2str(&qctl->obd_uuid), qctl); } out: - closedir(dir); - return; + qctl->qc_valid = valid; + return rc ? 
: rc1; } static int lfs_quota(int argc, char **argv) { int c; - char *name = NULL, *mnt; + char *mnt, *name = NULL; struct if_quotactl qctl = { .qc_cmd = LUSTRE_Q_GETQUOTA, - .qc_type = 0x01 }; - char *obd_type = qctl.obd_type; - char *obd_uuid = qctl.obd_uuid.uuid; - int rc; + .qc_type = UGQUOTA }; + char *obd_type = (char *)qctl.obd_type; + char *obd_uuid = (char *)qctl.obd_uuid.uuid; + int rc, rc1 = 0, rc2 = 0, rc3 = 0, verbose = 0, pass = 0; + __u32 valid = QC_GENERAL, idx = 0; optind = 0; - while ((c = getopt(argc, argv, "ugto:")) != -1) { + while ((c = getopt(argc, argv, "ugto:i:I:v")) != -1) { switch (c) { case 'u': - qctl.qc_type = 0x01; + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: use either -u or -g\n"); + return CMD_HELP; + } + qctl.qc_type = USRQUOTA; break; case 'g': - qctl.qc_type = 0x02; + if (qctl.qc_type != UGQUOTA) { + fprintf(stderr, "error: use either -u or -g\n"); + return CMD_HELP; + } + qctl.qc_type = GRPQUOTA; break; case 't': qctl.qc_cmd = LUSTRE_Q_GETINFO; break; case 'o': + valid = qctl.qc_valid = QC_UUID; strncpy(obd_uuid, optarg, sizeof(qctl.obd_uuid)); break; + case 'i': + valid = qctl.qc_valid = QC_MDTIDX; + idx = qctl.qc_idx = atoi(optarg); + break; + case 'I': + valid = qctl.qc_valid = QC_OSTIDX; + idx = qctl.qc_idx = atoi(optarg); + break; + case 'v': + verbose = 1; + break; default: fprintf(stderr, "error: %s: option '-%c' " "unrecognized\n", argv[0], c); @@ -1925,57 +2133,80 @@ static int lfs_quota(int argc, char **argv) } } - if (qctl.qc_type) - qctl.qc_type--; - - - if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) { + /* current uid/gid info for "lfs quota /path/to/lustre/mount" */ + if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA && qctl.qc_type == UGQUOTA && + optind == argc - 1) { +ug_output: + memset(&qctl, 0, sizeof(qctl)); /* spoiled by print_*_quota */ + qctl.qc_cmd = LUSTRE_Q_GETQUOTA; + qctl.qc_valid = valid; + qctl.qc_idx = idx; + if (pass++ == 0) { + qctl.qc_type = USRQUOTA; + qctl.qc_id = geteuid(); + } else { + qctl.qc_type = GRPQUOTA; + qctl.qc_id = getegid(); + } + rc = id2name(&name, qctl.qc_id, + (qctl.qc_type == USRQUOTA) ? USER : GROUP); + if (rc) + name = ""; + } else if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) { if (optind + 2 != argc) { fprintf(stderr, "error: missing quota argument(s)\n"); return CMD_HELP; } name = argv[optind++]; - rc = name2id(&qctl.qc_id, name, qctl.qc_type); + rc = name2id(&qctl.qc_id, name, + (qctl.qc_type == USRQUOTA) ? USER : GROUP); if (rc) { fprintf(stderr,"error: can't find id for name %s: %s\n", name, strerror(errno)); return CMD_HELP; } - print_quota_title(name, &qctl); } else if (optind + 1 != argc) { fprintf(stderr, "error: missing quota info argument(s)\n"); return CMD_HELP; } + if (qctl.qc_cmd == LUSTRE_Q_GETQUOTA) + print_quota_title(name, &qctl); + mnt = argv[optind]; - rc = llapi_quotactl(mnt, &qctl); - if (rc) { - if (*obd_type) - fprintf(stderr, "%s %s ", obd_type, obd_uuid); - fprintf(stderr, "quota failed: %s\n", strerror(errno)); - return rc; + rc1 = llapi_quotactl(mnt, &qctl); + if (rc1 == -1 && errno == ESRCH) { + fprintf(stderr, "\n%s quotas are not enabled.\n", + qctl.qc_type == USRQUOTA ? 
"user" : "group"); + goto out; } + if (rc1 && *obd_type) + fprintf(stderr, "%s %s ", obd_type, obd_uuid); - if (!name) - rc = id2name(&name, getuid(), qctl.qc_type); + if (qctl.qc_valid != QC_GENERAL) + mnt = obd_uuid2str(&qctl.obd_uuid); - if (*obd_uuid) { - mnt = ""; - name = obd_uuid; + print_quota(mnt, &qctl); + + if (qctl.qc_valid == QC_GENERAL && qctl.qc_cmd != LUSTRE_Q_GETINFO && verbose) { + rc2 = print_obd_quota(mnt, &qctl, 1); + rc3 = print_obd_quota(mnt, &qctl, 0); } - print_quota(mnt, &qctl, 0); + if (rc1 || rc2 || rc3) + printf("Some errors happened when getting quota info. " + "Some devices may be not working or deactivated. " + "The data in \"[]\" is inaccurate.\n"); - if (!*obd_uuid && qctl.qc_cmd != LUSTRE_Q_GETINFO) { - print_mds_quota(mnt, &qctl); - print_lov_quota(mnt, &qctl); - } +out: + if (pass == 1) + goto ug_output; return 0; } -#endif /* HAVE_QUOTA_SUPPORT */ +#endif /* HAVE_SYS_QUOTA_H! */ static int flushctx_ioctl(char *mp) { diff --git a/lustre/utils/liblustreapi.c b/lustre/utils/liblustreapi.c index 9aad868..3856947 100644 --- a/lustre/utils/liblustreapi.c +++ b/lustre/utils/liblustreapi.c @@ -160,18 +160,25 @@ void llapi_printf(int level, char *fmt, ...) va_end(args); } +/** + * size_units is unchanged if no specifier used + */ int parse_size(char *optarg, unsigned long long *size, - unsigned long long *size_units) + unsigned long long *size_units, int bytes_spec) { char *end; - *size = strtoul(optarg, &end, 0); + *size = strtoull(optarg, &end, 0); if (*end != '\0') { if ((*end == 'b') && *(end+1) == '\0' && - (*size & (~0ULL << (64 - 9))) == 0) { + (*size & (~0ULL << (64 - 9))) == 0 && + !bytes_spec) { *size <<= 9; *size_units = 1 << 9; + } else if ((*end == 'b') && *(end+1) == '\0' && + bytes_spec) { + *size_units = 1; } else if ((*end == 'k' || *end == 'K') && *(end+1) == '\0' && (*size & (~0ULL << (64 - 10))) == 0) { @@ -651,6 +658,24 @@ int llapi_lov_get_uuids(int fd, struct obd_uuid *uuidp, int *ost_count) return rc; } +int llapi_get_obd_count(char *mnt, int *count, int is_mdt) +{ + DIR *root; + int rc; + + root = opendir(mnt); + if (!root) { + llapi_err(LLAPI_MSG_ERROR, "open %s failed", mnt); + return -1; + } + + *count = is_mdt; + rc = ioctl(dirfd(root), LL_IOC_GETOBDCOUNT, count); + + closedir(root); + return rc; +} + /* Here, param->obduuid points to a single obduuid, the index of which is * returned in param->obdindex */ static int setup_obd_uuid(DIR *dir, char *dname, struct find_param *param) @@ -1201,7 +1226,7 @@ err: * @mds indicates if this is MDS timestamps and there are attributes on OSTs. * * The result is -1 if it does not match, 0 if not yet clear, 1 if matches. 
- * The table bolow gives the answers for the specified parameters (value and
+ * The table below gives the answers for the specified parameters (value and
  * sign), 1st column is the answer for the MDS value, the 2nd is for the OST:
  * --------------------------------------
  * 1 | file > limit; sign > 0 | -1 / -1 |
diff --git a/lustre/utils/llog_reader.c b/lustre/utils/llog_reader.c
index d2f1f1a..ae62a39 100644
--- a/lustre/utils/llog_reader.c
+++ b/lustre/utils/llog_reader.c
@@ -363,6 +363,11 @@ void print_lustre_cfg(struct lustre_cfg *lcfg, int *skip)
         print_1_cfg(lcfg);
         break;
     }
+    case(LCFG_SPTLRPC_CONF):{
+        printf("sptlrpc_conf ");
+        print_1_cfg(lcfg);
+        break;
+    }
     case(LCFG_MARKER):{
         struct cfg_marker *marker = lustre_cfg_buf(lcfg, 1);
         char createtime[26], canceltime[26] = "";
diff --git a/lustre/utils/lmc b/lustre/utils/lmc
index d106c64..861e318 100755
--- a/lustre/utils/lmc
+++ b/lustre/utils/lmc
@@ -222,9 +222,9 @@ lmc_options = [
 ('quota', """
 quotaon: enable quota, only u|g|ug is supported now.
 iunit: the unit for slave to acquire/release inode quota from/to master.
-       Int type (>0), default value in Lustre is 5000 inodes.
+       Int type (>0), default value in Lustre is 5120 inodes.
 bunit: the unit for slave to acquire/release block quota from/to master.
-       Mbytes (>0), default value in Lustre is 100(Mbytes).
+       Mbytes (>0), default value in Lustre is 128(Mbytes).
 itune: used to tune the threshold. When inode quota usage reaches the threshold,
        slave should acquire/release inode quota from/to master. Int type
        (100 > btune > 0), default value in Lustre is 50 (percentage).
diff --git a/lustre/utils/mkfs_lustre.c b/lustre/utils/mkfs_lustre.c
index 0bd83b7..8f54f8b 100644
--- a/lustre/utils/mkfs_lustre.c
+++ b/lustre/utils/mkfs_lustre.c
@@ -98,6 +98,7 @@ char *progname;
 int verbose = 1;
 static int print_only = 0;
 static int failover = 0;
+static int upgrade_to_18 = 0;

 void usage(FILE *out)
 {
@@ -130,6 +131,7 @@ void usage(FILE *out)
         "\t\t--mkfsoptions= : format options\n"
         "\t\t--reformat: overwrite an existing disk\n"
         "\t\t--stripe-count-hint=#N : used for optimizing MDT inode size\n"
+        "\t\t--iam-dir: make use of IAM directory format on backfs, incompatible with ext3.\n"
 #else
         "\t\t--erase-params : erase all old parameter settings\n"
         "\t\t--nomgs: turn off MGS service on this MDT\n"
@@ -716,7 +718,7 @@ void print_ldd(char *str, struct lustre_disk_data *ldd)
     printf("Lustre FS: %s\n", ldd->ldd_fsname);
     printf("Mount type: %s\n", MT_STR(ldd));
     printf("Flags: %#x\n", ldd->ldd_flags);
-    printf(" (%s%s%s%s%s%s%s%s)\n",
+    printf(" (%s%s%s%s%s%s%s%s%s)\n",
            IS_MDT(ldd) ? "MDT ":"",
            IS_OST(ldd) ? "OST ":"",
            IS_MGS(ldd) ? "MGS ":"",
@@ -724,6 +726,7 @@ void print_ldd(char *str, struct lustre_disk_data *ldd)
            ldd->ldd_flags & LDD_F_VIRGIN ? "first_time ":"",
            ldd->ldd_flags & LDD_F_UPDATE ? "update ":"",
            ldd->ldd_flags & LDD_F_WRITECONF ? "writeconf ":"",
+           ldd->ldd_flags & LDD_F_IAM_DIR ? "IAM_dir_format ":"",
            ldd->ldd_flags & LDD_F_UPGRADE14 ?
"upgrade1.4 ":""); printf("Persistent mount opts: %s\n", ldd->ldd_mount_opts); printf("Parameters:%s\n", ldd->ldd_params); @@ -732,6 +735,67 @@ void print_ldd(char *str, struct lustre_disk_data *ldd) printf("\n"); } +static int touch_file(char *filename) +{ + int fd; + + if (filename == NULL) { + return 1; + } + + fd = open(filename, O_CREAT | O_TRUNC, 0600); + if (fd < 0) { + return 1; + } else { + close(fd); + return 0; + } +} + +/* keep it less than LL_FID_NAMELEN */ +#define DUMMY_FILE_NAME_LEN 25 +#define EXT3_DIRENT_SIZE DUMMY_FILE_NAME_LEN + +/* Need to add these many entries to this directory to make HTREE dir. */ +#define MIN_ENTRIES_REQ_FOR_HTREE ((L_BLOCK_SIZE / EXT3_DIRENT_SIZE)) + +static int add_dummy_files(char *dir) +{ + char fpname[PATH_MAX]; + int i; + int rc; + + for (i = 0; i < MIN_ENTRIES_REQ_FOR_HTREE; i++) { + snprintf(fpname, PATH_MAX, "%s/%0*d", dir, + DUMMY_FILE_NAME_LEN, i); + + rc = touch_file(fpname); + if (rc && rc != -EEXIST) { + fprintf(stderr, + "%s: Can't create dummy file %s: %s\n", + progname, fpname , strerror(errno)); + return rc; + } + } + return 0; +} + +static int __l_mkdir(char * filepnm, int mode , struct mkfs_opts *mop) +{ + int ret; + + ret = mkdir(filepnm, mode); + if (ret && ret != -EEXIST) + return ret; + + /* IAM mode supports ext3 directories of HTREE type only. So add dummy + * entries to new directory to create htree type of container for + * this directory. */ + if (mop->mo_ldd.ldd_flags & LDD_F_IAM_DIR) + return add_dummy_files(filepnm); + return 0; +} + /* Write the server config files */ int write_local_files(struct mkfs_opts *mop) { @@ -766,7 +830,7 @@ int write_local_files(struct mkfs_opts *mop) /* Set up initial directories */ sprintf(filepnm, "%s/%s", mntpt, MOUNT_CONFIGS_DIR); - ret = mkdir(filepnm, 0777); + ret = __l_mkdir(filepnm, 0777, mop); if ((ret != 0) && (errno != EEXIST)) { fprintf(stderr, "%s: Can't make configs dir %s (%s)\n", progname, filepnm, strerror(errno)); @@ -775,16 +839,6 @@ int write_local_files(struct mkfs_opts *mop) ret = 0; } - sprintf(filepnm, "%s/%s", mntpt, "ROOT"); - ret = mkdir(filepnm, 0777); - if ((ret != 0) && (errno != EEXIST)) { - fprintf(stderr, "%s: Can't make ROOT dir %s (%s)\n", - progname, filepnm, strerror(errno)); - goto out_umnt; - } else if (errno == EEXIST) { - ret = 0; - } - /* Save the persistent mount data into a file. Lustre must pre-read this file to get the real mount options. 
@@ -766,7 +830,7 @@ int write_local_files(struct mkfs_opts *mop)
 
         /* Set up initial directories */
         sprintf(filepnm, "%s/%s", mntpt, MOUNT_CONFIGS_DIR);
-        ret = mkdir(filepnm, 0777);
+        ret = __l_mkdir(filepnm, 0777, mop);
         if ((ret != 0) && (errno != EEXIST)) {
                 fprintf(stderr, "%s: Can't make configs dir %s (%s)\n",
                         progname, filepnm, strerror(errno));
@@ -775,16 +839,6 @@ int write_local_files(struct mkfs_opts *mop)
                 ret = 0;
         }
 
-        sprintf(filepnm, "%s/%s", mntpt, "ROOT");
-        ret = mkdir(filepnm, 0777);
-        if ((ret != 0) && (errno != EEXIST)) {
-                fprintf(stderr, "%s: Can't make ROOT dir %s (%s)\n",
-                        progname, filepnm, strerror(errno));
-                goto out_umnt;
-        } else if (errno == EEXIST) {
-                ret = 0;
-        }
-
         /* Save the persistent mount data into a file. Lustre must pre-read
            this file to get the real mount options. */
         vprint("Writing %s\n", MOUNT_DATA_FILE);
@@ -797,7 +851,6 @@ int write_local_files(struct mkfs_opts *mop)
         }
         fwrite(&mop->mo_ldd, sizeof(mop->mo_ldd), 1, filep);
         fclose(filep);
-
         /* COMPAT_146 */
 #ifdef TUNEFS
         /* Check for upgrade */
@@ -859,7 +912,6 @@ int write_local_files(struct mkfs_opts *mop)
 #endif
         /* end COMPAT_146 */
-
 out_umnt:
         umount(mntpt);
 out_rmdir:
@@ -1102,6 +1154,7 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop,
                char **mountopts)
 {
         static struct option long_opt[] = {
+                {"iam-dir", 0, 0, 'a'},
                 {"backfstype", 1, 0, 'b'},
                 {"stripe-count-hint", 1, 0, 'c'},
                 {"comment", 1, 0, 'u'},
@@ -1129,6 +1182,7 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop,
                 {"reformat", 0, 0, 'r'},
                 {"verbose", 0, 0, 'v'},
                 {"writeconf", 0, 0, 'w'},
+                {"upgrade_to_18", 0, 0, 'U'},
                 {0, 0, 0, 0}
         };
         char *optstring = "b:c:C:d:ef:Ghi:k:L:m:MnNo:Op:Pqru:vw";
@@ -1138,6 +1192,11 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop,
         while ((opt = getopt_long(argc, argv, optstring, long_opt, &longidx)) != EOF) {
                 switch (opt) {
+                case 'a': {
+                        if (IS_MDT(&mop->mo_ldd))
+                                mop->mo_ldd.ldd_flags |= LDD_F_IAM_DIR;
+                        break;
+                }
                 case 'b': {
                         int i = 0;
                         while (i < LDD_MT_LAST) {
@@ -1289,6 +1348,9 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop,
                 case 'w':
                         mop->mo_ldd.ldd_flags |= LDD_F_WRITECONF;
                         break;
+                case 'U':
+                        upgrade_to_18 = 1;
+                        break;
                 default:
                         if (opt != '?') {
                                 fatal();
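
Both new switches above ride on the standard getopt_long() table in
parse_opts(). A stripped-down sketch showing only the two additions (the
rest of the table and the real handlers are elided; the printf bodies
stand in for the real flag updates):

    #include <getopt.h>
    #include <stdio.h>

    int main(int argc, char *const argv[])
    {
            static struct option long_opt[] = {
                    {"iam-dir",       0, 0, 'a'},
                    {"upgrade_to_18", 0, 0, 'U'},
                    {0, 0, 0, 0}
            };
            int opt, longidx;

            while ((opt = getopt_long(argc, argv, "", long_opt,
                                      &longidx)) != EOF) {
                    switch (opt) {
                    case 'a':
                            printf("would set LDD_F_IAM_DIR\n");
                            break;
                    case 'U':
                            printf("would set upgrade_to_18 = 1\n");
                            break;
                    }
            }
            return 0;
    }

Neither 'a' nor 'U' appears in optstring, so both options are long-only,
and --iam-dir is deliberately a no-op unless the target is an MDT.
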
@@ -1308,227 +1370,6 @@ int parse_opts(int argc, char *const argv[], struct mkfs_opts *mop, char **mountopts)
 
         return 0;
 }
 
-#include 
-
-#define LDISKFS_IOC_GETVERSION _IOR('f', 3, long)
-
-#ifndef TUNEFS /* mkfs.lustre */
-static int mkfs_iam_insert(int key_need_convert, char *keybuf,
-                           int rec_need_convert, char *recbuf, char *filename)
-{
-        int fd;
-        int ret;
-        struct iam_uapi_info ua;
-
-        fd = iam_open(filename, &ua);
-        if (fd < 0) {
-                fprintf(stderr, "failed to iam_open %s\n", filename);
-                return 1;
-        }
-
-        ret = iam_insert(fd, &ua,
-                         key_need_convert, keybuf,
-                         rec_need_convert, recbuf);
-        iam_close(fd);
-        if (ret) {
-                fprintf(stderr, "failed to iam_insert %s\n", filename);
-                return 1;
-        } else {
-                return 0;
-        }
-}
-
-static int touch_file(char *filename)
-{
-        int fd;
-
-        if (filename == NULL) {
-                return 1;
-        }
-
-        fd = open(filename, O_CREAT | O_TRUNC, 0600);
-        if (fd < 0) {
-                return 1;
-        } else {
-                close(fd);
-                return 0;
-        }
-}
-
-static int get_generation(char *filename, unsigned long *result)
-{
-        int fd;
-        int ret;
-
-        if (filename == NULL) {
-                return 1;
-        }
-
-        fd = open(filename, O_RDONLY);
-        if (fd < 0) {
-                fprintf(stderr, "%s: failed to open %s\n",
-                        __FUNCTION__, filename);
-                return 1;
-        }
-
-        ret = ioctl(fd, LDISKFS_IOC_GETVERSION, result);
-        close(fd);
-
-        return ((ret < 0) ? ret : 0);
-}
-
-static int mkfs_mdt(struct mkfs_opts *mop)
-{
-        char mntpt[] = "/tmp/mntXXXXXX";
-        char fstype[] = "ldiskfs";
-        char filepnm[128];
-        char recbuf[64];
-        char *source;
-        int ret;
-        unsigned long generation;
-        struct stat st;
-
-        source = mop->mo_device;
-        if (mop->mo_flags & MO_IS_LOOP) {
-                source = mop->mo_loopdev;
-        }
-
-        if ((source == NULL) || (*source == 0)) {
-                return 1;
-        }
-
-        if (!mkdtemp(mntpt)) {
-                fprintf(stderr, "%s: failed to mkdtemp %s\n",
-                        __FUNCTION__, mntpt);
-                return errno;
-        }
-
-        ret = mount(source, mntpt, fstype, 0, NULL);
-        if (ret) {
-                goto out_rmdir;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "seq_ctl");
-        ret = touch_file(filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "seq_srv");
-        ret = touch_file(filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "last_received");
-        ret = touch_file(filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "lov_objid");
-        ret = touch_file(filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "root");
-        ret = iam_creat(filepnm, FMT_LVAR, L_BLOCK_SIZE, 4, 17, 4);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "fld");
-        ret = iam_creat(filepnm, FMT_LFIX, L_BLOCK_SIZE, 8, 8, 4);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "orphans");
-        ret = iam_creat(filepnm, FMT_LFIX, L_BLOCK_SIZE, 20, 8, 4);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "oi.16");
-        ret = iam_creat(filepnm, FMT_LFIX, L_BLOCK_SIZE, 16, 8, 4);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "oi.5");
-        ret = iam_creat(filepnm, FMT_LFIX, L_BLOCK_SIZE, 5, 8, 4);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, CAPA_KEYS);
-        ret = touch_file(filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        umount(mntpt);
-        ret = mount(source, mntpt, fstype, 0, NULL);
-        if (ret) {
-                goto out_rmdir;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "root");
-        ret = iam_polymorph(filepnm, 040755);
-        if (ret) {
-                perror("IAM_IOC_POLYMORPH");
-                goto out_umount;
-        }
-
-        umount(mntpt);
-        ret = mount(source, mntpt, fstype, 0, NULL);
-        if (ret) {
-                goto out_rmdir;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "fld");
-        ret = mkfs_iam_insert(1, "0000000000000002", 1, "0000000000000000", filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        ret = mkfs_iam_insert(1, "0000000000000001", 1, "0000000000000000", filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(filepnm, sizeof(filepnm) - 1, "%s/%s", mntpt, "root");
-        ret = stat(filepnm, &st);
-        if (ret) {
-                goto out_umount;
-        }
-
-        ret = get_generation(filepnm, &generation);
-        if (ret) {
-                goto out_umount;
-        }
-
-        snprintf(recbuf, sizeof(recbuf) - 1, "110000000000000001%8.8x%8.8x",
-                 (unsigned int)st.st_ino, (unsigned int)generation);
-        ret = mkfs_iam_insert(0, ".", 1, recbuf, filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-        ret = mkfs_iam_insert(0, "..", 1, recbuf, filepnm);
-        if (ret) {
-                goto out_umount;
-        }
-
-out_umount:
-        umount(mntpt);
-out_rmdir:
-        rmdir(mntpt);
-        return ret;
-}
-#endif
-
 int main(int argc, char *const argv[])
 {
         struct mkfs_opts mop;
@@ -1758,16 +1599,6 @@ int main(int argc, char *const argv[])
                 goto out;
         }
 
-#ifndef TUNEFS /* mkfs.lustre */
-        if (IS_MDT(ldd)) {
-                ret = mkfs_mdt(&mop);
-                if (ret != 0) {
-                        fprintf(stderr, "failed to mkfs_mdt\n");
-                        goto out;
-                }
-        }
-#endif
-
 out:
         loop_cleanup(&mop);
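
The removed mkfs_mdt() above followed the common mkdtemp()/mount()/goto
unwinding idiom: every failure path jumps to a label that releases whatever
had been set up so far. A minimal self-contained sketch of just that
skeleton (do_on_mounted_fs and its arguments are invented; this is not part
of the patch):

    #include <stdlib.h>
    #include <sys/mount.h>
    #include <unistd.h>

    int do_on_mounted_fs(const char *dev, const char *fstype)
    {
            char mntpt[] = "/tmp/mntXXXXXX";
            int ret;

            if (!mkdtemp(mntpt))            /* scratch mount point */
                    return 1;

            ret = mount(dev, mntpt, fstype, 0, NULL);
            if (ret)
                    goto out_rmdir;

            /* ... per-target setup work would go here ... */

            umount(mntpt);
    out_rmdir:
            rmdir(mntpt);                   /* always remove the scratch dir */
            return ret;
    }
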
diff --git a/lustre/utils/mount_lustre.c b/lustre/utils/mount_lustre.c
index 0e08246..8bfddf2 100644
--- a/lustre/utils/mount_lustre.c
+++ b/lustre/utils/mount_lustre.c
@@ -88,6 +88,7 @@ void usage(FILE *out)
                 "\t-v|--verbose: print verbose config settings\n"
                 "\t<mntopt>: one or more comma separated of:\n"
                 "\t\t(no)flock,(no)user_xattr,(no)acl\n"
+                "\t\tabort_recov: abort server recovery handling\n"
                 "\t\tnosvc: only start MGC/MGS obds\n"
                 "\t\tnomgs: only start target obds, using existing MGS\n"
                 "\t\texclude=<ostname>[:<ostname>] : colon-separated list of "
diff --git a/lustre/utils/obd.c b/lustre/utils/obd.c
index c8e9059..e7abfce 100644
--- a/lustre/utils/obd.c
+++ b/lustre/utils/obd.c
@@ -207,9 +207,6 @@ out:
         if (rc) {
                 if (errno == ENOSYS)
                         fprintf(stderr, "Make sure cfg_device is set first.\n");
-                if (errno == EINVAL)
-                        fprintf(stderr, "cfg_device should be of the form "
-                                "'lustre-MDT0000'\n");
         }
         return rc;
 }
diff --git a/lustre/utils/obdiolib.c b/lustre/utils/obdiolib.c
index 64f0ce4..fa272db 100644
--- a/lustre/utils/obdiolib.c
+++ b/lustre/utils/obdiolib.c
@@ -152,6 +152,8 @@ obdio_pwrite (struct obdio_conn *conn, __u64 oid,
         conn->oc_data.ioc_obdo1.o_valid = OBD_MD_FLID | OBD_MD_FLTYPE | OBD_MD_FLMODE;
+        conn->oc_data.ioc_pbuf1 = (void*)1;
+        conn->oc_data.ioc_plen1 = 1;
         conn->oc_data.ioc_pbuf2 = buffer;
         conn->oc_data.ioc_plen2 = count;
         conn->oc_data.ioc_count = count;
diff --git a/lustre/utils/req-layout.c b/lustre/utils/req-layout.c
index dc366bd..e5fd0f8 100644
--- a/lustre/utils/req-layout.c
+++ b/lustre/utils/req-layout.c
@@ -50,7 +50,7 @@
 #define __REQ_LAYOUT_USER__ (1)
 
 #define lustre_swab_generic_32s NULL
-#define lustre_swab_lu_range NULL
+#define lustre_swab_lu_seq_range NULL
 #define lustre_swab_md_fld NULL
 #define lustre_swab_mdt_body NULL
 #define lustre_swab_mdt_epoch NULL
@@ -69,6 +69,7 @@
 #define lustre_swab_llog_hdr NULL
 #define lustre_swab_llogd_body NULL
 #define lustre_swab_obd_quotactl NULL
+#define lustre_swab_quota_adjust_qunit NULL
 #define lustre_swab_mgs_target_info NULL
 #define lustre_swab_niobuf_remote NULL
 #define lustre_swab_obd_ioobj NULL
diff --git a/lustre/utils/wirecheck.c b/lustre/utils/wirecheck.c
index 99c80a7..55d026b 100644
--- a/lustre/utils/wirecheck.c
+++ b/lustre/utils/wirecheck.c
@@ -206,8 +206,8 @@ static void check_obd_connect_data(void)
         CHECK_CDEFINE(OBD_CONNECT_JOIN);
         CHECK_CDEFINE(OBD_CONNECT_ATTRFID);
         CHECK_CDEFINE(OBD_CONNECT_NODEVOH);
-        CHECK_CDEFINE(OBD_CONNECT_LCL_CLIENT);
         CHECK_CDEFINE(OBD_CONNECT_RMT_CLIENT);
+        CHECK_CDEFINE(OBD_CONNECT_RMT_CLIENT_FORCE);
         CHECK_CDEFINE(OBD_CONNECT_BRW_SIZE);
         CHECK_CDEFINE(OBD_CONNECT_QUOTA64);
         CHECK_CDEFINE(OBD_CONNECT_MDS_CAPA);
@@ -914,6 +914,22 @@ check_llog_setattr_rec(void)
 }
 
 static void
+check_llog_setattr64_rec(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(llog_setattr64_rec);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_hdr);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_oid);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_ogen);
+        CHECK_MEMBER(llog_setattr64_rec, padding);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_uid);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_uid_h);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_gid);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_gid_h);
+        CHECK_MEMBER(llog_setattr64_rec, lsr_tail);
+}
+
+static void
 check_llog_size_change_rec(void)
 {
         BLANK_LINE();
@@ -1035,17 +1051,8 @@ check_qunit_data(void)
         CHECK_MEMBER(qunit_data, qd_id);
         CHECK_MEMBER(qunit_data, qd_flags);
         CHECK_MEMBER(qunit_data, qd_count);
-}
-
-static void
-check_qunit_data_old(void)
-{
-        BLANK_LINE();
-        CHECK_STRUCT(qunit_data_old);
-        CHECK_MEMBER(qunit_data_old, qd_id);
-        CHECK_MEMBER(qunit_data_old, qd_type);
-        CHECK_MEMBER(qunit_data_old, qd_count);
-        CHECK_MEMBER(qunit_data_old, qd_isblk);
+        CHECK_MEMBER(qunit_data, qd_qunit);
+        CHECK_MEMBER(qunit_data, padding);
 }
 
 static void
@@ -1106,6 +1113,18 @@ check_posix_acl_xattr_header(void)
 }
 
 static void
+check_quota_adjust_qunit(void)
+{
+        BLANK_LINE();
+        CHECK_STRUCT(quota_adjust_qunit);
+        CHECK_MEMBER(quota_adjust_qunit, qaq_flags);
+        CHECK_MEMBER(quota_adjust_qunit, qaq_id);
+        CHECK_MEMBER(quota_adjust_qunit, qaq_bunit_sz);
+        CHECK_MEMBER(quota_adjust_qunit, qaq_iunit_sz);
+        CHECK_MEMBER(quota_adjust_qunit, padding1);
+}
+
+static void
 check_ll_user_fiemap(void)
 {
         BLANK_LINE();
@@ -1253,6 +1272,7 @@ main(int argc, char **argv)
         CHECK_VALUE(OST_SYNC);
         CHECK_VALUE(OST_QUOTACHECK);
         CHECK_VALUE(OST_QUOTACTL);
+        CHECK_VALUE(OST_QUOTA_ADJUST_QUNIT);
         CHECK_VALUE(OST_LAST_OPC);
 
         CHECK_DEFINE(OBD_OBJECT_EOF);
@@ -1384,6 +1404,7 @@ main(int argc, char **argv)
         check_llog_orphan_rec();
         check_llog_unlink_rec();
         check_llog_setattr_rec();
+        check_llog_setattr64_rec();
         check_llog_size_change_rec();
         check_llog_gen();
         check_llog_gen_rec();
@@ -1394,7 +1415,7 @@ main(int argc, char **argv)
         check_llog_array_rec();
         check_mds_extent_desc();
         check_qunit_data();
-        check_qunit_data_old();
+        check_quota_adjust_qunit();
         check_mgs_target_info();
         check_lustre_disk_data();
         check_ll_user_fiemap();
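
Every CHECK_STRUCT/CHECK_MEMBER call above makes wirecheck emit the
offsetof()/sizeof() assertions that wiretest.c (next file) carries, so any
layout drift in a wire structure fails loudly on the new platform. A toy,
self-contained version of the same pattern using plain assert();
toy_wire_rec is a stand-in, not a real Lustre type:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    struct toy_wire_rec {
            uint32_t tw_id;     /* bytes 0..3  */
            uint32_t tw_flags;  /* bytes 4..7  */
            uint64_t tw_count;  /* bytes 8..15 */
    };

    int main(void)
    {
            /* The same properties the generated LASSERTF lines verify. */
            assert(sizeof(struct toy_wire_rec) == 16);
            assert(offsetof(struct toy_wire_rec, tw_id) == 0);
            assert(offsetof(struct toy_wire_rec, tw_flags) == 4);
            assert(offsetof(struct toy_wire_rec, tw_count) == 8);
            return 0;
    }
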
diff --git a/lustre/utils/wiretest.c b/lustre/utils/wiretest.c
index f5a777b..f881c82 100644
--- a/lustre/utils/wiretest.c
+++ b/lustre/utils/wiretest.c
@@ -63,8 +63,8 @@ void lustre_assert_wire_constants(void)
 {
         /* Wire protocol assertions generated by 'wirecheck'
          * (make -C lustre/utils newwiretest)
-         * running on Linux xlab.hostel 2.6.23.15-80.fc7 #1 SMP Sun Feb 10 17:29:10 EST 2008 i686 i68
-         * with gcc version 3.4.6 20060404 (Red Hat 3.4.6-7) */
+         * running on Linux vb1 2.6.18-build.1 #1 SMP Thu Mar 27 14:34:21 MDT 2008 i686 i686 i386 GNU
+         * with gcc version 4.1.2 20070626 (Red Hat 4.1.2-14) */
 
         /* Constants... */
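
The CLASSERT() and LASSERTF() lines in the hunks below pin protocol
constants at build or startup time. A classic C89 way to get the
compile-time variant is the duplicate-case-label trick; MY_CLASSERT and
the FLAG_* names are invented for this sketch, not taken from the patch:

    #include <stdio.h>

    /* Duplicate case labels make the build fail when cond is false:
     * "case 0" and "case (cond)" collide iff cond evaluates to 0. */
    #define MY_CLASSERT(cond) \
            do { switch (0) { case 0: case (cond): break; } } while (0)

    #define FLAG_A 0x00010000ULL
    #define FLAG_B 0x00020000ULL

    int main(void)
    {
            MY_CLASSERT(FLAG_A == 0x00010000ULL);
            MY_CLASSERT(FLAG_B == 0x00020000ULL);
            printf("flag values verified at compile time\n");
            return 0;
    }
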
@@ -126,7 +129,9 @@ void lustre_assert_wire_constants(void)
                  (long long)OST_QUOTACHECK);
         LASSERTF(OST_QUOTACTL == 19, " found %lld\n",
                  (long long)OST_QUOTACTL);
-        LASSERTF(OST_LAST_OPC == 20, " found %lld\n",
+        LASSERTF(OST_QUOTA_ADJUST_QUNIT == 20, " found %lld\n",
+                 (long long)OST_QUOTA_ADJUST_QUNIT);
+        LASSERTF(OST_LAST_OPC == 21, " found %lld\n",
                  (long long)OST_LAST_OPC);
         LASSERTF(OBD_OBJECT_EOF == 0xffffffffffffffffULL," found %lld\n",
                  (long long)OBD_OBJECT_EOF);
@@ -234,9 +236,9 @@ void lustre_assert_wire_constants(void)
                  (long long)LCK_NL);
         LASSERTF(LCK_GROUP == 64, " found %lld\n",
                  (long long)LCK_GROUP);
-        LASSERTF(LCK_MAXMODE == 65, " found %lld\n",
+        LASSERTF(LCK_MAXMODE == 129, " found %lld\n",
                  (long long)LCK_MAXMODE);
-        LASSERTF(LCK_MODE_NUM == 7, " found %lld\n",
+        LASSERTF(LCK_MODE_NUM == 8, " found %lld\n",
                  (long long)LCK_MODE_NUM);
         CLASSERT(LDLM_PLAIN == 10);
         CLASSERT(LDLM_EXTENT == 11);
@@ -250,9 +252,9 @@ void lustre_assert_wire_constants(void)
                  (long long)OBD_QC_CALLBACK);
         LASSERTF(OBD_LAST_OPC == 403, " found %lld\n",
                  (long long)OBD_LAST_OPC);
-        LASSERTF(QUOTA_DQACQ == 601, " found %lld\n",
+        LASSERTF(QUOTA_DQACQ == 901, " found %lld\n",
                  (long long)QUOTA_DQACQ);
-        LASSERTF(QUOTA_DQREL == 602, " found %lld\n",
+        LASSERTF(QUOTA_DQREL == 902, " found %lld\n",
                  (long long)QUOTA_DQREL);
         LASSERTF(MGS_CONNECT == 250, " found %lld\n",
                  (long long)MGS_CONNECT);
@@ -458,8 +460,8 @@ void lustre_assert_wire_constants(void)
         CLASSERT(OBD_CONNECT_JOIN == 0x00002000ULL);
         CLASSERT(OBD_CONNECT_ATTRFID == 0x00004000ULL);
         CLASSERT(OBD_CONNECT_NODEVOH == 0x00008000ULL);
-        CLASSERT(OBD_CONNECT_LCL_CLIENT == 0x00010000ULL);
-        CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00020000ULL);
+        CLASSERT(OBD_CONNECT_RMT_CLIENT == 0x00010000ULL);
+        CLASSERT(OBD_CONNECT_RMT_CLIENT_FORCE == 0x00020000ULL);
         CLASSERT(OBD_CONNECT_BRW_SIZE == 0x00040000ULL);
         CLASSERT(OBD_CONNECT_QUOTA64 == 0x00080000ULL);
         CLASSERT(OBD_CONNECT_MDS_CAPA == 0x00100000ULL);
@@ -697,6 +699,67 @@ void lustre_assert_wire_constants(void)
         LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
                  (long long)LOV_PATTERN_RAID1);
 
+        /* Checks for struct lov_mds_md_v3 */
+        LASSERTF((int)sizeof(struct lov_mds_md_v3) == 48, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_mds_md_v3));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_magic) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_magic));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_magic));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pattern) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pattern));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pattern));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_id) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_id));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_id));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_object_gr) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_object_gr));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_object_gr));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_size) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_size));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_size));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_stripe_count) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_stripe_count));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_stripe_count));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_pool_name) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_pool_name));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_pool_name));
+        LASSERTF((int)offsetof(struct lov_mds_md_v3, lmm_objects) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_mds_md_v3, lmm_objects));
+        LASSERTF((int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects) == 0, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_mds_md_v3 *)0)->lmm_objects));
+
+        /* Checks for struct lov_ost_data_v1 */
+        LASSERTF((int)sizeof(struct lov_ost_data_v1) == 24, " found %lld\n",
+                 (long long)(int)sizeof(struct lov_ost_data_v1));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_id) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_object_id));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_id));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_object_gr) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_object_gr));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_object_gr));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_gen) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_gen));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_gen));
+        LASSERTF((int)offsetof(struct lov_ost_data_v1, l_ost_idx) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct lov_ost_data_v1, l_ost_idx));
+        LASSERTF((int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct lov_ost_data_v1 *)0)->l_ost_idx));
+        CLASSERT(LOV_MAGIC_V3 == 0x0BD30BD0);
+        LASSERTF(LOV_PATTERN_RAID0 == 1, " found %lld\n",
+                 (long long)LOV_PATTERN_RAID0);
+        LASSERTF(LOV_PATTERN_RAID1 == 2, " found %lld\n",
+                 (long long)LOV_PATTERN_RAID1);
+
         /* Checks for struct lov_mds_md_join */
         LASSERTF((int)sizeof(struct lov_mds_md_join) == 56, " found %lld\n",
                  (long long)(int)sizeof(struct lov_mds_md_join));
@@ -1581,6 +1644,38 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct ost_lvb *)0)->lvb_blocks) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct ost_lvb *)0)->lvb_blocks));
 
+        /* Checks for struct cfg_marker */
+        LASSERTF((int)sizeof(struct cfg_marker) == 160, " found %lld\n",
+                 (long long)(int)sizeof(struct cfg_marker));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_step) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_step));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_step) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_step));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_flags) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_flags));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_flags));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_vers) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_vers));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_vers) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_vers));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_createtime) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_createtime));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_createtime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_createtime));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_canceltime) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_canceltime));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_canceltime) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_canceltime));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_tgtname) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_tgtname));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_tgtname) == 64, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_tgtname));
+        LASSERTF((int)offsetof(struct cfg_marker, cm_comment) == 96, " found %lld\n",
+                 (long long)(int)offsetof(struct cfg_marker, cm_comment));
+        LASSERTF((int)sizeof(((struct cfg_marker *)0)->cm_comment) == 64, " found %lld\n",
+                 (long long)(int)sizeof(((struct cfg_marker *)0)->cm_comment));
+
         /* Checks for struct llog_logid */
         LASSERTF((int)sizeof(struct llog_logid) == 20, " found %lld\n",
                  (long long)(int)sizeof(struct llog_logid));
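
In the llog_setattr64_rec checks below, lsr_uid/lsr_uid_h and
lsr_gid/lsr_gid_h are consecutive 32-bit fields, presumably the low and
high halves of 64-bit identifiers (hence the "64" in the record name). A
sketch of that split with invented names:

    #include <stdint.h>
    #include <stdio.h>

    int main(void)
    {
            uint64_t uid    = 0x0000002a00000007ULL; /* example value */
            uint32_t uid_lo = (uint32_t)uid;         /* like lsr_uid   */
            uint32_t uid_hi = (uint32_t)(uid >> 32); /* like lsr_uid_h */
            uint64_t back   = ((uint64_t)uid_hi << 32) | uid_lo;

            printf("lo=0x%x hi=0x%x roundtrip=0x%llx\n",
                   uid_lo, uid_hi, (unsigned long long)back);
            return 0;
    }
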
@@ -1799,6 +1894,46 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct llog_setattr_rec *)0)->lsr_tail));
 
+        /* Checks for struct llog_setattr64_rec */
+        LASSERTF((int)sizeof(struct llog_setattr64_rec) == 56, " found %lld\n",
+                 (long long)(int)sizeof(struct llog_setattr64_rec));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_hdr) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_hdr));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr) == 16, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_hdr));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_oid) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_oid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_oid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_ogen) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_ogen));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogen) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_ogen));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, padding) == 28, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, padding));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->padding) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->padding));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid) == 32, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_uid_h) == 36, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_uid_h));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_uid_h));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid) == 40, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_gid_h) == 44, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_gid_h));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_gid_h));
+        LASSERTF((int)offsetof(struct llog_setattr64_rec, lsr_tail) == 48, " found %lld\n",
+                 (long long)(int)offsetof(struct llog_setattr64_rec, lsr_tail));
+        LASSERTF((int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct llog_setattr64_rec *)0)->lsr_tail));
+
         /* Checks for struct llog_size_change_rec */
         LASSERTF((int)sizeof(struct llog_size_change_rec) == 48, " found %lld\n",
                  (long long)(int)sizeof(struct llog_size_change_rec));
@@ -2009,7 +2144,7 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)sizeof(((struct mds_extent_desc *)0)->med_lmm));
 
         /* Checks for struct qunit_data */
-        LASSERTF((int)sizeof(struct qunit_data) == 16, " found %lld\n",
+        LASSERTF((int)sizeof(struct qunit_data) == 32, " found %lld\n",
                  (long long)(int)sizeof(struct qunit_data));
         LASSERTF((int)offsetof(struct qunit_data, qd_id) == 0, " found %lld\n",
                  (long long)(int)offsetof(struct qunit_data, qd_id));
@@ -2023,26 +2158,38 @@ void lustre_assert_wire_constants(void)
                  (long long)(int)offsetof(struct qunit_data, qd_count));
         LASSERTF((int)sizeof(((struct qunit_data *)0)->qd_count) == 8, " found %lld\n",
                  (long long)(int)sizeof(((struct qunit_data *)0)->qd_count));
-
-        /* Checks for struct qunit_data_old */
-        LASSERTF((int)sizeof(struct qunit_data_old) == 16, " found %lld\n",
-                 (long long)(int)sizeof(struct qunit_data_old));
-        LASSERTF((int)offsetof(struct qunit_data_old, qd_id) == 0, " found %lld\n",
-                 (long long)(int)offsetof(struct qunit_data_old, qd_id));
-        LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_id) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_id));
-        LASSERTF((int)offsetof(struct qunit_data_old, qd_type) == 4, " found %lld\n",
-                 (long long)(int)offsetof(struct qunit_data_old, qd_type));
-        LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_type) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_type));
-        LASSERTF((int)offsetof(struct qunit_data_old, qd_count) == 8, " found %lld\n",
-                 (long long)(int)offsetof(struct qunit_data_old, qd_count));
-        LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_count) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_count));
-        LASSERTF((int)offsetof(struct qunit_data_old, qd_isblk) == 12, " found %lld\n",
-                 (long long)(int)offsetof(struct qunit_data_old, qd_isblk));
-        LASSERTF((int)sizeof(((struct qunit_data_old *)0)->qd_isblk) == 4, " found %lld\n",
-                 (long long)(int)sizeof(((struct qunit_data_old *)0)->qd_isblk));
+        LASSERTF((int)offsetof(struct qunit_data, qd_qunit) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct qunit_data, qd_qunit));
+        LASSERTF((int)sizeof(((struct qunit_data *)0)->qd_qunit) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct qunit_data *)0)->qd_qunit));
+        LASSERTF((int)offsetof(struct qunit_data, padding) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct qunit_data, padding));
+        LASSERTF((int)sizeof(((struct qunit_data *)0)->padding) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct qunit_data *)0)->padding));
+
+        /* Checks for struct quota_adjust_qunit */
+        LASSERTF((int)sizeof(struct quota_adjust_qunit) == 32, " found %lld\n",
+                 (long long)(int)sizeof(struct quota_adjust_qunit));
+        LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_flags) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct quota_adjust_qunit, qaq_flags));
+        LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_flags) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_flags));
+        LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_id) == 4, " found %lld\n",
+                 (long long)(int)offsetof(struct quota_adjust_qunit, qaq_id));
+        LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_id) == 4, " found %lld\n",
+                 (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_id));
+        LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_bunit_sz) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct quota_adjust_qunit, qaq_bunit_sz));
+        LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_bunit_sz) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_bunit_sz));
+        LASSERTF((int)offsetof(struct quota_adjust_qunit, qaq_iunit_sz) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct quota_adjust_qunit, qaq_iunit_sz));
+        LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->qaq_iunit_sz) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->qaq_iunit_sz));
+        LASSERTF((int)offsetof(struct quota_adjust_qunit, padding1) == 24, " found %lld\n",
+                 (long long)(int)offsetof(struct quota_adjust_qunit, padding1));
+        LASSERTF((int)sizeof(((struct quota_adjust_qunit *)0)->padding1) == 8, " found %lld\n",
+                 (long long)(int)sizeof(((struct quota_adjust_qunit *)0)->padding1));
 
         /* Checks for struct mgs_target_info */
         LASSERTF((int)sizeof(struct mgs_target_info) == 4544, " found %lld\n",
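
The quota_adjust_qunit structure above carries per-ID unit sizes
(qaq_bunit_sz, qaq_iunit_sz); the new defaults for those units appear in
the lmc help text earlier in this patch (128 MB of blocks, 5120 inodes),
and the itune/btune settings are percentages of them. A minimal sketch of
that arithmetic, not part of the patch (variable names invented):

    #include <stdio.h>

    int main(void)
    {
            unsigned long bunit_mb  = 128; /* default block unit (MB) */
            unsigned long btune_pct = 50;  /* default tune factor (%) */

            /* Threshold at which a slave would acquire/release another
             * unit of block quota from/to the master. */
            unsigned long btune_mb = bunit_mb * btune_pct / 100;

            printf("threshold: %lu MB of a %lu MB unit\n",
                   btune_mb, bunit_mb);
            return 0;
    }

With the defaults this is 64 MB for blocks and 2560 inodes for the inode
counterpart.
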
@@ -2249,5 +2396,18 @@ void lustre_assert_wire_constants(void)
         LASSERTF((int)sizeof(((xattr_acl_header *)0)->a_entries) == 0, " found %lld\n",
                  (long long)(int)sizeof(((xattr_acl_header *)0)->a_entries));
 #endif
+
+        /* check fid range */
+        LASSERTF((int)sizeof(struct lu_seq_range) == 24, " found %lld\n",
+                 (long long)(int)sizeof(struct lu_seq_range));
+        LASSERTF((int)offsetof(struct lu_seq_range, lsr_start) == 0, " found %lld\n",
+                 (long long)(int)offsetof(struct lu_seq_range, lsr_start));
+        LASSERTF((int)offsetof(struct lu_seq_range, lsr_end) == 8, " found %lld\n",
+                 (long long)(int)offsetof(struct lu_seq_range, lsr_end));
+        LASSERTF((int)offsetof(struct lu_seq_range, lsr_mdt) == 16, " found %lld\n",
+                 (long long)(int)offsetof(struct lu_seq_range, lsr_mdt));
+        LASSERTF((int)offsetof(struct lu_seq_range, lsr_padding) == 20, " found %lld\n",
+                 (long long)(int)offsetof(struct lu_seq_range, lsr_padding));
+
 }
-- 
1.8.3.1