X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Finclude%2Fobd.h;h=dbfd961414abeadb6e7ad76aef95e7e2573708e9;hp=7cc09ec17baa005e332cf484420bd255b41b5ae8;hb=a1a1add56b5557cb23971bcfcdc2482dcb9764ef;hpb=4160b722833ea924d7c51a993bc455376ac88bc1 diff --git a/lustre/include/obd.h b/lustre/include/obd.h index 7cc09ec..dbfd961 100644 --- a/lustre/include/obd.h +++ b/lustre/include/obd.h @@ -1,5 +1,37 @@ /* -*- mode: c; c-basic-offset: 8; indent-tabs-mode: nil; -*- * vim:expandtab:shiftwidth=8:tabstop=8: + * + * GPL HEADER START + * + * DO NOT ALTER OR REMOVE COPYRIGHT NOTICES OR THIS FILE HEADER. + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 only, + * as published by the Free Software Foundation. + * + * This program is distributed in the hope that it will be useful, but + * WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU + * General Public License version 2 for more details (a copy is included + * in the LICENSE file that accompanied this code). + * + * You should have received a copy of the GNU General Public License + * version 2 along with this program; If not, see + * http://www.sun.com/software/products/lustre/docs/GPLv2.pdf + * + * Please contact Sun Microsystems, Inc., 4150 Network Circle, Santa Clara, + * CA 95054 USA or visit www.sun.com if you need additional information or + * have any questions. + * + * GPL HEADER END + */ +/* + * Copyright 2008 Sun Microsystems, Inc. All rights reserved + * Use is subject to license terms. + */ +/* + * This file is part of Lustre, http://www.lustre.org/ + * Lustre is a trademark of Sun Microsystems, Inc. */ #ifndef __OBD_H @@ -28,7 +60,8 @@ #define IOC_MDC_MAX_NR 50 #include -#include +#include +#include #include #include #include @@ -64,8 +97,8 @@ struct lov_oinfo { /* per-stripe data structure */ /* used by the osc to keep track of what objects to build into rpcs */ struct loi_oap_pages loi_read_lop; struct loi_oap_pages loi_write_lop; - /* _cli_ is poorly named, it should be _ready_ */ - struct list_head loi_cli_item; + struct list_head loi_ready_item; + struct list_head loi_hp_ready_item; struct list_head loi_write_item; struct list_head loi_read_item; @@ -75,6 +108,12 @@ struct lov_oinfo { /* per-stripe data structure */ struct osc_async_rc loi_ar; }; +static inline void loi_kms_set(struct lov_oinfo *oinfo, __u64 kms) +{ + oinfo->loi_kms = kms; + oinfo->loi_kms_valid = 1; +} + static inline void loi_init(struct lov_oinfo *loi) { CFS_INIT_LIST_HEAD(&loi->loi_read_lop.lop_pending); @@ -83,7 +122,8 @@ static inline void loi_init(struct lov_oinfo *loi) CFS_INIT_LIST_HEAD(&loi->loi_write_lop.lop_pending); CFS_INIT_LIST_HEAD(&loi->loi_write_lop.lop_urgent); CFS_INIT_LIST_HEAD(&loi->loi_write_lop.lop_pending_group); - CFS_INIT_LIST_HEAD(&loi->loi_cli_item); + CFS_INIT_LIST_HEAD(&loi->loi_ready_item); + CFS_INIT_LIST_HEAD(&loi->loi_hp_ready_item); CFS_INIT_LIST_HEAD(&loi->loi_write_item); CFS_INIT_LIST_HEAD(&loi->loi_read_item); } @@ -118,6 +158,7 @@ struct lov_stripe_md { __u32 lw_stripe_size; /* size of the stripe */ __u32 lw_pattern; /* striping pattern (RAID0, RAID1) */ unsigned lw_stripe_count; /* number of objects being striped over */ + char lw_pool_name[LOV_MAXPOOLNAME]; /* pool name */ } lsm_wire; struct lov_array_info *lsm_array; /*Only for joined file array info*/ @@ -131,10 +172,11 @@ struct lov_stripe_md { #define lsm_stripe_size lsm_wire.lw_stripe_size #define lsm_pattern lsm_wire.lw_pattern #define lsm_stripe_count lsm_wire.lw_stripe_count +#define lsm_pool_name lsm_wire.lw_pool_name struct obd_info; -typedef int (*obd_enqueue_update_f)(struct obd_info *oinfo, int rc); +typedef int (*obd_enqueue_update_f)(void *cookie, int rc); /* obd info for a particular level (lov, osc). */ struct obd_info { @@ -198,52 +240,6 @@ struct brw_page { obd_flag flag; }; -enum async_flags { - ASYNC_READY = 0x1, /* ap_make_ready will not be called before this - page is added to an rpc */ - ASYNC_URGENT = 0x2, /* page must be put into an RPC before return */ - ASYNC_COUNT_STABLE = 0x4, /* ap_refresh_count will not be called - to give the caller a chance to update - or cancel the size of the io */ - ASYNC_GROUP_SYNC = 0x8, /* ap_completion will not be called, instead - the page is accounted for in the - obd_io_group given to - obd_queue_group_io */ -}; - -struct obd_async_page_ops { - int (*ap_make_ready)(void *data, int cmd); - int (*ap_refresh_count)(void *data, int cmd); - void (*ap_fill_obdo)(void *data, int cmd, struct obdo *oa); - void (*ap_update_obdo)(void *data, int cmd, struct obdo *oa, - obd_valid valid); - int (*ap_completion)(void *data, int cmd, struct obdo *oa, int rc); - struct obd_capa *(*ap_lookup_capa)(void *data, int cmd); -}; - -/* the `oig' is passed down from a caller of obd rw methods. the callee - * records enough state such that the caller can sleep on the oig and - * be woken when all the callees have finished their work */ -struct obd_io_group { - spinlock_t oig_lock; - atomic_t oig_refcount; - int oig_pending; - int oig_rc; - struct list_head oig_occ_list; - cfs_waitq_t oig_waitq; -}; - -/* the oig callback context lets the callee of obd rw methods register - * for callbacks from the caller. */ -struct oig_callback_context { - struct list_head occ_oig_item; - /* called when the caller has received a signal while sleeping. - * callees of this method are encouraged to abort their state - * in the oig. This may be called multiple times. */ - void (*occ_interrupted)(struct oig_callback_context *occ); - unsigned long interrupted:1; -}; - /* Individual type definitions */ struct ost_server_data; @@ -251,15 +247,26 @@ struct ost_server_data; /* hold common fields for "target" device */ struct obd_device_target { struct super_block *obt_sb; - atomic_t obt_quotachecking; + /** last_rcvd file */ + struct file *obt_rcvd_filp; + /** server data in last_rcvd file */ + struct lr_server_data *obt_lsd; + /** Lock protecting client bitmap */ + spinlock_t obt_client_bitmap_lock; + /** Bitmap of known clients */ + unsigned long *obt_client_bitmap; + /** Server last transaction number */ + __u64 obt_last_transno; + /** Lock protecting last transaction number */ + spinlock_t obt_translock; + /** Number of mounts */ + __u64 obt_mount_count; + struct semaphore obt_quotachecking; struct lustre_quota_ctxt obt_qctxt; + lustre_quota_version_t obt_qfmt; + struct rw_semaphore obt_rwsem; }; -typedef void (*obd_pin_extent_cb)(void *data); -typedef int (*obd_page_removal_cb_t)(void *data, int discard); -typedef int (*obd_lock_cancel_cb)(struct ldlm_lock *,struct ldlm_lock_desc *, - void *, int); - /* llog contexts */ enum llog_ctxt_id { LLOG_CONFIG_ORIG_CTXT = 0, @@ -274,6 +281,9 @@ enum llog_ctxt_id { LLOG_TEST_REPL_CTXT, LLOG_LOVEA_ORIG_CTXT, LLOG_LOVEA_REPL_CTXT, + LLOG_CHANGELOG_ORIG_CTXT, /**< changelog generation on mdd */ + LLOG_CHANGELOG_REPL_CTXT, /**< changelog access on clients */ + LLOG_CHANGELOG_USER_ORIG_CTXT, /**< for multiple changelog consumers */ LLOG_MAX_CTXTS }; @@ -292,6 +302,7 @@ struct filter_ext { struct filter_obd { /* NB this field MUST be first */ struct obd_device_target fo_obt; + struct lu_target fo_lut; const char *fo_fstype; struct vfsmount *fo_vfsmnt; @@ -304,12 +315,7 @@ struct filter_obd { spinlock_t fo_objidlock; /* protect fo_lastobjid */ - spinlock_t fo_translock; /* protect fsd_last_transno */ - struct file *fo_rcvd_filp; struct file *fo_health_check_filp; - struct lr_server_data *fo_fsd; - unsigned long *fo_last_rcvd_slots; - __u64 fo_mount_count; unsigned long fo_destroys_in_progress; struct semaphore fo_create_locks[FILTER_SUBDIR_COUNT]; @@ -320,8 +326,13 @@ struct filter_obd { obd_size fo_tot_dirty; /* protected by obd_osfs_lock */ obd_size fo_tot_granted; /* all values in bytes */ obd_size fo_tot_pending; + int fo_tot_granted_clients; obd_size fo_readcache_max_filesize; + int fo_read_cache:1, /**< enable read-only cache */ + fo_writethrough_cache:1,/**< read cache writes */ + fo_mds_ost_sync:1, /**< MDS-OST orphan recovery*/ + fo_raid_degraded:1;/**< RAID device degraded */ struct obd_import *fo_mdc_imp; struct obd_uuid fo_mdc_uuid; @@ -369,8 +380,23 @@ struct filter_obd { unsigned int fo_fl_oss_capa; struct list_head fo_capa_keys; struct hlist_head *fo_capa_hash; + struct llog_commit_master *fo_lcm; + int fo_sec_level; +}; - void *fo_lcm; +#define fo_translock fo_obt.obt_translock +#define fo_rcvd_filp fo_obt.obt_rcvd_filp +#define fo_fsd fo_obt.obt_lsd +#define fo_last_rcvd_slots fo_obt.obt_client_bitmap +#define fo_mount_count fo_obt.obt_mount_count + +struct timeout_item { + enum timeout_event ti_event; + cfs_time_t ti_timeout; + timeout_cb_t ti_cb; + void *ti_cb_data; + struct list_head ti_obd_list; + struct list_head ti_chain; }; #define OSC_MAX_RIF_DEFAULT 8 @@ -384,9 +410,8 @@ struct filter_obd { struct mdc_rpc_lock; struct obd_import; -struct lustre_cache; struct client_obd { - struct semaphore cl_sem; + struct rw_semaphore cl_sem; struct obd_uuid cl_target_uuid; struct obd_import *cl_import; /* ptlrpc connection state */ int cl_conn_count; @@ -396,9 +421,9 @@ struct client_obd { int cl_max_mds_easize; int cl_max_mds_cookiesize; - /* security configuration */ - struct sptlrpc_rule_set cl_sptlrpc_rset; - enum lustre_sec_part cl_sec_part; + enum lustre_sec_part cl_sp_me; + enum lustre_sec_part cl_sp_to; + struct sptlrpc_flavor cl_flvr_mgc; /* fixed flavor of mgc->mgs */ //struct llog_canceld_ctxt *cl_llcd; /* it's included by obd_llog_ctxt */ void *cl_llcd_offset; @@ -406,9 +431,14 @@ struct client_obd { /* the grant values are protected by loi_list_lock below */ long cl_dirty; /* all _dirty_ in bytes */ long cl_dirty_max; /* allowed w/o rpc */ + long cl_dirty_transit; /* dirty synchronous */ long cl_avail_grant; /* bytes of credit for ost */ long cl_lost_grant; /* lost credits (trunc) */ struct list_head cl_cache_waiters; /* waiting for cache/grant */ + cfs_time_t cl_next_shrink_grant; /* jiffies */ + struct list_head cl_grant_shrink_list; /* Timeout event list */ + struct semaphore cl_grant_sem; /*grant shrink list semaphore*/ + int cl_grant_shrink_interval; /* seconds */ /* keep track of objects that have lois that contain pages which * have been queued for async brw. this lock also protects the @@ -430,6 +460,7 @@ struct client_obd { */ client_obd_lock_t cl_loi_list_lock; struct list_head cl_loi_ready_list; + struct list_head cl_loi_hp_ready_list; struct list_head cl_loi_write_list; struct list_head cl_loi_read_list; int cl_r_in_flight; @@ -468,7 +499,7 @@ struct client_obd { __u32 cl_supp_cksum_types; /* checksum algorithm to be used */ cksum_type_t cl_cksum_type; - + /* also protected by the poorly named _loi_list_lock lock above */ struct osc_async_rc cl_ar; @@ -479,10 +510,6 @@ struct client_obd { struct lu_client_seq *cl_seq; atomic_t cl_resends; /* resend count */ - - /* Cache of triples */ - struct lustre_cache *cl_cache; - obd_lock_cancel_cb cl_ext_lock_cancel_cb; }; #define obd2cli_tgt(obd) ((char *)(obd)->u.cli.cl_target_uuid.uuid) @@ -509,15 +536,10 @@ struct mds_obd { cfs_dentry_t *mds_fid_de; int mds_max_mdsize; int mds_max_cookiesize; - struct file *mds_rcvd_filp; - spinlock_t mds_transno_lock; - __u64 mds_last_transno; - __u64 mds_mount_count; __u64 mds_io_epoch; unsigned long mds_atime_diff; struct semaphore mds_epoch_sem; struct ll_fid mds_rootfid; - struct lr_server_data *mds_server_data; cfs_dentry_t *mds_pending_dir; cfs_dentry_t *mds_logs_dir; cfs_dentry_t *mds_objects_dir; @@ -530,18 +552,17 @@ struct mds_obd { __u32 mds_id; /* mark pages dirty for write. */ - bitmap_t *mds_lov_page_dirty; + bitmap_t *mds_lov_page_dirty; /* array for store pages with obd_id */ - void **mds_lov_page_array; + void **mds_lov_page_array; /* file for store objid */ struct file *mds_lov_objid_filp; __u32 mds_lov_objid_count; + __u32 mds_lov_objid_max_index; __u32 mds_lov_objid_lastpage; __u32 mds_lov_objid_lastidx; struct file *mds_health_check_filp; - unsigned long *mds_client_bitmap; -// struct upcall_cache *mds_group_hash; struct lustre_quota_info mds_quota_info; struct semaphore mds_qonoff_sem; @@ -550,14 +571,25 @@ struct mds_obd { mds_fl_acl:1, mds_evict_ost_nids:1, mds_fl_cfglog:1, - mds_fl_synced:1; + mds_fl_synced:1, + mds_quota:1, + mds_fl_target:1; /* mds have one or + * more targets */ struct upcall_cache *mds_identity_cache; /* for capability keys update */ struct lustre_capa_key *mds_capa_keys; + struct rw_semaphore mds_notify_lock; }; +#define mds_transno_lock mds_obt.obt_translock +#define mds_rcvd_filp mds_obt.obt_rcvd_filp +#define mds_server_data mds_obt.obt_lsd +#define mds_client_bitmap mds_obt.obt_client_bitmap +#define mds_mount_count mds_obt.obt_mount_count +#define mds_last_transno mds_obt.obt_last_transno + /* lov objid */ extern __u32 mds_max_ost_index; @@ -596,6 +628,7 @@ struct echo_client_obd { struct obd_export *ec_exp; /* the local connection to osc/lov */ spinlock_t ec_lock; struct list_head ec_objects; + struct list_head ec_locks; int ec_nstripes; __u64 ec_unique; }; @@ -603,10 +636,11 @@ struct echo_client_obd { struct lov_qos_oss { struct obd_uuid lqo_uuid; /* ptlrpc's c_remote_uuid */ struct list_head lqo_oss_list; /* link to lov_qos */ - __u32 lqo_ost_count; /* number of osts on this oss */ __u64 lqo_bavail; /* total bytes avail on OSS */ __u64 lqo_penalty; /* current penalty */ __u64 lqo_penalty_per_obj; /* penalty decrease every obj*/ + time_t lqo_used; /* last used time, seconds */ + __u32 lqo_ost_count; /* number of osts on this oss */ }; struct ltd_qos { @@ -614,25 +648,55 @@ struct ltd_qos { __u64 ltq_penalty; /* current penalty */ __u64 ltq_penalty_per_obj; /* penalty decrease every obj*/ __u64 ltq_weight; /* net weighting */ + time_t ltq_used; /* last used time, seconds */ unsigned int ltq_usable:1; /* usable for striping */ }; +/* Generic subset of OSTs */ +struct ost_pool { + __u32 *op_array; /* array of index of + lov_obd->lov_tgts */ + unsigned int op_count; /* number of OSTs in the array */ + unsigned int op_size; /* allocated size of lp_array */ + struct rw_semaphore op_rw_sem; /* to protect ost_pool use */ +}; + +/* Round-robin allocator data */ +struct lov_qos_rr { + __u32 lqr_start_idx; /* start index of new inode */ + __u32 lqr_offset_idx; /* aliasing for start_idx */ + int lqr_start_count; /* reseed counter */ + struct ost_pool lqr_pool; /* round-robin optimized list */ + unsigned long lqr_dirty:1; /* recalc round-robin list */ +}; + +struct lov_statfs_data { + struct obd_info lsd_oi; + struct obd_statfs lsd_statfs; +}; +/* Stripe placement optimization */ struct lov_qos { struct list_head lq_oss_list; /* list of OSSs that targets use */ struct rw_semaphore lq_rw_sem; __u32 lq_active_oss_count; - __u32 *lq_rr_array; /* round-robin optimized list */ - unsigned int lq_rr_size; /* rr array size */ unsigned int lq_prio_free; /* priority for free space */ + unsigned int lq_threshold_rr;/* priority for rr */ + struct lov_qos_rr lq_rr; /* round robin qos data */ unsigned long lq_dirty:1, /* recalc qos data */ - lq_dirty_rr:1, /* recalc round-robin list */ lq_same_space:1,/* the ost's all have approx. the same space avail */ - lq_reset:1; /* zero current penalties */ + lq_reset:1, /* zero current penalties */ + lq_statfs_in_progress:1; /* statfs op in progress */ + /* qos statfs data */ + struct lov_statfs_data *lq_statfs_data; + cfs_waitq_t lq_statfs_waitq; /* waitqueue to notify statfs + * requests completion */ }; struct lov_tgt_desc { + struct list_head ltd_kill; struct obd_uuid ltd_uuid; + struct obd_device *ltd_obd; struct obd_export *ltd_exp; struct ltd_qos ltd_qos; /* qos info per target */ __u32 ltd_gen; @@ -642,9 +706,30 @@ struct lov_tgt_desc { ltd_reap:1; /* should this target be deleted */ }; +/* Pool metadata */ +#define pool_tgt_size(_p) _p->pool_obds.op_size +#define pool_tgt_count(_p) _p->pool_obds.op_count +#define pool_tgt_array(_p) _p->pool_obds.op_array +#define pool_tgt_rw_sem(_p) _p->pool_obds.op_rw_sem +#define pool_tgt(_p, _i) _p->pool_lov->lov_tgts[_p->pool_obds.op_array[_i]] + +struct pool_desc { + char pool_name[LOV_MAXPOOLNAME + 1]; /* name of pool */ + struct ost_pool pool_obds; /* pool members */ + atomic_t pool_refcount; /* pool ref. counter */ + struct lov_qos_rr pool_rr; /* round robin qos */ + struct hlist_node pool_hash; /* access by poolname */ + struct list_head pool_list; /* serial access */ + cfs_proc_dir_entry_t *pool_proc_entry; /* file in /proc */ + struct lov_obd *pool_lov; /* lov obd to which this + pool belong */ +}; + struct lov_obd { struct lov_desc desc; - struct lov_tgt_desc **lov_tgts; + struct lov_tgt_desc **lov_tgts; /* sparse array */ + struct ost_pool lov_packed; /* all OSTs in a packed + array */ struct semaphore lov_lock; struct obd_connect_data lov_ocd; struct lov_qos lov_qos; /* qos info per lov */ @@ -653,13 +738,12 @@ struct lov_obd { __u32 lov_active_tgt_count; /* how many active */ __u32 lov_death_row;/* tgts scheduled to be deleted */ __u32 lov_tgt_size; /* size of tgts array */ - __u32 lov_start_idx; /* start index of new inode */ - __u32 lov_offset_idx; /* aliasing for start_idx */ - int lov_start_count;/* reseed counter */ int lov_connects; - obd_page_removal_cb_t lov_page_removal_cb; - obd_pin_extent_cb lov_page_pin_cb; - obd_lock_cancel_cb lov_lock_cancel_cb; + int lov_pool_count; + lustre_hash_t *lov_pools_hash_body; /* used for key access */ + struct list_head lov_pool_list; /* used for sequential access */ + cfs_proc_dir_entry_t *lov_pool_proc_entry; + enum lustre_sec_part lov_sp_me; }; struct lmv_tgt_desc { @@ -670,10 +754,20 @@ struct lmv_tgt_desc { struct semaphore ltd_fid_sem; }; +enum placement_policy { + PLACEMENT_CHAR_POLICY = 0, + PLACEMENT_NID_POLICY = 1, + PLACEMENT_INVAL_POLICY = 2, + PLACEMENT_MAX_POLICY +}; + +typedef enum placement_policy placement_policy_t; + struct lmv_obd { int refcount; struct lu_client_fld lmv_fld; spinlock_t lmv_lock; + placement_policy_t lmv_placement; struct lmv_desc desc; struct obd_uuid cluuid; struct obd_export *exp; @@ -710,8 +804,10 @@ struct niobuf_local { #define LUSTRE_CMM_NAME "cmm" #define LUSTRE_MDD_NAME "mdd" #define LUSTRE_OSD_NAME "osd" +#define LUSTRE_VVP_NAME "vvp" #define LUSTRE_LMV_NAME "lmv" #define LUSTRE_CMM_MDC_NAME "cmm-mdc" +#define LUSTRE_SLP_NAME "slp" /* obd device type names */ /* FIXME all the references to LUSTRE_MDS_NAME should be swapped with LUSTRE_MDT_NAME */ @@ -752,8 +848,12 @@ struct obd_trans_info { int oti_numcookies; /* initial thread handling transaction */ - int oti_thread_id; + struct ptlrpc_thread * oti_thread; __u32 oti_conn_cnt; + /** VBR: versions */ + __u64 oti_pre_version; + + struct obd_uuid *oti_ost_uuid; }; static inline void oti_init(struct obd_trans_info *oti, @@ -767,10 +867,18 @@ static inline void oti_init(struct obd_trans_info *oti, return; oti->oti_xid = req->rq_xid; + /** VBR: take versions from request */ + if (req->rq_reqmsg != NULL && + lustre_msg_get_flags(req->rq_reqmsg) & MSG_REPLAY) { + __u64 *pre_version = lustre_msg_get_versions(req->rq_reqmsg); + oti->oti_pre_version = pre_version ? pre_version[0] : 0; + oti->oti_transno = lustre_msg_get_transno(req->rq_reqmsg); + } + /** called from mds_create_objects */ if (req->rq_repmsg != NULL) oti->oti_transno = lustre_msg_get_transno(req->rq_repmsg); - oti->oti_thread_id = req->rq_svc_thread ? req->rq_svc_thread->t_id : -1; + oti->oti_thread = req->rq_svc_thread; if (req->rq_reqmsg != NULL) oti->oti_conn_cnt = lustre_msg_get_conn_cnt(req->rq_reqmsg); } @@ -807,6 +915,8 @@ static inline void oti_free_cookies(struct obd_trans_info *oti) * Events signalled through obd_notify() upcall-chain. */ enum obd_notify_event { + /* Device connect start */ + OBD_NOTIFY_CONNECT, /* Device activated */ OBD_NOTIFY_ACTIVE, /* Device deactivated */ @@ -819,13 +929,16 @@ enum obd_notify_event { OBD_NOTIFY_SYNC_NONBLOCK, OBD_NOTIFY_SYNC, /* Configuration event */ - OBD_NOTIFY_CONFIG + OBD_NOTIFY_CONFIG, + /* Trigger quota recovery */ + OBD_NOTIFY_QUOTA }; /* bit-mask flags for config events */ enum config_flags { - CONFIG_LOG = 0x1, /* finished processing config log */ - CONFIG_SYNC = 0x2 /* mdt synced 1 ost */ + CONFIG_LOG = 0x1, /* finished processing config log */ + CONFIG_SYNC = 0x2, /* mdt synced 1 ost */ + CONFIG_TARGET = 0x4 /* one target is added */ }; /* @@ -834,7 +947,7 @@ enum config_flags { */ struct obd_notify_upcall { int (*onu_upcall)(struct obd_device *host, struct obd_device *watched, - enum obd_notify_event ev, void *owner); + enum obd_notify_event ev, void *owner, void *data); /* Opaque datum supplied by upper layer listener */ void *onu_owner; }; @@ -846,14 +959,41 @@ struct target_recovery_data { struct completion trd_finishing; }; -#define OBD_LLOG_GROUP 0 - enum filter_groups { + FILTER_GROUP_MDS0 = 0, FILTER_GROUP_LLOG = 1, - FILTER_GROUP_ECHO, - FILTER_GROUP_MDS0 + FILTER_GROUP_ECHO = 2 , + FILTER_GROUP_MDS1_N_BASE = 3 }; +static inline __u64 obdo_mdsno(struct obdo *oa) +{ + if (oa->o_gr) + return oa->o_gr - FILTER_GROUP_MDS1_N_BASE; + return 0; +} + +static inline int mdt_to_obd_objgrp(int mdtid) +{ + if (mdtid) + return FILTER_GROUP_MDS1_N_BASE + mdtid; + return 0; +} + +/** + * In HEAD for CMD, the object is created in group number which is 3>= + * or indexing starts from 3. To test this assertions are added to disallow + * group 0. But to run 2.0 mds server on 1.8.x disk format (i.e. interop_mode) + * object in group 0 needs to be allowed. + * So for interop mode following changes needs to be done: + * 1. No need to assert on group 0 or allow group 0 + * 2. The group number indexing starts from 0 instead of 3 + */ + +#define CHECK_MDS_GROUP(group) (group == FILTER_GROUP_MDS0 || \ + group > FILTER_GROUP_MDS1_N_BASE) +#define LASSERT_MDS_GROUP(group) LASSERT(CHECK_MDS_GROUP(group)) + struct obd_llog_group { struct list_head olg_list; int olg_group; @@ -861,11 +1001,14 @@ struct obd_llog_group { cfs_waitq_t olg_waitq; spinlock_t olg_lock; struct obd_export *olg_exp; + int olg_initializing; + struct semaphore olg_cat_processing; }; /* corresponds to one of the obd's */ #define MAX_OBD_NAME 128 #define OBD_DEVICE_MAGIC 0XAB5CD6EF +#define OBD_DEV_BY_DEVNAME 0xffffd0de struct obd_device { struct obd_type *obd_type; __u32 obd_magic; @@ -880,7 +1023,9 @@ struct obd_device { unsigned long obd_attached:1, /* finished attach */ obd_set_up:1, /* finished setup */ obd_recovering:1, /* there are recoverable clients */ - obd_abort_recovery:1,/* somebody ioctl'ed us to abort */ + obd_abort_recovery:1,/* recovery expired */ + obd_version_recov:1, /* obd uses version checking */ + obd_recovery_expired:1, obd_replayable:1, /* recovery is enabled; inform clients */ obd_no_transno:1, /* no committed-transno notification */ obd_no_recov:1, /* fail instead of retry messages */ @@ -891,18 +1036,21 @@ struct obd_device { obd_fail:1, /* cleanup with failover */ obd_async_recov:1, /* allow asyncronous orphan cleanup */ obd_no_conn:1, /* deny new connections */ - obd_inactive:1; /* device active/inactive + obd_inactive:1, /* device active/inactive * (for /proc/status only!!) */ + obd_process_conf:1; /* device is processing mgs config */ /* uuid-export hash body */ - struct lustre_class_hash_body *obd_uuid_hash_body; + struct lustre_hash *obd_uuid_hash; /* nid-export hash body */ - struct lustre_class_hash_body *obd_nid_hash_body; + struct lustre_hash *obd_nid_hash; /* nid stats body */ - struct lustre_class_hash_body *obd_nid_stats_hash_body; + struct lustre_hash *obd_nid_stats_hash; struct list_head obd_nid_stats; atomic_t obd_refcount; cfs_waitq_t obd_refcount_waitq; struct list_head obd_exports; + struct list_head obd_unlinked_exports; + struct list_head obd_delayed_exports; int obd_num_exports; spinlock_t obd_nid_lock; struct ldlm_namespace *obd_namespace; @@ -927,22 +1075,22 @@ struct obd_device { /* XXX encapsulate all this recovery data into one struct */ svc_handler_t obd_recovery_handler; pid_t obd_processing_task; - + int obd_max_recoverable_clients; int obd_connected_clients; - int obd_recoverable_clients; + int obd_stale_clients; + int obd_delayed_clients; spinlock_t obd_processing_task_lock; /* BH lock (timer) */ __u64 obd_next_recovery_transno; int obd_replayed_requests; int obd_requests_queued_for_recovery; cfs_waitq_t obd_next_transno_waitq; - struct list_head obd_uncommitted_replies; - spinlock_t obd_uncommitted_replies_lock; cfs_timer_t obd_recovery_timer; time_t obd_recovery_start; /* seconds */ - time_t obd_recovery_end; /* seconds */ + time_t obd_recovery_end; /* seconds, for lprocfs_status */ time_t obd_recovery_max_time; /* seconds, bz13079 */ - + int obd_recovery_timeout; + /* new recovery stuff from CMD2 */ struct target_recovery_data obd_recovery_data; int obd_replayed_locks; @@ -979,60 +1127,61 @@ struct obd_device { atomic_t obd_evict_inprogress; cfs_waitq_t obd_evict_inprogress_waitq; - /** - * Ldlm pool part. Save last calculated SLV and Limit. + /** + * Ldlm pool part. Save last calculated SLV and Limit. */ rwlock_t obd_pool_lock; int obd_pool_limit; __u64 obd_pool_slv; -}; -#define OBD_OPT_FORCE 0x0001 -#define OBD_OPT_FAILOVER 0x0002 + /** + * A list of outstanding class_incref()'s against this obd. For + * debugging. + */ + struct lu_ref obd_reference; +}; #define OBD_LLOG_FL_SENDNOW 0x0001 enum obd_cleanup_stage { /* Special case hack for MDS LOVs */ OBD_CLEANUP_EARLY, -/* Precleanup stage 1, we must make sure all exports (other than the - self-export) get destroyed. */ +/* can be directly mapped to .ldto_device_fini() */ OBD_CLEANUP_EXPORTS, -/* Precleanup stage 2, do other type-specific cleanup requiring the - self-export. */ - OBD_CLEANUP_SELF_EXP, -/* FIXME we should eliminate the "precleanup" function and make them stages - of the "cleanup" function. */ - OBD_CLEANUP_OBD, }; /* get/set_info keys */ -#define KEY_READ_ONLY "read-only" -#define KEY_MDS_CONN "mds_conn" -#define KEY_NEXT_ID "next_id" -#define KEY_LOVDESC "lovdesc" -#define KEY_INIT_RECOV "initial_recov" -#define KEY_INIT_RECOV_BACKUP "init_recov_bk" -#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_BLOCKSIZE "blocksize" #define KEY_CAPA_KEY "capa_key" +#define KEY_CHANGELOG_CLEAR "changelog_clear" +#define KEY_FID2PATH "fid2path" +#define KEY_CHECKSUM "checksum" +#define KEY_CLEAR_FS "clear_fs" #define KEY_CONN_DATA "conn_data" -#define KEY_MAX_EASIZE "max_easize" -#define KEY_REVIMP_UPD "revimp_update" -#define KEY_LOV_IDX "lov_idx" +#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_FIEMAP "fiemap" +#define KEY_FLUSH_CTX "flush_ctx" +#define KEY_INIT_RECOV_BACKUP "init_recov_bk" +#define KEY_INIT_RECOV "initial_recov" #define KEY_LAST_ID "last_id" -#define KEY_READONLY "read-only" #define KEY_LOCK_TO_STRIPE "lock_to_stripe" -#define KEY_CHECKSUM "checksum" -#define KEY_UNLINKED "unlinked" -#define KEY_EVICT_BY_NID "evict_by_nid" +#define KEY_LOVDESC "lovdesc" +#define KEY_LOV_IDX "lov_idx" +#define KEY_MAX_EASIZE "max_easize" +#define KEY_MDS_CONN "mds_conn" +#define KEY_MGSSEC "mgssec" +#define KEY_NEXT_ID "next_id" +#define KEY_READ_ONLY "read-only" #define KEY_REGISTER_TARGET "register_target" +#define KEY_REVIMP_UPD "revimp_update" #define KEY_SET_FS "set_fs" -#define KEY_CLEAR_FS "clear_fs" -#define KEY_BLOCKSIZE "blocksize" -#define KEY_BLOCKSIZE_BITS "blocksize_bits" +#define KEY_SPTLRPC_CONF "sptlrpc_conf" +#define KEY_UNLINKED "unlinked" /* XXX unused ?*/ #define KEY_INTERMDS "inter_mds" #define KEY_ASYNC "async" +#define KEY_GRANT_SHRINK "grant_shrink" struct lu_context; @@ -1043,7 +1192,7 @@ static inline int it_to_lock_mode(struct lookup_intent *it) return LCK_CW; else if (it->it_op & (IT_READDIR | IT_GETATTR | IT_OPEN | IT_LOOKUP)) return LCK_CR; - + LASSERTF(0, "Invalid it_op: %d\n", it->it_op); return -EINVAL; } @@ -1064,7 +1213,7 @@ struct md_op_data { __u32 op_suppgids[2]; __u32 op_fsuid; __u32 op_fsgid; - __u32 op_cap; + cfs_cap_t op_cap; void *op_data; /* iattr fields and blocks. */ @@ -1112,7 +1261,7 @@ struct obd_ops { int (*o_iocontrol)(unsigned int cmd, struct obd_export *exp, int len, void *karg, void *uarg); int (*o_get_info)(struct obd_export *, __u32 keylen, void *key, - __u32 *vallen, void *val); + __u32 *vallen, void *val, struct lov_stripe_md *lsm); int (*o_set_info_async)(struct obd_export *, __u32 keylen, void *key, __u32 vallen, void *val, struct ptlrpc_request_set *set); @@ -1133,13 +1282,14 @@ struct obd_ops { * granted by the target, which are guaranteed to be a subset of flags * asked for. If @ocd == NULL, use default parameters. */ int (*o_connect)(const struct lu_env *env, - struct lustre_handle *conn, struct obd_device *src, + struct obd_export **exp, struct obd_device *src, struct obd_uuid *cluuid, struct obd_connect_data *ocd, void *localdata); int (*o_reconnect)(const struct lu_env *env, struct obd_export *exp, struct obd_device *src, struct obd_uuid *cluuid, - struct obd_connect_data *ocd); + struct obd_connect_data *ocd, + void *localdata); int (*o_disconnect)(struct obd_export *exp); /* Initialize/finalize fids infrastructure. */ @@ -1150,7 +1300,7 @@ struct obd_ops { int (*o_fid_alloc)(struct obd_export *exp, struct lu_fid *fid, struct md_op_data *op_data); - /* + /* * Object with @fid is getting deleted, we may want to do something * about this. */ @@ -1172,9 +1322,12 @@ struct obd_ops { int (*o_precreate)(struct obd_export *exp); int (*o_create)(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md **ea, struct obd_trans_info *oti); + int (*o_create_async)(struct obd_export *exp, struct obd_info *oinfo, + struct lov_stripe_md **ea, + struct obd_trans_info *oti); int (*o_destroy)(struct obd_export *exp, struct obdo *oa, struct lov_stripe_md *ea, struct obd_trans_info *oti, - struct obd_export *md_exp); + struct obd_export *md_exp, void *capa); int (*o_setattr)(struct obd_export *exp, struct obd_info *oinfo, struct obd_trans_info *oti); int (*o_setattr_async)(struct obd_export *exp, struct obd_info *oinfo, @@ -1186,47 +1339,6 @@ struct obd_ops { int (*o_brw)(int rw, struct obd_export *exp, struct obd_info *oinfo, obd_count oa_bufs, struct brw_page *pgarr, struct obd_trans_info *oti); - int (*o_brw_async)(int rw, struct obd_export *exp, - struct obd_info *oinfo, obd_count oa_bufs, - struct brw_page *pgarr, struct obd_trans_info *oti, - struct ptlrpc_request_set *); - int (*o_prep_async_page)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - cfs_page_t *page, obd_off offset, - struct obd_async_page_ops *ops, void *data, - void **res, int nocache, - struct lustre_handle *lockh); - int (*o_reget_short_lock)(struct obd_export *exp, - struct lov_stripe_md *lsm, - void **res, int rw, - obd_off start, obd_off end, - void **cookie); - int (*o_release_short_lock)(struct obd_export *exp, - struct lov_stripe_md *lsm, obd_off end, - void *cookie, int rw); - int (*o_queue_async_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags); - int (*o_queue_group_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig, - void *cookie, int cmd, obd_off off, int count, - obd_flag brw_flags, obd_flag async_flags); - int (*o_trigger_group_io)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, - struct obd_io_group *oig); - int (*o_set_async_flags)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie, - obd_flag async_flags); - int (*o_teardown_async_page)(struct obd_export *exp, - struct lov_stripe_md *lsm, - struct lov_oinfo *loi, void *cookie); int (*o_merge_lvb)(struct obd_export *exp, struct lov_stripe_md *lsm, struct ost_lvb *lvb, int kms_only); int (*o_adjust_kms)(struct obd_export *exp, struct lov_stripe_md *lsm, @@ -1248,34 +1360,31 @@ struct obd_ops { obd_id *startid, obd_gr group, void *data); int (*o_preprw)(int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, - int niocount, struct niobuf_remote *remote, - struct niobuf_local *local, struct obd_trans_info *oti, + struct niobuf_remote *remote, int *nr_pages, + struct niobuf_local *local, + struct obd_trans_info *oti, struct lustre_capa *capa); int (*o_commitrw)(int cmd, struct obd_export *exp, struct obdo *oa, int objcount, struct obd_ioobj *obj, - int niocount, struct niobuf_local *local, + struct niobuf_remote *remote, int pages, + struct niobuf_local *local, struct obd_trans_info *oti, int rc); int (*o_enqueue)(struct obd_export *, struct obd_info *oinfo, struct ldlm_enqueue_info *einfo, struct ptlrpc_request_set *rqset); - int (*o_match)(struct obd_export *, struct lov_stripe_md *, __u32 type, - ldlm_policy_data_t *, __u32 mode, int *flags, void *data, - struct lustre_handle *lockh); int (*o_change_cbdata)(struct obd_export *, struct lov_stripe_md *, ldlm_iterator_t it, void *data); int (*o_cancel)(struct obd_export *, struct lov_stripe_md *md, __u32 mode, struct lustre_handle *); int (*o_cancel_unused)(struct obd_export *, struct lov_stripe_md *, int flags, void *opaque); - int (*o_join_lru)(struct obd_export *, struct lov_stripe_md *, - int join); int (*o_init_export)(struct obd_export *exp); int (*o_destroy_export)(struct obd_export *exp); int (*o_extent_calc)(struct obd_export *, struct lov_stripe_md *, int cmd, obd_off *); /* llog related obd_methods */ - int (*o_llog_init)(struct obd_device *obd, int group, + int (*o_llog_init)(struct obd_device *obd, struct obd_llog_group *grp, struct obd_device *disk_obd, int count, struct llog_catid *logid, struct obd_uuid *uuid); int (*o_llog_finish)(struct obd_device *obd, int count); @@ -1293,38 +1402,35 @@ struct obd_ops { enum obd_notify_event ev, void *data); int (*o_health_check)(struct obd_device *); + struct obd_uuid *(*o_get_uuid) (struct obd_export *exp); /* quota methods */ - int (*o_quotacheck)(struct obd_export *, struct obd_quotactl *); - int (*o_quotactl)(struct obd_export *, struct obd_quotactl *); + int (*o_quotacheck)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quotactl)(struct obd_device *, struct obd_export *, + struct obd_quotactl *); + int (*o_quota_adjust_qunit)(struct obd_export *exp, + struct quota_adjust_qunit *oqaq, + struct lustre_quota_ctxt *qctxt); - int (*o_ping)(struct obd_export *exp); - int (*o_register_page_removal_cb)(struct obd_export *exp, - obd_page_removal_cb_t cb, - obd_pin_extent_cb pin_cb); - int (*o_unregister_page_removal_cb)(struct obd_export *exp, - obd_page_removal_cb_t cb); - int (*o_register_lock_cancel_cb)(struct obd_export *exp, - obd_lock_cancel_cb cb); - int (*o_unregister_lock_cancel_cb)(struct obd_export *exp, - obd_lock_cancel_cb cb); + int (*o_ping)(struct obd_export *exp); + /* pools methods */ + int (*o_pool_new)(struct obd_device *obd, char *poolname); + int (*o_pool_del)(struct obd_device *obd, char *poolname); + int (*o_pool_add)(struct obd_device *obd, char *poolname, + char *ostname); + int (*o_pool_rem)(struct obd_device *obd, char *poolname, + char *ostname); + void (*o_getref)(struct obd_device *obd); + void (*o_putref)(struct obd_device *obd); /* * NOTE: If adding ops, add another LPROCFS_OBD_OP_INIT() line * to lprocfs_alloc_obd_stats() in obdclass/lprocfs_status.c. * Also, add a wrapper function in include/linux/obd_class.h. */ }; -/* TODO: lmv_stripe_md should contain mds capabilities for all slave fids */ -struct lmv_stripe_md { - __u32 mea_magic; - __u32 mea_count; - __u32 mea_master; - __u32 mea_padding; - struct lu_fid mea_ids[0]; -}; - enum { LUSTRE_OPC_MKDIR = (1 << 0), LUSTRE_OPC_SYMLINK = (1 << 1), @@ -1340,7 +1446,7 @@ enum { #define MAX_HASH_SIZE_32 0x7fffffffUL #define MAX_HASH_SIZE 0x7fffffffffffffffULL -#define MAX_HASH_HIGHEST_BIT 0x1000000000000000 +#define MAX_HASH_HIGHEST_BIT 0x1000000000000000ULL struct lustre_md { struct mdt_body *body; @@ -1356,7 +1462,8 @@ struct lustre_md { struct md_open_data { struct obd_client_handle *mod_och; - struct list_head mod_replay_list; + struct ptlrpc_request *mod_open_req; + struct ptlrpc_request *mod_close_req; }; struct lookup_intent; @@ -1369,13 +1476,14 @@ struct md_ops { int (*m_close)(struct obd_export *, struct md_op_data *, struct md_open_data *, struct ptlrpc_request **); int (*m_create)(struct obd_export *, struct md_op_data *, - const void *, int, int, __u32, __u32, __u32, + const void *, int, int, __u32, __u32, cfs_cap_t, __u64, struct ptlrpc_request **); int (*m_done_writing)(struct obd_export *, struct md_op_data *, struct md_open_data *); int (*m_enqueue)(struct obd_export *, struct ldlm_enqueue_info *, struct lookup_intent *, struct md_op_data *, - struct lustre_handle *, void *, int, int); + struct lustre_handle *, void *, int, + struct ptlrpc_request **, int); int (*m_getattr)(struct obd_export *, const struct lu_fid *, struct obd_capa *, obd_valid, int, struct ptlrpc_request **); @@ -1429,7 +1537,7 @@ struct md_ops { struct ptlrpc_request *); int (*m_clear_open_replay_data)(struct obd_export *, struct obd_client_handle *); - int (*m_set_lock_data)(struct obd_export *, __u64 *, void *); + int (*m_set_lock_data)(struct obd_export *, __u64 *, void *, __u32 *); ldlm_mode_t (*m_lock_match)(struct obd_export *, int, const struct lu_fid *, ldlm_type_t, @@ -1441,6 +1549,8 @@ struct md_ops { void *opaque); int (*m_renew_capa)(struct obd_export *, struct obd_capa *oc, renew_capa_cb_t cb); + int (*m_unpack_capa)(struct obd_export *, struct ptlrpc_request *, + const struct req_msg_field *, struct obd_capa **); int (*m_get_remote_perm)(struct obd_export *, const struct lu_fid *, struct obd_capa *, __u32, @@ -1466,9 +1576,9 @@ struct lsm_operations { int (*lsm_destroy)(struct lov_stripe_md *, struct obdo *oa, struct obd_export *md_exp); void (*lsm_stripe_by_index)(struct lov_stripe_md *, int *, obd_off *, - unsigned long *); + obd_off *); void (*lsm_stripe_by_offset)(struct lov_stripe_md *, int *, obd_off *, - unsigned long *); + obd_off *); obd_off (*lsm_stripe_offset_by_index)(struct lov_stripe_md *, int); obd_off (*lsm_stripe_offset_by_offset)(struct lov_stripe_md *, obd_off); int (*lsm_stripe_index_by_offset)(struct lov_stripe_md *, obd_off); @@ -1479,15 +1589,18 @@ struct lsm_operations { struct lov_mds_md *lmm); }; -extern struct lsm_operations lsm_plain_ops; -extern struct lsm_operations lsm_join_ops; -static inline struct lsm_operations *lsm_op_find(int magic) +extern const struct lsm_operations lsm_v1_ops; +extern const struct lsm_operations lsm_join_ops; +extern const struct lsm_operations lsm_v3_ops; +static inline const struct lsm_operations *lsm_op_find(int magic) { switch(magic) { - case LOV_MAGIC: - return &lsm_plain_ops; + case LOV_MAGIC_V1: + return &lsm_v1_ops; case LOV_MAGIC_JOIN: return &lsm_join_ops; + case LOV_MAGIC_V3: + return &lsm_v3_ops; default: CERROR("Cannot recognize lsm_magic %d\n", magic); return NULL; @@ -1501,19 +1614,24 @@ int lvfs_check_io_health(struct obd_device *obd, struct file *file); #define OBD_CALC_STRIPE_END 2 static inline void obd_transno_commit_cb(struct obd_device *obd, __u64 transno, - int error) + struct obd_export *exp, int error) { if (error) { CERROR("%s: transno "LPU64" commit error: %d\n", obd->obd_name, transno, error); return; } - CDEBUG(D_HA, "%s: transno "LPU64" committed\n", - obd->obd_name, transno); - if (transno > obd->obd_last_committed) { - obd->obd_last_committed = transno; - ptlrpc_commit_replies (obd); + if (exp && transno > exp->exp_last_committed) { + CDEBUG(D_HA, "%s: transno "LPU64" committed\n", + obd->obd_name, transno); + exp->exp_last_committed = transno; + ptlrpc_commit_replies(exp); + } else { + CDEBUG(D_INFO, "%s: transno "LPU64" committed\n", + obd->obd_name, transno); } + if (transno > obd->obd_last_committed) + obd->obd_last_committed = transno; } static inline void init_obd_quota_ops(quota_interface_t *interface, @@ -1525,11 +1643,12 @@ static inline void init_obd_quota_ops(quota_interface_t *interface, LASSERT(obd_ops); obd_ops->o_quotacheck = QUOTA_OP(interface, check); obd_ops->o_quotactl = QUOTA_OP(interface, ctl); + obd_ops->o_quota_adjust_qunit = QUOTA_OP(interface, adjust_qunit); } static inline __u64 oinfo_mdsno(struct obd_info *oinfo) { - return oinfo->oi_oa->o_gr - FILTER_GROUP_MDS0; + return obdo_mdsno(oinfo->oi_oa); } static inline struct lustre_capa *oinfo_capa(struct obd_info *oinfo)