X-Git-Url: https://git.whamcloud.com/?p=fs%2Flustre-release.git;a=blobdiff_plain;f=lustre%2Fosp%2Fosp_internal.h;h=243d3ca8352de7689eb0614b1f6abee4011fb309;hp=e75c2468d5db5590d7fd20d639d628405daba24c;hb=26b8238659974959780cd49de92595b4b0bdf89f;hpb=04e1d0cb95e1ad1288676c276efd98f6786a58d7 diff --git a/lustre/osp/osp_internal.h b/lustre/osp/osp_internal.h index e75c246..243d3ca 100644 --- a/lustre/osp/osp_internal.h +++ b/lustre/osp/osp_internal.h @@ -27,7 +27,7 @@ * Copyright (c) 2007, 2010, Oracle and/or its affiliates. All rights reserved. * Use is subject to license terms. * - * Copyright (c) 2011, 2012, Intel, Inc. + * Copyright (c) 2012, 2014, Intel Corporation. */ /* * This file is part of Lustre, http://www.lustre.org/ @@ -44,41 +44,75 @@ #include #include #include +#include #include +#include +#include +#include /* * Infrastructure to support tracking of last committed llog record */ struct osp_id_tracker { - cfs_spinlock_t otr_lock; + spinlock_t otr_lock; __u32 otr_next_id; __u32 otr_committed_id; /* callback is registered once per diskfs -- that's the whole point */ struct dt_txn_callback otr_tx_cb; /* single node can run many clusters */ - cfs_list_t otr_wakeup_list; - cfs_list_t otr_list; + struct list_head otr_wakeup_list; + struct list_head otr_list; /* underlying shared device */ struct dt_device *otr_dev; /* how many users of this tracker */ - cfs_atomic_t otr_refcount; + atomic_t otr_refcount; +}; + +struct osp_precreate { + /* + * Precreation pool + */ + spinlock_t osp_pre_lock; + + /* last fid to assign in creation */ + struct lu_fid osp_pre_used_fid; + /* last created id OST reported, next-created - available id's */ + struct lu_fid osp_pre_last_created_fid; + /* how many ids are reserved in declare, we shouldn't block in create */ + __u64 osp_pre_reserved; + /* consumers (who need new ids) wait here */ + wait_queue_head_t osp_pre_user_waitq; + /* current precreation status: working, failed, stopping? 
*/ + int osp_pre_status; + /* how many to precreate next time */ + int osp_pre_grow_count; + int osp_pre_min_grow_count; + int osp_pre_max_grow_count; + /* whether to grow precreation window next time or not */ + int osp_pre_grow_slow; + /* cleaning up orphans or recreating missing objects */ + int osp_pre_recovering; }; struct osp_device { struct dt_device opd_dt_dev; /* corresponding OST index */ int opd_index; + + /* corresponding MDT index, which will be used when connecting to OST + * for validating the connection (see ofd_parse_connect_data) */ + int opd_group; /* device used to store persistent state (llogs, last ids) */ struct obd_export *opd_storage_exp; struct dt_device *opd_storage; - struct dt_object *opd_last_used_file; + struct dt_object *opd_last_used_oid_file; + struct dt_object *opd_last_used_seq_file; /* stored persistently in LE format, updated directly to/from disk * and requires le64_to_cpu() conversion before use. * Protected by opd_pre_lock */ - volatile obd_id opd_last_used_id; - - obd_id opd_gap_start; + struct lu_fid opd_last_used_fid; + struct lu_fid opd_gap_start_fid; int opd_gap_count; /* connection to OST */ struct obd_device *opd_obd; @@ -86,48 +120,31 @@ struct osp_device { struct obd_uuid opd_cluuid; struct obd_connect_data *opd_connect_data; int opd_connects; - cfs_proc_dir_entry_t *opd_proc_entry; + struct proc_dir_entry *opd_proc_entry; struct lprocfs_stats *opd_stats; /* connection status. 
*/ - int opd_new_connection; - int opd_got_disconnected; - int opd_imp_connected; - int opd_imp_active; - int opd_imp_seen_connected:1; + unsigned int opd_new_connection:1, + opd_got_disconnected:1, + opd_imp_connected:1, + opd_imp_active:1, + opd_imp_seen_connected:1, + opd_connect_mdt:1; /* whether local recovery is completed: * reported via ->ldo_recovery_complete() */ int opd_recovery_completed; - /* - * Precreation pool - */ - cfs_spinlock_t opd_pre_lock; - /* last id assigned in creation */ - __u64 opd_pre_used_id; - /* last created id OST reported, next-created - available id's */ - __u64 opd_pre_last_created; - /* how many ids are reserved in declare, we shouldn't block in create */ - __u64 opd_pre_reserved; + /* precreate structure for OSP */ + struct osp_precreate *opd_pre; /* dedicated precreate thread */ struct ptlrpc_thread opd_pre_thread; /* thread waits for signals about pool going empty */ - cfs_waitq_t opd_pre_waitq; - /* consumers (who needs new ids) wait here */ - cfs_waitq_t opd_pre_user_waitq; - /* current precreation status: working, failed, stopping? 
*/ - int opd_pre_status; - /* how many to precreate next time */ - int opd_pre_grow_count; - int opd_pre_min_grow_count; - int opd_pre_max_grow_count; - /* whether to grow precreation window next time or not */ - int opd_pre_grow_slow; + wait_queue_head_t opd_pre_waitq; /* * OST synchronization */ - cfs_spinlock_t opd_syn_lock; + spinlock_t opd_syn_lock; /* unique generation, to recognize start of new records in the llog */ struct llog_gen opd_syn_generation; /* number of changes to sync, used to wake up sync thread */ @@ -136,9 +153,9 @@ struct osp_device { int opd_syn_prev_done; /* found records */ struct ptlrpc_thread opd_syn_thread; - cfs_waitq_t opd_syn_waitq; + wait_queue_head_t opd_syn_waitq; /* list of remotely committed RPCs */ - cfs_list_t opd_syn_committed_there; + struct list_head opd_syn_committed_there; /* number of changes currently being synced */ int opd_syn_sync_in_progress; /* number of RPCs in flight - flow control */ @@ -157,40 +174,97 @@ struct osp_device { /* last processed (taken from llog) id */ unsigned long opd_syn_last_processed_id; struct osp_id_tracker *opd_syn_tracker; - cfs_list_t opd_syn_ontrack; + struct list_head opd_syn_ontrack; + /* stop processing new requests until barrier=0 */ + atomic_t opd_syn_barrier; + wait_queue_head_t opd_syn_barrier_waitq; /* * statfs related fields: OSP maintains them on its own */ struct obd_statfs opd_statfs; cfs_time_t opd_statfs_fresh_till; - cfs_timer_t opd_statfs_timer; + struct timer_list opd_statfs_timer; int opd_statfs_update_in_progress; /* how often to update statfs data */ int opd_statfs_maxage; - cfs_proc_dir_entry_t *opd_symlink; + struct proc_dir_entry *opd_symlink; + + /* If the caller wants to do some idempotent async operations on + * remote server, it can append the async remote requests on the + * osp_device::opd_async_requests via declare() functions, these + * requests can be packed together and sent to the remote server + * via single OUT RPC later. 
*/ + struct dt_update_request *opd_async_requests; + /* Protect current operations on opd_async_requests. */ + struct mutex opd_async_requests_mutex; + struct list_head opd_async_updates; + struct rw_semaphore opd_async_updates_rwsem; + atomic_t opd_async_updates_count; +}; + +#define opd_pre_lock opd_pre->osp_pre_lock +#define opd_pre_used_fid opd_pre->osp_pre_used_fid +#define opd_pre_last_created_fid opd_pre->osp_pre_last_created_fid +#define opd_pre_reserved opd_pre->osp_pre_reserved +#define opd_pre_user_waitq opd_pre->osp_pre_user_waitq +#define opd_pre_status opd_pre->osp_pre_status +#define opd_pre_grow_count opd_pre->osp_pre_grow_count +#define opd_pre_min_grow_count opd_pre->osp_pre_min_grow_count +#define opd_pre_max_grow_count opd_pre->osp_pre_max_grow_count +#define opd_pre_grow_slow opd_pre->osp_pre_grow_slow +#define opd_pre_recovering opd_pre->osp_pre_recovering + +extern struct kmem_cache *osp_object_kmem; + +/* The first part of oxe_buf is the xattr name, which is '\0' terminated. + * The remaining part holds the value, binary mode. */ +struct osp_xattr_entry { + struct list_head oxe_list; + atomic_t oxe_ref; + void *oxe_value; + size_t oxe_buflen; + size_t oxe_namelen; + size_t oxe_vallen; + unsigned int oxe_exist:1, + oxe_ready:1; + char oxe_buf[0]; }; -extern cfs_mem_cache_t *osp_object_kmem; +struct osp_object_attr { + struct lu_attr ooa_attr; + struct list_head ooa_xattr_list; +}; /* this is a top object */ struct osp_object { - struct lu_object_header opo_header; - struct dt_object opo_obj; - int opo_reserved:1, - opo_new:1; + struct lu_object_header opo_header; + struct dt_object opo_obj; + unsigned int opo_reserved:1, + opo_non_exist:1; + + /* read/write lock for md osp object */ + struct rw_semaphore opo_sem; + const struct lu_env *opo_owner; + struct osp_object_attr *opo_ooa; + /* Protect opo_ooa. 
*/ + spinlock_t opo_lock; }; extern struct lu_object_operations osp_lu_obj_ops; extern const struct dt_device_operations osp_dt_ops; +extern struct dt_object_operations osp_md_obj_ops; +extern struct dt_body_operations osp_md_body_ops; struct osp_thread_info { struct lu_buf osi_lb; + struct lu_buf osi_lb2; struct lu_fid osi_fid; struct lu_attr osi_attr; struct ost_id osi_oi; - obd_id osi_id; + struct ost_id osi_oi2; + u64 osi_id; loff_t osi_off; union { struct llog_rec_hdr osi_hdr; @@ -200,14 +274,57 @@ struct osp_thread_info { }; struct llog_cookie osi_cookie; struct llog_catid osi_cid; + struct lu_seq_range osi_seq; + struct ldlm_res_id osi_resid; + struct obdo osi_obdo; +}; + +/* Iterator for OSP */ +struct osp_it { + __u32 ooi_pos_page; + __u32 ooi_pos_lu_page; + __u32 ooi_attr; + int ooi_pos_ent; + int ooi_total_npages; + int ooi_valid_npages; + unsigned int ooi_swab:1; + __u64 ooi_next; + struct dt_object *ooi_obj; + void *ooi_ent; + struct page *ooi_cur_page; + struct lu_idxpage *ooi_cur_idxpage; + struct page **ooi_pages; }; -static inline void osp_objid_buf_prep(struct osp_thread_info *osi, - struct osp_device *d, int index) +/* The transaction only includes the updates on the remote node, and + * no local updates at all */ +static inline bool is_only_remote_trans(struct thandle *th) +{ + return th->th_dev != NULL && th->th_dev->dd_ops == &osp_dt_ops; +} + +static inline void osp_objid_buf_prep(struct lu_buf *buf, loff_t *off, + __u32 *id, int index) +{ + /* Note: through id is only 32 bits, it will also write 64 bits + * for oid to keep compatibility with the previous version. 
*/ /* NOTE(review): lb_len is sizeof(u64) although id is a __u32 pointer, so the buffer described here covers 8 bytes starting at id; callers must make sure those 8 bytes are valid -- confirm at the call sites */ + buf->lb_buf = (void *)id; + buf->lb_len = sizeof(u64); + *off = sizeof(u64) * index; +} + +static inline void osp_objseq_buf_prep(struct lu_buf *buf, loff_t *off, + __u64 *seq, int index) +{ + buf->lb_buf = (void *)seq; + buf->lb_len = sizeof(u64); + *off = sizeof(u64) * index; +} + +static inline void osp_buf_prep(struct lu_buf *lb, void *buf, int buf_len) { - osi->osi_lb.lb_buf = (void *)&d->opd_last_used_id; - osi->osi_lb.lb_len = sizeof(d->opd_last_used_id); - osi->osi_off = sizeof(d->opd_last_used_id) * index; + lb->lb_buf = buf; + lb->lb_len = buf_len; } extern struct lu_context_key osp_thread_key; @@ -291,20 +408,203 @@ static inline struct dt_object *osp_object_child(struct osp_object *o) struct dt_object, do_lu); } +static inline struct seq_server_site *osp_seq_site(struct osp_device *osp) +{ + return osp->opd_dt_dev.dd_lu_dev.ld_site->ld_seq_site; +} + +#define osp_init_rpc_lock(lck) mdc_init_rpc_lock(lck) + +static inline void osp_get_rpc_lock(struct osp_device *osp) +{ + struct mdc_rpc_lock *rpc_lock = osp->opd_obd->u.cli.cl_rpc_lock; + + mdc_get_rpc_lock(rpc_lock, NULL); +} + +static inline void osp_put_rpc_lock(struct osp_device *osp) +{ + struct mdc_rpc_lock *rpc_lock = osp->opd_obd->u.cli.cl_rpc_lock; + + mdc_put_rpc_lock(rpc_lock, NULL); +} + +static inline int osp_fid_diff(const struct lu_fid *fid1, + const struct lu_fid *fid2) +{ + /* In 2.6+ ost_idx is packed into IDIF FID, while in 2.4 and 2.5 IDIF + * is always FID_SEQ_IDIF(0x100000000ULL), which does not include OST + * index in the seq. 
So we can not compare IDIF FID seq here */ + if (fid_is_idif(fid1) && fid_is_idif(fid2)) { + __u32 ost_idx1 = fid_idif_ost_idx(fid1); + __u32 ost_idx2 = fid_idif_ost_idx(fid2); + + LASSERTF(ost_idx1 == 0 || ost_idx2 == 0 || ost_idx1 == ost_idx2, + "fid1: "DFID", fid2: "DFID"\n", PFID(fid1), + PFID(fid2)); + + /* NOTE(review): the id difference is returned as int; presumably fid_idif_id() yields a wider value -- confirm the result always fits */ return fid_idif_id(fid1->f_seq, fid1->f_oid, 0) - + fid_idif_id(fid2->f_seq, fid2->f_oid, 0); + } + + LASSERTF(fid_seq(fid1) == fid_seq(fid2), "fid1:"DFID + ", fid2:"DFID"\n", PFID(fid1), PFID(fid2)); + + return fid_oid(fid1) - fid_oid(fid2); +} + + +static inline void osp_update_last_fid(struct osp_device *d, struct lu_fid *fid) +{ + int diff = osp_fid_diff(fid, &d->opd_last_used_fid); + /* + * we might have lost precreated objects due to VBR and precreate + * orphans; the gap in objid can only be calculated properly here + */ + if (diff > 0) { + if (diff > 1) { + d->opd_gap_start_fid = d->opd_last_used_fid; + d->opd_gap_start_fid.f_oid++; + d->opd_gap_count = diff - 1; + CDEBUG(D_HA, "Gap in objids: start="DFID", count =%d\n", + PFID(&d->opd_gap_start_fid), d->opd_gap_count); + } + d->opd_last_used_fid = *fid; + } +} + /* NOTE(review): unlike the neighbouring helpers this one is plain static rather than static inline, although it lives in a header -- confirm that is intended */ +static int osp_fid_end_seq(const struct lu_env *env, struct lu_fid *fid) +{ + if (fid_is_idif(fid)) { + struct osp_thread_info *info = osp_env_info(env); + struct ost_id *oi = &info->osi_oi; + + fid_to_ostid(fid, oi); + return ostid_id(oi) == IDIF_MAX_OID; + } else { + return fid_oid(fid) == LUSTRE_DATA_SEQ_MAX_WIDTH; + } +} + +static inline int osp_precreate_end_seq_nolock(const struct lu_env *env, + struct osp_device *osp) +{ + struct lu_fid *fid = &osp->opd_pre_last_created_fid; + + return osp_fid_end_seq(env, fid); +} + +static inline int osp_precreate_end_seq(const struct lu_env *env, + struct osp_device *osp) +{ + int rc; + + spin_lock(&osp->opd_pre_lock); + rc = osp_precreate_end_seq_nolock(env, osp); + spin_unlock(&osp->opd_pre_lock); + return rc; +} + +static inline int osp_is_fid_client(struct osp_device *osp) +{ + struct obd_import 
*imp = osp->opd_obd->u.cli.cl_import; + + return imp->imp_connect_data.ocd_connect_flags & OBD_CONNECT_FID; +} + +typedef int (*osp_async_request_interpreter_t)(const struct lu_env *env, + struct object_update_reply *rep, + struct ptlrpc_request *req, + struct osp_object *obj, + void *data, int index, int rc); + /* osp_dev.c */ -void osp_update_last_id(struct osp_device *d, obd_id objid); +void osp_update_last_id(struct osp_device *d, u64 objid); +extern struct llog_operations osp_mds_ost_orig_logops; + +/* osp_trans.c: async update requests and remote transaction entry points */ +int osp_insert_async_request(const struct lu_env *env, enum update_type op, + struct osp_object *obj, int count, __u16 *lens, + const void **bufs, void *data, + osp_async_request_interpreter_t interpreter); +int osp_unplug_async_request(const struct lu_env *env, + struct osp_device *osp, + struct dt_update_request *update); +struct thandle *osp_trans_create(const struct lu_env *env, + struct dt_device *d); +int osp_trans_start(const struct lu_env *env, struct dt_device *dt, + struct thandle *th); +int osp_prep_update_req(const struct lu_env *env, struct obd_import *imp, + const struct object_update_request *ureq, + struct ptlrpc_request **reqp); +int osp_remote_sync(const struct lu_env *env, struct osp_device *osp, + struct dt_update_request *update, + struct ptlrpc_request **reqp, bool rpc_lock); +/* osp_object.c: attr/xattr access and the declarations that follow in this section */ +int osp_attr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr); +int osp_xattr_get(const struct lu_env *env, struct dt_object *dt, + struct lu_buf *buf, const char *name); +int osp_declare_xattr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, const char *name, + int flag, struct thandle *th); +int osp_xattr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_buf *buf, const char *name, int fl, + struct thandle *th); +int osp_declare_xattr_del(const struct lu_env *env, struct dt_object *dt, + const char *name, struct thandle *th); +int osp_xattr_del(const 
struct lu_env *env, struct dt_object *dt, + const char *name, struct thandle *th); + +int osp_declare_object_destroy(const struct lu_env *env, + struct dt_object *dt, struct thandle *th); +int osp_object_destroy(const struct lu_env *env, struct dt_object *dt, + struct thandle *th); + +int osp_trans_stop(const struct lu_env *env, struct dt_device *dt, + struct thandle *th); + +struct dt_it *osp_it_init(const struct lu_env *env, struct dt_object *dt, + __u32 attr); +void osp_it_fini(const struct lu_env *env, struct dt_it *di); +int osp_it_get(const struct lu_env *env, struct dt_it *di, + const struct dt_key *key); +void osp_it_put(const struct lu_env *env, struct dt_it *di); +__u64 osp_it_store(const struct lu_env *env, const struct dt_it *di); +int osp_it_key_rec(const struct lu_env *env, const struct dt_it *di, + void *key_rec); +int osp_it_next_page(const struct lu_env *env, struct dt_it *di); +/* osp_md_object.c: MD object create / attr_set declarations and the MD index operations table */ +int osp_md_declare_object_create(const struct lu_env *env, + struct dt_object *dt, + struct lu_attr *attr, + struct dt_allocation_hint *hint, + struct dt_object_format *dof, + struct thandle *th); +int osp_md_object_create(const struct lu_env *env, struct dt_object *dt, + struct lu_attr *attr, struct dt_allocation_hint *hint, + struct dt_object_format *dof, struct thandle *th); +int __osp_md_attr_set(const struct lu_env *env, struct dt_object *dt, + const struct lu_attr *attr, struct thandle *th); +extern const struct dt_index_operations osp_md_index_ops; /* osp_precreate.c: precreate init/fini, id and fid reservation, status updates */ int osp_init_precreate(struct osp_device *d); int osp_precreate_reserve(const struct lu_env *env, struct osp_device *d); __u64 osp_precreate_get_id(struct osp_device *d); +int osp_precreate_get_fid(const struct lu_env *env, struct osp_device *d, + struct lu_fid *fid); void osp_precreate_fini(struct osp_device *d); int osp_object_truncate(const struct lu_env *env, struct dt_object *dt, __u64); void osp_pre_update_status(struct osp_device *d, int rc); void 
osp_statfs_need_now(struct osp_device *d); +int osp_reset_last_used(const struct lu_env *env, struct osp_device *osp); +int osp_write_last_oid_seq_files(struct lu_env *env, struct osp_device *osp, + struct lu_fid *fid, int sync); +int osp_init_pre_fid(struct osp_device *osp); /* lproc_osp.c: lprocfs initialisation for an OSP device */ -void lprocfs_osp_init_vars(struct lprocfs_static_vars *lvars); +void osp_lprocfs_init(struct osp_device *osp); /* osp_sync.c: sync declare/add, init, fini and work-check declarations */ int osp_sync_declare_add(const struct lu_env *env, struct osp_object *o, @@ -316,10 +616,8 @@ int osp_sync_init(const struct lu_env *env, struct osp_device *d); int osp_sync_fini(struct osp_device *d); void __osp_sync_check_for_work(struct osp_device *d); -/* osp_ost.c */ -int osp_init_for_ost(const struct lu_env *env, struct osp_device *m, - struct lu_device_type *ldt, struct lustre_cfg *cfg); -int osp_disconnect(struct osp_device *d); -int osp_fini_for_ost(struct osp_device *osp); +/* lwp_dev.c: LWP obd operations and device type */ +extern struct obd_ops lwp_obd_device_ops; +extern struct lu_device_type lwp_device_type; #endif