Google Groups no longer supports new Usenet posts or subscriptions. Historical content remains viewable.

Dismiss

[PATCH 11/26] ceph: address space operations

0 views

Skip to first unread message

Sage Weil

unread,

Mar 2, 2010, 7:50:02 PM3/2/10

The ceph address space methods are concerned primarily with managing
the dirty page accounting in the inode, which (among other things)
must keep track of which snapshot context each page was dirtied in,
and ensure that dirty data is written out to the OSDs in snapshort
order.

A writepage() on a page that is not currently writeable due to
snapshot writeback ordering constraints is ignored (it was presumably
called from kswapd).

Signed-off-by: Sage Weil <sa...@newdream.net>
---
fs/ceph/addr.c | 1188 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 1188 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/addr.c

diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c
new file mode 100644
index 0000000..23bb0ce
--- /dev/null
+++ b/fs/ceph/addr.c
@@ -0,0 +1,1188 @@
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/writeback.h> /* generic_writepages */
+#include <linux/pagevec.h>
+#include <linux/task_io_accounting_ops.h>
+
+#include "super.h"
+#include "osd_client.h"
+
+/*
+ * Ceph address space ops.
+ *
+ * There are a few funny things going on here.
+ *
+ * The page->private field is used to reference a struct
+ * ceph_snap_context for _every_ dirty page. This indicates which
+ * snapshot the page was logically dirtied in, and thus which snap
+ * context needs to be associated with the osd write during writeback.
+ *
+ * Similarly, struct ceph_inode_info maintains a set of counters to
+ * count dirty pages on the inode. In the absense of snapshots,
+ * i_wrbuffer_ref == i_wrbuffer_ref_head == the dirty page count.
+ *
+ * When a snapshot is taken (that is, when the client receives
+ * notification that a snapshot was taken), each inode with caps and
+ * with dirty pages (dirty pages implies there is a cap) gets a new
+ * ceph_cap_snap in the i_cap_snaps list (which is sorted in ascending
+ * order, new snaps go to the tail). The i_wrbuffer_ref_head count is
+ * moved to capsnap->dirty. (Unless a sync write is currently in
+ * progress. In that case, the capsnap is said to be "pending", new
+ * writes cannot start, and the capsnap isn't "finalized" until the
+ * write completes (or fails) and a final size/mtime for the inode for
+ * that snap can be settled upon.) i_wrbuffer_ref_head is reset to 0.
+ *
+ * On writeback, we must submit writes to the osd IN SNAP ORDER. So,
+ * we look for the first capsnap in i_cap_snaps and write out pages in
+ * that snap context _only_. Then we move on to the next capsnap,
+ * eventually reaching the "live" or "head" context (i.e., pages that
+ * are not yet snapped) and are writing the most recently dirtied
+ * pages.
+ *
+ * Invalidate and so forth must take care to ensure the dirty page
+ * accounting is preserved.
+ */
+
+#define CONGESTION_ON_THRESH(congestion_kb) (congestion_kb >> (PAGE_SHIFT-10))
+#define CONGESTION_OFF_THRESH(congestion_kb) \
+ (CONGESTION_ON_THRESH(congestion_kb) - \
+ (CONGESTION_ON_THRESH(congestion_kb) >> 2))
+
+
+
+/*
+ * Dirty a page. Optimistically adjust accounting, on the assumption
+ * that we won't race with invalidate. If we do, readjust.
+ */
+static int ceph_set_page_dirty(struct page *page)
+{
+ struct address_space *mapping = page->mapping;
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ int undo = 0;
+ struct ceph_snap_context *snapc;
+
+ if (unlikely(!mapping))
+ return !TestSetPageDirty(page);
+
+ if (TestSetPageDirty(page)) {
+ dout("%p set_page_dirty %p idx %lu -- already dirty\n",
+ mapping->host, page, page->index);
+ return 0;
+ }
+
+ inode = mapping->host;
+ ci = ceph_inode(inode);
+
+ /*
+ * Note that we're grabbing a snapc ref here without holding
+ * any locks!
+ */
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+
+ /* dirty the head */
+ spin_lock(&inode->i_lock);
+ if (ci->i_wrbuffer_ref_head == 0)
+ ci->i_head_snapc = ceph_get_snap_context(snapc);
+ ++ci->i_wrbuffer_ref_head;
+ if (ci->i_wrbuffer_ref == 0)
+ igrab(inode);
+ ++ci->i_wrbuffer_ref;
+ dout("%p set_page_dirty %p idx %lu head %d/%d -> %d/%d "
+ "snapc %p seq %lld (%d snaps)\n",
+ mapping->host, page, page->index,
+ ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref_head-1,
+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
+ snapc, snapc->seq, snapc->num_snaps);
+ spin_unlock(&inode->i_lock);
+
+ /* now adjust page */
+ spin_lock_irq(&mapping->tree_lock);
+ if (page->mapping) { /* Race with truncate? */
+ WARN_ON_ONCE(!PageUptodate(page));
+
+ if (mapping_cap_account_dirty(mapping)) {
+ __inc_zone_page_state(page, NR_FILE_DIRTY);
+ __inc_bdi_stat(mapping->backing_dev_info,
+ BDI_RECLAIMABLE);
+ task_io_account_write(PAGE_CACHE_SIZE);
+ }
+ radix_tree_tag_set(&mapping->page_tree,
+ page_index(page), PAGECACHE_TAG_DIRTY);
+
+ /*
+ * Reference snap context in page->private. Also set
+ * PagePrivate so that we get invalidatepage callback.
+ */
+ page->private = (unsigned long)snapc;
+ SetPagePrivate(page);
+ } else {
+ dout("ANON set_page_dirty %p (raced truncate?)\n", page);
+ undo = 1;
+ }
+
+ spin_unlock_irq(&mapping->tree_lock);
+
+ if (undo)
+ /* whoops, we failed to dirty the page */
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+
+ __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
+
+ BUG_ON(!PageDirty(page));
+ return 1;
+}
+
+/*
+ * If we are truncating the full page (i.e. offset == 0), adjust the
+ * dirty page counters appropriately. Only called if there is private
+ * data on the page.
+ */
+static void ceph_invalidatepage(struct page *page, unsigned long offset)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_snap_context *snapc = (void *)page->private;
+
+ BUG_ON(!PageLocked(page));
+ BUG_ON(!page->private);
+ BUG_ON(!PagePrivate(page));
+ BUG_ON(!page->mapping);
+
+ inode = page->mapping->host;
+
+ /*
+ * We can get non-dirty pages here due to races between
+ * set_page_dirty and truncate_complete_page; just spit out a
+ * warning, in case we end up with accounting problems later.
+ */
+ if (!PageDirty(page))
+ pr_err("%p invalidatepage %p page not dirty\n", inode, page);
+
+ if (offset == 0)
+ ClearPageChecked(page);
+
+ ci = ceph_inode(inode);
+ if (offset == 0) {
+ dout("%p invalidatepage %p idx %lu full dirty page %lu\n",
+ inode, page, page->index, offset);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+ page->private = 0;
+ ClearPagePrivate(page);
+ } else {
+ dout("%p invalidatepage %p idx %lu partial dirty page\n",
+ inode, page, page->index);
+ }
+}
+
+/* just a sanity check */
+static int ceph_releasepage(struct page *page, gfp_t g)
+{
+ struct inode *inode = page->mapping ? page->mapping->host : NULL;
+ dout("%p releasepage %p idx %lu\n", inode, page, page->index);
+ WARN_ON(PageDirty(page));
+ WARN_ON(page->private);
+ WARN_ON(PagePrivate(page));
+ return 0;
+}
+
+/*
+ * read a single page, without unlocking it.
+ */
+static int readpage_nounlock(struct file *filp, struct page *page)
+{
+ struct inode *inode = filp->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int err = 0;
+ u64 len = PAGE_CACHE_SIZE;
+
+ dout("readpage inode %p file %p page %p index %lu\n",
+ inode, filp, page, page->index);
+ err = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ page->index << PAGE_CACHE_SHIFT, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &page, 1);
+ if (err == -ENOENT)
+ err = 0;
+ if (err < 0) {
+ SetPageError(page);
+ goto out;
+ } else if (err < PAGE_CACHE_SIZE) {
+ /* zero fill remainder of page */
+ zero_user_segment(page, err, PAGE_CACHE_SIZE);
+ }
+ SetPageUptodate(page);
+
+out:
+ return err < 0 ? err : 0;
+}
+
+static int ceph_readpage(struct file *filp, struct page *page)
+{
+ int r = readpage_nounlock(filp, page);
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * Build a vector of contiguous pages from the provided page list.
+ */
+static struct page **page_vector_from_list(struct list_head *page_list,
+ unsigned *nr_pages)
+{
+ struct page **pages;
+ struct page *page;
+ int next_index, contig_pages = 0;
+
+ /* build page vector */
+ pages = kmalloc(sizeof(*pages) * *nr_pages, GFP_NOFS);
+ if (!pages)
+ return ERR_PTR(-ENOMEM);
+
+ BUG_ON(list_empty(page_list));
+ next_index = list_entry(page_list->prev, struct page, lru)->index;
+ list_for_each_entry_reverse(page, page_list, lru) {
+ if (page->index == next_index) {
+ dout("readpages page %d %p\n", contig_pages, page);
+ pages[contig_pages] = page;
+ contig_pages++;
+ next_index++;
+ } else {
+ break;
+ }
+ }
+ *nr_pages = contig_pages;
+ return pages;
+}
+
+/*
+ * Read multiple pages. Leave pages we don't read + unlock in page_list;
+ * the caller (VM) cleans them up.
+ */
+static int ceph_readpages(struct file *file, struct address_space *mapping,
+ struct list_head *page_list, unsigned nr_pages)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_osd_client *osdc = &ceph_inode_to_client(inode)->osdc;
+ int rc = 0;
+ struct page **pages;
+ struct pagevec pvec;
+ loff_t offset;
+ u64 len;
+
+ dout("readpages %p file %p nr_pages %d\n",
+ inode, file, nr_pages);
+
+ pages = page_vector_from_list(page_list, &nr_pages);
+ if (IS_ERR(pages))
+ return PTR_ERR(pages);
+
+ /* guess read extent */
+ offset = pages[0]->index << PAGE_CACHE_SHIFT;
+ len = nr_pages << PAGE_CACHE_SHIFT;
+ rc = ceph_osdc_readpages(osdc, ceph_vino(inode), &ci->i_layout,
+ offset, &len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ pages, nr_pages);
+ if (rc == -ENOENT)
+ rc = 0;
+ if (rc < 0)
+ goto out;
+
+ /* set uptodate and add to lru in pagevec-sized chunks */
+ pagevec_init(&pvec, 0);
+ for (; !list_empty(page_list) && len > 0;
+ rc -= PAGE_CACHE_SIZE, len -= PAGE_CACHE_SIZE) {
+ struct page *page =
+ list_entry(page_list->prev, struct page, lru);
+
+ list_del(&page->lru);
+
+ if (rc < (int)PAGE_CACHE_SIZE) {
+ /* zero (remainder of) page */
+ int s = rc < 0 ? 0 : rc;
+ zero_user_segment(page, s, PAGE_CACHE_SIZE);
+ }
+
+ if (add_to_page_cache(page, mapping, page->index, GFP_NOFS)) {
+ page_cache_release(page);
+ dout("readpages %p add_to_page_cache failed %p\n",
+ inode, page);
+ continue;
+ }
+ dout("readpages %p adding %p idx %lu\n", inode, page,
+ page->index);
+ flush_dcache_page(page);
+ SetPageUptodate(page);
+ unlock_page(page);
+ if (pagevec_add(&pvec, page) == 0)
+ pagevec_lru_add_file(&pvec); /* add to lru */
+ }
+ pagevec_lru_add_file(&pvec);
+ rc = 0;
+
+out:
+ kfree(pages);
+ return rc;
+}
+
+/*
+ * Get ref for the oldest snapc for an inode with dirty data... that is, the
+ * only snap context we are allowed to write back.
+ *
+ * Caller holds i_lock.
+ */
+static struct ceph_snap_context *__get_oldest_context(struct inode *inode,
+ u64 *snap_size)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_snap_context *snapc = NULL;
+ struct ceph_cap_snap *capsnap = NULL;
+
+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
+ dout(" cap_snap %p snapc %p has %d dirty pages\n", capsnap,
+ capsnap->context, capsnap->dirty_pages);
+ if (capsnap->dirty_pages) {
+ snapc = ceph_get_snap_context(capsnap->context);
+ if (snap_size)
+ *snap_size = capsnap->size;
+ break;
+ }
+ }
+ if (!snapc && ci->i_snap_realm) {
+ snapc = ceph_get_snap_context(ci->i_snap_realm->cached_context);
+ dout(" head snapc %p has %d dirty pages\n",
+ snapc, ci->i_wrbuffer_ref_head);
+ }
+ return snapc;
+}
+
+static struct ceph_snap_context *get_oldest_context(struct inode *inode,
+ u64 *snap_size)
+{
+ struct ceph_snap_context *snapc = NULL;
+
+ spin_lock(&inode->i_lock);
+ snapc = __get_oldest_context(inode, snap_size);
+ spin_unlock(&inode->i_lock);
+ return snapc;
+}
+
+/*
+ * Write a single page, but leave the page locked.
+ *
+ * If we get a write error, set the page error bit, but still adjust the
+ * dirty page accounting (i.e., page is no longer dirty).
+ */
+static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
+{
+ struct inode *inode;
+ struct ceph_inode_info *ci;
+ struct ceph_client *client;
+ struct ceph_osd_client *osdc;
+ loff_t page_off = page->index << PAGE_CACHE_SHIFT;
+ int len = PAGE_CACHE_SIZE;
+ loff_t i_size;
+ int err = 0;
+ struct ceph_snap_context *snapc;
+ u64 snap_size = 0;
+ long writeback_stat;
+
+ dout("writepage %p idx %lu\n", page, page->index);
+
+ if (!page->mapping || !page->mapping->host) {
+ dout("writepage %p - no mapping\n", page);
+ return -EFAULT;
+ }
+ inode = page->mapping->host;
+ ci = ceph_inode(inode);
+ client = ceph_inode_to_client(inode);
+ osdc = &client->osdc;
+
+ /* verify this is a writeable snap context */
+ snapc = (void *)page->private;
+ if (snapc == NULL) {
+ dout("writepage %p page %p not dirty?\n", inode, page);
+ goto out;
+ }
+ if (snapc != get_oldest_context(inode, &snap_size)) {
+ dout("writepage %p page %p snapc %p not writeable - noop\n",
+ inode, page, (void *)page->private);
+ /* we should only noop if called by kswapd */
+ WARN_ON((current->flags & PF_MEMALLOC) == 0);
+ goto out;
+ }
+
+ /* is this a partial page at end of file? */
+ if (snap_size)
+ i_size = snap_size;
+ else
+ i_size = i_size_read(inode);
+ if (i_size < page_off + len)
+ len = i_size - page_off;
+
+ dout("writepage %p page %p index %lu on %llu~%u\n",
+ inode, page, page->index, page_off, len);
+
+ writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ if (writeback_stat >
+ CONGESTION_ON_THRESH(client->mount_args->congestion_kb))
+ set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+
+ set_page_writeback(page);
+ err = ceph_osdc_writepages(osdc, ceph_vino(inode),
+ &ci->i_layout, snapc,
+ page_off, len,
+ ci->i_truncate_seq, ci->i_truncate_size,
+ &inode->i_mtime,
+ &page, 1, 0, 0, true);
+ if (err < 0) {
+ dout("writepage setting page/mapping error %d %p\n", err, page);
+ SetPageError(page);
+ mapping_set_error(&inode->i_data, err);
+ if (wbc)
+ wbc->pages_skipped++;
+ } else {
+ dout("writepage cleaned page %p\n", page);
+ err = 0; /* vfs expects us to return 0 */
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ end_page_writeback(page);
+ ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
+ ceph_put_snap_context(snapc);
+out:
+ return err;
+}
+
+static int ceph_writepage(struct page *page, struct writeback_control *wbc)
+{
+ int err;
+ struct inode *inode = page->mapping->host;
+ BUG_ON(!inode);
+ igrab(inode);
+ err = writepage_nounlock(page, wbc);
+ unlock_page(page);
+ iput(inode);
+ return err;
+}
+
+
+/*
+ * lame release_pages helper. release_pages() isn't exported to
+ * modules.
+ */
+static void ceph_release_pages(struct page **pages, int num)
+{
+ struct pagevec pvec;
+ int i;
+
+ pagevec_init(&pvec, 0);
+ for (i = 0; i < num; i++) {
+ if (pagevec_add(&pvec, pages[i]) == 0)
+ pagevec_release(&pvec);
+ }
+ pagevec_release(&pvec);
+}
+
+
+/*
+ * async writeback completion handler.
+ *
+ * If we get an error, set the mapping error bit, but not the individual
+ * page error bits.
+ */
+static void writepages_finish(struct ceph_osd_request *req,
+ struct ceph_msg *msg)
+{
+ struct inode *inode = req->r_inode;
+ struct ceph_osd_reply_head *replyhead;
+ struct ceph_osd_op *op;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ unsigned wrote;
+ struct page *page;
+ int i;
+ struct ceph_snap_context *snapc = req->r_snapc;
+ struct address_space *mapping = inode->i_mapping;
+ struct writeback_control *wbc = req->r_wbc;
+ __s32 rc = -EIO;
+ u64 bytes = 0;
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ long writeback_stat;
+ unsigned issued = __ceph_caps_issued(ci, NULL);
+
+ /* parse reply */
+ replyhead = msg->front.iov_base;
+ WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
+ op = (void *)(replyhead + 1);
+ rc = le32_to_cpu(replyhead->result);
+ bytes = le64_to_cpu(op->extent.length);
+
+ if (rc >= 0) {
+ /*
+ * Assume we wrote the pages we originally sent. The
+ * osd might reply with fewer pages if our writeback
+ * raced with a truncation and was adjusted at the osd,
+ * so don't believe the reply.
+ */
+ wrote = req->r_num_pages;
+ } else {
+ wrote = 0;
+ mapping_set_error(mapping, rc);
+ }
+ dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
+ inode, rc, bytes, wrote);
+
+ /* clean all pages */
+ for (i = 0; i < req->r_num_pages; i++) {
+ page = req->r_pages[i];
+ BUG_ON(!page);
+ WARN_ON(!PageUptodate(page));
+
+ writeback_stat =
+ atomic_long_dec_return(&client->writeback_count);
+ if (writeback_stat <
+ CONGESTION_OFF_THRESH(client->mount_args->congestion_kb))
+ clear_bdi_congested(&client->backing_dev_info,
+ BLK_RW_ASYNC);
+
+ if (i >= wrote) {
+ dout("inode %p skipping page %p\n", inode, page);
+ wbc->pages_skipped++;
+ }
+ page->private = 0;
+ ClearPagePrivate(page);
+ ceph_put_snap_context(snapc);
+ dout("unlocking %d %p\n", i, page);
+ end_page_writeback(page);
+
+ /*
+ * We lost the cache cap, need to truncate the page before
+ * it is unlocked, otherwise we'd truncate it later in the
+ * page truncation thread, possibly losing some data that
+ * raced its way in
+ */
+ if ((issued & CEPH_CAP_FILE_CACHE) == 0)
+ generic_error_remove_page(inode->i_mapping, page);
+
+ unlock_page(page);
+ }
+ dout("%p wrote+cleaned %d pages\n", inode, wrote);
+ ceph_put_wrbuffer_cap_refs(ci, req->r_num_pages, snapc);
+
+ ceph_release_pages(req->r_pages, req->r_num_pages);
+ if (req->r_pages_from_pool)
+ mempool_free(req->r_pages,
+ ceph_client(inode->i_sb)->wb_pagevec_pool);
+ else
+ kfree(req->r_pages);
+ ceph_osdc_put_request(req);
+}
+
+/*
+ * allocate a page vec, either directly, or if necessary, via a the
+ * mempool. we avoid the mempool if we can because req->r_num_pages
+ * may be less than the maximum write size.
+ */
+static void alloc_page_vec(struct ceph_client *client,
+ struct ceph_osd_request *req)
+{
+ req->r_pages = kmalloc(sizeof(struct page *) * req->r_num_pages,
+ GFP_NOFS);
+ if (!req->r_pages) {
+ req->r_pages = mempool_alloc(client->wb_pagevec_pool, GFP_NOFS);
+ req->r_pages_from_pool = 1;
+ WARN_ON(!req->r_pages);
+ }
+}
+
+/*
+ * initiate async writeback
+ */
+static int ceph_writepages_start(struct address_space *mapping,
+ struct writeback_control *wbc)
+{
+ struct inode *inode = mapping->host;
+ struct backing_dev_info *bdi = mapping->backing_dev_info;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_client *client;
+ pgoff_t index, start, end;
+ int range_whole = 0;
+ int should_loop = 1;
+ pgoff_t max_pages = 0, max_pages_ever = 0;
+ struct ceph_snap_context *snapc = NULL, *last_snapc = NULL;
+ struct pagevec pvec;
+ int done = 0;
+ int rc = 0;
+ unsigned wsize = 1 << inode->i_blkbits;
+ struct ceph_osd_request *req = NULL;
+ int do_sync;
+ u64 snap_size = 0;
+
+ /*
+ * Include a 'sync' in the OSD request if this is a data
+ * integrity write (e.g., O_SYNC write or fsync()), or if our
+ * cap is being revoked.
+ */
+ do_sync = wbc->sync_mode == WB_SYNC_ALL;
+ if (ceph_caps_revoking(ci, CEPH_CAP_FILE_BUFFER))
+ do_sync = 1;
+ dout("writepages_start %p dosync=%d (mode=%s)\n",
+ inode, do_sync,
+ wbc->sync_mode == WB_SYNC_NONE ? "NONE" :
+ (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD"));
+
+ client = ceph_inode_to_client(inode);
+ if (client->mount_state == CEPH_MOUNT_SHUTDOWN) {
+ pr_warning("writepage_start %p on forced umount\n", inode);
+ return -EIO; /* we're in a forced umount, don't write! */
+ }
+ if (client->mount_args->wsize && client->mount_args->wsize < wsize)
+ wsize = client->mount_args->wsize;
+ if (wsize < PAGE_CACHE_SIZE)
+ wsize = PAGE_CACHE_SIZE;
+ max_pages_ever = wsize >> PAGE_CACHE_SHIFT;
+
+ pagevec_init(&pvec, 0);
+
+ /* ?? */
+ if (wbc->nonblocking && bdi_write_congested(bdi)) {
+ dout(" writepages congested\n");
+ wbc->encountered_congestion = 1;
+ goto out_final;
+ }
+
+ /* where to start/end? */
+ if (wbc->range_cyclic) {
+ start = mapping->writeback_index; /* Start from prev offset */
+ end = -1;
+ dout(" cyclic, start at %lu\n", start);
+ } else {
+ start = wbc->range_start >> PAGE_CACHE_SHIFT;
+ end = wbc->range_end >> PAGE_CACHE_SHIFT;
+ if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
+ range_whole = 1;
+ should_loop = 0;
+ dout(" not cyclic, %lu to %lu\n", start, end);
+ }
+ index = start;
+
+retry:
+ /* find oldest snap context with dirty data */
+ ceph_put_snap_context(snapc);
+ snapc = get_oldest_context(inode, &snap_size);
+ if (!snapc) {
+ /* hmm, why does writepages get called when there
+ is no dirty data? */
+ dout(" no snap context with dirty data?\n");
+ goto out;
+ }
+ dout(" oldest snapc is %p seq %lld (%d snaps)\n",
+ snapc, snapc->seq, snapc->num_snaps);
+ if (last_snapc && snapc != last_snapc) {
+ /* if we switched to a newer snapc, restart our scan at the
+ * start of the original file range. */
+ dout(" snapc differs from last pass, restarting at %lu\n",
+ index);
+ index = start;
+ }
+ last_snapc = snapc;
+
+ while (!done && index <= end) {
+ unsigned i;
+ int first;
+ pgoff_t next;
+ int pvec_pages, locked_pages;
+ struct page *page;
+ int want;
+ u64 offset, len;
+ struct ceph_osd_request_head *reqhead;
+ struct ceph_osd_op *op;
+ long writeback_stat;
+
+ next = 0;
+ locked_pages = 0;
+ max_pages = max_pages_ever;
+
+get_more_pages:
+ first = -1;
+ want = min(end - index,
+ min((pgoff_t)PAGEVEC_SIZE,
+ max_pages - (pgoff_t)locked_pages) - 1)
+ + 1;
+ pvec_pages = pagevec_lookup_tag(&pvec, mapping, &index,
+ PAGECACHE_TAG_DIRTY,
+ want);
+ dout("pagevec_lookup_tag got %d\n", pvec_pages);
+ if (!pvec_pages && !locked_pages)
+ break;
+ for (i = 0; i < pvec_pages && locked_pages < max_pages; i++) {
+ page = pvec.pages[i];
+ dout("? %p idx %lu\n", page, page->index);
+ if (locked_pages == 0)
+ lock_page(page); /* first page */
+ else if (!trylock_page(page))
+ break;
+
+ /* only dirty pages, or our accounting breaks */
+ if (unlikely(!PageDirty(page)) ||
+ unlikely(page->mapping != mapping)) {
+ dout("!dirty or !mapping %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (!wbc->range_cyclic && page->index > end) {
+ dout("end of range %p\n", page);
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (next && (page->index != next)) {
+ dout("not consecutive %p\n", page);
+ unlock_page(page);
+ break;
+ }
+ if (wbc->sync_mode != WB_SYNC_NONE) {
+ dout("waiting on writeback %p\n", page);
+ wait_on_page_writeback(page);
+ }
+ if ((snap_size && page_offset(page) > snap_size) ||
+ (!snap_size &&
+ page_offset(page) > i_size_read(inode))) {
+ dout("%p page eof %llu\n", page, snap_size ?
+ snap_size : i_size_read(inode));
+ done = 1;
+ unlock_page(page);
+ break;
+ }
+ if (PageWriteback(page)) {
+ dout("%p under writeback\n", page);
+ unlock_page(page);
+ break;
+ }
+
+ /* only if matching snap context */
+ if (snapc != (void *)page->private) {
+ dout("page snapc %p != oldest %p\n",
+ (void *)page->private, snapc);
+ unlock_page(page);
+ if (!locked_pages)
+ continue; /* keep looking for snap */
+ break;
+ }
+
+ if (!clear_page_dirty_for_io(page)) {
+ dout("%p !clear_page_dirty_for_io\n", page);
+ unlock_page(page);
+ break;
+ }
+
+ /* ok */
+ if (locked_pages == 0) {
+ /* prepare async write request */
+ offset = page->index << PAGE_CACHE_SHIFT;
+ len = wsize;
+ req = ceph_osdc_new_request(&client->osdc,
+ &ci->i_layout,
+ ceph_vino(inode),
+ offset, &len,
+ CEPH_OSD_OP_WRITE,
+ CEPH_OSD_FLAG_WRITE |
+ CEPH_OSD_FLAG_ONDISK,
+ snapc, do_sync,
+ ci->i_truncate_seq,
+ ci->i_truncate_size,
+ &inode->i_mtime, true, 1);
+ max_pages = req->r_num_pages;
+
+ alloc_page_vec(client, req);
+ req->r_callback = writepages_finish;
+ req->r_inode = inode;
+ req->r_wbc = wbc;
+ }
+
+ /* note position of first page in pvec */
+ if (first < 0)
+ first = i;
+ dout("%p will write page %p idx %lu\n",
+ inode, page, page->index);
+
+ writeback_stat = atomic_long_inc_return(&client->writeback_count);
+ if (writeback_stat > CONGESTION_ON_THRESH(client->mount_args->congestion_kb)) {
+ set_bdi_congested(&client->backing_dev_info, BLK_RW_ASYNC);
+ }
+
+ set_page_writeback(page);
+ req->r_pages[locked_pages] = page;
+ locked_pages++;
+ next = page->index + 1;
+ }
+
+ /* did we get anything? */
+ if (!locked_pages)
+ goto release_pvec_pages;
+ if (i) {
+ int j;
+ BUG_ON(!locked_pages || first < 0);
+
+ if (pvec_pages && i == pvec_pages &&
+ locked_pages < max_pages) {
+ dout("reached end pvec, trying for more\n");
+ pagevec_reinit(&pvec);
+ goto get_more_pages;
+ }
+
+ /* shift unused pages over in the pvec... we
+ * will need to release them below. */
+ for (j = i; j < pvec_pages; j++) {
+ dout(" pvec leftover page %p\n",
+ pvec.pages[j]);
+ pvec.pages[j-i+first] = pvec.pages[j];
+ }
+ pvec.nr -= i-first;
+ }
+
+ /* submit the write */
+ offset = req->r_pages[0]->index << PAGE_CACHE_SHIFT;
+ len = min((snap_size ? snap_size : i_size_read(inode)) - offset,
+ (u64)locked_pages << PAGE_CACHE_SHIFT);
+ dout("writepages got %d pages at %llu~%llu\n",
+ locked_pages, offset, len);
+
+ /* revise final length, page count */
+ req->r_num_pages = locked_pages;
+ reqhead = req->r_request->front.iov_base;
+ op = (void *)(reqhead + 1);
+ op->extent.length = cpu_to_le64(len);
+ op->payload_len = cpu_to_le32(len);
+ req->r_request->hdr.data_len = cpu_to_le32(len);
+
+ ceph_osdc_start_request(&client->osdc, req, true);
+ req = NULL;
+
+ /* continue? */
+ index = next;
+ wbc->nr_to_write -= locked_pages;
+ if (wbc->nr_to_write <= 0)
+ done = 1;
+
+release_pvec_pages:
+ dout("pagevec_release on %d pages (%p)\n", (int)pvec.nr,
+ pvec.nr ? pvec.pages[0] : NULL);
+ pagevec_release(&pvec);
+
+ if (locked_pages && !done)
+ goto retry;
+ }
+
+ if (should_loop && !done) {
+ /* more to do; loop back to beginning of file */
+ dout("writepages looping back to beginning of file\n");
+ should_loop = 0;
+ index = 0;
+ goto retry;
+ }
+
+ if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
+ mapping->writeback_index = index;
+
+out:
+ if (req)
+ ceph_osdc_put_request(req);
+ if (rc > 0)
+ rc = 0; /* vfs expects us to return 0 */
+ ceph_put_snap_context(snapc);
+ dout("writepages done, rc = %d\n", rc);
+out_final:
+ return rc;
+}
+
+
+
+/*
+ * See if a given @snapc is either writeable, or already written.
+ */
+static int context_is_writeable_or_written(struct inode *inode,
+ struct ceph_snap_context *snapc)
+{
+ struct ceph_snap_context *oldest = get_oldest_context(inode, NULL);
+ return !oldest || snapc->seq <= oldest->seq;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_update_writeable_page(struct file *file,
+ loff_t pos, unsigned len,
+ struct page *page)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ loff_t page_off = pos & PAGE_CACHE_MASK;
+ int pos_in_page = pos & ~PAGE_CACHE_MASK;
+ int end_in_page = pos_in_page + len;
+ loff_t i_size;
+ struct ceph_snap_context *snapc;
+ int r;
+
+retry_locked:
+ /* writepages currently holds page lock, but if we change that later, */
+ wait_on_page_writeback(page);
+
+ /* check snap context */
+ BUG_ON(!ci->i_snap_realm);
+ down_read(&mdsc->snap_rwsem);
+ BUG_ON(!ci->i_snap_realm->cached_context);
+ if (page->private &&
+ (void *)page->private != ci->i_snap_realm->cached_context) {
+ /*
+ * this page is already dirty in another (older) snap
+ * context! is it writeable now?
+ */
+ snapc = get_oldest_context(inode, NULL);
+ up_read(&mdsc->snap_rwsem);
+
+ if (snapc != (void *)page->private) {
+ dout(" page %p snapc %p not current or oldest\n",
+ page, (void *)page->private);
+ /*
+ * queue for writeback, and wait for snapc to
+ * be writeable or written
+ */
+ snapc = ceph_get_snap_context((void *)page->private);
+ unlock_page(page);
+ ceph_queue_writeback(inode);
+ wait_event_interruptible(ci->i_cap_wq,
+ context_is_writeable_or_written(inode, snapc));
+ ceph_put_snap_context(snapc);
+ return -EAGAIN;
+ }
+
+ /* yay, writeable, do it now (without dropping page lock) */
+ dout(" page %p snapc %p not current, but oldest\n",
+ page, snapc);
+ if (!clear_page_dirty_for_io(page))
+ goto retry_locked;
+ r = writepage_nounlock(page, NULL);
+ if (r < 0)
+ goto fail_nosnap;
+ goto retry_locked;
+ }
+
+ if (PageUptodate(page)) {
+ dout(" page %p already uptodate\n", page);
+ return 0;
+ }
+
+ /* full page? */
+ if (pos_in_page == 0 && len == PAGE_CACHE_SIZE)
+ return 0;
+
+ /* past end of file? */
+ i_size = inode->i_size; /* caller holds i_mutex */
+
+ if (i_size + len > inode->i_sb->s_maxbytes) {
+ /* file is too big */
+ r = -EINVAL;
+ goto fail;
+ }
+
+ if (page_off >= i_size ||
+ (pos_in_page == 0 && (pos+len) >= i_size &&
+ end_in_page - pos_in_page != PAGE_CACHE_SIZE)) {
+ dout(" zeroing %p 0 - %d and %d - %d\n",
+ page, pos_in_page, end_in_page, (int)PAGE_CACHE_SIZE);
+ zero_user_segments(page,
+ 0, pos_in_page,
+ end_in_page, PAGE_CACHE_SIZE);
+ return 0;
+ }
+
+ /* we need to read it. */
+ up_read(&mdsc->snap_rwsem);
+ r = readpage_nounlock(file, page);
+ if (r < 0)
+ goto fail_nosnap;
+ goto retry_locked;
+
+fail:
+ up_read(&mdsc->snap_rwsem);
+fail_nosnap:
+ unlock_page(page);
+ return r;
+}
+
+/*
+ * We are only allowed to write into/dirty the page if the page is
+ * clean, or already dirty within the same snap context.
+ */
+static int ceph_write_begin(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned flags,
+ struct page **pagep, void **fsdata)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct page *page;
+ pgoff_t index = pos >> PAGE_CACHE_SHIFT;
+ int r;
+
+ do {
+ /* get a page*/
+ page = grab_cache_page_write_begin(mapping, index, 0);
+ if (!page)
+ return -ENOMEM;
+ *pagep = page;
+
+ dout("write_begin file %p inode %p page %p %d~%d\n", file,
+ inode, page, (int)pos, (int)len);
+
+ r = ceph_update_writeable_page(file, pos, len, page);
+ } while (r == -EAGAIN);
+
+ return r;
+}
+
+/*
+ * we don't do anything in here that simple_write_end doesn't do
+ * except adjust dirty page accounting and drop read lock on
+ * mdsc->snap_rwsem.
+ */
+static int ceph_write_end(struct file *file, struct address_space *mapping,
+ loff_t pos, unsigned len, unsigned copied,
+ struct page *page, void *fsdata)
+{
+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ unsigned from = pos & (PAGE_CACHE_SIZE - 1);
+ int check_cap = 0;
+
+ dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
+ inode, page, (int)pos, (int)copied, (int)len);
+
+ /* zero the stale part of the page if we did a short copy */
+ if (copied < len)
+ zero_user_segment(page, from+copied, len);
+
+ /* did file size increase? */
+ /* (no need for i_size_read(); we caller holds i_mutex */
+ if (pos+copied > inode->i_size)
+ check_cap = ceph_inode_set_size(inode, pos+copied);
+
+ if (!PageUptodate(page))
+ SetPageUptodate(page);
+
+ set_page_dirty(page);
+
+ unlock_page(page);
+ up_read(&mdsc->snap_rwsem);
+ page_cache_release(page);
+
+ if (check_cap)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_AUTHONLY, NULL);
+
+ return copied;
+}
+
+/*
+ * we set .direct_IO to indicate direct io is supported, but since we
+ * intercept O_DIRECT reads and writes early, this function should
+ * never get called.
+ */
+static ssize_t ceph_direct_io(int rw, struct kiocb *iocb,
+ const struct iovec *iov,
+ loff_t pos, unsigned long nr_segs)
+{
+ WARN_ON(1);
+ return -EINVAL;
+}
+
+const struct address_space_operations ceph_aops = {
+ .readpage = ceph_readpage,
+ .readpages = ceph_readpages,
+ .writepage = ceph_writepage,
+ .writepages = ceph_writepages_start,
+ .write_begin = ceph_write_begin,
+ .write_end = ceph_write_end,
+ .set_page_dirty = ceph_set_page_dirty,
+ .invalidatepage = ceph_invalidatepage,
+ .releasepage = ceph_releasepage,
+ .direct_IO = ceph_direct_io,
+};
+
+
+/*
+ * vm ops
+ */
+
+/*
+ * Reuse write_begin here for simplicity.
+ */
+static int ceph_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf)
+{
+ struct inode *inode = vma->vm_file->f_dentry->d_inode;
+ struct page *page = vmf->page;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ loff_t off = page->index << PAGE_CACHE_SHIFT;
+ loff_t size, len;
+ int ret;
+
+ size = i_size_read(inode);
+ if (off + PAGE_CACHE_SIZE <= size)
+ len = PAGE_CACHE_SIZE;
+ else
+ len = size & ~PAGE_CACHE_MASK;
+
+ dout("page_mkwrite %p %llu~%llu page %p idx %lu\n", inode,
+ off, len, page, page->index);
+
+ lock_page(page);
+
+ ret = VM_FAULT_NOPAGE;
+ if ((off > size) ||
+ (page->mapping != inode->i_mapping))
+ goto out;
+
+ ret = ceph_update_writeable_page(vma->vm_file, off, len, page);
+ if (ret == 0) {
+ /* success. we'll keep the page locked. */
+ set_page_dirty(page);
+ up_read(&mdsc->snap_rwsem);
+ ret = VM_FAULT_LOCKED;
+ } else {
+ if (ret == -ENOMEM)
+ ret = VM_FAULT_OOM;
+ else
+ ret = VM_FAULT_SIGBUS;
+ }
+out:
+ dout("page_mkwrite %p %llu~%llu = %d\n", inode, off, len, ret);
+ if (ret != VM_FAULT_LOCKED)
+ unlock_page(page);
+ return ret;
+}
+
+static struct vm_operations_struct ceph_vmops = {
+ .fault = filemap_fault,
+ .page_mkwrite = ceph_page_mkwrite,
+};
+
+int ceph_mmap(struct file *file, struct vm_area_struct *vma)
+{
+ struct address_space *mapping = file->f_mapping;
+
+ if (!mapping->a_ops->readpage)
+ return -ENOEXEC;
+ file_accessed(file);
+ vma->vm_ops = &ceph_vmops;
+ vma->vm_flags |= VM_CAN_NONLINEAR;
+ return 0;
+}
--
1.7.0

--
To unsubscribe from this list: send the line "unsubscribe linux-kernel" in
the body of a message to majo...@vger.kernel.org
More majordomo info at http://vger.kernel.org/majordomo-info.html
Please read the FAQ at http://www.tux.org/lkml/

Message has been deleted

Sage Weil

unread,

Mar 2, 2010, 7:50:03 PM3/2/10

Basic state information is available via /sys/kernel/debug/ceph,
including instances of the client, current monitor, mds and osd
maps, and outstanding service requests.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/debugfs.c | 483 +++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 483 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/debugfs.c

diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c
new file mode 100644
index 0000000..e159f14
--- /dev/null
+++ b/fs/ceph/debugfs.c
@@ -0,0 +1,483 @@
+#include "ceph_debug.h"
+
+#include <linux/device.h>
+#include <linux/module.h>
+#include <linux/ctype.h>
+#include <linux/debugfs.h>
+#include <linux/seq_file.h>
+
+#include "super.h"
+#include "mds_client.h"
+#include "mon_client.h"
+#include "auth.h"
+
+#ifdef CONFIG_DEBUG_FS
+
+/*
+ * Implement /sys/kernel/debug/ceph fun
+ *
+ * /sys/kernel/debug/ceph/client* - an instance of the ceph client
+ * .../osdmap - current osdmap
+ * .../mdsmap - current mdsmap
+ * .../monmap - current monmap
+ * .../osdc - active osd requests
+ * .../mdsc - active mds requests
+ * .../monc - mon client state
+ * .../dentry_lru - dump contents of dentry lru
+ * .../caps - expose cap (reservation) stats
+ * .../bdi - symlink to ../../bdi/something
+ */
+
+static struct dentry *ceph_debugfs_dir;
+
+static int monmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->monc.monmap == NULL)
+ return 0;
+
+ seq_printf(s, "epoch %d\n", client->monc.monmap->epoch);
+ for (i = 0; i < client->monc.monmap->num_mon; i++) {
+ struct ceph_entity_inst *inst =
+ &client->monc.monmap->mon_inst[i];
+
+ seq_printf(s, "\t%s%lld\t%s\n",
+ ENTITY_NAME(inst->name),
+ pr_addr(&inst->addr.in_addr));
+ }

+ return 0;
+}
+

+static int mdsmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+
+ if (client->mdsc.mdsmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->mdsc.mdsmap->m_epoch);
+ seq_printf(s, "root %d\n", client->mdsc.mdsmap->m_root);
+ seq_printf(s, "session_timeout %d\n",
+ client->mdsc.mdsmap->m_session_timeout);
+ seq_printf(s, "session_autoclose %d\n",
+ client->mdsc.mdsmap->m_session_autoclose);
+ for (i = 0; i < client->mdsc.mdsmap->m_max_mds; i++) {
+ struct ceph_entity_addr *addr =
+ &client->mdsc.mdsmap->m_info[i].addr;
+ int state = client->mdsc.mdsmap->m_info[i].state;
+
+ seq_printf(s, "\tmds%d\t%s\t(%s)\n", i, pr_addr(&addr->in_addr),
+ ceph_mds_state_name(state));
+ }

+ return 0;
+}
+

+static int osdmap_show(struct seq_file *s, void *p)
+{
+ int i;
+ struct ceph_client *client = s->private;
+ struct rb_node *n;
+
+ if (client->osdc.osdmap == NULL)
+ return 0;
+ seq_printf(s, "epoch %d\n", client->osdc.osdmap->epoch);
+ seq_printf(s, "flags%s%s\n",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_NEARFULL) ?
+ " NEARFULL" : "",
+ (client->osdc.osdmap->flags & CEPH_OSDMAP_FULL) ?
+ " FULL" : "");
+ for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
+ struct ceph_pg_pool_info *pool =
+ rb_entry(n, struct ceph_pg_pool_info, node);
+ seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n",
+ pool->id, pool->v.pg_num, pool->pg_num_mask,
+ pool->v.lpg_num, pool->lpg_num_mask);
+ }
+ for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
+ struct ceph_entity_addr *addr =
+ &client->osdc.osdmap->osd_addr[i];
+ int state = client->osdc.osdmap->osd_state[i];
+ char sb[64];
+
+ seq_printf(s, "\tosd%d\t%s\t%3d%%\t(%s)\n",
+ i, pr_addr(&addr->in_addr),
+ ((client->osdc.osdmap->osd_weight[i]*100) >> 16),
+ ceph_osdmap_state_str(sb, sizeof(sb), state));
+ }

+ return 0;
+}
+

+static int monc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_mon_statfs_request *req;
+ struct ceph_mon_client *monc = &client->monc;
+ struct rb_node *rp;
+
+ mutex_lock(&monc->mutex);
+
+ if (monc->have_mdsmap)
+ seq_printf(s, "have mdsmap %u\n", (unsigned)monc->have_mdsmap);
+ if (monc->have_osdmap)
+ seq_printf(s, "have osdmap %u\n", (unsigned)monc->have_osdmap);
+ if (monc->want_next_osdmap)
+ seq_printf(s, "want next osdmap\n");
+
+ for (rp = rb_first(&monc->statfs_request_tree); rp; rp = rb_next(rp)) {
+ req = rb_entry(rp, struct ceph_mon_statfs_request, node);
+ seq_printf(s, "%lld statfs\n", req->tid);
+ }
+
+ mutex_unlock(&monc->mutex);

+ return 0;
+}
+

+static int mdsc_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = s->private;

+ struct ceph_mds_client *mdsc = &client->mdsc;

+ struct ceph_mds_request *req;
+ struct rb_node *rp;
+ int pathlen;
+ u64 pathbase;
+ char *path;
+
+ mutex_lock(&mdsc->mutex);
+ for (rp = rb_first(&mdsc->request_tree); rp; rp = rb_next(rp)) {
+ req = rb_entry(rp, struct ceph_mds_request, r_node);
+
+ if (req->r_request)
+ seq_printf(s, "%lld\tmds%d\t", req->r_tid, req->r_mds);
+ else
+ seq_printf(s, "%lld\t(no request)\t", req->r_tid);
+
+ seq_printf(s, "%s", ceph_mds_op_name(req->r_op));
+
+ if (req->r_got_unsafe)
+ seq_printf(s, "\t(unsafe)");
+ else
+ seq_printf(s, "\t");
+
+ if (req->r_inode) {
+ seq_printf(s, " #%llx", ceph_ino(req->r_inode));
+ } else if (req->r_dentry) {
+ path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
+ &pathbase, 0);
+ spin_lock(&req->r_dentry->d_lock);
+ seq_printf(s, " #%llx/%.*s (%s)",
+ ceph_ino(req->r_dentry->d_parent->d_inode),
+ req->r_dentry->d_name.len,
+ req->r_dentry->d_name.name,
+ path ? path : "");
+ spin_unlock(&req->r_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path1) {
+ seq_printf(s, " #%llx/%s", req->r_ino1.ino,
+ req->r_path1);
+ }
+
+ if (req->r_old_dentry) {
+ path = ceph_mdsc_build_path(req->r_old_dentry, &pathlen,
+ &pathbase, 0);
+ spin_lock(&req->r_old_dentry->d_lock);
+ seq_printf(s, " #%llx/%.*s (%s)",
+ ceph_ino(req->r_old_dentry->d_parent->d_inode),
+ req->r_old_dentry->d_name.len,
+ req->r_old_dentry->d_name.name,
+ path ? path : "");
+ spin_unlock(&req->r_old_dentry->d_lock);
+ kfree(path);
+ } else if (req->r_path2) {
+ if (req->r_ino2.ino)
+ seq_printf(s, " #%llx/%s", req->r_ino2.ino,
+ req->r_path2);
+ else
+ seq_printf(s, " %s", req->r_path2);
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&mdsc->mutex);
+

+ return 0;
+}
+

+static int osdc_show(struct seq_file *s, void *pp)
+{
+ struct ceph_client *client = s->private;
+ struct ceph_osd_client *osdc = &client->osdc;
+ struct rb_node *p;
+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
+ struct ceph_osd_request *req;
+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;
+ int num_ops;
+ int opcode, olen;
+ int i;
+
+ req = rb_entry(p, struct ceph_osd_request, r_node);
+
+ seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid,
+ req->r_osd ? req->r_osd->o_osd : -1,
+ le32_to_cpu(req->r_pgid.pool),
+ le16_to_cpu(req->r_pgid.ps));
+
+ head = req->r_request->front.iov_base;
+ op = (void *)(head + 1);
+
+ num_ops = le16_to_cpu(head->num_ops);
+ olen = le32_to_cpu(head->object_len);
+ seq_printf(s, "%.*s", olen,
+ (const char *)(head->ops + num_ops));
+
+ if (req->r_reassert_version.epoch)
+ seq_printf(s, "\t%u'%llu",
+ (unsigned)le32_to_cpu(req->r_reassert_version.epoch),
+ le64_to_cpu(req->r_reassert_version.version));
+ else
+ seq_printf(s, "\t");
+
+ for (i = 0; i < num_ops; i++) {
+ opcode = le16_to_cpu(op->op);
+ seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
+ op++;
+ }
+
+ seq_printf(s, "\n");
+ }
+ mutex_unlock(&osdc->request_mutex);

+ return 0;
+}
+

+static int caps_show(struct seq_file *s, void *p)
+{
+ struct ceph_client *client = p;
+ int total, avail, used, reserved, min;
+
+ ceph_reservation_status(client, &total, &avail, &used, &reserved, &min);
+ seq_printf(s, "total\t\t%d\n"
+ "avail\t\t%d\n"
+ "used\t\t%d\n"
+ "reserved\t%d\n"
+ "min\t%d\n",
+ total, avail, used, reserved, min);

+ return 0;
+}
+

+static int dentry_lru_show(struct seq_file *s, void *ptr)
+{
+ struct ceph_client *client = s->private;

+ struct ceph_mds_client *mdsc = &client->mdsc;

+ struct ceph_dentry_info *di;
+
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_for_each_entry(di, &mdsc->dentry_lru, lru) {
+ struct dentry *dentry = di->dentry;
+ seq_printf(s, "%p %p\t%.*s\n",
+ di, dentry, dentry->d_name.len, dentry->d_name.name);
+ }
+ spin_unlock(&mdsc->dentry_lru_lock);
+

+ return 0;
+}
+

+#define DEFINE_SHOW_FUNC(name) \
+static int name##_open(struct inode *inode, struct file *file) \
+{ \
+ struct seq_file *sf; \
+ int ret; \
+ \
+ ret = single_open(file, name, NULL); \
+ sf = file->private_data; \
+ sf->private = inode->i_private; \

+ return ret; \
+} \
+ \

+static const struct file_operations name##_fops = { \
+ .open = name##_open, \
+ .read = seq_read, \
+ .llseek = seq_lseek, \
+ .release = single_release, \
+};
+
+DEFINE_SHOW_FUNC(monmap_show)
+DEFINE_SHOW_FUNC(mdsmap_show)
+DEFINE_SHOW_FUNC(osdmap_show)
+DEFINE_SHOW_FUNC(monc_show)
+DEFINE_SHOW_FUNC(mdsc_show)
+DEFINE_SHOW_FUNC(osdc_show)
+DEFINE_SHOW_FUNC(dentry_lru_show)
+DEFINE_SHOW_FUNC(caps_show)
+
+static int congestion_kb_set(void *data, u64 val)
+{
+ struct ceph_client *client = (struct ceph_client *)data;
+
+ if (client)
+ client->mount_args->congestion_kb = (int)val;
+

+ return 0;
+}
+

+static int congestion_kb_get(void *data, u64 *val)
+{
+ struct ceph_client *client = (struct ceph_client *)data;
+
+ if (client)
+ *val = (u64)client->mount_args->congestion_kb;
+

+ return 0;
+}
+
+

+DEFINE_SIMPLE_ATTRIBUTE(congestion_kb_fops, congestion_kb_get,
+ congestion_kb_set, "%llu\n");
+
+int __init ceph_debugfs_init(void)
+{
+ ceph_debugfs_dir = debugfs_create_dir("ceph", NULL);
+ if (!ceph_debugfs_dir)
+ return -ENOMEM;

+ return 0;
+}
+

+void ceph_debugfs_cleanup(void)
+{
+ debugfs_remove(ceph_debugfs_dir);
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{
+ int ret = 0;
+ char name[80];
+
+ snprintf(name, sizeof(name), FSID_FORMAT ".client%lld",
+ PR_FSID(&client->fsid), client->monc.auth->global_id);
+
+ client->debugfs_dir = debugfs_create_dir(name, ceph_debugfs_dir);
+ if (!client->debugfs_dir)
+ goto out;
+
+ client->monc.debugfs_file = debugfs_create_file("monc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monc_show_fops);
+ if (!client->monc.debugfs_file)
+ goto out;
+
+ client->mdsc.debugfs_file = debugfs_create_file("mdsc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &mdsc_show_fops);
+ if (!client->mdsc.debugfs_file)
+ goto out;
+
+ client->osdc.debugfs_file = debugfs_create_file("osdc",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdc_show_fops);
+ if (!client->osdc.debugfs_file)
+ goto out;
+
+ client->debugfs_monmap = debugfs_create_file("monmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &monmap_show_fops);
+ if (!client->debugfs_monmap)
+ goto out;
+
+ client->debugfs_mdsmap = debugfs_create_file("mdsmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &mdsmap_show_fops);
+ if (!client->debugfs_mdsmap)
+ goto out;
+
+ client->debugfs_osdmap = debugfs_create_file("osdmap",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &osdmap_show_fops);
+ if (!client->debugfs_osdmap)
+ goto out;
+
+ client->debugfs_dentry_lru = debugfs_create_file("dentry_lru",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &dentry_lru_show_fops);
+ if (!client->debugfs_dentry_lru)
+ goto out;
+
+ client->debugfs_caps = debugfs_create_file("caps",
+ 0400,
+ client->debugfs_dir,
+ client,
+ &caps_show_fops);
+ if (!client->debugfs_caps)
+ goto out;
+
+ client->debugfs_congestion_kb = debugfs_create_file("writeback_congestion_kb",
+ 0600,
+ client->debugfs_dir,
+ client,
+ &congestion_kb_fops);
+ if (!client->debugfs_congestion_kb)
+ goto out;
+
+ sprintf(name, "../../bdi/%s", dev_name(client->sb->s_bdi->dev));
+ client->debugfs_bdi = debugfs_create_symlink("bdi", client->debugfs_dir,
+ name);
+
+ return 0;
+
+out:
+ ceph_debugfs_client_cleanup(client);

+ return ret;
+}
+

+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+ debugfs_remove(client->debugfs_bdi);
+ debugfs_remove(client->debugfs_caps);
+ debugfs_remove(client->debugfs_dentry_lru);
+ debugfs_remove(client->debugfs_osdmap);
+ debugfs_remove(client->debugfs_mdsmap);
+ debugfs_remove(client->debugfs_monmap);
+ debugfs_remove(client->osdc.debugfs_file);
+ debugfs_remove(client->mdsc.debugfs_file);
+ debugfs_remove(client->monc.debugfs_file);
+ debugfs_remove(client->debugfs_congestion_kb);
+ debugfs_remove(client->debugfs_dir);
+}
+
+#else // CONFIG_DEBUG_FS
+
+int __init ceph_debugfs_init(void)
+{

+ return 0;
+}
+

+void ceph_debugfs_cleanup(void)
+{
+}
+
+int ceph_debugfs_client_init(struct ceph_client *client)
+{

+ return 0;
+}
+

+void ceph_debugfs_client_cleanup(struct ceph_client *client)
+{
+}
+
+#endif // CONFIG_DEBUG_FS

Sage Weil

unread,

Mar 2, 2010, 7:50:03 PM3/2/10

A few Ceph ioctls for getting and setting file layout (striping)
parameters, and learning the identity and network address of the OSD a
given region of a file is stored on.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

Documentation/ioctl/ioctl-number.txt | 1 +
fs/ceph/ioctl.c | 160 ++++++++++++++++++++++++++++++++++
fs/ceph/ioctl.h | 40 +++++++++
3 files changed, 201 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/ioctl.c
create mode 100644 fs/ceph/ioctl.h

diff --git a/Documentation/ioctl/ioctl-number.txt b/Documentation/ioctl/ioctl-number.txt
index 35cf64d..e07e5b5 100644
--- a/Documentation/ioctl/ioctl-number.txt
+++ b/Documentation/ioctl/ioctl-number.txt
@@ -292,6 +292,7 @@ Code Seq#(hex) Include File Comments
0x92 00-0F drivers/usb/mon/mon_bin.c
0x93 60-7F linux/auto_fs.h
0x94 all fs/btrfs/ioctl.h
+0x97 00-7F fs/ceph/ioctl.h Ceph file system
0x99 00-0F 537-Addinboard driver
<mailto:b...@buks.ipn.de>
0xA0 all linux/sdp/sdp.h Industrial Device Project
diff --git a/fs/ceph/ioctl.c b/fs/ceph/ioctl.c
new file mode 100644
index 0000000..8a5bcae
--- /dev/null
+++ b/fs/ceph/ioctl.c
@@ -0,0 +1,160 @@
+#include <linux/in.h>
+
+#include "ioctl.h"
+#include "super.h"
+#include "ceph_debug.h"
+
+
+/*
+ * ioctls
+ */
+
+/*
+ * get and set the file layout
+ */
+static long ceph_ioctl_get_layout(struct file *file, void __user *arg)
+{
+ struct ceph_inode_info *ci = ceph_inode(file->f_dentry->d_inode);
+ struct ceph_ioctl_layout l;
+ int err;
+
+ err = ceph_do_getattr(file->f_dentry->d_inode, CEPH_STAT_CAP_LAYOUT);
+ if (!err) {
+ l.stripe_unit = ceph_file_layout_su(ci->i_layout);
+ l.stripe_count = ceph_file_layout_stripe_count(ci->i_layout);
+ l.object_size = ceph_file_layout_object_size(ci->i_layout);
+ l.data_pool = le32_to_cpu(ci->i_layout.fl_pg_pool);
+ l.preferred_osd =
+ (s32)le32_to_cpu(ci->i_layout.fl_pg_preferred);
+ if (copy_to_user(arg, &l, sizeof(l)))

+ return -EFAULT;
+ }
+

+ return err;
+}
+

+static long ceph_ioctl_set_layout(struct file *file, void __user *arg)

+{
+ struct inode *inode = file->f_dentry->d_inode;

+ struct inode *parent_inode = file->f_dentry->d_parent->d_inode;
+ struct ceph_mds_client *mdsc = &ceph_sb_to_client(inode->i_sb)->mdsc;
+ struct ceph_mds_request *req;
+ struct ceph_ioctl_layout l;
+ int err, i;
+
+ /* copy and validate */
+ if (copy_from_user(&l, arg, sizeof(l)))
+ return -EFAULT;
+
+ if ((l.object_size & ~PAGE_MASK) ||
+ (l.stripe_unit & ~PAGE_MASK) ||
+ !l.stripe_unit ||
+ (l.object_size &&
+ (unsigned)l.object_size % (unsigned)l.stripe_unit))
+ return -EINVAL;
+
+ /* make sure it's a valid data pool */
+ if (l.data_pool > 0) {
+ mutex_lock(&mdsc->mutex);
+ err = -EINVAL;
+ for (i = 0; i < mdsc->mdsmap->m_num_data_pg_pools; i++)
+ if (mdsc->mdsmap->m_data_pg_pools[i] == l.data_pool) {
+ err = 0;
+ break;
+ }
+ mutex_unlock(&mdsc->mutex);
+ if (err)

+ return err;
+ }
+

+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETLAYOUT,
+ USE_AUTH_MDS);
+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);
+ req->r_inode_drop = CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL;
+
+ req->r_args.setlayout.layout.fl_stripe_unit =
+ cpu_to_le32(l.stripe_unit);
+ req->r_args.setlayout.layout.fl_stripe_count =
+ cpu_to_le32(l.stripe_count);
+ req->r_args.setlayout.layout.fl_object_size =
+ cpu_to_le32(l.object_size);
+ req->r_args.setlayout.layout.fl_pg_pool = cpu_to_le32(l.data_pool);
+ req->r_args.setlayout.layout.fl_pg_preferred =
+ cpu_to_le32(l.preferred_osd);
+
+ err = ceph_mdsc_do_request(mdsc, parent_inode, req);
+ ceph_mdsc_put_request(req);

+ return err;
+}
+

+/*
+ * Return object name, size/offset information, and location (OSD
+ * number, network address) for a given file offset.
+ */
+static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
+{
+ struct ceph_ioctl_dataloc dl;

+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct ceph_osd_client *osdc = &ceph_client(inode->i_sb)->osdc;
+ u64 len = 1, olen;
+ u64 tmp;
+ struct ceph_object_layout ol;
+ struct ceph_pg pgid;
+
+ /* copy and validate */
+ if (copy_from_user(&dl, arg, sizeof(dl)))
+ return -EFAULT;
+
+ down_read(&osdc->map_sem);
+ ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len,
+ &dl.object_no, &dl.object_offset, &olen);
+ dl.file_offset -= dl.object_offset;
+ dl.object_size = ceph_file_layout_object_size(ci->i_layout);
+ dl.block_size = ceph_file_layout_su(ci->i_layout);
+
+ /* block_offset = object_offset % block_size */
+ tmp = dl.object_offset;
+ dl.block_offset = do_div(tmp, dl.block_size);
+
+ snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
+ ceph_ino(inode), dl.object_no);
+ ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout,
+ osdc->osdmap);
+
+ pgid = ol.ol_pgid;
+ dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
+ if (dl.osd >= 0) {
+ struct ceph_entity_addr *a =
+ ceph_osd_addr(osdc->osdmap, dl.osd);
+ if (a)
+ memcpy(&dl.osd_addr, &a->in_addr, sizeof(dl.osd_addr));
+ } else {
+ memset(&dl.osd_addr, 0, sizeof(dl.osd_addr));
+ }
+ up_read(&osdc->map_sem);
+
+ /* send result back to user */
+ if (copy_to_user(arg, &dl, sizeof(dl)))
+ return -EFAULT;

+
+ return 0;
+}
+

+long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
+{
+ dout("ioctl file %p cmd %u arg %lu\n", file, cmd, arg);
+ switch (cmd) {
+ case CEPH_IOC_GET_LAYOUT:
+ return ceph_ioctl_get_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_SET_LAYOUT:
+ return ceph_ioctl_set_layout(file, (void __user *)arg);
+
+ case CEPH_IOC_GET_DATALOC:
+ return ceph_ioctl_get_dataloc(file, (void __user *)arg);
+ }
+ return -ENOTTY;
+}
diff --git a/fs/ceph/ioctl.h b/fs/ceph/ioctl.h
new file mode 100644
index 0000000..25e4f1a
--- /dev/null
+++ b/fs/ceph/ioctl.h
@@ -0,0 +1,40 @@
+#ifndef FS_CEPH_IOCTL_H
+#define FS_CEPH_IOCTL_H
+
+#include <linux/ioctl.h>
+#include <linux/types.h>
+
+#define CEPH_IOCTL_MAGIC 0x97
+
+/* just use u64 to align sanely on all archs */
+struct ceph_ioctl_layout {
+ __u64 stripe_unit, stripe_count, object_size;
+ __u64 data_pool;
+ __s64 preferred_osd;
+};
+
+#define CEPH_IOC_GET_LAYOUT _IOR(CEPH_IOCTL_MAGIC, 1, \
+ struct ceph_ioctl_layout)
+#define CEPH_IOC_SET_LAYOUT _IOW(CEPH_IOCTL_MAGIC, 2, \
+ struct ceph_ioctl_layout)
+
+/*
+ * Extract identity, address of the OSD and object storing a given
+ * file offset.
+ */
+struct ceph_ioctl_dataloc {
+ __u64 file_offset; /* in+out: file offset */
+ __u64 object_offset; /* out: offset in object */
+ __u64 object_no; /* out: object # */
+ __u64 object_size; /* out: object size */
+ char object_name[64]; /* out: object name */
+ __u64 block_offset; /* out: offset in block */
+ __u64 block_size; /* out: block length */
+ __s64 osd; /* out: osd # */
+ struct sockaddr_storage osd_addr; /* out: osd address */
+};
+
+#define CEPH_IOC_GET_DATALOC _IOWR(CEPH_IOCTL_MAGIC, 3, \
+ struct ceph_ioctl_dataloc)
+
+#endif

Sage Weil

unread,

Mar 2, 2010, 7:50:01 PM3/2/10

Mount options, syntax.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

Documentation/filesystems/ceph.txt | 139 ++++++++++++++++++++++++++++++++++++
1 files changed, 139 insertions(+), 0 deletions(-)
create mode 100644 Documentation/filesystems/ceph.txt

diff --git a/Documentation/filesystems/ceph.txt b/Documentation/filesystems/ceph.txt
new file mode 100644
index 0000000..6e03917
--- /dev/null
+++ b/Documentation/filesystems/ceph.txt
@@ -0,0 +1,139 @@
+Ceph Distributed File System
+============================
+
+Ceph is a distributed network file system designed to provide good
+performance, reliability, and scalability.
+
+Basic features include:
+
+ * POSIX semantics
+ * Seamless scaling from 1 to many thousands of nodes
+ * High availability and reliability. No single points of failure.
+ * N-way replication of data across storage nodes
+ * Fast recovery from node failures
+ * Automatic rebalancing of data on node addition/removal
+ * Easy deployment: most FS components are userspace daemons
+
+Also,
+ * Flexible snapshots (on any directory)
+ * Recursive accounting (nested files, directories, bytes)
+
+In contrast to cluster filesystems like GFS, OCFS2, and GPFS that rely
+on symmetric access by all clients to shared block devices, Ceph
+separates data and metadata management into independent server
+clusters, similar to Lustre. Unlike Lustre, however, metadata and
+storage nodes run entirely as user space daemons. Storage nodes
+utilize btrfs to store data objects, leveraging its advanced features
+(checksumming, metadata replication, etc.). File data is striped
+across storage nodes in large chunks to distribute workload and
+facilitate high throughputs. When storage nodes fail, data is
+re-replicated in a distributed fashion by the storage nodes themselves
+(with some minimal coordination from a cluster monitor), making the
+system extremely efficient and scalable.
+
+Metadata servers effectively form a large, consistent, distributed
+in-memory cache above the file namespace that is extremely scalable,
+dynamically redistributes metadata in response to workload changes,
+and can tolerate arbitrary (well, non-Byzantine) node failures. The
+metadata server takes a somewhat unconventional approach to metadata
+storage to significantly improve performance for common workloads. In
+particular, inodes with only a single link are embedded in
+directories, allowing entire directories of dentries and inodes to be
+loaded into its cache with a single I/O operation. The contents of
+extremely large directories can be fragmented and managed by
+independent metadata servers, allowing scalable concurrent access.
+
+The system offers automatic data rebalancing/migration when scaling
+from a small cluster of just a few nodes to many hundreds, without
+requiring an administrator carve the data set into static volumes or
+go through the tedious process of migrating data between servers.
+When the file system approaches full, new nodes can be easily added
+and things will "just work."
+
+Ceph includes flexible snapshot mechanism that allows a user to create
+a snapshot on any subdirectory (and its nested contents) in the
+system. Snapshot creation and deletion are as simple as 'mkdir
+.snap/foo' and 'rmdir .snap/foo'.
+
+Ceph also provides some recursive accounting on directories for nested
+files and bytes. That is, a 'getfattr -d foo' on any directory in the
+system will reveal the total number of nested regular files and
+subdirectories, and a summation of all nested file sizes. This makes
+the identification of large disk space consumers relatively quick, as
+no 'du' or similar recursive scan of the file system is required.
+
+
+Mount Syntax
+============
+
+The basic mount syntax is:
+
+ # mount -t ceph monip[:port][,monip2[:port]...]:/[subdir] mnt
+
+You only need to specify a single monitor, as the client will get the
+full list when it connects. (However, if the monitor you specify
+happens to be down, the mount won't succeed.) The port can be left
+off if the monitor is using the default. So if the monitor is at
+1.2.3.4,
+
+ # mount -t ceph 1.2.3.4:/ /mnt/ceph
+
+is sufficient. If /sbin/mount.ceph is installed, a hostname can be
+used instead of an IP address.
+
+
+
+Mount Options
+=============
+
+ ip=A.B.C.D[:N]
+ Specify the IP and/or port the client should bind to locally.
+ There is normally not much reason to do this. If the IP is not
+ specified, the client's IP address is determined by looking at the
+ address it's connection to the monitor originates from.
+
+ wsize=X
+ Specify the maximum write size in bytes. By default there is no
+ maximu. Ceph will normally size writes based on the file stripe
+ size.
+
+ rsize=X
+ Specify the maximum readahead.
+
+ mount_timeout=X
+ Specify the timeout value for mount (in seconds), in the case
+ of a non-responsive Ceph file system. The default is 30
+ seconds.
+
+ rbytes
+ When stat() is called on a directory, set st_size to 'rbytes',
+ the summation of file sizes over all files nested beneath that
+ directory. This is the default.
+
+ norbytes
+ When stat() is called on a directory, set st_size to the
+ number of entries in that directory.
+
+ nocrc
+ Disable CRC32C calculation for data writes. If set, the OSD
+ must rely on TCP's error correction to detect data corruption
+ in the data payload.
+
+ noasyncreaddir
+ Disable client's use its local cache to satisfy readdir
+ requests. (This does not change correctness; the client uses
+ cached metadata only when a lease or capability ensures it is
+ valid.)
+
+
+More Information
+================
+
+For more information on Ceph, see the home page at
+ http://ceph.newdream.net/
+
+The Linux kernel client source tree is available at
+ git://ceph.newdream.net/linux-ceph-client.git
+
+and the source for the full system is at
+ git://ceph.newdream.net/ceph.git

Sage Weil

unread,

Mar 2, 2010, 7:50:02 PM3/2/10

Define the hash functions used for placing dentries in directories
and for mapping objects into placement groups.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/ceph_hash.c | 118 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/ceph_hash.h | 13 ++++++
2 files changed, 131 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/ceph_hash.c
create mode 100644 fs/ceph/ceph_hash.h

diff --git a/fs/ceph/ceph_hash.c b/fs/ceph/ceph_hash.c
new file mode 100644
index 0000000..bd57001
--- /dev/null
+++ b/fs/ceph/ceph_hash.c
@@ -0,0 +1,118 @@
+
+#include "types.h"
+
+/*
+ * Robert Jenkin's hash function.
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * This is in the public domain.
+ */
+#define mix(a, b, c) \
+ do { \
+ a = a - b; a = a - c; a = a ^ (c >> 13); \
+ b = b - c; b = b - a; b = b ^ (a << 8); \
+ c = c - a; c = c - b; c = c ^ (b >> 13); \
+ a = a - b; a = a - c; a = a ^ (c >> 12); \
+ b = b - c; b = b - a; b = b ^ (a << 16); \
+ c = c - a; c = c - b; c = c ^ (b >> 5); \
+ a = a - b; a = a - c; a = a ^ (c >> 3); \
+ b = b - c; b = b - a; b = b ^ (a << 10); \
+ c = c - a; c = c - b; c = c ^ (b >> 15); \
+ } while (0)
+
+unsigned ceph_str_hash_rjenkins(const char *str, unsigned length)
+{
+ const unsigned char *k = (const unsigned char *)str;
+ __u32 a, b, c; /* the internal state */
+ __u32 len; /* how many key bytes still need mixing */
+
+ /* Set up the internal state */
+ len = length;
+ a = 0x9e3779b9; /* the golden ratio; an arbitrary value */
+ b = a;
+ c = 0; /* variable initialization of internal state */
+
+ /* handle most of the key */
+ while (len >= 12) {
+ a = a + (k[0] + ((__u32)k[1] << 8) + ((__u32)k[2] << 16) +
+ ((__u32)k[3] << 24));
+ b = b + (k[4] + ((__u32)k[5] << 8) + ((__u32)k[6] << 16) +
+ ((__u32)k[7] << 24));
+ c = c + (k[8] + ((__u32)k[9] << 8) + ((__u32)k[10] << 16) +
+ ((__u32)k[11] << 24));
+ mix(a, b, c);
+ k = k + 12;
+ len = len - 12;
+ }
+
+ /* handle the last 11 bytes */
+ c = c + length;
+ switch (len) { /* all the case statements fall through */
+ case 11:
+ c = c + ((__u32)k[10] << 24);
+ case 10:
+ c = c + ((__u32)k[9] << 16);
+ case 9:
+ c = c + ((__u32)k[8] << 8);
+ /* the first byte of c is reserved for the length */
+ case 8:
+ b = b + ((__u32)k[7] << 24);
+ case 7:
+ b = b + ((__u32)k[6] << 16);
+ case 6:
+ b = b + ((__u32)k[5] << 8);
+ case 5:
+ b = b + k[4];
+ case 4:
+ a = a + ((__u32)k[3] << 24);
+ case 3:
+ a = a + ((__u32)k[2] << 16);
+ case 2:
+ a = a + ((__u32)k[1] << 8);
+ case 1:
+ a = a + k[0];
+ /* case 0: nothing left to add */
+ }
+ mix(a, b, c);
+
+ return c;
+}
+
+/*
+ * linux dcache hash
+ */
+unsigned ceph_str_hash_linux(const char *str, unsigned length)
+{
+ unsigned long hash = 0;
+ unsigned char c;
+
+ while (length--) {
+ c = *str++;
+ hash = (hash + (c << 4) + (c >> 4)) * 11;
+ }
+ return hash;
+}
+
+
+unsigned ceph_str_hash(int type, const char *s, unsigned len)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return ceph_str_hash_linux(s, len);
+ case CEPH_STR_HASH_RJENKINS:
+ return ceph_str_hash_rjenkins(s, len);
+ default:
+ return -1;
+ }
+}
+
+const char *ceph_str_hash_name(int type)
+{
+ switch (type) {
+ case CEPH_STR_HASH_LINUX:
+ return "linux";
+ case CEPH_STR_HASH_RJENKINS:
+ return "rjenkins";
+ default:
+ return "unknown";
+ }
+}
diff --git a/fs/ceph/ceph_hash.h b/fs/ceph/ceph_hash.h
new file mode 100644
index 0000000..5ac470c
--- /dev/null
+++ b/fs/ceph/ceph_hash.h
@@ -0,0 +1,13 @@
+#ifndef _FS_CEPH_HASH_H
+#define _FS_CEPH_HASH_H
+
+#define CEPH_STR_HASH_LINUX 0x1 /* linux dcache hash */
+#define CEPH_STR_HASH_RJENKINS 0x2 /* robert jenkins' */
+
+extern unsigned ceph_str_hash_linux(const char *s, unsigned len);
+extern unsigned ceph_str_hash_rjenkins(const char *s, unsigned len);
+
+extern unsigned ceph_str_hash(int type, const char *s, unsigned len);
+extern const char *ceph_str_hash_name(int type);
+
+#endif

Sage Weil

unread,

Mar 2, 2010, 7:50:01 PM3/2/10

Kconfig options and Makefile.

The README documents files that are synchronized between the user space
and kernel code.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

MAINTAINERS | 9 +++++++++
fs/Kconfig | 1 +
fs/Makefile | 1 +
fs/ceph/Kconfig | 27 +++++++++++++++++++++++++++
fs/ceph/Makefile | 39 +++++++++++++++++++++++++++++++++++++++
fs/ceph/README | 20 ++++++++++++++++++++
6 files changed, 97 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/Kconfig
create mode 100644 fs/ceph/Makefile
create mode 100644 fs/ceph/README

diff --git a/MAINTAINERS b/MAINTAINERS
index 2533fc4..a94c580 100644
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -1394,6 +1394,15 @@ F: arch/powerpc/include/asm/spu*.h
F: arch/powerpc/oprofile/*cell*
F: arch/powerpc/platforms/cell/

+CEPH DISTRIBUTED FILE SYSTEM CLIENT
+M: Sage Weil <sa...@newdream.net>
+L: ceph-...@lists.sourceforge.net
+W: http://ceph.newdream.net/
+T: git git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client.git
+S: Supported
+F: Documentation/filesystems/ceph.txt
+F: fs/ceph
+
CERTIFIED WIRELESS USB (WUSB) SUBSYSTEM:
M: David Vrabel <david....@csr.com>
L: linu...@vger.kernel.org
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d44ef..0e0d65d 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -234,6 +234,7 @@ config NFS_COMMON

source "net/sunrpc/Kconfig"
source "fs/smbfs/Kconfig"
+source "fs/ceph/Kconfig"
source "fs/cifs/Kconfig"
source "fs/ncpfs/Kconfig"
source "fs/coda/Kconfig"
diff --git a/fs/Makefile b/fs/Makefile
index af6d047..5ef73a0 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -124,3 +124,4 @@ obj-$(CONFIG_OCFS2_FS) += ocfs2/
obj-$(CONFIG_BTRFS_FS) += btrfs/
obj-$(CONFIG_GFS2_FS) += gfs2/
obj-$(CONFIG_EXOFS_FS) += exofs/
+obj-$(CONFIG_CEPH_FS) += ceph/
diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig
new file mode 100644
index 0000000..04b8280
--- /dev/null
+++ b/fs/ceph/Kconfig
@@ -0,0 +1,27 @@
+config CEPH_FS
+ tristate "Ceph distributed file system (EXPERIMENTAL)"
+ depends on INET && EXPERIMENTAL
+ select LIBCRC32C
+ select CONFIG_CRYPTO_AES
+ help
+ Choose Y or M here to include support for mounting the
+ experimental Ceph distributed file system. Ceph is an extremely
+ scalable file system designed to provide high performance,
+ reliable access to petabytes of storage.
+
+ More information at http://ceph.newdream.net/.
+
+ If unsure, say N.
+
+config CEPH_FS_PRETTYDEBUG
+ bool "Include file:line in ceph debug output"
+ depends on CEPH_FS
+ default n
+ help
+ If you say Y here, debug output will include a filename and
+ line to aid debugging. This icnreases kernel size and slows
+ execution slightly when debug call sites are enabled (e.g.,
+ via CONFIG_DYNAMIC_DEBUG).
+
+ If unsure, say N.
+
diff --git a/fs/ceph/Makefile b/fs/ceph/Makefile
new file mode 100644
index 0000000..6a660e6
--- /dev/null
+++ b/fs/ceph/Makefile
@@ -0,0 +1,39 @@
+#
+# Makefile for CEPH filesystem.
+#
+
+ifneq ($(KERNELRELEASE),)
+
+obj-$(CONFIG_CEPH_FS) += ceph.o
+
+ceph-objs := super.o inode.o dir.o file.o addr.o ioctl.o \
+ export.o caps.o snap.o xattr.o \
+ messenger.o msgpool.o buffer.o pagelist.o \
+ mds_client.o mdsmap.o \
+ mon_client.o \
+ osd_client.o osdmap.o crush/crush.o crush/mapper.o crush/hash.o \
+ debugfs.o \
+ auth.o auth_none.o \
+ crypto.o armor.o \
+ auth_x.o \
+ ceph_fs.o ceph_strings.o ceph_hash.o ceph_frag.o
+
+else
+#Otherwise we were called directly from the command
+# line; invoke the kernel build system.
+
+KERNELDIR ?= /lib/modules/$(shell uname -r)/build
+PWD := $(shell pwd)
+
+default: all
+
+all:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules
+
+modules_install:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) CONFIG_CEPH_FS=m modules_install
+
+clean:
+ $(MAKE) -C $(KERNELDIR) M=$(PWD) clean
+
+endif
diff --git a/fs/ceph/README b/fs/ceph/README
new file mode 100644
index 0000000..18352fa
--- /dev/null
+++ b/fs/ceph/README
@@ -0,0 +1,20 @@
+#
+# The following files are shared by (and manually synchronized
+# between) the Ceph userland and kernel client.
+#
+# userland kernel
+src/include/ceph_fs.h fs/ceph/ceph_fs.h
+src/include/ceph_fs.cc fs/ceph/ceph_fs.c
+src/include/msgr.h fs/ceph/msgr.h
+src/include/rados.h fs/ceph/rados.h
+src/include/ceph_strings.cc fs/ceph/ceph_strings.c
+src/include/ceph_frag.h fs/ceph/ceph_frag.h
+src/include/ceph_frag.cc fs/ceph/ceph_frag.c
+src/include/ceph_hash.h fs/ceph/ceph_hash.h
+src/include/ceph_hash.cc fs/ceph/ceph_hash.c
+src/crush/crush.c fs/ceph/crush/crush.c
+src/crush/crush.h fs/ceph/crush/crush.h
+src/crush/mapper.c fs/ceph/crush/mapper.c
+src/crush/mapper.h fs/ceph/crush/mapper.h
+src/crush/hash.h fs/ceph/crush/hash.h
+src/crush/hash.c fs/ceph/crush/hash.c

Sage Weil

unread,

Mar 2, 2010, 8:00:02 PM3/2/10

Directory operations, including lookup, are defined here. We take
advantage of lookup intents when possible. For the most part, we just
need to build the proper requests for the metadata server(s) and
pass things off to the mds_client.

The results of most operations are normally incorporated into the
client's cache when the reply is parsed by ceph_fill_trace().
However, if the MDS replies without a trace (e.g., when retrying an
update after an MDS failure recovery), some operation-specific cleanup
may be needed.

We can validate cached dentries in two ways. A per-dentry lease may
be issued by the MDS, or a per-directory cap may be issued that acts
as a lease on the entire directory. In the latter case, a 'gen' value
is used to determine which dentries belong to the currently leased
directory contents.

We normally prepopulate the dcache and icache with readdir results.
This makes subsequent lookups and getattrs avoid any server
interaction. It also lets us satisfy readdir operation by peeking at
the dcache IFF we hold the per-directory cap/lease, previously
performed a readdir, and haven't dropped any of the resulting
dentries.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/dir.c | 1220 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 1220 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/dir.c

diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
new file mode 100644
index 0000000..5107384
--- /dev/null
+++ b/fs/ceph/dir.c
@@ -0,0 +1,1220 @@
+#include "ceph_debug.h"
+
+#include <linux/spinlock.h>
+#include <linux/fs_struct.h>
+#include <linux/namei.h>
+#include <linux/sched.h>
+
+#include "super.h"
+
+/*
+ * Directory operations: readdir, lookup, create, link, unlink,
+ * rename, etc.
+ */
+
+/*
+ * Ceph MDS operations are specified in terms of a base ino and
+ * relative path. Thus, the client can specify an operation on a
+ * specific inode (e.g., a getattr due to fstat(2)), or as a path
+ * relative to, say, the root directory.
+ *
+ * Normally, we limit ourselves to strict inode ops (no path component)
+ * or dentry operations (a single path component relative to an ino). The
+ * exception to this is open_root_dentry(), which will open the mount
+ * point by name.
+ */
+
+const struct inode_operations ceph_dir_iops;
+const struct file_operations ceph_dir_fops;
+struct dentry_operations ceph_dentry_ops;
+
+/*
+ * Initialize ceph dentry state.
+ */
+int ceph_init_dentry(struct dentry *dentry)
+{

+ struct ceph_dentry_info *di;
+

+ if (dentry->d_fsdata)
+ return 0;
+
+ if (ceph_snap(dentry->d_parent->d_inode) == CEPH_NOSNAP)
+ dentry->d_op = &ceph_dentry_ops;
+ else if (ceph_snap(dentry->d_parent->d_inode) == CEPH_SNAPDIR)
+ dentry->d_op = &ceph_snapdir_dentry_ops;
+ else
+ dentry->d_op = &ceph_snap_dentry_ops;
+
+ di = kmem_cache_alloc(ceph_dentry_cachep, GFP_NOFS);
+ if (!di)
+ return -ENOMEM; /* oh well */
+
+ spin_lock(&dentry->d_lock);
+ if (dentry->d_fsdata) /* lost a race */
+ goto out_unlock;
+ di->dentry = dentry;
+ di->lease_session = NULL;
+ dentry->d_fsdata = di;
+ dentry->d_time = jiffies;
+ ceph_dentry_lru_add(dentry);
+out_unlock:
+ spin_unlock(&dentry->d_lock);

+ return 0;
+}
+

+
+
+/*
+ * for readdir, we encode the directory frag and offset within that
+ * frag into f_pos.
+ */
+static unsigned fpos_frag(loff_t p)
+{
+ return p >> 32;
+}
+static unsigned fpos_off(loff_t p)
+{
+ return p & 0xffffffff;
+}
+
+/*
+ * When possible, we try to satisfy a readdir by peeking at the
+ * dcache. We make this work by carefully ordering dentries on
+ * d_u.d_child when we initially get results back from the MDS, and
+ * falling back to a "normal" sync readdir if any dentries in the dir
+ * are dropped.
+ *
+ * I_COMPLETE tells indicates we have all dentries in the dir. It is
+ * defined IFF we hold CEPH_CAP_FILE_SHARED (which will be revoked by
+ * the MDS if/when the directory is modified).
+ */
+static int __dcache_readdir(struct file *filp,
+ void *dirent, filldir_t filldir)
+{

+ struct inode *inode = filp->f_dentry->d_inode;

+ struct ceph_file_info *fi = filp->private_data;
+ struct dentry *parent = filp->f_dentry;
+ struct inode *dir = parent->d_inode;
+ struct list_head *p;
+ struct dentry *dentry, *last;
+ struct ceph_dentry_info *di;

+ int err = 0;
+

+ /* claim ref on last dentry we returned */
+ last = fi->dentry;
+ fi->dentry = NULL;
+
+ dout("__dcache_readdir %p at %llu (last %p)\n", dir, filp->f_pos,
+ last);
+
+ spin_lock(&dcache_lock);
+
+ /* start at beginning? */
+ if (filp->f_pos == 2 || (last &&
+ filp->f_pos < ceph_dentry(last)->offset)) {
+ if (list_empty(&parent->d_subdirs))
+ goto out_unlock;
+ p = parent->d_subdirs.prev;
+ dout(" initial p %p/%p\n", p->prev, p->next);
+ } else {
+ p = last->d_u.d_child.prev;
+ }
+
+more:
+ dentry = list_entry(p, struct dentry, d_u.d_child);
+ di = ceph_dentry(dentry);
+ while (1) {
+ dout(" p %p/%p d_subdirs %p/%p\n", p->prev, p->next,
+ parent->d_subdirs.prev, parent->d_subdirs.next);
+ if (p == &parent->d_subdirs) {
+ fi->at_end = 1;
+ goto out_unlock;
+ }
+ if (!d_unhashed(dentry) && dentry->d_inode &&
+ ceph_snap(dentry->d_inode) != CEPH_SNAPDIR &&
+ ceph_ino(dentry->d_inode) != CEPH_INO_CEPH &&
+ filp->f_pos <= di->offset)
+ break;
+ dout(" skipping %p %.*s at %llu (%llu)%s%s\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, di->offset,
+ filp->f_pos, d_unhashed(dentry) ? " unhashed" : "",
+ !dentry->d_inode ? " null" : "");
+ p = p->prev;
+ dentry = list_entry(p, struct dentry, d_u.d_child);
+ di = ceph_dentry(dentry);
+ }
+
+ atomic_inc(&dentry->d_count);
+ spin_unlock(&dcache_lock);
+ spin_unlock(&inode->i_lock);
+
+ dout(" %llu (%llu) dentry %p %.*s %p\n", di->offset, filp->f_pos,
+ dentry, dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ filp->f_pos = di->offset;
+ err = filldir(dirent, dentry->d_name.name,
+ dentry->d_name.len, di->offset,
+ dentry->d_inode->i_ino,
+ dentry->d_inode->i_mode >> 12);
+
+ if (last) {
+ if (err < 0) {
+ /* remember our position */
+ fi->dentry = last;
+ fi->next_offset = di->offset;
+ } else {
+ dput(last);
+ }
+ last = NULL;
+ }
+
+ spin_lock(&inode->i_lock);
+ spin_lock(&dcache_lock);
+
+ if (err < 0)
+ goto out_unlock;
+
+ last = dentry;
+
+ p = p->prev;
+ filp->f_pos++;
+
+ /* make sure a dentry wasn't dropped while we didn't have dcache_lock */
+ if ((ceph_inode(dir)->i_ceph_flags & CEPH_I_COMPLETE))
+ goto more;
+ dout(" lost I_COMPLETE on %p; falling back to mds\n", dir);
+ err = -EAGAIN;
+
+out_unlock:
+ spin_unlock(&dcache_lock);
+
+ if (last) {
+ spin_unlock(&inode->i_lock);
+ dput(last);
+ spin_lock(&inode->i_lock);

+ }
+
+ return err;

+}
+
+/*
+ * make note of the last dentry we read, so we can
+ * continue at the same lexicographical point,
+ * regardless of what dir changes take place on the
+ * server.
+ */
+static int note_last_dentry(struct ceph_file_info *fi, const char *name,
+ int len)
+{
+ kfree(fi->last_name);
+ fi->last_name = kmalloc(len+1, GFP_NOFS);
+ if (!fi->last_name)
+ return -ENOMEM;
+ memcpy(fi->last_name, name, len);
+ fi->last_name[len] = 0;
+ dout("note_last_dentry '%s'\n", fi->last_name);

+ return 0;
+}
+

+static int ceph_readdir(struct file *filp, void *dirent, filldir_t filldir)
+{
+ struct ceph_file_info *fi = filp->private_data;
+ struct inode *inode = filp->f_dentry->d_inode;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct ceph_client *client = ceph_inode_to_client(inode);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ unsigned frag = fpos_frag(filp->f_pos);
+ int off = fpos_off(filp->f_pos);
+ int err;
+ u32 ftype;
+ struct ceph_mds_reply_info_parsed *rinfo;
+ const int max_entries = client->mount_args->max_readdir;
+
+ dout("readdir %p filp %p frag %u off %u\n", inode, filp, frag, off);
+ if (fi->at_end)
+ return 0;
+
+ /* always start with . and .. */
+ if (filp->f_pos == 0) {
+ /* note dir version at start of readdir so we can tell
+ * if any dentries get dropped */
+ fi->dir_release_count = ci->i_release_count;
+
+ dout("readdir off 0 -> '.'\n");
+ if (filldir(dirent, ".", 1, ceph_make_fpos(0, 0),
+ inode->i_ino, inode->i_mode >> 12) < 0)
+ return 0;
+ filp->f_pos = 1;
+ off = 1;
+ }
+ if (filp->f_pos == 1) {
+ dout("readdir off 1 -> '..'\n");
+ if (filldir(dirent, "..", 2, ceph_make_fpos(0, 1),
+ filp->f_dentry->d_parent->d_inode->i_ino,
+ inode->i_mode >> 12) < 0)
+ return 0;
+ filp->f_pos = 2;
+ off = 2;
+ }
+
+ /* can we use the dcache? */
+ spin_lock(&inode->i_lock);
+ if ((filp->f_pos == 2 || fi->dentry) &&
+ !ceph_test_opt(client, NOASYNCREADDIR) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1)) {
+ err = __dcache_readdir(filp, dirent, filldir);
+ if (err != -EAGAIN) {
+ spin_unlock(&inode->i_lock);

+ return err;
+ }
+ }

+ spin_unlock(&inode->i_lock);
+ if (fi->dentry) {
+ err = note_last_dentry(fi, fi->dentry->d_name.name,
+ fi->dentry->d_name.len);

+ if (err)
+ return err;

+ dput(fi->dentry);
+ fi->dentry = NULL;
+ }
+
+ /* proceed with a normal readdir */
+
+more:
+ /* do we have the correct frag content buffered? */
+ if (fi->frag != frag || fi->last_readdir == NULL) {
+ struct ceph_mds_request *req;
+ int op = ceph_snap(inode) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LSSNAP : CEPH_MDS_OP_READDIR;
+
+ /* discard old result, if any */
+ if (fi->last_readdir)
+ ceph_mdsc_put_request(fi->last_readdir);
+
+ /* requery frag tree, as the frag topology may have changed */
+ frag = ceph_choose_frag(ceph_inode(inode), frag, NULL, NULL);
+
+ dout("readdir fetching %llx.%llx frag %x offset '%s'\n",
+ ceph_vinop(inode), frag, fi->last_name);
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);

+ if (IS_ERR(req))
+ return PTR_ERR(req);
+ req->r_inode = igrab(inode);

+ req->r_dentry = dget(filp->f_dentry);
+ /* hints to request -> mds selection code */
+ req->r_direct_mode = USE_AUTH_MDS;
+ req->r_direct_hash = ceph_frag_value(frag);
+ req->r_direct_is_hash = true;
+ req->r_path2 = kstrdup(fi->last_name, GFP_NOFS);
+ req->r_readdir_offset = fi->next_offset;
+ req->r_args.readdir.frag = cpu_to_le32(frag);
+ req->r_args.readdir.max_entries = cpu_to_le32(max_entries);
+ req->r_num_caps = max_entries;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ if (err < 0) {

+ ceph_mdsc_put_request(req);
+ return err;
+ }

+ dout("readdir got and parsed readdir result=%d"
+ " on frag %x, end=%d, complete=%d\n", err, frag,
+ (int)req->r_reply_info.dir_end,
+ (int)req->r_reply_info.dir_complete);
+
+ if (!req->r_did_prepopulate) {
+ dout("readdir !did_prepopulate");
+ fi->dir_release_count--; /* preclude I_COMPLETE */
+ }
+
+ /* note next offset and last dentry name */
+ fi->offset = fi->next_offset;
+ fi->last_readdir = req;
+
+ if (req->r_reply_info.dir_end) {
+ kfree(fi->last_name);
+ fi->last_name = NULL;
+ fi->next_offset = 0;
+ } else {
+ rinfo = &req->r_reply_info;
+ err = note_last_dentry(fi,
+ rinfo->dir_dname[rinfo->dir_nr-1],
+ rinfo->dir_dname_len[rinfo->dir_nr-1]);

+ if (err)
+ return err;

+ fi->next_offset += rinfo->dir_nr;
+ }
+ }
+
+ rinfo = &fi->last_readdir->r_reply_info;
+ dout("readdir frag %x num %d off %d chunkoff %d\n", frag,
+ rinfo->dir_nr, off, fi->offset);
+ while (off - fi->offset >= 0 && off - fi->offset < rinfo->dir_nr) {
+ u64 pos = ceph_make_fpos(frag, off);
+ struct ceph_mds_reply_inode *in =
+ rinfo->dir_in[off - fi->offset].in;
+ dout("readdir off %d (%d/%d) -> %lld '%.*s' %p\n",
+ off, off - fi->offset, rinfo->dir_nr, pos,
+ rinfo->dir_dname_len[off - fi->offset],
+ rinfo->dir_dname[off - fi->offset], in);
+ BUG_ON(!in);
+ ftype = le32_to_cpu(in->mode) >> 12;
+ if (filldir(dirent,
+ rinfo->dir_dname[off - fi->offset],
+ rinfo->dir_dname_len[off - fi->offset],
+ pos,
+ le64_to_cpu(in->ino),
+ ftype) < 0) {
+ dout("filldir stopping us...\n");
+ return 0;
+ }
+ off++;
+ filp->f_pos = pos + 1;
+ }
+
+ if (fi->last_name) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ goto more;
+ }
+
+ /* more frags? */
+ if (!ceph_frag_is_rightmost(frag)) {
+ frag = ceph_frag_next(frag);
+ off = 0;
+ filp->f_pos = ceph_make_fpos(frag, off);
+ dout("readdir next frag is %x\n", frag);
+ goto more;
+ }
+ fi->at_end = 1;
+
+ /*
+ * if dir_release_count still matches the dir, no dentries
+ * were released during the whole readdir, and we should have
+ * the complete dir contents in our cache.
+ */
+ spin_lock(&inode->i_lock);
+ if (ci->i_release_count == fi->dir_release_count) {
+ dout(" marking %p complete\n", inode);
+ ci->i_ceph_flags |= CEPH_I_COMPLETE;
+ ci->i_max_offset = filp->f_pos;
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("readdir %p filp %p done.\n", inode, filp);

+ return 0;
+}
+

+static void reset_readdir(struct ceph_file_info *fi)
+{
+ if (fi->last_readdir) {
+ ceph_mdsc_put_request(fi->last_readdir);
+ fi->last_readdir = NULL;
+ }
+ kfree(fi->last_name);
+ fi->next_offset = 2; /* compensate for . and .. */
+ if (fi->dentry) {
+ dput(fi->dentry);
+ fi->dentry = NULL;
+ }
+ fi->at_end = 0;
+}
+
+static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int origin)
+{
+ struct ceph_file_info *fi = file->private_data;
+ struct inode *inode = file->f_mapping->host;
+ loff_t old_offset = offset;
+ loff_t retval;
+
+ mutex_lock(&inode->i_mutex);
+ switch (origin) {
+ case SEEK_END:
+ offset += inode->i_size + 2; /* FIXME */
+ break;
+ case SEEK_CUR:
+ offset += file->f_pos;
+ }
+ retval = -EINVAL;
+ if (offset >= 0 && offset <= inode->i_sb->s_maxbytes) {
+ if (offset != file->f_pos) {
+ file->f_pos = offset;
+ file->f_version = 0;
+ fi->at_end = 0;
+ }
+ retval = offset;
+
+ /*
+ * discard buffered readdir content on seekdir(0), or
+ * seek to new frag, or seek prior to current chunk.
+ */
+ if (offset == 0 ||
+ fpos_frag(offset) != fpos_frag(old_offset) ||
+ fpos_off(offset) < fi->offset) {
+ dout("dir_llseek dropping %p content\n", file);
+ reset_readdir(fi);
+ }
+
+ /* bump dir_release_count if we did a forward seek */
+ if (offset > old_offset)
+ fi->dir_release_count--;
+ }
+ mutex_unlock(&inode->i_mutex);
+ return retval;
+}
+
+/*
+ * Process result of a lookup/open request.
+ *
+ * Mainly, make sure we return the final req->r_dentry (if it already
+ * existed) in place of the original VFS-provided dentry when they
+ * differ.
+ *
+ * Gracefully handle the case where the MDS replies with -ENOENT and
+ * no trace (which it may do, at its discretion, e.g., if it doesn't
+ * care to issue a lease on the negative dentry).
+ */
+struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,
+ struct dentry *dentry, int err)
+{
+ struct ceph_client *client = ceph_client(dentry->d_sb);
+ struct inode *parent = dentry->d_parent->d_inode;
+
+ /* .snap dir? */
+ if (err == -ENOENT &&
+ ceph_vino(parent).ino != CEPH_INO_ROOT && /* no .snap in root dir */
+ strcmp(dentry->d_name.name,
+ client->mount_args->snapdir_name) == 0) {
+ struct inode *inode = ceph_get_snapdir(parent);
+ dout("ENOENT on snapdir %p '%.*s', linking to snapdir %p\n",
+ dentry, dentry->d_name.len, dentry->d_name.name, inode);
+ d_add(dentry, inode);

+ err = 0;
+ }

+ if (err == -ENOENT) {

+ /* no trace? */
+ err = 0;
+ if (!req->r_reply_info.head->is_dentry) {
+ dout("ENOENT and no trace, dentry %p inode %p\n",
+ dentry, dentry->d_inode);
+ if (dentry->d_inode) {
+ d_drop(dentry);
+ err = -ENOENT;
+ } else {
+ d_add(dentry, NULL);
+ }
+ }
+ }
+ if (err)
+ dentry = ERR_PTR(err);
+ else if (dentry != req->r_dentry)
+ dentry = dget(req->r_dentry); /* we got spliced */
+ else
+ dentry = NULL;
+ return dentry;
+}
+
+static int is_root_ceph_dentry(struct inode *inode, struct dentry *dentry)
+{
+ return ceph_ino(inode) == CEPH_INO_ROOT &&
+ strncmp(dentry->d_name.name, ".ceph", 5) == 0;
+}
+
+/*
+ * Look up a single dir entry. If there is a lookup intent, inform
+ * the MDS so that it gets our 'caps wanted' value in a single op.
+ */
+static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int op;
+ int err;
+
+ dout("lookup %p dentry %p '%.*s'\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+ if (dentry->d_name.len > NAME_MAX)
+ return ERR_PTR(-ENAMETOOLONG);
+
+ err = ceph_init_dentry(dentry);
+ if (err < 0)
+ return ERR_PTR(err);
+
+ /* open (but not create!) intent? */
+ if (nd &&
+ (nd->flags & LOOKUP_OPEN) &&
+ (nd->flags & LOOKUP_CONTINUE) == 0 && /* only open last component */
+ !(nd->intent.open.flags & O_CREAT)) {
+ int mode = nd->intent.open.create_mode & ~current->fs->umask;
+ return ceph_lookup_open(dir, dentry, nd, mode, 1);
+ }
+
+ /* can we conclude ENOENT locally? */
+ if (dentry->d_inode == NULL) {
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+
+ spin_lock(&dir->i_lock);
+ dout(" dir %p flags are %d\n", dir, ci->i_ceph_flags);
+ if (strncmp(dentry->d_name.name,
+ client->mount_args->snapdir_name,
+ dentry->d_name.len) &&
+ !is_root_ceph_dentry(dir, dentry) &&
+ (ci->i_ceph_flags & CEPH_I_COMPLETE) &&
+ (__ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1))) {
+ di->offset = ci->i_max_offset++;
+ spin_unlock(&dir->i_lock);
+ dout(" dir %p complete, -ENOENT\n", dir);
+ d_add(dentry, NULL);
+ di->lease_shared_gen = ci->i_shared_gen;
+ return NULL;
+ }
+ spin_unlock(&dir->i_lock);
+ }
+
+ op = ceph_snap(dir) == CEPH_SNAPDIR ?
+ CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
+ req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ /* we only need inode linkage */
+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);
+ req->r_locked_dir = dir;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);
+ dentry = ceph_finish_lookup(req, dentry, err);
+ ceph_mdsc_put_request(req); /* will dput(dentry) */
+ dout("lookup result=%p\n", dentry);
+ return dentry;
+}
+
+/*
+ * If we do a create but get no trace back from the MDS, follow up with
+ * a lookup (the VFS expects us to link up the provided dentry).
+ */
+int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry)
+{
+ struct dentry *result = ceph_lookup(dir, dentry, NULL);
+
+ if (result && !IS_ERR(result)) {
+ /*
+ * We created the item, then did a lookup, and found
+ * it was already linked to another inode we already
+ * had in our cache (and thus got spliced). Link our
+ * dentry to that inode, but don't hash it, just in
+ * case the VFS wants to dereference it.
+ */
+ BUG_ON(!result->d_inode);
+ d_instantiate(dentry, result->d_inode);
+ return 0;
+ }
+ return PTR_ERR(result);
+}
+
+static int ceph_mknod(struct inode *dir, struct dentry *dentry,
+ int mode, dev_t rdev)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("mknod in dir %p dentry %p mode 0%o rdev %d\n",
+ dir, dentry, mode, rdev);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_MKNOD, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_args.mknod.mode = cpu_to_le32(mode);
+ req->r_args.mknod.rdev = cpu_to_le32(rdev);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+ if (err)
+ d_drop(dentry);

+ return err;
+}
+

+static int ceph_create(struct inode *dir, struct dentry *dentry, int mode,
+ struct nameidata *nd)
+{
+ dout("create in dir %p dentry %p name '%.*s'\n",
+ dir, dentry, dentry->d_name.len, dentry->d_name.name);
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ if (nd) {
+ BUG_ON((nd->flags & LOOKUP_OPEN) == 0);
+ dentry = ceph_lookup_open(dir, dentry, nd, mode, 0);
+ /* hrm, what should i do here if we get aliased? */
+ if (IS_ERR(dentry))
+ return PTR_ERR(dentry);

+ return 0;
+ }
+

+ /* fall back to mknod */
+ return ceph_mknod(dir, dentry, (mode & ~S_IFMT) | S_IFREG, 0);
+}
+
+static int ceph_symlink(struct inode *dir, struct dentry *dentry,
+ const char *dest)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("symlink in dir %p dentry %p to '%s'\n", dir, dentry, dest);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SYMLINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_path2 = kstrdup(dest, GFP_NOFS);
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+ if (err)
+ d_drop(dentry);

+ return err;
+}
+

+static int ceph_mkdir(struct inode *dir, struct dentry *dentry, int mode)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* mkdir .snap/foo is a MKSNAP */
+ op = CEPH_MDS_OP_MKSNAP;
+ dout("mksnap dir %p snap '%.*s' dn %p\n", dir,
+ dentry->d_name.len, dentry->d_name.name, dentry);
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("mkdir dir %p dn %p mode 0%o\n", dir, dentry, mode);
+ op = CEPH_MDS_OP_MKDIR;
+ } else {
+ goto out;
+ }
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);

+ goto out;
+ }
+

+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_args.mkdir.mode = cpu_to_le32(mode);
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ err = ceph_handle_notrace_create(dir, dentry);
+ ceph_mdsc_put_request(req);
+out:
+ if (err < 0)
+ d_drop(dentry);

+ return err;
+}
+

+static int ceph_link(struct dentry *old_dentry, struct inode *dir,
+ struct dentry *dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(dir) != CEPH_NOSNAP)
+ return -EROFS;
+
+ dout("link in dir %p old_dentry %p dentry %p\n", dir,
+ old_dentry, dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LINK, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ d_drop(dentry);
+ return PTR_ERR(req);
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry); /* or inode? hrm. */
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (err)
+ d_drop(dentry);
+ else if (!req->r_reply_info.head->is_dentry)
+ d_instantiate(dentry, igrab(old_dentry->d_inode));

+ ceph_mdsc_put_request(req);
+ return err;

+}
+
+/*
+ * For a soon-to-be unlinked file, drop the AUTH_RDCACHE caps. If it
+ * looks like the link count will hit 0, drop any other caps (other
+ * than PIN) we don't specifically want (due to the file still being
+ * open).
+ */
+static int drop_caps_for_unlink(struct inode *inode)
+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+ int drop = CEPH_CAP_LINK_SHARED | CEPH_CAP_LINK_EXCL;
+
+ spin_lock(&inode->i_lock);
+ if (inode->i_nlink == 1) {
+ drop |= ~(__ceph_caps_wanted(ci) | CEPH_CAP_PIN);
+ ci->i_ceph_flags |= CEPH_I_NODELAY;
+ }
+ spin_unlock(&inode->i_lock);
+ return drop;
+}
+
+/*
+ * rmdir and unlink are differ only by the metadata op code
+ */
+static int ceph_unlink(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct inode *inode = dentry->d_inode;
+ struct ceph_mds_request *req;
+ int err = -EROFS;
+ int op;
+
+ if (ceph_snap(dir) == CEPH_SNAPDIR) {
+ /* rmdir .snap/foo is RMSNAP */
+ dout("rmsnap dir %p '%.*s' dn %p\n", dir, dentry->d_name.len,
+ dentry->d_name.name, dentry);
+ op = CEPH_MDS_OP_RMSNAP;
+ } else if (ceph_snap(dir) == CEPH_NOSNAP) {
+ dout("unlink/rmdir dir %p dn %p inode %p\n",
+ dir, dentry, inode);
+ op = ((dentry->d_inode->i_mode & S_IFMT) == S_IFDIR) ?
+ CEPH_MDS_OP_RMDIR : CEPH_MDS_OP_UNLINK;
+ } else
+ goto out;
+ req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS);
+ if (IS_ERR(req)) {
+ err = PTR_ERR(req);
+ goto out;
+ }
+ req->r_dentry = dget(dentry);
+ req->r_num_caps = 2;
+ req->r_locked_dir = dir;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_inode_drop = drop_caps_for_unlink(inode);
+ err = ceph_mdsc_do_request(mdsc, dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry)
+ d_delete(dentry);
+ ceph_mdsc_put_request(req);
+out:

+ return err;
+}
+

+static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
+ struct inode *new_dir, struct dentry *new_dentry)
+{
+ struct ceph_client *client = ceph_sb_to_client(old_dir->i_sb);
+ struct ceph_mds_client *mdsc = &client->mdsc;
+ struct ceph_mds_request *req;
+ int err;
+
+ if (ceph_snap(old_dir) != ceph_snap(new_dir))
+ return -EXDEV;
+ if (ceph_snap(old_dir) != CEPH_NOSNAP ||
+ ceph_snap(new_dir) != CEPH_NOSNAP)
+ return -EROFS;
+ dout("rename dir %p dentry %p to dir %p dentry %p\n",
+ old_dir, old_dentry, new_dir, new_dentry);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_RENAME, USE_AUTH_MDS);

+ if (IS_ERR(req))
+ return PTR_ERR(req);

+ req->r_dentry = dget(new_dentry);
+ req->r_num_caps = 2;
+ req->r_old_dentry = dget(old_dentry);
+ req->r_locked_dir = new_dir;
+ req->r_old_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_old_dentry_unless = CEPH_CAP_FILE_EXCL;
+ req->r_dentry_drop = CEPH_CAP_FILE_SHARED;
+ req->r_dentry_unless = CEPH_CAP_FILE_EXCL;
+ /* release LINK_RDCACHE on source inode (mds will lock it) */
+ req->r_old_inode_drop = CEPH_CAP_LINK_SHARED;
+ if (new_dentry->d_inode)
+ req->r_inode_drop = drop_caps_for_unlink(new_dentry->d_inode);
+ err = ceph_mdsc_do_request(mdsc, old_dir, req);
+ if (!err && !req->r_reply_info.head->is_dentry) {
+ /*
+ * Normally d_move() is done by fill_trace (called by
+ * do_request, above). If there is no trace, we need
+ * to do it here.
+ */
+ d_move(old_dentry, new_dentry);
+ }

+ ceph_mdsc_put_request(req);
+ return err;
+}

+
+
+/*
+ * Check if dentry lease is valid. If not, delete the lease. Try to
+ * renew if the least is more than half up.
+ */
+static int dentry_lease_is_valid(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di;
+ struct ceph_mds_session *s;
+ int valid = 0;
+ u32 gen;
+ unsigned long ttl;
+ struct ceph_mds_session *session = NULL;
+ struct inode *dir = NULL;
+ u32 seq = 0;
+
+ spin_lock(&dentry->d_lock);
+ di = ceph_dentry(dentry);
+ if (di && di->lease_session) {
+ s = di->lease_session;
+ spin_lock(&s->s_cap_lock);
+ gen = s->s_cap_gen;
+ ttl = s->s_cap_ttl;
+ spin_unlock(&s->s_cap_lock);
+
+ if (di->lease_gen == gen &&
+ time_before(jiffies, dentry->d_time) &&
+ time_before(jiffies, ttl)) {
+ valid = 1;
+ if (di->lease_renew_after &&
+ time_after(jiffies, di->lease_renew_after)) {
+ /* we should renew */
+ dir = dentry->d_parent->d_inode;
+ session = ceph_get_mds_session(s);
+ seq = di->lease_seq;
+ di->lease_renew_after = 0;
+ di->lease_renew_from = jiffies;
+ }
+ }
+ }
+ spin_unlock(&dentry->d_lock);
+
+ if (session) {
+ ceph_mdsc_lease_send_msg(session, dir, dentry,
+ CEPH_MDS_LEASE_RENEW, seq);
+ ceph_put_mds_session(session);
+ }
+ dout("dentry_lease_is_valid - dentry %p = %d\n", dentry, valid);
+ return valid;
+}
+
+/*
+ * Check if directory-wide content lease/cap is valid.
+ */
+static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
+{
+ struct ceph_inode_info *ci = ceph_inode(dir);
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ int valid = 0;
+
+ spin_lock(&dir->i_lock);
+ if (ci->i_shared_gen == di->lease_shared_gen)
+ valid = __ceph_caps_issued_mask(ci, CEPH_CAP_FILE_SHARED, 1);
+ spin_unlock(&dir->i_lock);
+ dout("dir_lease_is_valid dir %p v%u dentry %p v%u = %d\n",
+ dir, (unsigned)ci->i_shared_gen, dentry,
+ (unsigned)di->lease_shared_gen, valid);
+ return valid;
+}
+
+/*
+ * Check if cached dentry can be trusted.
+ */
+static int ceph_d_revalidate(struct dentry *dentry, struct nameidata *nd)
+{
+ struct inode *dir = dentry->d_parent->d_inode;
+
+ dout("d_revalidate %p '%.*s' inode %p\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+
+ /* always trust cached snapped dentries, snapdir dentry */
+ if (ceph_snap(dir) != CEPH_NOSNAP) {
+ dout("d_revalidate %p '%.*s' inode %p is SNAPPED\n", dentry,
+ dentry->d_name.len, dentry->d_name.name, dentry->d_inode);
+ goto out_touch;
+ }
+ if (dentry->d_inode && ceph_snap(dentry->d_inode) == CEPH_SNAPDIR)
+ goto out_touch;
+
+ if (dentry_lease_is_valid(dentry) ||
+ dir_lease_is_valid(dir, dentry))
+ goto out_touch;
+
+ dout("d_revalidate %p invalid\n", dentry);
+ d_drop(dentry);
+ return 0;
+out_touch:
+ ceph_dentry_lru_touch(dentry);
+ return 1;
+}
+
+/*
+ * When a dentry is released, clear the dir I_COMPLETE if it was part
+ * of the current dir gen.
+ */
+static void ceph_dentry_release(struct dentry *dentry)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dentry);
+ struct inode *parent_inode = dentry->d_parent->d_inode;
+
+ if (parent_inode) {
+ struct ceph_inode_info *ci = ceph_inode(parent_inode);
+
+ spin_lock(&parent_inode->i_lock);
+ if (ci->i_shared_gen == di->lease_shared_gen) {
+ dout(" clearing %p complete (d_release)\n",
+ parent_inode);
+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;
+ ci->i_release_count++;
+ }
+ spin_unlock(&parent_inode->i_lock);
+ }
+ if (di) {
+ ceph_dentry_lru_del(dentry);
+ if (di->lease_session)
+ ceph_put_mds_session(di->lease_session);
+ kmem_cache_free(ceph_dentry_cachep, di);
+ dentry->d_fsdata = NULL;
+ }
+}
+
+static int ceph_snapdir_d_revalidate(struct dentry *dentry,
+ struct nameidata *nd)
+{
+ /*
+ * Eventually, we'll want to revalidate snapped metadata
+ * too... probably...
+ */

+ return 1;
+}
+

+
+
+/*
+ * read() on a dir. This weird interface hack only works if mounted
+ * with '-o dirstat'.
+ */
+static ssize_t ceph_read_dir(struct file *file, char __user *buf, size_t size,
+ loff_t *ppos)
+{
+ struct ceph_file_info *cf = file->private_data;

+ struct inode *inode = file->f_dentry->d_inode;
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int left;
+
+ if (!ceph_test_opt(ceph_client(inode->i_sb), DIRSTAT))
+ return -EISDIR;
+
+ if (!cf->dir_info) {
+ cf->dir_info = kmalloc(1024, GFP_NOFS);
+ if (!cf->dir_info)
+ return -ENOMEM;
+ cf->dir_info_len =
+ sprintf(cf->dir_info,
+ "entries: %20lld\n"
+ " files: %20lld\n"
+ " subdirs: %20lld\n"
+ "rentries: %20lld\n"
+ " rfiles: %20lld\n"
+ " rsubdirs: %20lld\n"
+ "rbytes: %20lld\n"
+ "rctime: %10ld.%09ld\n",
+ ci->i_files + ci->i_subdirs,
+ ci->i_files,
+ ci->i_subdirs,
+ ci->i_rfiles + ci->i_rsubdirs,
+ ci->i_rfiles,
+ ci->i_rsubdirs,
+ ci->i_rbytes,
+ (long)ci->i_rctime.tv_sec,
+ (long)ci->i_rctime.tv_nsec);
+ }
+
+ if (*ppos >= cf->dir_info_len)
+ return 0;
+ size = min_t(unsigned, size, cf->dir_info_len-*ppos);
+ left = copy_to_user(buf, cf->dir_info + *ppos, size);
+ if (left == size)
+ return -EFAULT;
+ *ppos += (size - left);
+ return size - left;
+}
+
+/*
+ * an fsync() on a dir will wait for any uncommitted directory
+ * operations to commit.
+ */
+static int ceph_dir_fsync(struct file *file, struct dentry *dentry,
+ int datasync)
+{
+ struct inode *inode = dentry->d_inode;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct list_head *head = &ci->i_unsafe_dirops;
+ struct ceph_mds_request *req;
+ u64 last_tid;

+ int ret = 0;
+

+ dout("dir_fsync %p\n", inode);
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ req = list_entry(head->prev,
+ struct ceph_mds_request, r_unsafe_dir_item);
+ last_tid = req->r_tid;
+
+ do {
+ ceph_mdsc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+ dout("dir_fsync %p wait on tid %llu (until %llu)\n",
+ inode, req->r_tid, last_tid);
+ if (req->r_timeout) {
+ ret = wait_for_completion_timeout(
+ &req->r_safe_completion, req->r_timeout);
+ if (ret > 0)
+ ret = 0;
+ else if (ret == 0)
+ ret = -EIO; /* timed out */
+ } else {
+ wait_for_completion(&req->r_safe_completion);
+ }
+ spin_lock(&ci->i_unsafe_lock);
+ ceph_mdsc_put_request(req);
+
+ if (ret || list_empty(head))
+ break;
+ req = list_entry(head->next,
+ struct ceph_mds_request, r_unsafe_dir_item);
+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+ return ret;
+}
+
+/*
+ * We maintain a private dentry LRU.
+ *
+ * FIXME: this needs to be changed to a per-mds lru to be useful.
+ */
+void ceph_dentry_lru_add(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_add %p %p '%.*s'\n", di, dn,
+ dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_add_tail(&di->lru, &mdsc->dentry_lru);
+ mdsc->num_dentry++;

+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}

+
+void ceph_dentry_lru_touch(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_touch %p %p '%.*s'\n", di, dn,
+ dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_move_tail(&di->lru, &mdsc->dentry_lru);

+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}

+
+void ceph_dentry_lru_del(struct dentry *dn)
+{
+ struct ceph_dentry_info *di = ceph_dentry(dn);
+ struct ceph_mds_client *mdsc;
+
+ dout("dentry_lru_del %p %p '%.*s'\n", di, dn,
+ dn->d_name.len, dn->d_name.name);
+ if (di) {
+ mdsc = &ceph_client(dn->d_sb)->mdsc;
+ spin_lock(&mdsc->dentry_lru_lock);
+ list_del_init(&di->lru);
+ mdsc->num_dentry--;

+ spin_unlock(&mdsc->dentry_lru_lock);
+ }
+}

+
+const struct file_operations ceph_dir_fops = {
+ .read = ceph_read_dir,
+ .readdir = ceph_readdir,
+ .llseek = ceph_dir_llseek,
+ .open = ceph_open,
+ .release = ceph_release,
+ .unlocked_ioctl = ceph_ioctl,
+ .fsync = ceph_dir_fsync,
+};
+
+const struct inode_operations ceph_dir_iops = {
+ .lookup = ceph_lookup,
+ .permission = ceph_permission,
+ .getattr = ceph_getattr,
+ .setattr = ceph_setattr,
+ .setxattr = ceph_setxattr,
+ .getxattr = ceph_getxattr,
+ .listxattr = ceph_listxattr,
+ .removexattr = ceph_removexattr,
+ .mknod = ceph_mknod,
+ .symlink = ceph_symlink,
+ .mkdir = ceph_mkdir,
+ .link = ceph_link,
+ .unlink = ceph_unlink,
+ .rmdir = ceph_unlink,
+ .rename = ceph_rename,
+ .create = ceph_create,
+};
+
+struct dentry_operations ceph_dentry_ops = {
+ .d_revalidate = ceph_d_revalidate,
+ .d_release = ceph_dentry_release,
+};
+
+struct dentry_operations ceph_snapdir_dentry_ops = {
+ .d_revalidate = ceph_snapdir_d_revalidate,
+};
+
+struct dentry_operations ceph_snap_dentry_ops = {
+};

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

These headers describe the types used to exchange messages between the
Ceph client and various servers. All types are little-endian and
packed. These headers are shared between the kernel and userspace, so
all types are in terms of e.g. __u32.

Additionally, we define a few magic values to identify the current
version of the protocol(s) in use, so that discrepancies to be
detected on mount.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/ceph_fs.c | 74 ++++++
fs/ceph/ceph_fs.h | 651 ++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/ceph_strings.c | 176 +++++++++++++
fs/ceph/msgr.h | 158 ++++++++++++
fs/ceph/rados.h | 374 +++++++++++++++++++++++++++
5 files changed, 1433 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/ceph_fs.c
create mode 100644 fs/ceph/ceph_fs.h
create mode 100644 fs/ceph/ceph_strings.c
create mode 100644 fs/ceph/msgr.h
create mode 100644 fs/ceph/rados.h

diff --git a/fs/ceph/ceph_fs.c b/fs/ceph/ceph_fs.c
new file mode 100644
index 0000000..79d76bc
--- /dev/null
+++ b/fs/ceph/ceph_fs.c
@@ -0,0 +1,74 @@
+/*
+ * Some non-inline ceph helpers
+ */
+#include "types.h"
+
+/*
+ * return true if @layout appears to be valid
+ */
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout)
+{
+ __u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ __u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ __u32 os = le32_to_cpu(layout->fl_object_size);
+
+ /* stripe unit, object size must be non-zero, 64k increment */
+ if (!su || (su & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ if (!os || (os & (CEPH_MIN_STRIPE_UNIT-1)))
+ return 0;
+ /* object size must be a multiple of stripe unit */
+ if (os < su || os % su)
+ return 0;
+ /* stripe count must be non-zero */
+ if (!sc)
+ return 0;

+ return 1;
+}
+
+

+int ceph_flags_to_mode(int flags)
+{
+#ifdef O_DIRECTORY /* fixme */
+ if ((flags & O_DIRECTORY) == O_DIRECTORY)
+ return CEPH_FILE_MODE_PIN;
+#endif
+#ifdef O_LAZY
+ if (flags & O_LAZY)
+ return CEPH_FILE_MODE_LAZY;
+#endif
+ if ((flags & O_APPEND) == O_APPEND)
+ flags |= O_WRONLY;
+
+ flags &= O_ACCMODE;
+ if ((flags & O_RDWR) == O_RDWR)
+ return CEPH_FILE_MODE_RDWR;
+ if ((flags & O_WRONLY) == O_WRONLY)
+ return CEPH_FILE_MODE_WR;
+ return CEPH_FILE_MODE_RD;
+}
+
+int ceph_caps_for_mode(int mode)
+{
+ switch (mode) {
+ case CEPH_FILE_MODE_PIN:
+ return CEPH_CAP_PIN;
+ case CEPH_FILE_MODE_RD:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE;
+ case CEPH_FILE_MODE_RDWR:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_RD | CEPH_CAP_FILE_CACHE |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;
+ case CEPH_FILE_MODE_WR:
+ return CEPH_CAP_PIN | CEPH_CAP_FILE_SHARED |
+ CEPH_CAP_FILE_EXCL |
+ CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_AUTH_SHARED | CEPH_CAP_AUTH_EXCL |
+ CEPH_CAP_XATTR_SHARED | CEPH_CAP_XATTR_EXCL;

+ }
+ return 0;
+}

diff --git a/fs/ceph/ceph_fs.h b/fs/ceph/ceph_fs.h
new file mode 100644
index 0000000..4c68061
--- /dev/null
+++ b/fs/ceph/ceph_fs.h
@@ -0,0 +1,651 @@
+/*
+ * ceph_fs.h - Ceph constants and data types to share between kernel and
+ * user space.
+ *
+ * Most types in this file are defined as little-endian, and are
+ * primarily intended to describe data structures that pass over the
+ * wire or that are stored on disk.
+ *
+ * LGPL2
+ */
+
+#ifndef _FS_CEPH_CEPH_FS_H
+#define _FS_CEPH_CEPH_FS_H
+
+#include "msgr.h"
+#include "rados.h"
+
+/*
+ * Ceph release version
+ */
+#define CEPH_VERSION_MAJOR 0
+#define CEPH_VERSION_MINOR 19
+#define CEPH_VERSION_PATCH 0
+
+#define _CEPH_STRINGIFY(x) #x
+#define CEPH_STRINGIFY(x) _CEPH_STRINGIFY(x)
+#define CEPH_MAKE_VERSION(x, y, z) CEPH_STRINGIFY(x) "." CEPH_STRINGIFY(y) \
+ "." CEPH_STRINGIFY(z)
+#define CEPH_VERSION CEPH_MAKE_VERSION(CEPH_VERSION_MAJOR, \
+ CEPH_VERSION_MINOR, CEPH_VERSION_PATCH)
+
+/*
+ * subprotocol versions. when specific messages types or high-level
+ * protocols change, bump the affected components. we keep rev
+ * internal cluster protocols separately from the public,
+ * client-facing protocol.
+ */
+#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
+#define CEPH_MDS_PROTOCOL 9 /* cluster internal */
+#define CEPH_MON_PROTOCOL 5 /* cluster internal */
+#define CEPH_OSDC_PROTOCOL 24 /* server/client */
+#define CEPH_MDSC_PROTOCOL 32 /* server/client */
+#define CEPH_MONC_PROTOCOL 15 /* server/client */
+
+
+#define CEPH_INO_ROOT 1
+#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
+
+/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
+#define CEPH_MAX_MON 31
+
+
+/*
+ * feature bits
+ */
+#define CEPH_FEATURE_SUPPORTED 0
+#define CEPH_FEATURE_REQUIRED 0
+
+
+/*
+ * ceph_file_layout - describe data layout for a file/inode
+ */
+struct ceph_file_layout {
+ /* file -> object mapping */
+ __le32 fl_stripe_unit; /* stripe unit, in bytes. must be multiple
+ of page size. */
+ __le32 fl_stripe_count; /* over this many objects */
+ __le32 fl_object_size; /* until objects are this big, then move to
+ new objects */
+ __le32 fl_cas_hash; /* 0 = none; 1 = sha256 */
+
+ /* pg -> disk layout */
+ __le32 fl_object_stripe_unit; /* for per-object parity, if any */
+
+ /* object -> pg layout */
+ __le32 fl_pg_preferred; /* preferred primary for pg (-1 for none) */
+ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
+} __attribute__ ((packed));
+
+#define CEPH_MIN_STRIPE_UNIT 65536
+
+int ceph_file_layout_is_valid(const struct ceph_file_layout *layout);
+
+
+/* crypto algorithms */
+#define CEPH_CRYPTO_NONE 0x0
+#define CEPH_CRYPTO_AES 0x1
+
+/* security/authentication protocols */
+#define CEPH_AUTH_UNKNOWN 0x0
+#define CEPH_AUTH_NONE 0x1
+#define CEPH_AUTH_CEPHX 0x2
+
+
+/*********************************************
+ * message layer
+ */
+
+/*
+ * message types
+ */
+
+/* misc */
+#define CEPH_MSG_SHUTDOWN 1
+#define CEPH_MSG_PING 2
+
+/* client <-> monitor */
+#define CEPH_MSG_MON_MAP 4
+#define CEPH_MSG_MON_GET_MAP 5
+#define CEPH_MSG_STATFS 13
+#define CEPH_MSG_STATFS_REPLY 14
+#define CEPH_MSG_MON_SUBSCRIBE 15
+#define CEPH_MSG_MON_SUBSCRIBE_ACK 16
+#define CEPH_MSG_AUTH 17
+#define CEPH_MSG_AUTH_REPLY 18
+
+/* client <-> mds */
+#define CEPH_MSG_MDS_MAP 21
+
+#define CEPH_MSG_CLIENT_SESSION 22
+#define CEPH_MSG_CLIENT_RECONNECT 23
+
+#define CEPH_MSG_CLIENT_REQUEST 24
+#define CEPH_MSG_CLIENT_REQUEST_FORWARD 25
+#define CEPH_MSG_CLIENT_REPLY 26
+#define CEPH_MSG_CLIENT_CAPS 0x310
+#define CEPH_MSG_CLIENT_LEASE 0x311
+#define CEPH_MSG_CLIENT_SNAP 0x312
+#define CEPH_MSG_CLIENT_CAPRELEASE 0x313
+
+/* osd */
+#define CEPH_MSG_OSD_MAP 41
+#define CEPH_MSG_OSD_OP 42
+#define CEPH_MSG_OSD_OPREPLY 43
+
+struct ceph_mon_request_header {
+ __le64 have_version;
+ __le16 session_mon;
+ __le64 session_mon_tid;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_statfs {
+ __le64 kb, kb_used, kb_avail;
+ __le64 num_objects;
+} __attribute__ ((packed));
+
+struct ceph_mon_statfs_reply {
+ struct ceph_fsid fsid;
+ __le64 version;
+ struct ceph_statfs st;
+} __attribute__ ((packed));
+
+struct ceph_osd_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+ __le32 start;
+} __attribute__ ((packed));
+
+struct ceph_mds_getmap {
+ struct ceph_mon_request_header monhdr;
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+struct ceph_client_mount {
+ struct ceph_mon_request_header monhdr;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_item {
+ __le64 have_version; __le64 have;
+ __u8 onetime;
+} __attribute__ ((packed));
+
+struct ceph_mon_subscribe_ack {
+ __le32 duration; /* seconds */
+ struct ceph_fsid fsid;
+} __attribute__ ((packed));
+
+/*
+ * mds states
+ * > 0 -> in
+ * <= 0 -> out
+ */
+#define CEPH_MDS_STATE_DNE 0 /* down, does not exist. */
+#define CEPH_MDS_STATE_STOPPED -1 /* down, once existed, but no subtrees.
+ empty log. */
+#define CEPH_MDS_STATE_BOOT -4 /* up, boot announcement. */
+#define CEPH_MDS_STATE_STANDBY -5 /* up, idle. waiting for assignment. */
+#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
+#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
+#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
+
+#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
+#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
+ operations (import, rename, etc.) */
+#define CEPH_MDS_STATE_RECONNECT 10 /* up, reconnect to clients */
+#define CEPH_MDS_STATE_REJOIN 11 /* up, rejoining distributed cache */
+#define CEPH_MDS_STATE_CLIENTREPLAY 12 /* up, replaying client operations */
+#define CEPH_MDS_STATE_ACTIVE 13 /* up, active */
+#define CEPH_MDS_STATE_STOPPING 14 /* up, but exporting metadata */
+
+extern const char *ceph_mds_state_name(int s);
+
+
+/*
+ * metadata lock types.
+ * - these are bitmasks.. we can compose them
+ * - they also define the lock ordering by the MDS
+ * - a few of these are internal to the mds
+ */
+#define CEPH_LOCK_DN 1
+#define CEPH_LOCK_ISNAP 2
+#define CEPH_LOCK_IVERSION 4 /* mds internal */
+#define CEPH_LOCK_IFILE 8 /* mds internal */
+#define CEPH_LOCK_IAUTH 32
+#define CEPH_LOCK_ILINK 64
+#define CEPH_LOCK_IDFT 128 /* dir frag tree */
+#define CEPH_LOCK_INEST 256 /* mds internal */
+#define CEPH_LOCK_IXATTR 512
+#define CEPH_LOCK_INO 2048 /* immutable inode bits; not a lock */
+
+/* client_session ops */
+enum {
+ CEPH_SESSION_REQUEST_OPEN,
+ CEPH_SESSION_OPEN,
+ CEPH_SESSION_REQUEST_CLOSE,
+ CEPH_SESSION_CLOSE,
+ CEPH_SESSION_REQUEST_RENEWCAPS,
+ CEPH_SESSION_RENEWCAPS,
+ CEPH_SESSION_STALE,
+ CEPH_SESSION_RECALL_STATE,
+};
+
+extern const char *ceph_session_op_name(int op);
+
+struct ceph_mds_session_head {
+ __le32 op;
+ __le64 seq;
+ struct ceph_timespec stamp;
+ __le32 max_caps, max_leases;
+} __attribute__ ((packed));
+
+/* client_request */
+/*
+ * metadata ops.
+ * & 0x001000 -> write op
+ * & 0x010000 -> follow symlink (e.g. stat(), not lstat()).
+ & & 0x100000 -> use weird ino/path trace
+ */
+#define CEPH_MDS_OP_WRITE 0x001000
+enum {
+ CEPH_MDS_OP_LOOKUP = 0x00100,
+ CEPH_MDS_OP_GETATTR = 0x00101,
+ CEPH_MDS_OP_LOOKUPHASH = 0x00102,
+ CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
+
+ CEPH_MDS_OP_SETXATTR = 0x01105,
+ CEPH_MDS_OP_RMXATTR = 0x01106,
+ CEPH_MDS_OP_SETLAYOUT = 0x01107,
+ CEPH_MDS_OP_SETATTR = 0x01108,
+
+ CEPH_MDS_OP_MKNOD = 0x01201,
+ CEPH_MDS_OP_LINK = 0x01202,
+ CEPH_MDS_OP_UNLINK = 0x01203,
+ CEPH_MDS_OP_RENAME = 0x01204,
+ CEPH_MDS_OP_MKDIR = 0x01220,
+ CEPH_MDS_OP_RMDIR = 0x01221,
+ CEPH_MDS_OP_SYMLINK = 0x01222,
+
+ CEPH_MDS_OP_CREATE = 0x01301,
+ CEPH_MDS_OP_OPEN = 0x00302,
+ CEPH_MDS_OP_READDIR = 0x00305,
+
+ CEPH_MDS_OP_LOOKUPSNAP = 0x00400,
+ CEPH_MDS_OP_MKSNAP = 0x01400,
+ CEPH_MDS_OP_RMSNAP = 0x01401,
+ CEPH_MDS_OP_LSSNAP = 0x00402,
+};
+
+extern const char *ceph_mds_op_name(int op);
+
+
+#define CEPH_SETATTR_MODE 1
+#define CEPH_SETATTR_UID 2
+#define CEPH_SETATTR_GID 4
+#define CEPH_SETATTR_MTIME 8
+#define CEPH_SETATTR_ATIME 16
+#define CEPH_SETATTR_SIZE 32
+#define CEPH_SETATTR_CTIME 64
+
+union ceph_mds_request_args {
+ struct {
+ __le32 mask; /* CEPH_CAP_* */
+ } __attribute__ ((packed)) getattr;
+ struct {
+ __le32 mode;
+ __le32 uid;
+ __le32 gid;
+ struct ceph_timespec mtime;
+ struct ceph_timespec atime;
+ __le64 size, old_size; /* old_size needed by truncate */
+ __le32 mask; /* CEPH_SETATTR_* */
+ } __attribute__ ((packed)) setattr;
+ struct {
+ __le32 frag; /* which dir fragment */
+ __le32 max_entries; /* how many dentries to grab */
+ } __attribute__ ((packed)) readdir;
+ struct {
+ __le32 mode;
+ __le32 rdev;
+ } __attribute__ ((packed)) mknod;
+ struct {
+ __le32 mode;
+ } __attribute__ ((packed)) mkdir;
+ struct {
+ __le32 flags;
+ __le32 mode;
+ __le32 stripe_unit; /* layout for newly created file */
+ __le32 stripe_count; /* ... */
+ __le32 object_size;
+ __le32 file_replication;
+ __le32 preferred;
+ } __attribute__ ((packed)) open;
+ struct {
+ __le32 flags;
+ } __attribute__ ((packed)) setxattr;
+ struct {
+ struct ceph_file_layout layout;
+ } __attribute__ ((packed)) setlayout;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_FLAG_REPLAY 1 /* this is a replayed op */
+#define CEPH_MDS_FLAG_WANT_DENTRY 2 /* want dentry in reply */
+
+struct ceph_mds_request_head {
+ __le64 oldest_client_tid;
+ __le32 mdsmap_epoch; /* on client */
+ __le32 flags; /* CEPH_MDS_FLAG_* */
+ __u8 num_retry, num_fwd; /* count retry, fwd attempts */
+ __le16 num_releases; /* # include cap/lease release records */
+ __le32 op; /* mds op code */
+ __le32 caller_uid, caller_gid;
+ __le64 ino; /* use this ino for openc, mkdir, mknod,
+ etc. (if replaying) */
+ union ceph_mds_request_args args;
+} __attribute__ ((packed));
+
+/* cap/lease release record */
+struct ceph_mds_request_release {
+ __le64 ino, cap_id; /* ino and unique cap id */
+ __le32 caps, wanted; /* new issued, wanted */
+ __le32 seq, issue_seq, mseq;
+ __le32 dname_seq; /* if releasing a dentry lease, a */
+ __le32 dname_len; /* string follows. */
+} __attribute__ ((packed));
+
+/* client reply */
+struct ceph_mds_reply_head {
+ __le32 op;
+ __le32 result;
+ __le32 mdsmap_epoch;
+ __u8 safe; /* true if committed to disk */
+ __u8 is_dentry, is_target; /* true if dentry, target inode records
+ are included with reply */
+} __attribute__ ((packed));
+
+/* one for each node split */
+struct ceph_frag_tree_split {
+ __le32 frag; /* this frag splits... */
+ __le32 by; /* ...by this many bits */
+} __attribute__ ((packed));
+
+struct ceph_frag_tree_head {
+ __le32 nsplits; /* num ceph_frag_tree_split records */
+ struct ceph_frag_tree_split splits[];
+} __attribute__ ((packed));
+
+/* capability issue, for bundling with mds reply */
+struct ceph_mds_reply_cap {
+ __le32 caps, wanted; /* caps issued, wanted */
+ __le64 cap_id;
+ __le32 seq, mseq;
+ __le64 realm; /* snap realm */
+ __u8 flags; /* CEPH_CAP_FLAG_* */
+} __attribute__ ((packed));
+
+#define CEPH_CAP_FLAG_AUTH 1 /* cap is issued by auth mds */
+
+/* inode record, for bundling with mds reply */
+struct ceph_mds_reply_inode {
+ __le64 ino;
+ __le64 snapid;
+ __le32 rdev;
+ __le64 version; /* inode version */
+ __le64 xattr_version; /* version for xattr blob */
+ struct ceph_mds_reply_cap cap; /* caps issued for this inode */
+ struct ceph_file_layout layout;
+ struct ceph_timespec ctime, mtime, atime;
+ __le32 time_warp_seq;
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ __le32 mode, uid, gid;
+ __le32 nlink;
+ __le64 files, subdirs, rbytes, rfiles, rsubdirs; /* dir stats */
+ struct ceph_timespec rctime;
+ struct ceph_frag_tree_head fragtree; /* (must be at end of struct) */
+} __attribute__ ((packed));
+/* followed by frag array, then symlink string, then xattr blob */
+
+/* reply_lease follows dname, and reply_inode */
+struct ceph_mds_reply_lease {
+ __le16 mask; /* lease type(s) */
+ __le32 duration_ms; /* lease duration */
+ __le32 seq;
+} __attribute__ ((packed));
+
+struct ceph_mds_reply_dirfrag {
+ __le32 frag; /* fragment */
+ __le32 auth; /* auth mds, if this is a delegation point */
+ __le32 ndist; /* number of mds' this is replicated on */
+ __le32 dist[];
+} __attribute__ ((packed));
+
+/* file access modes */
+#define CEPH_FILE_MODE_PIN 0
+#define CEPH_FILE_MODE_RD 1
+#define CEPH_FILE_MODE_WR 2
+#define CEPH_FILE_MODE_RDWR 3 /* RD | WR */
+#define CEPH_FILE_MODE_LAZY 4 /* lazy io */
+#define CEPH_FILE_MODE_NUM 8 /* bc these are bit fields.. mostly */
+
+int ceph_flags_to_mode(int flags);
+
+
+/* capability bits */
+#define CEPH_CAP_PIN 1 /* no specific capabilities beyond the pin */
+
+/* generic cap bits */
+#define CEPH_CAP_GSHARED 1 /* client can reads */
+#define CEPH_CAP_GEXCL 2 /* client can read and update */
+#define CEPH_CAP_GCACHE 4 /* (file) client can cache reads */
+#define CEPH_CAP_GRD 8 /* (file) client can read */
+#define CEPH_CAP_GWR 16 /* (file) client can write */
+#define CEPH_CAP_GBUFFER 32 /* (file) client can buffer writes */
+#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
+#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
+
+/* per-lock shift */
+#define CEPH_CAP_SAUTH 2
+#define CEPH_CAP_SLINK 4
+#define CEPH_CAP_SXATTR 6
+#define CEPH_CAP_SFILE 8 /* goes at the end (uses >2 cap bits) */
+
+#define CEPH_CAP_BITS 16
+
+/* composed values */
+#define CEPH_CAP_AUTH_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SAUTH)
+#define CEPH_CAP_AUTH_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SAUTH)
+#define CEPH_CAP_LINK_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SLINK)
+#define CEPH_CAP_LINK_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SLINK)
+#define CEPH_CAP_XATTR_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SXATTR)
+#define CEPH_CAP_XATTR_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SXATTR)
+#define CEPH_CAP_FILE(x) (x << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_SHARED (CEPH_CAP_GSHARED << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_EXCL (CEPH_CAP_GEXCL << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_CACHE (CEPH_CAP_GCACHE << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_RD (CEPH_CAP_GRD << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WR (CEPH_CAP_GWR << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_BUFFER (CEPH_CAP_GBUFFER << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_WREXTEND (CEPH_CAP_GWREXTEND << CEPH_CAP_SFILE)
+#define CEPH_CAP_FILE_LAZYIO (CEPH_CAP_GLAZYIO << CEPH_CAP_SFILE)
+
+/* cap masks (for getattr) */
+#define CEPH_STAT_CAP_INODE CEPH_CAP_PIN
+#define CEPH_STAT_CAP_TYPE CEPH_CAP_PIN /* mode >> 12 */
+#define CEPH_STAT_CAP_SYMLINK CEPH_CAP_PIN
+#define CEPH_STAT_CAP_UID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_GID CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_MODE CEPH_CAP_AUTH_SHARED
+#define CEPH_STAT_CAP_NLINK CEPH_CAP_LINK_SHARED
+#define CEPH_STAT_CAP_LAYOUT CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_MTIME CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_SIZE CEPH_CAP_FILE_SHARED
+#define CEPH_STAT_CAP_ATIME CEPH_CAP_FILE_SHARED /* fixme */
+#define CEPH_STAT_CAP_XATTR CEPH_CAP_XATTR_SHARED
+#define CEPH_STAT_CAP_INODE_ALL (CEPH_CAP_PIN | \
+ CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_FILE_SHARED | \
+ CEPH_CAP_XATTR_SHARED)
+
+#define CEPH_CAP_ANY_SHARED (CEPH_CAP_AUTH_SHARED | \
+ CEPH_CAP_LINK_SHARED | \
+ CEPH_CAP_XATTR_SHARED | \
+ CEPH_CAP_FILE_SHARED)
+#define CEPH_CAP_ANY_RD (CEPH_CAP_ANY_SHARED | CEPH_CAP_FILE_RD | \
+ CEPH_CAP_FILE_CACHE)
+
+#define CEPH_CAP_ANY_EXCL (CEPH_CAP_AUTH_EXCL | \
+ CEPH_CAP_LINK_EXCL | \
+ CEPH_CAP_XATTR_EXCL | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_FILE_WR (CEPH_CAP_FILE_WR | CEPH_CAP_FILE_BUFFER | \
+ CEPH_CAP_FILE_EXCL)
+#define CEPH_CAP_ANY_WR (CEPH_CAP_ANY_EXCL | CEPH_CAP_ANY_FILE_WR)
+#define CEPH_CAP_ANY (CEPH_CAP_ANY_RD | CEPH_CAP_ANY_EXCL | \
+ CEPH_CAP_ANY_FILE_WR | CEPH_CAP_PIN)
+
+#define CEPH_CAP_LOCKS (CEPH_LOCK_IFILE | CEPH_LOCK_IAUTH | CEPH_LOCK_ILINK | \
+ CEPH_LOCK_IXATTR)
+
+int ceph_caps_for_mode(int mode);
+
+enum {
+ CEPH_CAP_OP_GRANT, /* mds->client grant */
+ CEPH_CAP_OP_REVOKE, /* mds->client revoke */
+ CEPH_CAP_OP_TRUNC, /* mds->client trunc notify */
+ CEPH_CAP_OP_EXPORT, /* mds has exported the cap */
+ CEPH_CAP_OP_IMPORT, /* mds has imported the cap */
+ CEPH_CAP_OP_UPDATE, /* client->mds update */
+ CEPH_CAP_OP_DROP, /* client->mds drop cap bits */
+ CEPH_CAP_OP_FLUSH, /* client->mds cap writeback */
+ CEPH_CAP_OP_FLUSH_ACK, /* mds->client flushed */
+ CEPH_CAP_OP_FLUSHSNAP, /* client->mds flush snapped metadata */
+ CEPH_CAP_OP_FLUSHSNAP_ACK, /* mds->client flushed snapped metadata */
+ CEPH_CAP_OP_RELEASE, /* client->mds release (clean) cap */
+ CEPH_CAP_OP_RENEW, /* client->mds renewal request */
+};
+
+extern const char *ceph_cap_op_name(int op);
+
+/*
+ * caps message, used for capability callbacks, acks, requests, etc.
+ */
+struct ceph_mds_caps {
+ __le32 op; /* CEPH_CAP_OP_* */
+ __le64 ino, realm;
+ __le64 cap_id;
+ __le32 seq, issue_seq;
+ __le32 caps, wanted, dirty; /* latest issued/wanted/dirty */
+ __le32 migrate_seq;
+ __le64 snap_follows;
+ __le32 snap_trace_len;
+
+ /* authlock */
+ __le32 uid, gid, mode;
+
+ /* linklock */
+ __le32 nlink;
+
+ /* xattrlock */
+ __le32 xattr_len;
+ __le64 xattr_version;
+
+ /* filelock */
+ __le64 size, max_size, truncate_size;
+ __le32 truncate_seq;
+ struct ceph_timespec mtime, atime, ctime;
+ struct ceph_file_layout layout;
+ __le32 time_warp_seq;
+} __attribute__ ((packed));
+
+/* cap release msg head */
+struct ceph_mds_cap_release {
+ __le32 num; /* number of cap_items that follow */
+} __attribute__ ((packed));
+
+struct ceph_mds_cap_item {
+ __le64 ino;
+ __le64 cap_id;
+ __le32 migrate_seq, seq;
+} __attribute__ ((packed));
+
+#define CEPH_MDS_LEASE_REVOKE 1 /* mds -> client */
+#define CEPH_MDS_LEASE_RELEASE 2 /* client -> mds */
+#define CEPH_MDS_LEASE_RENEW 3 /* client <-> mds */
+#define CEPH_MDS_LEASE_REVOKE_ACK 4 /* client -> mds */
+
+extern const char *ceph_lease_op_name(int o);
+
+/* lease msg header */
+struct ceph_mds_lease {
+ __u8 action; /* CEPH_MDS_LEASE_* */
+ __le16 mask; /* which lease */
+ __le64 ino;
+ __le64 first, last; /* snap range */
+ __le32 seq;
+ __le32 duration_ms; /* duration of renewal */
+} __attribute__ ((packed));
+/* followed by a __le32+string for dname */
+
+/* client reconnect */
+struct ceph_mds_cap_reconnect {
+ __le64 cap_id;
+ __le32 wanted;
+ __le32 issued;
+ __le64 size;
+ struct ceph_timespec mtime, atime;
+ __le64 snaprealm;
+ __le64 pathbase; /* base ino for our path to this ino */
+} __attribute__ ((packed));
+/* followed by encoded string */
+
+struct ceph_mds_snaprealm_reconnect {
+ __le64 ino; /* snap realm base */
+ __le64 seq; /* snap seq for this snap realm */
+ __le64 parent; /* parent realm */
+} __attribute__ ((packed));
+
+/*
+ * snaps
+ */
+enum {
+ CEPH_SNAP_OP_UPDATE, /* CREATE or DESTROY */
+ CEPH_SNAP_OP_CREATE,
+ CEPH_SNAP_OP_DESTROY,
+ CEPH_SNAP_OP_SPLIT,
+};
+
+extern const char *ceph_snap_op_name(int o);
+
+/* snap msg header */
+struct ceph_mds_snap_head {
+ __le32 op; /* CEPH_SNAP_OP_* */
+ __le64 split; /* ino to split off, if any */
+ __le32 num_split_inos; /* # inos belonging to new child realm */
+ __le32 num_split_realms; /* # child realms udner new child realm */
+ __le32 trace_len; /* size of snap trace blob */
+} __attribute__ ((packed));
+/* followed by split ino list, then split realms, then the trace blob */
+
+/*
+ * encode info about a snaprealm, as viewed by a client
+ */
+struct ceph_mds_snap_realm {
+ __le64 ino; /* ino */
+ __le64 created; /* snap: when created */
+ __le64 parent; /* ino: parent realm */
+ __le64 parent_since; /* snap: same parent since */
+ __le64 seq; /* snap: version */
+ __le32 num_snaps;
+ __le32 num_prior_parent_snaps;
+} __attribute__ ((packed));
+/* followed by my snap list, then prior parent snap list */
+
+#endif
diff --git a/fs/ceph/ceph_strings.c b/fs/ceph/ceph_strings.c
new file mode 100644
index 0000000..8e4be6a
--- /dev/null
+++ b/fs/ceph/ceph_strings.c
@@ -0,0 +1,176 @@
+/*
+ * Ceph string constants
+ */
+#include "types.h"
+
+const char *ceph_entity_type_name(int type)
+{
+ switch (type) {
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_ADMIN: return "admin";
+ case CEPH_ENTITY_TYPE_AUTH: return "auth";
+ default: return "unknown";
+ }
+}
+
+const char *ceph_osd_op_name(int op)
+{
+ switch (op) {
+ case CEPH_OSD_OP_READ: return "read";
+ case CEPH_OSD_OP_STAT: return "stat";
+
+ case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
+
+ case CEPH_OSD_OP_WRITE: return "write";
+ case CEPH_OSD_OP_DELETE: return "delete";
+ case CEPH_OSD_OP_TRUNCATE: return "truncate";
+ case CEPH_OSD_OP_ZERO: return "zero";
+ case CEPH_OSD_OP_WRITEFULL: return "writefull";
+
+ case CEPH_OSD_OP_APPEND: return "append";
+ case CEPH_OSD_OP_STARTSYNC: return "startsync";
+ case CEPH_OSD_OP_SETTRUNC: return "settrunc";
+ case CEPH_OSD_OP_TRIMTRUNC: return "trimtrunc";
+
+ case CEPH_OSD_OP_TMAPUP: return "tmapup";
+ case CEPH_OSD_OP_TMAPGET: return "tmapget";
+ case CEPH_OSD_OP_TMAPPUT: return "tmapput";
+
+ case CEPH_OSD_OP_GETXATTR: return "getxattr";
+ case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
+ case CEPH_OSD_OP_SETXATTR: return "setxattr";
+ case CEPH_OSD_OP_SETXATTRS: return "setxattrs";
+ case CEPH_OSD_OP_RESETXATTRS: return "resetxattrs";
+ case CEPH_OSD_OP_RMXATTR: return "rmxattr";
+
+ case CEPH_OSD_OP_PULL: return "pull";
+ case CEPH_OSD_OP_PUSH: return "push";
+ case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
+ case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
+ case CEPH_OSD_OP_SCRUB: return "scrub";
+
+ case CEPH_OSD_OP_WRLOCK: return "wrlock";
+ case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
+ case CEPH_OSD_OP_RDLOCK: return "rdlock";
+ case CEPH_OSD_OP_RDUNLOCK: return "rdunlock";
+ case CEPH_OSD_OP_UPLOCK: return "uplock";
+ case CEPH_OSD_OP_DNLOCK: return "dnlock";
+
+ case CEPH_OSD_OP_CALL: return "call";
+
+ case CEPH_OSD_OP_PGLS: return "pgls";
+ }
+ return "???";
+}
+
+const char *ceph_mds_state_name(int s)
+{
+ switch (s) {
+ /* down and out */
+ case CEPH_MDS_STATE_DNE: return "down:dne";
+ case CEPH_MDS_STATE_STOPPED: return "down:stopped";
+ /* up and out */
+ case CEPH_MDS_STATE_BOOT: return "up:boot";
+ case CEPH_MDS_STATE_STANDBY: return "up:standby";
+ case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
+ case CEPH_MDS_STATE_CREATING: return "up:creating";
+ case CEPH_MDS_STATE_STARTING: return "up:starting";
+ /* up and in */
+ case CEPH_MDS_STATE_REPLAY: return "up:replay";
+ case CEPH_MDS_STATE_RESOLVE: return "up:resolve";
+ case CEPH_MDS_STATE_RECONNECT: return "up:reconnect";
+ case CEPH_MDS_STATE_REJOIN: return "up:rejoin";
+ case CEPH_MDS_STATE_CLIENTREPLAY: return "up:clientreplay";
+ case CEPH_MDS_STATE_ACTIVE: return "up:active";
+ case CEPH_MDS_STATE_STOPPING: return "up:stopping";
+ }
+ return "???";
+}
+
+const char *ceph_session_op_name(int op)
+{
+ switch (op) {
+ case CEPH_SESSION_REQUEST_OPEN: return "request_open";
+ case CEPH_SESSION_OPEN: return "open";
+ case CEPH_SESSION_REQUEST_CLOSE: return "request_close";
+ case CEPH_SESSION_CLOSE: return "close";
+ case CEPH_SESSION_REQUEST_RENEWCAPS: return "request_renewcaps";
+ case CEPH_SESSION_RENEWCAPS: return "renewcaps";
+ case CEPH_SESSION_STALE: return "stale";
+ case CEPH_SESSION_RECALL_STATE: return "recall_state";
+ }
+ return "???";
+}
+
+const char *ceph_mds_op_name(int op)
+{
+ switch (op) {
+ case CEPH_MDS_OP_LOOKUP: return "lookup";
+ case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
+ case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
+ case CEPH_MDS_OP_GETATTR: return "getattr";
+ case CEPH_MDS_OP_SETXATTR: return "setxattr";
+ case CEPH_MDS_OP_SETATTR: return "setattr";
+ case CEPH_MDS_OP_RMXATTR: return "rmxattr";
+ case CEPH_MDS_OP_READDIR: return "readdir";
+ case CEPH_MDS_OP_MKNOD: return "mknod";
+ case CEPH_MDS_OP_LINK: return "link";
+ case CEPH_MDS_OP_UNLINK: return "unlink";
+ case CEPH_MDS_OP_RENAME: return "rename";
+ case CEPH_MDS_OP_MKDIR: return "mkdir";
+ case CEPH_MDS_OP_RMDIR: return "rmdir";
+ case CEPH_MDS_OP_SYMLINK: return "symlink";
+ case CEPH_MDS_OP_CREATE: return "create";
+ case CEPH_MDS_OP_OPEN: return "open";
+ case CEPH_MDS_OP_LOOKUPSNAP: return "lookupsnap";
+ case CEPH_MDS_OP_LSSNAP: return "lssnap";
+ case CEPH_MDS_OP_MKSNAP: return "mksnap";
+ case CEPH_MDS_OP_RMSNAP: return "rmsnap";
+ }
+ return "???";
+}
+
+const char *ceph_cap_op_name(int op)
+{
+ switch (op) {
+ case CEPH_CAP_OP_GRANT: return "grant";
+ case CEPH_CAP_OP_REVOKE: return "revoke";
+ case CEPH_CAP_OP_TRUNC: return "trunc";
+ case CEPH_CAP_OP_EXPORT: return "export";
+ case CEPH_CAP_OP_IMPORT: return "import";
+ case CEPH_CAP_OP_UPDATE: return "update";
+ case CEPH_CAP_OP_DROP: return "drop";
+ case CEPH_CAP_OP_FLUSH: return "flush";
+ case CEPH_CAP_OP_FLUSH_ACK: return "flush_ack";
+ case CEPH_CAP_OP_FLUSHSNAP: return "flushsnap";
+ case CEPH_CAP_OP_FLUSHSNAP_ACK: return "flushsnap_ack";
+ case CEPH_CAP_OP_RELEASE: return "release";
+ case CEPH_CAP_OP_RENEW: return "renew";
+ }
+ return "???";
+}
+
+const char *ceph_lease_op_name(int o)
+{
+ switch (o) {
+ case CEPH_MDS_LEASE_REVOKE: return "revoke";
+ case CEPH_MDS_LEASE_RELEASE: return "release";
+ case CEPH_MDS_LEASE_RENEW: return "renew";
+ case CEPH_MDS_LEASE_REVOKE_ACK: return "revoke_ack";
+ }
+ return "???";
+}
+
+const char *ceph_snap_op_name(int o)
+{
+ switch (o) {
+ case CEPH_SNAP_OP_UPDATE: return "update";
+ case CEPH_SNAP_OP_CREATE: return "create";
+ case CEPH_SNAP_OP_DESTROY: return "destroy";
+ case CEPH_SNAP_OP_SPLIT: return "split";
+ }
+ return "???";
+}
diff --git a/fs/ceph/msgr.h b/fs/ceph/msgr.h
new file mode 100644
index 0000000..8aaab41
--- /dev/null
+++ b/fs/ceph/msgr.h
@@ -0,0 +1,158 @@
+#ifndef __MSGR_H
+#define __MSGR_H
+
+/*
+ * Data types for message passing layer used by Ceph.
+ */
+
+#define CEPH_MON_PORT 6789 /* default monitor port */
+
+/*
+ * client-side processes will try to bind to ports in this
+ * range, simply for the benefit of tools like nmap or wireshark
+ * that would like to identify the protocol.
+ */
+#define CEPH_PORT_FIRST 6789
+#define CEPH_PORT_START 6800 /* non-monitors start here */
+#define CEPH_PORT_LAST 6900
+
+/*
+ * tcp connection banner. include a protocol version. and adjust
+ * whenever the wire protocol changes. try to keep this string length
+ * constant.
+ */
+#define CEPH_BANNER "ceph v027"
+#define CEPH_BANNER_MAX_LEN 30
+
+
+/*
+ * Rollover-safe type and comparator for 32-bit sequence numbers.
+ * Comparator returns -1, 0, or 1.
+ */
+typedef __u32 ceph_seq_t;
+
+static inline __s32 ceph_seq_cmp(__u32 a, __u32 b)
+{
+ return (__s32)a - (__s32)b;
+}
+
+
+/*
+ * entity_name -- logical name for a process participating in the
+ * network, e.g. 'mds0' or 'osd3'.
+ */
+struct ceph_entity_name {
+ __u8 type; /* CEPH_ENTITY_TYPE_* */
+ __le64 num;
+} __attribute__ ((packed));
+
+#define CEPH_ENTITY_TYPE_MON 0x01
+#define CEPH_ENTITY_TYPE_MDS 0x02
+#define CEPH_ENTITY_TYPE_OSD 0x04
+#define CEPH_ENTITY_TYPE_CLIENT 0x08
+#define CEPH_ENTITY_TYPE_ADMIN 0x10
+#define CEPH_ENTITY_TYPE_AUTH 0x20
+
+#define CEPH_ENTITY_TYPE_ANY 0xFF
+
+extern const char *ceph_entity_type_name(int type);
+
+/*
+ * entity_addr -- network address
+ */
+struct ceph_entity_addr {
+ __le32 type;
+ __le32 nonce; /* unique id for process (e.g. pid) */
+ struct sockaddr_storage in_addr;
+} __attribute__ ((packed));
+
+struct ceph_entity_inst {
+ struct ceph_entity_name name;
+ struct ceph_entity_addr addr;
+} __attribute__ ((packed));
+
+
+/* used by message exchange protocol */
+#define CEPH_MSGR_TAG_READY 1 /* server->client: ready for messages */
+#define CEPH_MSGR_TAG_RESETSESSION 2 /* server->client: reset, try again */
+#define CEPH_MSGR_TAG_WAIT 3 /* server->client: wait for racing
+ incoming connection */
+#define CEPH_MSGR_TAG_RETRY_SESSION 4 /* server->client + cseq: try again
+ with higher cseq */
+#define CEPH_MSGR_TAG_RETRY_GLOBAL 5 /* server->client + gseq: try again
+ with higher gseq */
+#define CEPH_MSGR_TAG_CLOSE 6 /* closing pipe */
+#define CEPH_MSGR_TAG_MSG 7 /* message */
+#define CEPH_MSGR_TAG_ACK 8 /* message ack */
+#define CEPH_MSGR_TAG_KEEPALIVE 9 /* just a keepalive byte! */
+#define CEPH_MSGR_TAG_BADPROTOVER 10 /* bad protocol version */
+#define CEPH_MSGR_TAG_BADAUTHORIZER 11 /* bad authorizer */
+#define CEPH_MSGR_TAG_FEATURES 12 /* insufficient features */
+
+
+/*
+ * connection negotiation
+ */
+struct ceph_msg_connect {
+ __le64 features; /* supported feature bits */
+ __le32 host_type; /* CEPH_ENTITY_TYPE_* */
+ __le32 global_seq; /* count connections initiated by this host */
+ __le32 connect_seq; /* count connections initiated in this session */
+ __le32 protocol_version;
+ __le32 authorizer_protocol;
+ __le32 authorizer_len;
+ __u8 flags; /* CEPH_MSG_CONNECT_* */
+} __attribute__ ((packed));
+
+struct ceph_msg_connect_reply {
+ __u8 tag;
+ __le64 features; /* feature bits for this session */
+ __le32 global_seq;
+ __le32 connect_seq;
+ __le32 protocol_version;
+ __le32 authorizer_len;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_CONNECT_LOSSY 1 /* messages i send may be safely dropped */
+
+
+/*
+ * message header
+ */
+struct ceph_msg_header {
+ __le64 seq; /* message seq# for this session */
+ __le64 tid; /* transaction id */
+ __le16 type; /* message type */
+ __le16 priority; /* priority. higher value == higher priority */
+ __le16 version; /* version of message encoding */
+
+ __le32 front_len; /* bytes in main payload */
+ __le32 middle_len;/* bytes in middle payload */
+ __le32 data_len; /* bytes of data payload */
+ __le16 data_off; /* sender: include full offset;
+ receiver: mask against ~PAGE_MASK */
+
+ struct ceph_entity_inst src, orig_src;
+ __le32 reserved;
+ __le32 crc; /* header crc32c */
+} __attribute__ ((packed));
+
+#define CEPH_MSG_PRIO_LOW 64
+#define CEPH_MSG_PRIO_DEFAULT 127
+#define CEPH_MSG_PRIO_HIGH 196
+#define CEPH_MSG_PRIO_HIGHEST 255
+
+/*
+ * follows data payload
+ */
+struct ceph_msg_footer {
+ __le32 front_crc, middle_crc, data_crc;
+ __u8 flags;
+} __attribute__ ((packed));
+
+#define CEPH_MSG_FOOTER_COMPLETE (1<<0) /* msg wasn't aborted */
+#define CEPH_MSG_FOOTER_NOCRC (1<<1) /* no data crc */
+
+
+#endif
diff --git a/fs/ceph/rados.h b/fs/ceph/rados.h
new file mode 100644
index 0000000..26ac8b8
--- /dev/null
+++ b/fs/ceph/rados.h
@@ -0,0 +1,374 @@
+#ifndef __RADOS_H
+#define __RADOS_H
+
+/*
+ * Data types for the Ceph distributed object storage layer RADOS
+ * (Reliable Autonomic Distributed Object Store).
+ */
+
+#include "msgr.h"
+
+/*
+ * osdmap encoding versions
+ */
+#define CEPH_OSDMAP_INC_VERSION 4
+#define CEPH_OSDMAP_VERSION 4
+
+/*
+ * fs id
+ */
+struct ceph_fsid {
+ unsigned char fsid[16];
+};
+
+static inline int ceph_fsid_compare(const struct ceph_fsid *a,
+ const struct ceph_fsid *b)
+{
+ return memcmp(a, b, sizeof(*a));
+}
+
+/*
+ * ino, object, etc.
+ */
+typedef __le64 ceph_snapid_t;
+#define CEPH_SNAPDIR ((__u64)(-1)) /* reserved for hidden .snap dir */
+#define CEPH_NOSNAP ((__u64)(-2)) /* "head", "live" revision */
+#define CEPH_MAXSNAP ((__u64)(-3)) /* largest valid snapid */
+
+struct ceph_timespec {
+ __le32 tv_sec;
+ __le32 tv_nsec;
+} __attribute__ ((packed));
+
+
+/*
+ * object layout - how objects are mapped into PGs
+ */
+#define CEPH_OBJECT_LAYOUT_HASH 1
+#define CEPH_OBJECT_LAYOUT_LINEAR 2
+#define CEPH_OBJECT_LAYOUT_HASHINO 3
+
+/*
+ * pg layout -- how PGs are mapped onto (sets of) OSDs
+ */
+#define CEPH_PG_LAYOUT_CRUSH 0
+#define CEPH_PG_LAYOUT_HASH 1
+#define CEPH_PG_LAYOUT_LINEAR 2
+#define CEPH_PG_LAYOUT_HYBRID 3
+
+
+/*
+ * placement group.
+ * we encode this into one __le64.
+ */
+struct ceph_pg {
+ __le16 preferred; /* preferred primary osd */
+ __le16 ps; /* placement seed */
+ __le32 pool; /* object pool */
+} __attribute__ ((packed));
+
+/*
+ * pg_pool is a set of pgs storing a pool of objects
+ *
+ * pg_num -- base number of pseudorandomly placed pgs
+ *
+ * pgp_num -- effective number when calculating pg placement. this
+ * is used for pg_num increases. new pgs result in data being "split"
+ * into new pgs. for this to proceed smoothly, new pgs are intiially
+ * colocated with their parents; that is, pgp_num doesn't increase
+ * until the new pgs have successfully split. only _then_ are the new
+ * pgs placed independently.
+ *
+ * lpg_num -- localized pg count (per device). replicas are randomly
+ * selected.
+ *
+ * lpgp_num -- as above.
+ */
+#define CEPH_PG_TYPE_REP 1
+#define CEPH_PG_TYPE_RAID4 2
+#define CEPH_PG_POOL_VERSION 2
+struct ceph_pg_pool {
+ __u8 type; /* CEPH_PG_TYPE_* */
+ __u8 size; /* number of osds in each pg */
+ __u8 crush_ruleset; /* crush placement rule */
+ __u8 object_hash; /* hash mapping object name to ps */
+ __le32 pg_num, pgp_num; /* number of pg's */
+ __le32 lpg_num, lpgp_num; /* number of localized pg's */
+ __le32 last_change; /* most recent epoch changed */
+ __le64 snap_seq; /* seq for per-pool snapshot */
+ __le32 snap_epoch; /* epoch of last snap */
+ __le32 num_snaps;
+ __le32 num_removed_snap_intervals;
+ __le64 uid;
+} __attribute__ ((packed));
+
+/*
+ * stable_mod func is used to control number of placement groups.
+ * similar to straight-up modulo, but produces a stable mapping as b
+ * increases over time. b is the number of bins, and bmask is the
+ * containing power of 2 minus 1.
+ *
+ * b <= bmask and bmask=(2**n)-1
+ * e.g., b=12 -> bmask=15, b=123 -> bmask=127
+ */
+static inline int ceph_stable_mod(int x, int b, int bmask)
+{
+ if ((x & bmask) < b)
+ return x & bmask;
+ else
+ return x & (bmask >> 1);
+}
+
+/*
+ * object layout - how a given object should be stored.
+ */
+struct ceph_object_layout {
+ struct ceph_pg ol_pgid; /* raw pg, with _full_ ps precision. */
+ __le32 ol_stripe_unit; /* for per-object parity, if any */
+} __attribute__ ((packed));
+
+/*
+ * compound epoch+version, used by storage layer to serialize mutations
+ */
+struct ceph_eversion {
+ __le32 epoch;
+ __le64 version;
+} __attribute__ ((packed));
+
+/*
+ * osd map bits
+ */
+
+/* status bits */
+#define CEPH_OSD_EXISTS 1
+#define CEPH_OSD_UP 2
+
+/* osd weights. fixed point value: 0x10000 == 1.0 ("in"), 0 == "out" */
+#define CEPH_OSD_IN 0x10000
+#define CEPH_OSD_OUT 0
+
+
+/*
+ * osd map flag bits
+ */
+#define CEPH_OSDMAP_NEARFULL (1<<0) /* sync writes (near ENOSPC) */
+#define CEPH_OSDMAP_FULL (1<<1) /* no data writes (ENOSPC) */
+#define CEPH_OSDMAP_PAUSERD (1<<2) /* pause all reads */
+#define CEPH_OSDMAP_PAUSEWR (1<<3) /* pause all writes */
+#define CEPH_OSDMAP_PAUSEREC (1<<4) /* pause recovery */
+
+/*
+ * osd ops
+ */
+#define CEPH_OSD_OP_MODE 0xf000
+#define CEPH_OSD_OP_MODE_RD 0x1000
+#define CEPH_OSD_OP_MODE_WR 0x2000
+#define CEPH_OSD_OP_MODE_RMW 0x3000
+#define CEPH_OSD_OP_MODE_SUB 0x4000
+
+#define CEPH_OSD_OP_TYPE 0x0f00
+#define CEPH_OSD_OP_TYPE_LOCK 0x0100
+#define CEPH_OSD_OP_TYPE_DATA 0x0200
+#define CEPH_OSD_OP_TYPE_ATTR 0x0300
+#define CEPH_OSD_OP_TYPE_EXEC 0x0400
+#define CEPH_OSD_OP_TYPE_PG 0x0500
+
+enum {
+ /** data **/
+ /* read */
+ CEPH_OSD_OP_READ = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_STAT = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 2,
+
+ /* fancy read */
+ CEPH_OSD_OP_MASKTRUNC = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 4,
+
+ /* write */
+ CEPH_OSD_OP_WRITE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 1,
+ CEPH_OSD_OP_WRITEFULL = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 2,
+ CEPH_OSD_OP_TRUNCATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 3,
+ CEPH_OSD_OP_ZERO = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 4,
+ CEPH_OSD_OP_DELETE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 5,
+
+ /* fancy write */
+ CEPH_OSD_OP_APPEND = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 6,
+ CEPH_OSD_OP_STARTSYNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 7,
+ CEPH_OSD_OP_SETTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 8,
+ CEPH_OSD_OP_TRIMTRUNC = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 9,
+
+ CEPH_OSD_OP_TMAPUP = CEPH_OSD_OP_MODE_RMW | CEPH_OSD_OP_TYPE_DATA | 10,
+ CEPH_OSD_OP_TMAPPUT = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 11,
+ CEPH_OSD_OP_TMAPGET = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_DATA | 12,
+
+ CEPH_OSD_OP_CREATE = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_DATA | 13,
+
+ /** attrs **/
+ /* read */
+ CEPH_OSD_OP_GETXATTR = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_GETXATTRS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_ATTR | 2,
+
+ /* write */
+ CEPH_OSD_OP_SETXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 1,
+ CEPH_OSD_OP_SETXATTRS = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 2,
+ CEPH_OSD_OP_RESETXATTRS = CEPH_OSD_OP_MODE_WR|CEPH_OSD_OP_TYPE_ATTR | 3,
+ CEPH_OSD_OP_RMXATTR = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_ATTR | 4,
+
+ /** subop **/
+ CEPH_OSD_OP_PULL = CEPH_OSD_OP_MODE_SUB | 1,
+ CEPH_OSD_OP_PUSH = CEPH_OSD_OP_MODE_SUB | 2,
+ CEPH_OSD_OP_BALANCEREADS = CEPH_OSD_OP_MODE_SUB | 3,
+ CEPH_OSD_OP_UNBALANCEREADS = CEPH_OSD_OP_MODE_SUB | 4,
+ CEPH_OSD_OP_SCRUB = CEPH_OSD_OP_MODE_SUB | 5,
+
+ /** lock **/
+ CEPH_OSD_OP_WRLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 1,
+ CEPH_OSD_OP_WRUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 2,
+ CEPH_OSD_OP_RDLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 3,
+ CEPH_OSD_OP_RDUNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 4,
+ CEPH_OSD_OP_UPLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 5,
+ CEPH_OSD_OP_DNLOCK = CEPH_OSD_OP_MODE_WR | CEPH_OSD_OP_TYPE_LOCK | 6,
+
+ /** exec **/
+ CEPH_OSD_OP_CALL = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_EXEC | 1,
+
+ /** pg **/
+ CEPH_OSD_OP_PGLS = CEPH_OSD_OP_MODE_RD | CEPH_OSD_OP_TYPE_PG | 1,
+};
+
+static inline int ceph_osd_op_type_lock(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_LOCK;
+}
+static inline int ceph_osd_op_type_data(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_DATA;
+}
+static inline int ceph_osd_op_type_attr(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_ATTR;
+}
+static inline int ceph_osd_op_type_exec(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_EXEC;
+}
+static inline int ceph_osd_op_type_pg(int op)
+{
+ return (op & CEPH_OSD_OP_TYPE) == CEPH_OSD_OP_TYPE_PG;
+}
+
+static inline int ceph_osd_op_mode_subop(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_SUB;
+}
+static inline int ceph_osd_op_mode_read(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_RD;
+}
+static inline int ceph_osd_op_mode_modify(int op)
+{
+ return (op & CEPH_OSD_OP_MODE) == CEPH_OSD_OP_MODE_WR;
+}
+
+#define CEPH_OSD_TMAP_HDR 'h'
+#define CEPH_OSD_TMAP_SET 's'
+#define CEPH_OSD_TMAP_RM 'r'
+
+extern const char *ceph_osd_op_name(int op);
+
+
+/*
+ * osd op flags
+ *
+ * An op may be READ, WRITE, or READ|WRITE.
+ */
+enum {
+ CEPH_OSD_FLAG_ACK = 1, /* want (or is) "ack" ack */
+ CEPH_OSD_FLAG_ONNVRAM = 2, /* want (or is) "onnvram" ack */
+ CEPH_OSD_FLAG_ONDISK = 4, /* want (or is) "ondisk" ack */
+ CEPH_OSD_FLAG_RETRY = 8, /* resend attempt */
+ CEPH_OSD_FLAG_READ = 16, /* op may read */
+ CEPH_OSD_FLAG_WRITE = 32, /* op may write */
+ CEPH_OSD_FLAG_ORDERSNAP = 64, /* EOLDSNAP if snapc is out of order */
+ CEPH_OSD_FLAG_PEERSTAT = 128, /* msg includes osd_peer_stat */
+ CEPH_OSD_FLAG_BALANCE_READS = 256,
+ CEPH_OSD_FLAG_PARALLELEXEC = 512, /* execute op in parallel */
+ CEPH_OSD_FLAG_PGOP = 1024, /* pg op, no object */
+ CEPH_OSD_FLAG_EXEC = 2048, /* op may exec */
+};
+
+enum {
+ CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
+};
+
+#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
+#define EBLACKLISTED ESHUTDOWN /* blacklisted */
+
+/*
+ * an individual object operation. each may be accompanied by some data
+ * payload
+ */
+struct ceph_osd_op {
+ __le16 op; /* CEPH_OSD_OP_* */
+ __le32 flags; /* CEPH_OSD_FLAG_* */
+ union {
+ struct {
+ __le64 offset, length;
+ __le64 truncate_size;
+ __le32 truncate_seq;
+ } __attribute__ ((packed)) extent;
+ struct {
+ __le32 name_len;
+ __le32 value_len;
+ } __attribute__ ((packed)) xattr;
+ struct {
+ __u8 class_len;
+ __u8 method_len;
+ __u8 argc;
+ __le32 indata_len;
+ } __attribute__ ((packed)) cls;
+ struct {
+ __le64 cookie, count;
+ } __attribute__ ((packed)) pgls;
+ };
+ __le32 payload_len;
+} __attribute__ ((packed));
+
+/*
+ * osd request message header. each request may include multiple
+ * ceph_osd_op object operations.
+ */
+struct ceph_osd_request_head {
+ __le32 client_inc; /* client incarnation */
+ struct ceph_object_layout layout; /* pgid */
+ __le32 osdmap_epoch; /* client's osdmap epoch */
+
+ __le32 flags;
+
+ struct ceph_timespec mtime; /* for mutations only */
+ struct ceph_eversion reassert_version; /* if we are replaying op */
+
+ __le32 object_len; /* length of object name */
+
+ __le64 snapid; /* snapid to read */
+ __le64 snap_seq; /* writer's snap context */
+ __le32 num_snaps;
+
+ __le16 num_ops;
+ struct ceph_osd_op ops[]; /* followed by ops[], obj, ticket, snaps */
+} __attribute__ ((packed));
+
+struct ceph_osd_reply_head {
+ __le32 client_inc; /* client incarnation */
+ __le32 flags;
+ struct ceph_object_layout layout;
+ __le32 osdmap_epoch;
+ struct ceph_eversion reassert_version; /* for replaying uncommitted */
+
+ __le32 result; /* result code */
+
+ __le32 object_len; /* length of object name */
+ __le32 num_ops;
+ struct ceph_osd_op ops[0]; /* ops[], object */
+} __attribute__ ((packed));
+
+
+#endif

Sage Weil

unread,

Mar 2, 2010, 8:00:02 PM3/2/10

We first define constants, types, and prototypes for the kernel client
proper.

A few subsystems are defined separately later: the MDS, OSD, and
monitor clients, and the messaging layer.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/ceph_debug.h | 37 ++
fs/ceph/ceph_frag.c | 21 ++
fs/ceph/ceph_frag.h | 109 ++++++
fs/ceph/super.h | 900 ++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/types.h | 29 ++
5 files changed, 1096 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/ceph_debug.h
create mode 100644 fs/ceph/ceph_frag.c
create mode 100644 fs/ceph/ceph_frag.h
create mode 100644 fs/ceph/super.h
create mode 100644 fs/ceph/types.h

diff --git a/fs/ceph/ceph_debug.h b/fs/ceph/ceph_debug.h
new file mode 100644
index 0000000..1818c23
--- /dev/null
+++ b/fs/ceph/ceph_debug.h
@@ -0,0 +1,37 @@
+#ifndef _FS_CEPH_DEBUG_H
+#define _FS_CEPH_DEBUG_H
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#ifdef CONFIG_CEPH_FS_PRETTYDEBUG
+
+/*
+ * wrap pr_debug to include a filename:lineno prefix on each line.
+ * this incurs some overhead (kernel size and execution time) due to
+ * the extra function call at each call site.
+ */
+
+# if defined(DEBUG) || defined(CONFIG_DYNAMIC_DEBUG)
+extern const char *ceph_file_part(const char *s, int len);
+# define dout(fmt, ...) \
+ pr_debug(" %12.12s:%-4d : " fmt, \
+ ceph_file_part(__FILE__, sizeof(__FILE__)), \
+ __LINE__, ##__VA_ARGS__)
+# else
+/* faux printk call just to see any compiler warnings. */
+# define dout(fmt, ...) do { \
+ if (0) \
+ printk(KERN_DEBUG fmt, ##__VA_ARGS__); \
+ } while (0)
+# endif
+
+#else
+
+/*
+ * or, just wrap pr_debug
+ */
+# define dout(fmt, ...) pr_debug(" " fmt, ##__VA_ARGS__)
+
+#endif
+
+#endif
diff --git a/fs/ceph/ceph_frag.c b/fs/ceph/ceph_frag.c
new file mode 100644
index 0000000..ab6cf35
--- /dev/null
+++ b/fs/ceph/ceph_frag.c
@@ -0,0 +1,21 @@
+/*
+ * Ceph 'frag' type

+ */
+#include "types.h"
+

+int ceph_frag_compare(__u32 a, __u32 b)
+{
+ unsigned va = ceph_frag_value(a);
+ unsigned vb = ceph_frag_value(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ va = ceph_frag_bits(a);
+ vb = ceph_frag_bits(b);
+ if (va < vb)
+ return -1;
+ if (va > vb)
+ return 1;
+ return 0;
+}
diff --git a/fs/ceph/ceph_frag.h b/fs/ceph/ceph_frag.h
new file mode 100644
index 0000000..793f50c
--- /dev/null
+++ b/fs/ceph/ceph_frag.h
@@ -0,0 +1,109 @@
+#ifndef _FS_CEPH_FRAG_H
+#define _FS_CEPH_FRAG_H
+
+/*
+ * "Frags" are a way to describe a subset of a 32-bit number space,
+ * using a mask and a value to match against that mask. Any given frag
+ * (subset of the number space) can be partitioned into 2^n sub-frags.
+ *
+ * Frags are encoded into a 32-bit word:
+ * 8 upper bits = "bits"
+ * 24 lower bits = "value"
+ * (We could go to 5+27 bits, but who cares.)
+ *
+ * We use the _most_ significant bits of the 24 bit value. This makes
+ * values logically sort.
+ *
+ * Unfortunately, because the "bits" field is still in the high bits, we
+ * can't sort encoded frags numerically. However, it does allow you
+ * to feed encoded frags as values into frag_contains_value.
+ */
+static inline __u32 ceph_frag_make(__u32 b, __u32 v)
+{
+ return (b << 24) |
+ (v & (0xffffffu << (24-b)) & 0xffffffu);
+}
+static inline __u32 ceph_frag_bits(__u32 f)
+{
+ return f >> 24;
+}
+static inline __u32 ceph_frag_value(__u32 f)
+{
+ return f & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask(__u32 f)
+{
+ return (0xffffffu << (24-ceph_frag_bits(f))) & 0xffffffu;
+}
+static inline __u32 ceph_frag_mask_shift(__u32 f)
+{
+ return 24 - ceph_frag_bits(f);
+}
+
+static inline int ceph_frag_contains_value(__u32 f, __u32 v)
+{
+ return (v & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+static inline int ceph_frag_contains_frag(__u32 f, __u32 sub)
+{
+ /* is sub as specific as us, and contained by us? */
+ return ceph_frag_bits(sub) >= ceph_frag_bits(f) &&
+ (ceph_frag_value(sub) & ceph_frag_mask(f)) == ceph_frag_value(f);
+}
+
+static inline __u32 ceph_frag_parent(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f) - 1,
+ ceph_frag_value(f) & (ceph_frag_mask(f) << 1));
+}
+static inline int ceph_frag_is_left_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 0;
+}
+static inline int ceph_frag_is_right_child(__u32 f)
+{
+ return ceph_frag_bits(f) > 0 &&
+ (ceph_frag_value(f) & (0x1000000 >> ceph_frag_bits(f))) == 1;
+}
+static inline __u32 ceph_frag_sibling(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) ^ (0x1000000 >> ceph_frag_bits(f)));
+}
+static inline __u32 ceph_frag_left_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1, ceph_frag_value(f));
+}
+static inline __u32 ceph_frag_right_child(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f)+1,
+ ceph_frag_value(f) | (0x1000000 >> (1+ceph_frag_bits(f))));
+}
+static inline __u32 ceph_frag_make_child(__u32 f, int by, int i)
+{
+ int newbits = ceph_frag_bits(f) + by;
+ return ceph_frag_make(newbits,
+ ceph_frag_value(f) | (i << (24 - newbits)));
+}
+static inline int ceph_frag_is_leftmost(__u32 f)
+{
+ return ceph_frag_value(f) == 0;
+}
+static inline int ceph_frag_is_rightmost(__u32 f)
+{
+ return ceph_frag_value(f) == ceph_frag_mask(f);
+}
+static inline __u32 ceph_frag_next(__u32 f)
+{
+ return ceph_frag_make(ceph_frag_bits(f),
+ ceph_frag_value(f) + (0x1000000 >> ceph_frag_bits(f)));
+}
+
+/*
+ * comparator to sort frags logically, as when traversing the
+ * number space in ascending order...
+ */
+int ceph_frag_compare(__u32 a, __u32 b);
+
+#endif
diff --git a/fs/ceph/super.h b/fs/ceph/super.h
new file mode 100644
index 0000000..02c0ddc
--- /dev/null
+++ b/fs/ceph/super.h
@@ -0,0 +1,900 @@
+#ifndef _FS_CEPH_SUPER_H
+#define _FS_CEPH_SUPER_H
+
+#include "ceph_debug.h"
+
+#include <asm/unaligned.h>
+#include <linux/backing-dev.h>
+#include <linux/completion.h>
+#include <linux/exportfs.h>
+#include <linux/fs.h>
+#include <linux/mempool.h>
+#include <linux/pagemap.h>
+#include <linux/wait.h>
+
+#include "types.h"
+#include "messenger.h"
+#include "msgpool.h"
+#include "mon_client.h"
+#include "mds_client.h"
+#include "osd_client.h"
+#include "ceph_fs.h"
+
+/* f_type in struct statfs */
+#define CEPH_SUPER_MAGIC 0x00c36400
+
+/* large granularity for statfs utilization stats to facilitate
+ * large volume sizes on 32-bit machines. */
+#define CEPH_BLOCK_SHIFT 20 /* 1 MB */
+#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
+
+/*
+ * mount options
+ */
+#define CEPH_OPT_FSID (1<<0)
+#define CEPH_OPT_NOSHARE (1<<1) /* don't share client with other sbs */
+#define CEPH_OPT_MYIP (1<<2) /* specified my ip */
+#define CEPH_OPT_DIRSTAT (1<<4) /* funky `cat dirname` for stats */
+#define CEPH_OPT_RBYTES (1<<5) /* dir st_bytes = rbytes */
+#define CEPH_OPT_NOCRC (1<<6) /* no data crc on writes */
+#define CEPH_OPT_NOASYNCREADDIR (1<<7) /* no dcache readdir */
+
+#define CEPH_OPT_DEFAULT (CEPH_OPT_RBYTES)
+
+#define ceph_set_opt(client, opt) \
+ (client)->mount_args->flags |= CEPH_OPT_##opt;
+#define ceph_test_opt(client, opt) \
+ (!!((client)->mount_args->flags & CEPH_OPT_##opt))
+
+
+struct ceph_mount_args {
+ int sb_flags;
+ int num_mon;
+ struct ceph_entity_addr *mon_addr;
+ int flags;
+ int mount_timeout;
+ int osd_idle_ttl;
+ int caps_wanted_delay_min, caps_wanted_delay_max;
+ struct ceph_fsid fsid;
+ struct ceph_entity_addr my_addr;
+ int wsize;
+ int rsize; /* max readahead */
+ int max_readdir; /* max readdir size */
+ int congestion_kb; /* max readdir size */
+ int osd_timeout;
+ int osd_keepalive_timeout;
+ char *snapdir_name; /* default ".snap" */
+ char *name;
+ char *secret;
+ int cap_release_safety;
+};
+
+/*
+ * defaults
+ */
+#define CEPH_MOUNT_TIMEOUT_DEFAULT 60
+#define CEPH_OSD_TIMEOUT_DEFAULT 60 /* seconds */
+#define CEPH_OSD_KEEPALIVE_DEFAULT 5
+#define CEPH_OSD_IDLE_TTL_DEFAULT 60
+#define CEPH_MOUNT_RSIZE_DEFAULT (512*1024) /* readahead */
+
+#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
+#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
+
+#define CEPH_SNAPDIRNAME_DEFAULT ".snap"
+#define CEPH_AUTH_NAME_DEFAULT "guest"
+
+/*
+ * Delay telling the MDS we no longer want caps, in case we reopen
+ * the file. Delay a minimum amount of time, even if we send a cap
+ * message for some other reason. Otherwise, take the oppotunity to
+ * update the mds to avoid sending another message later.
+ */
+#define CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT 5 /* cap release delay */
+#define CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT 60 /* cap release delay */
+
+
+/* mount state */
+enum {
+ CEPH_MOUNT_MOUNTING,
+ CEPH_MOUNT_MOUNTED,
+ CEPH_MOUNT_UNMOUNTING,
+ CEPH_MOUNT_UNMOUNTED,
+ CEPH_MOUNT_SHUTDOWN,
+};
+
+/*
+ * subtract jiffies
+ */
+static inline unsigned long time_sub(unsigned long a, unsigned long b)
+{
+ BUG_ON(time_after(b, a));
+ return (long)a - (long)b;
+}
+
+/*
+ * per-filesystem client state
+ *
+ * possibly shared by multiple mount points, if they are
+ * mounting the same ceph filesystem/cluster.
+ */
+struct ceph_client {
+ struct ceph_fsid fsid;
+ bool have_fsid;
+
+ struct mutex mount_mutex; /* serialize mount attempts */
+ struct ceph_mount_args *mount_args;
+
+ struct super_block *sb;
+
+ unsigned long mount_state;
+ wait_queue_head_t auth_wq;
+
+ int auth_err;
+
+ int min_caps; /* min caps i added */
+
+ struct ceph_messenger *msgr; /* messenger instance */
+ struct ceph_mon_client monc;
+ struct ceph_mds_client mdsc;
+ struct ceph_osd_client osdc;
+
+ /* writeback */
+ mempool_t *wb_pagevec_pool;
+ struct workqueue_struct *wb_wq;
+ struct workqueue_struct *pg_inv_wq;
+ struct workqueue_struct *trunc_wq;
+ atomic_long_t writeback_count;
+
+ struct backing_dev_info backing_dev_info;
+
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_monmap;
+ struct dentry *debugfs_mdsmap, *debugfs_osdmap;
+ struct dentry *debugfs_dir, *debugfs_dentry_lru, *debugfs_caps;
+ struct dentry *debugfs_congestion_kb;
+ struct dentry *debugfs_bdi;
+#endif
+};
+
+static inline struct ceph_client *ceph_client(struct super_block *sb)
+{
+ return sb->s_fs_info;
+}
+
+
+/*
+ * File i/o capability. This tracks shared state with the metadata
+ * server that allows us to cache or writeback attributes or to read
+ * and write data. For any given inode, we should have one or more
+ * capabilities, one issued by each metadata server, and our
+ * cumulative access is the OR of all issued capabilities.
+ *
+ * Each cap is referenced by the inode's i_caps rbtree and by per-mds
+ * session capability lists.
+ */
+struct ceph_cap {
+ struct ceph_inode_info *ci;
+ struct rb_node ci_node; /* per-ci cap tree */
+ struct ceph_mds_session *session;
+ struct list_head session_caps; /* per-session caplist */
+ int mds;
+ u64 cap_id; /* unique cap id (mds provided) */
+ int issued; /* latest, from the mds */
+ int implemented; /* implemented superset of issued (for revocation) */
+ int mds_wanted;
+ u32 seq, issue_seq, mseq;
+ u32 cap_gen; /* active/stale cycle */
+ unsigned long last_used;
+ struct list_head caps_item;
+};
+
+#define CHECK_CAPS_NODELAY 1 /* do not delay any further */
+#define CHECK_CAPS_AUTHONLY 2 /* only check auth cap */
+#define CHECK_CAPS_FLUSH 4 /* flush any dirty caps */
+
+/*
+ * Snapped cap state that is pending flush to mds. When a snapshot occurs,
+ * we first complete any in-process sync writes and writeback any dirty
+ * data before flushing the snapped state (tracked here) back to the MDS.
+ */
+struct ceph_cap_snap {
+ atomic_t nref;
+ struct ceph_inode_info *ci;
+ struct list_head ci_item, flushing_item;
+
+ u64 follows, flush_tid;
+ int issued, dirty;
+ struct ceph_snap_context *context;
+
+ mode_t mode;
+ uid_t uid;
+ gid_t gid;
+
+ void *xattr_blob;
+ int xattr_len;
+ u64 xattr_version;
+
+ u64 size;
+ struct timespec mtime, atime, ctime;
+ u64 time_warp_seq;
+ int writing; /* a sync write is still in progress */
+ int dirty_pages; /* dirty pages awaiting writeback */
+};
+
+static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
+{
+ if (atomic_dec_and_test(&capsnap->nref))
+ kfree(capsnap);
+}
+
+/*
+ * The frag tree describes how a directory is fragmented, potentially across
+ * multiple metadata servers. It is also used to indicate points where
+ * metadata authority is delegated, and whether/where metadata is replicated.
+ *
+ * A _leaf_ frag will be present in the i_fragtree IFF there is
+ * delegation info. That is, if mds >= 0 || ndist > 0.
+ */
+#define CEPH_MAX_DIRFRAG_REP 4
+
+struct ceph_inode_frag {
+ struct rb_node node;
+
+ /* fragtree state */
+ u32 frag;
+ int split_by; /* i.e. 2^(split_by) children */
+
+ /* delegation and replication info */
+ int mds; /* -1 if same authority as parent */
+ int ndist; /* >0 if replicated */
+ int dist[CEPH_MAX_DIRFRAG_REP];
+};
+
+/*
+ * We cache inode xattrs as an encoded blob until they are first used,
+ * at which point we parse them into an rbtree.
+ */
+struct ceph_inode_xattr {
+ struct rb_node node;
+
+ const char *name;
+ int name_len;
+ const char *val;
+ int val_len;
+ int dirty;
+
+ int should_free_name;
+ int should_free_val;
+};
+
+struct ceph_inode_xattrs_info {
+ /*
+ * (still encoded) xattr blob. we avoid the overhead of parsing
+ * this until someone actually calls getxattr, etc.
+ *
+ * blob->vec.iov_len == 4 implies there are no xattrs; blob ==
+ * NULL means we don't know.
+ */
+ struct ceph_buffer *blob, *prealloc_blob;
+
+ struct rb_root index;
+ bool dirty;
+ int count;
+ int names_size;
+ int vals_size;
+ u64 version, index_version;
+};
+
+/*
+ * Ceph inode.
+ */
+#define CEPH_I_COMPLETE 1 /* we have complete directory cached */
+#define CEPH_I_NODELAY 4 /* do not delay cap release */
+#define CEPH_I_FLUSH 8 /* do not delay flush of dirty metadata */
+#define CEPH_I_NOFLUSH 16 /* do not flush dirty caps */
+
+struct ceph_inode_info {
+ struct ceph_vino i_vino; /* ceph ino + snap */
+
+ u64 i_version;
+ u32 i_time_warp_seq;
+
+ unsigned i_ceph_flags;
+ unsigned long i_release_count;
+
+ struct ceph_file_layout i_layout;
+ char *i_symlink;
+
+ /* for dirs */
+ struct timespec i_rctime;
+ u64 i_rbytes, i_rfiles, i_rsubdirs;
+ u64 i_files, i_subdirs;
+ u64 i_max_offset; /* largest readdir offset, set with I_COMPLETE */
+
+ struct rb_root i_fragtree;
+ struct mutex i_fragtree_mutex;
+
+ struct ceph_inode_xattrs_info i_xattrs;
+
+ /* capabilities. protected _both_ by i_lock and cap->session's
+ * s_mutex. */
+ struct rb_root i_caps; /* cap list */
+ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
+ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
+ struct list_head i_dirty_item, i_flushing_item;
+ u64 i_cap_flush_seq;
+ /* we need to track cap writeback on a per-cap-bit basis, to allow
+ * overlapping, pipelined cap flushes to the mds. we can probably
+ * reduce the tid to 8 bits if we're concerned about inode size. */
+ u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+ wait_queue_head_t i_cap_wq; /* threads waiting on a capability */
+ unsigned long i_hold_caps_min; /* jiffies */
+ unsigned long i_hold_caps_max; /* jiffies */
+ struct list_head i_cap_delay_list; /* for delayed cap release to mds */
+ int i_cap_exporting_mds; /* to handle cap migration between */
+ unsigned i_cap_exporting_mseq; /* mds's. */
+ unsigned i_cap_exporting_issued;
+ struct ceph_cap_reservation i_cap_migration_resv;
+ struct list_head i_cap_snaps; /* snapped state pending flush to mds */
+ struct ceph_snap_context *i_head_snapc; /* set if wr_buffer_head > 0 */
+ unsigned i_snap_caps; /* cap bits for snapped files */
+
+ int i_nr_by_mode[CEPH_FILE_MODE_NUM]; /* open file counts */
+
+ u32 i_truncate_seq; /* last truncate to smaller size */
+ u64 i_truncate_size; /* and the size we last truncated down to */
+ int i_truncate_pending; /* still need to call vmtruncate */
+
+ u64 i_max_size; /* max file size authorized by mds */
+ u64 i_reported_size; /* (max_)size reported to or requested of mds */
+ u64 i_wanted_max_size; /* offset we'd like to write too */
+ u64 i_requested_max_size; /* max_size we've requested */
+
+ /* held references to caps */
+ int i_pin_ref;
+ int i_rd_ref, i_rdcache_ref, i_wr_ref;
+ int i_wrbuffer_ref, i_wrbuffer_ref_head;
+ u32 i_shared_gen; /* increment each time we get FILE_SHARED */
+ u32 i_rdcache_gen; /* we increment this each time we get
+ FILE_CACHE. If it's non-zero, we
+ _may_ have cached pages. */
+ u32 i_rdcache_revoking; /* RDCACHE gen to async invalidate, if any */
+
+ struct list_head i_unsafe_writes; /* uncommitted sync writes */
+ struct list_head i_unsafe_dirops; /* uncommitted mds dir ops */
+ spinlock_t i_unsafe_lock;
+
+ struct ceph_snap_realm *i_snap_realm; /* snap realm (if caps) */
+ int i_snap_realm_counter; /* snap realm (if caps) */
+ struct list_head i_snap_realm_item;
+ struct list_head i_snap_flush_item;
+
+ struct work_struct i_wb_work; /* writeback work */
+ struct work_struct i_pg_inv_work; /* page invalidation work */
+
+ struct work_struct i_vmtruncate_work;
+
+ struct inode vfs_inode; /* at end */
+};
+
+static inline struct ceph_inode_info *ceph_inode(struct inode *inode)
+{
+ return container_of(inode, struct ceph_inode_info, vfs_inode);
+}
+
+static inline void ceph_i_clear(struct inode *inode, unsigned mask)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+

+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags &= ~mask;

+ spin_unlock(&inode->i_lock);
+}
+

+static inline void ceph_i_set(struct inode *inode, unsigned mask)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);
+

+ spin_lock(&inode->i_lock);
+ ci->i_ceph_flags |= mask;

+ spin_unlock(&inode->i_lock);
+}
+

+static inline bool ceph_i_test(struct inode *inode, unsigned mask)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ bool r;
+
+ smp_mb();
+ r = (ci->i_ceph_flags & mask) == mask;

+ return r;
+}
+
+

+/* find a specific frag @f */
+extern struct ceph_inode_frag *__ceph_find_frag(struct ceph_inode_info *ci,
+ u32 f);
+
+/*
+ * choose fragment for value @v. copy frag content to pfrag, if leaf
+ * exists
+ */
+extern u32 ceph_choose_frag(struct ceph_inode_info *ci, u32 v,
+ struct ceph_inode_frag *pfrag,
+ int *found);
+
+/*
+ * Ceph dentry state
+ */
+struct ceph_dentry_info {
+ struct ceph_mds_session *lease_session;
+ u32 lease_gen, lease_shared_gen;
+ u32 lease_seq;
+ unsigned long lease_renew_after, lease_renew_from;
+ struct list_head lru;
+ struct dentry *dentry;
+ u64 time;
+ u64 offset;
+};
+
+static inline struct ceph_dentry_info *ceph_dentry(struct dentry *dentry)
+{
+ return (struct ceph_dentry_info *)dentry->d_fsdata;
+}
+
+static inline loff_t ceph_make_fpos(unsigned frag, unsigned off)
+{
+ return ((loff_t)frag << 32) | (loff_t)off;
+}
+
+/*
+ * ino_t is <64 bits on many architectures, blech.
+ *
+ * don't include snap in ino hash, at least for now.
+ */
+static inline ino_t ceph_vino_to_ino(struct ceph_vino vino)
+{
+ ino_t ino = (ino_t)vino.ino; /* ^ (vino.snap << 20); */
+#if BITS_PER_LONG == 32
+ ino ^= vino.ino >> (sizeof(u64)-sizeof(ino_t)) * 8;
+ if (!ino)
+ ino = 1;
+#endif
+ return ino;
+}
+
+static inline int ceph_set_ino_cb(struct inode *inode, void *data)
+{
+ ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
+ inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);

+ return 0;
+}
+

+static inline struct ceph_vino ceph_vino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino;
+}
+
+/* for printf-style formatting */
+#define ceph_vinop(i) ceph_inode(i)->i_vino.ino, ceph_inode(i)->i_vino.snap
+
+static inline u64 ceph_ino(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.ino;
+}
+static inline u64 ceph_snap(struct inode *inode)
+{
+ return ceph_inode(inode)->i_vino.snap;
+}
+
+static inline int ceph_ino_compare(struct inode *inode, void *data)
+{
+ struct ceph_vino *pvino = (struct ceph_vino *)data;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ return ci->i_vino.ino == pvino->ino &&
+ ci->i_vino.snap == pvino->snap;
+}
+
+static inline struct inode *ceph_find_inode(struct super_block *sb,
+ struct ceph_vino vino)
+{
+ ino_t t = ceph_vino_to_ino(vino);
+ return ilookup5(sb, t, ceph_ino_compare, &vino);
+}
+
+
+/*
+ * caps helpers
+ */
+static inline bool __ceph_is_any_real_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps);
+}
+
+extern int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented);
+extern int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int t);
+extern int __ceph_caps_issued_other(struct ceph_inode_info *ci,
+ struct ceph_cap *cap);
+
+static inline int ceph_caps_issued(struct ceph_inode_info *ci)
+{
+ int issued;
+ spin_lock(&ci->vfs_inode.i_lock);
+ issued = __ceph_caps_issued(ci, NULL);
+ spin_unlock(&ci->vfs_inode.i_lock);
+ return issued;
+}
+
+static inline int ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask,
+ int touch)
+{
+ int r;
+ spin_lock(&ci->vfs_inode.i_lock);
+ r = __ceph_caps_issued_mask(ci, mask, touch);
+ spin_unlock(&ci->vfs_inode.i_lock);
+ return r;
+}
+
+static inline int __ceph_caps_dirty(struct ceph_inode_info *ci)
+{
+ return ci->i_dirty_caps | ci->i_flushing_caps;
+}
+extern void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask);
+
+extern int ceph_caps_revoking(struct ceph_inode_info *ci, int mask);
+extern int __ceph_caps_used(struct ceph_inode_info *ci);
+
+extern int __ceph_caps_file_wanted(struct ceph_inode_info *ci);
+
+/*
+ * wanted, by virtue of open file modes AND cap refs (buffered/cached data)
+ */
+static inline int __ceph_caps_wanted(struct ceph_inode_info *ci)
+{
+ int w = __ceph_caps_file_wanted(ci) | __ceph_caps_used(ci);
+ if (w & CEPH_CAP_FILE_BUFFER)
+ w |= CEPH_CAP_FILE_EXCL; /* we want EXCL if dirty data */
+ return w;
+}
+
+/* what the mds thinks we want */
+extern int __ceph_caps_mds_wanted(struct ceph_inode_info *ci);
+
+extern void ceph_caps_init(void);
+extern void ceph_caps_finalize(void);
+extern void ceph_adjust_min_caps(int delta);
+extern int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need);
+extern int ceph_unreserve_caps(struct ceph_cap_reservation *ctx);
+extern void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used,
+ int *reserved, int *min);
+
+static inline struct ceph_client *ceph_inode_to_client(struct inode *inode)
+{
+ return (struct ceph_client *)inode->i_sb->s_fs_info;
+}
+
+static inline struct ceph_client *ceph_sb_to_client(struct super_block *sb)
+{
+ return (struct ceph_client *)sb->s_fs_info;
+}
+
+
+/*
+ * we keep buffered readdir results attached to file->private_data
+ */
+struct ceph_file_info {
+ int fmode; /* initialized on open */
+
+ /* readdir: position within the dir */
+ u32 frag;
+ struct ceph_mds_request *last_readdir;
+ int at_end;
+
+ /* readdir: position within a frag */
+ unsigned offset; /* offset of last chunk, adjusted for . and .. */
+ u64 next_offset; /* offset of next chunk (last_name's + 1) */
+ char *last_name; /* last entry in previous chunk */
+ struct dentry *dentry; /* next dentry (for dcache readdir) */
+ unsigned long dir_release_count;
+
+ /* used for -o dirstat read() on directory thing */
+ char *dir_info;
+ int dir_info_len;
+};
+
+
+
+/*
+ * snapshots
+ */
+
+/*
+ * A "snap context" is the set of existing snapshots when we
+ * write data. It is used by the OSD to guide its COW behavior.
+ *
+ * The ceph_snap_context is refcounted, and attached to each dirty
+ * page, indicating which context the dirty data belonged when it was
+ * dirtied.
+ */
+struct ceph_snap_context {
+ atomic_t nref;
+ u64 seq;
+ int num_snaps;
+ u64 snaps[];
+};
+
+static inline struct ceph_snap_context *
+ceph_get_snap_context(struct ceph_snap_context *sc)
+{
+ /*
+ printk("get_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+ atomic_read(&sc->nref)+1);
+ */
+ if (sc)
+ atomic_inc(&sc->nref);
+ return sc;
+}
+
+static inline void ceph_put_snap_context(struct ceph_snap_context *sc)
+{
+ if (!sc)
+ return;
+ /*
+ printk("put_snap_context %p %d -> %d\n", sc, atomic_read(&sc->nref),
+ atomic_read(&sc->nref)-1);
+ */
+ if (atomic_dec_and_test(&sc->nref)) {
+ /*printk(" deleting snap_context %p\n", sc);*/
+ kfree(sc);
+ }
+}
+
+/*
+ * A "snap realm" describes a subset of the file hierarchy sharing
+ * the same set of snapshots that apply to it. The realms themselves
+ * are organized into a hierarchy, such that children inherit (some of)
+ * the snapshots of their parents.
+ *
+ * All inodes within the realm that have capabilities are linked into a
+ * per-realm list.
+ */
+struct ceph_snap_realm {
+ u64 ino;
+ atomic_t nref;
+ struct rb_node node;
+
+ u64 created, seq;
+ u64 parent_ino;
+ u64 parent_since; /* snapid when our current parent became so */
+
+ u64 *prior_parent_snaps; /* snaps inherited from any parents we */
+ int num_prior_parent_snaps; /* had prior to parent_since */
+ u64 *snaps; /* snaps specific to this realm */
+ int num_snaps;
+
+ struct ceph_snap_realm *parent;
+ struct list_head children; /* list of child realms */
+ struct list_head child_item;
+
+ struct list_head empty_item; /* if i have ref==0 */
+
+ /* the current set of snaps for this realm */
+ struct ceph_snap_context *cached_context;
+
+ struct list_head inodes_with_caps;
+ spinlock_t inodes_with_caps_lock;
+};
+
+
+
+/*
+ * calculate the number of pages a given length and offset map onto,
+ * if we align the data.
+ */
+static inline int calc_pages_for(u64 off, u64 len)
+{
+ return ((off+len+PAGE_CACHE_SIZE-1) >> PAGE_CACHE_SHIFT) -
+ (off >> PAGE_CACHE_SHIFT);
+}
+
+
+
+/* snap.c */
+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino);
+extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+extern int ceph_update_snap_trace(struct ceph_mds_client *m,
+ void *p, void *e, bool deletion);
+extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern void ceph_queue_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_snap_context *snapc);
+extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap);
+extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
+
+/*
+ * a cap_snap is "pending" if it is still awaiting an in-progress
+ * sync write (that may/may not still update size, mtime, etc.).
+ */
+static inline bool __ceph_have_pending_cap_snap(struct ceph_inode_info *ci)
+{
+ return !list_empty(&ci->i_cap_snaps) &&
+ list_entry(ci->i_cap_snaps.prev, struct ceph_cap_snap,
+ ci_item)->writing;
+}
+
+
+/* super.c */
+extern struct kmem_cache *ceph_inode_cachep;
+extern struct kmem_cache *ceph_cap_cachep;
+extern struct kmem_cache *ceph_dentry_cachep;
+extern struct kmem_cache *ceph_file_cachep;
+
+extern const char *ceph_msg_type_name(int type);
+extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
+
+#define FSID_FORMAT "%02x%02x%02x%02x-%02x%02x-%02x%02x-%02x%02x-" \
+ "%02x%02x%02x%02x%02x%02x"
+#define PR_FSID(f) (f)->fsid[0], (f)->fsid[1], (f)->fsid[2], (f)->fsid[3], \
+ (f)->fsid[4], (f)->fsid[5], (f)->fsid[6], (f)->fsid[7], \
+ (f)->fsid[8], (f)->fsid[9], (f)->fsid[10], (f)->fsid[11], \
+ (f)->fsid[12], (f)->fsid[13], (f)->fsid[14], (f)->fsid[15]
+
+/* inode.c */
+extern const struct inode_operations ceph_file_iops;
+
+extern struct inode *ceph_alloc_inode(struct super_block *sb);
+extern void ceph_destroy_inode(struct inode *inode);
+
+extern struct inode *ceph_get_inode(struct super_block *sb,
+ struct ceph_vino vino);
+extern struct inode *ceph_get_snapdir(struct inode *parent);
+extern int ceph_fill_file_size(struct inode *inode, int issued,
+ u32 truncate_seq, u64 truncate_size, u64 size);
+extern void ceph_fill_file_time(struct inode *inode, int issued,
+ u64 time_warp_seq, struct timespec *ctime,
+ struct timespec *mtime, struct timespec *atime);
+extern int ceph_fill_trace(struct super_block *sb,
+ struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
+ struct ceph_mds_session *session);
+
+extern int ceph_inode_holds_cap(struct inode *inode, int mask);
+
+extern int ceph_inode_set_size(struct inode *inode, loff_t size);
+extern void __ceph_do_pending_vmtruncate(struct inode *inode);
+extern void ceph_queue_vmtruncate(struct inode *inode);
+
+extern void ceph_queue_invalidate(struct inode *inode);
+extern void ceph_queue_writeback(struct inode *inode);
+
+extern int ceph_do_getattr(struct inode *inode, int mask);
+extern int ceph_permission(struct inode *inode, int mask);
+extern int ceph_setattr(struct dentry *dentry, struct iattr *attr);
+extern int ceph_getattr(struct vfsmount *mnt, struct dentry *dentry,
+ struct kstat *stat);
+
+/* xattr.c */
+extern int ceph_setxattr(struct dentry *, const char *, const void *,
+ size_t, int);
+extern ssize_t ceph_getxattr(struct dentry *, const char *, void *, size_t);
+extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
+extern int ceph_removexattr(struct dentry *, const char *);
+extern void __ceph_build_xattrs_blob(struct ceph_inode_info *ci);
+extern void __ceph_destroy_xattrs(struct ceph_inode_info *ci);
+
+/* caps.c */
+extern const char *ceph_cap_string(int c);
+extern void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg);
+extern int ceph_add_cap(struct inode *inode,
+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,
+ unsigned cap, unsigned seq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation);
+extern void __ceph_remove_cap(struct ceph_cap *cap);
+static inline void ceph_remove_cap(struct ceph_cap *cap)
+{
+ struct inode *inode = &cap->ci->vfs_inode;
+ spin_lock(&inode->i_lock);
+ __ceph_remove_cap(cap);
+ spin_unlock(&inode->i_lock);
+}
+extern void ceph_put_cap(struct ceph_cap *cap);
+
+extern void ceph_queue_caps_release(struct inode *inode);
+extern int ceph_write_inode(struct inode *inode, int unused);
+extern int ceph_fsync(struct file *file, struct dentry *dentry, int datasync);
+extern void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session);
+extern int ceph_get_cap_mds(struct inode *inode);
+extern void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps);
+extern void ceph_put_cap_refs(struct ceph_inode_info *ci, int had);
+extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc);
+extern void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession);
+extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session);
+extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
+extern void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc);
+
+extern int ceph_encode_inode_release(void **p, struct inode *inode,
+ int mds, int drop, int unless, int force);
+extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
+ int mds, int drop, int unless);
+
+extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
+ int *got, loff_t endoff);
+
+/* for counting open files by mode */
+static inline void __ceph_get_fmode(struct ceph_inode_info *ci, int mode)
+{
+ ci->i_nr_by_mode[mode]++;
+}
+extern void ceph_put_fmode(struct ceph_inode_info *ci, int mode);
+
+/* addr.c */
+extern const struct address_space_operations ceph_aops;
+extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
+
+/* file.c */
+extern const struct file_operations ceph_file_fops;
+extern const struct address_space_operations ceph_aops;
+extern int ceph_open(struct inode *inode, struct file *file);
+extern struct dentry *ceph_lookup_open(struct inode *dir, struct dentry *dentry,
+ struct nameidata *nd, int mode,
+ int locked_dir);
+extern int ceph_release(struct inode *inode, struct file *filp);
+extern void ceph_release_page_vector(struct page **pages, int num_pages);
+
+/* dir.c */
+extern const struct file_operations ceph_dir_fops;
+extern const struct inode_operations ceph_dir_iops;
+extern struct dentry_operations ceph_dentry_ops, ceph_snap_dentry_ops,
+ ceph_snapdir_dentry_ops;
+
+extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry);
+extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req,

+ struct dentry *dentry, int err);
+

+extern void ceph_dentry_lru_add(struct dentry *dn);
+extern void ceph_dentry_lru_touch(struct dentry *dn);
+extern void ceph_dentry_lru_del(struct dentry *dn);
+
+/*
+ * our d_ops vary depending on whether the inode is live,
+ * snapshotted (read-only), or a virtual ".snap" directory.

+ */
+int ceph_init_dentry(struct dentry *dentry);
+
+

+/* ioctl.c */
+extern long ceph_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
+
+/* export.c */
+extern const struct export_operations ceph_export_ops;
+
+/* debugfs.c */
+extern int ceph_debugfs_init(void);
+extern void ceph_debugfs_cleanup(void);
+extern int ceph_debugfs_client_init(struct ceph_client *client);
+extern void ceph_debugfs_client_cleanup(struct ceph_client *client);
+
+static inline struct inode *get_dentry_parent_inode(struct dentry *dentry)
+{
+ if (dentry && dentry->d_parent)
+ return dentry->d_parent->d_inode;
+
+ return NULL;
+}
+
+#endif /* _FS_CEPH_SUPER_H */
diff --git a/fs/ceph/types.h b/fs/ceph/types.h
new file mode 100644
index 0000000..28b35a0
--- /dev/null
+++ b/fs/ceph/types.h
@@ -0,0 +1,29 @@
+#ifndef _FS_CEPH_TYPES_H
+#define _FS_CEPH_TYPES_H
+
+/* needed before including ceph_fs.h */
+#include <linux/in.h>
+#include <linux/types.h>
+#include <linux/fcntl.h>
+#include <linux/string.h>
+
+#include "ceph_fs.h"
+#include "ceph_frag.h"
+#include "ceph_hash.h"
+
+/*
+ * Identify inodes by both their ino AND snapshot id (a u64).
+ */
+struct ceph_vino {
+ u64 ino;
+ u64 snap;
+};
+
+
+/* context for the caps reservation mechanism */
+struct ceph_cap_reservation {
+ int count;
+};

Sage Weil

unread,

Mar 2, 2010, 8:00:02 PM3/2/10

The OSD client is responsible for reading and writing data from/to the
object storage pool. This includes determining where objects are
stored in the cluster, and ensuring that requests are retried or
redirected in the event of a node failure or data migration.

If an OSD does not respond before a timeout expires, keepalive
messages are sent across the lossless, ordered communications channel
to ensure that any break in the TCP is discovered. If the session
does reset, a reconnection is attempted and affected requests are
resent (by the message transport layer).

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/osd_client.c | 1537 ++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/osd_client.h | 166 ++++++
fs/ceph/osdmap.c | 1019 +++++++++++++++++++++++++++++++++
fs/ceph/osdmap.h | 125 ++++
4 files changed, 2847 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/osd_client.c
create mode 100644 fs/ceph/osd_client.h
create mode 100644 fs/ceph/osdmap.c
create mode 100644 fs/ceph/osdmap.h

diff --git a/fs/ceph/osd_client.c b/fs/ceph/osd_client.c
new file mode 100644
index 0000000..daf545f
--- /dev/null
+++ b/fs/ceph/osd_client.c
@@ -0,0 +1,1537 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/highmem.h>
+#include <linux/mm.h>
+#include <linux/pagemap.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>

+
+#include "super.h"
+#include "osd_client.h"

+#include "messenger.h"
+#include "decode.h"
+#include "auth.h"
+
+#define OSD_OP_FRONT_LEN 4096
+#define OSD_OPREPLY_FRONT_LEN 512
+
+const static struct ceph_connection_operations osd_con_ops;
+static int __kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd);
+
+static void kick_requests(struct ceph_osd_client *osdc, struct ceph_osd *osd);
+
+/*
+ * Implement client access to distributed object storage cluster.
+ *
+ * All data objects are stored within a cluster/cloud of OSDs, or
+ * "object storage devices." (Note that Ceph OSDs have _nothing_ to
+ * do with the T10 OSD extensions to SCSI.) Ceph OSDs are simply
+ * remote daemons serving up and coordinating consistent and safe
+ * access to storage.
+ *
+ * Cluster membership and the mapping of data objects onto storage devices
+ * are described by the osd map.
+ *
+ * We keep track of pending OSD requests (read, write), resubmit
+ * requests to different OSDs when the cluster topology/data layout
+ * change, or retry the affected requests when the communications
+ * channel with an OSD is reset.
+ */
+
+/*
+ * calculate the mapping of a file extent onto an object, and fill out the
+ * request accordingly. shorten extent as necessary if it crosses an
+ * object boundary.
+ *
+ * fill osd op in request message.
+ */
+static void calc_layout(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,

+ struct ceph_osd_request *req)
+{

+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_osd_op *op = (void *)(reqhead + 1);
+ u64 orig_len = *plen;
+ u64 objoff, objlen; /* extent in object */
+ u64 bno;
+
+ reqhead->snapid = cpu_to_le64(vino.snap);
+
+ /* object extent? */
+ ceph_calc_file_object_mapping(layout, off, plen, &bno,
+ &objoff, &objlen);
+ if (*plen < orig_len)
+ dout(" skipping last %llu, final file extent %llu~%llu\n",
+ orig_len - *plen, off, *plen);
+
+ sprintf(req->r_oid, "%llx.%08llx", vino.ino, bno);
+ req->r_oid_len = strlen(req->r_oid);
+
+ op->extent.offset = cpu_to_le64(objoff);
+ op->extent.length = cpu_to_le64(objlen);
+ req->r_num_pages = calc_pages_for(off, *plen);
+
+ dout("calc_layout %s (%d) %llu~%llu (%d pages)\n",
+ req->r_oid, req->r_oid_len, objoff, objlen, req->r_num_pages);
+}
+
+/*
+ * requests
+ */
+void ceph_osdc_release_request(struct kref *kref)
+{
+ struct ceph_osd_request *req = container_of(kref,
+ struct ceph_osd_request,
+ r_kref);

+
+ if (req->r_request)

+ ceph_msg_put(req->r_request);
+ if (req->r_reply)
+ ceph_msg_put(req->r_reply);
+ if (req->r_con_filling_msg) {
+ dout("release_request revoking pages %p from con %p\n",
+ req->r_pages, req->r_con_filling_msg);
+ ceph_con_revoke_message(req->r_con_filling_msg,
+ req->r_reply);
+ ceph_con_put(req->r_con_filling_msg);
+ }
+ if (req->r_own_pages)
+ ceph_release_page_vector(req->r_pages,
+ req->r_num_pages);
+ ceph_put_snap_context(req->r_snapc);
+ if (req->r_mempool)
+ mempool_free(req, req->r_osdc->req_mempool);
+ else
+ kfree(req);
+}
+
+/*
+ * build new request AND message, calculate layout, and adjust file
+ * extent as needed.
+ *
+ * if the file was recently truncated, we include information about its
+ * old and new size so that the object can be updated appropriately. (we
+ * avoid synchronously deleting truncated objects because it's slow.)
+ *
+ * if @do_sync, include a 'startsync' command so that the osd will flush
+ * data quickly.
+ */
+struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *osdc,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 off, u64 *plen,
+ int opcode, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync,
+ u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply)
+{
+ struct ceph_osd_request *req;
+ struct ceph_msg *msg;

+ struct ceph_osd_request_head *head;
+ struct ceph_osd_op *op;

+ void *p;
+ int num_op = 1 + do_sync;
+ size_t msg_size = sizeof(*head) + num_op*sizeof(*op);
+ int i;
+
+ if (use_mempool) {
+ req = mempool_alloc(osdc->req_mempool, GFP_NOFS);
+ memset(req, 0, sizeof(*req));
+ } else {
+ req = kzalloc(sizeof(*req), GFP_NOFS);
+ }
+ if (req == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ req->r_osdc = osdc;
+ req->r_mempool = use_mempool;
+ kref_init(&req->r_kref);
+ init_completion(&req->r_completion);
+ init_completion(&req->r_safe_completion);
+ INIT_LIST_HEAD(&req->r_unsafe_item);
+ req->r_flags = flags;
+
+ WARN_ON((flags & (CEPH_OSD_FLAG_READ|CEPH_OSD_FLAG_WRITE)) == 0);
+
+ /* create reply message */
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY,
+ OSD_OPREPLY_FRONT_LEN, 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ ceph_osdc_put_request(req);
+ return ERR_PTR(PTR_ERR(msg));
+ }
+ req->r_reply = msg;
+
+ /* create request message; allow space for oid */
+ msg_size += 40;
+ if (snapc)
+ msg_size += sizeof(u64) * snapc->num_snaps;
+ if (use_mempool)
+ msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
+ else
+ msg = ceph_msg_new(CEPH_MSG_OSD_OP, msg_size, 0, 0, NULL);
+ if (IS_ERR(msg)) {
+ ceph_osdc_put_request(req);
+ return ERR_PTR(PTR_ERR(msg));
+ }
+ msg->hdr.type = cpu_to_le16(CEPH_MSG_OSD_OP);
+ memset(msg->front.iov_base, 0, msg->front.iov_len);
+ head = msg->front.iov_base;

+ op = (void *)(head + 1);

+ p = (void *)(op + num_op);
+
+ req->r_request = msg;
+ req->r_snapc = ceph_get_snap_context(snapc);
+
+ head->client_inc = cpu_to_le32(1); /* always, for now. */
+ head->flags = cpu_to_le32(flags);
+ if (flags & CEPH_OSD_FLAG_WRITE)
+ ceph_encode_timespec(&head->mtime, mtime);
+ head->num_ops = cpu_to_le16(num_op);
+ op->op = cpu_to_le16(opcode);
+
+ /* calculate max write size */
+ calc_layout(osdc, vino, layout, off, plen, req);
+ req->r_file_layout = *layout; /* keep a copy */
+
+ if (flags & CEPH_OSD_FLAG_WRITE) {
+ req->r_request->hdr.data_off = cpu_to_le16(off);
+ req->r_request->hdr.data_len = cpu_to_le32(*plen);
+ op->payload_len = cpu_to_le32(*plen);
+ }
+ op->extent.truncate_size = cpu_to_le64(truncate_size);
+ op->extent.truncate_seq = cpu_to_le32(truncate_seq);
+
+ /* fill in oid */
+ head->object_len = cpu_to_le32(req->r_oid_len);
+ memcpy(p, req->r_oid, req->r_oid_len);
+ p += req->r_oid_len;
+
+ if (do_sync) {
+ op++;
+ op->op = cpu_to_le16(CEPH_OSD_OP_STARTSYNC);
+ }
+ if (snapc) {
+ head->snap_seq = cpu_to_le64(snapc->seq);
+ head->num_snaps = cpu_to_le32(snapc->num_snaps);
+ for (i = 0; i < snapc->num_snaps; i++) {
+ put_unaligned_le64(snapc->snaps[i], p);
+ p += sizeof(u64);
+ }
+ }
+
+ BUG_ON(p > msg->front.iov_base + msg->front.iov_len);
+ msg_size = p - msg->front.iov_base;
+ msg->front.iov_len = msg_size;
+ msg->hdr.front_len = cpu_to_le32(msg_size);
+ return req;
+}
+
+/*
+ * We keep osd requests in an rbtree, sorted by ->r_tid.
+ */
+static void __insert_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *new)
+{
+ struct rb_node **p = &osdc->requests.rb_node;
+ struct rb_node *parent = NULL;

+ struct ceph_osd_request *req = NULL;
+

+ while (*p) {
+ parent = *p;
+ req = rb_entry(parent, struct ceph_osd_request, r_node);
+ if (new->r_tid < req->r_tid)
+ p = &(*p)->rb_left;
+ else if (new->r_tid > req->r_tid)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->r_node, parent, p);
+ rb_insert_color(&new->r_node, &osdc->requests);
+}
+
+static struct ceph_osd_request *__lookup_request(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid)
+ n = n->rb_left;
+ else if (tid > req->r_tid)
+ n = n->rb_right;
+ else
+ return req;

+ }
+ return NULL;
+}
+

+static struct ceph_osd_request *
+__lookup_request_ge(struct ceph_osd_client *osdc,
+ u64 tid)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *n = osdc->requests.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_osd_request, r_node);
+ if (tid < req->r_tid) {
+ if (!n->rb_left)
+ return req;
+ n = n->rb_left;
+ } else if (tid > req->r_tid) {
+ n = n->rb_right;
+ } else {
+ return req;
+ }

+ }
+ return NULL;
+}
+

+
+/*
+ * If the osd connection drops, we need to resubmit all requests.
+ */
+static void osd_reset(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+
+ if (!osd)
+ return;
+ dout("osd_reset osd%d\n", osd->o_osd);
+ osdc = osd->o_osdc;
+ down_read(&osdc->map_sem);
+ kick_requests(osdc, osd);
+ up_read(&osdc->map_sem);
+}
+
+/*
+ * Track open sessions with osds.
+ */
+static struct ceph_osd *create_osd(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd *osd;
+
+ osd = kzalloc(sizeof(*osd), GFP_NOFS);
+ if (!osd)
+ return NULL;
+
+ atomic_set(&osd->o_ref, 1);
+ osd->o_osdc = osdc;
+ INIT_LIST_HEAD(&osd->o_requests);
+ INIT_LIST_HEAD(&osd->o_osd_lru);
+ osd->o_incarnation = 1;
+
+ ceph_con_init(osdc->client->msgr, &osd->o_con);
+ osd->o_con.private = osd;
+ osd->o_con.ops = &osd_con_ops;
+ osd->o_con.peer_name.type = CEPH_ENTITY_TYPE_OSD;
+
+ INIT_LIST_HEAD(&osd->o_keepalive_item);
+ return osd;
+}
+
+static struct ceph_osd *get_osd(struct ceph_osd *osd)
+{
+ if (atomic_inc_not_zero(&osd->o_ref)) {
+ dout("get_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref)-1,
+ atomic_read(&osd->o_ref));
+ return osd;
+ } else {
+ dout("get_osd %p FAIL\n", osd);

+ return NULL;
+ }
+}
+

+static void put_osd(struct ceph_osd *osd)
+{
+ dout("put_osd %p %d -> %d\n", osd, atomic_read(&osd->o_ref),
+ atomic_read(&osd->o_ref) - 1);
+ if (atomic_dec_and_test(&osd->o_ref))
+ kfree(osd);
+}
+
+/*
+ * remove an osd from our map
+ */
+static void __remove_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{
+ dout("__remove_osd %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_requests));
+ rb_erase(&osd->o_node, &osdc->osds);
+ list_del_init(&osd->o_osd_lru);
+ ceph_con_close(&osd->o_con);
+ put_osd(osd);
+}
+
+static void __move_osd_to_lru(struct ceph_osd_client *osdc,
+ struct ceph_osd *osd)
+{
+ dout("__move_osd_to_lru %p\n", osd);
+ BUG_ON(!list_empty(&osd->o_osd_lru));
+ list_add_tail(&osd->o_osd_lru, &osdc->osd_lru);
+ osd->lru_ttl = jiffies + osdc->client->mount_args->osd_idle_ttl * HZ;
+}
+
+static void __remove_osd_from_lru(struct ceph_osd *osd)
+{
+ dout("__remove_osd_from_lru %p\n", osd);
+ if (!list_empty(&osd->o_osd_lru))
+ list_del_init(&osd->o_osd_lru);
+}
+
+static void remove_old_osds(struct ceph_osd_client *osdc, int remove_all)
+{
+ struct ceph_osd *osd, *nosd;
+
+ dout("__remove_old_osds %p\n", osdc);
+ mutex_lock(&osdc->request_mutex);
+ list_for_each_entry_safe(osd, nosd, &osdc->osd_lru, o_osd_lru) {
+ if (!remove_all && time_before(jiffies, osd->lru_ttl))
+ break;
+ __remove_osd(osdc, osd);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * reset osd connect
+ */
+static int __reset_osd(struct ceph_osd_client *osdc, struct ceph_osd *osd)
+{

+ int ret = 0;
+

+ dout("__reset_osd %p osd%d\n", osd, osd->o_osd);
+ if (list_empty(&osd->o_requests)) {
+ __remove_osd(osdc, osd);
+ } else {
+ ceph_con_close(&osd->o_con);
+ ceph_con_open(&osd->o_con, &osdc->osdmap->osd_addr[osd->o_osd]);
+ osd->o_incarnation++;
+ }

+ return ret;
+}
+

+static void __insert_osd(struct ceph_osd_client *osdc, struct ceph_osd *new)
+{
+ struct rb_node **p = &osdc->osds.rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_osd *osd = NULL;
+
+ while (*p) {
+ parent = *p;
+ osd = rb_entry(parent, struct ceph_osd, o_node);
+ if (new->o_osd < osd->o_osd)
+ p = &(*p)->rb_left;
+ else if (new->o_osd > osd->o_osd)
+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->o_node, parent, p);
+ rb_insert_color(&new->o_node, &osdc->osds);
+}
+
+static struct ceph_osd *__lookup_osd(struct ceph_osd_client *osdc, int o)
+{
+ struct ceph_osd *osd;
+ struct rb_node *n = osdc->osds.rb_node;
+
+ while (n) {
+ osd = rb_entry(n, struct ceph_osd, o_node);
+ if (o < osd->o_osd)
+ n = n->rb_left;
+ else if (o > osd->o_osd)
+ n = n->rb_right;
+ else
+ return osd;

+ }
+ return NULL;
+}
+

+static void __schedule_osd_timeout(struct ceph_osd_client *osdc)
+{
+ schedule_delayed_work(&osdc->timeout_work,
+ osdc->client->mount_args->osd_keepalive_timeout * HZ);
+}
+
+static void __cancel_osd_timeout(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work_sync(&osdc->timeout_work);
+}
+
+/*
+ * Register request, assign tid. If this is the first request, set up
+ * the timeout event.
+ */
+static void register_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req)
+{
+ mutex_lock(&osdc->request_mutex);
+ req->r_tid = ++osdc->last_tid;
+ req->r_request->hdr.tid = cpu_to_le64(req->r_tid);
+ INIT_LIST_HEAD(&req->r_req_lru_item);
+
+ dout("register_request %p tid %lld\n", req, req->r_tid);
+ __insert_request(osdc, req);
+ ceph_osdc_get_request(req);
+ osdc->num_requests++;
+
+ if (osdc->num_requests == 1) {
+ dout(" first request, scheduling timeout\n");
+ __schedule_osd_timeout(osdc);
+ }
+ mutex_unlock(&osdc->request_mutex);
+}
+
+/*
+ * called under osdc->request_mutex
+ */
+static void __unregister_request(struct ceph_osd_client *osdc,

+ struct ceph_osd_request *req)
+{

+ dout("__unregister_request %p tid %lld\n", req, req->r_tid);
+ rb_erase(&req->r_node, &osdc->requests);
+ osdc->num_requests--;
+
+ if (req->r_osd) {
+ /* make sure the original request isn't in flight. */
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+
+ list_del_init(&req->r_osd_item);
+ if (list_empty(&req->r_osd->o_requests))
+ __move_osd_to_lru(osdc, req->r_osd);
+ req->r_osd = NULL;
+ }
+
+ ceph_osdc_put_request(req);
+
+ list_del_init(&req->r_req_lru_item);
+ if (osdc->num_requests == 0) {
+ dout(" no requests, canceling timeout\n");
+ __cancel_osd_timeout(osdc);
+ }
+}
+
+/*
+ * Cancel a previously queued request message
+ */
+static void __cancel_request(struct ceph_osd_request *req)
+{
+ if (req->r_sent) {
+ ceph_con_revoke(&req->r_osd->o_con, req->r_request);
+ req->r_sent = 0;
+ }
+ list_del_init(&req->r_req_lru_item);
+}
+
+/*
+ * Pick an osd (the first 'up' osd in the pg), allocate the osd struct
+ * (as needed), and set the request r_osd appropriately. If there is
+ * no up osd, set r_osd to NULL.
+ *
+ * Return 0 if unchanged, 1 if changed, or negative on error.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static int __map_osds(struct ceph_osd_client *osdc,

+ struct ceph_osd_request *req)
+{

+ struct ceph_osd_request_head *reqhead = req->r_request->front.iov_base;
+ struct ceph_pg pgid;
+ int o = -1;
+ int err;
+
+ dout("map_osds %p tid %lld\n", req, req->r_tid);
+ err = ceph_calc_object_layout(&reqhead->layout, req->r_oid,
+ &req->r_file_layout, osdc->osdmap);

+ if (err)
+ return err;

+ pgid = reqhead->layout.ol_pgid;
+ req->r_pgid = pgid;
+
+ o = ceph_calc_pg_primary(osdc->osdmap, pgid);
+
+ if ((req->r_osd && req->r_osd->o_osd == o &&
+ req->r_sent >= req->r_osd->o_incarnation) ||
+ (req->r_osd == NULL && o == -1))
+ return 0; /* no change */
+
+ dout("map_osds tid %llu pgid %d.%x osd%d (was osd%d)\n",
+ req->r_tid, le32_to_cpu(pgid.pool), le16_to_cpu(pgid.ps), o,
+ req->r_osd ? req->r_osd->o_osd : -1);
+
+ if (req->r_osd) {
+ __cancel_request(req);
+ list_del_init(&req->r_osd_item);
+ req->r_osd = NULL;
+ }
+
+ req->r_osd = __lookup_osd(osdc, o);
+ if (!req->r_osd && o >= 0) {
+ err = -ENOMEM;
+ req->r_osd = create_osd(osdc);
+ if (!req->r_osd)
+ goto out;
+
+ dout("map_osds osd %p is osd%d\n", req->r_osd, o);
+ req->r_osd->o_osd = o;
+ req->r_osd->o_con.peer_name.num = cpu_to_le64(o);
+ __insert_osd(osdc, req->r_osd);
+
+ ceph_con_open(&req->r_osd->o_con, &osdc->osdmap->osd_addr[o]);
+ }
+
+ if (req->r_osd) {
+ __remove_osd_from_lru(req->r_osd);
+ list_add(&req->r_osd_item, &req->r_osd->o_requests);
+ }
+ err = 1; /* osd changed */
+
+out:
+ return err;
+}
+
+/*
+ * caller should hold map_sem (for read) and request_mutex
+ */
+static int __send_request(struct ceph_osd_client *osdc,

+ struct ceph_osd_request *req)
+{

+ struct ceph_osd_request_head *reqhead;
+ int err;
+
+ err = __map_osds(osdc, req);
+ if (err < 0)
+ return err;
+ if (req->r_osd == NULL) {
+ dout("send_request %p no up osds in pg\n", req);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);

+ return 0;
+ }
+

+ dout("send_request %p tid %llu to osd%d flags %d\n",
+ req, req->r_tid, req->r_osd->o_osd, req->r_flags);
+
+ reqhead = req->r_request->front.iov_base;
+ reqhead->osdmap_epoch = cpu_to_le32(osdc->osdmap->epoch);
+ reqhead->flags |= cpu_to_le32(req->r_flags); /* e.g., RETRY */
+ reqhead->reassert_version = req->r_reassert_version;
+
+ req->r_sent_stamp = jiffies;
+ list_move_tail(&osdc->req_lru, &req->r_req_lru_item);
+
+ ceph_msg_get(req->r_request); /* send consumes a ref */
+ ceph_con_send(&req->r_osd->o_con, req->r_request);
+ req->r_sent = req->r_osd->o_incarnation;

+ return 0;
+}
+

+/*
+ * Timeout callback, called every N seconds when 1 or more osd
+ * requests has been active for more than N seconds. When this
+ * happens, we ping all OSDs with requests who have timed out to
+ * ensure any communications channel reset is detected. Reset the
+ * request timeouts another N seconds in the future as we go.
+ * Reschedule the timeout event another N seconds in future (unless
+ * there are no open requests).
+ */
+static void handle_timeout(struct work_struct *work)
+{

+ struct ceph_osd_client *osdc =

+ container_of(work, struct ceph_osd_client, timeout_work.work);
+ struct ceph_osd_request *req, *last_req = NULL;
+ struct ceph_osd *osd;
+ unsigned long timeout = osdc->client->mount_args->osd_timeout * HZ;
+ unsigned long keepalive =
+ osdc->client->mount_args->osd_keepalive_timeout * HZ;
+ unsigned long last_sent = 0;
+ struct rb_node *p;
+ struct list_head slow_osds;
+
+ dout("timeout\n");
+ down_read(&osdc->map_sem);
+
+ ceph_monc_request_next_osdmap(&osdc->client->monc);

+
+ mutex_lock(&osdc->request_mutex);
+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {

+ req = rb_entry(p, struct ceph_osd_request, r_node);
+

+ if (req->r_resend) {
+ int err;
+
+ dout("osdc resending prev failed %lld\n", req->r_tid);
+ err = __send_request(osdc, req);
+ if (err)
+ dout("osdc failed again on %lld\n", req->r_tid);
+ else
+ req->r_resend = false;
+ continue;
+ }
+ }
+
+ /*
+ * reset osds that appear to be _really_ unresponsive. this
+ * is a failsafe measure.. we really shouldn't be getting to
+ * this point if the system is working properly. the monitors
+ * should mark the osd as failed and we should find out about
+ * it from an updated osd map.
+ */
+ while (!list_empty(&osdc->req_lru)) {
+ req = list_entry(osdc->req_lru.next, struct ceph_osd_request,
+ r_req_lru_item);
+
+ if (time_before(jiffies, req->r_sent_stamp + timeout))
+ break;
+
+ BUG_ON(req == last_req && req->r_sent_stamp == last_sent);
+ last_req = req;
+ last_sent = req->r_sent_stamp;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ pr_warning(" tid %llu timed out on osd%d, will reset osd\n",
+ req->r_tid, osd->o_osd);
+ __kick_requests(osdc, osd);
+ }
+
+ /*
+ * ping osds that are a bit slow. this ensures that if there
+ * is a break in the TCP connection we will notice, and reopen
+ * a connection with that osd (from the fault callback).
+ */
+ INIT_LIST_HEAD(&slow_osds);
+ list_for_each_entry(req, &osdc->req_lru, r_req_lru_item) {
+ if (time_before(jiffies, req->r_sent_stamp + keepalive))
+ break;
+
+ osd = req->r_osd;
+ BUG_ON(!osd);
+ dout(" tid %llu is slow, will send keepalive on osd%d\n",
+ req->r_tid, osd->o_osd);
+ list_move_tail(&osd->o_keepalive_item, &slow_osds);
+ }
+ while (!list_empty(&slow_osds)) {
+ osd = list_entry(slow_osds.next, struct ceph_osd,
+ o_keepalive_item);
+ list_del_init(&osd->o_keepalive_item);
+ ceph_con_keepalive(&osd->o_con);
+ }
+
+ __schedule_osd_timeout(osdc);
+ mutex_unlock(&osdc->request_mutex);

+
+ up_read(&osdc->map_sem);
+}
+

+static void handle_osds_timeout(struct work_struct *work)
+{

+ struct ceph_osd_client *osdc =

+ container_of(work, struct ceph_osd_client,
+ osds_timeout_work.work);
+ unsigned long delay =
+ osdc->client->mount_args->osd_idle_ttl * HZ >> 2;
+
+ dout("osds timeout\n");
+ down_read(&osdc->map_sem);
+ remove_old_osds(osdc, 0);
+ up_read(&osdc->map_sem);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(delay));
+}
+
+/*
+ * handle osd op reply. either call the callback if it is specified,
+ * or do the completion to wake up the waiting thread.
+ */
+static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg,
+ struct ceph_connection *con)
+{
+ struct ceph_osd_reply_head *rhead = msg->front.iov_base;
+ struct ceph_osd_request *req;
+ u64 tid;
+ int numops, object_len, flags;
+
+ tid = le64_to_cpu(msg->hdr.tid);
+ if (msg->front.iov_len < sizeof(*rhead))
+ goto bad;
+ numops = le32_to_cpu(rhead->num_ops);
+ object_len = le32_to_cpu(rhead->object_len);
+ if (msg->front.iov_len != sizeof(*rhead) + object_len +
+ numops * sizeof(struct ceph_osd_op))
+ goto bad;
+ dout("handle_reply %p tid %llu\n", msg, tid);
+
+ /* lookup */
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (req == NULL) {
+ dout("handle_reply tid %llu dne\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ return;
+ }
+ ceph_osdc_get_request(req);
+ flags = le32_to_cpu(rhead->flags);
+
+ /*
+ * if this connection filled our message, drop our reference now, to
+ * avoid a (safe but slower) revoke later.
+ */
+ if (req->r_con_filling_msg == con && req->r_reply == msg) {
+ dout(" dropping con_filling_msg ref %p\n", con);
+ req->r_con_filling_msg = NULL;
+ ceph_con_put(con);
+ }
+
+ if (!req->r_got_reply) {
+ unsigned bytes;
+
+ req->r_result = le32_to_cpu(rhead->result);
+ bytes = le32_to_cpu(msg->hdr.data_len);
+ dout("handle_reply result %d bytes %d\n", req->r_result,
+ bytes);
+ if (req->r_result == 0)
+ req->r_result = bytes;
+
+ /* in case this is a write and we need to replay, */
+ req->r_reassert_version = rhead->reassert_version;
+
+ req->r_got_reply = 1;
+ } else if ((flags & CEPH_OSD_FLAG_ONDISK) == 0) {
+ dout("handle_reply tid %llu dup ack\n", tid);
+ mutex_unlock(&osdc->request_mutex);
+ goto done;
+ }
+
+ dout("handle_reply tid %llu flags %d\n", tid, flags);
+
+ /* either this is a read, or we got the safe response */
+ if ((flags & CEPH_OSD_FLAG_ONDISK) ||
+ ((flags & CEPH_OSD_FLAG_WRITE) == 0))
+ __unregister_request(osdc, req);

+
+ mutex_unlock(&osdc->request_mutex);
+

+ if (req->r_callback)
+ req->r_callback(req, msg);
+ else
+ complete(&req->r_completion);
+
+ if (flags & CEPH_OSD_FLAG_ONDISK) {
+ if (req->r_safe_callback)
+ req->r_safe_callback(req, msg);
+ complete(&req->r_safe_completion); /* fsync waiter */
+ }
+
+done:
+ ceph_osdc_put_request(req);
+ return;
+
+bad:
+ pr_err("corrupt osd_op_reply got %d %d expected %d\n",
+ (int)msg->front.iov_len, le32_to_cpu(msg->hdr.front_len),
+ (int)sizeof(*rhead));
+ ceph_msg_dump(msg);
+}
+
+
+static int __kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ struct ceph_osd_request *req;
+ struct rb_node *p, *n;
+ int needmap = 0;
+ int err;
+
+ dout("kick_requests osd%d\n", kickosd ? kickosd->o_osd : -1);
+ if (kickosd) {
+ __reset_osd(osdc, kickosd);
+ } else {
+ for (p = rb_first(&osdc->osds); p; p = n) {
+ struct ceph_osd *osd =
+ rb_entry(p, struct ceph_osd, o_node);
+
+ n = rb_next(p);
+ if (!ceph_osd_is_up(osdc->osdmap, osd->o_osd) ||
+ memcmp(&osd->o_con.peer_addr,
+ ceph_osd_addr(osdc->osdmap,
+ osd->o_osd),
+ sizeof(struct ceph_entity_addr)) != 0)
+ __reset_osd(osdc, osd);
+ }
+ }
+

+ for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {

+ req = rb_entry(p, struct ceph_osd_request, r_node);
+

+ if (req->r_resend) {
+ dout(" r_resend set on tid %llu\n", req->r_tid);
+ __cancel_request(req);
+ goto kick;
+ }
+ if (req->r_osd && kickosd == req->r_osd) {
+ __cancel_request(req);
+ goto kick;
+ }
+
+ err = __map_osds(osdc, req);
+ if (err == 0)
+ continue; /* no change */

+ if (err < 0) {
+ /*

+ * FIXME: really, we should set the request
+ * error and fail if this isn't a 'nofail'
+ * request, but that's a fair bit more
+ * complicated to do. So retry!
+ */
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ continue;
+ }
+ if (req->r_osd == NULL) {
+ dout("tid %llu maps to no valid osd\n", req->r_tid);
+ needmap++; /* request a newer map */
+ continue;
+ }
+
+kick:
+ dout("kicking %p tid %llu osd%d\n", req, req->r_tid,
+ req->r_osd->o_osd);
+ req->r_flags |= CEPH_OSD_FLAG_RETRY;
+ err = __send_request(osdc, req);
+ if (err) {
+ dout(" setting r_resend on %llu\n", req->r_tid);
+ req->r_resend = true;
+ }
+ }
+
+ return needmap;
+}
+
+/*
+ * Resubmit osd requests whose osd or osd address has changed. Request
+ * a new osd map if osds are down, or we are otherwise unable to determine
+ * how to direct a request.
+ *
+ * Close connections to down osds.
+ *
+ * If @who is specified, resubmit requests for that specific osd.
+ *
+ * Caller should hold map_sem for read and request_mutex.
+ */
+static void kick_requests(struct ceph_osd_client *osdc,
+ struct ceph_osd *kickosd)
+{
+ int needmap;
+
+ mutex_lock(&osdc->request_mutex);
+ needmap = __kick_requests(osdc, kickosd);
+ mutex_unlock(&osdc->request_mutex);
+
+ if (needmap) {
+ dout("%d requests for down osds, need new map\n", needmap);
+ ceph_monc_request_next_osdmap(&osdc->client->monc);
+ }
+
+}
+/*
+ * Process updated osd map.
+ *
+ * The message contains any number of incremental and full maps, normally
+ * indicating some sort of topology change in the cluster. Kick requests
+ * off to different OSDs as needed.
+ */
+void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
+{
+ void *p, *end, *next;
+ u32 nr_maps, maplen;
+ u32 epoch;
+ struct ceph_osdmap *newmap = NULL, *oldmap;
+ int err;

+ struct ceph_fsid fsid;
+

+ dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
+ p = msg->front.iov_base;
+ end = p + msg->front.iov_len;
+
+ /* verify fsid */
+ ceph_decode_need(&p, end, sizeof(fsid), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ if (ceph_check_fsid(osdc->client, &fsid) < 0)
+ return;
+
+ down_write(&osdc->map_sem);
+
+ /* incremental maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d inc maps\n", nr_maps);
+ while (nr_maps > 0) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ next = p + maplen;
+ if (osdc->osdmap && osdc->osdmap->epoch+1 == epoch) {
+ dout("applying incremental map %u len %d\n",
+ epoch, maplen);
+ newmap = osdmap_apply_incremental(&p, next,
+ osdc->osdmap,
+ osdc->client->msgr);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ if (newmap != osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = newmap;
+ }
+ } else {
+ dout("ignoring incremental map %u len %d\n",
+ epoch, maplen);
+ }
+ p = next;
+ nr_maps--;
+ }
+ if (newmap)
+ goto done;
+
+ /* full maps */
+ ceph_decode_32_safe(&p, end, nr_maps, bad);
+ dout(" %d full maps\n", nr_maps);
+ while (nr_maps) {
+ ceph_decode_need(&p, end, 2*sizeof(u32), bad);
+ epoch = ceph_decode_32(&p);
+ maplen = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, maplen, bad);
+ if (nr_maps > 1) {
+ dout("skipping non-latest full map %u len %d\n",
+ epoch, maplen);
+ } else if (osdc->osdmap && osdc->osdmap->epoch >= epoch) {
+ dout("skipping full map %u len %d, "
+ "older than our %u\n", epoch, maplen,
+ osdc->osdmap->epoch);
+ } else {
+ dout("taking full map %u len %d\n", epoch, maplen);
+ newmap = osdmap_decode(&p, p+maplen);
+ if (IS_ERR(newmap)) {
+ err = PTR_ERR(newmap);
+ goto bad;
+ }
+ BUG_ON(!newmap);
+ oldmap = osdc->osdmap;
+ osdc->osdmap = newmap;
+ if (oldmap)
+ ceph_osdmap_destroy(oldmap);
+ }
+ p += maplen;
+ nr_maps--;
+ }
+
+done:
+ downgrade_write(&osdc->map_sem);
+ ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch);
+ if (newmap)
+ kick_requests(osdc, NULL);
+ up_read(&osdc->map_sem);
+ return;
+
+bad:
+ pr_err("osdc handle_map corrupt msg\n");
+ ceph_msg_dump(msg);
+ up_write(&osdc->map_sem);
+ return;
+}
+
+
+/*
+ * A read request prepares specific pages that data is to be read into.
+ * When a message is being read off the wire, we call prepare_pages to
+ * find those pages.
+ * 0 = success, -1 failure.
+ */
+static int __prepare_pages(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ struct ceph_osd_request *req,
+ u64 tid,
+ struct ceph_msg *m)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int ret = -1;
+ int data_len = le32_to_cpu(hdr->data_len);
+ unsigned data_off = le16_to_cpu(hdr->data_off);
+
+ int want = calc_pages_for(data_off & ~PAGE_MASK, data_len);
+
+ if (!osd)
+ return -1;
+
+ osdc = osd->o_osdc;
+
+ dout("__prepare_pages on msg %p tid %llu, has %d pages, want %d\n", m,
+ tid, req->r_num_pages, want);
+ if (unlikely(req->r_num_pages < want))
+ goto out;
+ m->pages = req->r_pages;
+ m->nr_pages = req->r_num_pages;
+ ret = 0; /* success */
+out:
+ BUG_ON(ret < 0 || m->nr_pages < want);
+
+ return ret;
+}
+
+/*
+ * Register request, send initial attempt.
+ */
+int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail)
+{

+ int rc = 0;
+

+ req->r_request->pages = req->r_pages;
+ req->r_request->nr_pages = req->r_num_pages;
+
+ register_request(osdc, req);
+
+ down_read(&osdc->map_sem);
+ mutex_lock(&osdc->request_mutex);
+ /*
+ * a racing kick_requests() may have sent the message for us
+ * while we dropped request_mutex above, so only send now if
+ * the request still han't been touched yet.
+ */
+ if (req->r_sent == 0) {
+ rc = __send_request(osdc, req);
+ if (rc) {
+ if (nofail) {
+ dout("osdc_start_request failed send, "
+ " marking %lld\n", req->r_tid);
+ req->r_resend = true;
+ rc = 0;
+ } else {
+ __unregister_request(osdc, req);
+ }
+ }
+ }
+ mutex_unlock(&osdc->request_mutex);
+ up_read(&osdc->map_sem);
+ return rc;
+}
+
+/*
+ * wait for a request to complete
+ */
+int ceph_osdc_wait_request(struct ceph_osd_client *osdc,

+ struct ceph_osd_request *req)
+{

+ int rc;
+
+ rc = wait_for_completion_interruptible(&req->r_completion);
+ if (rc < 0) {
+ mutex_lock(&osdc->request_mutex);
+ __cancel_request(req);
+ __unregister_request(osdc, req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("wait_request tid %llu canceled/timed out\n", req->r_tid);
+ return rc;
+ }
+
+ dout("wait_request tid %llu result %d\n", req->r_tid, req->r_result);
+ return req->r_result;
+}
+
+/*
+ * sync - wait for all in-flight requests to flush. avoid starvation.
+ */
+void ceph_osdc_sync(struct ceph_osd_client *osdc)
+{
+ struct ceph_osd_request *req;
+ u64 last_tid, next_tid = 0;
+
+ mutex_lock(&osdc->request_mutex);
+ last_tid = osdc->last_tid;
+ while (1) {
+ req = __lookup_request_ge(osdc, next_tid);
+ if (!req)
+ break;
+ if (req->r_tid > last_tid)
+ break;
+
+ next_tid = req->r_tid + 1;
+ if ((req->r_flags & CEPH_OSD_FLAG_WRITE) == 0)
+ continue;
+
+ ceph_osdc_get_request(req);
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync waiting on tid %llu (last is %llu)\n",
+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);
+ mutex_lock(&osdc->request_mutex);
+ ceph_osdc_put_request(req);
+ }
+ mutex_unlock(&osdc->request_mutex);
+ dout("sync done (thru tid %llu)\n", last_tid);
+}
+
+/*
+ * init, shutdown
+ */
+int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
+{
+ int err;
+
+ dout("init\n");
+ osdc->client = client;
+ osdc->osdmap = NULL;
+ init_rwsem(&osdc->map_sem);
+ init_completion(&osdc->map_waiters);
+ osdc->last_requested_map = 0;
+ mutex_init(&osdc->request_mutex);
+ osdc->last_tid = 0;
+ osdc->osds = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->osd_lru);
+ osdc->requests = RB_ROOT;
+ INIT_LIST_HEAD(&osdc->req_lru);
+ osdc->num_requests = 0;
+ INIT_DELAYED_WORK(&osdc->timeout_work, handle_timeout);
+ INIT_DELAYED_WORK(&osdc->osds_timeout_work, handle_osds_timeout);
+
+ schedule_delayed_work(&osdc->osds_timeout_work,
+ round_jiffies_relative(osdc->client->mount_args->osd_idle_ttl * HZ));
+
+ err = -ENOMEM;
+ osdc->req_mempool = mempool_create_kmalloc_pool(10,
+ sizeof(struct ceph_osd_request));
+ if (!osdc->req_mempool)
+ goto out;
+
+ err = ceph_msgpool_init(&osdc->msgpool_op, OSD_OP_FRONT_LEN, 10, true);
+ if (err < 0)
+ goto out_mempool;
+ err = ceph_msgpool_init(&osdc->msgpool_op_reply,
+ OSD_OPREPLY_FRONT_LEN, 10, true);
+ if (err < 0)
+ goto out_msgpool;
+ return 0;
+
+out_msgpool:
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+out_mempool:
+ mempool_destroy(osdc->req_mempool);

+out:
+ return err;
+}
+

+void ceph_osdc_stop(struct ceph_osd_client *osdc)
+{
+ cancel_delayed_work_sync(&osdc->timeout_work);
+ cancel_delayed_work_sync(&osdc->osds_timeout_work);
+ if (osdc->osdmap) {
+ ceph_osdmap_destroy(osdc->osdmap);
+ osdc->osdmap = NULL;
+ }
+ remove_old_osds(osdc, 1);
+ mempool_destroy(osdc->req_mempool);
+ ceph_msgpool_destroy(&osdc->msgpool_op);
+ ceph_msgpool_destroy(&osdc->msgpool_op_reply);
+}
+
+/*
+ * Read some contiguous pages. If we cross a stripe boundary, shorten
+ * *plen. Return number of bytes read, or error.
+ */
+int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino, struct ceph_file_layout *layout,
+ u64 off, u64 *plen,

+ u32 truncate_seq, u64 truncate_size,

+ struct page **pages, int num_pages)
+{
+ struct ceph_osd_request *req;
+ int rc = 0;
+
+ dout("readpages on ino %llx.%llx on %llu~%llu\n", vino.ino,
+ vino.snap, off, *plen);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, plen,
+ CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
+ NULL, 0, truncate_seq, truncate_size, NULL,
+ false, 1);

+ if (IS_ERR(req))
+ return PTR_ERR(req);
+

+ /* it may be a short read due to an object boundary */
+ req->r_pages = pages;
+ num_pages = calc_pages_for(off, *plen);
+ req->r_num_pages = num_pages;
+
+ dout("readpages final extent is %llu~%llu (%d pages)\n",
+ off, *plen, req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, false);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ dout("readpages result %d\n", rc);
+ return rc;
+}
+
+/*
+ * do a synchronous write on N pages
+ */
+int ceph_osdc_writepages(struct ceph_osd_client *osdc, struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *snapc,
+ u64 off, u64 len,

+ u32 truncate_seq, u64 truncate_size,

+ struct timespec *mtime,
+ struct page **pages, int num_pages,
+ int flags, int do_sync, bool nofail)
+{
+ struct ceph_osd_request *req;

+ int rc = 0;
+

+ BUG_ON(vino.snap != CEPH_NOSNAP);
+ req = ceph_osdc_new_request(osdc, layout, vino, off, &len,
+ CEPH_OSD_OP_WRITE,
+ flags | CEPH_OSD_FLAG_ONDISK |
+ CEPH_OSD_FLAG_WRITE,
+ snapc, do_sync,
+ truncate_seq, truncate_size, mtime,
+ nofail, 1);

+ if (IS_ERR(req))
+ return PTR_ERR(req);
+

+ /* it may be a short write due to an object boundary */
+ req->r_pages = pages;
+ req->r_num_pages = calc_pages_for(off, len);
+ dout("writepages %llu~%llu (%d pages)\n", off, len,
+ req->r_num_pages);
+
+ rc = ceph_osdc_start_request(osdc, req, nofail);
+ if (!rc)
+ rc = ceph_osdc_wait_request(osdc, req);
+
+ ceph_osdc_put_request(req);
+ if (rc == 0)
+ rc = len;
+ dout("writepages result %d\n", rc);
+ return rc;
+}
+
+/*
+ * handle incoming message
+ */
+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc;
+ int type = le16_to_cpu(msg->hdr.type);
+
+ if (!osd)
+ return;
+ osdc = osd->o_osdc;
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(osdc, msg);
+ break;
+ case CEPH_MSG_OSD_OPREPLY:
+ handle_reply(osdc, msg, con);
+ break;
+
+ default:
+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);
+}
+
+/*
+ * lookup and return message for incoming reply
+ */
+static struct ceph_msg *get_reply(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ struct ceph_osd_client *osdc = osd->o_osdc;
+ struct ceph_msg *m;
+ struct ceph_osd_request *req;
+ int front = le32_to_cpu(hdr->front_len);
+ int data_len = le32_to_cpu(hdr->data_len);
+ u64 tid;
+ int err;
+
+ tid = le64_to_cpu(hdr->tid);
+ mutex_lock(&osdc->request_mutex);
+ req = __lookup_request(osdc, tid);
+ if (!req) {
+ *skip = 1;
+ m = NULL;
+ pr_info("get_reply unknown tid %llu from osd%d\n", tid,
+ osd->o_osd);

+ goto out;
+ }
+

+ if (req->r_con_filling_msg) {
+ dout("get_reply revoking msg %p from old con %p\n",
+ req->r_reply, req->r_con_filling_msg);
+ ceph_con_revoke_message(req->r_con_filling_msg, req->r_reply);
+ ceph_con_put(req->r_con_filling_msg);
+ }
+
+ if (front > req->r_reply->front.iov_len) {
+ pr_warning("get_reply front %d > preallocated %d\n",
+ front, (int)req->r_reply->front.iov_len);
+ m = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, front, 0, 0, NULL);
+ if (IS_ERR(m))
+ goto out;
+ ceph_msg_put(req->r_reply);
+ req->r_reply = m;
+ }
+ m = ceph_msg_get(req->r_reply);
+
+ if (data_len > 0) {
+ err = __prepare_pages(con, hdr, req, tid, m);
+ if (err < 0) {
+ *skip = 1;
+ ceph_msg_put(m);
+ m = ERR_PTR(err);
+ }
+ }
+ *skip = 0;
+ req->r_con_filling_msg = ceph_con_get(con);
+ dout("get_reply tid %lld %p\n", tid, m);
+
+out:
+ mutex_unlock(&osdc->request_mutex);
+ return m;
+
+}
+
+static struct ceph_msg *alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip)
+{
+ struct ceph_osd *osd = con->private;
+ int type = le16_to_cpu(hdr->type);
+ int front = le32_to_cpu(hdr->front_len);
+
+ switch (type) {
+ case CEPH_MSG_OSD_MAP:
+ return ceph_msg_new(type, front, 0, 0, NULL);
+ case CEPH_MSG_OSD_OPREPLY:
+ return get_reply(con, hdr, skip);
+ default:
+ pr_info("alloc_msg unexpected msg type %d from osd%d\n", type,
+ osd->o_osd);
+ *skip = 1;

+ return NULL;
+ }
+}
+

+/*
+ * Wrappers to refcount containing ceph_osd struct
+ */
+static struct ceph_connection *get_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ if (get_osd(osd))
+ return con;

+ return NULL;
+}
+

+static void put_osd_con(struct ceph_connection *con)
+{
+ struct ceph_osd *osd = con->private;
+ put_osd(osd);
+}
+
+/*
+ * authentication
+ */
+static int get_authorizer(struct ceph_connection *con,
+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;

+ int ret = 0;
+

+ if (force_new && o->o_authorizer) {
+ ac->ops->destroy_authorizer(ac, o->o_authorizer);
+ o->o_authorizer = NULL;
+ }
+ if (o->o_authorizer == NULL) {
+ ret = ac->ops->create_authorizer(
+ ac, CEPH_ENTITY_TYPE_OSD,
+ &o->o_authorizer,
+ &o->o_authorizer_buf,
+ &o->o_authorizer_buf_len,
+ &o->o_authorizer_reply_buf,
+ &o->o_authorizer_reply_buf_len);
+ if (ret)

+ return ret;
+ }
+

+ *proto = ac->protocol;
+ *buf = o->o_authorizer_buf;
+ *len = o->o_authorizer_buf_len;
+ *reply_buf = o->o_authorizer_reply_buf;
+ *reply_len = o->o_authorizer_reply_buf_len;

+ return 0;
+}
+
+

+static int verify_authorizer_reply(struct ceph_connection *con, int len)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ return ac->ops->verify_authorizer_reply(ac, o->o_authorizer, len);
+}
+
+static int invalidate_authorizer(struct ceph_connection *con)
+{
+ struct ceph_osd *o = con->private;
+ struct ceph_osd_client *osdc = o->o_osdc;
+ struct ceph_auth_client *ac = osdc->client->monc.auth;
+
+ if (ac->ops->invalidate_authorizer)
+ ac->ops->invalidate_authorizer(ac, CEPH_ENTITY_TYPE_OSD);
+
+ return ceph_monc_validate_auth(&osdc->client->monc);
+}
+
+const static struct ceph_connection_operations osd_con_ops = {
+ .get = get_osd_con,
+ .put = put_osd_con,
+ .dispatch = dispatch,
+ .get_authorizer = get_authorizer,
+ .verify_authorizer_reply = verify_authorizer_reply,
+ .invalidate_authorizer = invalidate_authorizer,
+ .alloc_msg = alloc_msg,
+ .fault = osd_reset,
+};
diff --git a/fs/ceph/osd_client.h b/fs/ceph/osd_client.h
new file mode 100644
index 0000000..1b1a3ca
--- /dev/null
+++ b/fs/ceph/osd_client.h
@@ -0,0 +1,166 @@
+#ifndef _FS_CEPH_OSD_CLIENT_H
+#define _FS_CEPH_OSD_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/kref.h>
+#include <linux/mempool.h>
+#include <linux/rbtree.h>
+
+#include "types.h"
+#include "osdmap.h"
+#include "messenger.h"
+
+struct ceph_msg;
+struct ceph_snap_context;
+struct ceph_osd_request;
+struct ceph_osd_client;
+struct ceph_authorizer;
+
+/*
+ * completion callback for async writepages
+ */
+typedef void (*ceph_osdc_callback_t)(struct ceph_osd_request *,
+ struct ceph_msg *);
+
+/* a given osd we're communicating with */
+struct ceph_osd {
+ atomic_t o_ref;
+ struct ceph_osd_client *o_osdc;
+ int o_osd;
+ int o_incarnation;
+ struct rb_node o_node;
+ struct ceph_connection o_con;
+ struct list_head o_requests;
+ struct list_head o_osd_lru;
+ struct ceph_authorizer *o_authorizer;
+ void *o_authorizer_buf, *o_authorizer_reply_buf;
+ size_t o_authorizer_buf_len, o_authorizer_reply_buf_len;
+ unsigned long lru_ttl;
+ int o_marked_for_keepalive;
+ struct list_head o_keepalive_item;
+};
+
+/* an in-flight request */
+struct ceph_osd_request {
+ u64 r_tid; /* unique for this client */
+ struct rb_node r_node;
+ struct list_head r_req_lru_item;
+ struct list_head r_osd_item;
+ struct ceph_osd *r_osd;
+ struct ceph_pg r_pgid;
+
+ struct ceph_connection *r_con_filling_msg;
+
+ struct ceph_msg *r_request, *r_reply;
+ int r_result;
+ int r_flags; /* any additional flags for the osd */
+ u32 r_sent; /* >0 if r_request is sending/sent */
+ int r_got_reply;
+
+ struct ceph_osd_client *r_osdc;
+ struct kref r_kref;
+ bool r_mempool;
+ struct completion r_completion, r_safe_completion;
+ ceph_osdc_callback_t r_callback, r_safe_callback;
+ struct ceph_eversion r_reassert_version;
+ struct list_head r_unsafe_item;
+
+ struct inode *r_inode; /* for use by callbacks */
+ struct writeback_control *r_wbc; /* ditto */
+
+ char r_oid[40]; /* object name */
+ int r_oid_len;
+ unsigned long r_sent_stamp;
+ bool r_resend; /* msg send failed, needs retry */
+
+ struct ceph_file_layout r_file_layout;
+ struct ceph_snap_context *r_snapc; /* snap context for writes */
+ unsigned r_num_pages; /* size of page array (follows) */
+ struct page **r_pages; /* pages for data payload */
+ int r_pages_from_pool;
+ int r_own_pages; /* if true, i own page list */
+};
+
+struct ceph_osd_client {

+ struct ceph_client *client;
+

+ struct ceph_osdmap *osdmap; /* current map */
+ struct rw_semaphore map_sem;
+ struct completion map_waiters;
+ u64 last_requested_map;
+
+ struct mutex request_mutex;
+ struct rb_root osds; /* osds */
+ struct list_head osd_lru; /* idle osds */
+ u64 timeout_tid; /* tid of timeout triggering rq */
+ u64 last_tid; /* tid of last request */
+ struct rb_root requests; /* pending requests */
+ struct list_head req_lru; /* pending requests lru */
+ int num_requests;
+ struct delayed_work timeout_work;
+ struct delayed_work osds_timeout_work;
+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+
+ mempool_t *req_mempool;
+
+ struct ceph_msgpool msgpool_op;
+ struct ceph_msgpool msgpool_op_reply;
+};
+
+extern int ceph_osdc_init(struct ceph_osd_client *osdc,
+ struct ceph_client *client);
+extern void ceph_osdc_stop(struct ceph_osd_client *osdc);
+
+extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
+ struct ceph_msg *msg);
+extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,

+ struct ceph_msg *msg);
+

+extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
+ struct ceph_file_layout *layout,
+ struct ceph_vino vino,
+ u64 offset, u64 *len, int op, int flags,
+ struct ceph_snap_context *snapc,
+ int do_sync, u32 truncate_seq,
+ u64 truncate_size,
+ struct timespec *mtime,
+ bool use_mempool, int num_reply);
+
+static inline void ceph_osdc_get_request(struct ceph_osd_request *req)
+{
+ kref_get(&req->r_kref);
+}
+extern void ceph_osdc_release_request(struct kref *kref);
+static inline void ceph_osdc_put_request(struct ceph_osd_request *req)
+{
+ kref_put(&req->r_kref, ceph_osdc_release_request);
+}
+
+extern int ceph_osdc_start_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req,
+ bool nofail);
+extern int ceph_osdc_wait_request(struct ceph_osd_client *osdc,
+ struct ceph_osd_request *req);
+extern void ceph_osdc_sync(struct ceph_osd_client *osdc);
+
+extern int ceph_osdc_readpages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ u64 off, u64 *plen,

+ u32 truncate_seq, u64 truncate_size,

+ struct page **pages, int nr_pages);
+
+extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
+ struct ceph_vino vino,
+ struct ceph_file_layout *layout,
+ struct ceph_snap_context *sc,
+ u64 off, u64 len,

+ u32 truncate_seq, u64 truncate_size,

+ struct timespec *mtime,
+ struct page **pages, int nr_pages,
+ int flags, int do_sync, bool nofail);
+
+#endif
+
diff --git a/fs/ceph/osdmap.c b/fs/ceph/osdmap.c
new file mode 100644
index 0000000..b83f269
--- /dev/null
+++ b/fs/ceph/osdmap.c
@@ -0,0 +1,1019 @@
+
+#include <asm/div64.h>
+
+#include "super.h"
+#include "osdmap.h"
+#include "crush/hash.h"
+#include "crush/mapper.h"
+#include "decode.h"
+#include "ceph_debug.h"
+
+char *ceph_osdmap_state_str(char *str, int len, int state)
+{
+ int flag = 0;
+
+ if (!len)
+ goto done;
+
+ *str = '\0';
+ if (state) {
+ if (state & CEPH_OSD_EXISTS) {
+ snprintf(str, len, "exists");
+ flag = 1;
+ }
+ if (state & CEPH_OSD_UP) {
+ snprintf(str, len, "%s%s%s", str, (flag ? ", " : ""),
+ "up");
+ flag = 1;
+ }
+ } else {
+ snprintf(str, len, "doesn't exist");
+ }
+done:
+ return str;
+}
+
+/* maps */
+
+static int calc_bits_of(unsigned t)
+{
+ int b = 0;
+ while (t) {
+ t = t >> 1;
+ b++;
+ }
+ return b;
+}
+
+/*
+ * the foo_mask is the smallest value 2^n-1 that is >= foo.
+ */
+static void calc_pg_masks(struct ceph_pg_pool_info *pi)
+{
+ pi->pg_num_mask = (1 << calc_bits_of(le32_to_cpu(pi->v.pg_num)-1)) - 1;
+ pi->pgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.pgp_num)-1)) - 1;
+ pi->lpg_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpg_num)-1)) - 1;
+ pi->lpgp_num_mask =
+ (1 << calc_bits_of(le32_to_cpu(pi->v.lpgp_num)-1)) - 1;
+}
+
+/*
+ * decode crush map
+ */
+static int crush_decode_uniform_bucket(void **p, void *end,
+ struct crush_bucket_uniform *b)
+{
+ dout("crush_decode_uniform_bucket %p to %p\n", *p, end);
+ ceph_decode_need(p, end, (1+b->h.size) * sizeof(u32), bad);
+ b->item_weight = ceph_decode_32(p);
+ return 0;
+bad:

+ return -EINVAL;
+}
+

+static int crush_decode_list_bucket(void **p, void *end,
+ struct crush_bucket_list *b)
+{
+ int j;
+ dout("crush_decode_list_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->sum_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->sum_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->sum_weights[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:

+ return -EINVAL;
+}
+

+static int crush_decode_tree_bucket(void **p, void *end,
+ struct crush_bucket_tree *b)
+{
+ int j;
+ dout("crush_decode_tree_bucket %p to %p\n", *p, end);
+ ceph_decode_32_safe(p, end, b->num_nodes, bad);
+ b->node_weights = kcalloc(b->num_nodes, sizeof(u32), GFP_NOFS);
+ if (b->node_weights == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, b->num_nodes * sizeof(u32), bad);
+ for (j = 0; j < b->num_nodes; j++)
+ b->node_weights[j] = ceph_decode_32(p);
+ return 0;
+bad:

+ return -EINVAL;
+}
+

+static int crush_decode_straw_bucket(void **p, void *end,
+ struct crush_bucket_straw *b)
+{
+ int j;
+ dout("crush_decode_straw_bucket %p to %p\n", *p, end);
+ b->item_weights = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->item_weights == NULL)
+ return -ENOMEM;
+ b->straws = kcalloc(b->h.size, sizeof(u32), GFP_NOFS);
+ if (b->straws == NULL)
+ return -ENOMEM;
+ ceph_decode_need(p, end, 2 * b->h.size * sizeof(u32), bad);
+ for (j = 0; j < b->h.size; j++) {
+ b->item_weights[j] = ceph_decode_32(p);
+ b->straws[j] = ceph_decode_32(p);
+ }
+ return 0;
+bad:

+ return -EINVAL;
+}
+

+static struct crush_map *crush_decode(void *pbyval, void *end)
+{
+ struct crush_map *c;
+ int err = -EINVAL;
+ int i, j;
+ void **p = &pbyval;
+ void *start = pbyval;
+ u32 magic;
+
+ dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ c = kzalloc(sizeof(*c), GFP_NOFS);
+ if (c == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ magic = ceph_decode_32(p);
+ if (magic != CRUSH_MAGIC) {
+ pr_err("crush_decode magic %x != current %x\n",
+ (unsigned)magic, (unsigned)CRUSH_MAGIC);
+ goto bad;
+ }
+ c->max_buckets = ceph_decode_32(p);
+ c->max_rules = ceph_decode_32(p);
+ c->max_devices = ceph_decode_32(p);
+
+ c->device_parents = kcalloc(c->max_devices, sizeof(u32), GFP_NOFS);
+ if (c->device_parents == NULL)
+ goto badmem;
+ c->bucket_parents = kcalloc(c->max_buckets, sizeof(u32), GFP_NOFS);
+ if (c->bucket_parents == NULL)
+ goto badmem;
+
+ c->buckets = kcalloc(c->max_buckets, sizeof(*c->buckets), GFP_NOFS);
+ if (c->buckets == NULL)
+ goto badmem;
+ c->rules = kcalloc(c->max_rules, sizeof(*c->rules), GFP_NOFS);
+ if (c->rules == NULL)
+ goto badmem;
+
+ /* buckets */
+ for (i = 0; i < c->max_buckets; i++) {
+ int size = 0;
+ u32 alg;
+ struct crush_bucket *b;
+
+ ceph_decode_32_safe(p, end, alg, bad);
+ if (alg == 0) {
+ c->buckets[i] = NULL;
+ continue;
+ }
+ dout("crush_decode bucket %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ size = sizeof(struct crush_bucket_uniform);
+ break;
+ case CRUSH_BUCKET_LIST:
+ size = sizeof(struct crush_bucket_list);
+ break;
+ case CRUSH_BUCKET_TREE:
+ size = sizeof(struct crush_bucket_tree);
+ break;
+ case CRUSH_BUCKET_STRAW:
+ size = sizeof(struct crush_bucket_straw);
+ break;
+ default:
+ err = -EINVAL;
+ goto bad;
+ }
+ BUG_ON(size == 0);
+ b = c->buckets[i] = kzalloc(size, GFP_NOFS);
+ if (b == NULL)
+ goto badmem;
+
+ ceph_decode_need(p, end, 4*sizeof(u32), bad);
+ b->id = ceph_decode_32(p);
+ b->type = ceph_decode_16(p);
+ b->alg = ceph_decode_8(p);
+ b->hash = ceph_decode_8(p);
+ b->weight = ceph_decode_32(p);
+ b->size = ceph_decode_32(p);
+
+ dout("crush_decode bucket size %d off %x %p to %p\n",
+ b->size, (int)(*p-start), *p, end);
+
+ b->items = kcalloc(b->size, sizeof(__s32), GFP_NOFS);
+ if (b->items == NULL)
+ goto badmem;
+ b->perm = kcalloc(b->size, sizeof(u32), GFP_NOFS);
+ if (b->perm == NULL)
+ goto badmem;
+ b->perm_n = 0;
+
+ ceph_decode_need(p, end, b->size*sizeof(u32), bad);
+ for (j = 0; j < b->size; j++)
+ b->items[j] = ceph_decode_32(p);
+
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ err = crush_decode_uniform_bucket(p, end,
+ (struct crush_bucket_uniform *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_LIST:
+ err = crush_decode_list_bucket(p, end,
+ (struct crush_bucket_list *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_TREE:
+ err = crush_decode_tree_bucket(p, end,
+ (struct crush_bucket_tree *)b);
+ if (err < 0)
+ goto bad;
+ break;
+ case CRUSH_BUCKET_STRAW:
+ err = crush_decode_straw_bucket(p, end,
+ (struct crush_bucket_straw *)b);
+ if (err < 0)
+ goto bad;

+ break;
+ }
+ }
+

+ /* rules */
+ dout("rule vec is %p\n", c->rules);
+ for (i = 0; i < c->max_rules; i++) {
+ u32 yes;
+ struct crush_rule *r;
+
+ ceph_decode_32_safe(p, end, yes, bad);
+ if (!yes) {
+ dout("crush_decode NO rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+ c->rules[i] = NULL;
+ continue;
+ }
+
+ dout("crush_decode rule %d off %x %p to %p\n",
+ i, (int)(*p-start), *p, end);
+
+ /* len */
+ ceph_decode_32_safe(p, end, yes, bad);
+#if BITS_PER_LONG == 32
+ err = -EINVAL;
+ if (yes > ULONG_MAX / sizeof(struct crush_rule_step))
+ goto bad;
+#endif
+ r = c->rules[i] = kmalloc(sizeof(*r) +
+ yes*sizeof(struct crush_rule_step),
+ GFP_NOFS);
+ if (r == NULL)
+ goto badmem;
+ dout(" rule %d is at %p\n", i, r);
+ r->len = yes;
+ ceph_decode_copy_safe(p, end, &r->mask, 4, bad); /* 4 u8's */
+ ceph_decode_need(p, end, r->len*3*sizeof(u32), bad);
+ for (j = 0; j < r->len; j++) {
+ r->steps[j].op = ceph_decode_32(p);
+ r->steps[j].arg1 = ceph_decode_32(p);
+ r->steps[j].arg2 = ceph_decode_32(p);
+ }
+ }
+
+ /* ignore trailing name maps. */
+
+ dout("crush_decode success\n");
+ return c;
+
+badmem:
+ err = -ENOMEM;
+bad:
+ dout("crush_decode fail %d\n", err);
+ crush_destroy(c);
+ return ERR_PTR(err);
+}
+
+
+/*
+ * osd map
+ */
+void ceph_osdmap_destroy(struct ceph_osdmap *map)
+{
+ dout("osdmap_destroy %p\n", map);
+ if (map->crush)
+ crush_destroy(map->crush);
+ while (!RB_EMPTY_ROOT(&map->pg_temp)) {
+ struct ceph_pg_mapping *pg =
+ rb_entry(rb_first(&map->pg_temp),
+ struct ceph_pg_mapping, node);
+ rb_erase(&pg->node, &map->pg_temp);
+ kfree(pg);
+ }
+ while (!RB_EMPTY_ROOT(&map->pg_pools)) {
+ struct ceph_pg_pool_info *pi =
+ rb_entry(rb_first(&map->pg_pools),
+ struct ceph_pg_pool_info, node);
+ rb_erase(&pi->node, &map->pg_pools);
+ kfree(pi);
+ }
+ kfree(map->osd_state);
+ kfree(map->osd_weight);
+ kfree(map->osd_addr);
+ kfree(map);
+}
+
+/*
+ * adjust max osd value. reallocate arrays.
+ */
+static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
+{
+ u8 *state;
+ struct ceph_entity_addr *addr;
+ u32 *weight;
+
+ state = kcalloc(max, sizeof(*state), GFP_NOFS);
+ addr = kcalloc(max, sizeof(*addr), GFP_NOFS);
+ weight = kcalloc(max, sizeof(*weight), GFP_NOFS);
+ if (state == NULL || addr == NULL || weight == NULL) {
+ kfree(state);
+ kfree(addr);
+ kfree(weight);
+ return -ENOMEM;
+ }
+
+ /* copy old? */
+ if (map->osd_state) {
+ memcpy(state, map->osd_state, map->max_osd*sizeof(*state));
+ memcpy(addr, map->osd_addr, map->max_osd*sizeof(*addr));
+ memcpy(weight, map->osd_weight, map->max_osd*sizeof(*weight));
+ kfree(map->osd_state);
+ kfree(map->osd_addr);
+ kfree(map->osd_weight);
+ }
+
+ map->osd_state = state;
+ map->osd_weight = weight;
+ map->osd_addr = addr;
+ map->max_osd = max;

+ return 0;
+}
+

+/*
+ * rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
+ * to a set of osds)
+ */
+static int pgid_cmp(struct ceph_pg l, struct ceph_pg r)
+{
+ u64 a = *(u64 *)&l;
+ u64 b = *(u64 *)&r;
+
+ if (a < b)
+ return -1;
+ if (a > b)

+ return 1;
+ return 0;
+}

+
+static int __insert_pg_mapping(struct ceph_pg_mapping *new,
+ struct rb_root *root)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_mapping *pg = NULL;
+ int c;
+
+ while (*p) {
+ parent = *p;
+ pg = rb_entry(parent, struct ceph_pg_mapping, node);
+ c = pgid_cmp(new->pgid, pg->pgid);
+ if (c < 0)
+ p = &(*p)->rb_left;
+ else if (c > 0)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);

+ return 0;
+}
+

+static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root,
+ struct ceph_pg pgid)
+{
+ struct rb_node *n = root->rb_node;
+ struct ceph_pg_mapping *pg;
+ int c;
+
+ while (n) {
+ pg = rb_entry(n, struct ceph_pg_mapping, node);
+ c = pgid_cmp(pgid, pg->pgid);
+ if (c < 0)
+ n = n->rb_left;
+ else if (c > 0)
+ n = n->rb_right;
+ else
+ return pg;

+ }
+ return NULL;
+}
+

+/*
+ * rbtree of pg pool info
+ */
+static int __insert_pg_pool(struct rb_root *root, struct ceph_pg_pool_info *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_pg_pool_info *pi = NULL;
+
+ while (*p) {
+ parent = *p;
+ pi = rb_entry(parent, struct ceph_pg_pool_info, node);
+ if (new->id < pi->id)
+ p = &(*p)->rb_left;
+ else if (new->id > pi->id)
+ p = &(*p)->rb_right;
+ else
+ return -EEXIST;
+ }
+
+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);

+ return 0;
+}
+

+static struct ceph_pg_pool_info *__lookup_pg_pool(struct rb_root *root, int id)
+{
+ struct ceph_pg_pool_info *pi;
+ struct rb_node *n = root->rb_node;
+
+ while (n) {
+ pi = rb_entry(n, struct ceph_pg_pool_info, node);
+ if (id < pi->id)
+ n = n->rb_left;
+ else if (id > pi->id)
+ n = n->rb_right;
+ else
+ return pi;

+ }
+ return NULL;
+}
+

+/*
+ * decode a full map.
+ */
+struct ceph_osdmap *osdmap_decode(void **p, void *end)
+{
+ struct ceph_osdmap *map;
+ u16 version;
+ u32 len, max, i;
+ u8 ev;
+ int err = -EINVAL;
+ void *start = *p;
+ struct ceph_pg_pool_info *pi;
+
+ dout("osdmap_decode %p to %p len %d\n", *p, end, (int)(end - *p));
+
+ map = kzalloc(sizeof(*map), GFP_NOFS);
+ if (map == NULL)
+ return ERR_PTR(-ENOMEM);
+ map->pg_temp = RB_ROOT;
+
+ ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_VERSION) {
+ pr_warning("got unknown v %d > %d of osdmap\n", version,
+ CEPH_OSDMAP_VERSION);
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, 2*sizeof(u64)+6*sizeof(u32), bad);
+ ceph_decode_copy(p, &map->fsid, sizeof(map->fsid));
+ map->epoch = ceph_decode_32(p);
+ ceph_decode_copy(p, &map->created, sizeof(map->created));
+ ceph_decode_copy(p, &map->modified, sizeof(map->modified));
+
+ ceph_decode_32_safe(p, end, max, bad);
+ while (max--) {
+ ceph_decode_need(p, end, 4 + 1 + sizeof(pi->v), bad);
+ pi = kmalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi)
+ goto bad;
+ pi->id = ceph_decode_32(p);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ goto bad;
+ }
+ ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+ __insert_pg_pool(&map->pg_pools, pi);
+ calc_pg_masks(pi);
+ *p += le32_to_cpu(pi->v.num_snaps) * sizeof(u64);
+ *p += le32_to_cpu(pi->v.num_removed_snap_intervals)
+ * sizeof(u64) * 2;
+ }
+ ceph_decode_32_safe(p, end, map->pool_max, bad);
+
+ ceph_decode_32_safe(p, end, map->flags, bad);
+
+ max = ceph_decode_32(p);
+
+ /* (re)alloc osd arrays */
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ dout("osdmap_decode max_osd = %d\n", map->max_osd);
+
+ /* osds */
+ err = -EINVAL;
+ ceph_decode_need(p, end, 3*sizeof(u32) +
+ map->max_osd*(1 + sizeof(*map->osd_weight) +
+ sizeof(*map->osd_addr)), bad);
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_state, map->max_osd);
+
+ *p += 4; /* skip length field (should match max) */
+ for (i = 0; i < map->max_osd; i++)
+ map->osd_weight[i] = ceph_decode_32(p);
+
+ *p += 4; /* skip length field (should match max) */
+ ceph_decode_copy(p, map->osd_addr, map->max_osd*sizeof(*map->osd_addr));
+ for (i = 0; i < map->max_osd; i++)
+ ceph_decode_addr(&map->osd_addr[i]);
+
+ /* pg_temp */
+ ceph_decode_32_safe(p, end, len, bad);
+ for (i = 0; i < len; i++) {
+ int n, j;
+ struct ceph_pg pgid;
+ struct ceph_pg_mapping *pg;
+
+ ceph_decode_need(p, end, sizeof(u32) + sizeof(u64), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ n = ceph_decode_32(p);
+ ceph_decode_need(p, end, n * sizeof(u32), bad);
+ err = -ENOMEM;
+ pg = kmalloc(sizeof(*pg) + n*sizeof(u32), GFP_NOFS);
+ if (!pg)
+ goto bad;
+ pg->pgid = pgid;
+ pg->len = n;
+ for (j = 0; j < n; j++)
+ pg->osds[j] = ceph_decode_32(p);
+
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid, len);
+ }
+
+ /* crush */
+ ceph_decode_32_safe(p, end, len, bad);
+ dout("osdmap_decode crush len %d from off 0x%x\n", len,
+ (int)(*p - start));
+ ceph_decode_need(p, end, len, bad);
+ map->crush = crush_decode(*p, end);
+ *p += len;
+ if (IS_ERR(map->crush)) {
+ err = PTR_ERR(map->crush);
+ map->crush = NULL;
+ goto bad;
+ }
+
+ /* ignore the rest of the map */
+ *p = end;
+
+ dout("osdmap_decode done %p %p\n", *p, end);
+ return map;
+
+bad:
+ dout("osdmap_decode fail\n");
+ ceph_osdmap_destroy(map);
+ return ERR_PTR(err);
+}
+
+/*
+ * decode and apply an incremental map update.
+ */
+struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr)
+{
+ struct crush_map *newcrush = NULL;
+ struct ceph_fsid fsid;
+ u32 epoch = 0;
+ struct ceph_timespec modified;
+ u32 len, pool;
+ __s32 new_pool_max, new_flags, max;
+ void *start = *p;
+ int err = -EINVAL;
+ u16 version;
+ struct rb_node *rbp;
+
+ ceph_decode_16_safe(p, end, version, bad);
+ if (version > CEPH_OSDMAP_INC_VERSION) {
+ pr_warning("got unknown v %d > %d of inc osdmap\n", version,
+ CEPH_OSDMAP_INC_VERSION);
+ goto bad;
+ }
+
+ ceph_decode_need(p, end, sizeof(fsid)+sizeof(modified)+2*sizeof(u32),
+ bad);
+ ceph_decode_copy(p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(p);
+ BUG_ON(epoch != map->epoch+1);
+ ceph_decode_copy(p, &modified, sizeof(modified));
+ new_pool_max = ceph_decode_32(p);
+ new_flags = ceph_decode_32(p);
+
+ /* full map? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental full map len %d, %p to %p\n",
+ len, *p, end);
+ return osdmap_decode(p, min(*p+len, end));
+ }
+
+ /* new crush? */
+ ceph_decode_32_safe(p, end, len, bad);
+ if (len > 0) {
+ dout("apply_incremental new crush map len %d, %p to %p\n",
+ len, *p, end);
+ newcrush = crush_decode(*p, min(*p+len, end));
+ if (IS_ERR(newcrush))
+ return ERR_PTR(PTR_ERR(newcrush));
+ }
+
+ /* new flags? */
+ if (new_flags >= 0)
+ map->flags = new_flags;
+ if (new_pool_max >= 0)
+ map->pool_max = new_pool_max;
+
+ ceph_decode_need(p, end, 5*sizeof(u32), bad);
+
+ /* new max? */
+ max = ceph_decode_32(p);
+ if (max >= 0) {
+ err = osdmap_set_max_osd(map, max);
+ if (err < 0)
+ goto bad;
+ }
+
+ map->epoch++;
+ map->modified = map->modified;
+ if (newcrush) {
+ if (map->crush)
+ crush_destroy(map->crush);
+ map->crush = newcrush;
+ newcrush = NULL;
+ }
+
+ /* new_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ __u8 ev;
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ ceph_decode_need(p, end, 1 + sizeof(pi->v), bad);
+ ev = ceph_decode_8(p); /* encoding version */
+ if (ev > CEPH_PG_POOL_VERSION) {
+ pr_warning("got unknown v %d > %d of ceph_pg_pool\n",
+ ev, CEPH_PG_POOL_VERSION);
+ goto bad;
+ }
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (!pi) {
+ pi = kmalloc(sizeof(*pi), GFP_NOFS);
+ if (!pi) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pi->id = pool;
+ __insert_pg_pool(&map->pg_pools, pi);
+ }
+ ceph_decode_copy(p, &pi->v, sizeof(pi->v));
+ calc_pg_masks(pi);
+ }
+
+ /* old_pool */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_pool_info *pi;
+
+ ceph_decode_32_safe(p, end, pool, bad);
+ pi = __lookup_pg_pool(&map->pg_pools, pool);
+ if (pi) {
+ rb_erase(&pi->node, &map->pg_pools);
+ kfree(pi);
+ }
+ }
+
+ /* new_up */
+ err = -EINVAL;
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ struct ceph_entity_addr addr;
+ ceph_decode_32_safe(p, end, osd, bad);
+ ceph_decode_copy_safe(p, end, &addr, sizeof(addr), bad);
+ ceph_decode_addr(&addr);
+ pr_info("osd%d up\n", osd);
+ BUG_ON(osd >= map->max_osd);
+ map->osd_state[osd] |= CEPH_OSD_UP;
+ map->osd_addr[osd] = addr;
+ }
+
+ /* new_down */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd;
+ ceph_decode_32_safe(p, end, osd, bad);
+ (*p)++; /* clean flag */
+ pr_info("osd%d down\n", osd);
+ if (osd < map->max_osd)
+ map->osd_state[osd] &= ~CEPH_OSD_UP;
+ }
+
+ /* new_weight */
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ u32 osd, off;
+ ceph_decode_need(p, end, sizeof(u32)*2, bad);
+ osd = ceph_decode_32(p);
+ off = ceph_decode_32(p);
+ pr_info("osd%d weight 0x%x %s\n", osd, off,
+ off == CEPH_OSD_IN ? "(in)" :
+ (off == CEPH_OSD_OUT ? "(out)" : ""));
+ if (osd < map->max_osd)
+ map->osd_weight[osd] = off;
+ }
+
+ /* new_pg_temp */
+ rbp = rb_first(&map->pg_temp);
+ ceph_decode_32_safe(p, end, len, bad);
+ while (len--) {
+ struct ceph_pg_mapping *pg;
+ int j;
+ struct ceph_pg pgid;
+ u32 pglen;
+ ceph_decode_need(p, end, sizeof(u64) + sizeof(u32), bad);
+ ceph_decode_copy(p, &pgid, sizeof(pgid));
+ pglen = ceph_decode_32(p);
+
+ /* remove any? */
+ while (rbp && pgid_cmp(rb_entry(rbp, struct ceph_pg_mapping,
+ node)->pgid, pgid) <= 0) {
+ struct rb_node *cur = rbp;
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n",
+ *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+ node)->pgid);
+ rb_erase(cur, &map->pg_temp);
+ }
+
+ if (pglen) {
+ /* insert */
+ ceph_decode_need(p, end, pglen*sizeof(u32), bad);
+ pg = kmalloc(sizeof(*pg) + sizeof(u32)*pglen, GFP_NOFS);
+ if (!pg) {
+ err = -ENOMEM;
+ goto bad;
+ }
+ pg->pgid = pgid;
+ pg->len = pglen;
+ for (j = 0; j < pglen; j++)
+ pg->osds[j] = ceph_decode_32(p);
+ err = __insert_pg_mapping(pg, &map->pg_temp);
+ if (err)
+ goto bad;
+ dout(" added pg_temp %llx len %d\n", *(u64 *)&pgid,
+ pglen);
+ }
+ }
+ while (rbp) {
+ struct rb_node *cur = rbp;
+ rbp = rb_next(rbp);
+ dout(" removed pg_temp %llx\n",
+ *(u64 *)&rb_entry(cur, struct ceph_pg_mapping,
+ node)->pgid);
+ rb_erase(cur, &map->pg_temp);
+ }
+
+ /* ignore the rest */
+ *p = end;
+ return map;
+
+bad:
+ pr_err("corrupt inc osdmap epoch %d off %d (%p of %p-%p)\n",
+ epoch, (int)(*p - start), *p, start, end);
+ print_hex_dump(KERN_DEBUG, "osdmap: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ start, end - start, true);
+ if (newcrush)
+ crush_destroy(newcrush);
+ return ERR_PTR(err);
+}

+
+
+
+
+/*

+ * calculate file layout from given offset, length.
+ * fill in correct oid, logical length, and object extent
+ * offset, length.
+ *
+ * for now, we write only a single su, until we can
+ * pass a stride back to the caller.
+ */
+void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *ono,
+ u64 *oxoff, u64 *oxlen)
+{
+ u32 osize = le32_to_cpu(layout->fl_object_size);
+ u32 su = le32_to_cpu(layout->fl_stripe_unit);
+ u32 sc = le32_to_cpu(layout->fl_stripe_count);
+ u32 bl, stripeno, stripepos, objsetno;
+ u32 su_per_object;
+ u64 t, su_offset;
+
+ dout("mapping %llu~%llu osize %u fl_su %u\n", off, *plen,
+ osize, su);
+ su_per_object = osize / su;
+ dout("osize %u / su %u = su_per_object %u\n", osize, su,
+ su_per_object);
+
+ BUG_ON((su & ~PAGE_MASK) != 0);
+ /* bl = *off / su; */
+ t = off;
+ do_div(t, su);
+ bl = t;
+ dout("off %llu / su %u = bl %u\n", off, su, bl);
+
+ stripeno = bl / sc;
+ stripepos = bl % sc;
+ objsetno = stripeno / su_per_object;
+
+ *ono = objsetno * sc + stripepos;
+ dout("objset %u * sc %u = ono %u\n", objsetno, sc, (unsigned)*ono);
+
+ /* *oxoff = *off % layout->fl_stripe_unit; # offset in su */
+ t = off;
+ su_offset = do_div(t, su);
+ *oxoff = su_offset + (stripeno % su_per_object) * su;
+
+ /*
+ * Calculate the length of the extent being written to the selected
+ * object. This is the minimum of the full length requested (plen) or
+ * the remainder of the current stripe being written to.
+ */
+ *oxlen = min_t(u64, *plen, su - su_offset);
+ *plen = *oxlen;
+
+ dout(" obj extent %llu~%llu\n", *oxoff, *oxlen);
+}
+
+/*
+ * calculate an object layout (i.e. pgid) from an oid,
+ * file_layout, and osdmap
+ */
+int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap)
+{
+ unsigned num, num_mask;
+ struct ceph_pg pgid;
+ s32 preferred = (s32)le32_to_cpu(fl->fl_pg_preferred);
+ int poolid = le32_to_cpu(fl->fl_pg_pool);
+ struct ceph_pg_pool_info *pool;
+ unsigned ps;
+
+ BUG_ON(!osdmap);
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return -EIO;
+ ps = ceph_str_hash(pool->v.object_hash, oid, strlen(oid));
+ if (preferred >= 0) {
+ ps += preferred;
+ num = le32_to_cpu(pool->v.lpg_num);
+ num_mask = pool->lpg_num_mask;
+ } else {
+ num = le32_to_cpu(pool->v.pg_num);
+ num_mask = pool->pg_num_mask;
+ }
+
+ pgid.ps = cpu_to_le16(ps);
+ pgid.preferred = cpu_to_le16(preferred);
+ pgid.pool = fl->fl_pg_pool;
+ if (preferred >= 0)
+ dout("calc_object_layout '%s' pgid %d.%xp%d\n", oid, poolid, ps,
+ (int)preferred);
+ else
+ dout("calc_object_layout '%s' pgid %d.%x\n", oid, poolid, ps);
+
+ ol->ol_pgid = pgid;
+ ol->ol_stripe_unit = fl->fl_object_stripe_unit;

+ return 0;
+}
+

+/*
+ * Calculate raw osd vector for the given pgid. Return pointer to osd
+ * array, or NULL on failure.
+ */
+static int *calc_pg_raw(struct ceph_osdmap *osdmap, struct ceph_pg pgid,
+ int *osds, int *num)
+{
+ struct ceph_pg_mapping *pg;
+ struct ceph_pg_pool_info *pool;
+ int ruleno;
+ unsigned poolid, ps, pps;
+ int preferred;
+
+ /* pg_temp? */
+ pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid);
+ if (pg) {
+ *num = pg->len;
+ return pg->osds;
+ }
+
+ /* crush */
+ poolid = le32_to_cpu(pgid.pool);
+ ps = le16_to_cpu(pgid.ps);
+ preferred = (s16)le16_to_cpu(pgid.preferred);
+
+ /* don't forcefeed bad device ids to crush */
+ if (preferred >= osdmap->max_osd ||
+ preferred >= osdmap->crush->max_devices)
+ preferred = -1;
+
+ pool = __lookup_pg_pool(&osdmap->pg_pools, poolid);
+ if (!pool)
+ return NULL;
+ ruleno = crush_find_rule(osdmap->crush, pool->v.crush_ruleset,
+ pool->v.type, pool->v.size);
+ if (ruleno < 0) {
+ pr_err("no crush rule pool %d type %d size %d\n",
+ poolid, pool->v.type, pool->v.size);

+ return NULL;
+ }
+

+ if (preferred >= 0)
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.lpgp_num),
+ pool->lpgp_num_mask);
+ else
+ pps = ceph_stable_mod(ps,
+ le32_to_cpu(pool->v.pgp_num),
+ pool->pgp_num_mask);
+ pps += poolid;
+ *num = crush_do_rule(osdmap->crush, ruleno, pps, osds,
+ min_t(int, pool->v.size, *num),
+ preferred, osdmap->osd_weight);
+ return osds;
+}
+
+/*
+ * Return primary osd for given pgid, or -1 if none.
+ */
+int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, struct ceph_pg pgid)
+{
+ int rawosds[10], *osds;
+ int i, num = ARRAY_SIZE(rawosds);
+
+ osds = calc_pg_raw(osdmap, pgid, rawosds, &num);
+ if (!osds)
+ return -1;
+
+ /* primary is first up osd */
+ for (i = 0; i < num; i++)
+ if (ceph_osd_is_up(osdmap, osds[i])) {
+ return osds[i];
+ break;
+ }
+ return -1;
+}
diff --git a/fs/ceph/osdmap.h b/fs/ceph/osdmap.h
new file mode 100644
index 0000000..1fb55af
--- /dev/null
+++ b/fs/ceph/osdmap.h
@@ -0,0 +1,125 @@
+#ifndef _FS_CEPH_OSDMAP_H
+#define _FS_CEPH_OSDMAP_H
+
+#include <linux/rbtree.h>
+#include "types.h"
+#include "ceph_fs.h"
+#include "crush/crush.h"
+
+/*
+ * The osd map describes the current membership of the osd cluster and
+ * specifies the mapping of objects to placement groups and placement
+ * groups to (sets of) osds. That is, it completely specifies the
+ * (desired) distribution of all data objects in the system at some
+ * point in time.
+ *
+ * Each map version is identified by an epoch, which increases monotonically.
+ *
+ * The map can be updated either via an incremental map (diff) describing
+ * the change between two successive epochs, or as a fully encoded map.
+ */
+struct ceph_pg_pool_info {
+ struct rb_node node;
+ int id;
+ struct ceph_pg_pool v;
+ int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask;
+};
+
+struct ceph_pg_mapping {
+ struct rb_node node;
+ struct ceph_pg pgid;
+ int len;
+ int osds[];
+};
+
+struct ceph_osdmap {
+ struct ceph_fsid fsid;
+ u32 epoch;
+ u32 mkfs_epoch;
+ struct ceph_timespec created, modified;
+
+ u32 flags; /* CEPH_OSDMAP_* */
+
+ u32 max_osd; /* size of osd_state, _offload, _addr arrays */
+ u8 *osd_state; /* CEPH_OSD_* */
+ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
+ struct ceph_entity_addr *osd_addr;
+
+ struct rb_root pg_temp;
+ struct rb_root pg_pools;
+ u32 pool_max;
+
+ /* the CRUSH map specifies the mapping of placement groups to
+ * the list of osds that store+replicate them. */
+ struct crush_map *crush;
+};
+
+/*
+ * file layout helpers
+ */
+#define ceph_file_layout_su(l) ((__s32)le32_to_cpu((l).fl_stripe_unit))
+#define ceph_file_layout_stripe_count(l) \
+ ((__s32)le32_to_cpu((l).fl_stripe_count))
+#define ceph_file_layout_object_size(l) ((__s32)le32_to_cpu((l).fl_object_size))
+#define ceph_file_layout_cas_hash(l) ((__s32)le32_to_cpu((l).fl_cas_hash))
+#define ceph_file_layout_object_su(l) \
+ ((__s32)le32_to_cpu((l).fl_object_stripe_unit))
+#define ceph_file_layout_pg_preferred(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_preferred))
+#define ceph_file_layout_pg_pool(l) \
+ ((__s32)le32_to_cpu((l).fl_pg_pool))
+
+static inline unsigned ceph_file_layout_stripe_width(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_stripe_unit) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+/* "period" == bytes before i start on a new set of objects */
+static inline unsigned ceph_file_layout_period(struct ceph_file_layout *l)
+{
+ return le32_to_cpu(l->fl_object_size) *
+ le32_to_cpu(l->fl_stripe_count);
+}
+
+
+static inline int ceph_osd_is_up(struct ceph_osdmap *map, int osd)
+{
+ return (osd < map->max_osd) && (map->osd_state[osd] & CEPH_OSD_UP);
+}
+
+static inline bool ceph_osdmap_flag(struct ceph_osdmap *map, int flag)
+{
+ return map && (map->flags & flag);
+}
+
+extern char *ceph_osdmap_state_str(char *str, int len, int state);
+
+static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
+ int osd)
+{
+ if (osd >= map->max_osd)
+ return NULL;
+ return &map->osd_addr[osd];
+}
+
+extern struct ceph_osdmap *osdmap_decode(void **p, void *end);
+extern struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
+ struct ceph_osdmap *map,
+ struct ceph_messenger *msgr);
+extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
+
+/* calculate mapping of a file extent to an object */
+extern void ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
+ u64 off, u64 *plen,
+ u64 *bno, u64 *oxoff, u64 *oxlen);
+
+/* calculate mapping of object to a placement group */
+extern int ceph_calc_object_layout(struct ceph_object_layout *ol,
+ const char *oid,
+ struct ceph_file_layout *fl,
+ struct ceph_osdmap *osdmap);
+extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
+ struct ceph_pg pgid);

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

The Ceph metadata servers control client access to inode metadata and
file data by issuing capabilities, granting clients permission to read
and/or write both inode field and file data to OSDs (storage nodes).
Each capability consists of a set of bits indicating which operations
are allowed.

If the client holds a *_SHARED cap, the client has a coherent value
that can be safely read from the cached inode.

In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the client
is allowed to change inode attributes (e.g., file size, mtime), note
its dirty state in the ceph_cap, and asynchronously flush that
metadata change to the MDS.

In the event of a conflicting operation (perhaps by another client),
the MDS will revoke the conflicting client capabilities.

In order for a client to cache an inode, it must hold a capability
with at least one MDS server. When inodes are released, release
notifications are batched and periodically sent en masse to the MDS
cluster to release server state.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/caps.c | 2925 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 2925 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/caps.c

diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
new file mode 100644
index 0000000..8b89b91
--- /dev/null
+++ b/fs/ceph/caps.c
@@ -0,0 +1,2925 @@
+#include "ceph_debug.h"
+
+#include <linux/fs.h>
+#include <linux/kernel.h>
+#include <linux/sched.h>
+#include <linux/vmalloc.h>
+#include <linux/wait.h>
+
+#include "super.h"
+#include "decode.h"
+#include "messenger.h"
+
+/*
+ * Capability management
+ *
+ * The Ceph metadata servers control client access to inode metadata
+ * and file data by issuing capabilities, granting clients permission
+ * to read and/or write both inode field and file data to OSDs
+ * (storage nodes). Each capability consists of a set of bits
+ * indicating which operations are allowed.
+ *
+ * If the client holds a *_SHARED cap, the client has a coherent value
+ * that can be safely read from the cached inode.
+ *
+ * In the case of a *_EXCL (exclusive) or FILE_WR capabilities, the
+ * client is allowed to change inode attributes (e.g., file size,
+ * mtime), note its dirty state in the ceph_cap, and asynchronously
+ * flush that metadata change to the MDS.
+ *
+ * In the event of a conflicting operation (perhaps by another
+ * client), the MDS will revoke the conflicting client capabilities.
+ *
+ * In order for a client to cache an inode, it must hold a capability
+ * with at least one MDS server. When inodes are released, release
+ * notifications are batched and periodically sent en masse to the MDS
+ * cluster to release server state.
+ */
+
+
+/*
+ * Generate readable cap strings for debugging output.
+ */
+#define MAX_CAP_STR 20
+static char cap_str[MAX_CAP_STR][40];
+static DEFINE_SPINLOCK(cap_str_lock);
+static int last_cap_str;
+
+static char *gcap_string(char *s, int c)
+{
+ if (c & CEPH_CAP_GSHARED)
+ *s++ = 's';
+ if (c & CEPH_CAP_GEXCL)
+ *s++ = 'x';
+ if (c & CEPH_CAP_GCACHE)
+ *s++ = 'c';
+ if (c & CEPH_CAP_GRD)
+ *s++ = 'r';
+ if (c & CEPH_CAP_GWR)
+ *s++ = 'w';
+ if (c & CEPH_CAP_GBUFFER)
+ *s++ = 'b';
+ if (c & CEPH_CAP_GLAZYIO)
+ *s++ = 'l';
+ return s;
+}
+
+const char *ceph_cap_string(int caps)
+{
+ int i;
+ char *s;
+ int c;
+
+ spin_lock(&cap_str_lock);
+ i = last_cap_str++;
+ if (last_cap_str == MAX_CAP_STR)
+ last_cap_str = 0;
+ spin_unlock(&cap_str_lock);
+
+ s = cap_str[i];
+
+ if (caps & CEPH_CAP_PIN)
+ *s++ = 'p';
+
+ c = (caps >> CEPH_CAP_SAUTH) & 3;
+ if (c) {
+ *s++ = 'A';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SLINK) & 3;
+ if (c) {
+ *s++ = 'L';
+ s = gcap_string(s, c);
+ }
+
+ c = (caps >> CEPH_CAP_SXATTR) & 3;
+ if (c) {
+ *s++ = 'X';
+ s = gcap_string(s, c);
+ }
+
+ c = caps >> CEPH_CAP_SFILE;
+ if (c) {
+ *s++ = 'F';
+ s = gcap_string(s, c);
+ }
+
+ if (s == cap_str[i])
+ *s++ = '-';
+ *s = 0;
+ return cap_str[i];
+}
+
+/*
+ * Cap reservations
+ *
+ * Maintain a global pool of preallocated struct ceph_caps, referenced
+ * by struct ceph_caps_reservations. This ensures that we preallocate
+ * memory needed to successfully process an MDS response. (If an MDS
+ * sends us cap information and we fail to process it, we will have
+ * problems due to the client and MDS being out of sync.)
+ *
+ * Reservations are 'owned' by a ceph_cap_reservation context.
+ */
+static spinlock_t caps_list_lock;
+static struct list_head caps_list; /* unused (reserved or unreserved) */
+static int caps_total_count; /* total caps allocated */
+static int caps_use_count; /* in use */
+static int caps_reserve_count; /* unused, reserved */
+static int caps_avail_count; /* unused, unreserved */
+static int caps_min_count; /* keep at least this many (unreserved) */
+
+void __init ceph_caps_init(void)
+{
+ INIT_LIST_HEAD(&caps_list);
+ spin_lock_init(&caps_list_lock);
+}
+
+void ceph_caps_finalize(void)
+{
+ struct ceph_cap *cap;
+
+ spin_lock(&caps_list_lock);
+ while (!list_empty(&caps_list)) {
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+ kmem_cache_free(ceph_cap_cachep, cap);
+ }
+ caps_total_count = 0;
+ caps_avail_count = 0;
+ caps_use_count = 0;
+ caps_reserve_count = 0;
+ caps_min_count = 0;
+ spin_unlock(&caps_list_lock);
+}
+
+void ceph_adjust_min_caps(int delta)
+{
+ spin_lock(&caps_list_lock);
+ caps_min_count += delta;
+ BUG_ON(caps_min_count < 0);
+ spin_unlock(&caps_list_lock);
+}
+
+int ceph_reserve_caps(struct ceph_cap_reservation *ctx, int need)
+{
+ int i;
+ struct ceph_cap *cap;
+ int have;
+ int alloc = 0;
+ LIST_HEAD(newcaps);

+ int ret = 0;
+

+ dout("reserve caps ctx=%p need=%d\n", ctx, need);
+
+ /* first reserve any caps that are already allocated */
+ spin_lock(&caps_list_lock);
+ if (caps_avail_count >= need)
+ have = need;
+ else
+ have = caps_avail_count;
+ caps_avail_count -= have;
+ caps_reserve_count += have;
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ for (i = have; i < need; i++) {
+ cap = kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+ if (!cap) {
+ ret = -ENOMEM;
+ goto out_alloc_count;
+ }
+ list_add(&cap->caps_item, &newcaps);
+ alloc++;
+ }
+ BUG_ON(have + alloc != need);
+
+ spin_lock(&caps_list_lock);
+ caps_total_count += alloc;
+ caps_reserve_count += alloc;
+ list_splice(&newcaps, &caps_list);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+
+ ctx->count = need;
+ dout("reserve caps ctx=%p %d = %d used + %d resv + %d avail\n",
+ ctx, caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ return 0;
+
+out_alloc_count:
+ /* we didn't manage to reserve as much as we needed */
+ pr_warning("reserve caps ctx=%p ENOMEM need=%d got=%d\n",
+ ctx, need, have);

+ return ret;
+}
+

+int ceph_unreserve_caps(struct ceph_cap_reservation *ctx)
+{
+ dout("unreserve caps ctx=%p count=%d\n", ctx, ctx->count);
+ if (ctx->count) {
+ spin_lock(&caps_list_lock);
+ BUG_ON(caps_reserve_count < ctx->count);
+ caps_reserve_count -= ctx->count;
+ caps_avail_count += ctx->count;
+ ctx->count = 0;
+ dout("unreserve caps %d = %d used + %d resv + %d avail\n",
+ caps_total_count, caps_use_count, caps_reserve_count,
+ caps_avail_count);
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ }

+ return 0;
+}
+

+static struct ceph_cap *get_cap(struct ceph_cap_reservation *ctx)
+{
+ struct ceph_cap *cap = NULL;
+
+ /* temporary, until we do something about cap import/export */
+ if (!ctx)
+ return kmem_cache_alloc(ceph_cap_cachep, GFP_NOFS);
+
+ spin_lock(&caps_list_lock);
+ dout("get_cap ctx=%p (%d) %d = %d used + %d resv + %d avail\n",
+ ctx, ctx->count, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ BUG_ON(!ctx->count);
+ BUG_ON(ctx->count > caps_reserve_count);
+ BUG_ON(list_empty(&caps_list));
+
+ ctx->count--;
+ caps_reserve_count--;
+ caps_use_count++;
+
+ cap = list_first_entry(&caps_list, struct ceph_cap, caps_item);
+ list_del(&cap->caps_item);
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+ return cap;
+}
+
+void ceph_put_cap(struct ceph_cap *cap)
+{
+ spin_lock(&caps_list_lock);
+ dout("put_cap %p %d = %d used + %d resv + %d avail\n",
+ cap, caps_total_count, caps_use_count,
+ caps_reserve_count, caps_avail_count);
+ caps_use_count--;
+ /*
+ * Keep some preallocated caps around (ceph_min_count), to
+ * avoid lots of free/alloc churn.
+ */
+ if (caps_avail_count >= caps_reserve_count + caps_min_count) {
+ caps_total_count--;
+ kmem_cache_free(ceph_cap_cachep, cap);
+ } else {
+ caps_avail_count++;
+ list_add(&cap->caps_item, &caps_list);
+ }
+
+ BUG_ON(caps_total_count != caps_use_count + caps_reserve_count +
+ caps_avail_count);
+ spin_unlock(&caps_list_lock);
+}
+
+void ceph_reservation_status(struct ceph_client *client,
+ int *total, int *avail, int *used, int *reserved,
+ int *min)
+{
+ if (total)
+ *total = caps_total_count;
+ if (avail)
+ *avail = caps_avail_count;
+ if (used)
+ *used = caps_use_count;
+ if (reserved)
+ *reserved = caps_reserve_count;
+ if (min)
+ *min = caps_min_count;
+}
+
+/*
+ * Find ceph_cap for given mds, if any.
+ *
+ * Called with i_lock held.
+ */
+static struct ceph_cap *__get_cap_for_mds(struct ceph_inode_info *ci, int mds)
+{
+ struct ceph_cap *cap;
+ struct rb_node *n = ci->i_caps.rb_node;
+
+ while (n) {
+ cap = rb_entry(n, struct ceph_cap, ci_node);
+ if (mds < cap->mds)

+ n = n->rb_left;

+ else if (mds > cap->mds)

+ n = n->rb_right;
+ else

+ return cap;

+ }
+ return NULL;
+}
+
+/*

+ * Return id of any MDS with a cap, preferably FILE_WR|WRBUFFER|EXCL, else
+ * -1.
+ */
+static int __ceph_get_cap_mds(struct ceph_inode_info *ci, u32 *mseq)
+{
+ struct ceph_cap *cap;
+ int mds = -1;

+ struct rb_node *p;
+

+ /* prefer mds with WR|WRBUFFER|EXCL caps */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ mds = cap->mds;
+ if (mseq)
+ *mseq = cap->mseq;
+ if (cap->issued & (CEPH_CAP_FILE_WR |
+ CEPH_CAP_FILE_BUFFER |
+ CEPH_CAP_FILE_EXCL))
+ break;
+ }
+ return mds;
+}
+
+int ceph_get_cap_mds(struct inode *inode)
+{
+ int mds;
+ spin_lock(&inode->i_lock);
+ mds = __ceph_get_cap_mds(ceph_inode(inode), NULL);
+ spin_unlock(&inode->i_lock);
+ return mds;
+}
+
+/*
+ * Called under i_lock.
+ */
+static void __insert_cap_node(struct ceph_inode_info *ci,
+ struct ceph_cap *new)
+{
+ struct rb_node **p = &ci->i_caps.rb_node;

+ struct rb_node *parent = NULL;

+ struct ceph_cap *cap = NULL;

+
+ while (*p) {
+ parent = *p;

+ cap = rb_entry(parent, struct ceph_cap, ci_node);
+ if (new->mds < cap->mds)

+ p = &(*p)->rb_left;

+ else if (new->mds > cap->mds)

+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+

+ rb_link_node(&new->ci_node, parent, p);
+ rb_insert_color(&new->ci_node, &ci->i_caps);
+}
+
+/*
+ * (re)set cap hold timeouts, which control the delayed release
+ * of unused caps back to the MDS. Should be called on cap use.
+ */
+static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ struct ceph_mount_args *ma = mdsc->client->mount_args;
+
+ ci->i_hold_caps_min = round_jiffies(jiffies +
+ ma->caps_wanted_delay_min * HZ);
+ ci->i_hold_caps_max = round_jiffies(jiffies +
+ ma->caps_wanted_delay_max * HZ);
+ dout("__cap_set_timeouts %p min %lu max %lu\n", &ci->vfs_inode,
+ ci->i_hold_caps_min - jiffies, ci->i_hold_caps_max - jiffies);
+}
+
+/*
+ * (Re)queue cap at the end of the delayed cap release list.
+ *
+ * If I_FLUSH is set, leave the inode at the front of the list.
+ *
+ * Caller holds i_lock
+ * -> we take mdsc->cap_delay_lock
+ */
+static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ __cap_set_timeouts(mdsc, ci);
+ dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
+ ci->i_ceph_flags, ci->i_hold_caps_max);
+ if (!mdsc->stopping) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (!list_empty(&ci->i_cap_delay_list)) {
+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ goto no_change;
+ list_del_init(&ci->i_cap_delay_list);
+ }
+ list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+no_change:
+ spin_unlock(&mdsc->cap_delay_lock);
+ }
+}
+
+/*
+ * Queue an inode for immediate writeback. Mark inode with I_FLUSH,
+ * indicating we should send a cap message to flush dirty metadata
+ * asap, and move to the front of the delayed cap list.
+ */
+static void __cap_delay_requeue_front(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_requeue_front %p\n", &ci->vfs_inode);
+ spin_lock(&mdsc->cap_delay_lock);
+ ci->i_ceph_flags |= CEPH_I_FLUSH;
+ if (!list_empty(&ci->i_cap_delay_list))
+ list_del_init(&ci->i_cap_delay_list);
+ list_add(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Cancel delayed work on cap.
+ *
+ * Caller must hold i_lock.
+ */
+static void __cap_delay_cancel(struct ceph_mds_client *mdsc,
+ struct ceph_inode_info *ci)
+{
+ dout("__cap_delay_cancel %p\n", &ci->vfs_inode);
+ if (list_empty(&ci->i_cap_delay_list))
+ return;
+ spin_lock(&mdsc->cap_delay_lock);
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Common issue checks for add_cap, handle_cap_grant.
+ */
+static void __check_cap_issue(struct ceph_inode_info *ci, struct ceph_cap *cap,
+ unsigned issued)
+{
+ unsigned had = __ceph_caps_issued(ci, NULL);
+
+ /*
+ * Each time we receive FILE_CACHE anew, we increment
+ * i_rdcache_gen.
+ */
+ if ((issued & CEPH_CAP_FILE_CACHE) &&
+ (had & CEPH_CAP_FILE_CACHE) == 0)
+ ci->i_rdcache_gen++;
+
+ /*
+ * if we are newly issued FILE_SHARED, clear I_COMPLETE; we
+ * don't know what happened to this directory while we didn't
+ * have the cap.
+ */
+ if ((issued & CEPH_CAP_FILE_SHARED) &&
+ (had & CEPH_CAP_FILE_SHARED) == 0) {
+ ci->i_shared_gen++;
+ if (S_ISDIR(ci->vfs_inode.i_mode)) {
+ dout(" marking %p NOT complete\n", &ci->vfs_inode);

+ ci->i_ceph_flags &= ~CEPH_I_COMPLETE;

+ }
+ }
+}
+
+/*

+ * Add a capability under the given MDS session.
+ *
+ * Caller should hold session snap_rwsem (read) and s_mutex.
+ *
+ * @fmode is the open file mode, if we are opening a file, otherwise
+ * it is < 0. (This is so we can atomically add the cap and add an
+ * open file reference to it.)
+ */
+int ceph_add_cap(struct inode *inode,

+ struct ceph_mds_session *session, u64 cap_id,
+ int fmode, unsigned issued, unsigned wanted,

+ unsigned seq, unsigned mseq, u64 realmino, int flags,
+ struct ceph_cap_reservation *caps_reservation)
+{
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct ceph_cap *new_cap = NULL;
+ struct ceph_cap *cap;
+ int mds = session->s_mds;
+ int actual_wanted;
+
+ dout("add_cap %p mds%d cap %llx %s seq %d\n", inode,
+ session->s_mds, cap_id, ceph_cap_string(issued), seq);
+
+ /*
+ * If we are opening the file, include file mode wanted bits
+ * in wanted.
+ */
+ if (fmode >= 0)
+ wanted |= ceph_caps_for_mode(fmode);
+
+retry:
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (!cap) {
+ if (new_cap) {
+ cap = new_cap;
+ new_cap = NULL;
+ } else {
+ spin_unlock(&inode->i_lock);
+ new_cap = get_cap(caps_reservation);
+ if (new_cap == NULL)
+ return -ENOMEM;

+ goto retry;
+ }
+

+ cap->issued = 0;
+ cap->implemented = 0;
+ cap->mds = mds;
+ cap->mds_wanted = 0;
+
+ cap->ci = ci;
+ __insert_cap_node(ci, cap);
+
+ /* clear out old exporting info? (i.e. on cap import) */
+ if (ci->i_cap_exporting_mds == mds) {
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ }
+
+ /* add to session cap list */
+ cap->session = session;
+ spin_lock(&session->s_cap_lock);
+ list_add_tail(&cap->session_caps, &session->s_caps);
+ session->s_nr_caps++;
+ spin_unlock(&session->s_cap_lock);
+ }
+
+ if (!ci->i_snap_realm) {
+ /*
+ * add this inode to the appropriate snap realm
+ */
+ struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
+ realmino);
+ if (realm) {
+ ceph_get_snap_realm(mdsc, realm);
+ spin_lock(&realm->inodes_with_caps_lock);
+ ci->i_snap_realm = realm;
+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ spin_unlock(&realm->inodes_with_caps_lock);
+ } else {
+ pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
+ realmino);
+ }
+ }
+
+ __check_cap_issue(ci, cap, issued);
+
+ /*
+ * If we are issued caps we don't want, or the mds' wanted
+ * value appears to be off, queue a check so we'll release
+ * later and/or update the mds wanted value.
+ */
+ actual_wanted = __ceph_caps_wanted(ci);
+ if ((wanted & ~actual_wanted) ||
+ (issued & ~actual_wanted & CEPH_CAP_ANY_WR)) {
+ dout(" issued %s, mds wanted %s, actual %s, queueing\n",
+ ceph_cap_string(issued), ceph_cap_string(wanted),
+ ceph_cap_string(actual_wanted));
+ __cap_delay_requeue(mdsc, ci);
+ }
+
+ if (flags & CEPH_CAP_FLAG_AUTH)
+ ci->i_auth_cap = cap;
+ else if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
+ inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
+ ceph_cap_string(issued|cap->issued), seq, mds);
+ cap->cap_id = cap_id;
+ cap->issued = issued;
+ cap->implemented |= issued;
+ cap->mds_wanted |= wanted;
+ cap->seq = seq;
+ cap->issue_seq = seq;
+ cap->mseq = mseq;
+ cap->cap_gen = session->s_cap_gen;
+
+ if (fmode >= 0)
+ __ceph_get_fmode(ci, fmode);
+ spin_unlock(&inode->i_lock);
+ wake_up(&ci->i_cap_wq);

+ return 0;
+}
+
+/*

+ * Return true if cap has not timed out and belongs to the current
+ * generation of the MDS session (i.e. has not gone 'stale' due to
+ * us losing touch with the mds).
+ */
+static int __cap_is_valid(struct ceph_cap *cap)
+{
+ unsigned long ttl;
+ u32 gen;
+
+ spin_lock(&cap->session->s_cap_lock);
+ gen = cap->session->s_cap_gen;
+ ttl = cap->session->s_cap_ttl;
+ spin_unlock(&cap->session->s_cap_lock);
+
+ if (cap->cap_gen < gen || time_after_eq(jiffies, ttl)) {
+ dout("__cap_is_valid %p cap %p issued %s "
+ "but STALE (gen %u vs %u)\n", &cap->ci->vfs_inode,
+ cap, ceph_cap_string(cap->issued), cap->cap_gen, gen);

+ return 0;
+ }
+

+ return 1;
+}
+
+/*
+ * Return set of valid cap bits issued to us. Note that caps time
+ * out, and may be invalidated in bulk if the client session times out
+ * and session->s_cap_gen is bumped.
+ */
+int __ceph_caps_issued(struct ceph_inode_info *ci, int *implemented)
+{
+ int have = ci->i_snap_caps | ci->i_cap_exporting_issued;
+ struct ceph_cap *cap;

+ struct rb_node *p;
+

+ if (implemented)
+ *implemented = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ dout("__ceph_caps_issued %p cap %p issued %s\n",
+ &ci->vfs_inode, cap, ceph_cap_string(cap->issued));
+ have |= cap->issued;
+ if (implemented)
+ *implemented |= cap->implemented;
+ }
+ return have;
+}
+
+/*
+ * Get cap bits issued by caps other than @ocap
+ */
+int __ceph_caps_issued_other(struct ceph_inode_info *ci, struct ceph_cap *ocap)
+{
+ int have = ci->i_snap_caps;
+ struct ceph_cap *cap;

+ struct rb_node *p;
+

+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (cap == ocap)
+ continue;
+ if (!__cap_is_valid(cap))
+ continue;
+ have |= cap->issued;
+ }
+ return have;
+}
+
+/*
+ * Move a cap to the end of the LRU (oldest caps at list head, newest
+ * at list tail).
+ */
+static void __touch_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *s = cap->session;
+
+ spin_lock(&s->s_cap_lock);
+ if (s->s_cap_iterator == NULL) {
+ dout("__touch_cap %p cap %p mds%d\n", &cap->ci->vfs_inode, cap,
+ s->s_mds);
+ list_move_tail(&cap->session_caps, &s->s_caps);
+ } else {
+ dout("__touch_cap %p cap %p mds%d NOP, iterating over caps\n",
+ &cap->ci->vfs_inode, cap, s->s_mds);
+ }
+ spin_unlock(&s->s_cap_lock);
+}
+
+/*
+ * Check if we hold the given mask. If so, move the cap(s) to the
+ * front of their respective LRUs. (This is the preferred way for
+ * callers to check for caps they want.)
+ */
+int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int have = ci->i_snap_caps;
+
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p snap issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(have),
+ ceph_cap_string(mask));
+ return 1;
+ }
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ if ((cap->issued & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p cap %p issued %s"
+ " (mask %s)\n", &ci->vfs_inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch)
+ __touch_cap(cap);

+ return 1;
+ }
+

+ /* does a combination of caps satisfy mask? */
+ have |= cap->issued;
+ if ((have & mask) == mask) {
+ dout("__ceph_caps_issued_mask %p combo issued %s"
+ " (mask %s)\n", &ci->vfs_inode,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(mask));
+ if (touch) {
+ struct rb_node *q;
+
+ /* touch this + preceeding caps */
+ __touch_cap(cap);
+ for (q = rb_first(&ci->i_caps); q != p;
+ q = rb_next(q)) {
+ cap = rb_entry(q, struct ceph_cap,
+ ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ __touch_cap(cap);
+ }
+ }
+ return 1;
+ }
+ }
+

+ return 0;
+}
+
+/*

+ * Return true if mask caps are currently being revoked by an MDS.
+ */
+int ceph_caps_revoking(struct ceph_inode_info *ci, int mask)
+{
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ struct rb_node *p;

+ int ret = 0;
+

+ spin_lock(&inode->i_lock);
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (__cap_is_valid(cap) &&
+ (cap->implemented & ~cap->issued & mask)) {
+ ret = 1;
+ break;
+ }
+ }
+ spin_unlock(&inode->i_lock);
+ dout("ceph_caps_revoking %p %s = %d\n", inode,
+ ceph_cap_string(mask), ret);

+ return ret;
+}
+

+int __ceph_caps_used(struct ceph_inode_info *ci)
+{
+ int used = 0;
+ if (ci->i_pin_ref)
+ used |= CEPH_CAP_PIN;
+ if (ci->i_rd_ref)
+ used |= CEPH_CAP_FILE_RD;
+ if (ci->i_rdcache_ref || ci->i_rdcache_gen)
+ used |= CEPH_CAP_FILE_CACHE;
+ if (ci->i_wr_ref)
+ used |= CEPH_CAP_FILE_WR;
+ if (ci->i_wrbuffer_ref)
+ used |= CEPH_CAP_FILE_BUFFER;
+ return used;
+}
+
+/*

+ * wanted, by virtue of open file modes

+ */
+int __ceph_caps_file_wanted(struct ceph_inode_info *ci)
+{
+ int want = 0;
+ int mode;
+ for (mode = 0; mode < 4; mode++)
+ if (ci->i_nr_by_mode[mode])
+ want |= ceph_caps_for_mode(mode);
+ return want;
+}
+
+/*
+ * Return caps we have registered with the MDS(s) as 'wanted'.
+ */
+int __ceph_caps_mds_wanted(struct ceph_inode_info *ci)
+{
+ struct ceph_cap *cap;
+ struct rb_node *p;
+ int mds_wanted = 0;
+
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ if (!__cap_is_valid(cap))
+ continue;
+ mds_wanted |= cap->mds_wanted;
+ }
+ return mds_wanted;
+}
+
+/*
+ * called under i_lock
+ */
+static int __ceph_is_any_caps(struct ceph_inode_info *ci)
+{
+ return !RB_EMPTY_ROOT(&ci->i_caps) || ci->i_cap_exporting_mds >= 0;
+}
+
+/*
+ * caller should hold i_lock.
+ * caller will not hold session s_mutex if called from destroy_inode.
+ */
+void __ceph_remove_cap(struct ceph_cap *cap)
+{
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_inode_info *ci = cap->ci;
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+
+ dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);
+
+ /* remove from inode list */
+ rb_erase(&cap->ci_node, &ci->i_caps);
+ cap->ci = NULL;
+ if (ci->i_auth_cap == cap)
+ ci->i_auth_cap = NULL;
+
+ /* remove from session list */
+ spin_lock(&session->s_cap_lock);
+ if (session->s_cap_iterator == cap) {
+ /* not yet, we are iterating over this very cap */
+ dout("__ceph_remove_cap delaying %p removal from session %p\n",
+ cap, cap->session);
+ } else {
+ list_del_init(&cap->session_caps);
+ session->s_nr_caps--;
+ cap->session = NULL;
+ }
+ spin_unlock(&session->s_cap_lock);
+
+ if (cap->session == NULL)
+ ceph_put_cap(cap);
+
+ if (!__ceph_is_any_caps(ci) && ci->i_snap_realm) {
+ struct ceph_snap_realm *realm = ci->i_snap_realm;
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_del_init(&ci->i_snap_realm_item);
+ ci->i_snap_realm_counter++;
+ ci->i_snap_realm = NULL;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_put_snap_realm(mdsc, realm);
+ }
+ if (!__ceph_is_any_real_caps(ci))
+ __cap_delay_cancel(mdsc, ci);
+}
+
+/*
+ * Build and send a cap message to the given MDS.
+ *
+ * Caller should be holding s_mutex.
+ */
+static int send_cap_msg(struct ceph_mds_session *session,
+ u64 ino, u64 cid, int op,
+ int caps, int wanted, int dirty,
+ u32 seq, u64 flush_tid, u32 issue_seq, u32 mseq,
+ u64 size, u64 max_size,
+ struct timespec *mtime, struct timespec *atime,
+ u64 time_warp_seq,
+ uid_t uid, gid_t gid, mode_t mode,
+ u64 xattr_version,
+ struct ceph_buffer *xattrs_buf,
+ u64 follows)
+{
+ struct ceph_mds_caps *fc;

+ struct ceph_msg *msg;
+

+ dout("send_cap_msg %s %llx %llx caps %s wanted %s dirty %s"
+ " seq %u/%u mseq %u follows %lld size %llu/%llu"
+ " xattr_ver %llu xattr_len %d\n", ceph_cap_op_name(op),
+ cid, ino, ceph_cap_string(caps), ceph_cap_string(wanted),
+ ceph_cap_string(dirty),
+ seq, issue_seq, mseq, follows, size, max_size,
+ xattr_version, xattrs_buf ? (int)xattrs_buf->vec.iov_len : 0);
+
+ msg = ceph_msg_new(CEPH_MSG_CLIENT_CAPS, sizeof(*fc), 0, 0, NULL);
+ if (IS_ERR(msg))
+ return PTR_ERR(msg);
+
+ msg->hdr.tid = cpu_to_le64(flush_tid);
+
+ fc = msg->front.iov_base;
+ memset(fc, 0, sizeof(*fc));
+
+ fc->cap_id = cpu_to_le64(cid);
+ fc->op = cpu_to_le32(op);
+ fc->seq = cpu_to_le32(seq);
+ fc->issue_seq = cpu_to_le32(issue_seq);
+ fc->migrate_seq = cpu_to_le32(mseq);
+ fc->caps = cpu_to_le32(caps);
+ fc->wanted = cpu_to_le32(wanted);
+ fc->dirty = cpu_to_le32(dirty);
+ fc->ino = cpu_to_le64(ino);
+ fc->snap_follows = cpu_to_le64(follows);
+
+ fc->size = cpu_to_le64(size);
+ fc->max_size = cpu_to_le64(max_size);
+ if (mtime)
+ ceph_encode_timespec(&fc->mtime, mtime);
+ if (atime)
+ ceph_encode_timespec(&fc->atime, atime);
+ fc->time_warp_seq = cpu_to_le32(time_warp_seq);
+
+ fc->uid = cpu_to_le32(uid);
+ fc->gid = cpu_to_le32(gid);
+ fc->mode = cpu_to_le32(mode);
+
+ fc->xattr_version = cpu_to_le64(xattr_version);
+ if (xattrs_buf) {
+ msg->middle = ceph_buffer_get(xattrs_buf);
+ fc->xattr_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ msg->hdr.middle_len = cpu_to_le32(xattrs_buf->vec.iov_len);
+ }
+
+ ceph_con_send(&session->s_con, msg);

+ return 0;
+}
+
+/*

+ * Queue cap releases when an inode is dropped from our cache. Since
+ * inode is about to be destroyed, there is no need for i_lock.
+ */
+void ceph_queue_caps_release(struct inode *inode)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct rb_node *p;
+

+ p = rb_first(&ci->i_caps);
+ while (p) {
+ struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
+ struct ceph_mds_session *session = cap->session;
+ struct ceph_msg *msg;
+ struct ceph_mds_cap_release *head;
+ struct ceph_mds_cap_item *item;
+
+ spin_lock(&session->s_cap_lock);
+ BUG_ON(!session->s_num_cap_releases);
+ msg = list_first_entry(&session->s_cap_releases,
+ struct ceph_msg, list_head);
+
+ dout(" adding %p release to mds%d msg %p (%d left)\n",
+ inode, session->s_mds, msg, session->s_num_cap_releases);
+
+ BUG_ON(msg->front.iov_len + sizeof(*item) > PAGE_CACHE_SIZE);

+ head = msg->front.iov_base;

+ head->num = cpu_to_le32(le32_to_cpu(head->num) + 1);
+ item = msg->front.iov_base + msg->front.iov_len;
+ item->ino = cpu_to_le64(ceph_ino(inode));
+ item->cap_id = cpu_to_le64(cap->cap_id);
+ item->migrate_seq = cpu_to_le32(cap->mseq);
+ item->seq = cpu_to_le32(cap->issue_seq);
+
+ session->s_num_cap_releases--;
+
+ msg->front.iov_len += sizeof(*item);
+ if (le32_to_cpu(head->num) == CEPH_CAPS_PER_RELEASE) {
+ dout(" release msg %p full\n", msg);
+ list_move_tail(&msg->list_head,
+ &session->s_cap_releases_done);
+ } else {
+ dout(" release msg %p at %d/%d (%d)\n", msg,
+ (int)le32_to_cpu(head->num),
+ (int)CEPH_CAPS_PER_RELEASE,
+ (int)msg->front.iov_len);
+ }
+ spin_unlock(&session->s_cap_lock);
+ p = rb_next(p);
+ __ceph_remove_cap(cap);
+ }
+}
+
+/*
+ * Send a cap msg on the given inode. Update our caps state, then
+ * drop i_lock and send the message.
+ *
+ * Make note of max_size reported/requested from mds, revoked caps
+ * that have now been implemented.
+ *
+ * Make half-hearted attempt ot to invalidate page cache if we are
+ * dropping RDCACHE. Note that this will leave behind locked pages
+ * that we'll then need to deal with elsewhere.
+ *
+ * Return non-zero if delayed release, or we experienced an error
+ * such that the caller should requeue + retry later.
+ *
+ * called with i_lock, then drops it.
+ * caller should hold snap_rwsem (read), s_mutex.
+ */
+static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
+ int op, int used, int want, int retain, int flushing,
+ unsigned *pflush_tid)
+ __releases(cap->ci->vfs_inode->i_lock)
+{
+ struct ceph_inode_info *ci = cap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ u64 cap_id = cap->cap_id;
+ int held, revoking, dropping, keep;
+ u64 seq, issue_seq, mseq, time_warp_seq, follows;
+ u64 size, max_size;
+ struct timespec mtime, atime;
+ int wake = 0;

+ mode_t mode;
+ uid_t uid;
+ gid_t gid;

+ struct ceph_mds_session *session;
+ u64 xattr_version = 0;
+ int delayed = 0;
+ u64 flush_tid = 0;
+ int i;
+ int ret;
+
+ held = cap->issued | cap->implemented;
+ revoking = cap->implemented & ~cap->issued;
+ retain &= ~revoking;
+ dropping = cap->issued & ~retain;
+
+ dout("__send_cap %p cap %p session %p %s -> %s (revoking %s)\n",
+ inode, cap, cap->session,
+ ceph_cap_string(held), ceph_cap_string(held & retain),
+ ceph_cap_string(revoking));
+ BUG_ON((retain & CEPH_CAP_PIN) == 0);
+
+ session = cap->session;
+
+ /* don't release wanted unless we've waited a bit. */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_min)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s on send\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ want |= cap->mds_wanted;
+ retain |= cap->issued;
+ delayed = 1;
+ }
+ ci->i_ceph_flags &= ~(CEPH_I_NODELAY | CEPH_I_FLUSH);
+
+ cap->issued &= retain; /* drop bits we don't want */
+ if (cap->implemented & ~cap->issued) {
+ /*
+ * Wake up any waiters on wanted -> needed transition.
+ * This is due to the weird transition from buffered
+ * to sync IO... we need to flush dirty pages _before_
+ * allowing sync writes to avoid reordering.
+ */
+ wake = 1;
+ }
+ cap->implemented &= cap->issued | used;
+ cap->mds_wanted = want;
+
+ if (flushing) {
+ /*
+ * assign a tid for flush operations so we can avoid
+ * flush1 -> dirty1 -> flush2 -> flushack1 -> mark
+ * clean type races. track latest tid for every bit
+ * so we can handle flush AxFw, flush Fw, and have the
+ * first ack clean Ax.
+ */
+ flush_tid = ++ci->i_cap_flush_last_tid;
+ if (pflush_tid)
+ *pflush_tid = flush_tid;
+ dout(" cap_flush_tid %d\n", (int)flush_tid);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if (flushing & (1 << i))
+ ci->i_cap_flush_tid[i] = flush_tid;
+ }
+
+ keep = cap->implemented;
+ seq = cap->seq;
+ issue_seq = cap->issue_seq;
+ mseq = cap->mseq;
+ size = inode->i_size;
+ ci->i_reported_size = size;
+ max_size = ci->i_wanted_max_size;
+ ci->i_requested_max_size = max_size;
+ mtime = inode->i_mtime;
+ atime = inode->i_atime;
+ time_warp_seq = ci->i_time_warp_seq;
+ follows = ci->i_snap_realm->cached_context->seq;
+ uid = inode->i_uid;
+ gid = inode->i_gid;
+ mode = inode->i_mode;
+
+ if (dropping & CEPH_CAP_XATTR_EXCL) {
+ __ceph_build_xattrs_blob(ci);
+ xattr_version = ci->i_xattrs.version + 1;
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
+ op, keep, want, flushing, seq, flush_tid, issue_seq, mseq,
+ size, max_size, &mtime, &atime, time_warp_seq,
+ uid, gid, mode,
+ xattr_version,
+ (flushing & CEPH_CAP_XATTR_EXCL) ? ci->i_xattrs.blob : NULL,
+ follows);
+ if (ret < 0) {
+ dout("error sending cap msg, must requeue %p\n", inode);
+ delayed = 1;
+ }
+
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+
+ return delayed;
+}
+
+/*
+ * When a snapshot is taken, clients accumulate dirty metadata on
+ * inodes with capabilities in ceph_cap_snaps to describe the file
+ * state at the time the snapshot was taken. This must be flushed
+ * asynchronously back to the MDS once sync writes complete and dirty
+ * data is written out.
+ *
+ * Called under i_lock. Takes s_mutex as needed.
+ */
+void __ceph_flush_snaps(struct ceph_inode_info *ci,
+ struct ceph_mds_session **psession)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int mds;
+ struct ceph_cap_snap *capsnap;
+ u32 mseq;
+ struct ceph_mds_client *mdsc = &ceph_inode_to_client(inode)->mdsc;
+ struct ceph_mds_session *session = NULL; /* if session != NULL, we hold
+ session->s_mutex */
+ u64 next_follows = 0; /* keep track of how far we've gotten through the
+ i_cap_snaps list, and skip these entries next time
+ around to avoid an infinite loop */
+
+ if (psession)
+ session = *psession;
+
+ dout("__flush_snaps %p\n", inode);
+retry:

+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {

+ /* avoid an infiniute loop after retry */
+ if (capsnap->follows < next_follows)
+ continue;
+ /*
+ * we need to wait for sync writes to complete and for dirty
+ * pages to be written out.
+ */
+ if (capsnap->dirty_pages || capsnap->writing)
+ continue;
+
+ /* pick mds, take s_mutex */
+ mds = __ceph_get_cap_mds(ci, &mseq);
+ if (session && session->s_mds != mds) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ session = NULL;
+ }
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ mutex_lock(&mdsc->mutex);
+ session = __ceph_lookup_mds_session(mdsc, mds);
+ mutex_unlock(&mdsc->mutex);
+ if (session) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ mutex_lock(&session->s_mutex);
+ }
+ /*
+ * if session == NULL, we raced against a cap
+ * deletion. retry, and we'll get a better
+ * @mds value next time.

+ */
+ spin_lock(&inode->i_lock);

+ goto retry;
+ }
+

+ capsnap->flush_tid = ++ci->i_cap_flush_last_tid;
+ atomic_inc(&capsnap->nref);
+ if (!list_empty(&capsnap->flushing_item))
+ list_del_init(&capsnap->flushing_item);
+ list_add_tail(&capsnap->flushing_item,
+ &session->s_cap_snaps_flushing);
+ spin_unlock(&inode->i_lock);
+
+ dout("flush_snaps %p cap_snap %p follows %lld size %llu\n",
+ inode, capsnap, next_follows, capsnap->size);
+ send_cap_msg(session, ceph_vino(inode).ino, 0,
+ CEPH_CAP_OP_FLUSHSNAP, capsnap->issued, 0,
+ capsnap->dirty, 0, capsnap->flush_tid, 0, mseq,
+ capsnap->size, 0,
+ &capsnap->mtime, &capsnap->atime,
+ capsnap->time_warp_seq,
+ capsnap->uid, capsnap->gid, capsnap->mode,
+ 0, NULL,
+ capsnap->follows);
+
+ next_follows = capsnap->follows + 1;
+ ceph_put_cap_snap(capsnap);
+
+ spin_lock(&inode->i_lock);

+ goto retry;
+ }
+

+ /* we flushed them all; remove this inode from the queue */
+ spin_lock(&mdsc->snap_flush_lock);
+ list_del_init(&ci->i_snap_flush_item);
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (psession)
+ *psession = session;
+ else if (session) {
+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }
+}
+
+static void ceph_flush_snaps(struct ceph_inode_info *ci)
+{
+ struct inode *inode = &ci->vfs_inode;
+
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, NULL);
+ spin_unlock(&inode->i_lock);
+}
+
+/*
+ * Mark caps dirty. If inode is newly dirty, add to the global dirty
+ * list.
+ */
+void __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(ci->vfs_inode.i_sb)->mdsc;
+ struct inode *inode = &ci->vfs_inode;
+ int was = ci->i_dirty_caps;
+ int dirty = 0;
+
+ dout("__mark_dirty_caps %p %s dirty %s -> %s\n", &ci->vfs_inode,
+ ceph_cap_string(mask), ceph_cap_string(was),
+ ceph_cap_string(was | mask));
+ ci->i_dirty_caps |= mask;
+ if (was == 0) {
+ dout(" inode %p now dirty\n", &ci->vfs_inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_add(&ci->i_dirty_item, &mdsc->cap_dirty);
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ igrab(inode);
+ dirty |= I_DIRTY_SYNC;
+ }
+ }
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
+ (mask & CEPH_CAP_FILE_BUFFER))
+ dirty |= I_DIRTY_DATASYNC;
+ if (dirty)
+ __mark_inode_dirty(inode, dirty);
+ __cap_delay_requeue(mdsc, ci);
+}
+
+/*
+ * Add dirty inode to the flushing list. Assigned a seq number so we
+ * can wait for caps to flush without starving.
+ *
+ * Called under i_lock.
+ */
+static int __mark_caps_flushing(struct inode *inode,
+ struct ceph_mds_session *session)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int flushing;
+
+ BUG_ON(ci->i_dirty_caps == 0);
+ BUG_ON(list_empty(&ci->i_dirty_item));
+
+ flushing = ci->i_dirty_caps;
+ dout("__mark_caps_flushing flushing %s, flushing_caps %s -> %s\n",
+ ceph_cap_string(flushing),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps | flushing));
+ ci->i_flushing_caps |= flushing;
+ ci->i_dirty_caps = 0;
+ dout(" inode %p now !dirty\n", inode);
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_del_init(&ci->i_dirty_item);
+
+ ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
+ if (list_empty(&ci->i_flushing_item)) {
+ list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ mdsc->num_cap_flushing++;
+ dout(" inode %p now flushing seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ } else {
+ list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
+ dout(" inode %p now flushing (more) seq %lld\n", inode,
+ ci->i_cap_flush_seq);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+
+ return flushing;
+}
+
+/*
+ * try to invalidate mapping pages without blocking.
+ */
+static int mapping_is_empty(struct address_space *mapping)
+{
+ struct page *page = find_get_page(mapping, 0);
+
+ if (!page)
+ return 1;
+
+ put_page(page);

+ return 0;
+}
+

+static int try_nonblocking_invalidate(struct inode *inode)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ u32 invalidating_gen = ci->i_rdcache_gen;
+
+ spin_unlock(&inode->i_lock);
+ invalidate_mapping_pages(&inode->i_data, 0, -1);
+ spin_lock(&inode->i_lock);
+
+ if (mapping_is_empty(&inode->i_data) &&
+ invalidating_gen == ci->i_rdcache_gen) {
+ /* success. */
+ dout("try_nonblocking_invalidate %p success\n", inode);
+ ci->i_rdcache_gen = 0;
+ ci->i_rdcache_revoking = 0;
+ return 0;
+ }
+ dout("try_nonblocking_invalidate %p failed\n", inode);

+ return -1;
+}
+

+/*
+ * Swiss army knife function to examine currently used and wanted
+ * versus held caps. Release, flush, ack revoked caps to mds as
+ * appropriate.
+ *
+ * CHECK_CAPS_NODELAY - caller is delayed work and we should not delay
+ * cap release further.
+ * CHECK_CAPS_AUTHONLY - we should only check the auth cap
+ * CHECK_CAPS_FLUSH - we should flush any dirty caps immediately, without
+ * further delay.
+ */
+void ceph_check_caps(struct ceph_inode_info *ci, int flags,
+ struct ceph_mds_session *session)
+{
+ struct ceph_client *client = ceph_inode_to_client(&ci->vfs_inode);

+ struct ceph_mds_client *mdsc = &client->mdsc;

+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int file_wanted, used;
+ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
+ int drop_session_lock = session ? 0 : 1;
+ int issued, implemented, want, retain, revoking, flushing = 0;
+ int mds = -1; /* keep track of how far we've gone through i_caps list
+ to avoid an infinite loop on retry */
+ struct rb_node *p;
+ int tried_invalidate = 0;
+ int delayed = 0, sent = 0, force_requeue = 0, num;
+ int queue_invalidate = 0;
+ int is_delayed = flags & CHECK_CAPS_NODELAY;
+
+ /* if we are unmounting, flush any unused caps immediately. */
+ if (mdsc->stopping)
+ is_delayed = 1;

+
+ spin_lock(&inode->i_lock);
+

+ if (ci->i_ceph_flags & CEPH_I_FLUSH)
+ flags |= CHECK_CAPS_FLUSH;
+
+ /* flush snaps first time around only */
+ if (!list_empty(&ci->i_cap_snaps))
+ __ceph_flush_snaps(ci, &session);
+ goto retry_locked;
+retry:
+ spin_lock(&inode->i_lock);
+retry_locked:
+ file_wanted = __ceph_caps_file_wanted(ci);
+ used = __ceph_caps_used(ci);
+ want = file_wanted | used;
+ issued = __ceph_caps_issued(ci, &implemented);
+ revoking = implemented & ~issued;
+
+ retain = want | CEPH_CAP_PIN;
+ if (!mdsc->stopping && inode->i_nlink > 0) {
+ if (want) {
+ retain |= CEPH_CAP_ANY; /* be greedy */
+ } else {
+ retain |= CEPH_CAP_ANY_SHARED;
+ /*
+ * keep RD only if we didn't have the file open RW,
+ * because then the mds would revoke it anyway to
+ * journal max_size=0.
+ */
+ if (ci->i_max_size == 0)
+ retain |= CEPH_CAP_ANY_RD;
+ }
+ }
+
+ dout("check_caps %p file_want %s used %s dirty %s flushing %s"
+ " issued %s revoking %s retain %s %s%s%s\n", inode,
+ ceph_cap_string(file_wanted),
+ ceph_cap_string(used), ceph_cap_string(ci->i_dirty_caps),
+ ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(issued), ceph_cap_string(revoking),
+ ceph_cap_string(retain),
+ (flags & CHECK_CAPS_AUTHONLY) ? " AUTHONLY" : "",
+ (flags & CHECK_CAPS_NODELAY) ? " NODELAY" : "",
+ (flags & CHECK_CAPS_FLUSH) ? " FLUSH" : "");
+
+ /*
+ * If we no longer need to hold onto old our caps, and we may
+ * have cached pages, but don't want them, then try to invalidate.
+ * If we fail, it's because pages are locked.... try again later.
+ */
+ if ((!is_delayed || mdsc->stopping) &&
+ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
+ ci->i_rdcache_gen && /* may have cached pages */
+ (file_wanted == 0 || /* no open files */
+ (revoking & CEPH_CAP_FILE_CACHE)) && /* or revoking cache */
+ !tried_invalidate) {
+ dout("check_caps trying to invalidate on %p\n", inode);
+ if (try_nonblocking_invalidate(inode) < 0) {
+ if (revoking & CEPH_CAP_FILE_CACHE) {
+ dout("check_caps queuing invalidate\n");
+ queue_invalidate = 1;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ } else {
+ dout("check_caps failed to invalidate pages\n");
+ /* we failed to invalidate pages. check these
+ caps again later. */
+ force_requeue = 1;
+ __cap_set_timeouts(mdsc, ci);
+ }
+ }
+ tried_invalidate = 1;

+ goto retry_locked;
+ }
+

+ num = 0;
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ cap = rb_entry(p, struct ceph_cap, ci_node);
+ num++;
+
+ /* avoid looping forever */
+ if (mds >= cap->mds ||
+ ((flags & CHECK_CAPS_AUTHONLY) && cap != ci->i_auth_cap))
+ continue;
+
+ /* NOTE: no side-effects allowed, until we take s_mutex */
+
+ revoking = cap->implemented & ~cap->issued;
+ if (revoking)
+ dout(" mds%d revoking %s\n", cap->mds,
+ ceph_cap_string(revoking));
+
+ if (cap == ci->i_auth_cap &&
+ (cap->issued & CEPH_CAP_FILE_WR)) {
+ /* request larger max_size from MDS? */
+ if (ci->i_wanted_max_size > ci->i_max_size &&
+ ci->i_wanted_max_size > ci->i_requested_max_size) {
+ dout("requesting new max_size\n");
+ goto ack;
+ }
+
+ /* approaching file_max? */
+ if ((inode->i_size << 1) >= ci->i_max_size &&
+ (ci->i_reported_size << 1) < ci->i_max_size) {
+ dout("i_size approaching max_size\n");
+ goto ack;
+ }
+ }
+ /* flush anything dirty? */
+ if (cap == ci->i_auth_cap && (flags & CHECK_CAPS_FLUSH) &&
+ ci->i_dirty_caps) {
+ dout("flushing dirty caps\n");
+ goto ack;
+ }
+
+ /* completed revocation? going down and there are no caps? */
+ if (revoking && (revoking & used) == 0) {
+ dout("completed revocation of %s\n",
+ ceph_cap_string(cap->implemented & ~cap->issued));
+ goto ack;
+ }
+
+ /* want more caps from mds? */
+ if (want & ~(cap->mds_wanted | cap->issued))
+ goto ack;
+
+ /* things we might delay */
+ if ((cap->issued & ~retain) == 0 &&
+ cap->mds_wanted == want)
+ continue; /* nope, all good */
+
+ if (is_delayed)
+ goto ack;
+
+ /* delay? */
+ if ((ci->i_ceph_flags & CEPH_I_NODELAY) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max)) {
+ dout(" delaying issued %s -> %s, wanted %s -> %s\n",
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & retain),
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(want));
+ delayed++;
+ continue;
+ }
+
+ack:
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ dout(" skipping %p I_NOFLUSH set\n", inode);
+ continue;
+ }
+
+ if (session && session != cap->session) {
+ dout("oops, wrong session %p mutex\n", session);
+ mutex_unlock(&session->s_mutex);
+ session = NULL;
+ }
+ if (!session) {
+ session = cap->session;
+ if (mutex_trylock(&session->s_mutex) == 0) {
+ dout("inverting session/ino locks on %p\n",
+ session);
+ spin_unlock(&inode->i_lock);
+ if (took_snap_rwsem) {
+ up_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 0;
+ }
+ mutex_lock(&session->s_mutex);

+ goto retry;
+ }
+ }

+ /* take snap_rwsem after session mutex */
+ if (!took_snap_rwsem) {
+ if (down_read_trylock(&mdsc->snap_rwsem) == 0) {
+ dout("inverting snap/in locks on %p\n",
+ inode);
+ spin_unlock(&inode->i_lock);
+ down_read(&mdsc->snap_rwsem);
+ took_snap_rwsem = 1;
+ goto retry;
+ }
+ took_snap_rwsem = 1;
+ }
+
+ if (cap == ci->i_auth_cap && ci->i_dirty_caps)
+ flushing = __mark_caps_flushing(inode, session);
+
+ mds = cap->mds; /* remember mds, so we don't repeat */
+ sent++;
+
+ /* __send_cap drops i_lock */
+ delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want,
+ retain, flushing, NULL);
+ goto retry; /* retake i_lock and restart our cap scan. */
+ }
+
+ /*
+ * Reschedule delayed caps release if we delayed anything,
+ * otherwise cancel.
+ */
+ if (delayed && is_delayed)
+ force_requeue = 1; /* __send_cap delayed release; requeue */
+ if (!delayed && !is_delayed)
+ __cap_delay_cancel(mdsc, ci);
+ else if (!is_delayed || force_requeue)
+ __cap_delay_requeue(mdsc, ci);
+
+ spin_unlock(&inode->i_lock);
+
+ if (queue_invalidate)
+ ceph_queue_invalidate(inode);
+
+ if (session && drop_session_lock)
+ mutex_unlock(&session->s_mutex);
+ if (took_snap_rwsem)
+ up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Try to flush dirty caps back to the auth mds.
+ */
+static int try_flush_caps(struct inode *inode, struct ceph_mds_session *session,
+ unsigned *flush_tid)
+{
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int unlock_session = session ? 0 : 1;
+ int flushing = 0;
+
+retry:
+ spin_lock(&inode->i_lock);
+ if (ci->i_ceph_flags & CEPH_I_NOFLUSH) {
+ dout("try_flush_caps skipping %p I_NOFLUSH set\n", inode);
+ goto out;
+ }
+ if (ci->i_dirty_caps && ci->i_auth_cap) {
+ struct ceph_cap *cap = ci->i_auth_cap;
+ int used = __ceph_caps_used(ci);
+ int want = __ceph_caps_wanted(ci);
+ int delayed;
+
+ if (!session) {
+ spin_unlock(&inode->i_lock);
+ session = cap->session;
+ mutex_lock(&session->s_mutex);
+ goto retry;
+ }
+ BUG_ON(session != cap->session);
+ if (cap->session->s_state < CEPH_MDS_SESSION_OPEN)
+ goto out;
+
+ flushing = __mark_caps_flushing(inode, session);
+
+ /* __send_cap drops i_lock */
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH, used, want,
+ cap->issued | cap->implemented, flushing,
+ flush_tid);
+ if (!delayed)
+ goto out_unlocked;
+
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ }
+out:
+ spin_unlock(&inode->i_lock);
+out_unlocked:
+ if (session && unlock_session)
+ mutex_unlock(&session->s_mutex);
+ return flushing;
+}
+
+/*
+ * Return true if we've flushed caps through the given flush_tid.
+ */
+static int caps_are_flushed(struct inode *inode, unsigned tid)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int dirty, i, ret = 1;
+
+ spin_lock(&inode->i_lock);
+ dirty = __ceph_caps_dirty(ci);
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if ((ci->i_flushing_caps & (1 << i)) &&
+ ci->i_cap_flush_tid[i] <= tid) {
+ /* still flushing this bit */
+ ret = 0;
+ break;
+ }
+ spin_unlock(&inode->i_lock);

+ return ret;
+}
+
+/*

+ * Wait on any unsafe replies for the given inode. First wait on the
+ * newest request, and make that the upper bound. Then, if there are
+ * more requests, keep waiting on the oldest as long as it is still older
+ * than the original request.
+ */
+static void sync_write_wait(struct inode *inode)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct list_head *head = &ci->i_unsafe_writes;
+ struct ceph_osd_request *req;
+ u64 last_tid;
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (list_empty(head))
+ goto out;
+
+ /* set upper bound as _last_ entry in chain */
+ req = list_entry(head->prev, struct ceph_osd_request,
+ r_unsafe_item);

+ last_tid = req->r_tid;
+
+ do {

+ ceph_osdc_get_request(req);
+ spin_unlock(&ci->i_unsafe_lock);
+ dout("sync_write_wait on tid %llu (until %llu)\n",

+ req->r_tid, last_tid);
+ wait_for_completion(&req->r_safe_completion);

+ spin_lock(&ci->i_unsafe_lock);
+ ceph_osdc_put_request(req);
+
+ /*
+ * from here on look at first entry in chain, since we
+ * only want to wait for anything older than last_tid
+ */
+ if (list_empty(head))
+ break;
+ req = list_entry(head->next, struct ceph_osd_request,
+ r_unsafe_item);

+ } while (req->r_tid < last_tid);
+out:
+ spin_unlock(&ci->i_unsafe_lock);
+}

+
+int ceph_fsync(struct file *file, struct dentry *dentry, int datasync)
+{
+ struct inode *inode = dentry->d_inode;

+ struct ceph_inode_info *ci = ceph_inode(inode);

+ unsigned flush_tid;
+ int ret;
+ int dirty;
+
+ dout("fsync %p%s\n", inode, datasync ? " datasync" : "");
+ sync_write_wait(inode);
+
+ ret = filemap_write_and_wait(inode->i_mapping);
+ if (ret < 0)
+ return ret;
+
+ dirty = try_flush_caps(inode, NULL, &flush_tid);
+ dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));
+
+ /*
+ * only wait on non-file metadata writeback (the mds
+ * can recover size and mtime, so we don't need to
+ * wait for that)
+ */
+ if (!datasync && (dirty & ~CEPH_CAP_ANY_FILE_WR)) {
+ dout("fsync waiting for flush_tid %u\n", flush_tid);
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ }
+
+ dout("fsync %p%s done\n", inode, datasync ? " datasync" : "");

+ return ret;
+}
+
+/*

+ * Flush any dirty caps back to the mds. If we aren't asked to wait,
+ * queue inode for flush but don't do so immediately, because we can
+ * get by with fewer MDS messages if we wait for data writeback to
+ * complete first.
+ */
+int ceph_write_inode(struct inode *inode, int wait)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ unsigned flush_tid;

+ int err = 0;

+ int dirty;
+
+ dout("write_inode %p wait=%d\n", inode, wait);
+ if (wait) {
+ dirty = try_flush_caps(inode, NULL, &flush_tid);
+ if (dirty)
+ err = wait_event_interruptible(ci->i_cap_wq,
+ caps_are_flushed(inode, flush_tid));
+ } else {
+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+
+ spin_lock(&inode->i_lock);
+ if (__ceph_caps_dirty(ci))
+ __cap_delay_requeue_front(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }

+ return err;
+}
+
+/*

+ * After a recovering MDS goes active, we need to resend any caps
+ * we were flushing.
+ *
+ * Caller holds session->s_mutex.
+ */
+static void kick_flushing_capsnaps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_cap_snap *capsnap;
+
+ dout("kick_flushing_capsnaps mds%d\n", session->s_mds);
+ list_for_each_entry(capsnap, &session->s_cap_snaps_flushing,
+ flushing_item) {
+ struct ceph_inode_info *ci = capsnap->ci;
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ if (cap && cap->session == session) {
+ dout("kick_flushing_caps %p cap %p capsnap %p\n", inode,
+ cap, capsnap);
+ __ceph_flush_snaps(ci, &session);
+ } else {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);

+ spin_unlock(&inode->i_lock);
+ }
+ }
+}

+
+void ceph_kick_flushing_caps(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session)
+{
+ struct ceph_inode_info *ci;
+
+ kick_flushing_capsnaps(mdsc, session);
+
+ dout("kick_flushing_caps mds%d\n", session->s_mds);
+ list_for_each_entry(ci, &session->s_cap_flushing, i_flushing_item) {
+ struct inode *inode = &ci->vfs_inode;
+ struct ceph_cap *cap;
+ int delayed = 0;
+
+ spin_lock(&inode->i_lock);
+ cap = ci->i_auth_cap;
+ if (cap && cap->session == session) {
+ dout("kick_flushing_caps %p cap %p %s\n", inode,
+ cap, ceph_cap_string(ci->i_flushing_caps));
+ delayed = __send_cap(mdsc, cap, CEPH_CAP_OP_FLUSH,
+ __ceph_caps_used(ci),
+ __ceph_caps_wanted(ci),
+ cap->issued | cap->implemented,
+ ci->i_flushing_caps, NULL);
+ if (delayed) {
+ spin_lock(&inode->i_lock);
+ __cap_delay_requeue(mdsc, ci);
+ spin_unlock(&inode->i_lock);
+ }
+ } else {
+ pr_err("%p auth cap %p not mds%d ???\n", inode,
+ cap, session->s_mds);
+ spin_unlock(&inode->i_lock);

+ }
+ }
+}
+
+
+/*

+ * Take references to capabilities we hold, so that we don't release
+ * them to the MDS prematurely.
+ *
+ * Protected by i_lock.
+ */
+static void __take_cap_refs(struct ceph_inode_info *ci, int got)
+{
+ if (got & CEPH_CAP_PIN)
+ ci->i_pin_ref++;
+ if (got & CEPH_CAP_FILE_RD)
+ ci->i_rd_ref++;
+ if (got & CEPH_CAP_FILE_CACHE)
+ ci->i_rdcache_ref++;
+ if (got & CEPH_CAP_FILE_WR)
+ ci->i_wr_ref++;
+ if (got & CEPH_CAP_FILE_BUFFER) {

+ if (ci->i_wrbuffer_ref == 0)

+ igrab(&ci->vfs_inode);
+ ci->i_wrbuffer_ref++;
+ dout("__take_cap_refs %p wrbuffer %d -> %d (?)\n",
+ &ci->vfs_inode, ci->i_wrbuffer_ref-1, ci->i_wrbuffer_ref);
+ }
+}
+
+/*
+ * Try to grab cap references. Specify those refs we @want, and the
+ * minimal set we @need. Also include the larger offset we are writing
+ * to (when applicable), and check against max_size here as well.
+ * Note that caller is responsible for ensuring max_size increases are
+ * requested from the MDS.
+ */
+static int try_get_cap_refs(struct ceph_inode_info *ci, int need, int want,
+ int *got, loff_t endoff, int *check_max, int *err)
+{
+ struct inode *inode = &ci->vfs_inode;

+ int ret = 0;

+ int have, implemented;
+ int file_wanted;
+
+ dout("get_cap_refs %p need %s want %s\n", inode,
+ ceph_cap_string(need), ceph_cap_string(want));
+ spin_lock(&inode->i_lock);
+
+ /* make sure file is actually open */
+ file_wanted = __ceph_caps_file_wanted(ci);
+ if ((file_wanted & need) == 0) {
+ dout("try_get_cap_refs need %s file_wanted %s, EBADF\n",
+ ceph_cap_string(need), ceph_cap_string(file_wanted));
+ *err = -EBADF;
+ ret = 1;

+ goto out;
+ }
+

+ if (need & CEPH_CAP_FILE_WR) {
+ if (endoff >= 0 && endoff > (loff_t)ci->i_max_size) {
+ dout("get_cap_refs %p endoff %llu > maxsize %llu\n",
+ inode, endoff, ci->i_max_size);
+ if (endoff > ci->i_wanted_max_size) {
+ *check_max = 1;
+ ret = 1;
+ }
+ goto out;
+ }
+ /*
+ * If a sync write is in progress, we must wait, so that we
+ * can get a final snapshot value for size+mtime.
+ */
+ if (__ceph_have_pending_cap_snap(ci)) {
+ dout("get_cap_refs %p cap_snap_pending\n", inode);

+ goto out;
+ }
+ }

+ have = __ceph_caps_issued(ci, &implemented);
+
+ /*
+ * disallow writes while a truncate is pending
+ */
+ if (ci->i_truncate_pending)
+ have &= ~CEPH_CAP_FILE_WR;
+
+ if ((have & need) == need) {
+ /*
+ * Look at (implemented & ~have & not) so that we keep waiting
+ * on transition from wanted -> needed caps. This is needed
+ * for WRBUFFER|WR -> WR to avoid a new WR sync write from
+ * going before a prior buffered writeback happens.
+ */
+ int not = want & ~(have & need);
+ int revoking = implemented & ~have;
+ dout("get_cap_refs %p have %s but not %s (revoking %s)\n",
+ inode, ceph_cap_string(have), ceph_cap_string(not),
+ ceph_cap_string(revoking));
+ if ((revoking & not) == 0) {
+ *got = need | (have & want);
+ __take_cap_refs(ci, *got);
+ ret = 1;
+ }
+ } else {
+ dout("get_cap_refs %p have %s needed %s\n", inode,
+ ceph_cap_string(have), ceph_cap_string(need));
+ }
+out:
+ spin_unlock(&inode->i_lock);
+ dout("get_cap_refs %p ret %d got %s\n", inode,
+ ret, ceph_cap_string(*got));

+ return ret;
+}
+
+/*

+ * Check the offset we are writing up to against our current
+ * max_size. If necessary, tell the MDS we want to write to
+ * a larger offset.
+ */
+static void check_max_size(struct inode *inode, loff_t endoff)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int check = 0;
+
+ /* do we need to explicitly request a larger max_size? */
+ spin_lock(&inode->i_lock);
+ if ((endoff >= ci->i_max_size ||
+ endoff > (inode->i_size << 1)) &&
+ endoff > ci->i_wanted_max_size) {
+ dout("write %p at large endoff %llu, req max_size\n",
+ inode, endoff);
+ ci->i_wanted_max_size = endoff;
+ check = 1;
+ }
+ spin_unlock(&inode->i_lock);
+ if (check)
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+}
+
+/*
+ * Wait for caps, and take cap references. If we can't get a WR cap
+ * due to a small max_size, make sure we check_max_size (and possibly
+ * ask the mds) so we don't get hung up indefinitely.
+ */
+int ceph_get_caps(struct ceph_inode_info *ci, int need, int want, int *got,
+ loff_t endoff)
+{
+ int check_max, ret, err;
+
+retry:
+ if (endoff > 0)
+ check_max_size(&ci->vfs_inode, endoff);
+ check_max = 0;
+ err = 0;
+ ret = wait_event_interruptible(ci->i_cap_wq,
+ try_get_cap_refs(ci, need, want,
+ got, endoff,
+ &check_max, &err));
+ if (err)
+ ret = err;
+ if (check_max)
+ goto retry;

+ return ret;
+}
+
+/*

+ * Take cap refs. Caller must already know we hold at least one ref
+ * on the caps in question or we don't know this is safe.
+ */
+void ceph_get_cap_refs(struct ceph_inode_info *ci, int caps)
+{
+ spin_lock(&ci->vfs_inode.i_lock);
+ __take_cap_refs(ci, caps);
+ spin_unlock(&ci->vfs_inode.i_lock);
+}
+
+/*
+ * Release cap refs.
+ *
+ * If we released the last ref on any given cap, call ceph_check_caps
+ * to release (or schedule a release).
+ *
+ * If we are releasing a WR cap (from a sync write), finalize any affected
+ * cap_snap, and wake up any waiters.
+ */
+void ceph_put_cap_refs(struct ceph_inode_info *ci, int had)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0, put = 0, flushsnaps = 0, wake = 0;
+ struct ceph_cap_snap *capsnap;
+
+ spin_lock(&inode->i_lock);
+ if (had & CEPH_CAP_PIN)
+ --ci->i_pin_ref;
+ if (had & CEPH_CAP_FILE_RD)
+ if (--ci->i_rd_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_CACHE)
+ if (--ci->i_rdcache_ref == 0)
+ last++;
+ if (had & CEPH_CAP_FILE_BUFFER) {
+ if (--ci->i_wrbuffer_ref == 0) {
+ last++;
+ put++;
+ }
+ dout("put_cap_refs %p wrbuffer %d -> %d (?)\n",
+ inode, ci->i_wrbuffer_ref+1, ci->i_wrbuffer_ref);
+ }
+ if (had & CEPH_CAP_FILE_WR)
+ if (--ci->i_wr_ref == 0) {
+ last++;
+ if (!list_empty(&ci->i_cap_snaps)) {
+ capsnap = list_first_entry(&ci->i_cap_snaps,
+ struct ceph_cap_snap,
+ ci_item);
+ if (capsnap->writing) {
+ capsnap->writing = 0;
+ flushsnaps =
+ __ceph_finish_cap_snap(ci,
+ capsnap);
+ wake = 1;
+ }
+ }
+ }
+ spin_unlock(&inode->i_lock);
+
+ dout("put_cap_refs %p had %s %s\n", inode, ceph_cap_string(had),
+ last ? "last" : "");
+
+ if (last && !flushsnaps)
+ ceph_check_caps(ci, 0, NULL);
+ else if (flushsnaps)
+ ceph_flush_snaps(ci);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+ if (put)
+ iput(inode);
+}
+
+/*
+ * Release @nr WRBUFFER refs on dirty pages for the given @snapc snap
+ * context. Adjust per-snap dirty page accounting as appropriate.
+ * Once all dirty data for a cap_snap is flushed, flush snapped file
+ * metadata back to the MDS. If we dropped the last ref, call
+ * ceph_check_caps.
+ */
+void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0;
+ int last_snap = 0;
+ int found = 0;

+ struct ceph_cap_snap *capsnap = NULL;

+
+ spin_lock(&inode->i_lock);
+ ci->i_wrbuffer_ref -= nr;
+ last = !ci->i_wrbuffer_ref;
+
+ if (ci->i_head_snapc == snapc) {
+ ci->i_wrbuffer_ref_head -= nr;
+ if (!ci->i_wrbuffer_ref_head) {
+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;
+ }
+ dout("put_wrbuffer_cap_refs on %p head %d/%d -> %d/%d %s\n",
+ inode,
+ ci->i_wrbuffer_ref+nr, ci->i_wrbuffer_ref_head+nr,

+ ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,

+ last ? " LAST" : "");
+ } else {

+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {

+ if (capsnap->context == snapc) {
+ found = 1;
+ capsnap->dirty_pages -= nr;
+ last_snap = !capsnap->dirty_pages;
+ break;
+ }
+ }
+ BUG_ON(!found);
+ dout("put_wrbuffer_cap_refs on %p cap_snap %p "
+ " snap %lld %d/%d -> %d/%d %s%s\n",
+ inode, capsnap, capsnap->context->seq,
+ ci->i_wrbuffer_ref+nr, capsnap->dirty_pages + nr,
+ ci->i_wrbuffer_ref, capsnap->dirty_pages,
+ last ? " (wrbuffer last)" : "",
+ last_snap ? " (capsnap last)" : "");
+ }
+
+ spin_unlock(&inode->i_lock);
+
+ if (last) {
+ ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
+ iput(inode);
+ } else if (last_snap) {
+ ceph_flush_snaps(ci);
+ wake_up(&ci->i_cap_wq);
+ }
+}
+
+/*
+ * Handle a cap GRANT message from the MDS. (Note that a GRANT may
+ * actually be a revocation if it specifies a smaller cap set.)
+ *
+ * caller holds s_mutex.
+ * return value:
+ * 0 - ok
+ * 1 - check_caps on auth cap only (writeback)
+ * 2 - check_caps (ack revoke)
+ */
+static int handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap,
+ struct ceph_buffer *xattr_buf)
+ __releases(inode->i_lock)
+

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int mds = session->s_mds;
+ int seq = le32_to_cpu(grant->seq);
+ int newcaps = le32_to_cpu(grant->caps);
+ int issued, implemented, used, wanted, dirty;
+ u64 size = le64_to_cpu(grant->size);
+ u64 max_size = le64_to_cpu(grant->max_size);

+ struct timespec mtime, atime, ctime;

+ int reply = 0;
+ int wake = 0;
+ int writeback = 0;
+ int revoked_rdcache = 0;
+ int queue_invalidate = 0;
+
+ dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n",
+ inode, cap, mds, seq, ceph_cap_string(newcaps));
+ dout(" size %llu max_size %llu, i_size %llu\n", size, max_size,
+ inode->i_size);
+
+ /*
+ * If CACHE is being revoked, and we have no dirty buffers,
+ * try to invalidate (once). (If there are dirty buffers, we
+ * will invalidate _after_ writeback.)
+ */
+ if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
+ !ci->i_wrbuffer_ref) {
+ if (try_nonblocking_invalidate(inode) == 0) {
+ revoked_rdcache = 1;
+ } else {
+ /* there were locked pages.. invalidate later
+ in a separate thread. */
+ if (ci->i_rdcache_revoking != ci->i_rdcache_gen) {
+ queue_invalidate = 1;
+ ci->i_rdcache_revoking = ci->i_rdcache_gen;
+ }
+ }
+ }
+
+ /* side effects now are allowed */
+
+ issued = __ceph_caps_issued(ci, &implemented);
+ issued |= implemented | __ceph_caps_dirty(ci);
+
+ cap->cap_gen = session->s_cap_gen;
+
+ __check_cap_issue(ci, cap, newcaps);
+
+ if ((issued & CEPH_CAP_AUTH_EXCL) == 0) {
+ inode->i_mode = le32_to_cpu(grant->mode);
+ inode->i_uid = le32_to_cpu(grant->uid);
+ inode->i_gid = le32_to_cpu(grant->gid);
+ dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode,
+ inode->i_uid, inode->i_gid);
+ }
+
+ if ((issued & CEPH_CAP_LINK_EXCL) == 0)
+ inode->i_nlink = le32_to_cpu(grant->nlink);
+
+ if ((issued & CEPH_CAP_XATTR_EXCL) == 0 && grant->xattr_len) {
+ int len = le32_to_cpu(grant->xattr_len);
+ u64 version = le64_to_cpu(grant->xattr_version);
+
+ if (version > ci->i_xattrs.version) {
+ dout(" got new xattrs v%llu on %p len %d\n",
+ version, inode, len);
+ if (ci->i_xattrs.blob)
+ ceph_buffer_put(ci->i_xattrs.blob);
+ ci->i_xattrs.blob = ceph_buffer_get(xattr_buf);
+ ci->i_xattrs.version = version;
+ }
+ }
+
+ /* size/ctime/mtime/atime? */
+ ceph_fill_file_size(inode, issued,
+ le32_to_cpu(grant->truncate_seq),
+ le64_to_cpu(grant->truncate_size), size);
+ ceph_decode_timespec(&mtime, &grant->mtime);
+ ceph_decode_timespec(&atime, &grant->atime);
+ ceph_decode_timespec(&ctime, &grant->ctime);
+ ceph_fill_file_time(inode, issued,
+ le32_to_cpu(grant->time_warp_seq), &ctime, &mtime,
+ &atime);
+
+ /* max size increase? */
+ if (max_size != ci->i_max_size) {
+ dout("max_size %lld -> %llu\n", ci->i_max_size, max_size);
+ ci->i_max_size = max_size;
+ if (max_size >= ci->i_wanted_max_size) {
+ ci->i_wanted_max_size = 0; /* reset */
+ ci->i_requested_max_size = 0;
+ }
+ wake = 1;
+ }
+
+ /* check cap bits */
+ wanted = __ceph_caps_wanted(ci);
+ used = __ceph_caps_used(ci);
+ dirty = __ceph_caps_dirty(ci);
+ dout(" my wanted = %s, used = %s, dirty %s\n",
+ ceph_cap_string(wanted),
+ ceph_cap_string(used),
+ ceph_cap_string(dirty));
+ if (wanted != le32_to_cpu(grant->wanted)) {
+ dout("mds wanted %s -> %s\n",
+ ceph_cap_string(le32_to_cpu(grant->wanted)),
+ ceph_cap_string(wanted));
+ grant->wanted = cpu_to_le32(wanted);
+ }
+
+ cap->seq = seq;
+
+ /* file layout may have changed */
+ ci->i_layout = grant->layout;
+
+ /* revocation, grant, or no-op? */
+ if (cap->issued & ~newcaps) {
+ dout("revocation: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ if ((used & ~newcaps) & CEPH_CAP_FILE_BUFFER)
+ writeback = 1; /* will delay ack */
+ else if (dirty & ~newcaps)
+ reply = 1; /* initiate writeback in check_caps */
+ else if (((used & ~newcaps) & CEPH_CAP_FILE_CACHE) == 0 ||
+ revoked_rdcache)
+ reply = 2; /* send revoke ack in check_caps */
+ cap->issued = newcaps;
+ } else if (cap->issued == newcaps) {
+ dout("caps unchanged: %s -> %s\n",
+ ceph_cap_string(cap->issued), ceph_cap_string(newcaps));
+ } else {
+ dout("grant: %s -> %s\n", ceph_cap_string(cap->issued),
+ ceph_cap_string(newcaps));
+ cap->issued = newcaps;
+ cap->implemented |= newcaps; /* add bits only, to
+ * avoid stepping on a
+ * pending revocation */
+ wake = 1;

+ }
+
+ spin_unlock(&inode->i_lock);

+ if (writeback)
+ /*
+ * queue inode for writeback: we can't actually call
+ * filemap_write_and_wait, etc. from message handler
+ * context.
+ */
+ ceph_queue_writeback(inode);
+ if (queue_invalidate)
+ ceph_queue_invalidate(inode);
+ if (wake)
+ wake_up(&ci->i_cap_wq);
+ return reply;
+}
+
+/*
+ * Handle FLUSH_ACK from MDS, indicating that metadata we sent to the
+ * MDS has been safely committed.
+ */
+static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session,
+ struct ceph_cap *cap)
+ __releases(inode->i_lock)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+ unsigned seq = le32_to_cpu(m->seq);
+ int dirty = le32_to_cpu(m->dirty);
+ int cleaned = 0;
+ int drop = 0;
+ int i;
+
+ for (i = 0; i < CEPH_CAP_BITS; i++)
+ if ((dirty & (1 << i)) &&
+ flush_tid == ci->i_cap_flush_tid[i])
+ cleaned |= 1 << i;
+
+ dout("handle_cap_flush_ack inode %p mds%d seq %d on %s cleaned %s,"
+ " flushing %s -> %s\n",
+ inode, session->s_mds, seq, ceph_cap_string(dirty),
+ ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
+ ceph_cap_string(ci->i_flushing_caps & ~cleaned));
+
+ if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned))
+ goto out;
+
+ ci->i_flushing_caps &= ~cleaned;
+
+ spin_lock(&mdsc->cap_dirty_lock);
+ if (ci->i_flushing_caps == 0) {
+ list_del_init(&ci->i_flushing_item);
+ if (!list_empty(&session->s_cap_flushing))
+ dout(" mds%d still flushing cap on %p\n",
+ session->s_mds,
+ &list_entry(session->s_cap_flushing.next,
+ struct ceph_inode_info,
+ i_flushing_item)->vfs_inode);
+ mdsc->num_cap_flushing--;
+ wake_up(&mdsc->cap_flushing_wq);
+ dout(" inode %p now !flushing\n", inode);
+
+ if (ci->i_dirty_caps == 0) {
+ dout(" inode %p now clean\n", inode);
+ BUG_ON(!list_empty(&ci->i_dirty_item));
+ drop = 1;
+ } else {
+ BUG_ON(list_empty(&ci->i_dirty_item));
+ }
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ wake_up(&ci->i_cap_wq);
+
+out:
+ spin_unlock(&inode->i_lock);
+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle FLUSHSNAP_ACK. MDS has flushed snap data to disk and we can
+ * throw away our cap_snap.
+ *
+ * Caller hold s_mutex.
+ */
+static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
+ struct ceph_mds_caps *m,
+ struct ceph_mds_session *session)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ u64 follows = le64_to_cpu(m->snap_follows);
+ struct ceph_cap_snap *capsnap;
+ int drop = 0;
+
+ dout("handle_cap_flushsnap_ack inode %p ci %p mds%d follows %lld\n",
+ inode, ci, session->s_mds, follows);
+
+ spin_lock(&inode->i_lock);

+ list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {

+ if (capsnap->follows == follows) {
+ if (capsnap->flush_tid != flush_tid) {
+ dout(" cap_snap %p follows %lld tid %lld !="
+ " %lld\n", capsnap, follows,
+ flush_tid, capsnap->flush_tid);
+ break;
+ }
+ WARN_ON(capsnap->dirty_pages || capsnap->writing);
+ dout(" removing cap_snap %p follows %lld\n",
+ capsnap, follows);
+ ceph_put_snap_context(capsnap->context);
+ list_del(&capsnap->ci_item);
+ list_del(&capsnap->flushing_item);
+ ceph_put_cap_snap(capsnap);
+ drop = 1;
+ break;
+ } else {
+ dout(" skipping cap_snap %p follows %lld\n",
+ capsnap, capsnap->follows);

+ }
+ }
+ spin_unlock(&inode->i_lock);

+ if (drop)
+ iput(inode);
+}
+
+/*
+ * Handle TRUNC from MDS, indicating file truncation.
+ *
+ * caller hold s_mutex.
+ */
+static void handle_cap_trunc(struct inode *inode,
+ struct ceph_mds_caps *trunc,
+ struct ceph_mds_session *session)
+ __releases(inode->i_lock)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int mds = session->s_mds;
+ int seq = le32_to_cpu(trunc->seq);
+ u32 truncate_seq = le32_to_cpu(trunc->truncate_seq);
+ u64 truncate_size = le64_to_cpu(trunc->truncate_size);
+ u64 size = le64_to_cpu(trunc->size);
+ int implemented = 0;
+ int dirty = __ceph_caps_dirty(ci);
+ int issued = __ceph_caps_issued(ceph_inode(inode), &implemented);
+ int queue_trunc = 0;
+
+ issued |= implemented | dirty;
+
+ dout("handle_cap_trunc inode %p mds%d seq %d to %lld seq %d\n",
+ inode, mds, seq, truncate_size, truncate_seq);
+ queue_trunc = ceph_fill_file_size(inode, issued,
+ truncate_seq, truncate_size, size);
+ spin_unlock(&inode->i_lock);
+
+ if (queue_trunc)
+ ceph_queue_vmtruncate(inode);
+}
+
+/*
+ * Handle EXPORT from MDS. Cap is being migrated _from_ this mds to a
+ * different one. If we are the most recent migration we've seen (as
+ * indicated by mseq), make note of the migrating cap bits for the
+ * duration (until we see the corresponding IMPORT).
+ *
+ * caller holds s_mutex
+ */
+static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
+ struct ceph_mds_session *session)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int mds = session->s_mds;
+ unsigned mseq = le32_to_cpu(ex->migrate_seq);
+ struct ceph_cap *cap = NULL, *t;
+ struct rb_node *p;
+ int remember = 1;
+
+ dout("handle_cap_export inode %p ci %p mds%d mseq %d\n",
+ inode, ci, mds, mseq);

+
+ spin_lock(&inode->i_lock);
+

+ /* make sure we haven't seen a higher mseq */
+ for (p = rb_first(&ci->i_caps); p; p = rb_next(p)) {
+ t = rb_entry(p, struct ceph_cap, ci_node);
+ if (ceph_seq_cmp(t->mseq, mseq) > 0) {
+ dout(" higher mseq on cap from mds%d\n",
+ t->session->s_mds);
+ remember = 0;
+ }
+ if (t->session->s_mds == mds)
+ cap = t;
+ }
+
+ if (cap) {
+ if (remember) {
+ /* make note */
+ ci->i_cap_exporting_mds = mds;
+ ci->i_cap_exporting_mseq = mseq;
+ ci->i_cap_exporting_issued = cap->issued;
+ }
+ __ceph_remove_cap(cap);
+ } else {
+ WARN_ON(!cap);

+ }
+
+ spin_unlock(&inode->i_lock);

+}
+
+/*
+ * Handle cap IMPORT. If there are temp bits from an older EXPORT,
+ * clean them up.
+ *
+ * caller holds s_mutex.
+ */
+static void handle_cap_import(struct ceph_mds_client *mdsc,
+ struct inode *inode, struct ceph_mds_caps *im,
+ struct ceph_mds_session *session,
+ void *snaptrace, int snaptrace_len)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ int mds = session->s_mds;
+ unsigned issued = le32_to_cpu(im->caps);
+ unsigned wanted = le32_to_cpu(im->wanted);
+ unsigned seq = le32_to_cpu(im->seq);
+ unsigned mseq = le32_to_cpu(im->migrate_seq);
+ u64 realmino = le64_to_cpu(im->realm);
+ u64 cap_id = le64_to_cpu(im->cap_id);
+
+ if (ci->i_cap_exporting_mds >= 0 &&
+ ceph_seq_cmp(ci->i_cap_exporting_mseq, mseq) < 0) {
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d"
+ " - cleared exporting from mds%d\n",
+ inode, ci, mds, mseq,
+ ci->i_cap_exporting_mds);
+ ci->i_cap_exporting_issued = 0;
+ ci->i_cap_exporting_mseq = 0;
+ ci->i_cap_exporting_mds = -1;
+ } else {
+ dout("handle_cap_import inode %p ci %p mds%d mseq %d\n",
+ inode, ci, mds, mseq);
+ }
+
+ down_write(&mdsc->snap_rwsem);
+ ceph_update_snap_trace(mdsc, snaptrace, snaptrace+snaptrace_len,
+ false);
+ downgrade_write(&mdsc->snap_rwsem);
+ ceph_add_cap(inode, session, cap_id, -1,
+ issued, wanted, seq, mseq, realmino, CEPH_CAP_FLAG_AUTH,
+ NULL /* no caps context */);
+ try_flush_caps(inode, session, NULL);
+ up_read(&mdsc->snap_rwsem);
+}
+
+/*
+ * Handle a caps message from the MDS.
+ *
+ * Identify the appropriate session, inode, and call the right handler
+ * based on the cap op.
+ */
+void ceph_handle_caps(struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{
+ struct ceph_mds_client *mdsc = session->s_mdsc;
+ struct super_block *sb = mdsc->client->sb;
+ struct inode *inode;
+ struct ceph_cap *cap;
+ struct ceph_mds_caps *h;
+ int mds = session->s_mds;
+ int op;
+ u32 seq;
+ struct ceph_vino vino;
+ u64 cap_id;
+ u64 size, max_size;
+ u64 tid;
+ int check_caps = 0;
+ void *snaptrace;
+ int r;
+
+ dout("handle_caps from mds%d\n", mds);
+
+ /* decode */

+ tid = le64_to_cpu(msg->hdr.tid);

+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;
+ h = msg->front.iov_base;
+ snaptrace = h + 1;
+ op = le32_to_cpu(h->op);
+ vino.ino = le64_to_cpu(h->ino);
+ vino.snap = CEPH_NOSNAP;
+ cap_id = le64_to_cpu(h->cap_id);
+ seq = le32_to_cpu(h->seq);
+ size = le64_to_cpu(h->size);
+ max_size = le64_to_cpu(h->max_size);
+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;
+ dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
+ (unsigned)seq);
+
+ /* lookup ino */
+ inode = ceph_find_inode(sb, vino);
+ dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
+ vino.snap, inode);
+ if (!inode) {
+ dout(" i don't have ino %llx\n", vino.ino);

+ goto done;
+ }
+

+ /* these will work even if we don't have a cap yet */
+ switch (op) {
+ case CEPH_CAP_OP_FLUSHSNAP_ACK:
+ handle_cap_flushsnap_ack(inode, tid, h, session);
+ goto done;
+
+ case CEPH_CAP_OP_EXPORT:
+ handle_cap_export(inode, h, session);
+ goto done;
+
+ case CEPH_CAP_OP_IMPORT:
+ handle_cap_import(mdsc, inode, h, session,
+ snaptrace, le32_to_cpu(h->snap_trace_len));
+ check_caps = 1; /* we may have sent a RELEASE to the old auth */

+ goto done;
+ }
+

+ /* the rest require a cap */
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ceph_inode(inode), mds);
+ if (!cap) {
+ dout("no cap on %p ino %llx.%llx from mds%d, releasing\n",
+ inode, ceph_ino(inode), ceph_snap(inode), mds);
+ spin_unlock(&inode->i_lock);

+ goto done;
+ }
+

+ /* note that each of these drops i_lock for us */
+ switch (op) {
+ case CEPH_CAP_OP_REVOKE:
+ case CEPH_CAP_OP_GRANT:
+ r = handle_cap_grant(inode, h, session, cap, msg->middle);
+ if (r == 1)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_NODELAY|CHECK_CAPS_AUTHONLY,
+ session);
+ else if (r == 2)
+ ceph_check_caps(ceph_inode(inode),
+ CHECK_CAPS_NODELAY,
+ session);
+ break;
+
+ case CEPH_CAP_OP_FLUSH_ACK:
+ handle_cap_flush_ack(inode, tid, h, session, cap);
+ break;
+
+ case CEPH_CAP_OP_TRUNC:
+ handle_cap_trunc(inode, h, session);

+ break;
+
+ default:

+ spin_unlock(&inode->i_lock);
+ pr_err("ceph_handle_caps: unknown cap op %d %s\n", op,
+ ceph_cap_op_name(op));
+ }
+
+done:
+ mutex_unlock(&session->s_mutex);
+
+ if (check_caps)
+ ceph_check_caps(ceph_inode(inode), CHECK_CAPS_NODELAY, NULL);
+ if (inode)
+ iput(inode);
+ return;
+
+bad:
+ pr_err("ceph_handle_caps: corrupt message\n");
+ ceph_msg_dump(msg);
+ return;
+}
+
+/*
+ * Delayed work handler to process end of delayed cap release LRU list.
+ */
+void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci;
+ int flags = CHECK_CAPS_NODELAY;
+
+ dout("check_delayed_caps\n");
+ while (1) {
+ spin_lock(&mdsc->cap_delay_lock);
+ if (list_empty(&mdsc->cap_delay_list))
+ break;
+ ci = list_first_entry(&mdsc->cap_delay_list,
+ struct ceph_inode_info,
+ i_cap_delay_list);
+ if ((ci->i_ceph_flags & CEPH_I_FLUSH) == 0 &&
+ time_before(jiffies, ci->i_hold_caps_max))
+ break;
+ list_del_init(&ci->i_cap_delay_list);
+ spin_unlock(&mdsc->cap_delay_lock);
+ dout("check_delayed_caps on %p\n", &ci->vfs_inode);
+ ceph_check_caps(ci, flags, NULL);
+ }
+ spin_unlock(&mdsc->cap_delay_lock);
+}
+
+/*
+ * Flush all dirty caps to the mds
+ */
+void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
+{
+ struct ceph_inode_info *ci, *nci = NULL;
+ struct inode *inode, *ninode = NULL;
+ struct list_head *p, *n;
+
+ dout("flush_dirty_caps\n");
+ spin_lock(&mdsc->cap_dirty_lock);
+ list_for_each_safe(p, n, &mdsc->cap_dirty) {
+ if (nci) {
+ ci = nci;
+ inode = ninode;
+ ci->i_ceph_flags &= ~CEPH_I_NOFLUSH;
+ dout("flush_dirty_caps inode %p (was next inode)\n",
+ inode);
+ } else {
+ ci = list_entry(p, struct ceph_inode_info,
+ i_dirty_item);
+ inode = igrab(&ci->vfs_inode);
+ BUG_ON(!inode);
+ dout("flush_dirty_caps inode %p\n", inode);
+ }
+ if (n != &mdsc->cap_dirty) {
+ nci = list_entry(n, struct ceph_inode_info,
+ i_dirty_item);
+ ninode = igrab(&nci->vfs_inode);
+ BUG_ON(!ninode);
+ nci->i_ceph_flags |= CEPH_I_NOFLUSH;
+ dout("flush_dirty_caps next inode %p, noflush\n",
+ ninode);
+ } else {
+ nci = NULL;
+ ninode = NULL;
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+ if (inode) {
+ ceph_check_caps(ci, CHECK_CAPS_NODELAY|CHECK_CAPS_FLUSH,
+ NULL);
+ iput(inode);
+ }
+ spin_lock(&mdsc->cap_dirty_lock);
+ }
+ spin_unlock(&mdsc->cap_dirty_lock);
+}
+
+/*
+ * Drop open file reference. If we were the last open file,
+ * we may need to release capabilities to the MDS (or schedule
+ * their delayed release).
+ */
+void ceph_put_fmode(struct ceph_inode_info *ci, int fmode)
+{
+ struct inode *inode = &ci->vfs_inode;
+ int last = 0;
+
+ spin_lock(&inode->i_lock);
+ dout("put_fmode %p fmode %d %d -> %d\n", inode, fmode,
+ ci->i_nr_by_mode[fmode], ci->i_nr_by_mode[fmode]-1);
+ BUG_ON(ci->i_nr_by_mode[fmode] == 0);
+ if (--ci->i_nr_by_mode[fmode] == 0)
+ last++;
+ spin_unlock(&inode->i_lock);
+
+ if (last && ci->i_vino.snap == CEPH_NOSNAP)
+ ceph_check_caps(ci, 0, NULL);
+}
+
+/*
+ * Helpers for embedding cap and dentry lease releases into mds
+ * requests.
+ *
+ * @force is used by dentry_release (below) to force inclusion of a
+ * record for the directory inode, even when there aren't any caps to
+ * drop.
+ */
+int ceph_encode_inode_release(void **p, struct inode *inode,

+ int mds, int drop, int unless, int force)

+{
+ struct ceph_inode_info *ci = ceph_inode(inode);

+ struct ceph_cap *cap;
+ struct ceph_mds_request_release *rel = *p;

+ int ret = 0;
+

+ dout("encode_inode_release %p mds%d drop %s unless %s\n", inode,
+ mds, ceph_cap_string(drop), ceph_cap_string(unless));
+
+ spin_lock(&inode->i_lock);
+ cap = __get_cap_for_mds(ci, mds);
+ if (cap && __cap_is_valid(cap)) {
+ if (force ||
+ ((cap->issued & drop) &&
+ (cap->issued & unless) == 0)) {
+ if ((cap->issued & drop) &&
+ (cap->issued & unless) == 0) {
+ dout("encode_inode_release %p cap %p %s -> "
+ "%s\n", inode, cap,
+ ceph_cap_string(cap->issued),
+ ceph_cap_string(cap->issued & ~drop));
+ cap->issued &= ~drop;
+ cap->implemented &= ~drop;
+ if (ci->i_ceph_flags & CEPH_I_NODELAY) {
+ int wanted = __ceph_caps_wanted(ci);
+ dout(" wanted %s -> %s (act %s)\n",
+ ceph_cap_string(cap->mds_wanted),
+ ceph_cap_string(cap->mds_wanted &
+ ~wanted),
+ ceph_cap_string(wanted));
+ cap->mds_wanted &= wanted;
+ }
+ } else {
+ dout("encode_inode_release %p cap %p %s"
+ " (force)\n", inode, cap,
+ ceph_cap_string(cap->issued));
+ }
+
+ rel->ino = cpu_to_le64(ceph_ino(inode));
+ rel->cap_id = cpu_to_le64(cap->cap_id);
+ rel->seq = cpu_to_le32(cap->seq);
+ rel->issue_seq = cpu_to_le32(cap->issue_seq),
+ rel->mseq = cpu_to_le32(cap->mseq);
+ rel->caps = cpu_to_le32(cap->issued);
+ rel->wanted = cpu_to_le32(cap->mds_wanted);
+ rel->dname_len = 0;
+ rel->dname_seq = 0;
+ *p += sizeof(*rel);
+ ret = 1;
+ } else {
+ dout("encode_inode_release %p cap %p %s\n",
+ inode, cap, ceph_cap_string(cap->issued));

+ }
+ }
+ spin_unlock(&inode->i_lock);

+ return ret;
+}
+

+int ceph_encode_dentry_release(void **p, struct dentry *dentry,

+ int mds, int drop, int unless)

+{
+ struct inode *dir = dentry->d_parent->d_inode;

+ struct ceph_mds_request_release *rel = *p;

+ struct ceph_dentry_info *di = ceph_dentry(dentry);

+ int force = 0;
+ int ret;
+
+ /*
+ * force an record for the directory caps if we have a dentry lease.
+ * this is racy (can't take i_lock and d_lock together), but it
+ * doesn't have to be perfect; the mds will revoke anything we don't
+ * release.
+ */
+ spin_lock(&dentry->d_lock);
+ if (di->lease_session && di->lease_session->s_mds == mds)
+ force = 1;
+ spin_unlock(&dentry->d_lock);
+
+ ret = ceph_encode_inode_release(p, dir, mds, drop, unless, force);
+
+ spin_lock(&dentry->d_lock);
+ if (ret && di->lease_session && di->lease_session->s_mds == mds) {
+ dout("encode_dentry_release %p mds%d seq %d\n",
+ dentry, mds, (int)di->lease_seq);
+ rel->dname_len = cpu_to_le32(dentry->d_name.len);
+ memcpy(*p, dentry->d_name.name, dentry->d_name.len);
+ *p += dentry->d_name.len;
+ rel->dname_seq = cpu_to_le32(di->lease_seq);
+ }
+ spin_unlock(&dentry->d_lock);
+ return ret;
+}

Sage Weil

unread,

Mar 2, 2010, 8:00:02 PM3/2/10

A generic message passing library is used to communicate with all
other components in the Ceph file system. The messenger library
provides ordered, reliable delivery of messages between two nodes in
the system.

This implementation is based on TCP.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/decode.h | 194 +++++
fs/ceph/messenger.c | 2240 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/messenger.h | 254 ++++++
3 files changed, 2688 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/decode.h
create mode 100644 fs/ceph/messenger.c
create mode 100644 fs/ceph/messenger.h

diff --git a/fs/ceph/decode.h b/fs/ceph/decode.h
new file mode 100644
index 0000000..65b3e02
--- /dev/null
+++ b/fs/ceph/decode.h
@@ -0,0 +1,194 @@
+#ifndef __CEPH_DECODE_H
+#define __CEPH_DECODE_H
+
+#include <asm/unaligned.h>
+#include <linux/time.h>
+
+#include "types.h"
+
+/*
+ * in all cases,
+ * void **p pointer to position pointer
+ * void *end pointer to end of buffer (last byte + 1)
+ */
+
+static inline u64 ceph_decode_64(void **p)
+{
+ u64 v = get_unaligned_le64(*p);
+ *p += sizeof(u64);
+ return v;
+}
+static inline u32 ceph_decode_32(void **p)
+{
+ u32 v = get_unaligned_le32(*p);
+ *p += sizeof(u32);
+ return v;
+}
+static inline u16 ceph_decode_16(void **p)
+{
+ u16 v = get_unaligned_le16(*p);
+ *p += sizeof(u16);
+ return v;
+}
+static inline u8 ceph_decode_8(void **p)
+{
+ u8 v = *(u8 *)*p;
+ (*p)++;
+ return v;
+}
+static inline void ceph_decode_copy(void **p, void *pv, size_t n)
+{
+ memcpy(pv, *p, n);
+ *p += n;
+}
+
+/*
+ * bounds check input.
+ */
+#define ceph_decode_need(p, end, n, bad) \
+ do { \
+ if (unlikely(*(p) + (n) > (end))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_decode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u64), bad); \
+ v = ceph_decode_64(p); \
+ } while (0)
+#define ceph_decode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u32), bad); \
+ v = ceph_decode_32(p); \
+ } while (0)
+#define ceph_decode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u16), bad); \
+ v = ceph_decode_16(p); \
+ } while (0)
+#define ceph_decode_8_safe(p, end, v, bad) \
+ do { \
+ ceph_decode_need(p, end, sizeof(u8), bad); \
+ v = ceph_decode_8(p); \
+ } while (0)
+
+#define ceph_decode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_decode_need(p, end, n, bad); \
+ ceph_decode_copy(p, pv, n); \
+ } while (0)
+
+/*
+ * struct ceph_timespec <-> struct timespec
+ */
+static inline void ceph_decode_timespec(struct timespec *ts,
+ const struct ceph_timespec *tv)
+{
+ ts->tv_sec = le32_to_cpu(tv->tv_sec);
+ ts->tv_nsec = le32_to_cpu(tv->tv_nsec);
+}
+static inline void ceph_encode_timespec(struct ceph_timespec *tv,
+ const struct timespec *ts)
+{
+ tv->tv_sec = cpu_to_le32(ts->tv_sec);
+ tv->tv_nsec = cpu_to_le32(ts->tv_nsec);
+}
+
+/*
+ * sockaddr_storage <-> ceph_sockaddr
+ */
+static inline void ceph_encode_addr(struct ceph_entity_addr *a)
+{
+ a->in_addr.ss_family = htons(a->in_addr.ss_family);
+}
+static inline void ceph_decode_addr(struct ceph_entity_addr *a)
+{
+ a->in_addr.ss_family = ntohs(a->in_addr.ss_family);
+ WARN_ON(a->in_addr.ss_family == 512);
+}
+
+/*
+ * encoders
+ */
+static inline void ceph_encode_64(void **p, u64 v)
+{
+ put_unaligned_le64(v, (__le64 *)*p);
+ *p += sizeof(u64);
+}
+static inline void ceph_encode_32(void **p, u32 v)
+{
+ put_unaligned_le32(v, (__le32 *)*p);
+ *p += sizeof(u32);
+}
+static inline void ceph_encode_16(void **p, u16 v)
+{
+ put_unaligned_le16(v, (__le16 *)*p);
+ *p += sizeof(u16);
+}
+static inline void ceph_encode_8(void **p, u8 v)
+{
+ *(u8 *)*p = v;
+ (*p)++;
+}
+static inline void ceph_encode_copy(void **p, const void *s, int len)
+{
+ memcpy(*p, s, len);
+ *p += len;
+}
+
+/*
+ * filepath, string encoders
+ */
+static inline void ceph_encode_filepath(void **p, void *end,
+ u64 ino, const char *path)
+{
+ u32 len = path ? strlen(path) : 0;
+ BUG_ON(*p + sizeof(ino) + sizeof(len) + len > end);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, ino);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, path, len);

+ *p += len;
+}

+
+static inline void ceph_encode_string(void **p, void *end,
+ const char *s, u32 len)
+{
+ BUG_ON(*p + sizeof(len) + len > end);
+ ceph_encode_32(p, len);
+ if (len)
+ memcpy(*p, s, len);

+ *p += len;
+}

+
+#define ceph_encode_need(p, end, n, bad) \
+ do { \
+ if (unlikely(*(p) + (n) > (end))) \
+ goto bad; \
+ } while (0)
+
+#define ceph_encode_64_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u64), bad); \
+ ceph_encode_64(p, v); \
+ } while (0)
+#define ceph_encode_32_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u32), bad); \
+ ceph_encode_32(p, v); \
+ } while (0)
+#define ceph_encode_16_safe(p, end, v, bad) \
+ do { \
+ ceph_encode_need(p, end, sizeof(u16), bad); \
+ ceph_encode_16(p, v); \
+ } while (0)
+
+#define ceph_encode_copy_safe(p, end, pv, n, bad) \
+ do { \
+ ceph_encode_need(p, end, n, bad); \
+ ceph_encode_copy(p, pv, n); \
+ } while (0)
+
+
+#endif
diff --git a/fs/ceph/messenger.c b/fs/ceph/messenger.c
new file mode 100644
index 0000000..781656a
--- /dev/null
+++ b/fs/ceph/messenger.c
@@ -0,0 +1,2240 @@
+#include "ceph_debug.h"
+
+#include <linux/crc32c.h>
+#include <linux/ctype.h>
+#include <linux/highmem.h>
+#include <linux/inet.h>
+#include <linux/kthread.h>
+#include <linux/net.h>
+#include <linux/socket.h>
+#include <linux/string.h>
+#include <net/tcp.h>
+
+#include "super.h"
+#include "messenger.h"
+#include "decode.h"
+#include "pagelist.h"
+
+/*
+ * Ceph uses the messenger to exchange ceph_msg messages with other
+ * hosts in the system. The messenger provides ordered and reliable
+ * delivery. We tolerate TCP disconnects by reconnecting (with
+ * exponential backoff) in the case of a fault (disconnection, bad
+ * crc, protocol error). Acks allow sent messages to be discarded by
+ * the sender.
+ */
+
+/* static tag bytes (protocol control messages) */
+static char tag_msg = CEPH_MSGR_TAG_MSG;
+static char tag_ack = CEPH_MSGR_TAG_ACK;
+static char tag_keepalive = CEPH_MSGR_TAG_KEEPALIVE;
+
+
+static void queue_con(struct ceph_connection *con);
+static void con_work(struct work_struct *);
+static void ceph_fault(struct ceph_connection *con);
+
+const char *ceph_name_type_str(int t)
+{
+ switch (t) {

+ case CEPH_ENTITY_TYPE_MON: return "mon";
+ case CEPH_ENTITY_TYPE_MDS: return "mds";
+ case CEPH_ENTITY_TYPE_OSD: return "osd";
+ case CEPH_ENTITY_TYPE_CLIENT: return "client";
+ case CEPH_ENTITY_TYPE_ADMIN: return "admin";

+ default: return "???";
+ }
+}
+
+/*
+ * nicely render a sockaddr as a string.
+ */
+#define MAX_ADDR_STR 20
+static char addr_str[MAX_ADDR_STR][40];
+static DEFINE_SPINLOCK(addr_str_lock);
+static int last_addr_str;
+
+const char *pr_addr(const struct sockaddr_storage *ss)

+{
+ int i;
+ char *s;

+ struct sockaddr_in *in4 = (void *)ss;
+ unsigned char *quad = (void *)&in4->sin_addr.s_addr;
+ struct sockaddr_in6 *in6 = (void *)ss;
+
+ spin_lock(&addr_str_lock);
+ i = last_addr_str++;
+ if (last_addr_str == MAX_ADDR_STR)
+ last_addr_str = 0;
+ spin_unlock(&addr_str_lock);
+ s = addr_str[i];
+
+ switch (ss->ss_family) {
+ case AF_INET:
+ sprintf(s, "%u.%u.%u.%u:%u",
+ (unsigned int)quad[0],
+ (unsigned int)quad[1],
+ (unsigned int)quad[2],
+ (unsigned int)quad[3],
+ (unsigned int)ntohs(in4->sin_port));
+ break;
+
+ case AF_INET6:
+ sprintf(s, "%04x:%04x:%04x:%04x:%04x:%04x:%04x:%04x:%u",
+ in6->sin6_addr.s6_addr16[0],
+ in6->sin6_addr.s6_addr16[1],
+ in6->sin6_addr.s6_addr16[2],
+ in6->sin6_addr.s6_addr16[3],
+ in6->sin6_addr.s6_addr16[4],
+ in6->sin6_addr.s6_addr16[5],
+ in6->sin6_addr.s6_addr16[6],
+ in6->sin6_addr.s6_addr16[7],
+ (unsigned int)ntohs(in6->sin6_port));

+ break;
+
+ default:

+ sprintf(s, "(unknown sockaddr family %d)", (int)ss->ss_family);
+ }
+

+ return s;
+}
+

+static void encode_my_addr(struct ceph_messenger *msgr)
+{
+ memcpy(&msgr->my_enc_addr, &msgr->inst.addr, sizeof(msgr->my_enc_addr));
+ ceph_encode_addr(&msgr->my_enc_addr);
+}
+
+/*
+ * work queue for all reading and writing to/from the socket.
+ */
+struct workqueue_struct *ceph_msgr_wq;
+
+int __init ceph_msgr_init(void)
+{
+ ceph_msgr_wq = create_workqueue("ceph-msgr");
+ if (IS_ERR(ceph_msgr_wq)) {
+ int ret = PTR_ERR(ceph_msgr_wq);
+ pr_err("msgr_init failed to create workqueue: %d\n", ret);
+ ceph_msgr_wq = NULL;
+ return ret;
+ }

+ return 0;
+}
+

+void ceph_msgr_exit(void)
+{
+ destroy_workqueue(ceph_msgr_wq);
+}
+
+/*
+ * socket callback functions
+ */
+
+/* data available on socket, or listen socket received a connect */
+static void ceph_data_ready(struct sock *sk, int count_unused)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+ if (sk->sk_state != TCP_CLOSE_WAIT) {
+ dout("ceph_data_ready on %p state = %lu, queueing work\n",
+ con, con->state);
+ queue_con(con);
+ }
+}
+
+/* socket has buffer space for writing */
+static void ceph_write_space(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ /* only queue to workqueue if there is data we want to write. */
+ if (test_bit(WRITE_PENDING, &con->state)) {
+ dout("ceph_write_space %p queueing write work\n", con);
+ queue_con(con);
+ } else {
+ dout("ceph_write_space %p nothing to write\n", con);
+ }
+
+ /* since we have our own write_space, clear the SOCK_NOSPACE flag */
+ clear_bit(SOCK_NOSPACE, &sk->sk_socket->flags);
+}
+
+/* socket's state has changed */
+static void ceph_state_change(struct sock *sk)
+{
+ struct ceph_connection *con =
+ (struct ceph_connection *)sk->sk_user_data;
+
+ dout("ceph_state_change %p state = %lu sk_state = %u\n",
+ con, con->state, sk->sk_state);
+
+ if (test_bit(CLOSED, &con->state))
+ return;
+
+ switch (sk->sk_state) {
+ case TCP_CLOSE:
+ dout("ceph_state_change TCP_CLOSE\n");
+ case TCP_CLOSE_WAIT:
+ dout("ceph_state_change TCP_CLOSE_WAIT\n");
+ if (test_and_set_bit(SOCK_CLOSED, &con->state) == 0) {
+ if (test_bit(CONNECTING, &con->state))
+ con->error_msg = "connection failed";
+ else
+ con->error_msg = "socket closed";
+ queue_con(con);
+ }
+ break;
+ case TCP_ESTABLISHED:
+ dout("ceph_state_change TCP_ESTABLISHED\n");
+ queue_con(con);

+ break;
+ }
+}
+

+/*
+ * set up socket callbacks
+ */
+static void set_sock_callbacks(struct socket *sock,

+ struct ceph_connection *con)
+{

+ struct sock *sk = sock->sk;
+ sk->sk_user_data = (void *)con;
+ sk->sk_data_ready = ceph_data_ready;
+ sk->sk_write_space = ceph_write_space;
+ sk->sk_state_change = ceph_state_change;
+}
+
+
+/*
+ * socket helpers
+ */
+
+/*
+ * initiate connection to a remote socket.
+ */
+static struct socket *ceph_tcp_connect(struct ceph_connection *con)
+{
+ struct sockaddr *paddr = (struct sockaddr *)&con->peer_addr.in_addr;
+ struct socket *sock;
+ int ret;
+
+ BUG_ON(con->sock);
+ ret = sock_create_kern(AF_INET, SOCK_STREAM, IPPROTO_TCP, &sock);
+ if (ret)
+ return ERR_PTR(ret);
+ con->sock = sock;
+ sock->sk->sk_allocation = GFP_NOFS;
+
+ set_sock_callbacks(sock, con);
+
+ dout("connect %s\n", pr_addr(&con->peer_addr.in_addr));
+
+ ret = sock->ops->connect(sock, paddr, sizeof(*paddr), O_NONBLOCK);
+ if (ret == -EINPROGRESS) {
+ dout("connect %s EINPROGRESS sk_state = %u\n",
+ pr_addr(&con->peer_addr.in_addr),
+ sock->sk->sk_state);

+ ret = 0;
+ }

+ if (ret < 0) {
+ pr_err("connect %s error %d\n",
+ pr_addr(&con->peer_addr.in_addr), ret);
+ sock_release(sock);
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ }
+
+ if (ret < 0)
+ return ERR_PTR(ret);
+ return sock;
+}
+
+static int ceph_tcp_recvmsg(struct socket *sock, void *buf, size_t len)
+{
+ struct kvec iov = {buf, len};
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ return kernel_recvmsg(sock, &msg, &iov, 1, len, msg.msg_flags);
+}
+
+/*
+ * write something. @more is true if caller will be sending more data
+ * shortly.
+ */
+static int ceph_tcp_sendmsg(struct socket *sock, struct kvec *iov,
+ size_t kvlen, size_t len, int more)
+{
+ struct msghdr msg = { .msg_flags = MSG_DONTWAIT | MSG_NOSIGNAL };
+
+ if (more)
+ msg.msg_flags |= MSG_MORE;
+ else
+ msg.msg_flags |= MSG_EOR; /* superfluous, but what the hell */
+
+ return kernel_sendmsg(sock, &msg, iov, kvlen, len);
+}
+
+
+/*
+ * Shutdown/close the socket for the given connection.
+ */
+static int con_close_socket(struct ceph_connection *con)

+{
+ int rc;
+

+ dout("con_close_socket on %p sock %p\n", con, con->sock);
+ if (!con->sock)
+ return 0;
+ set_bit(SOCK_CLOSED, &con->state);
+ rc = con->sock->ops->shutdown(con->sock, SHUT_RDWR);
+ sock_release(con->sock);
+ con->sock = NULL;
+ clear_bit(SOCK_CLOSED, &con->state);
+ return rc;
+}
+
+/*
+ * Reset a connection. Discard all incoming and outgoing messages
+ * and clear *_seq state.
+ */
+static void ceph_msg_remove(struct ceph_msg *msg)
+{
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+}
+static void ceph_msg_remove_list(struct list_head *head)
+{
+ while (!list_empty(head)) {
+ struct ceph_msg *msg = list_first_entry(head, struct ceph_msg,
+ list_head);
+ ceph_msg_remove(msg);
+ }
+}
+
+static void reset_connection(struct ceph_connection *con)
+{
+ /* reset connection, out_queue, msg_ and connect_seq */
+ /* discard existing out_queue and msg_seq */
+ ceph_msg_remove_list(&con->out_queue);
+ ceph_msg_remove_list(&con->out_sent);
+
+ if (con->in_msg) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+
+ con->connect_seq = 0;
+ con->out_seq = 0;
+ if (con->out_msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ con->in_seq = 0;
+}
+
+/*
+ * mark a peer down. drop any open connections.
+ */
+void ceph_con_close(struct ceph_connection *con)
+{
+ dout("con_close %p peer %s\n", con, pr_addr(&con->peer_addr.in_addr));
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+ clear_bit(STANDBY, &con->state); /* avoid connect_seq bump */
+ clear_bit(LOSSYTX, &con->state); /* so we retry next connect */
+ clear_bit(KEEPALIVE_PENDING, &con->state);
+ clear_bit(WRITE_PENDING, &con->state);
+ mutex_lock(&con->mutex);
+ reset_connection(con);
+ cancel_delayed_work(&con->work);
+ mutex_unlock(&con->mutex);
+ queue_con(con);
+}
+
+/*
+ * Reopen a closed connection, with a new peer address.
+ */
+void ceph_con_open(struct ceph_connection *con, struct ceph_entity_addr *addr)
+{
+ dout("con_open %p %s\n", con, pr_addr(&addr->in_addr));
+ set_bit(OPENING, &con->state);
+ clear_bit(CLOSED, &con->state);
+ memcpy(&con->peer_addr, addr, sizeof(*addr));
+ con->delay = 0; /* reset backoff memory */
+ queue_con(con);
+}
+
+/*
+ * generic get/put
+ */
+struct ceph_connection *ceph_con_get(struct ceph_connection *con)
+{
+ dout("con_get %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) + 1);
+ if (atomic_inc_not_zero(&con->nref))
+ return con;

+ return NULL;
+}
+

+void ceph_con_put(struct ceph_connection *con)
+{
+ dout("con_put %p nref = %d -> %d\n", con,
+ atomic_read(&con->nref), atomic_read(&con->nref) - 1);
+ BUG_ON(atomic_read(&con->nref) == 0);
+ if (atomic_dec_and_test(&con->nref)) {
+ BUG_ON(con->sock);
+ kfree(con);
+ }
+}
+
+/*
+ * initialize a new connection.
+ */
+void ceph_con_init(struct ceph_messenger *msgr, struct ceph_connection *con)
+{
+ dout("con_init %p\n", con);
+ memset(con, 0, sizeof(*con));
+ atomic_set(&con->nref, 1);
+ con->msgr = msgr;
+ mutex_init(&con->mutex);
+ INIT_LIST_HEAD(&con->out_queue);
+ INIT_LIST_HEAD(&con->out_sent);
+ INIT_DELAYED_WORK(&con->work, con_work);
+}
+
+
+/*
+ * We maintain a global counter to order connection attempts. Get
+ * a unique seq greater than @gt.
+ */
+static u32 get_global_seq(struct ceph_messenger *msgr, u32 gt)
+{
+ u32 ret;
+
+ spin_lock(&msgr->global_seq_lock);
+ if (msgr->global_seq < gt)
+ msgr->global_seq = gt;
+ ret = ++msgr->global_seq;
+ spin_unlock(&msgr->global_seq_lock);

+ return ret;
+}
+
+

+/*
+ * Prepare footer for currently outgoing message, and finish things
+ * off. Assumes out_kvec* are already valid.. we just add on to the end.
+ */
+static void prepare_write_message_footer(struct ceph_connection *con, int v)
+{
+ struct ceph_msg *m = con->out_msg;
+
+ dout("prepare_write_message_footer %p\n", con);
+ con->out_kvec_is_msg = true;
+ con->out_kvec[v].iov_base = &m->footer;
+ con->out_kvec[v].iov_len = sizeof(m->footer);
+ con->out_kvec_bytes += sizeof(m->footer);
+ con->out_kvec_left++;
+ con->out_more = m->more_to_follow;
+ con->out_msg_done = true;
+}
+
+/*
+ * Prepare headers for the next outgoing message.
+ */
+static void prepare_write_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ int v = 0;
+
+ con->out_kvec_bytes = 0;
+ con->out_kvec_is_msg = true;
+ con->out_msg_done = false;
+
+ /* Sneak an ack in there first? If we can get it into the same
+ * TCP packet that's a good thing. */
+ if (con->in_seq > con->in_seq_acked) {
+ con->in_seq_acked = con->in_seq;
+ con->out_kvec[v].iov_base = &tag_ack;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[v].iov_base = &con->out_temp_ack;
+ con->out_kvec[v++].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ }
+
+ m = list_first_entry(&con->out_queue,
+ struct ceph_msg, list_head);
+ con->out_msg = m;
+ if (test_bit(LOSSYTX, &con->state)) {
+ list_del_init(&m->list_head);
+ } else {
+ /* put message on sent list */
+ ceph_msg_get(m);
+ list_move_tail(&m->list_head, &con->out_sent);
+ }
+
+ m->hdr.seq = cpu_to_le64(++con->out_seq);
+
+ dout("prepare_write_message %p seq %lld type %d len %d+%d+%d %d pgs\n",
+ m, con->out_seq, le16_to_cpu(m->hdr.type),
+ le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
+ le32_to_cpu(m->hdr.data_len),
+ m->nr_pages);
+ BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len);
+
+ /* tag + hdr + front + middle */
+ con->out_kvec[v].iov_base = &tag_msg;
+ con->out_kvec[v++].iov_len = 1;
+ con->out_kvec[v].iov_base = &m->hdr;
+ con->out_kvec[v++].iov_len = sizeof(m->hdr);
+ con->out_kvec[v++] = m->front;
+ if (m->middle)
+ con->out_kvec[v++] = m->middle->vec;
+ con->out_kvec_left = v;
+ con->out_kvec_bytes += 1 + sizeof(m->hdr) + m->front.iov_len +
+ (m->middle ? m->middle->vec.iov_len : 0);
+ con->out_kvec_cur = con->out_kvec;
+
+ /* fill in crc (except data pages), footer */
+ con->out_msg->hdr.crc =
+ cpu_to_le32(crc32c(0, (void *)&m->hdr,
+ sizeof(m->hdr) - sizeof(m->hdr.crc)));
+ con->out_msg->footer.flags = CEPH_MSG_FOOTER_COMPLETE;
+ con->out_msg->footer.front_crc =
+ cpu_to_le32(crc32c(0, m->front.iov_base, m->front.iov_len));
+ if (m->middle)
+ con->out_msg->footer.middle_crc =
+ cpu_to_le32(crc32c(0, m->middle->vec.iov_base,
+ m->middle->vec.iov_len));
+ else
+ con->out_msg->footer.middle_crc = 0;
+ con->out_msg->footer.data_crc = 0;
+ dout("prepare_write_message front_crc %u data_crc %u\n",
+ le32_to_cpu(con->out_msg->footer.front_crc),
+ le32_to_cpu(con->out_msg->footer.middle_crc));
+
+ /* is there a data payload? */
+ if (le32_to_cpu(m->hdr.data_len) > 0) {
+ /* initialize page iterator */
+ con->out_msg_pos.page = 0;
+ con->out_msg_pos.page_pos =
+ le16_to_cpu(m->hdr.data_off) & ~PAGE_MASK;
+ con->out_msg_pos.data_pos = 0;
+ con->out_msg_pos.did_page_crc = 0;
+ con->out_more = 1; /* data + footer will follow */
+ } else {
+ /* no, queue up footer too and be done */
+ prepare_write_message_footer(con, v);
+ }
+
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare an ack.
+ */
+static void prepare_write_ack(struct ceph_connection *con)
+{
+ dout("prepare_write_ack %p %llu -> %llu\n", con,
+ con->in_seq_acked, con->in_seq);
+ con->in_seq_acked = con->in_seq;
+
+ con->out_kvec[0].iov_base = &tag_ack;
+ con->out_kvec[0].iov_len = 1;
+ con->out_temp_ack = cpu_to_le64(con->in_seq_acked);
+ con->out_kvec[1].iov_base = &con->out_temp_ack;
+ con->out_kvec[1].iov_len = sizeof(con->out_temp_ack);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = 1 + sizeof(con->out_temp_ack);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 1; /* more will follow.. eventually.. */
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Prepare to write keepalive byte.
+ */
+static void prepare_write_keepalive(struct ceph_connection *con)
+{
+ dout("prepare_write_keepalive %p\n", con);
+ con->out_kvec[0].iov_base = &tag_keepalive;
+ con->out_kvec[0].iov_len = 1;
+ con->out_kvec_left = 1;
+ con->out_kvec_bytes = 1;
+ con->out_kvec_cur = con->out_kvec;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+/*
+ * Connection negotiation.
+ */
+
+static void prepare_connect_authorizer(struct ceph_connection *con)
+{
+ void *auth_buf;
+ int auth_len = 0;
+ int auth_protocol = 0;
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->get_authorizer)
+ con->ops->get_authorizer(con, &auth_buf, &auth_len,
+ &auth_protocol, &con->auth_reply_buf,
+ &con->auth_reply_buf_len,
+ con->auth_retry);
+ mutex_lock(&con->mutex);
+
+ con->out_connect.authorizer_protocol = cpu_to_le32(auth_protocol);
+ con->out_connect.authorizer_len = cpu_to_le32(auth_len);
+
+ con->out_kvec[con->out_kvec_left].iov_base = auth_buf;
+ con->out_kvec[con->out_kvec_left].iov_len = auth_len;
+ con->out_kvec_left++;
+ con->out_kvec_bytes += auth_len;
+}
+
+/*
+ * We connected to a peer and are saying hello.
+ */
+static void prepare_write_banner(struct ceph_messenger *msgr,

+ struct ceph_connection *con)
+{

+ int len = strlen(CEPH_BANNER);
+
+ con->out_kvec[0].iov_base = CEPH_BANNER;
+ con->out_kvec[0].iov_len = len;
+ con->out_kvec[1].iov_base = &msgr->my_enc_addr;
+ con->out_kvec[1].iov_len = sizeof(msgr->my_enc_addr);
+ con->out_kvec_left = 2;
+ con->out_kvec_bytes = len + sizeof(msgr->my_enc_addr);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+}
+
+static void prepare_write_connect(struct ceph_messenger *msgr,
+ struct ceph_connection *con,
+ int after_banner)
+{
+ unsigned global_seq = get_global_seq(con->msgr, 0);
+ int proto;
+
+ switch (con->peer_name.type) {
+ case CEPH_ENTITY_TYPE_MON:
+ proto = CEPH_MONC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_OSD:
+ proto = CEPH_OSDC_PROTOCOL;
+ break;
+ case CEPH_ENTITY_TYPE_MDS:
+ proto = CEPH_MDSC_PROTOCOL;
+ break;
+ default:
+ BUG();
+ }
+
+ dout("prepare_write_connect %p cseq=%d gseq=%d proto=%d\n", con,
+ con->connect_seq, global_seq, proto);
+
+ con->out_connect.features = CEPH_FEATURE_SUPPORTED;
+ con->out_connect.host_type = cpu_to_le32(CEPH_ENTITY_TYPE_CLIENT);
+ con->out_connect.connect_seq = cpu_to_le32(con->connect_seq);
+ con->out_connect.global_seq = cpu_to_le32(global_seq);
+ con->out_connect.protocol_version = cpu_to_le32(proto);
+ con->out_connect.flags = 0;
+
+ if (!after_banner) {
+ con->out_kvec_left = 0;
+ con->out_kvec_bytes = 0;
+ }
+ con->out_kvec[con->out_kvec_left].iov_base = &con->out_connect;
+ con->out_kvec[con->out_kvec_left].iov_len = sizeof(con->out_connect);
+ con->out_kvec_left++;
+ con->out_kvec_bytes += sizeof(con->out_connect);
+ con->out_kvec_cur = con->out_kvec;
+ con->out_more = 0;
+ set_bit(WRITE_PENDING, &con->state);
+
+ prepare_connect_authorizer(con);
+}
+
+
+/*
+ * write as much of pending kvecs to the socket as we can.
+ * 1 -> done
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_kvec(struct ceph_connection *con)
+{
+ int ret;
+
+ dout("write_partial_kvec %p %d left\n", con, con->out_kvec_bytes);
+ while (con->out_kvec_bytes > 0) {
+ ret = ceph_tcp_sendmsg(con->sock, con->out_kvec_cur,
+ con->out_kvec_left, con->out_kvec_bytes,
+ con->out_more);
+ if (ret <= 0)
+ goto out;
+ con->out_kvec_bytes -= ret;
+ if (con->out_kvec_bytes == 0)
+ break; /* done */
+ while (ret > 0) {
+ if (ret >= con->out_kvec_cur->iov_len) {
+ ret -= con->out_kvec_cur->iov_len;
+ con->out_kvec_cur++;
+ con->out_kvec_left--;
+ } else {
+ con->out_kvec_cur->iov_len -= ret;
+ con->out_kvec_cur->iov_base += ret;

+ ret = 0;
+ break;
+ }
+ }

+ }
+ con->out_kvec_left = 0;
+ con->out_kvec_is_msg = false;
+ ret = 1;
+out:
+ dout("write_partial_kvec %p %d left in %d kvecs ret = %d\n", con,
+ con->out_kvec_bytes, con->out_kvec_left, ret);
+ return ret; /* done! */
+}
+
+/*
+ * Write as much message data payload as we can. If we finish, queue
+ * up the footer.
+ * 1 -> done, footer is now queued in out_kvec[].
+ * 0 -> socket full, but more to do
+ * <0 -> error
+ */
+static int write_partial_msg_pages(struct ceph_connection *con)
+{
+ struct ceph_msg *msg = con->out_msg;
+ unsigned data_len = le32_to_cpu(msg->hdr.data_len);
+ size_t len;
+ int crc = con->msgr->nocrc;
+ int ret;
+
+ dout("write_partial_msg_pages %p msg %p page %d/%d offset %d\n",
+ con, con->out_msg, con->out_msg_pos.page, con->out_msg->nr_pages,
+ con->out_msg_pos.page_pos);
+
+ while (con->out_msg_pos.page < con->out_msg->nr_pages) {
+ struct page *page = NULL;
+ void *kaddr = NULL;
+
+ /*
+ * if we are calculating the data crc (the default), we need
+ * to map the page. if our pages[] has been revoked, use the
+ * zero page.
+ */
+ if (msg->pages) {
+ page = msg->pages[con->out_msg_pos.page];
+ if (crc)
+ kaddr = kmap(page);
+ } else if (msg->pagelist) {
+ page = list_first_entry(&msg->pagelist->head,
+ struct page, lru);
+ if (crc)
+ kaddr = kmap(page);
+ } else {
+ page = con->msgr->zero_page;
+ if (crc)
+ kaddr = page_address(con->msgr->zero_page);
+ }
+ len = min((int)(PAGE_SIZE - con->out_msg_pos.page_pos),
+ (int)(data_len - con->out_msg_pos.data_pos));
+ if (crc && !con->out_msg_pos.did_page_crc) {
+ void *base = kaddr + con->out_msg_pos.page_pos;
+ u32 tmpcrc = le32_to_cpu(con->out_msg->footer.data_crc);
+
+ BUG_ON(kaddr == NULL);
+ con->out_msg->footer.data_crc =
+ cpu_to_le32(crc32c(tmpcrc, base, len));
+ con->out_msg_pos.did_page_crc = 1;
+ }
+
+ ret = kernel_sendpage(con->sock, page,
+ con->out_msg_pos.page_pos, len,
+ MSG_DONTWAIT | MSG_NOSIGNAL |
+ MSG_MORE);
+
+ if (crc && (msg->pages || msg->pagelist))
+ kunmap(page);
+
+ if (ret <= 0)
+ goto out;
+
+ con->out_msg_pos.data_pos += ret;
+ con->out_msg_pos.page_pos += ret;
+ if (ret == len) {
+ con->out_msg_pos.page_pos = 0;
+ con->out_msg_pos.page++;
+ con->out_msg_pos.did_page_crc = 0;
+ if (msg->pagelist)
+ list_move_tail(&page->lru,
+ &msg->pagelist->head);
+ }
+ }
+
+ dout("write_partial_msg_pages %p msg %p done\n", con, msg);
+
+ /* prepare and queue up footer, too */
+ if (!crc)
+ con->out_msg->footer.flags |= CEPH_MSG_FOOTER_NOCRC;
+ con->out_kvec_bytes = 0;
+ con->out_kvec_left = 0;
+ con->out_kvec_cur = con->out_kvec;
+ prepare_write_message_footer(con, 0);
+ ret = 1;
+out:

+ return ret;
+}
+
+/*

+ * write some zeros
+ */
+static int write_partial_skip(struct ceph_connection *con)
+{
+ int ret;
+
+ while (con->out_skip > 0) {
+ struct kvec iov = {
+ .iov_base = page_address(con->msgr->zero_page),
+ .iov_len = min(con->out_skip, (int)PAGE_CACHE_SIZE)
+ };
+
+ ret = ceph_tcp_sendmsg(con->sock, &iov, 1, iov.iov_len, 1);
+ if (ret <= 0)
+ goto out;
+ con->out_skip -= ret;
+ }
+ ret = 1;
+out:

+ return ret;
+}
+
+/*

+ * Prepare to read connection handshake, or an ack.
+ */
+static void prepare_read_banner(struct ceph_connection *con)
+{
+ dout("prepare_read_banner %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect(struct ceph_connection *con)
+{
+ dout("prepare_read_connect %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_connect_retry(struct ceph_connection *con)
+{
+ dout("prepare_read_connect_retry %p\n", con);
+ con->in_base_pos = strlen(CEPH_BANNER) + sizeof(con->actual_peer_addr)
+ + sizeof(con->peer_addr_for_me);
+}
+
+static void prepare_read_ack(struct ceph_connection *con)
+{
+ dout("prepare_read_ack %p\n", con);
+ con->in_base_pos = 0;
+}
+
+static void prepare_read_tag(struct ceph_connection *con)
+{
+ dout("prepare_read_tag %p\n", con);
+ con->in_base_pos = 0;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+}
+
+/*
+ * Prepare to read a message.
+ */
+static int prepare_read_message(struct ceph_connection *con)
+{
+ dout("prepare_read_message %p\n", con);
+ BUG_ON(con->in_msg != NULL);
+ con->in_base_pos = 0;
+ con->in_front_crc = con->in_middle_crc = con->in_data_crc = 0;

+ return 0;
+}
+

+
+static int read_partial(struct ceph_connection *con,
+ int *to, int size, void *object)
+{
+ *to += size;
+ while (con->in_base_pos < *to) {
+ int left = *to - con->in_base_pos;
+ int have = size - left;
+ int ret = ceph_tcp_recvmsg(con->sock, object + have, left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;

+ }
+ return 1;
+}
+
+

+/*
+ * Read all or part of the connect-side handshake on a new connection
+ */
+static int read_partial_banner(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_banner %p at %d\n", con, con->in_base_pos);
+
+ /* peer's banner */
+ ret = read_partial(con, &to, strlen(CEPH_BANNER), con->in_banner);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->actual_peer_addr),
+ &con->actual_peer_addr);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, sizeof(con->peer_addr_for_me),
+ &con->peer_addr_for_me);
+ if (ret <= 0)
+ goto out;
+out:

+ return ret;
+}
+

+static int read_partial_connect(struct ceph_connection *con)
+{
+ int ret, to = 0;
+
+ dout("read_partial_connect %p at %d\n", con, con->in_base_pos);
+
+ ret = read_partial(con, &to, sizeof(con->in_reply), &con->in_reply);
+ if (ret <= 0)
+ goto out;
+ ret = read_partial(con, &to, le32_to_cpu(con->in_reply.authorizer_len),
+ con->auth_reply_buf);
+ if (ret <= 0)
+ goto out;
+
+ dout("read_partial_connect %p tag %d, con_seq = %u, g_seq = %u\n",
+ con, (int)con->in_reply.tag,
+ le32_to_cpu(con->in_reply.connect_seq),
+ le32_to_cpu(con->in_reply.global_seq));
+out:

+ return ret;
+
+}
+

+/*
+ * Verify the hello banner looks okay.
+ */
+static int verify_hello(struct ceph_connection *con)
+{
+ if (memcmp(con->in_banner, CEPH_BANNER, strlen(CEPH_BANNER))) {
+ pr_err("connect to %s got bad banner\n",
+ pr_addr(&con->peer_addr.in_addr));
+ con->error_msg = "protocol error, bad banner";
+ return -1;
+ }

+ return 0;
+}
+

+static bool addr_is_blank(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ((struct sockaddr_in *)ss)->sin_addr.s_addr == 0;
+ case AF_INET6:
+ return
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[0] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[1] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[2] == 0 &&
+ ((struct sockaddr_in6 *)ss)->sin6_addr.s6_addr32[3] == 0;
+ }
+ return false;
+}
+
+static int addr_port(struct sockaddr_storage *ss)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ return ntohs(((struct sockaddr_in *)ss)->sin_port);
+ case AF_INET6:
+ return ntohs(((struct sockaddr_in6 *)ss)->sin6_port);

+ }
+ return 0;
+}
+

+static void addr_set_port(struct sockaddr_storage *ss, int p)
+{
+ switch (ss->ss_family) {
+ case AF_INET:
+ ((struct sockaddr_in *)ss)->sin_port = htons(p);
+ case AF_INET6:
+ ((struct sockaddr_in6 *)ss)->sin6_port = htons(p);
+ }
+}
+
+/*
+ * Parse an ip[:port] list into an addr array. Use the default
+ * monitor port if a port isn't specified.
+ */
+int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count)
+{
+ int i;
+ const char *p = c;
+
+ dout("parse_ips on '%.*s'\n", (int)(end-c), c);
+ for (i = 0; i < max_count; i++) {
+ const char *ipend;
+ struct sockaddr_storage *ss = &addr[i].in_addr;
+ struct sockaddr_in *in4 = (void *)ss;
+ struct sockaddr_in6 *in6 = (void *)ss;
+ int port;
+
+ memset(ss, 0, sizeof(*ss));
+ if (in4_pton(p, end - p, (u8 *)&in4->sin_addr.s_addr,
+ ',', &ipend)) {
+ ss->ss_family = AF_INET;
+ } else if (in6_pton(p, end - p, (u8 *)&in6->sin6_addr.s6_addr,
+ ',', &ipend)) {
+ ss->ss_family = AF_INET6;
+ } else {
+ goto bad;
+ }
+ p = ipend;
+
+ /* port? */
+ if (p < end && *p == ':') {
+ port = 0;
+ p++;
+ while (p < end && *p >= '0' && *p <= '9') {
+ port = (port * 10) + (*p - '0');
+ p++;
+ }
+ if (port > 65535 || port == 0)
+ goto bad;
+ } else {
+ port = CEPH_MON_PORT;
+ }
+
+ addr_set_port(ss, port);
+
+ dout("parse_ips got %s\n", pr_addr(ss));
+
+ if (p == end)
+ break;
+ if (*p != ',')
+ goto bad;
+ p++;
+ }
+
+ if (p != end)
+ goto bad;
+
+ if (count)
+ *count = i + 1;
+ return 0;
+
+bad:
+ pr_err("parse_ips bad ip '%s'\n", c);

+ return -EINVAL;
+}
+

+static int process_banner(struct ceph_connection *con)
+{
+ dout("process_banner on %p\n", con);
+
+ if (verify_hello(con) < 0)
+ return -1;
+
+ ceph_decode_addr(&con->actual_peer_addr);
+ ceph_decode_addr(&con->peer_addr_for_me);
+
+ /*
+ * Make sure the other end is who we wanted. note that the other
+ * end may not yet know their ip address, so if it's 0.0.0.0, give
+ * them the benefit of the doubt.
+ */
+ if (memcmp(&con->peer_addr, &con->actual_peer_addr,
+ sizeof(con->peer_addr)) != 0 &&
+ !(addr_is_blank(&con->actual_peer_addr.in_addr) &&
+ con->actual_peer_addr.nonce == con->peer_addr.nonce)) {
+ pr_warning("wrong peer, want %s/%lld, got %s/%lld\n",
+ pr_addr(&con->peer_addr.in_addr),
+ le64_to_cpu(con->peer_addr.nonce),
+ pr_addr(&con->actual_peer_addr.in_addr),
+ le64_to_cpu(con->actual_peer_addr.nonce));
+ con->error_msg = "wrong peer at address";

+ return -1;
+ }
+
+ /*

+ * did we learn our address?
+ */
+ if (addr_is_blank(&con->msgr->inst.addr.in_addr)) {
+ int port = addr_port(&con->msgr->inst.addr.in_addr);
+
+ memcpy(&con->msgr->inst.addr.in_addr,
+ &con->peer_addr_for_me.in_addr,
+ sizeof(con->peer_addr_for_me.in_addr));
+ addr_set_port(&con->msgr->inst.addr.in_addr, port);
+ encode_my_addr(con->msgr);
+ dout("process_banner learned my addr is %s\n",
+ pr_addr(&con->msgr->inst.addr.in_addr));
+ }
+
+ set_bit(NEGOTIATING, &con->state);
+ prepare_read_connect(con);

+ return 0;
+}
+

+static void fail_protocol(struct ceph_connection *con)
+{
+ reset_connection(con);
+ set_bit(CLOSED, &con->state); /* in case there's queued work */
+
+ mutex_unlock(&con->mutex);
+ if (con->ops->bad_proto)
+ con->ops->bad_proto(con);
+ mutex_lock(&con->mutex);
+}
+
+static int process_connect(struct ceph_connection *con)
+{
+ u64 sup_feat = CEPH_FEATURE_SUPPORTED;
+ u64 req_feat = CEPH_FEATURE_REQUIRED;
+ u64 server_feat = le64_to_cpu(con->in_reply.features);
+
+ dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
+
+ switch (con->in_reply.tag) {
+ case CEPH_MSGR_TAG_FEATURES:
+ pr_err("%s%lld %s feature set mismatch,"
+ " my %llx < server's %llx, missing %llx\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ sup_feat, server_feat, server_feat & ~sup_feat);
+ con->error_msg = "missing required protocol features";
+ fail_protocol(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADPROTOVER:
+ pr_err("%s%lld %s protocol version mismatch,"
+ " my %d != server's %d\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ le32_to_cpu(con->out_connect.protocol_version),
+ le32_to_cpu(con->in_reply.protocol_version));
+ con->error_msg = "protocol version mismatch";
+ fail_protocol(con);
+ return -1;
+
+ case CEPH_MSGR_TAG_BADAUTHORIZER:
+ con->auth_retry++;
+ dout("process_connect %p got BADAUTHORIZER attempt %d\n", con,
+ con->auth_retry);
+ if (con->auth_retry == 2) {
+ con->error_msg = "connect authorization failure";
+ reset_connection(con);
+ set_bit(CLOSED, &con->state);
+ return -1;
+ }
+ con->auth_retry = 1;
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect_retry(con);
+ break;
+
+ case CEPH_MSGR_TAG_RESETSESSION:
+ /*
+ * If we connected with a large connect_seq but the peer
+ * has no record of a session with us (no connection, or
+ * connect_seq == 0), they will send RESETSESION to indicate
+ * that they must have reset their session, and may have
+ * dropped messages.
+ */
+ dout("process_connect got RESET peer seq %u\n",
+ le32_to_cpu(con->in_connect.connect_seq));
+ pr_err("%s%lld %s connection reset\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr));
+ reset_connection(con);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+
+ /* Tell ceph about it. */
+ mutex_unlock(&con->mutex);
+ pr_info("reset on %s%lld\n", ENTITY_NAME(con->peer_name));
+ if (con->ops->peer_reset)
+ con->ops->peer_reset(con);
+ mutex_lock(&con->mutex);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_SESSION:
+ /*
+ * If we sent a smaller connect_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY my seq = %u, peer_seq = %u\n",
+ le32_to_cpu(con->out_connect.connect_seq),
+ le32_to_cpu(con->in_connect.connect_seq));
+ con->connect_seq = le32_to_cpu(con->in_connect.connect_seq);
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_RETRY_GLOBAL:
+ /*
+ * If we sent a smaller global_seq than the peer has, try
+ * again with a larger value.
+ */
+ dout("process_connect got RETRY_GLOBAL my %u peer_gseq %u\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_connect.global_seq));
+ get_global_seq(con->msgr,
+ le32_to_cpu(con->in_connect.global_seq));
+ prepare_write_connect(con->msgr, con, 0);
+ prepare_read_connect(con);
+ break;
+
+ case CEPH_MSGR_TAG_READY:
+ if (req_feat & ~server_feat) {
+ pr_err("%s%lld %s protocol feature mismatch,"
+ " my required %llx > server's %llx, need %llx\n",
+ ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr),
+ req_feat, server_feat, req_feat & ~server_feat);
+ con->error_msg = "missing required protocol features";
+ fail_protocol(con);
+ return -1;
+ }
+ clear_bit(CONNECTING, &con->state);
+ con->peer_global_seq = le32_to_cpu(con->in_reply.global_seq);
+ con->connect_seq++;
+ dout("process_connect got READY gseq %d cseq %d (%d)\n",
+ con->peer_global_seq,
+ le32_to_cpu(con->in_reply.connect_seq),
+ con->connect_seq);
+ WARN_ON(con->connect_seq !=
+ le32_to_cpu(con->in_reply.connect_seq));
+
+ if (con->in_reply.flags & CEPH_MSG_CONNECT_LOSSY)
+ set_bit(LOSSYTX, &con->state);
+
+ prepare_read_tag(con);
+ break;
+
+ case CEPH_MSGR_TAG_WAIT:
+ /*
+ * If there is a connection race (we are opening
+ * connections to each other), one of us may just have
+ * to WAIT. This shouldn't happen if we are the
+ * client.
+ */
+ pr_err("process_connect peer connecting WAIT\n");
+
+ default:
+ pr_err("connect protocol error, will retry\n");
+ con->error_msg = "protocol error, garbage tag during connect";
+ return -1;
+ }

+ return 0;
+}
+
+

+/*
+ * read (part of) an ack
+ */
+static int read_partial_ack(struct ceph_connection *con)
+{
+ int to = 0;
+
+ return read_partial(con, &to, sizeof(con->in_temp_ack),
+ &con->in_temp_ack);
+}
+
+
+/*
+ * We can finally discard anything that's been acked.
+ */
+static void process_ack(struct ceph_connection *con)
+{
+ struct ceph_msg *m;
+ u64 ack = le64_to_cpu(con->in_temp_ack);
+ u64 seq;
+
+ while (!list_empty(&con->out_sent)) {
+ m = list_first_entry(&con->out_sent, struct ceph_msg,
+ list_head);
+ seq = le64_to_cpu(m->hdr.seq);
+ if (seq > ack)
+ break;
+ dout("got ack for seq %llu type %d at %p\n", seq,
+ le16_to_cpu(m->hdr.type), m);
+ ceph_msg_remove(m);
+ }
+ prepare_read_tag(con);

+}
+
+
+
+

+static int read_partial_message_section(struct ceph_connection *con,
+ struct kvec *section, unsigned int sec_len,
+ u32 *crc)
+{
+ int left;
+ int ret;
+
+ BUG_ON(!section);
+
+ while (section->iov_len < sec_len) {
+ BUG_ON(section->iov_base == NULL);
+ left = sec_len - section->iov_len;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)section->iov_base +
+ section->iov_len, left);
+ if (ret <= 0)
+ return ret;
+ section->iov_len += ret;
+ if (section->iov_len == sec_len)
+ *crc = crc32c(0, section->iov_base,
+ section->iov_len);

+ }
+
+ return 1;
+}
+

+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,
+ struct ceph_msg_header *hdr,
+ int *skip);
+/*
+ * read (part of) a message.
+ */
+static int read_partial_message(struct ceph_connection *con)
+{
+ struct ceph_msg *m = con->in_msg;
+ void *p;
+ int ret;
+ int to, left;
+ unsigned front_len, middle_len, data_len, data_off;
+ int datacrc = con->msgr->nocrc;
+ int skip;
+
+ dout("read_partial_message con %p msg %p\n", con, m);
+
+ /* header */
+ while (con->in_base_pos < sizeof(con->in_hdr)) {
+ left = sizeof(con->in_hdr) - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock,
+ (char *)&con->in_hdr + con->in_base_pos,
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ if (con->in_base_pos == sizeof(con->in_hdr)) {
+ u32 crc = crc32c(0, (void *)&con->in_hdr,
+ sizeof(con->in_hdr) - sizeof(con->in_hdr.crc));
+ if (crc != le32_to_cpu(con->in_hdr.crc)) {
+ pr_err("read_partial_message bad hdr "
+ " crc %u != expected %u\n",
+ crc, con->in_hdr.crc);
+ return -EBADMSG;
+ }
+ }
+ }
+ front_len = le32_to_cpu(con->in_hdr.front_len);
+ if (front_len > CEPH_MSG_MAX_FRONT_LEN)
+ return -EIO;
+ middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ if (middle_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_len = le32_to_cpu(con->in_hdr.data_len);
+ if (data_len > CEPH_MSG_MAX_DATA_LEN)
+ return -EIO;
+ data_off = le16_to_cpu(con->in_hdr.data_off);
+
+ /* allocate message? */
+ if (!con->in_msg) {
+ dout("got hdr type %d front %d data %d\n", con->in_hdr.type,
+ con->in_hdr.front_len, con->in_hdr.data_len);
+ con->in_msg = ceph_alloc_msg(con, &con->in_hdr, &skip);
+ if (skip) {
+ /* skip this message */
+ dout("alloc_msg returned NULL, skipping message\n");
+ con->in_base_pos = -front_len - middle_len - data_len -
+ sizeof(m->footer);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ return 0;
+ }
+ if (IS_ERR(con->in_msg)) {
+ ret = PTR_ERR(con->in_msg);
+ con->in_msg = NULL;
+ con->error_msg =
+ "error allocating memory for incoming message";
+ return ret;
+ }
+ m = con->in_msg;
+ m->front.iov_len = 0; /* haven't read it yet */
+ if (m->middle)
+ m->middle->vec.iov_len = 0;
+
+ con->in_msg_pos.page = 0;
+ con->in_msg_pos.page_pos = data_off & ~PAGE_MASK;
+ con->in_msg_pos.data_pos = 0;
+ }
+
+ /* front */
+ ret = read_partial_message_section(con, &m->front, front_len,
+ &con->in_front_crc);
+ if (ret <= 0)
+ return ret;
+
+ /* middle */
+ if (m->middle) {
+ ret = read_partial_message_section(con, &m->middle->vec, middle_len,
+ &con->in_middle_crc);
+ if (ret <= 0)

+ return ret;
+ }
+

+ /* (page) data */
+ while (con->in_msg_pos.data_pos < data_len) {
+ left = min((int)(data_len - con->in_msg_pos.data_pos),
+ (int)(PAGE_SIZE - con->in_msg_pos.page_pos));
+ BUG_ON(m->pages == NULL);
+ p = kmap(m->pages[con->in_msg_pos.page]);
+ ret = ceph_tcp_recvmsg(con->sock, p + con->in_msg_pos.page_pos,
+ left);
+ if (ret > 0 && datacrc)
+ con->in_data_crc =
+ crc32c(con->in_data_crc,
+ p + con->in_msg_pos.page_pos, ret);
+ kunmap(m->pages[con->in_msg_pos.page]);
+ if (ret <= 0)
+ return ret;
+ con->in_msg_pos.data_pos += ret;
+ con->in_msg_pos.page_pos += ret;
+ if (con->in_msg_pos.page_pos == PAGE_SIZE) {
+ con->in_msg_pos.page_pos = 0;
+ con->in_msg_pos.page++;
+ }
+ }
+
+ /* footer */
+ to = sizeof(m->hdr) + sizeof(m->footer);
+ while (con->in_base_pos < to) {
+ left = to - con->in_base_pos;
+ ret = ceph_tcp_recvmsg(con->sock, (char *)&m->footer +
+ (con->in_base_pos - sizeof(m->hdr)),
+ left);
+ if (ret <= 0)
+ return ret;
+ con->in_base_pos += ret;
+ }
+ dout("read_partial_message got msg %p %d (%u) + %d (%u) + %d (%u)\n",
+ m, front_len, m->footer.front_crc, middle_len,
+ m->footer.middle_crc, data_len, m->footer.data_crc);
+
+ /* crc ok? */
+ if (con->in_front_crc != le32_to_cpu(m->footer.front_crc)) {
+ pr_err("read_partial_message %p front crc %u != exp. %u\n",
+ m, con->in_front_crc, m->footer.front_crc);
+ return -EBADMSG;
+ }
+ if (con->in_middle_crc != le32_to_cpu(m->footer.middle_crc)) {
+ pr_err("read_partial_message %p middle crc %u != exp %u\n",
+ m, con->in_middle_crc, m->footer.middle_crc);
+ return -EBADMSG;
+ }
+ if (datacrc &&
+ (m->footer.flags & CEPH_MSG_FOOTER_NOCRC) == 0 &&
+ con->in_data_crc != le32_to_cpu(m->footer.data_crc)) {
+ pr_err("read_partial_message %p data crc %u != exp. %u\n", m,
+ con->in_data_crc, le32_to_cpu(m->footer.data_crc));
+ return -EBADMSG;
+ }
+
+ return 1; /* done! */
+}
+
+/*
+ * Process message. This happens in the worker thread. The callback should
+ * be careful not to do anything that waits on other incoming messages or it
+ * may deadlock.
+ */
+static void process_message(struct ceph_connection *con)
+{

+ struct ceph_msg *msg;
+

+ msg = con->in_msg;
+ con->in_msg = NULL;
+
+ /* if first message, set peer_name */
+ if (con->peer_name.type == 0)
+ con->peer_name = msg->hdr.src.name;
+
+ con->in_seq++;
+ mutex_unlock(&con->mutex);
+
+ dout("===== %p %llu from %s%lld %d=%s len %d+%d (%u %u %u) =====\n",
+ msg, le64_to_cpu(msg->hdr.seq),
+ ENTITY_NAME(msg->hdr.src.name),
+ le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.data_len),
+ con->in_front_crc, con->in_middle_crc, con->in_data_crc);
+ con->ops->dispatch(con, msg);
+
+ mutex_lock(&con->mutex);
+ prepare_read_tag(con);
+}
+
+
+/*
+ * Write something to the socket. Called in a worker thread when the
+ * socket appears to be writeable and we have something ready to send.
+ */
+static int try_write(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr = con->msgr;
+ int ret = 1;
+
+ dout("try_write start %p state %lu nref %d\n", con, con->state,
+ atomic_read(&con->nref));
+
+ mutex_lock(&con->mutex);
+more:
+ dout("try_write out_kvec_bytes %d\n", con->out_kvec_bytes);
+
+ /* open the socket first? */
+ if (con->sock == NULL) {
+ /*
+ * if we were STANDBY and are reconnecting _this_
+ * connection, bump connect_seq now. Always bump
+ * global_seq.
+ */
+ if (test_and_clear_bit(STANDBY, &con->state))
+ con->connect_seq++;
+
+ prepare_write_banner(msgr, con);
+ prepare_write_connect(msgr, con, 1);
+ prepare_read_banner(con);
+ set_bit(CONNECTING, &con->state);
+ clear_bit(NEGOTIATING, &con->state);
+
+ BUG_ON(con->in_msg);
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ dout("try_write initiating connect on %p new state %lu\n",
+ con, con->state);
+ con->sock = ceph_tcp_connect(con);
+ if (IS_ERR(con->sock)) {
+ con->sock = NULL;
+ con->error_msg = "connect error";
+ ret = -1;

+ goto out;
+ }
+ }
+

+more_kvec:
+ /* kvec data queued? */
+ if (con->out_skip) {
+ ret = write_partial_skip(con);
+ if (ret <= 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_skip err %d\n", ret);

+ goto done;
+ }
+ }

+ if (con->out_kvec_left) {
+ ret = write_partial_kvec(con);
+ if (ret <= 0)

+ goto done;
+ }
+

+ /* msg pages? */
+ if (con->out_msg) {
+ if (con->out_msg_done) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL; /* we're done with this one */
+ goto do_next;
+ }
+
+ ret = write_partial_msg_pages(con);
+ if (ret == 1)
+ goto more_kvec; /* we need to send the footer, too! */
+ if (ret == 0)
+ goto done;
+ if (ret < 0) {
+ dout("try_write write_partial_msg_pages err %d\n",
+ ret);

+ goto done;
+ }
+ }
+

+do_next:
+ if (!test_bit(CONNECTING, &con->state)) {
+ /* is anything else pending? */
+ if (!list_empty(&con->out_queue)) {
+ prepare_write_message(con);
+ goto more;
+ }
+ if (con->in_seq > con->in_seq_acked) {
+ prepare_write_ack(con);
+ goto more;
+ }
+ if (test_and_clear_bit(KEEPALIVE_PENDING, &con->state)) {
+ prepare_write_keepalive(con);
+ goto more;
+ }
+ }
+
+ /* Nothing to do! */
+ clear_bit(WRITE_PENDING, &con->state);
+ dout("try_write nothing else to write.\n");
+done:
+ ret = 0;
+out:
+ mutex_unlock(&con->mutex);
+ dout("try_write done on %p\n", con);

+ return ret;
+}
+

+
+
+/*
+ * Read what we can from the socket.
+ */
+static int try_read(struct ceph_connection *con)
+{
+ struct ceph_messenger *msgr;

+ int ret = -1;
+

+ if (!con->sock)
+ return 0;
+
+ if (test_bit(STANDBY, &con->state))
+ return 0;
+
+ dout("try_read start on %p\n", con);
+ msgr = con->msgr;
+
+ mutex_lock(&con->mutex);
+
+more:
+ dout("try_read tag %d in_base_pos %d\n", (int)con->in_tag,
+ con->in_base_pos);
+ if (test_bit(CONNECTING, &con->state)) {
+ if (!test_bit(NEGOTIATING, &con->state)) {
+ dout("try_read connecting\n");
+ ret = read_partial_banner(con);
+ if (ret <= 0)
+ goto done;
+ if (process_banner(con) < 0) {
+ ret = -1;

+ goto out;
+ }
+ }

+ ret = read_partial_connect(con);
+ if (ret <= 0)
+ goto done;
+ if (process_connect(con) < 0) {
+ ret = -1;
+ goto out;
+ }
+ goto more;
+ }
+
+ if (con->in_base_pos < 0) {
+ /*
+ * skipping + discarding content.
+ *
+ * FIXME: there must be a better way to do this!
+ */
+ static char buf[1024];
+ int skip = min(1024, -con->in_base_pos);
+ dout("skipping %d / %d bytes\n", skip, -con->in_base_pos);
+ ret = ceph_tcp_recvmsg(con->sock, buf, skip);
+ if (ret <= 0)
+ goto done;
+ con->in_base_pos += ret;
+ if (con->in_base_pos)
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_READY) {
+ /*
+ * what's next?
+ */
+ ret = ceph_tcp_recvmsg(con->sock, &con->in_tag, 1);
+ if (ret <= 0)
+ goto done;
+ dout("try_read got tag %d\n", (int)con->in_tag);
+ switch (con->in_tag) {
+ case CEPH_MSGR_TAG_MSG:
+ prepare_read_message(con);
+ break;
+ case CEPH_MSGR_TAG_ACK:
+ prepare_read_ack(con);
+ break;
+ case CEPH_MSGR_TAG_CLOSE:
+ set_bit(CLOSED, &con->state); /* fixme */
+ goto done;
+ default:
+ goto bad_tag;
+ }
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_MSG) {
+ ret = read_partial_message(con);
+ if (ret <= 0) {
+ switch (ret) {
+ case -EBADMSG:
+ con->error_msg = "bad crc";
+ ret = -EIO;
+ goto out;
+ case -EIO:
+ con->error_msg = "io error";
+ goto out;
+ default:

+ goto done;
+ }
+ }

+ if (con->in_tag == CEPH_MSGR_TAG_READY)
+ goto more;
+ process_message(con);
+ goto more;
+ }
+ if (con->in_tag == CEPH_MSGR_TAG_ACK) {
+ ret = read_partial_ack(con);
+ if (ret <= 0)
+ goto done;
+ process_ack(con);

+ goto more;
+ }
+

+done:
+ ret = 0;
+out:
+ mutex_unlock(&con->mutex);
+ dout("try_read done on %p\n", con);
+ return ret;
+
+bad_tag:
+ pr_err("try_read bad con->in_tag = %d\n", (int)con->in_tag);
+ con->error_msg = "protocol error, garbage tag";
+ ret = -1;

+ goto out;
+}
+
+

+/*
+ * Atomically queue work on a connection. Bump @con reference to
+ * avoid races with connection teardown.
+ *
+ * There is some trickery going on with QUEUED and BUSY because we
+ * only want a _single_ thread operating on each connection at any
+ * point in time, but we want to use all available CPUs.
+ *
+ * The worker thread only proceeds if it can atomically set BUSY. It
+ * clears QUEUED and does it's thing. When it thinks it's done, it
+ * clears BUSY, then rechecks QUEUED.. if it's set again, it loops
+ * (tries again to set BUSY).
+ *
+ * To queue work, we first set QUEUED, _then_ if BUSY isn't set, we
+ * try to queue work. If that fails (work is already queued, or BUSY)
+ * we give up (work also already being done or is queued) but leave QUEUED
+ * set so that the worker thread will loop if necessary.
+ */
+static void queue_con(struct ceph_connection *con)
+{
+ if (test_bit(DEAD, &con->state)) {
+ dout("queue_con %p ignoring: DEAD\n",
+ con);
+ return;
+ }
+
+ if (!con->ops->get(con)) {
+ dout("queue_con %p ref count 0\n", con);
+ return;
+ }
+
+ set_bit(QUEUED, &con->state);
+ if (test_bit(BUSY, &con->state)) {
+ dout("queue_con %p - already BUSY\n", con);
+ con->ops->put(con);
+ } else if (!queue_work(ceph_msgr_wq, &con->work.work)) {
+ dout("queue_con %p - already queued\n", con);
+ con->ops->put(con);
+ } else {
+ dout("queue_con %p\n", con);
+ }
+}
+
+/*
+ * Do some work on a connection. Drop a connection ref when we're done.
+ */
+static void con_work(struct work_struct *work)
+{
+ struct ceph_connection *con = container_of(work, struct ceph_connection,
+ work.work);
+ int backoff = 0;
+
+more:
+ if (test_and_set_bit(BUSY, &con->state) != 0) {
+ dout("con_work %p BUSY already set\n", con);
+ goto out;
+ }
+ dout("con_work %p start, clearing QUEUED\n", con);
+ clear_bit(QUEUED, &con->state);
+
+ if (test_bit(CLOSED, &con->state)) { /* e.g. if we are replaced */
+ dout("con_work CLOSED\n");
+ con_close_socket(con);
+ goto done;
+ }
+ if (test_and_clear_bit(OPENING, &con->state)) {
+ /* reopen w/ new peer */
+ dout("con_work OPENING\n");
+ con_close_socket(con);
+ }
+
+ if (test_and_clear_bit(SOCK_CLOSED, &con->state) ||
+ try_read(con) < 0 ||
+ try_write(con) < 0) {
+ backoff = 1;
+ ceph_fault(con); /* error/fault path */
+ }
+
+done:
+ clear_bit(BUSY, &con->state);
+ dout("con->state=%lu\n", con->state);
+ if (test_bit(QUEUED, &con->state)) {
+ if (!backoff || test_bit(OPENING, &con->state)) {
+ dout("con_work %p QUEUED reset, looping\n", con);
+ goto more;
+ }
+ dout("con_work %p QUEUED reset, but just faulted\n", con);
+ clear_bit(QUEUED, &con->state);
+ }
+ dout("con_work %p done\n", con);
+
+out:
+ con->ops->put(con);
+}
+
+
+/*
+ * Generic error/fault handler. A retry mechanism is used with
+ * exponential backoff
+ */
+static void ceph_fault(struct ceph_connection *con)
+{
+ pr_err("%s%lld %s %s\n", ENTITY_NAME(con->peer_name),
+ pr_addr(&con->peer_addr.in_addr), con->error_msg);
+ dout("fault %p state %lu to peer %s\n",
+ con, con->state, pr_addr(&con->peer_addr.in_addr));
+
+ if (test_bit(LOSSYTX, &con->state)) {
+ dout("fault on LOSSYTX channel\n");

+ goto out;
+ }
+

+ clear_bit(BUSY, &con->state); /* to avoid an improbable race */
+
+ mutex_lock(&con->mutex);
+ if (test_bit(CLOSED, &con->state))
+ goto out_unlock;
+
+ con_close_socket(con);
+
+ if (con->in_msg) {
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ }
+
+ /* Requeue anything that hasn't been acked */
+ list_splice_init(&con->out_sent, &con->out_queue);
+
+ /* If there are no messages in the queue, place the connection
+ * in a STANDBY state (i.e., don't try to reconnect just yet). */
+ if (list_empty(&con->out_queue) && !con->out_keepalive_pending) {
+ dout("fault setting STANDBY\n");
+ set_bit(STANDBY, &con->state);
+ } else {
+ /* retry after a delay. */
+ if (con->delay == 0)
+ con->delay = BASE_DELAY_INTERVAL;
+ else if (con->delay < MAX_DELAY_INTERVAL)
+ con->delay *= 2;
+ dout("fault queueing %p delay %lu\n", con, con->delay);
+ con->ops->get(con);
+ if (queue_delayed_work(ceph_msgr_wq, &con->work,
+ round_jiffies_relative(con->delay)) == 0)
+ con->ops->put(con);
+ }
+
+out_unlock:
+ mutex_unlock(&con->mutex);
+out:
+ /*
+ * in case we faulted due to authentication, invalidate our
+ * current tickets so that we can get new ones.
+ */
+ if (con->auth_retry && con->ops->invalidate_authorizer) {
+ dout("calling invalidate_authorizer()\n");
+ con->ops->invalidate_authorizer(con);
+ }
+
+ if (con->ops->fault)
+ con->ops->fault(con);

+}
+
+
+
+/*

+ * create a new messenger instance
+ */
+struct ceph_messenger *ceph_messenger_create(struct ceph_entity_addr *myaddr)

+{
+ struct ceph_messenger *msgr;

+
+ msgr = kzalloc(sizeof(*msgr), GFP_KERNEL);
+ if (msgr == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ spin_lock_init(&msgr->global_seq_lock);
+
+ /* the zero page is needed if a request is "canceled" while the message
+ * is being written over the socket */
+ msgr->zero_page = alloc_page(GFP_KERNEL | __GFP_ZERO);
+ if (!msgr->zero_page) {
+ kfree(msgr);
+ return ERR_PTR(-ENOMEM);
+ }
+ kmap(msgr->zero_page);
+
+ if (myaddr)
+ msgr->inst.addr = *myaddr;
+
+ /* select a random nonce */
+ msgr->inst.addr.type = 0;
+ get_random_bytes(&msgr->inst.addr.nonce, sizeof(msgr->inst.addr.nonce));
+ encode_my_addr(msgr);
+
+ dout("messenger_create %p\n", msgr);
+ return msgr;
+}
+
+void ceph_messenger_destroy(struct ceph_messenger *msgr)
+{
+ dout("destroy %p\n", msgr);
+ kunmap(msgr->zero_page);
+ __free_page(msgr->zero_page);
+ kfree(msgr);
+ dout("destroyed messenger %p\n", msgr);
+}
+
+/*
+ * Queue up an outgoing message on the given connection.
+ */
+void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ if (test_bit(CLOSED, &con->state)) {
+ dout("con_send %p closed, dropping %p\n", con, msg);
+ ceph_msg_put(msg);
+ return;
+ }
+
+ /* set src+dst */
+ msg->hdr.src.name = con->msgr->inst.name;
+ msg->hdr.src.addr = con->msgr->my_enc_addr;
+ msg->hdr.orig_src = msg->hdr.src;
+
+ BUG_ON(msg->front.iov_len != le32_to_cpu(msg->hdr.front_len));
+
+ /* queue */
+ mutex_lock(&con->mutex);
+ BUG_ON(!list_empty(&msg->list_head));
+ list_add_tail(&msg->list_head, &con->out_queue);
+ dout("----- %p to %s%lld %d=%s len %d+%d+%d -----\n", msg,
+ ENTITY_NAME(con->peer_name), le16_to_cpu(msg->hdr.type),
+ ceph_msg_type_name(le16_to_cpu(msg->hdr.type)),
+ le32_to_cpu(msg->hdr.front_len),
+ le32_to_cpu(msg->hdr.middle_len),
+ le32_to_cpu(msg->hdr.data_len));
+ mutex_unlock(&con->mutex);
+
+ /* if there wasn't anything waiting to send before, queue
+ * new work */
+ if (test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+
+/*
+ * Revoke a message that was previously queued for send
+ */
+void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->mutex);
+ if (!list_empty(&msg->list_head)) {
+ dout("con_revoke %p msg %p\n", con, msg);
+ list_del_init(&msg->list_head);
+ ceph_msg_put(msg);
+ msg->hdr.seq = 0;
+ if (con->out_msg == msg) {
+ ceph_msg_put(con->out_msg);
+ con->out_msg = NULL;
+ }
+ if (con->out_kvec_is_msg) {
+ con->out_skip = con->out_kvec_bytes;
+ con->out_kvec_is_msg = false;
+ }
+ } else {
+ dout("con_revoke %p msg %p - not queued (sent?)\n", con, msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Revoke a message that we may be reading data into
+ */
+void ceph_con_revoke_message(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ mutex_lock(&con->mutex);
+ if (con->in_msg && con->in_msg == msg) {
+ unsigned front_len = le32_to_cpu(con->in_hdr.front_len);
+ unsigned middle_len = le32_to_cpu(con->in_hdr.middle_len);
+ unsigned data_len = le32_to_cpu(con->in_hdr.data_len);
+
+ /* skip rest of message */
+ dout("con_revoke_pages %p msg %p revoked\n", con, msg);
+ con->in_base_pos = con->in_base_pos -
+ sizeof(struct ceph_msg_header) -
+ front_len -
+ middle_len -
+ data_len -
+ sizeof(struct ceph_msg_footer);
+ ceph_msg_put(con->in_msg);
+ con->in_msg = NULL;
+ con->in_tag = CEPH_MSGR_TAG_READY;
+ } else {
+ dout("con_revoke_pages %p msg %p pages %p no-op\n",
+ con, con->in_msg, msg);
+ }
+ mutex_unlock(&con->mutex);
+}
+
+/*
+ * Queue a keepalive byte to ensure the tcp connection is alive.
+ */
+void ceph_con_keepalive(struct ceph_connection *con)
+{
+ if (test_and_set_bit(KEEPALIVE_PENDING, &con->state) == 0 &&
+ test_and_set_bit(WRITE_PENDING, &con->state) == 0)
+ queue_con(con);
+}
+
+
+/*
+ * construct a new message with given type, size
+ * the new msg has a ref count of 1.
+ */
+struct ceph_msg *ceph_msg_new(int type, int front_len,
+ int page_len, int page_off, struct page **pages)
+{

+ struct ceph_msg *m;
+

+ m = kmalloc(sizeof(*m), GFP_NOFS);
+ if (m == NULL)
+ goto out;
+ kref_init(&m->kref);
+ INIT_LIST_HEAD(&m->list_head);
+
+ m->hdr.type = cpu_to_le16(type);
+ m->hdr.front_len = cpu_to_le32(front_len);
+ m->hdr.middle_len = 0;
+ m->hdr.data_len = cpu_to_le32(page_len);
+ m->hdr.data_off = cpu_to_le16(page_off);
+ m->hdr.priority = cpu_to_le16(CEPH_MSG_PRIO_DEFAULT);
+ m->footer.front_crc = 0;
+ m->footer.middle_crc = 0;
+ m->footer.data_crc = 0;
+ m->front_max = front_len;
+ m->front_is_vmalloc = false;
+ m->more_to_follow = false;
+ m->pool = NULL;
+
+ /* front */
+ if (front_len) {
+ if (front_len > PAGE_CACHE_SIZE) {
+ m->front.iov_base = __vmalloc(front_len, GFP_NOFS,
+ PAGE_KERNEL);
+ m->front_is_vmalloc = true;
+ } else {
+ m->front.iov_base = kmalloc(front_len, GFP_NOFS);
+ }
+ if (m->front.iov_base == NULL) {
+ pr_err("msg_new can't allocate %d bytes\n",
+ front_len);
+ goto out2;
+ }
+ } else {
+ m->front.iov_base = NULL;
+ }
+ m->front.iov_len = front_len;
+
+ /* middle */
+ m->middle = NULL;
+
+ /* data */
+ m->nr_pages = calc_pages_for(page_off, page_len);
+ m->pages = pages;
+ m->pagelist = NULL;
+
+ dout("ceph_msg_new %p page %d~%d -> %d\n", m, page_off, page_len,
+ m->nr_pages);
+ return m;
+
+out2:
+ ceph_msg_put(m);
+out:
+ pr_err("msg_new can't create type %d len %d\n", type, front_len);
+ return ERR_PTR(-ENOMEM);
+}
+
+/*
+ * Allocate "middle" portion of a message, if it is needed and wasn't
+ * allocated by alloc_msg. This allows us to read a small fixed-size
+ * per-type header in the front and then gracefully fail (i.e.,
+ * propagate the error to the caller based on info in the front) when
+ * the middle is too large.
+ */
+static int ceph_alloc_middle(struct ceph_connection *con, struct ceph_msg *msg)
+{

+ int type = le16_to_cpu(msg->hdr.type);

+ int middle_len = le32_to_cpu(msg->hdr.middle_len);
+
+ dout("alloc_middle %p type %d %s middle_len %d\n", msg, type,
+ ceph_msg_type_name(type), middle_len);
+ BUG_ON(!middle_len);
+ BUG_ON(msg->middle);
+
+ msg->middle = ceph_buffer_new(middle_len, GFP_NOFS);
+ if (!msg->middle)
+ return -ENOMEM;

+ return 0;
+}
+
+/*

+ * Generic message allocator, for incoming messages.
+ */
+static struct ceph_msg *ceph_alloc_msg(struct ceph_connection *con,

+ struct ceph_msg_header *hdr,
+ int *skip)
+{

+ int type = le16_to_cpu(hdr->type);

+ int front_len = le32_to_cpu(hdr->front_len);
+ int middle_len = le32_to_cpu(hdr->middle_len);
+ struct ceph_msg *msg = NULL;
+ int ret;
+
+ if (con->ops->alloc_msg) {
+ mutex_unlock(&con->mutex);
+ msg = con->ops->alloc_msg(con, hdr, skip);
+ mutex_lock(&con->mutex);
+ if (IS_ERR(msg))
+ return msg;
+
+ if (*skip)
+ return NULL;
+ }
+ if (!msg) {
+ *skip = 0;
+ msg = ceph_msg_new(type, front_len, 0, 0, NULL);
+ if (!msg) {
+ pr_err("unable to allocate msg type %d len %d\n",
+ type, front_len);

+ return ERR_PTR(-ENOMEM);
+ }
+ }

+ memcpy(&msg->hdr, &con->in_hdr, sizeof(con->in_hdr));
+
+ if (middle_len) {
+ ret = ceph_alloc_middle(con, msg);
+
+ if (ret < 0) {
+ ceph_msg_put(msg);
+ return msg;
+ }
+ }
+
+ return msg;
+}
+
+
+/*
+ * Free a generically kmalloc'd message.
+ */
+void ceph_msg_kfree(struct ceph_msg *m)
+{
+ dout("msg_kfree %p\n", m);
+ if (m->front_is_vmalloc)
+ vfree(m->front.iov_base);
+ else
+ kfree(m->front.iov_base);
+ kfree(m);
+}
+
+/*
+ * Drop a msg ref. Destroy as needed.
+ */
+void ceph_msg_last_put(struct kref *kref)
+{
+ struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
+
+ dout("ceph_msg_put last one on %p\n", m);
+ WARN_ON(!list_empty(&m->list_head));
+
+ /* drop middle, data, if any */
+ if (m->middle) {
+ ceph_buffer_put(m->middle);
+ m->middle = NULL;
+ }
+ m->nr_pages = 0;
+ m->pages = NULL;
+
+ if (m->pagelist) {
+ ceph_pagelist_release(m->pagelist);
+ kfree(m->pagelist);
+ m->pagelist = NULL;
+ }
+
+ if (m->pool)
+ ceph_msgpool_put(m->pool, m);
+ else
+ ceph_msg_kfree(m);
+}
+
+void ceph_msg_dump(struct ceph_msg *msg)
+{
+ pr_debug("msg_dump %p (front_max %d nr_pages %d)\n", msg,
+ msg->front_max, msg->nr_pages);
+ print_hex_dump(KERN_DEBUG, "header: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->hdr, sizeof(msg->hdr), true);
+ print_hex_dump(KERN_DEBUG, " front: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->front.iov_base, msg->front.iov_len, true);
+ if (msg->middle)
+ print_hex_dump(KERN_DEBUG, "middle: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ msg->middle->vec.iov_base,
+ msg->middle->vec.iov_len, true);
+ print_hex_dump(KERN_DEBUG, "footer: ",
+ DUMP_PREFIX_OFFSET, 16, 1,
+ &msg->footer, sizeof(msg->footer), true);
+}
diff --git a/fs/ceph/messenger.h b/fs/ceph/messenger.h
new file mode 100644
index 0000000..4caaa59
--- /dev/null
+++ b/fs/ceph/messenger.h
@@ -0,0 +1,254 @@
+#ifndef __FS_CEPH_MESSENGER_H
+#define __FS_CEPH_MESSENGER_H
+
+#include <linux/kref.h>
+#include <linux/mutex.h>
+#include <linux/net.h>
+#include <linux/radix-tree.h>
+#include <linux/uio.h>
+#include <linux/version.h>
+#include <linux/workqueue.h>
+
+#include "types.h"
+#include "buffer.h"
+
+struct ceph_msg;
+struct ceph_connection;
+
+extern struct workqueue_struct *ceph_msgr_wq; /* receive work queue */
+
+/*
+ * Ceph defines these callbacks for handling connection events.
+ */
+struct ceph_connection_operations {
+ struct ceph_connection *(*get)(struct ceph_connection *);
+ void (*put)(struct ceph_connection *);
+
+ /* handle an incoming message. */
+ void (*dispatch) (struct ceph_connection *con, struct ceph_msg *m);
+
+ /* authorize an outgoing connection */
+ int (*get_authorizer) (struct ceph_connection *con,

+ void **buf, int *len, int *proto,
+ void **reply_buf, int *reply_len, int force_new);

+ int (*verify_authorizer_reply) (struct ceph_connection *con, int len);
+ int (*invalidate_authorizer)(struct ceph_connection *con);
+
+ /* protocol version mismatch */
+ void (*bad_proto) (struct ceph_connection *con);
+
+ /* there was some error on the socket (disconnect, whatever) */
+ void (*fault) (struct ceph_connection *con);
+
+ /* a remote host as terminated a message exchange session, and messages
+ * we sent (or they tried to send us) may be lost. */
+ void (*peer_reset) (struct ceph_connection *con);
+
+ struct ceph_msg * (*alloc_msg) (struct ceph_connection *con,

+ struct ceph_msg_header *hdr,
+ int *skip);
+};
+

+extern const char *ceph_name_type_str(int t);
+
+/* use format string %s%d */
+#define ENTITY_NAME(n) ceph_name_type_str((n).type), le64_to_cpu((n).num)
+
+struct ceph_messenger {
+ struct ceph_entity_inst inst; /* my name+address */
+ struct ceph_entity_addr my_enc_addr;
+ struct page *zero_page; /* used in certain error cases */
+
+ bool nocrc;
+
+ /*
+ * the global_seq counts connections i (attempt to) initiate
+ * in order to disambiguate certain connect race conditions.
+ */
+ u32 global_seq;
+ spinlock_t global_seq_lock;
+};
+
+/*
+ * a single message. it contains a header (src, dest, message type, etc.),
+ * footer (crc values, mainly), a "front" message body, and possibly a
+ * data payload (stored in some number of pages).
+ */
+struct ceph_msg {
+ struct ceph_msg_header hdr; /* header */
+ struct ceph_msg_footer footer; /* footer */
+ struct kvec front; /* unaligned blobs of message */
+ struct ceph_buffer *middle;
+ struct page **pages; /* data payload. NOT OWNER. */
+ unsigned nr_pages; /* size of page array */
+ struct ceph_pagelist *pagelist; /* instead of pages */
+ struct list_head list_head;
+ struct kref kref;
+ bool front_is_vmalloc;
+ bool more_to_follow;
+ int front_max;
+
+ struct ceph_msgpool *pool;
+};
+
+struct ceph_msg_pos {
+ int page, page_pos; /* which page; offset in page */
+ int data_pos; /* offset in data payload */
+ int did_page_crc; /* true if we've calculated crc for current page */
+};
+
+/* ceph connection fault delay defaults, for exponential backoff */
+#define BASE_DELAY_INTERVAL (HZ/2)
+#define MAX_DELAY_INTERVAL (5 * 60 * HZ)
+
+/*
+ * ceph_connection state bit flags
+ *
+ * QUEUED and BUSY are used together to ensure that only a single
+ * thread is currently opening, reading or writing data to the socket.
+ */
+#define LOSSYTX 0 /* we can close channel or drop messages on errors */
+#define CONNECTING 1
+#define NEGOTIATING 2
+#define KEEPALIVE_PENDING 3
+#define WRITE_PENDING 4 /* we have data ready to send */
+#define QUEUED 5 /* there is work queued on this connection */
+#define BUSY 6 /* work is being done */
+#define STANDBY 8 /* no outgoing messages, socket closed. we keep
+ * the ceph_connection around to maintain shared
+ * state with the peer. */
+#define CLOSED 10 /* we've closed the connection */
+#define SOCK_CLOSED 11 /* socket state changed to closed */
+#define OPENING 13 /* open connection w/ (possibly new) peer */
+#define DEAD 14 /* dead, about to kfree */
+
+/*
+ * A single connection with another host.
+ *
+ * We maintain a queue of outgoing messages, and some session state to
+ * ensure that we can preserve the lossless, ordered delivery of
+ * messages in the case of a TCP disconnect.
+ */
+struct ceph_connection {
+ void *private;
+ atomic_t nref;
+
+ const struct ceph_connection_operations *ops;

+
+ struct ceph_messenger *msgr;

+ struct socket *sock;
+ unsigned long state; /* connection state (see flags above) */
+ const char *error_msg; /* error message, if any */
+
+ struct ceph_entity_addr peer_addr; /* peer address */
+ struct ceph_entity_name peer_name; /* peer name */
+ struct ceph_entity_addr peer_addr_for_me;
+ u32 connect_seq; /* identify the most recent connection
+ attempt for this connection, client */
+ u32 peer_global_seq; /* peer's global seq for this connection */
+
+ int auth_retry; /* true if we need a newer authorizer */
+ void *auth_reply_buf; /* where to put the authorizer reply */
+ int auth_reply_buf_len;
+
+ struct mutex mutex;
+
+ /* out queue */
+ struct list_head out_queue;
+ struct list_head out_sent; /* sending or sent but unacked */
+ u64 out_seq; /* last message queued for send */
+ u64 out_seq_sent; /* last message sent */
+ bool out_keepalive_pending;
+
+ u64 in_seq, in_seq_acked; /* last message received, acked */
+
+ /* connection negotiation temps */
+ char in_banner[CEPH_BANNER_MAX_LEN];
+ union {
+ struct { /* outgoing connection */
+ struct ceph_msg_connect out_connect;
+ struct ceph_msg_connect_reply in_reply;
+ };
+ struct { /* incoming */
+ struct ceph_msg_connect in_connect;
+ struct ceph_msg_connect_reply out_reply;
+ };
+ };
+ struct ceph_entity_addr actual_peer_addr;
+
+ /* message out temps */
+ struct ceph_msg *out_msg; /* sending message (== tail of
+ out_sent) */
+ bool out_msg_done;
+ struct ceph_msg_pos out_msg_pos;
+
+ struct kvec out_kvec[8], /* sending header/footer data */
+ *out_kvec_cur;
+ int out_kvec_left; /* kvec's left in out_kvec */
+ int out_skip; /* skip this many bytes */
+ int out_kvec_bytes; /* total bytes left */
+ bool out_kvec_is_msg; /* kvec refers to out_msg */
+ int out_more; /* there is more data after the kvecs */
+ __le64 out_temp_ack; /* for writing an ack */
+
+ /* message in temps */
+ struct ceph_msg_header in_hdr;
+ struct ceph_msg *in_msg;
+ struct ceph_msg_pos in_msg_pos;
+ u32 in_front_crc, in_middle_crc, in_data_crc; /* calculated crc */
+
+ char in_tag; /* protocol control byte */
+ int in_base_pos; /* bytes read */
+ __le64 in_temp_ack; /* for reading an ack */
+
+ struct delayed_work work; /* send|recv work */
+ unsigned long delay; /* current delay interval */
+};
+
+
+extern const char *pr_addr(const struct sockaddr_storage *ss);
+extern int ceph_parse_ips(const char *c, const char *end,
+ struct ceph_entity_addr *addr,
+ int max_count, int *count);
+
+
+extern int ceph_msgr_init(void);
+extern void ceph_msgr_exit(void);
+
+extern struct ceph_messenger *ceph_messenger_create(
+ struct ceph_entity_addr *myaddr);
+extern void ceph_messenger_destroy(struct ceph_messenger *);
+
+extern void ceph_con_init(struct ceph_messenger *msgr,
+ struct ceph_connection *con);
+extern void ceph_con_open(struct ceph_connection *con,
+ struct ceph_entity_addr *addr);
+extern void ceph_con_close(struct ceph_connection *con);
+extern void ceph_con_send(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke(struct ceph_connection *con, struct ceph_msg *msg);
+extern void ceph_con_revoke_message(struct ceph_connection *con,
+ struct ceph_msg *msg);
+extern void ceph_con_keepalive(struct ceph_connection *con);
+extern struct ceph_connection *ceph_con_get(struct ceph_connection *con);
+extern void ceph_con_put(struct ceph_connection *con);
+
+extern struct ceph_msg *ceph_msg_new(int type, int front_len,
+ int page_len, int page_off,
+ struct page **pages);
+extern void ceph_msg_kfree(struct ceph_msg *m);
+
+
+static inline struct ceph_msg *ceph_msg_get(struct ceph_msg *msg)
+{
+ kref_get(&msg->kref);
+ return msg;
+}
+extern void ceph_msg_last_put(struct kref *kref);
+static inline void ceph_msg_put(struct ceph_msg *msg)
+{
+ kref_put(&msg->kref, ceph_msg_last_put);
+}
+
+extern void ceph_msg_dump(struct ceph_msg *msg);
+
+#endif

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

This implements the 'auth_x' authentication scheme, which bears
some similarity to Kerberos but is designed to perform well for
large, distributed clusters providing a single 'virtual' service
(as with the ceph osd and mds clusters).

The client authenticates with the monitor and obtains an auth
ticket as well as service tickets for talking to the mds and
osd clusters. Tickets are periodically renewed, as the servers
rotate their keys.

Each time the client opens a TCP connection to a service, it
provides the ticket, which mutually authenticates the client
to the service and the service to the client.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/armor.c | 99 +++++++
fs/ceph/auth_x.c | 656 +++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/auth_x.h | 49 ++++
fs/ceph/auth_x_protocol.h | 90 ++++++
fs/ceph/crypto.c | 408 ++++++++++++++++++++++++++++
fs/ceph/crypto.h | 48 ++++
6 files changed, 1350 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/armor.c
create mode 100644 fs/ceph/auth_x.c
create mode 100644 fs/ceph/auth_x.h
create mode 100644 fs/ceph/auth_x_protocol.h
create mode 100644 fs/ceph/crypto.c
create mode 100644 fs/ceph/crypto.h

diff --git a/fs/ceph/armor.c b/fs/ceph/armor.c
new file mode 100644
index 0000000..67b2c03
--- /dev/null
+++ b/fs/ceph/armor.c
@@ -0,0 +1,99 @@
+
+#include <linux/errno.h>
+
+/*
+ * base64 encode/decode.
+ */
+
+const char *pem_key = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
+
+static int encode_bits(int c)
+{
+ return pem_key[c];
+}
+
+static int decode_bits(char c)
+{
+ if (c >= 'A' && c <= 'Z')
+ return c - 'A';
+ if (c >= 'a' && c <= 'z')
+ return c - 'a' + 26;
+ if (c >= '0' && c <= '9')
+ return c - '0' + 52;
+ if (c == '+')
+ return 62;
+ if (c == '/')
+ return 63;
+ if (c == '=')
+ return 0; /* just non-negative, please */

+ return -EINVAL;
+}
+

+int ceph_armor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+ int line = 0;
+
+ while (src < end) {
+ unsigned char a, b, c;
+
+ a = *src++;
+ *dst++ = encode_bits(a >> 2);
+ if (src < end) {
+ b = *src++;
+ *dst++ = encode_bits(((a & 3) << 4) | (b >> 4));
+ if (src < end) {
+ c = *src++;
+ *dst++ = encode_bits(((b & 15) << 2) |
+ (c >> 6));
+ *dst++ = encode_bits(c & 63);
+ } else {
+ *dst++ = encode_bits((b & 15) << 2);
+ *dst++ = '=';
+ }
+ } else {
+ *dst++ = encode_bits(((a & 3) << 4));
+ *dst++ = '=';
+ *dst++ = '=';
+ }
+ olen += 4;
+ line += 4;
+ if (line == 64) {
+ line = 0;
+ *(dst++) = '\n';
+ olen++;
+ }
+ }
+ return olen;
+}
+
+int ceph_unarmor(char *dst, const char *src, const char *end)
+{
+ int olen = 0;
+
+ while (src < end) {
+ int a, b, c, d;
+
+ if (src < end && src[0] == '\n')
+ src++;
+ if (src + 4 > end)
+ return -EINVAL;
+ a = decode_bits(src[0]);
+ b = decode_bits(src[1]);
+ c = decode_bits(src[2]);
+ d = decode_bits(src[3]);
+ if (a < 0 || b < 0 || c < 0 || d < 0)
+ return -EINVAL;
+
+ *dst++ = (a << 2) | (b >> 4);
+ if (src[2] == '=')
+ return olen + 1;
+ *dst++ = ((b & 15) << 4) | (c >> 2);
+ if (src[3] == '=')
+ return olen + 2;
+ *dst++ = ((c & 3) << 6) | d;
+ olen += 3;
+ src += 4;
+ }
+ return olen;
+}
diff --git a/fs/ceph/auth_x.c b/fs/ceph/auth_x.c
new file mode 100644
index 0000000..f031842
--- /dev/null
+++ b/fs/ceph/auth_x.c
@@ -0,0 +1,656 @@
+

+#include "ceph_debug.h"
+
+#include <linux/err.h>

+#include <linux/module.h>
+#include <linux/random.h>
+
+#include "auth_x.h"
+#include "auth_x_protocol.h"
+#include "crypto.h"
+#include "auth.h"
+#include "decode.h"
+
+struct kmem_cache *ceph_x_ticketbuf_cachep;
+
+#define TEMP_TICKET_BUF_LEN 256
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed);
+
+static int ceph_x_is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+
+ ceph_x_validate_tickets(ac, &need);
+ dout("ceph_x_is_authenticated want=%d need=%d have=%d\n",
+ ac->want_keys, need, xi->have_keys);
+ return (ac->want_keys & xi->have_keys) == ac->want_keys;
+}
+
+static int ceph_x_encrypt(struct ceph_crypto_key *secret,
+ void *ibuf, int ilen, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head = {
+ .struct_v = 1,
+ .magic = cpu_to_le64(CEPHX_ENC_MAGIC)
+ };
+ size_t len = olen - sizeof(u32);
+ int ret;
+
+ ret = ceph_encrypt2(secret, obuf + sizeof(u32), &len,
+ &head, sizeof(head), ibuf, ilen);
+ if (ret)
+ return ret;
+ ceph_encode_32(&obuf, len);
+ return len + sizeof(u32);
+}
+
+static int ceph_x_decrypt(struct ceph_crypto_key *secret,
+ void **p, void *end, void *obuf, size_t olen)
+{
+ struct ceph_x_encrypt_header head;
+ size_t head_len = sizeof(head);
+ int len, ret;
+
+ len = ceph_decode_32(p);
+ if (*p + len > end)
+ return -EINVAL;
+
+ dout("ceph_x_decrypt len %d\n", len);
+ ret = ceph_decrypt2(secret, &head, &head_len, obuf, &olen,
+ *p, len);
+ if (ret)
+ return ret;
+ if (head.struct_v != 1 || le64_to_cpu(head.magic) != CEPHX_ENC_MAGIC)
+ return -EPERM;
+ *p += len;
+ return olen;
+}
+
+/*
+ * get existing (or insert new) ticket handler
+ */
+struct ceph_x_ticket_handler *get_ticket_handler(struct ceph_auth_client *ac,
+ int service)
+{
+ struct ceph_x_ticket_handler *th;
+ struct ceph_x_info *xi = ac->private;
+ struct rb_node *parent = NULL, **p = &xi->ticket_handlers.rb_node;

+
+ while (*p) {
+ parent = *p;

+ th = rb_entry(parent, struct ceph_x_ticket_handler, node);
+ if (service < th->service)

+ p = &(*p)->rb_left;

+ else if (service > th->service)

+ p = &(*p)->rb_right;
+ else

+ return th;
+ }
+
+ /* add it */
+ th = kzalloc(sizeof(*th), GFP_NOFS);
+ if (!th)
+ return ERR_PTR(-ENOMEM);
+ th->service = service;
+ rb_link_node(&th->node, parent, p);
+ rb_insert_color(&th->node, &xi->ticket_handlers);
+ return th;
+}
+
+static void remove_ticket_handler(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("remove_ticket_handler %p %d\n", th, th->service);
+ rb_erase(&th->node, &xi->ticket_handlers);
+ ceph_crypto_key_destroy(&th->session_key);
+ if (th->ticket_blob)
+ ceph_buffer_put(th->ticket_blob);
+ kfree(th);
+}
+
+static int ceph_x_proc_ticket_reply(struct ceph_auth_client *ac,
+ struct ceph_crypto_key *secret,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int num;
+ void *p = buf;
+ int ret;
+ char *dbuf;
+ char *ticket_buf;
+ u8 struct_v;
+
+ dbuf = kmem_cache_alloc(ceph_x_ticketbuf_cachep, GFP_NOFS | GFP_ATOMIC);
+ if (!dbuf)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ ticket_buf = kmem_cache_alloc(ceph_x_ticketbuf_cachep,
+ GFP_NOFS | GFP_ATOMIC);
+ if (!ticket_buf)
+ goto out_dbuf;
+
+ ceph_decode_need(&p, end, 1 + sizeof(u32), bad);
+ struct_v = ceph_decode_8(&p);
+ if (struct_v != 1)
+ goto bad;
+ num = ceph_decode_32(&p);
+ dout("%d tickets\n", num);
+ while (num--) {
+ int type;
+ u8 struct_v;
+ struct ceph_x_ticket_handler *th;
+ void *dp, *dend;
+ int dlen;
+ char is_enc;
+ struct timespec validity;
+ struct ceph_crypto_key old_key;
+ void *tp, *tpend;
+
+ ceph_decode_need(&p, end, sizeof(u32) + 1, bad);
+
+ type = ceph_decode_32(&p);
+ dout(" ticket type %d %s\n", type, ceph_entity_type_name(type));
+
+ struct_v = ceph_decode_8(&p);
+ if (struct_v != 1)
+ goto bad;
+
+ th = get_ticket_handler(ac, type);
+ if (IS_ERR(th)) {
+ ret = PTR_ERR(th);

+ goto out;
+ }
+

+ /* blob for me */
+ dlen = ceph_x_decrypt(secret, &p, end, dbuf,
+ TEMP_TICKET_BUF_LEN);
+ if (dlen <= 0) {
+ ret = dlen;
+ goto out;
+ }
+ dout(" decrypted %d bytes\n", dlen);
+ dend = dbuf + dlen;
+ dp = dbuf;
+
+ struct_v = ceph_decode_8(&dp);
+ if (struct_v != 1)
+ goto bad;
+
+ memcpy(&old_key, &th->session_key, sizeof(old_key));
+ ret = ceph_crypto_key_decode(&th->session_key, &dp, dend);
+ if (ret)
+ goto out;
+
+ ceph_decode_copy(&dp, &th->validity, sizeof(th->validity));
+ ceph_decode_timespec(&validity, &th->validity);
+ th->expires = get_seconds() + validity.tv_sec;
+ th->renew_after = th->expires - (validity.tv_sec / 4);
+ dout(" expires=%lu renew_after=%lu\n", th->expires,
+ th->renew_after);
+
+ /* ticket blob for service */
+ ceph_decode_8_safe(&p, end, is_enc, bad);
+ tp = ticket_buf;
+ if (is_enc) {
+ /* encrypted */
+ dout(" encrypted ticket\n");
+ dlen = ceph_x_decrypt(&old_key, &p, end, ticket_buf,
+ TEMP_TICKET_BUF_LEN);
+ if (dlen < 0) {
+ ret = dlen;
+ goto out;
+ }
+ dlen = ceph_decode_32(&tp);
+ } else {
+ /* unencrypted */
+ ceph_decode_32_safe(&p, end, dlen, bad);
+ ceph_decode_need(&p, end, dlen, bad);
+ ceph_decode_copy(&p, ticket_buf, dlen);
+ }
+ tpend = tp + dlen;
+ dout(" ticket blob is %d bytes\n", dlen);
+ ceph_decode_need(&tp, tpend, 1 + sizeof(u64), bad);
+ struct_v = ceph_decode_8(&tp);
+ th->secret_id = ceph_decode_64(&tp);
+ ret = ceph_decode_buffer(&th->ticket_blob, &tp, tpend);
+ if (ret)
+ goto out;
+ dout(" got ticket service %d (%s) secret_id %lld len %d\n",
+ type, ceph_entity_type_name(type), th->secret_id,
+ (int)th->ticket_blob->vec.iov_len);
+ xi->have_keys |= th->service;
+ }
+
+ ret = 0;
+out:
+ kmem_cache_free(ceph_x_ticketbuf_cachep, ticket_buf);
+out_dbuf:
+ kmem_cache_free(ceph_x_ticketbuf_cachep, dbuf);
+ return ret;
+
+bad:
+ ret = -EINVAL;

+ goto out;
+}
+

+static int ceph_x_build_authorizer(struct ceph_auth_client *ac,
+ struct ceph_x_ticket_handler *th,
+ struct ceph_x_authorizer *au)
+{
+ int len;
+ struct ceph_x_authorize_a *msg_a;
+ struct ceph_x_authorize_b msg_b;
+ void *p, *end;
+ int ret;
+ int ticket_blob_len =
+ (th->ticket_blob ? th->ticket_blob->vec.iov_len : 0);
+
+ dout("build_authorizer for %s %p\n",
+ ceph_entity_type_name(th->service), au);
+
+ len = sizeof(*msg_a) + sizeof(msg_b) + sizeof(u32) +
+ ticket_blob_len + 16;
+ dout(" need len %d\n", len);
+ if (au->buf && au->buf->alloc_len < len) {
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;
+ }
+ if (!au->buf) {
+ au->buf = ceph_buffer_new(len, GFP_NOFS);
+ if (!au->buf)
+ return -ENOMEM;
+ }
+ au->service = th->service;
+
+ msg_a = au->buf->vec.iov_base;
+ msg_a->struct_v = 1;
+ msg_a->global_id = cpu_to_le64(ac->global_id);
+ msg_a->service_id = cpu_to_le32(th->service);
+ msg_a->ticket_blob.struct_v = 1;
+ msg_a->ticket_blob.secret_id = cpu_to_le64(th->secret_id);
+ msg_a->ticket_blob.blob_len = cpu_to_le32(ticket_blob_len);
+ if (ticket_blob_len) {
+ memcpy(msg_a->ticket_blob.blob, th->ticket_blob->vec.iov_base,
+ th->ticket_blob->vec.iov_len);
+ }
+ dout(" th %p secret_id %lld %lld\n", th, th->secret_id,
+ le64_to_cpu(msg_a->ticket_blob.secret_id));
+
+ p = msg_a + 1;
+ p += ticket_blob_len;
+ end = au->buf->vec.iov_base + au->buf->vec.iov_len;
+
+ get_random_bytes(&au->nonce, sizeof(au->nonce));
+ msg_b.struct_v = 1;
+ msg_b.nonce = cpu_to_le64(au->nonce);
+ ret = ceph_x_encrypt(&th->session_key, &msg_b, sizeof(msg_b),
+ p, end - p);
+ if (ret < 0)
+ goto out_buf;
+ p += ret;
+ au->buf->vec.iov_len = p - au->buf->vec.iov_base;
+ dout(" built authorizer nonce %llx len %d\n", au->nonce,
+ (int)au->buf->vec.iov_len);
+ return 0;
+
+out_buf:
+ ceph_buffer_put(au->buf);
+ au->buf = NULL;

+ return ret;
+}
+

+static int ceph_x_encode_ticket(struct ceph_x_ticket_handler *th,
+ void **p, void *end)
+{
+ ceph_decode_need(p, end, 1 + sizeof(u64), bad);
+ ceph_encode_8(p, 1);
+ ceph_encode_64(p, th->secret_id);
+ if (th->ticket_blob) {
+ const char *buf = th->ticket_blob->vec.iov_base;
+ u32 len = th->ticket_blob->vec.iov_len;
+
+ ceph_encode_32_safe(p, end, len, bad);
+ ceph_encode_copy_safe(p, end, buf, len, bad);
+ } else {
+ ceph_encode_32_safe(p, end, 0, bad);

+ }
+
+ return 0;

+bad:
+ return -ERANGE;
+}
+
+static void ceph_x_validate_tickets(struct ceph_auth_client *ac, int *pneed)
+{
+ int want = ac->want_keys;
+ struct ceph_x_info *xi = ac->private;
+ int service;
+
+ *pneed = ac->want_keys & ~(xi->have_keys);
+
+ for (service = 1; service <= want; service <<= 1) {
+ struct ceph_x_ticket_handler *th;
+
+ if (!(ac->want_keys & service))
+ continue;
+
+ if (*pneed & service)
+ continue;
+
+ th = get_ticket_handler(ac, service);
+
+ if (!th) {
+ *pneed |= service;
+ continue;
+ }
+
+ if (get_seconds() >= th->renew_after)
+ *pneed |= service;
+ if (get_seconds() >= th->expires)
+ xi->have_keys &= ~service;
+ }
+}
+
+
+static int ceph_x_build_request(struct ceph_auth_client *ac,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ int need;
+ struct ceph_x_request_header *head = buf;
+ int ret;
+ struct ceph_x_ticket_handler *th =
+ get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+
+ ceph_x_validate_tickets(ac, &need);
+
+ dout("build_request want %x have %x need %x\n",
+ ac->want_keys, xi->have_keys, need);
+
+ if (need & CEPH_ENTITY_TYPE_AUTH) {
+ struct ceph_x_authenticate *auth = (void *)(head + 1);
+ void *p = auth + 1;
+ struct ceph_x_challenge_blob tmp;
+ char tmp_enc[40];
+ u64 *u;
+
+ if (p > end)
+ return -ERANGE;
+
+ dout(" get_auth_session_key\n");
+ head->op = cpu_to_le16(CEPHX_GET_AUTH_SESSION_KEY);
+
+ /* encrypt and hash */
+ get_random_bytes(&auth->client_challenge, sizeof(u64));
+ tmp.client_challenge = auth->client_challenge;
+ tmp.server_challenge = cpu_to_le64(xi->server_challenge);
+ ret = ceph_x_encrypt(&xi->secret, &tmp, sizeof(tmp),
+ tmp_enc, sizeof(tmp_enc));
+ if (ret < 0)
+ return ret;
+
+ auth->struct_v = 1;
+ auth->key = 0;
+ for (u = (u64 *)tmp_enc; u + 1 <= (u64 *)(tmp_enc + ret); u++)
+ auth->key ^= *u;
+ dout(" server_challenge %llx client_challenge %llx key %llx\n",
+ xi->server_challenge, le64_to_cpu(auth->client_challenge),
+ le64_to_cpu(auth->key));
+
+ /* now encode the old ticket if exists */
+ ret = ceph_x_encode_ticket(th, &p, end);
+ if (ret < 0)
+ return ret;
+
+ return p - buf;
+ }
+
+ if (need) {
+ void *p = head + 1;
+ struct ceph_x_service_ticket_request *req;
+
+ if (p > end)
+ return -ERANGE;
+ head->op = cpu_to_le16(CEPHX_GET_PRINCIPAL_SESSION_KEY);
+
+ BUG_ON(!th);
+ ret = ceph_x_build_authorizer(ac, th, &xi->auth_authorizer);
+ if (ret)
+ return ret;
+ ceph_encode_copy(&p, xi->auth_authorizer.buf->vec.iov_base,
+ xi->auth_authorizer.buf->vec.iov_len);
+
+ req = p;
+ req->keys = cpu_to_le32(need);
+ p += sizeof(*req);
+ return p - buf;
+ }

+
+ return 0;
+}
+

+static int ceph_x_handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_x_info *xi = ac->private;
+ struct ceph_x_reply_header *head = buf;
+ struct ceph_x_ticket_handler *th;
+ int len = end - buf;
+ int op;
+ int ret;
+
+ if (result)
+ return result; /* XXX hmm? */
+
+ if (xi->starting) {
+ /* it's a hello */
+ struct ceph_x_server_challenge *sc = buf;
+
+ if (len != sizeof(*sc))
+ return -EINVAL;
+ xi->server_challenge = le64_to_cpu(sc->server_challenge);
+ dout("handle_reply got server challenge %llx\n",
+ xi->server_challenge);
+ xi->starting = false;
+ xi->have_keys &= ~CEPH_ENTITY_TYPE_AUTH;
+ return -EAGAIN;
+ }
+
+ op = le32_to_cpu(head->op);
+ result = le32_to_cpu(head->result);
+ dout("handle_reply op %d result %d\n", op, result);
+ switch (op) {
+ case CEPHX_GET_AUTH_SESSION_KEY:
+ /* verify auth key */
+ ret = ceph_x_proc_ticket_reply(ac, &xi->secret,
+ buf + sizeof(*head), end);
+ break;
+
+ case CEPHX_GET_PRINCIPAL_SESSION_KEY:
+ th = get_ticket_handler(ac, CEPH_ENTITY_TYPE_AUTH);
+ BUG_ON(!th);
+ ret = ceph_x_proc_ticket_reply(ac, &th->session_key,
+ buf + sizeof(*head), end);

+ break;
+
+ default:

+ return -EINVAL;
+ }
+ if (ret)
+ return ret;
+ if (ac->want_keys == xi->have_keys)
+ return 0;
+ return -EAGAIN;
+}
+
+static int ceph_x_create_authorizer(
+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{
+ struct ceph_x_authorizer *au;
+ struct ceph_x_ticket_handler *th;
+ int ret;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (IS_ERR(th))
+ return PTR_ERR(th);
+
+ au = kzalloc(sizeof(*au), GFP_NOFS);
+ if (!au)
+ return -ENOMEM;
+
+ ret = ceph_x_build_authorizer(ac, th, au);
+ if (ret) {
+ kfree(au);

+ return ret;
+ }
+

+ *a = (struct ceph_authorizer *)au;
+ *buf = au->buf->vec.iov_base;
+ *len = au->buf->vec.iov_len;
+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);

+ return 0;
+}
+

+static int ceph_x_verify_authorizer_reply(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a, size_t len)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+ struct ceph_x_ticket_handler *th;

+ int ret = 0;

+ struct ceph_x_authorize_reply reply;
+ void *p = au->reply_buf;
+ void *end = p + sizeof(au->reply_buf);
+
+ th = get_ticket_handler(ac, au->service);
+ if (!th)
+ return -EIO; /* hrm! */
+ ret = ceph_x_decrypt(&th->session_key, &p, end, &reply, sizeof(reply));
+ if (ret < 0)
+ return ret;
+ if (ret != sizeof(reply))
+ return -EPERM;
+
+ if (au->nonce + 1 != le64_to_cpu(reply.nonce_plus_one))
+ ret = -EPERM;
+ else
+ ret = 0;
+ dout("verify_authorizer_reply nonce %llx got %llx ret %d\n",
+ au->nonce, le64_to_cpu(reply.nonce_plus_one), ret);

+ return ret;
+}
+

+static void ceph_x_destroy_authorizer(struct ceph_auth_client *ac,
+ struct ceph_authorizer *a)
+{
+ struct ceph_x_authorizer *au = (void *)a;
+
+ ceph_buffer_put(au->buf);
+ kfree(au);
+}
+
+
+static void ceph_x_reset(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;
+
+ dout("reset\n");
+ xi->starting = true;
+ xi->server_challenge = 0;
+}
+
+static void ceph_x_destroy(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi = ac->private;

+ struct rb_node *p;
+

+ dout("ceph_x_destroy %p\n", ac);
+ ceph_crypto_key_destroy(&xi->secret);
+
+ while ((p = rb_first(&xi->ticket_handlers)) != NULL) {
+ struct ceph_x_ticket_handler *th =
+ rb_entry(p, struct ceph_x_ticket_handler, node);
+ remove_ticket_handler(ac, th);
+ }
+
+ kmem_cache_destroy(ceph_x_ticketbuf_cachep);
+
+ kfree(ac->private);
+ ac->private = NULL;
+}
+
+static void ceph_x_invalidate_authorizer(struct ceph_auth_client *ac,
+ int peer_type)
+{
+ struct ceph_x_ticket_handler *th;
+
+ th = get_ticket_handler(ac, peer_type);
+ if (th && !IS_ERR(th))
+ remove_ticket_handler(ac, th);
+}
+
+
+static const struct ceph_auth_client_ops ceph_x_ops = {
+ .is_authenticated = ceph_x_is_authenticated,
+ .build_request = ceph_x_build_request,
+ .handle_reply = ceph_x_handle_reply,
+ .create_authorizer = ceph_x_create_authorizer,
+ .verify_authorizer_reply = ceph_x_verify_authorizer_reply,
+ .destroy_authorizer = ceph_x_destroy_authorizer,
+ .invalidate_authorizer = ceph_x_invalidate_authorizer,
+ .reset = ceph_x_reset,
+ .destroy = ceph_x_destroy,
+};
+
+
+int ceph_x_init(struct ceph_auth_client *ac)
+{
+ struct ceph_x_info *xi;
+ int ret;
+
+ dout("ceph_x_init %p\n", ac);
+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)
+ return -ENOMEM;
+
+ ret = -ENOMEM;
+ ceph_x_ticketbuf_cachep = kmem_cache_create("ceph_x_ticketbuf",
+ TEMP_TICKET_BUF_LEN, 8,
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ NULL);
+ if (!ceph_x_ticketbuf_cachep)
+ goto done_nomem;
+ ret = -EINVAL;
+ if (!ac->secret) {
+ pr_err("no secret set (for auth_x protocol)\n");
+ goto done_nomem;
+ }
+
+ ret = ceph_crypto_key_unarmor(&xi->secret, ac->secret);
+ if (ret)
+ goto done_nomem;
+
+ xi->starting = true;
+ xi->ticket_handlers = RB_ROOT;
+
+ ac->protocol = CEPH_AUTH_CEPHX;
+ ac->private = xi;
+ ac->ops = &ceph_x_ops;
+ return 0;
+
+done_nomem:
+ kfree(xi);
+ if (ceph_x_ticketbuf_cachep)
+ kmem_cache_destroy(ceph_x_ticketbuf_cachep);

+ return ret;
+}
+
+

diff --git a/fs/ceph/auth_x.h b/fs/ceph/auth_x.h
new file mode 100644
index 0000000..ff6f818
--- /dev/null
+++ b/fs/ceph/auth_x.h
@@ -0,0 +1,49 @@
+#ifndef _FS_CEPH_AUTH_X_H
+#define _FS_CEPH_AUTH_X_H
+
+#include <linux/rbtree.h>
+
+#include "crypto.h"
+#include "auth.h"
+#include "auth_x_protocol.h"
+
+/*
+ * Handle ticket for a single service.
+ */
+struct ceph_x_ticket_handler {
+ struct rb_node node;
+ unsigned service;
+
+ struct ceph_crypto_key session_key;
+ struct ceph_timespec validity;
+
+ u64 secret_id;
+ struct ceph_buffer *ticket_blob;
+
+ unsigned long renew_after, expires;
+};
+
+
+struct ceph_x_authorizer {
+ struct ceph_buffer *buf;
+ unsigned service;
+ u64 nonce;
+ char reply_buf[128]; /* big enough for encrypted blob */
+};
+
+struct ceph_x_info {
+ struct ceph_crypto_key secret;
+
+ bool starting;
+ u64 server_challenge;
+
+ unsigned have_keys;
+ struct rb_root ticket_handlers;
+
+ struct ceph_x_authorizer auth_authorizer;
+};
+
+extern int ceph_x_init(struct ceph_auth_client *ac);
+
+#endif
+
diff --git a/fs/ceph/auth_x_protocol.h b/fs/ceph/auth_x_protocol.h
new file mode 100644
index 0000000..671d305
--- /dev/null
+++ b/fs/ceph/auth_x_protocol.h
@@ -0,0 +1,90 @@
+#ifndef __FS_CEPH_AUTH_X_PROTOCOL
+#define __FS_CEPH_AUTH_X_PROTOCOL
+
+#define CEPHX_GET_AUTH_SESSION_KEY 0x0100
+#define CEPHX_GET_PRINCIPAL_SESSION_KEY 0x0200
+#define CEPHX_GET_ROTATING_KEY 0x0400
+
+/* common bits */
+struct ceph_x_ticket_blob {
+ __u8 struct_v;
+ __le64 secret_id;
+ __le32 blob_len;
+ char blob[];

+} __attribute__ ((packed));
+
+

+/* common request/reply headers */
+struct ceph_x_request_header {
+ __le16 op;
+} __attribute__ ((packed));
+
+struct ceph_x_reply_header {
+ __le16 op;
+ __le32 result;

+} __attribute__ ((packed));
+
+

+/* authenticate handshake */
+
+/* initial hello (no reply header) */
+struct ceph_x_server_challenge {
+ __u8 struct_v;
+ __le64 server_challenge;
+} __attribute__ ((packed));
+
+struct ceph_x_authenticate {
+ __u8 struct_v;
+ __le64 client_challenge;
+ __le64 key;
+ /* ticket blob */
+} __attribute__ ((packed));
+
+struct ceph_x_service_ticket_request {
+ __u8 struct_v;
+ __le32 keys;
+} __attribute__ ((packed));
+
+struct ceph_x_challenge_blob {
+ __le64 server_challenge;
+ __le64 client_challenge;

+} __attribute__ ((packed));
+
+
+

+/* authorize handshake */
+
+/*
+ * The authorizer consists of two pieces:
+ * a - service id, ticket blob
+ * b - encrypted with session key
+ */
+struct ceph_x_authorize_a {
+ __u8 struct_v;
+ __le64 global_id;
+ __le32 service_id;
+ struct ceph_x_ticket_blob ticket_blob;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_b {
+ __u8 struct_v;
+ __le64 nonce;
+} __attribute__ ((packed));
+
+struct ceph_x_authorize_reply {
+ __u8 struct_v;
+ __le64 nonce_plus_one;
+} __attribute__ ((packed));
+
+
+/*
+ * encyption bundle
+ */
+#define CEPHX_ENC_MAGIC 0xff009cad8826aa55ull
+
+struct ceph_x_encrypt_header {
+ __u8 struct_v;
+ __le64 magic;
+} __attribute__ ((packed));
+
+#endif
diff --git a/fs/ceph/crypto.c b/fs/ceph/crypto.c
new file mode 100644
index 0000000..291ac28
--- /dev/null
+++ b/fs/ceph/crypto.c
@@ -0,0 +1,408 @@
+

+#include "ceph_debug.h"
+
+#include <linux/err.h>

+#include <linux/scatterlist.h>
+#include <crypto/hash.h>
+
+#include "crypto.h"
+#include "decode.h"
+
+int ceph_crypto_key_encode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ if (*p + sizeof(u16) + sizeof(key->created) +
+ sizeof(u16) + key->len > end)
+ return -ERANGE;
+ ceph_encode_16(p, key->type);
+ ceph_encode_copy(p, &key->created, sizeof(key->created));
+ ceph_encode_16(p, key->len);
+ ceph_encode_copy(p, key->key, key->len);

+ return 0;
+}
+

+int ceph_crypto_key_decode(struct ceph_crypto_key *key, void **p, void *end)
+{
+ ceph_decode_need(p, end, 2*sizeof(u16) + sizeof(key->created), bad);
+ key->type = ceph_decode_16(p);
+ ceph_decode_copy(p, &key->created, sizeof(key->created));
+ key->len = ceph_decode_16(p);
+ ceph_decode_need(p, end, key->len, bad);
+ key->key = kmalloc(key->len, GFP_NOFS);
+ if (!key->key)
+ return -ENOMEM;
+ ceph_decode_copy(p, key->key, key->len);

+ return 0;
+
+bad:

+ dout("failed to decode crypto key\n");

+ return -EINVAL;
+}
+

+int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *inkey)
+{
+ int inlen = strlen(inkey);
+ int blen = inlen * 3 / 4;
+ void *buf, *p;
+ int ret;
+
+ dout("crypto_key_unarmor %s\n", inkey);
+ buf = kmalloc(blen, GFP_NOFS);
+ if (!buf)
+ return -ENOMEM;
+ blen = ceph_unarmor(buf, inkey, inkey+inlen);
+ if (blen < 0) {
+ kfree(buf);
+ return blen;
+ }
+
+ p = buf;
+ ret = ceph_crypto_key_decode(key, &p, p + blen);
+ kfree(buf);
+ if (ret)
+ return ret;
+ dout("crypto_key_unarmor key %p type %d len %d\n", key,
+ key->type, key->len);

+ return 0;
+}
+
+
+

+#define AES_KEY_SIZE 16
+
+static struct crypto_blkcipher *ceph_crypto_alloc_cipher(void)
+{
+ return crypto_alloc_blkcipher("cbc(aes)", 0, CRYPTO_ALG_ASYNC);
+}
+
+const u8 *aes_iv = "cephsageyudagreg";
+
+int ceph_aes_encrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[2], sg_out[1];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - (src_len & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src_len + zero_padding;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 2);
+ sg_set_buf(&sg_in[0], src, src_len);
+ sg_set_buf(&sg_in[1], pad, zero_padding);
+ sg_init_table(sg_out, 1);
+ sg_set_buf(sg_out, dst, *dst_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+ src_len + zero_padding);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0)
+ pr_err("ceph_aes_crypt failed %d\n", ret);
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+int ceph_aes_encrypt2(const void *key, int key_len, void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ struct scatterlist sg_in[3], sg_out[1];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm, .flags = 0 };
+ int ret;
+ void *iv;
+ int ivsize;
+ size_t zero_padding = (0x10 - ((src1_len + src2_len) & 0x0f));
+ char pad[16];
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ memset(pad, zero_padding, zero_padding);
+
+ *dst_len = src1_len + src2_len + zero_padding;
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 3);
+ sg_set_buf(&sg_in[0], src1, src1_len);
+ sg_set_buf(&sg_in[1], src2, src2_len);
+ sg_set_buf(&sg_in[2], pad, zero_padding);
+ sg_init_table(sg_out, 1);
+ sg_set_buf(sg_out, dst, *dst_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+ /*
+ print_hex_dump(KERN_ERR, "enc key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "enc src1: ", DUMP_PREFIX_NONE, 16, 1,
+ src1, src1_len, 1);
+ print_hex_dump(KERN_ERR, "enc src2: ", DUMP_PREFIX_NONE, 16, 1,
+ src2, src2_len, 1);
+ print_hex_dump(KERN_ERR, "enc pad: ", DUMP_PREFIX_NONE, 16, 1,
+ pad, zero_padding, 1);
+ */
+ ret = crypto_blkcipher_encrypt(&desc, sg_out, sg_in,
+ src1_len + src2_len + zero_padding);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0)
+ pr_err("ceph_aes_crypt2 failed %d\n", ret);
+ /*
+ print_hex_dump(KERN_ERR, "enc out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+int ceph_aes_decrypt(const void *key, int key_len, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[1], sg_out[2];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ sg_init_table(sg_in, 1);
+ sg_init_table(sg_out, 2);
+ sg_set_buf(sg_in, src, src_len);
+ sg_set_buf(&sg_out[0], dst, *dst_len);
+ sg_set_buf(&sg_out[1], pad, sizeof(pad));
+
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);

+ return ret;
+ }
+

+ if (src_len <= *dst_len)
+ last_byte = ((char *)dst)[src_len - 1];
+ else
+ last_byte = pad[src_len - *dst_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ *dst_len = src_len - last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out: ", DUMP_PREFIX_NONE, 16, 1,
+ dst, *dst_len, 1);
+ */
+ return 0;
+}
+
+int ceph_aes_decrypt2(const void *key, int key_len,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ struct scatterlist sg_in[1], sg_out[3];
+ struct crypto_blkcipher *tfm = ceph_crypto_alloc_cipher();
+ struct blkcipher_desc desc = { .tfm = tfm };
+ char pad[16];
+ void *iv;
+ int ivsize;
+ int ret;
+ int last_byte;
+
+ if (IS_ERR(tfm))
+ return PTR_ERR(tfm);
+
+ sg_init_table(sg_in, 1);
+ sg_set_buf(sg_in, src, src_len);
+ sg_init_table(sg_out, 3);
+ sg_set_buf(&sg_out[0], dst1, *dst1_len);
+ sg_set_buf(&sg_out[1], dst2, *dst2_len);
+ sg_set_buf(&sg_out[2], pad, sizeof(pad));
+
+ crypto_blkcipher_setkey((void *)tfm, key, key_len);
+ iv = crypto_blkcipher_crt(tfm)->iv;
+ ivsize = crypto_blkcipher_ivsize(tfm);
+
+ memcpy(iv, aes_iv, ivsize);
+
+ /*
+ print_hex_dump(KERN_ERR, "dec key: ", DUMP_PREFIX_NONE, 16, 1,
+ key, key_len, 1);
+ print_hex_dump(KERN_ERR, "dec in: ", DUMP_PREFIX_NONE, 16, 1,
+ src, src_len, 1);
+ */
+
+ ret = crypto_blkcipher_decrypt(&desc, sg_out, sg_in, src_len);
+ crypto_free_blkcipher(tfm);
+ if (ret < 0) {
+ pr_err("ceph_aes_decrypt failed %d\n", ret);

+ return ret;
+ }
+

+ if (src_len <= *dst1_len)
+ last_byte = ((char *)dst1)[src_len - 1];
+ else if (src_len <= *dst1_len + *dst2_len)
+ last_byte = ((char *)dst2)[src_len - *dst1_len - 1];
+ else
+ last_byte = pad[src_len - *dst1_len - *dst2_len - 1];
+ if (last_byte <= 16 && src_len >= last_byte) {
+ src_len -= last_byte;
+ } else {
+ pr_err("ceph_aes_decrypt got bad padding %d on src len %d\n",
+ last_byte, (int)src_len);
+ return -EPERM; /* bad padding */
+ }
+
+ if (src_len < *dst1_len) {
+ *dst1_len = src_len;
+ *dst2_len = 0;
+ } else {
+ *dst2_len = src_len - *dst1_len;
+ }
+ /*
+ print_hex_dump(KERN_ERR, "dec out1: ", DUMP_PREFIX_NONE, 16, 1,
+ dst1, *dst1_len, 1);
+ print_hex_dump(KERN_ERR, "dec out2: ", DUMP_PREFIX_NONE, 16, 1,
+ dst2, *dst2_len, 1);
+ */

+
+ return 0;
+}
+
+

+int ceph_decrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:

+ return -EINVAL;
+ }
+}
+

+int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len)
+{
+ size_t t;
+
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst1_len + *dst2_len < src_len)
+ return -ERANGE;
+ t = min(*dst1_len, src_len);
+ memcpy(dst1, src, t);
+ *dst1_len = t;
+ src += t;
+ src_len -= t;
+ if (src_len) {
+ t = min(*dst2_len, src_len);
+ memcpy(dst2, src, t);
+ *dst2_len = t;

+ }
+ return 0;
+

+ case CEPH_CRYPTO_AES:
+ return ceph_aes_decrypt2(secret->key, secret->len,
+ dst1, dst1_len, dst2, dst2_len,
+ src, src_len);
+
+ default:

+ return -EINVAL;
+ }
+}
+

+int ceph_encrypt(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src, size_t src_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src_len)
+ return -ERANGE;
+ memcpy(dst, src, src_len);
+ *dst_len = src_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt(secret->key, secret->len, dst,
+ dst_len, src, src_len);
+
+ default:

+ return -EINVAL;
+ }
+}
+

+int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len)
+{
+ switch (secret->type) {
+ case CEPH_CRYPTO_NONE:
+ if (*dst_len < src1_len + src2_len)
+ return -ERANGE;
+ memcpy(dst, src1, src1_len);
+ memcpy(dst + src1_len, src2, src2_len);
+ *dst_len = src1_len + src2_len;
+ return 0;
+
+ case CEPH_CRYPTO_AES:
+ return ceph_aes_encrypt2(secret->key, secret->len, dst, dst_len,
+ src1, src1_len, src2, src2_len);
+
+ default:

+ return -EINVAL;
+ }
+}

diff --git a/fs/ceph/crypto.h b/fs/ceph/crypto.h
new file mode 100644
index 0000000..40b502e
--- /dev/null
+++ b/fs/ceph/crypto.h
@@ -0,0 +1,48 @@
+#ifndef _FS_CEPH_CRYPTO_H
+#define _FS_CEPH_CRYPTO_H

+
+#include "types.h"
+#include "buffer.h"
+

+/*
+ * cryptographic secret
+ */
+struct ceph_crypto_key {
+ int type;
+ struct ceph_timespec created;
+ int len;
+ void *key;
+};
+
+static inline void ceph_crypto_key_destroy(struct ceph_crypto_key *key)
+{
+ kfree(key->key);
+}
+
+extern int ceph_crypto_key_encode(struct ceph_crypto_key *key,
+ void **p, void *end);
+extern int ceph_crypto_key_decode(struct ceph_crypto_key *key,
+ void **p, void *end);
+extern int ceph_crypto_key_unarmor(struct ceph_crypto_key *key, const char *in);
+
+/* crypto.c */
+extern int ceph_decrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+extern int ceph_encrypt(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src, size_t src_len);
+extern int ceph_decrypt2(struct ceph_crypto_key *secret,
+ void *dst1, size_t *dst1_len,
+ void *dst2, size_t *dst2_len,
+ const void *src, size_t src_len);
+extern int ceph_encrypt2(struct ceph_crypto_key *secret,
+ void *dst, size_t *dst_len,
+ const void *src1, size_t src1_len,
+ const void *src2, size_t src2_len);
+
+/* armor.c */
+extern int ceph_armor(char *dst, const void *src, const void *end);
+extern int ceph_unarmor(void *dst, const char *src, const char *end);

Sage Weil

unread,

Mar 2, 2010, 8:00:02 PM3/2/10

Ceph snapshots rely on client cooperation in determining which
operations apply to which snapshots, and appropriately flushing
snapshotted data and metadata back to the OSD and MDS clusters.
Because snapshots apply to subtrees of the file hierarchy and can be
created at any time, there is a fair bit of bookkeeping required to
make this work.

Portions of the hierarchy that belong to the same set of snapshots
are described by a single 'snap realm.' A 'snap context' describes
the set of snapshots that exist for a given file or directory.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/snap.c | 904 ++++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 904 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/snap.c

diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
new file mode 100644
index 0000000..bf2a5f3
--- /dev/null
+++ b/fs/ceph/snap.c
@@ -0,0 +1,904 @@
+#include "ceph_debug.h"
+
+#include <linux/sort.h>

+
+#include "super.h"
+#include "decode.h"
+

+/*
+ * Snapshots in ceph are driven in large part by cooperation from the
+ * client. In contrast to local file systems or file servers that
+ * implement snapshots at a single point in the system, ceph's
+ * distributed access to storage requires clients to help decide
+ * whether a write logically occurs before or after a recently created
+ * snapshot.
+ *
+ * This provides a perfect instantanous client-wide snapshot. Between
+ * clients, however, snapshots may appear to be applied at slightly
+ * different points in time, depending on delays in delivering the
+ * snapshot notification.
+ *
+ * Snapshots are _not_ file system-wide. Instead, each snapshot
+ * applies to the subdirectory nested beneath some directory. This
+ * effectively divides the hierarchy into multiple "realms," where all
+ * of the files contained by each realm share the same set of
+ * snapshots. An individual realm's snap set contains snapshots
+ * explicitly created on that realm, as well as any snaps in its
+ * parent's snap set _after_ the point at which the parent became it's
+ * parent (due to, say, a rename). Similarly, snaps from prior parents
+ * during the time intervals during which they were the parent are included.
+ *
+ * The client is spared most of this detail, fortunately... it must only
+ * maintains a hierarchy of realms reflecting the current parent/child
+ * realm relationship, and for each realm has an explicit list of snaps
+ * inherited from prior parents.
+ *
+ * A snap_realm struct is maintained for realms containing every inode
+ * with an open cap in the system. (The needed snap realm information is
+ * provided by the MDS whenever a cap is issued, i.e., on open.) A 'seq'
+ * version number is used to ensure that as realm parameters change (new
+ * snapshot, new parent, etc.) the client's realm hierarchy is updated.
+ *
+ * The realm hierarchy drives the generation of a 'snap context' for each
+ * realm, which simply lists the resulting set of snaps for the realm. This
+ * is attached to any writes sent to OSDs.
+ */
+/*
+ * Unfortunately error handling is a bit mixed here. If we get a snap
+ * update, but don't have enough memory to update our realm hierarchy,
+ * it's not clear what we can do about it (besides complaining to the
+ * console).
+ */
+
+
+/*
+ * increase ref count for the realm
+ *
+ * caller must hold snap_rwsem for write.
+ */
+void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("get_realm %p %d -> %d\n", realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
+ /*
+ * since we _only_ increment realm refs or empty the empty
+ * list with snap_rwsem held, adjusting the empty list here is
+ * safe. we do need to protect against concurrent empty list
+ * additions, however.
+ */
+ if (atomic_read(&realm->nref) == 0) {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_del_init(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+
+ atomic_inc(&realm->nref);
+}
+
+static void __insert_snap_realm(struct rb_root *root,
+ struct ceph_snap_realm *new)
+{
+ struct rb_node **p = &root->rb_node;
+ struct rb_node *parent = NULL;
+ struct ceph_snap_realm *r = NULL;

+
+ while (*p) {
+ parent = *p;

+ r = rb_entry(parent, struct ceph_snap_realm, node);
+ if (new->ino < r->ino)

+ p = &(*p)->rb_left;

+ else if (new->ino > r->ino)

+ p = &(*p)->rb_right;
+ else

+ BUG();
+ }
+

+ rb_link_node(&new->node, parent, p);
+ rb_insert_color(&new->node, root);

+}
+
+/*
+ * create and get the realm rooted at @ino and bump its ref count.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static struct ceph_snap_realm *ceph_create_snap_realm(
+ struct ceph_mds_client *mdsc,
+ u64 ino)
+{
+ struct ceph_snap_realm *realm;
+
+ realm = kzalloc(sizeof(*realm), GFP_NOFS);
+ if (!realm)
+ return ERR_PTR(-ENOMEM);
+
+ atomic_set(&realm->nref, 0); /* tree does not take a ref */
+ realm->ino = ino;
+ INIT_LIST_HEAD(&realm->children);
+ INIT_LIST_HEAD(&realm->child_item);
+ INIT_LIST_HEAD(&realm->empty_item);
+ INIT_LIST_HEAD(&realm->inodes_with_caps);
+ spin_lock_init(&realm->inodes_with_caps_lock);
+ __insert_snap_realm(&mdsc->snap_realms, realm);
+ dout("create_snap_realm %llx %p\n", realm->ino, realm);
+ return realm;
+}
+
+/*
+ * lookup the realm rooted at @ino.
+ *
+ * caller must hold snap_rwsem for write.
+ */

+struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
+ u64 ino)

+{
+ struct rb_node *n = mdsc->snap_realms.rb_node;
+ struct ceph_snap_realm *r;
+
+ while (n) {
+ r = rb_entry(n, struct ceph_snap_realm, node);
+ if (ino < r->ino)

+ n = n->rb_left;

+ else if (ino > r->ino)

+ n = n->rb_right;
+ else {

+ dout("lookup_snap_realm %llx %p\n", r->ino, r);

+ return r;
+ }
+ }

+ return NULL;
+}
+
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm);
+
+/*
+ * called with snap_rwsem (write)
+ */
+static void __destroy_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__destroy_snap_realm %p %llx\n", realm, realm->ino);
+
+ rb_erase(&realm->node, &mdsc->snap_realms);
+
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ __put_snap_realm(mdsc, realm->parent);
+ }
+
+ kfree(realm->prior_parent_snaps);
+ kfree(realm->snaps);
+ ceph_put_snap_context(realm->cached_context);
+ kfree(realm);
+}
+
+/*
+ * caller holds snap_rwsem (write)
+ */
+static void __put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (atomic_dec_and_test(&realm->nref))
+ __destroy_snap_realm(mdsc, realm);
+}
+
+/*
+ * caller needn't hold any locks
+ */
+void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm)
+{
+ dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm,
+ atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
+ if (!atomic_dec_and_test(&realm->nref))
+ return;
+
+ if (down_write_trylock(&mdsc->snap_rwsem)) {
+ __destroy_snap_realm(mdsc, realm);
+ up_write(&mdsc->snap_rwsem);
+ } else {
+ spin_lock(&mdsc->snap_empty_lock);
+ list_add(&mdsc->snap_empty, &realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ }
+}
+
+/*
+ * Clean up any realms whose ref counts have dropped to zero. Note
+ * that this does not include realms who were created but not yet
+ * used.
+ *
+ * Called under snap_rwsem (write)
+ */
+static void __cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ struct ceph_snap_realm *realm;
+
+ spin_lock(&mdsc->snap_empty_lock);
+ while (!list_empty(&mdsc->snap_empty)) {
+ realm = list_first_entry(&mdsc->snap_empty,
+ struct ceph_snap_realm, empty_item);
+ list_del(&realm->empty_item);
+ spin_unlock(&mdsc->snap_empty_lock);
+ __destroy_snap_realm(mdsc, realm);
+ spin_lock(&mdsc->snap_empty_lock);
+ }
+ spin_unlock(&mdsc->snap_empty_lock);
+}
+
+void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc)
+{
+ down_write(&mdsc->snap_rwsem);
+ __cleanup_empty_realms(mdsc);
+ up_write(&mdsc->snap_rwsem);
+}
+
+/*
+ * adjust the parent realm of a given @realm. adjust child list, and parent
+ * pointers, and ref counts appropriately.
+ *
+ * return true if parent was changed, 0 if unchanged, <0 on error.
+ *
+ * caller must hold snap_rwsem for write.
+ */
+static int adjust_snap_realm_parent(struct ceph_mds_client *mdsc,
+ struct ceph_snap_realm *realm,
+ u64 parentino)

+{
+ struct ceph_snap_realm *parent;
+

+ if (realm->parent_ino == parentino)
+ return 0;
+
+ parent = ceph_lookup_snap_realm(mdsc, parentino);
+ if (!parent) {
+ parent = ceph_create_snap_realm(mdsc, parentino);
+ if (IS_ERR(parent))
+ return PTR_ERR(parent);
+ }
+ dout("adjust_snap_realm_parent %llx %p: %llx %p -> %llx %p\n",
+ realm->ino, realm, realm->parent_ino, realm->parent,
+ parentino, parent);
+ if (realm->parent) {
+ list_del_init(&realm->child_item);
+ ceph_put_snap_realm(mdsc, realm->parent);
+ }
+ realm->parent_ino = parentino;
+ realm->parent = parent;
+ ceph_get_snap_realm(mdsc, parent);
+ list_add(&realm->child_item, &parent->children);
+ return 1;
+}
+
+
+static int cmpu64_rev(const void *a, const void *b)
+{
+ if (*(u64 *)a < *(u64 *)b)
+ return 1;
+ if (*(u64 *)a > *(u64 *)b)
+ return -1;

+ return 0;
+}
+

+/*
+ * build the snap context for a given realm.
+ */
+static int build_snap_context(struct ceph_snap_realm *realm)
+{
+ struct ceph_snap_realm *parent = realm->parent;
+ struct ceph_snap_context *snapc;

+ int err = 0;

+ int i;
+ int num = realm->num_prior_parent_snaps + realm->num_snaps;
+
+ /*
+ * build parent context, if it hasn't been built.
+ * conservatively estimate that all parent snaps might be
+ * included by us.
+ */
+ if (parent) {
+ if (!parent->cached_context) {
+ err = build_snap_context(parent);
+ if (err)
+ goto fail;
+ }
+ num += parent->cached_context->num_snaps;
+ }
+
+ /* do i actually need to update? not if my context seq
+ matches realm seq, and my parents' does to. (this works
+ because we rebuild_snap_realms() works _downward_ in
+ hierarchy after each update.) */
+ if (realm->cached_context &&
+ realm->cached_context->seq <= realm->seq &&
+ (!parent ||
+ realm->cached_context->seq <= parent->cached_context->seq)) {
+ dout("build_snap_context %llx %p: %p seq %lld (%d snaps)"
+ " (unchanged)\n",
+ realm->ino, realm, realm->cached_context,
+ realm->cached_context->seq,
+ realm->cached_context->num_snaps);

+ return 0;
+ }
+

+ /* alloc new snap context */
+ err = -ENOMEM;
+ if (num > ULONG_MAX / sizeof(u64) - sizeof(*snapc))
+ goto fail;
+ snapc = kzalloc(sizeof(*snapc) + num*sizeof(u64), GFP_NOFS);
+ if (!snapc)
+ goto fail;
+ atomic_set(&snapc->nref, 1);
+
+ /* build (reverse sorted) snap vector */
+ num = 0;
+ snapc->seq = realm->seq;
+ if (parent) {
+ /* include any of parent's snaps occuring _after_ my
+ parent became my parent */
+ for (i = 0; i < parent->cached_context->num_snaps; i++)
+ if (parent->cached_context->snaps[i] >=
+ realm->parent_since)
+ snapc->snaps[num++] =
+ parent->cached_context->snaps[i];
+ if (parent->cached_context->seq > snapc->seq)
+ snapc->seq = parent->cached_context->seq;
+ }
+ memcpy(snapc->snaps + num, realm->snaps,
+ sizeof(u64)*realm->num_snaps);
+ num += realm->num_snaps;
+ memcpy(snapc->snaps + num, realm->prior_parent_snaps,
+ sizeof(u64)*realm->num_prior_parent_snaps);
+ num += realm->num_prior_parent_snaps;
+
+ sort(snapc->snaps, num, sizeof(u64), cmpu64_rev, NULL);
+ snapc->num_snaps = num;
+ dout("build_snap_context %llx %p: %p seq %lld (%d snaps)\n",
+ realm->ino, realm, snapc, snapc->seq, snapc->num_snaps);
+
+ if (realm->cached_context)
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = snapc;
+ return 0;
+
+fail:
+ /*
+ * if we fail, clear old (incorrect) cached_context... hopefully
+ * we'll have better luck building it later
+ */
+ if (realm->cached_context) {
+ ceph_put_snap_context(realm->cached_context);
+ realm->cached_context = NULL;
+ }
+ pr_err("build_snap_context %llx %p fail %d\n", realm->ino,
+ realm, err);
+ return err;
+}
+
+/*
+ * rebuild snap context for the given realm and all of its children.
+ */
+static void rebuild_snap_realms(struct ceph_snap_realm *realm)
+{
+ struct ceph_snap_realm *child;
+
+ dout("rebuild_snap_realms %llx %p\n", realm->ino, realm);
+ build_snap_context(realm);
+
+ list_for_each_entry(child, &realm->children, child_item)
+ rebuild_snap_realms(child);
+}
+
+
+/*
+ * helper to allocate and decode an array of snapids. free prior
+ * instance, if any.
+ */
+static int dup_array(u64 **dst, __le64 *src, int num)

+{
+ int i;
+

+ kfree(*dst);
+ if (num) {
+ *dst = kcalloc(num, sizeof(u64), GFP_NOFS);
+ if (!*dst)
+ return -ENOMEM;
+ for (i = 0; i < num; i++)
+ (*dst)[i] = get_unaligned_le64(src + i);
+ } else {
+ *dst = NULL;

+ }
+ return 0;
+}
+
+

+/*
+ * When a snapshot is applied, the size/mtime inode metadata is queued
+ * in a ceph_cap_snap (one for each snapshot) until writeback
+ * completes and the metadata can be flushed back to the MDS.
+ *
+ * However, if a (sync) write is currently in-progress when we apply
+ * the snapshot, we have to wait until the write succeeds or fails
+ * (and a final size/mtime is known). In this case the
+ * cap_snap->writing = 1, and is said to be "pending." When the write
+ * finishes, we __ceph_finish_cap_snap().
+ *
+ * Caller must hold snap_rwsem for read (i.e., the realm topology won't
+ * change).
+ */
+void ceph_queue_cap_snap(struct ceph_inode_info *ci,

+ struct ceph_snap_context *snapc)
+{
+ struct inode *inode = &ci->vfs_inode;

+ struct ceph_cap_snap *capsnap;
+ int used;
+
+ capsnap = kzalloc(sizeof(*capsnap), GFP_NOFS);
+ if (!capsnap) {
+ pr_err("ENOMEM allocating ceph_cap_snap on %p\n", inode);
+ return;
+ }
+
+ spin_lock(&inode->i_lock);
+ used = __ceph_caps_used(ci);
+ if (__ceph_have_pending_cap_snap(ci)) {
+ /* there is no point in queuing multiple "pending" cap_snaps,
+ as no new writes are allowed to start when pending, so any
+ writes in progress now were started before the previous
+ cap_snap. lucky us. */
+ dout("queue_cap_snap %p snapc %p seq %llu used %d"
+ " already pending\n", inode, snapc, snapc->seq, used);
+ kfree(capsnap);
+ } else if (ci->i_wrbuffer_ref_head || (used & CEPH_CAP_FILE_WR)) {
+ igrab(inode);
+
+ atomic_set(&capsnap->nref, 1);
+ capsnap->ci = ci;
+ INIT_LIST_HEAD(&capsnap->ci_item);
+ INIT_LIST_HEAD(&capsnap->flushing_item);
+
+ capsnap->follows = snapc->seq - 1;
+ capsnap->context = ceph_get_snap_context(snapc);
+ capsnap->issued = __ceph_caps_issued(ci, NULL);
+ capsnap->dirty = __ceph_caps_dirty(ci);
+
+ capsnap->mode = inode->i_mode;
+ capsnap->uid = inode->i_uid;
+ capsnap->gid = inode->i_gid;
+
+ /* fixme? */
+ capsnap->xattr_blob = NULL;
+ capsnap->xattr_len = 0;
+
+ /* dirty page count moved from _head to this cap_snap;
+ all subsequent writes page dirties occur _after_ this
+ snapshot. */
+ capsnap->dirty_pages = ci->i_wrbuffer_ref_head;
+ ci->i_wrbuffer_ref_head = 0;

+ ceph_put_snap_context(ci->i_head_snapc);
+ ci->i_head_snapc = NULL;

+ list_add_tail(&capsnap->ci_item, &ci->i_cap_snaps);
+
+ if (used & CEPH_CAP_FILE_WR) {
+ dout("queue_cap_snap %p cap_snap %p snapc %p"
+ " seq %llu used WR, now pending\n", inode,
+ capsnap, snapc, snapc->seq);
+ capsnap->writing = 1;
+ } else {
+ /* note mtime, size NOW. */
+ __ceph_finish_cap_snap(ci, capsnap);
+ }
+ } else {
+ dout("queue_cap_snap %p nothing dirty|writing\n", inode);
+ kfree(capsnap);

+ }
+
+ spin_unlock(&inode->i_lock);

+}
+
+/*
+ * Finalize the size, mtime for a cap_snap.. that is, settle on final values
+ * to be used for the snapshot, to be flushed back to the mds.
+ *
+ * If capsnap can now be flushed, add to snap_flush list, and return 1.

+ *
+ * Caller must hold i_lock.
+ */

+int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
+ struct ceph_cap_snap *capsnap)

+{
+ struct inode *inode = &ci->vfs_inode;

+ struct ceph_mds_client *mdsc = &ceph_client(inode->i_sb)->mdsc;
+

+ BUG_ON(capsnap->writing);
+ capsnap->size = inode->i_size;
+ capsnap->mtime = inode->i_mtime;
+ capsnap->atime = inode->i_atime;
+ capsnap->ctime = inode->i_ctime;
+ capsnap->time_warp_seq = ci->i_time_warp_seq;
+ if (capsnap->dirty_pages) {
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu "
+ "still has %d dirty pages\n", inode, capsnap,
+ capsnap->context, capsnap->context->seq,
+ capsnap->size, capsnap->dirty_pages);
+ return 0;
+ }
+ dout("finish_cap_snap %p cap_snap %p snapc %p %llu s=%llu clean\n",
+ inode, capsnap, capsnap->context,
+ capsnap->context->seq, capsnap->size);
+
+ spin_lock(&mdsc->snap_flush_lock);
+ list_add_tail(&ci->i_snap_flush_item, &mdsc->snap_flush_list);
+ spin_unlock(&mdsc->snap_flush_lock);
+ return 1; /* caller may want to ceph_flush_snaps */
+}
+
+
+/*
+ * Parse and apply a snapblob "snap trace" from the MDS. This specifies
+ * the snap realm parameters from a given realm and all of its ancestors,
+ * up to the root.
+ *
+ * Caller must hold snap_rwsem for write.
+ */
+int ceph_update_snap_trace(struct ceph_mds_client *mdsc,

+ void *p, void *e, bool deletion)

+{
+ struct ceph_mds_snap_realm *ri; /* encoded */
+ __le64 *snaps; /* encoded */
+ __le64 *prior_parent_snaps; /* encoded */
+ struct ceph_snap_realm *realm;
+ int invalidate = 0;
+ int err = -ENOMEM;
+
+ dout("update_snap_trace deletion=%d\n", deletion);
+more:
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ ri = p;
+ p += sizeof(*ri);
+ ceph_decode_need(&p, e, sizeof(u64)*(le32_to_cpu(ri->num_snaps) +
+ le32_to_cpu(ri->num_prior_parent_snaps)), bad);
+ snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_snaps);
+ prior_parent_snaps = p;
+ p += sizeof(u64) * le32_to_cpu(ri->num_prior_parent_snaps);
+
+ realm = ceph_lookup_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, le64_to_cpu(ri->ino));
+ if (IS_ERR(realm)) {
+ err = PTR_ERR(realm);
+ goto fail;
+ }
+ }
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ dout("update_snap_trace updating %llx %p %lld -> %lld\n",
+ realm->ino, realm, realm->seq, le64_to_cpu(ri->seq));
+ /*
+ * if the realm seq has changed, queue a cap_snap for every
+ * inode with open caps. we do this _before_ we update
+ * the realm info so that we prepare for writeback under the
+ * _previous_ snap context.
+ *
+ * ...unless it's a snap deletion!
+ */
+ if (!deletion) {
+ struct ceph_inode_info *ci;
+ struct inode *lastinode = NULL;
+
+ spin_lock(&realm->inodes_with_caps_lock);
+ list_for_each_entry(ci, &realm->inodes_with_caps,
+ i_snap_realm_item) {
+ struct inode *inode = igrab(&ci->vfs_inode);
+ if (!inode)
+ continue;
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ lastinode = inode;
+ ceph_queue_cap_snap(ci, realm->cached_context);
+ spin_lock(&realm->inodes_with_caps_lock);
+ }
+ spin_unlock(&realm->inodes_with_caps_lock);
+ if (lastinode)
+ iput(lastinode);
+ dout("update_snap_trace cap_snaps queued\n");
+ }
+
+ } else {
+ dout("update_snap_trace %llx %p seq %lld unchanged\n",
+ realm->ino, realm, realm->seq);
+ }
+
+ /* ensure the parent is correct */
+ err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
+ if (err < 0)
+ goto fail;
+ invalidate += err;
+
+ if (le64_to_cpu(ri->seq) > realm->seq) {
+ /* update realm parameters, snap lists */
+ realm->seq = le64_to_cpu(ri->seq);
+ realm->created = le64_to_cpu(ri->created);
+ realm->parent_since = le64_to_cpu(ri->parent_since);
+
+ realm->num_snaps = le32_to_cpu(ri->num_snaps);
+ err = dup_array(&realm->snaps, snaps, realm->num_snaps);
+ if (err < 0)
+ goto fail;
+
+ realm->num_prior_parent_snaps =
+ le32_to_cpu(ri->num_prior_parent_snaps);
+ err = dup_array(&realm->prior_parent_snaps, prior_parent_snaps,
+ realm->num_prior_parent_snaps);
+ if (err < 0)
+ goto fail;
+
+ invalidate = 1;
+ } else if (!realm->cached_context) {
+ invalidate = 1;
+ }
+
+ dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
+ realm, invalidate, p, e);
+
+ if (p < e)
+ goto more;
+
+ /* invalidate when we reach the _end_ (root) of the trace */
+ if (invalidate)
+ rebuild_snap_realms(realm);
+
+ __cleanup_empty_realms(mdsc);

+ return 0;
+
+bad:

+ err = -EINVAL;
+fail:
+ pr_err("update_snap_trace error %d\n", err);
+ return err;
+}
+
+
+/*
+ * Send any cap_snaps that are queued for flush. Try to carry
+ * s_mutex across multiple snap flushes to avoid locking overhead.
+ *
+ * Caller holds no locks.
+ */
+static void flush_snaps(struct ceph_mds_client *mdsc)

+{
+ struct ceph_inode_info *ci;

+ struct inode *inode;
+ struct ceph_mds_session *session = NULL;
+
+ dout("flush_snaps\n");
+ spin_lock(&mdsc->snap_flush_lock);
+ while (!list_empty(&mdsc->snap_flush_list)) {
+ ci = list_first_entry(&mdsc->snap_flush_list,
+ struct ceph_inode_info, i_snap_flush_item);
+ inode = &ci->vfs_inode;
+ igrab(inode);
+ spin_unlock(&mdsc->snap_flush_lock);
+ spin_lock(&inode->i_lock);
+ __ceph_flush_snaps(ci, &session);
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ spin_lock(&mdsc->snap_flush_lock);
+ }
+ spin_unlock(&mdsc->snap_flush_lock);
+
+ if (session) {

+ mutex_unlock(&session->s_mutex);
+ ceph_put_mds_session(session);
+ }

+ dout("flush_snaps done\n");
+}
+
+
+/*
+ * Handle a snap notification from the MDS.
+ *
+ * This can take two basic forms: the simplest is just a snap creation
+ * or deletion notification on an existing realm. This should update the
+ * realm and its children.
+ *
+ * The more difficult case is realm creation, due to snap creation at a
+ * new point in the file hierarchy, or due to a rename that moves a file or
+ * directory into another realm.
+ */
+void ceph_handle_snap(struct ceph_mds_client *mdsc,
+ struct ceph_mds_session *session,
+ struct ceph_msg *msg)
+{

+ struct super_block *sb = mdsc->client->sb;

+ int mds = session->s_mds;

+ u64 split;
+ int op;
+ int trace_len;
+ struct ceph_snap_realm *realm = NULL;
+ void *p = msg->front.iov_base;
+ void *e = p + msg->front.iov_len;
+ struct ceph_mds_snap_head *h;
+ int num_split_inos, num_split_realms;
+ __le64 *split_inos = NULL, *split_realms = NULL;
+ int i;
+ int locked_rwsem = 0;
+
+ /* decode */

+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;

+ h = p;

+ op = le32_to_cpu(h->op);

+ split = le64_to_cpu(h->split); /* non-zero if we are splitting an
+ * existing realm */
+ num_split_inos = le32_to_cpu(h->num_split_inos);
+ num_split_realms = le32_to_cpu(h->num_split_realms);
+ trace_len = le32_to_cpu(h->trace_len);
+ p += sizeof(*h);
+
+ dout("handle_snap from mds%d op %s split %llx tracelen %d\n", mds,
+ ceph_snap_op_name(op), split, trace_len);

+
+ mutex_lock(&session->s_mutex);
+ session->s_seq++;

+ mutex_unlock(&session->s_mutex);
+
+ down_write(&mdsc->snap_rwsem);
+ locked_rwsem = 1;
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ struct ceph_mds_snap_realm *ri;
+
+ /*
+ * A "split" breaks part of an existing realm off into
+ * a new realm. The MDS provides a list of inodes
+ * (with caps) and child realms that belong to the new
+ * child.
+ */
+ split_inos = p;
+ p += sizeof(u64) * num_split_inos;
+ split_realms = p;
+ p += sizeof(u64) * num_split_realms;
+ ceph_decode_need(&p, e, sizeof(*ri), bad);
+ /* we will peek at realm info here, but will _not_
+ * advance p, as the realm update will occur below in
+ * ceph_update_snap_trace. */
+ ri = p;
+
+ realm = ceph_lookup_snap_realm(mdsc, split);
+ if (!realm) {
+ realm = ceph_create_snap_realm(mdsc, split);
+ if (IS_ERR(realm))
+ goto out;
+ }
+ ceph_get_snap_realm(mdsc, realm);
+
+ dout("splitting snap_realm %llx %p\n", realm->ino, realm);
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);

+ struct ceph_inode_info *ci;
+

+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+
+ spin_lock(&inode->i_lock);
+ if (!ci->i_snap_realm)
+ goto skip_inode;
+ /*
+ * If this inode belongs to a realm that was
+ * created after our new realm, we experienced
+ * a race (due to another split notifications
+ * arriving from a different MDS). So skip
+ * this inode.
+ */
+ if (ci->i_snap_realm->created >
+ le64_to_cpu(ri->created)) {
+ dout(" leaving %p in newer realm %llx %p\n",
+ inode, ci->i_snap_realm->ino,
+ ci->i_snap_realm);
+ goto skip_inode;
+ }
+ dout(" will move %p to split realm %llx %p\n",
+ inode, realm->ino, realm);
+ /*
+ * Remove the inode from the realm's inode
+ * list, but don't add it to the new realm
+ * yet. We don't want the cap_snap to be
+ * queued (again) by ceph_update_snap_trace()
+ * below. Queue it _now_, under the old context.
+ */
+ list_del_init(&ci->i_snap_realm_item);
+ spin_unlock(&inode->i_lock);
+
+ ceph_queue_cap_snap(ci,
+ ci->i_snap_realm->cached_context);
+
+ iput(inode);
+ continue;
+
+skip_inode:
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ }
+
+ /* we may have taken some of the old realm's children. */
+ for (i = 0; i < num_split_realms; i++) {
+ struct ceph_snap_realm *child =
+ ceph_lookup_snap_realm(mdsc,
+ le64_to_cpu(split_realms[i]));
+ if (!child)
+ continue;
+ adjust_snap_realm_parent(mdsc, child, realm->ino);

+ }
+ }
+
+ /*

+ * update using the provided snap trace. if we are deleting a
+ * snap, we can avoid queueing cap_snaps.
+ */
+ ceph_update_snap_trace(mdsc, p, e,
+ op == CEPH_SNAP_OP_DESTROY);
+
+ if (op == CEPH_SNAP_OP_SPLIT) {
+ /*
+ * ok, _now_ add the inodes into the new realm.
+ */
+ for (i = 0; i < num_split_inos; i++) {
+ struct ceph_vino vino = {
+ .ino = le64_to_cpu(split_inos[i]),
+ .snap = CEPH_NOSNAP,
+ };
+ struct inode *inode = ceph_find_inode(sb, vino);

+ struct ceph_inode_info *ci;
+

+ if (!inode)
+ continue;
+ ci = ceph_inode(inode);
+ spin_lock(&inode->i_lock);
+ if (!ci->i_snap_realm)
+ goto split_skip_inode;
+ ceph_put_snap_realm(mdsc, ci->i_snap_realm);
+ spin_lock(&realm->inodes_with_caps_lock);

+ list_add(&ci->i_snap_realm_item,
+ &realm->inodes_with_caps);
+ ci->i_snap_realm = realm;

+ spin_unlock(&realm->inodes_with_caps_lock);
+ ceph_get_snap_realm(mdsc, realm);
+split_skip_inode:
+ spin_unlock(&inode->i_lock);
+ iput(inode);
+ }
+
+ /* we took a reference when we created the realm, above */

+ ceph_put_snap_realm(mdsc, realm);
+ }
+

+ __cleanup_empty_realms(mdsc);
+
+ up_write(&mdsc->snap_rwsem);
+
+ flush_snaps(mdsc);
+ return;
+
+bad:
+ pr_err("corrupt snap message from mds%d\n", mds);
+ ceph_msg_dump(msg);
+out:
+ if (locked_rwsem)
+ up_write(&mdsc->snap_rwsem);

+ return;
+}
+
+

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

struct ceph_pagelist is a simple linked list of pages for encoding
data into. Whole pages are allocated and added to the list as needed.
This is used for encoding large outgoing messages when the final
buffer size is known know ahead of time and we do not want to worry
about something like valloc failing due to fragmentation.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/pagelist.c | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/pagelist.h | 54 ++++++++++++++++++++++++++++++++++++++++++++++++++++
2 files changed, 108 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/pagelist.c
create mode 100644 fs/ceph/pagelist.h

diff --git a/fs/ceph/pagelist.c b/fs/ceph/pagelist.c
new file mode 100644
index 0000000..370e936
--- /dev/null
+++ b/fs/ceph/pagelist.c
@@ -0,0 +1,54 @@
+
+#include <linux/pagemap.h>
+#include <linux/highmem.h>
+
+#include "pagelist.h"
+
+int ceph_pagelist_release(struct ceph_pagelist *pl)
+{
+ if (pl->mapped_tail)
+ kunmap(pl->mapped_tail);
+ while (!list_empty(&pl->head)) {
+ struct page *page = list_first_entry(&pl->head, struct page,
+ lru);
+ list_del(&page->lru);
+ __free_page(page);

+ }
+ return 0;
+}
+

+static int ceph_pagelist_addpage(struct ceph_pagelist *pl)
+{
+ struct page *page = alloc_page(GFP_NOFS);
+ if (!page)
+ return -ENOMEM;
+ pl->room += PAGE_SIZE;
+ list_add_tail(&page->lru, &pl->head);
+ if (pl->mapped_tail)
+ kunmap(pl->mapped_tail);
+ pl->mapped_tail = kmap(page);

+ return 0;
+}
+

+int ceph_pagelist_append(struct ceph_pagelist *pl, void *buf, size_t len)
+{
+ while (pl->room < len) {
+ size_t bit = pl->room;
+ int ret;
+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK),
+ buf, bit);
+ pl->length += bit;
+ pl->room -= bit;
+ buf += bit;
+ len -= bit;
+ ret = ceph_pagelist_addpage(pl);

+ if (ret)
+ return ret;
+ }

+
+ memcpy(pl->mapped_tail + (pl->length & ~PAGE_CACHE_MASK), buf, len);
+ pl->length += len;
+ pl->room -= len;
+ return 0;
+}
diff --git a/fs/ceph/pagelist.h b/fs/ceph/pagelist.h
new file mode 100644
index 0000000..e8a4187
--- /dev/null
+++ b/fs/ceph/pagelist.h
@@ -0,0 +1,54 @@
+#ifndef __FS_CEPH_PAGELIST_H
+#define __FS_CEPH_PAGELIST_H
+
+#include <linux/list.h>
+
+struct ceph_pagelist {
+ struct list_head head;
+ void *mapped_tail;
+ size_t length;
+ size_t room;
+};
+
+static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
+{
+ INIT_LIST_HEAD(&pl->head);
+ pl->mapped_tail = NULL;
+ pl->length = 0;
+ pl->room = 0;
+}
+extern int ceph_pagelist_release(struct ceph_pagelist *pl);
+
+extern int ceph_pagelist_append(struct ceph_pagelist *pl, void *d, size_t l);
+
+static inline int ceph_pagelist_encode_64(struct ceph_pagelist *pl, u64 v)
+{
+ __le64 ev = cpu_to_le64(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_32(struct ceph_pagelist *pl, u32 v)
+{
+ __le32 ev = cpu_to_le32(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_16(struct ceph_pagelist *pl, u16 v)
+{
+ __le16 ev = cpu_to_le16(v);
+ return ceph_pagelist_append(pl, &ev, sizeof(ev));
+}
+static inline int ceph_pagelist_encode_8(struct ceph_pagelist *pl, u8 v)
+{
+ return ceph_pagelist_append(pl, &v, 1);
+}
+static inline int ceph_pagelist_encode_string(struct ceph_pagelist *pl,
+ char *s, size_t len)
+{
+ int ret = ceph_pagelist_encode_32(pl, len);

+ if (ret)
+ return ret;

+ if (len)
+ return ceph_pagelist_append(pl, s, len);

+ return 0;
+}
+

+#endif

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

CRUSH is a pseudorandom data distribution function designed to map
inputs onto a dynamic hierarchy of devices, while minimizing the
extent to which inputs are remapped when the devices are added or
removed. It includes some features that are specifically useful for
storage, most notably the ability to map each input onto a set of N
devices that are separated across administrator-defined failure
domains. CRUSH is used to distribute data across the cluster of Ceph
storage nodes.

More information about CRUSH can be found in this paper:

http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/crush/crush.c | 151 ++++++++++++
fs/ceph/crush/crush.h | 180 +++++++++++++++
fs/ceph/crush/hash.c | 149 ++++++++++++
fs/ceph/crush/hash.h | 17 ++
fs/ceph/crush/mapper.c | 596 ++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/crush/mapper.h | 20 ++
6 files changed, 1113 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/crush/crush.c
create mode 100644 fs/ceph/crush/crush.h
create mode 100644 fs/ceph/crush/hash.c
create mode 100644 fs/ceph/crush/hash.h
create mode 100644 fs/ceph/crush/mapper.c
create mode 100644 fs/ceph/crush/mapper.h

diff --git a/fs/ceph/crush/crush.c b/fs/ceph/crush/crush.c
new file mode 100644
index 0000000..fabd302
--- /dev/null
+++ b/fs/ceph/crush/crush.c
@@ -0,0 +1,151 @@
+
+#ifdef __KERNEL__
+# include <linux/slab.h>
+#else
+# include <stdlib.h>
+# include <assert.h>
+# define kfree(x) do { if (x) free(x); } while (0)
+# define BUG_ON(x) assert(!(x))
+#endif
+
+#include "crush.h"
+
+const char *crush_bucket_alg_name(int alg)
+{
+ switch (alg) {
+ case CRUSH_BUCKET_UNIFORM: return "uniform";
+ case CRUSH_BUCKET_LIST: return "list";
+ case CRUSH_BUCKET_TREE: return "tree";
+ case CRUSH_BUCKET_STRAW: return "straw";

+ default: return "unknown";
+ }
+}
+

+/**
+ * crush_get_bucket_item_weight - Get weight of an item in given bucket
+ * @b: bucket pointer
+ * @p: item index in bucket
+ */
+int crush_get_bucket_item_weight(struct crush_bucket *b, int p)
+{
+ if (p >= b->size)
+ return 0;
+

+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:

+ return ((struct crush_bucket_uniform *)b)->item_weight;
+ case CRUSH_BUCKET_LIST:
+ return ((struct crush_bucket_list *)b)->item_weights[p];
+ case CRUSH_BUCKET_TREE:
+ if (p & 1)
+ return ((struct crush_bucket_tree *)b)->node_weights[p];
+ return 0;
+ case CRUSH_BUCKET_STRAW:
+ return ((struct crush_bucket_straw *)b)->item_weights[p];

+ }
+ return 0;
+}
+

+/**
+ * crush_calc_parents - Calculate parent vectors for the given crush map.
+ * @map: crush_map pointer
+ */
+void crush_calc_parents(struct crush_map *map)
+{
+ int i, b, c;
+
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ for (i = 0; i < map->buckets[b]->size; i++) {
+ c = map->buckets[b]->items[i];
+ BUG_ON(c >= map->max_devices ||
+ c < -map->max_buckets);
+ if (c >= 0)
+ map->device_parents[c] = map->buckets[b]->id;
+ else
+ map->bucket_parents[-1-c] = map->buckets[b]->id;
+ }
+ }
+}
+
+void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b)
+{
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_list(struct crush_bucket_list *b)
+{
+ kfree(b->item_weights);
+ kfree(b->sum_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket_tree(struct crush_bucket_tree *b)
+{
+ kfree(b->node_weights);
+ kfree(b);
+}
+
+void crush_destroy_bucket_straw(struct crush_bucket_straw *b)
+{
+ kfree(b->straws);
+ kfree(b->item_weights);
+ kfree(b->h.perm);
+ kfree(b->h.items);
+ kfree(b);
+}
+
+void crush_destroy_bucket(struct crush_bucket *b)

+{
+ switch (b->alg) {
+ case CRUSH_BUCKET_UNIFORM:

+ crush_destroy_bucket_uniform((struct crush_bucket_uniform *)b);

+ break;
+ case CRUSH_BUCKET_LIST:

+ crush_destroy_bucket_list((struct crush_bucket_list *)b);

+ break;
+ case CRUSH_BUCKET_TREE:

+ crush_destroy_bucket_tree((struct crush_bucket_tree *)b);

+ break;
+ case CRUSH_BUCKET_STRAW:

+ crush_destroy_bucket_straw((struct crush_bucket_straw *)b);

+ break;
+ }
+}
+

+/**
+ * crush_destroy - Destroy a crush_map
+ * @map: crush_map pointer
+ */
+void crush_destroy(struct crush_map *map)
+{
+ int b;

+
+ /* buckets */

+ if (map->buckets) {
+ for (b = 0; b < map->max_buckets; b++) {
+ if (map->buckets[b] == NULL)
+ continue;
+ crush_destroy_bucket(map->buckets[b]);
+ }
+ kfree(map->buckets);

+ }
+
+ /* rules */

+ if (map->rules) {
+ for (b = 0; b < map->max_rules; b++)
+ kfree(map->rules[b]);
+ kfree(map->rules);
+ }
+
+ kfree(map->bucket_parents);
+ kfree(map->device_parents);
+ kfree(map);
+}
+
+
diff --git a/fs/ceph/crush/crush.h b/fs/ceph/crush/crush.h
new file mode 100644
index 0000000..dcd7e75
--- /dev/null
+++ b/fs/ceph/crush/crush.h
@@ -0,0 +1,180 @@
+#ifndef _CRUSH_CRUSH_H
+#define _CRUSH_CRUSH_H
+
+#include <linux/types.h>
+
+/*
+ * CRUSH is a pseudo-random data distribution algorithm that
+ * efficiently distributes input values (typically, data objects)
+ * across a heterogeneous, structured storage cluster.
+ *
+ * The algorithm was originally described in detail in this paper
+ * (although the algorithm has evolved somewhat since then):
+ *
+ * http://www.ssrc.ucsc.edu/Papers/weil-sc06.pdf

+ *
+ * LGPL2
+ */
+
+

+#define CRUSH_MAGIC 0x00010000ul /* for detecting algorithm revisions */
+
+
+#define CRUSH_MAX_DEPTH 10 /* max crush hierarchy depth */
+#define CRUSH_MAX_SET 10 /* max size of a mapping result */
+
+
+/*
+ * CRUSH uses user-defined "rules" to describe how inputs should be
+ * mapped to devices. A rule consists of sequence of steps to perform
+ * to generate the set of output devices.
+ */
+struct crush_rule_step {
+ __u32 op;
+ __s32 arg1;
+ __s32 arg2;
+};
+
+/* step op codes */
+enum {
+ CRUSH_RULE_NOOP = 0,
+ CRUSH_RULE_TAKE = 1, /* arg1 = value to start with */
+ CRUSH_RULE_CHOOSE_FIRSTN = 2, /* arg1 = num items to pick */
+ /* arg2 = type */
+ CRUSH_RULE_CHOOSE_INDEP = 3, /* same */
+ CRUSH_RULE_EMIT = 4, /* no args */
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN = 6,
+ CRUSH_RULE_CHOOSE_LEAF_INDEP = 7,
+};
+
+/*
+ * for specifying choose num (arg1) relative to the max parameter
+ * passed to do_rule
+ */
+#define CRUSH_CHOOSE_N 0
+#define CRUSH_CHOOSE_N_MINUS(x) (-(x))
+
+/*
+ * The rule mask is used to describe what the rule is intended for.
+ * Given a ruleset and size of output set, we search through the
+ * rule list for a matching rule_mask.
+ */
+struct crush_rule_mask {
+ __u8 ruleset;
+ __u8 type;
+ __u8 min_size;
+ __u8 max_size;
+};
+
+struct crush_rule {
+ __u32 len;
+ struct crush_rule_mask mask;
+ struct crush_rule_step steps[0];
+};
+
+#define crush_rule_size(len) (sizeof(struct crush_rule) + \
+ (len)*sizeof(struct crush_rule_step))
+
+
+
+/*
+ * A bucket is a named container of other items (either devices or
+ * other buckets). Items within a bucket are chosen using one of a
+ * few different algorithms. The table summarizes how the speed of
+ * each option measures up against mapping stability when items are
+ * added or removed.
+ *
+ * Bucket Alg Speed Additions Removals
+ * ------------------------------------------------
+ * uniform O(1) poor poor
+ * list O(n) optimal poor
+ * tree O(log n) good good
+ * straw O(n) optimal optimal
+ */
+enum {
+ CRUSH_BUCKET_UNIFORM = 1,
+ CRUSH_BUCKET_LIST = 2,
+ CRUSH_BUCKET_TREE = 3,
+ CRUSH_BUCKET_STRAW = 4
+};
+extern const char *crush_bucket_alg_name(int alg);
+
+struct crush_bucket {
+ __s32 id; /* this'll be negative */
+ __u16 type; /* non-zero; type=0 is reserved for devices */
+ __u8 alg; /* one of CRUSH_BUCKET_* */
+ __u8 hash; /* which hash function to use, CRUSH_HASH_* */
+ __u32 weight; /* 16-bit fixed point */
+ __u32 size; /* num items */
+ __s32 *items;
+
+ /*
+ * cached random permutation: used for uniform bucket and for
+ * the linear search fallback for the other bucket types.
+ */
+ __u32 perm_x; /* @x for which *perm is defined */
+ __u32 perm_n; /* num elements of *perm that are permuted/defined */
+ __u32 *perm;
+};
+
+struct crush_bucket_uniform {
+ struct crush_bucket h;
+ __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
+};
+
+struct crush_bucket_list {
+ struct crush_bucket h;
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *sum_weights; /* 16-bit fixed point. element i is sum
+ of weights 0..i, inclusive */
+};
+
+struct crush_bucket_tree {
+ struct crush_bucket h; /* note: h.size is _tree_ size, not number of
+ actual items */
+ __u8 num_nodes;
+ __u32 *node_weights;
+};
+
+struct crush_bucket_straw {
+ struct crush_bucket h;
+ __u32 *item_weights; /* 16-bit fixed point */
+ __u32 *straws; /* 16-bit fixed point */
+};
+
+
+
+/*
+ * CRUSH map includes all buckets, rules, etc.
+ */
+struct crush_map {
+ struct crush_bucket **buckets;
+ struct crush_rule **rules;
+
+ /*
+ * Parent pointers to identify the parent bucket a device or
+ * bucket in the hierarchy. If an item appears more than
+ * once, this is the _last_ time it appeared (where buckets
+ * are processed in bucket id order, from -1 on down to
+ * -max_buckets.
+ */
+ __u32 *bucket_parents;
+ __u32 *device_parents;
+
+ __s32 max_buckets;
+ __u32 max_rules;
+ __s32 max_devices;
+};
+
+
+/* crush.c */
+extern int crush_get_bucket_item_weight(struct crush_bucket *b, int pos);
+extern void crush_calc_parents(struct crush_map *map);
+extern void crush_destroy_bucket_uniform(struct crush_bucket_uniform *b);
+extern void crush_destroy_bucket_list(struct crush_bucket_list *b);
+extern void crush_destroy_bucket_tree(struct crush_bucket_tree *b);
+extern void crush_destroy_bucket_straw(struct crush_bucket_straw *b);
+extern void crush_destroy_bucket(struct crush_bucket *b);
+extern void crush_destroy(struct crush_map *map);
+
+#endif
diff --git a/fs/ceph/crush/hash.c b/fs/ceph/crush/hash.c
new file mode 100644
index 0000000..5873aed
--- /dev/null
+++ b/fs/ceph/crush/hash.c
@@ -0,0 +1,149 @@
+
+#include <linux/types.h>
+#include "hash.h"
+
+/*
+ * Robert Jenkins' function for mixing 32-bit values
+ * http://burtleburtle.net/bob/hash/evahash.html
+ * a, b = random bits, c = input and output
+ */
+#define crush_hashmix(a, b, c) do { \
+ a = a-b; a = a-c; a = a^(c>>13); \
+ b = b-c; b = b-a; b = b^(a<<8); \
+ c = c-a; c = c-b; c = c^(b>>13); \
+ a = a-b; a = a-c; a = a^(c>>12); \
+ b = b-c; b = b-a; b = b^(a<<16); \
+ c = c-a; c = c-b; c = c^(b>>5); \
+ a = a-b; a = a-c; a = a^(c>>3); \
+ b = b-c; b = b-a; b = b^(a<<10); \
+ c = c-a; c = c-b; c = c^(b>>15); \
+ } while (0)
+
+#define crush_hash_seed 1315423911
+
+static __u32 crush_hash32_rjenkins1(__u32 a)
+{
+ __u32 hash = crush_hash_seed ^ a;
+ __u32 b = a;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, a, hash);

+ return hash;
+}
+

+static __u32 crush_hash32_rjenkins1_2(__u32 a, __u32 b)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(x, a, hash);
+ crush_hashmix(b, y, hash);

+ return hash;
+}
+

+static __u32 crush_hash32_rjenkins1_3(__u32 a, __u32 b, __u32 c)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);

+ return hash;
+}
+

+static __u32 crush_hash32_rjenkins1_4(__u32 a, __u32 b, __u32 c, __u32 d)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(a, x, hash);
+ crush_hashmix(y, b, hash);
+ crush_hashmix(c, x, hash);
+ crush_hashmix(y, d, hash);

+ return hash;
+}
+

+static __u32 crush_hash32_rjenkins1_5(__u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e)
+{
+ __u32 hash = crush_hash_seed ^ a ^ b ^ c ^ d ^ e;
+ __u32 x = 231232;
+ __u32 y = 1232;
+ crush_hashmix(a, b, hash);
+ crush_hashmix(c, d, hash);
+ crush_hashmix(e, x, hash);
+ crush_hashmix(y, a, hash);
+ crush_hashmix(b, x, hash);
+ crush_hashmix(y, c, hash);
+ crush_hashmix(d, x, hash);
+ crush_hashmix(y, e, hash);

+ return hash;
+}
+
+

+__u32 crush_hash32(int type, __u32 a)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1(a);
+ default:

+ return 0;
+ }
+}
+

+__u32 crush_hash32_2(int type, __u32 a, __u32 b)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_2(a, b);
+ default:

+ return 0;
+ }
+}
+

+__u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_3(a, b, c);
+ default:

+ return 0;
+ }
+}
+

+__u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_4(a, b, c, d);
+ default:

+ return 0;
+ }
+}
+

+__u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d, __u32 e)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return crush_hash32_rjenkins1_5(a, b, c, d, e);
+ default:

+ return 0;
+ }
+}
+

+const char *crush_hash_name(int type)
+{
+ switch (type) {
+ case CRUSH_HASH_RJENKINS1:
+ return "rjenkins1";

+ default:
+ return "unknown";
+ }
+}

diff --git a/fs/ceph/crush/hash.h b/fs/ceph/crush/hash.h
new file mode 100644
index 0000000..ff48e11
--- /dev/null
+++ b/fs/ceph/crush/hash.h
@@ -0,0 +1,17 @@
+#ifndef _CRUSH_HASH_H
+#define _CRUSH_HASH_H
+
+#define CRUSH_HASH_RJENKINS1 0
+
+#define CRUSH_HASH_DEFAULT CRUSH_HASH_RJENKINS1
+
+extern const char *crush_hash_name(int type);
+
+extern __u32 crush_hash32(int type, __u32 a);
+extern __u32 crush_hash32_2(int type, __u32 a, __u32 b);
+extern __u32 crush_hash32_3(int type, __u32 a, __u32 b, __u32 c);
+extern __u32 crush_hash32_4(int type, __u32 a, __u32 b, __u32 c, __u32 d);
+extern __u32 crush_hash32_5(int type, __u32 a, __u32 b, __u32 c, __u32 d,
+ __u32 e);
+
+#endif
diff --git a/fs/ceph/crush/mapper.c b/fs/ceph/crush/mapper.c
new file mode 100644
index 0000000..9ba54ef
--- /dev/null
+++ b/fs/ceph/crush/mapper.c
@@ -0,0 +1,596 @@
+
+#ifdef __KERNEL__
+# include <linux/string.h>
+# include <linux/slab.h>
+# include <linux/bug.h>
+# include <linux/kernel.h>
+# ifndef dprintk
+# define dprintk(args...)
+# endif
+#else
+# include <string.h>
+# include <stdio.h>
+# include <stdlib.h>
+# include <assert.h>
+# define BUG_ON(x) assert(!(x))
+# define dprintk(args...) /* printf(args) */
+# define kmalloc(x, f) malloc(x)
+# define kfree(x) free(x)
+#endif
+
+#include "crush.h"
+#include "hash.h"
+
+/*
+ * Implement the core CRUSH mapping algorithm.
+ */
+
+/**
+ * crush_find_rule - find a crush_rule id for a given ruleset, type, and size.
+ * @map: the crush_map
+ * @ruleset: the storage ruleset id (user defined)
+ * @type: storage ruleset type (user defined)
+ * @size: output set size
+ */
+int crush_find_rule(struct crush_map *map, int ruleset, int type, int size)

+{
+ int i;
+

+ for (i = 0; i < map->max_rules; i++) {
+ if (map->rules[i] &&
+ map->rules[i]->mask.ruleset == ruleset &&
+ map->rules[i]->mask.type == type &&
+ map->rules[i]->mask.min_size <= size &&
+ map->rules[i]->mask.max_size >= size)
+ return i;
+ }
+ return -1;
+}
+
+
+/*
+ * bucket choose methods
+ *
+ * For each bucket algorithm, we have a "choose" method that, given a
+ * crush input @x and replica position (usually, position in output set) @r,
+ * will produce an item in the bucket.
+ */
+
+/*
+ * Choose based on a random permutation of the bucket.
+ *
+ * We used to use some prime number arithmetic to do this, but it
+ * wasn't very random, and had some other bad behaviors. Instead, we
+ * calculate an actual random permutation of the bucket members.
+ * Since this is expensive, we optimize for the r=0 case, which
+ * captures the vast majority of calls.
+ */
+static int bucket_perm_choose(struct crush_bucket *bucket,
+ int x, int r)
+{
+ unsigned pr = r % bucket->size;
+ unsigned i, s;
+
+ /* start a new permutation if @x has changed */
+ if (bucket->perm_x != x || bucket->perm_n == 0) {
+ dprintk("bucket %d new x=%d\n", bucket->id, x);
+ bucket->perm_x = x;
+
+ /* optimize common r=0 case */
+ if (pr == 0) {
+ s = crush_hash32_3(bucket->hash, x, bucket->id, 0) %
+ bucket->size;
+ bucket->perm[0] = s;
+ bucket->perm_n = 0xffff; /* magic value, see below */

+ goto out;
+ }
+

+ for (i = 0; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm_n = 0;
+ } else if (bucket->perm_n == 0xffff) {
+ /* clean up after the r=0 case above */
+ for (i = 1; i < bucket->size; i++)
+ bucket->perm[i] = i;
+ bucket->perm[bucket->perm[0]] = 0;
+ bucket->perm_n = 1;
+ }
+
+ /* calculate permutation up to pr */
+ for (i = 0; i < bucket->perm_n; i++)
+ dprintk(" perm_choose have %d: %d\n", i, bucket->perm[i]);
+ while (bucket->perm_n <= pr) {
+ unsigned p = bucket->perm_n;
+ /* no point in swapping the final entry */
+ if (p < bucket->size - 1) {
+ i = crush_hash32_3(bucket->hash, x, bucket->id, p) %
+ (bucket->size - p);
+ if (i) {
+ unsigned t = bucket->perm[p + i];
+ bucket->perm[p + i] = bucket->perm[p];
+ bucket->perm[p] = t;
+ }
+ dprintk(" perm_choose swap %d with %d\n", p, p+i);
+ }
+ bucket->perm_n++;
+ }
+ for (i = 0; i < bucket->size; i++)
+ dprintk(" perm_choose %d: %d\n", i, bucket->perm[i]);
+
+ s = bucket->perm[pr];
+out:
+ dprintk(" perm_choose %d sz=%d x=%d r=%d (%d) s=%d\n", bucket->id,
+ bucket->size, x, r, pr, s);
+ return bucket->items[s];
+}
+
+/* uniform */
+static int bucket_uniform_choose(struct crush_bucket_uniform *bucket,
+ int x, int r)
+{
+ return bucket_perm_choose(&bucket->h, x, r);
+}
+
+/* list */
+static int bucket_list_choose(struct crush_bucket_list *bucket,
+ int x, int r)

+{
+ int i;
+

+ for (i = bucket->h.size-1; i >= 0; i--) {
+ __u64 w = crush_hash32_4(bucket->h.hash,x, bucket->h.items[i],
+ r, bucket->h.id);
+ w &= 0xffff;
+ dprintk("list_choose i=%d x=%d r=%d item %d weight %x "
+ "sw %x rand %llx",
+ i, x, r, bucket->h.items[i], bucket->item_weights[i],
+ bucket->sum_weights[i], w);
+ w *= bucket->sum_weights[i];
+ w = w >> 16;
+ /*dprintk(" scaled %llx\n", w);*/
+ if (w < bucket->item_weights[i])
+ return bucket->h.items[i];
+ }
+
+ BUG_ON(1);

+ return 0;
+}
+
+

+/* (binary) tree */
+static int height(int n)
+{
+ int h = 0;
+ while ((n & 1) == 0) {
+ h++;
+ n = n >> 1;
+ }
+ return h;
+}
+
+static int left(int x)
+{
+ int h = height(x);
+ return x - (1 << (h-1));
+}
+
+static int right(int x)
+{
+ int h = height(x);
+ return x + (1 << (h-1));
+}
+
+static int terminal(int x)
+{
+ return x & 1;
+}
+
+static int bucket_tree_choose(struct crush_bucket_tree *bucket,
+ int x, int r)
+{
+ int n, l;
+ __u32 w;
+ __u64 t;
+
+ /* start at root */
+ n = bucket->num_nodes >> 1;
+
+ while (!terminal(n)) {
+ /* pick point in [0, w) */
+ w = bucket->node_weights[n];
+ t = (__u64)crush_hash32_4(bucket->h.hash, x, n, r,
+ bucket->h.id) * (__u64)w;
+ t = t >> 32;
+
+ /* descend to the left or right? */
+ l = left(n);
+ if (t < bucket->node_weights[l])
+ n = l;
+ else
+ n = right(n);
+ }
+
+ return bucket->h.items[n >> 1];
+}
+
+
+/* straw */
+
+static int bucket_straw_choose(struct crush_bucket_straw *bucket,
+ int x, int r)
+{
+ int i;
+ int high = 0;
+ __u64 high_draw = 0;
+ __u64 draw;
+
+ for (i = 0; i < bucket->h.size; i++) {
+ draw = crush_hash32_3(bucket->h.hash, x, bucket->h.items[i], r);
+ draw &= 0xffff;
+ draw *= bucket->straws[i];
+ if (i == 0 || draw > high_draw) {
+ high = i;
+ high_draw = draw;
+ }
+ }
+ return bucket->h.items[high];
+}
+
+static int crush_bucket_choose(struct crush_bucket *in, int x, int r)
+{
+ dprintk("choose %d x=%d r=%d\n", in->id, x, r);
+ switch (in->alg) {
+ case CRUSH_BUCKET_UNIFORM:
+ return bucket_uniform_choose((struct crush_bucket_uniform *)in,
+ x, r);
+ case CRUSH_BUCKET_LIST:
+ return bucket_list_choose((struct crush_bucket_list *)in,
+ x, r);
+ case CRUSH_BUCKET_TREE:
+ return bucket_tree_choose((struct crush_bucket_tree *)in,
+ x, r);
+ case CRUSH_BUCKET_STRAW:
+ return bucket_straw_choose((struct crush_bucket_straw *)in,
+ x, r);
+ default:
+ BUG_ON(1);
+ return in->items[0];
+ }
+}
+
+/*
+ * true if device is marked "out" (failed, fully offloaded)
+ * of the cluster
+ */
+static int is_out(struct crush_map *map, __u32 *weight, int item, int x)
+{
+ if (weight[item] >= 0x1000)
+ return 0;
+ if (weight[item] == 0)
+ return 1;
+ if ((crush_hash32_2(CRUSH_HASH_RJENKINS1, x, item) & 0xffff)
+ < weight[item])
+ return 0;

+ return 1;
+}
+

+/**
+ * crush_choose - choose numrep distinct items of given type
+ * @map: the crush_map
+ * @bucket: the bucket we are choose an item from
+ * @x: crush input value
+ * @numrep: the number of items to choose
+ * @type: the type of item to choose
+ * @out: pointer to output vector
+ * @outpos: our position in that vector
+ * @firstn: true if choosing "first n" items, false if choosing "indep"
+ * @recurse_to_leaf: true if we want one device under each item of given type
+ * @out2: second output vector for leaf items (if @recurse_to_leaf)
+ */
+static int crush_choose(struct crush_map *map,
+ struct crush_bucket *bucket,
+ __u32 *weight,
+ int x, int numrep, int type,
+ int *out, int outpos,
+ int firstn, int recurse_to_leaf,
+ int *out2)
+{
+ int rep;
+ int ftotal, flocal;
+ int retry_descent, retry_bucket, skip_rep;
+ struct crush_bucket *in = bucket;
+ int r;
+ int i;
+ int item = 0;
+ int itemtype;
+ int collide, reject;
+ const int orig_tries = 5; /* attempts before we fall back to search */
+ dprintk("choose bucket %d x %d outpos %d\n", bucket->id, x, outpos);
+
+ for (rep = outpos; rep < numrep; rep++) {
+ /* keep trying until we get a non-out, non-colliding item */
+ ftotal = 0;
+ skip_rep = 0;
+ do {
+ retry_descent = 0;
+ in = bucket; /* initial bucket */
+
+ /* choose through intervening buckets */
+ flocal = 0;
+ do {
+ collide = 0;
+ retry_bucket = 0;
+ r = rep;
+ if (in->alg == CRUSH_BUCKET_UNIFORM) {
+ /* be careful */
+ if (firstn || numrep >= in->size)
+ /* r' = r + f_total */
+ r += ftotal;
+ else if (in->size % numrep == 0)
+ /* r'=r+(n+1)*f_local */
+ r += (numrep+1) *
+ (flocal+ftotal);
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ } else {
+ if (firstn)
+ /* r' = r + f_total */
+ r += ftotal;
+ else
+ /* r' = r + n*f_local */
+ r += numrep * (flocal+ftotal);
+ }
+
+ /* bucket choose */
+ if (in->size == 0) {
+ reject = 1;
+ goto reject;
+ }
+ if (flocal >= (in->size>>1) &&
+ flocal > orig_tries)
+ item = bucket_perm_choose(in, x, r);
+ else
+ item = crush_bucket_choose(in, x, r);
+ BUG_ON(item >= map->max_devices);
+
+ /* desired type? */
+ if (item < 0)
+ itemtype = map->buckets[-1-item]->type;
+ else
+ itemtype = 0;
+ dprintk(" item %d type %d\n", item, itemtype);
+
+ /* keep going? */
+ if (itemtype != type) {
+ BUG_ON(item >= 0 ||
+ (-1-item) >= map->max_buckets);
+ in = map->buckets[-1-item];
+ continue;
+ }
+
+ /* collision? */
+ for (i = 0; i < outpos; i++) {
+ if (out[i] == item) {
+ collide = 1;
+ break;
+ }
+ }
+
+ if (recurse_to_leaf &&
+ item < 0 &&
+ crush_choose(map, map->buckets[-1-item],
+ weight,
+ x, outpos+1, 0,
+ out2, outpos,
+ firstn, 0, NULL) <= outpos) {
+ reject = 1;
+ } else {
+ /* out? */
+ if (itemtype == 0)
+ reject = is_out(map, weight,
+ item, x);
+ else
+ reject = 0;
+ }
+
+reject:
+ if (reject || collide) {
+ ftotal++;
+ flocal++;
+
+ if (collide && flocal < 3)
+ /* retry locally a few times */
+ retry_bucket = 1;
+ else if (flocal < in->size + orig_tries)
+ /* exhaustive bucket search */
+ retry_bucket = 1;
+ else if (ftotal < 20)
+ /* then retry descent */
+ retry_descent = 1;
+ else
+ /* else give up */
+ skip_rep = 1;
+ dprintk(" reject %d collide %d "
+ "ftotal %d flocal %d\n",
+ reject, collide, ftotal,
+ flocal);
+ }
+ } while (retry_bucket);
+ } while (retry_descent);
+
+ if (skip_rep) {
+ dprintk("skip rep\n");
+ continue;
+ }
+
+ dprintk("choose got %d\n", item);
+ out[outpos] = item;
+ outpos++;
+ }
+
+ dprintk("choose returns %d\n", outpos);
+ return outpos;
+}
+
+
+/**
+ * crush_do_rule - calculate a mapping with the given input and rule
+ * @map: the crush_map
+ * @ruleno: the rule id
+ * @x: hash input
+ * @result: pointer to result vector
+ * @result_max: maximum result size
+ * @force: force initial replica choice; -1 for none
+ */
+int crush_do_rule(struct crush_map *map,
+ int ruleno, int x, int *result, int result_max,
+ int force, __u32 *weight)
+{
+ int result_len;
+ int force_context[CRUSH_MAX_DEPTH];
+ int force_pos = -1;
+ int a[CRUSH_MAX_SET];
+ int b[CRUSH_MAX_SET];
+ int c[CRUSH_MAX_SET];
+ int recurse_to_leaf;
+ int *w;
+ int wsize = 0;
+ int *o;
+ int osize;
+ int *tmp;
+ struct crush_rule *rule;
+ int step;
+ int i, j;
+ int numrep;
+ int firstn;
+ int rc = -1;
+
+ BUG_ON(ruleno >= map->max_rules);
+
+ rule = map->rules[ruleno];
+ result_len = 0;
+ w = a;
+ o = b;
+
+ /*
+ * determine hierarchical context of force, if any. note
+ * that this may or may not correspond to the specific types
+ * referenced by the crush rule.
+ */
+ if (force >= 0) {
+ if (force >= map->max_devices ||
+ map->device_parents[force] == 0) {
+ /*dprintk("CRUSH: forcefed device dne\n");*/
+ rc = -1; /* force fed device dne */
+ goto out;
+ }
+ if (!is_out(map, weight, force, x)) {
+ while (1) {
+ force_context[++force_pos] = force;
+ if (force >= 0)
+ force = map->device_parents[force];
+ else
+ force = map->bucket_parents[-1-force];
+ if (force == 0)

+ break;
+ }
+ }
+ }
+

+ for (step = 0; step < rule->len; step++) {
+ firstn = 0;
+ switch (rule->steps[step].op) {
+ case CRUSH_RULE_TAKE:
+ w[0] = rule->steps[step].arg1;
+ if (force_pos >= 0) {
+ BUG_ON(force_context[force_pos] != w[0]);
+ force_pos--;
+ }
+ wsize = 1;
+ break;
+
+ case CRUSH_RULE_CHOOSE_LEAF_FIRSTN:
+ case CRUSH_RULE_CHOOSE_FIRSTN:
+ firstn = 1;
+ case CRUSH_RULE_CHOOSE_LEAF_INDEP:
+ case CRUSH_RULE_CHOOSE_INDEP:
+ BUG_ON(wsize == 0);
+
+ recurse_to_leaf =
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_FIRSTN ||
+ rule->steps[step].op ==
+ CRUSH_RULE_CHOOSE_LEAF_INDEP;
+
+ /* reset output */
+ osize = 0;
+
+ for (i = 0; i < wsize; i++) {
+ /*
+ * see CRUSH_N, CRUSH_N_MINUS macros.
+ * basically, numrep <= 0 means relative to
+ * the provided result_max
+ */
+ numrep = rule->steps[step].arg1;
+ if (numrep <= 0) {
+ numrep += result_max;
+ if (numrep <= 0)
+ continue;
+ }
+ j = 0;
+ if (osize == 0 && force_pos >= 0) {
+ /* skip any intermediate types */
+ while (force_pos &&
+ force_context[force_pos] < 0 &&
+ rule->steps[step].arg2 !=
+ map->buckets[-1 -
+ force_context[force_pos]]->type)
+ force_pos--;
+ o[osize] = force_context[force_pos];
+ if (recurse_to_leaf)
+ c[osize] = force_context[0];
+ j++;
+ force_pos--;
+ }
+ osize += crush_choose(map,
+ map->buckets[-1-w[i]],
+ weight,
+ x, numrep,
+ rule->steps[step].arg2,
+ o+osize, j,
+ firstn,
+ recurse_to_leaf, c+osize);
+ }
+
+ if (recurse_to_leaf)
+ /* copy final _leaf_ values to output set */
+ memcpy(o, c, osize*sizeof(*o));
+
+ /* swap t and w arrays */
+ tmp = o;
+ o = w;
+ w = tmp;
+ wsize = osize;
+ break;
+
+
+ case CRUSH_RULE_EMIT:
+ for (i = 0; i < wsize && result_len < result_max; i++) {
+ result[result_len] = w[i];
+ result_len++;
+ }
+ wsize = 0;

+ break;
+
+ default:

+ BUG_ON(1);
+ }
+ }
+ rc = result_len;
+
+out:
+ return rc;
+}
+
+
diff --git a/fs/ceph/crush/mapper.h b/fs/ceph/crush/mapper.h
new file mode 100644
index 0000000..98e9004
--- /dev/null
+++ b/fs/ceph/crush/mapper.h
@@ -0,0 +1,20 @@
+#ifndef _CRUSH_MAPPER_H
+#define _CRUSH_MAPPER_H
+
+/*
+ * CRUSH functions for find rules and then mapping an input to an
+ * output set.

+ *
+ * LGPL2
+ */
+

+#include "crush.h"
+
+extern int crush_find_rule(struct crush_map *map, int pool, int type, int size);
+extern int crush_do_rule(struct crush_map *map,
+ int ruleno,
+ int x, int *result, int result_max,
+ int forcefeed, /* -1 for none */
+ __u32 *weights);

Sage Weil

unread,

Mar 2, 2010, 8:00:02 PM3/2/10

Mount option parsing, client setup and teardown, and a few odds and
ends (e.g., statfs).

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/super.c | 1030 +++++++++++++++++++++++++++++++++++++++++++++++++++++++
1 files changed, 1030 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/super.c

diff --git a/fs/ceph/super.c b/fs/ceph/super.c
new file mode 100644
index 0000000..4290a6e
--- /dev/null
+++ b/fs/ceph/super.c
@@ -0,0 +1,1030 @@
+
+#include "ceph_debug.h"
+
+#include <linux/backing-dev.h>
+#include <linux/fs.h>
+#include <linux/inet.h>
+#include <linux/in6.h>
+#include <linux/module.h>
+#include <linux/mount.h>
+#include <linux/parser.h>
+#include <linux/rwsem.h>
+#include <linux/sched.h>
+#include <linux/seq_file.h>
+#include <linux/statfs.h>
+#include <linux/string.h>
+#include <linux/version.h>
+#include <linux/vmalloc.h>
+
+#include "decode.h"
+#include "super.h"
+#include "mon_client.h"
+#include "auth.h"
+
+/*
+ * Ceph superblock operations
+ *
+ * Handle the basics of mounting, unmounting.
+ */
+
+
+/*
+ * find filename portion of a path (/foo/bar/baz -> baz)
+ */
+const char *ceph_file_part(const char *s, int len)
+{
+ const char *e = s + len;
+
+ while (e != s && *(e-1) != '/')
+ e--;
+ return e;
+}
+
+
+/*
+ * super ops
+ */
+static void ceph_put_super(struct super_block *s)
+{
+ struct ceph_client *cl = ceph_client(s);
+
+ dout("put_super\n");
+ ceph_mdsc_close_sessions(&cl->mdsc);
+ return;
+}
+
+static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
+{
+ struct ceph_client *client = ceph_inode_to_client(dentry->d_inode);
+ struct ceph_monmap *monmap = client->monc.monmap;
+ struct ceph_statfs st;
+ u64 fsid;
+ int err;
+
+ dout("statfs\n");
+ err = ceph_monc_do_statfs(&client->monc, &st);
+ if (err < 0)
+ return err;
+
+ /* fill in kstatfs */
+ buf->f_type = CEPH_SUPER_MAGIC; /* ?? */
+
+ /*
+ * express utilization in terms of large blocks to avoid
+ * overflow on 32-bit machines.
+ */
+ buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
+ buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
+ buf->f_bfree = (le64_to_cpu(st.kb) - le64_to_cpu(st.kb_used)) >>
+ (CEPH_BLOCK_SHIFT-10);
+ buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
+
+ buf->f_files = le64_to_cpu(st.num_objects);
+ buf->f_ffree = -1;
+ buf->f_namelen = PATH_MAX;
+ buf->f_frsize = PAGE_CACHE_SIZE;
+
+ /* leave fsid little-endian, regardless of host endianness */
+ fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
+ buf->f_fsid.val[0] = fsid & 0xffffffff;
+ buf->f_fsid.val[1] = fsid >> 32;

+
+ return 0;
+}
+
+

+static int ceph_syncfs(struct super_block *sb, int wait)
+{
+ dout("sync_fs %d\n", wait);
+ ceph_osdc_sync(&ceph_client(sb)->osdc);
+ ceph_mdsc_sync(&ceph_client(sb)->mdsc);
+ dout("sync_fs %d done\n", wait);

+ return 0;
+}
+
+

+/**
+ * ceph_show_options - Show mount options in /proc/mounts
+ * @m: seq_file to write to
+ * @mnt: mount descriptor
+ */
+static int ceph_show_options(struct seq_file *m, struct vfsmount *mnt)
+{
+ struct ceph_client *client = ceph_sb_to_client(mnt->mnt_sb);
+ struct ceph_mount_args *args = client->mount_args;
+
+ if (args->flags & CEPH_OPT_FSID)
+ seq_printf(m, ",fsidmajor=%llu,fsidminor%llu",
+ le64_to_cpu(*(__le64 *)&args->fsid.fsid[0]),
+ le64_to_cpu(*(__le64 *)&args->fsid.fsid[8]));
+ if (args->flags & CEPH_OPT_NOSHARE)
+ seq_puts(m, ",noshare");
+ if (args->flags & CEPH_OPT_DIRSTAT)
+ seq_puts(m, ",dirstat");
+ if ((args->flags & CEPH_OPT_RBYTES) == 0)
+ seq_puts(m, ",norbytes");
+ if (args->flags & CEPH_OPT_NOCRC)
+ seq_puts(m, ",nocrc");
+ if (args->flags & CEPH_OPT_NOASYNCREADDIR)
+ seq_puts(m, ",noasyncreaddir");
+ if (strcmp(args->snapdir_name, CEPH_SNAPDIRNAME_DEFAULT))
+ seq_printf(m, ",snapdirname=%s", args->snapdir_name);
+ if (args->name)
+ seq_printf(m, ",name=%s", args->name);
+ if (args->secret)
+ seq_puts(m, ",secret=<hidden>");
+ return 0;
+}
+
+/*
+ * caches
+ */
+struct kmem_cache *ceph_inode_cachep;
+struct kmem_cache *ceph_cap_cachep;
+struct kmem_cache *ceph_dentry_cachep;
+struct kmem_cache *ceph_file_cachep;
+
+static void ceph_inode_init_once(void *foo)
+{
+ struct ceph_inode_info *ci = foo;
+ inode_init_once(&ci->vfs_inode);
+}
+
+static int default_congestion_kb(void)
+{
+ int congestion_kb;
+
+ /*
+ * Copied from NFS
+ *
+ * congestion size, scale with available memory.
+ *
+ * 64MB: 8192k
+ * 128MB: 11585k
+ * 256MB: 16384k
+ * 512MB: 23170k
+ * 1GB: 32768k
+ * 2GB: 46340k
+ * 4GB: 65536k
+ * 8GB: 92681k
+ * 16GB: 131072k
+ *
+ * This allows larger machines to have larger/more transfers.
+ * Limit the default to 256M
+ */
+ congestion_kb = (16*int_sqrt(totalram_pages)) << (PAGE_SHIFT-10);
+ if (congestion_kb > 256*1024)
+ congestion_kb = 256*1024;
+
+ return congestion_kb;
+}
+
+static int __init init_caches(void)
+{
+ ceph_inode_cachep = kmem_cache_create("ceph_inode_info",
+ sizeof(struct ceph_inode_info),
+ __alignof__(struct ceph_inode_info),
+ (SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD),
+ ceph_inode_init_once);
+ if (ceph_inode_cachep == NULL)
+ return -ENOMEM;
+
+ ceph_cap_cachep = KMEM_CACHE(ceph_cap,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_cap_cachep == NULL)
+ goto bad_cap;
+
+ ceph_dentry_cachep = KMEM_CACHE(ceph_dentry_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_dentry_cachep == NULL)
+ goto bad_dentry;
+
+ ceph_file_cachep = KMEM_CACHE(ceph_file_info,
+ SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
+ if (ceph_file_cachep == NULL)
+ goto bad_file;

+
+ return 0;
+

+bad_file:
+ kmem_cache_destroy(ceph_dentry_cachep);
+bad_dentry:
+ kmem_cache_destroy(ceph_cap_cachep);
+bad_cap:
+ kmem_cache_destroy(ceph_inode_cachep);

+ return -ENOMEM;
+}
+

+static void destroy_caches(void)
+{
+ kmem_cache_destroy(ceph_inode_cachep);
+ kmem_cache_destroy(ceph_cap_cachep);
+ kmem_cache_destroy(ceph_dentry_cachep);
+ kmem_cache_destroy(ceph_file_cachep);
+}
+
+
+/*
+ * ceph_umount_begin - initiate forced umount. Tear down down the
+ * mount, skipping steps that may hang while waiting for server(s).
+ */
+static void ceph_umount_begin(struct super_block *sb)
+{
+ struct ceph_client *client = ceph_sb_to_client(sb);
+
+ dout("ceph_umount_begin - starting forced umount\n");
+ if (!client)
+ return;
+ client->mount_state = CEPH_MOUNT_SHUTDOWN;
+ return;
+}
+
+static const struct super_operations ceph_super_ops = {
+ .alloc_inode = ceph_alloc_inode,
+ .destroy_inode = ceph_destroy_inode,
+ .write_inode = ceph_write_inode,
+ .sync_fs = ceph_syncfs,
+ .put_super = ceph_put_super,
+ .show_options = ceph_show_options,
+ .statfs = ceph_statfs,
+ .umount_begin = ceph_umount_begin,
+};
+
+
+const char *ceph_msg_type_name(int type)
+{
+ switch (type) {
+ case CEPH_MSG_SHUTDOWN: return "shutdown";
+ case CEPH_MSG_PING: return "ping";
+ case CEPH_MSG_AUTH: return "auth";
+ case CEPH_MSG_AUTH_REPLY: return "auth_reply";
+ case CEPH_MSG_MON_MAP: return "mon_map";
+ case CEPH_MSG_MON_GET_MAP: return "mon_get_map";
+ case CEPH_MSG_MON_SUBSCRIBE: return "mon_subscribe";
+ case CEPH_MSG_MON_SUBSCRIBE_ACK: return "mon_subscribe_ack";
+ case CEPH_MSG_STATFS: return "statfs";
+ case CEPH_MSG_STATFS_REPLY: return "statfs_reply";
+ case CEPH_MSG_MDS_MAP: return "mds_map";
+ case CEPH_MSG_CLIENT_SESSION: return "client_session";
+ case CEPH_MSG_CLIENT_RECONNECT: return "client_reconnect";
+ case CEPH_MSG_CLIENT_REQUEST: return "client_request";
+ case CEPH_MSG_CLIENT_REQUEST_FORWARD: return "client_request_forward";
+ case CEPH_MSG_CLIENT_REPLY: return "client_reply";
+ case CEPH_MSG_CLIENT_CAPS: return "client_caps";
+ case CEPH_MSG_CLIENT_CAPRELEASE: return "client_cap_release";
+ case CEPH_MSG_CLIENT_SNAP: return "client_snap";
+ case CEPH_MSG_CLIENT_LEASE: return "client_lease";
+ case CEPH_MSG_OSD_MAP: return "osd_map";
+ case CEPH_MSG_OSD_OP: return "osd_op";
+ case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";

+ default: return "unknown";
+ }
+}
+

+
+/*
+ * mount options
+ */

+enum {
+ Opt_fsidmajor,
+ Opt_fsidminor,
+ Opt_monport,
+ Opt_wsize,
+ Opt_rsize,
+ Opt_osdtimeout,
+ Opt_osdkeepalivetimeout,
+ Opt_mount_timeout,
+ Opt_osd_idle_ttl,
+ Opt_caps_wanted_delay_min,
+ Opt_caps_wanted_delay_max,
+ Opt_readdir_max_entries,
+ Opt_congestion_kb,
+ Opt_last_int,
+ /* int args above */
+ Opt_snapdirname,
+ Opt_name,
+ Opt_secret,
+ Opt_last_string,
+ /* string args above */
+ Opt_ip,
+ Opt_noshare,
+ Opt_dirstat,
+ Opt_nodirstat,
+ Opt_rbytes,
+ Opt_norbytes,
+ Opt_nocrc,
+ Opt_noasyncreaddir,
+};
+
+static match_table_t arg_tokens = {
+ {Opt_fsidmajor, "fsidmajor=%ld"},
+ {Opt_fsidminor, "fsidminor=%ld"},
+ {Opt_monport, "monport=%d"},
+ {Opt_wsize, "wsize=%d"},
+ {Opt_rsize, "rsize=%d"},
+ {Opt_osdtimeout, "osdtimeout=%d"},
+ {Opt_osdkeepalivetimeout, "osdkeepalive=%d"},
+ {Opt_mount_timeout, "mount_timeout=%d"},
+ {Opt_osd_idle_ttl, "osd_idle_ttl=%d"},
+ {Opt_caps_wanted_delay_min, "caps_wanted_delay_min=%d"},
+ {Opt_caps_wanted_delay_max, "caps_wanted_delay_max=%d"},
+ {Opt_readdir_max_entries, "readdir_max_entries=%d"},
+ {Opt_congestion_kb, "write_congestion_kb=%d"},
+ /* int args above */
+ {Opt_snapdirname, "snapdirname=%s"},
+ {Opt_name, "name=%s"},
+ {Opt_secret, "secret=%s"},
+ /* string args above */
+ {Opt_ip, "ip=%s"},
+ {Opt_noshare, "noshare"},
+ {Opt_dirstat, "dirstat"},
+ {Opt_nodirstat, "nodirstat"},
+ {Opt_rbytes, "rbytes"},
+ {Opt_norbytes, "norbytes"},
+ {Opt_nocrc, "nocrc"},
+ {Opt_noasyncreaddir, "noasyncreaddir"},
+ {-1, NULL}
+};
+
+
+static struct ceph_mount_args *parse_mount_args(int flags, char *options,
+ const char *dev_name,
+ const char **path)
+{
+ struct ceph_mount_args *args;
+ const char *c;

+ int err = -ENOMEM;

+ substring_t argstr[MAX_OPT_ARGS];
+
+ args = kzalloc(sizeof(*args), GFP_KERNEL);
+ if (!args)
+ return ERR_PTR(-ENOMEM);
+ args->mon_addr = kcalloc(CEPH_MAX_MON, sizeof(*args->mon_addr),
+ GFP_KERNEL);
+ if (!args->mon_addr)
+ goto out;
+
+ dout("parse_mount_args %p, dev_name '%s'\n", args, dev_name);
+
+ /* start with defaults */
+ args->sb_flags = flags;
+ args->flags = CEPH_OPT_DEFAULT;
+ args->osd_timeout = CEPH_OSD_TIMEOUT_DEFAULT;
+ args->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
+ args->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; /* seconds */
+ args->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; /* seconds */
+ args->caps_wanted_delay_min = CEPH_CAPS_WANTED_DELAY_MIN_DEFAULT;
+ args->caps_wanted_delay_max = CEPH_CAPS_WANTED_DELAY_MAX_DEFAULT;
+ args->rsize = CEPH_MOUNT_RSIZE_DEFAULT;
+ args->snapdir_name = kstrdup(CEPH_SNAPDIRNAME_DEFAULT, GFP_KERNEL);
+ args->cap_release_safety = CEPH_CAPS_PER_RELEASE * 4;
+ args->max_readdir = 1024;
+ args->congestion_kb = default_congestion_kb();
+
+ /* ip1[:port1][,ip2[:port2]...]:/subdir/in/fs */
+ err = -EINVAL;
+ if (!dev_name)
+ goto out;
+ *path = strstr(dev_name, ":/");
+ if (*path == NULL) {
+ pr_err("device name is missing path (no :/ in %s)\n",
+ dev_name);

+ goto out;
+ }
+

+ /* get mon ip(s) */
+ err = ceph_parse_ips(dev_name, *path, args->mon_addr,
+ CEPH_MAX_MON, &args->num_mon);
+ if (err < 0)
+ goto out;
+
+ /* path on server */
+ *path += 2;
+ dout("server path '%s'\n", *path);
+
+ /* parse mount options */
+ while ((c = strsep(&options, ",")) != NULL) {
+ int token, intval, ret;
+ if (!*c)
+ continue;
+ err = -EINVAL;
+ token = match_token((char *)c, arg_tokens, argstr);
+ if (token < 0) {
+ pr_err("bad mount option at '%s'\n", c);
+ goto out;
+ }
+ if (token < Opt_last_int) {
+ ret = match_int(&argstr[0], &intval);
+ if (ret < 0) {
+ pr_err("bad mount option arg (not int) "
+ "at '%s'\n", c);
+ continue;
+ }
+ dout("got int token %d val %d\n", token, intval);
+ } else if (token > Opt_last_int && token < Opt_last_string) {
+ dout("got string token %d val %s\n", token,
+ argstr[0].from);
+ } else {
+ dout("got token %d\n", token);
+ }
+ switch (token) {
+ case Opt_fsidmajor:
+ *(__le64 *)&args->fsid.fsid[0] = cpu_to_le64(intval);
+ break;
+ case Opt_fsidminor:
+ *(__le64 *)&args->fsid.fsid[8] = cpu_to_le64(intval);
+ break;
+ case Opt_ip:
+ err = ceph_parse_ips(argstr[0].from,
+ argstr[0].to,
+ &args->my_addr,
+ 1, NULL);
+ if (err < 0)
+ goto out;
+ args->flags |= CEPH_OPT_MYIP;
+ break;
+
+ case Opt_snapdirname:
+ kfree(args->snapdir_name);
+ args->snapdir_name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_name:
+ args->name = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+ case Opt_secret:
+ args->secret = kstrndup(argstr[0].from,
+ argstr[0].to-argstr[0].from,
+ GFP_KERNEL);
+ break;
+
+ /* misc */
+ case Opt_wsize:
+ args->wsize = intval;
+ break;
+ case Opt_rsize:
+ args->rsize = intval;
+ break;
+ case Opt_osdtimeout:
+ args->osd_timeout = intval;
+ break;
+ case Opt_osdkeepalivetimeout:
+ args->osd_keepalive_timeout = intval;
+ break;
+ case Opt_mount_timeout:
+ args->mount_timeout = intval;
+ break;
+ case Opt_caps_wanted_delay_min:
+ args->caps_wanted_delay_min = intval;
+ break;
+ case Opt_caps_wanted_delay_max:
+ args->caps_wanted_delay_max = intval;
+ break;
+ case Opt_readdir_max_entries:
+ args->max_readdir = intval;
+ break;
+ case Opt_congestion_kb:
+ args->congestion_kb = intval;
+ break;
+
+ case Opt_noshare:
+ args->flags |= CEPH_OPT_NOSHARE;
+ break;
+
+ case Opt_dirstat:
+ args->flags |= CEPH_OPT_DIRSTAT;
+ break;
+ case Opt_nodirstat:
+ args->flags &= ~CEPH_OPT_DIRSTAT;
+ break;
+ case Opt_rbytes:
+ args->flags |= CEPH_OPT_RBYTES;
+ break;
+ case Opt_norbytes:
+ args->flags &= ~CEPH_OPT_RBYTES;
+ break;
+ case Opt_nocrc:
+ args->flags |= CEPH_OPT_NOCRC;
+ break;
+ case Opt_noasyncreaddir:
+ args->flags |= CEPH_OPT_NOASYNCREADDIR;

+ break;
+
+ default:

+ BUG_ON(token);
+ }
+ }
+ return args;
+
+out:
+ kfree(args->mon_addr);
+ kfree(args);

+ return ERR_PTR(err);
+}
+

+static void destroy_mount_args(struct ceph_mount_args *args)
+{
+ dout("destroy_mount_args %p\n", args);
+ kfree(args->snapdir_name);
+ args->snapdir_name = NULL;
+ kfree(args->name);
+ args->name = NULL;
+ kfree(args->secret);
+ args->secret = NULL;
+ kfree(args);
+}
+
+/*
+ * create a fresh client instance
+ */
+static struct ceph_client *ceph_create_client(struct ceph_mount_args *args)
+{
+ struct ceph_client *client;

+ int err = -ENOMEM;
+

+ client = kzalloc(sizeof(*client), GFP_KERNEL);
+ if (client == NULL)
+ return ERR_PTR(-ENOMEM);
+
+ mutex_init(&client->mount_mutex);
+
+ init_waitqueue_head(&client->auth_wq);
+
+ client->sb = NULL;
+ client->mount_state = CEPH_MOUNT_MOUNTING;
+ client->mount_args = args;
+
+ client->msgr = NULL;
+
+ client->auth_err = 0;
+ atomic_long_set(&client->writeback_count, 0);
+
+ err = bdi_init(&client->backing_dev_info);

+ if (err < 0)
+ goto fail;
+

+ err = -ENOMEM;
+ client->wb_wq = create_workqueue("ceph-writeback");
+ if (client->wb_wq == NULL)
+ goto fail_bdi;
+ client->pg_inv_wq = create_singlethread_workqueue("ceph-pg-invalid");
+ if (client->pg_inv_wq == NULL)
+ goto fail_wb_wq;
+ client->trunc_wq = create_singlethread_workqueue("ceph-trunc");
+ if (client->trunc_wq == NULL)
+ goto fail_pg_inv_wq;
+
+ /* set up mempools */
+ err = -ENOMEM;
+ client->wb_pagevec_pool = mempool_create_kmalloc_pool(10,
+ client->mount_args->wsize >> PAGE_CACHE_SHIFT);
+ if (!client->wb_pagevec_pool)
+ goto fail_trunc_wq;
+
+ /* caps */
+ client->min_caps = args->max_readdir;
+ ceph_adjust_min_caps(client->min_caps);
+
+ /* subsystems */
+ err = ceph_monc_init(&client->monc, client);
+ if (err < 0)
+ goto fail_mempool;
+ err = ceph_osdc_init(&client->osdc, client);
+ if (err < 0)
+ goto fail_monc;
+ err = ceph_mdsc_init(&client->mdsc, client);
+ if (err < 0)
+ goto fail_osdc;
+ return client;
+
+fail_osdc:
+ ceph_osdc_stop(&client->osdc);
+fail_monc:
+ ceph_monc_stop(&client->monc);
+fail_mempool:
+ mempool_destroy(client->wb_pagevec_pool);
+fail_trunc_wq:
+ destroy_workqueue(client->trunc_wq);
+fail_pg_inv_wq:
+ destroy_workqueue(client->pg_inv_wq);
+fail_wb_wq:
+ destroy_workqueue(client->wb_wq);
+fail_bdi:
+ bdi_destroy(&client->backing_dev_info);
+fail:
+ kfree(client);

+ return ERR_PTR(err);
+}
+

+static void ceph_destroy_client(struct ceph_client *client)
+{
+ dout("destroy_client %p\n", client);
+
+ /* unmount */
+ ceph_mdsc_stop(&client->mdsc);
+ ceph_monc_stop(&client->monc);
+ ceph_osdc_stop(&client->osdc);
+
+ ceph_adjust_min_caps(-client->min_caps);
+
+ ceph_debugfs_client_cleanup(client);
+ destroy_workqueue(client->wb_wq);
+ destroy_workqueue(client->pg_inv_wq);
+ destroy_workqueue(client->trunc_wq);
+
+ if (client->msgr)
+ ceph_messenger_destroy(client->msgr);
+ mempool_destroy(client->wb_pagevec_pool);
+
+ destroy_mount_args(client->mount_args);
+
+ kfree(client);
+ dout("destroy_client %p done\n", client);
+}
+
+/*
+ * Initially learn our fsid, or verify an fsid matches.
+ */
+int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid)
+{
+ if (client->have_fsid) {
+ if (ceph_fsid_compare(&client->fsid, fsid)) {
+ pr_err("bad fsid, had " FSID_FORMAT " got " FSID_FORMAT,
+ PR_FSID(&client->fsid), PR_FSID(fsid));
+ return -1;
+ }
+ } else {
+ pr_info("client%lld fsid " FSID_FORMAT "\n",
+ client->monc.auth->global_id, PR_FSID(fsid));
+ memcpy(&client->fsid, fsid, sizeof(*fsid));
+ ceph_debugfs_client_init(client);
+ client->have_fsid = true;

+ }
+ return 0;
+}
+

+/*
+ * true if we have the mon map (and have thus joined the cluster)
+ */
+static int have_mon_map(struct ceph_client *client)
+{
+ return client->monc.monmap && client->monc.monmap->epoch;
+}
+
+/*
+ * Bootstrap mount by opening the root directory. Note the mount
+ * @started time from caller, and time out if this takes too long.
+ */
+static struct dentry *open_root_dentry(struct ceph_client *client,
+ const char *path,
+ unsigned long started)
+{

+ struct ceph_mds_client *mdsc = &client->mdsc;

+ struct ceph_mds_request *req = NULL;
+ int err;
+ struct dentry *root;
+
+ /* open dir */
+ dout("open_root_inode opening '%s'\n", path);
+ req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
+ if (IS_ERR(req))
+ return ERR_PTR(PTR_ERR(req));
+ req->r_path1 = kstrdup(path, GFP_NOFS);
+ req->r_ino1.ino = CEPH_INO_ROOT;
+ req->r_ino1.snap = CEPH_NOSNAP;
+ req->r_started = started;
+ req->r_timeout = client->mount_args->mount_timeout * HZ;

+ req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE);

+ req->r_num_caps = 2;
+ err = ceph_mdsc_do_request(mdsc, NULL, req);

+ if (err == 0) {

+ dout("open_root_inode success\n");
+ if (ceph_ino(req->r_target_inode) == CEPH_INO_ROOT &&
+ client->sb->s_root == NULL)
+ root = d_alloc_root(req->r_target_inode);
+ else
+ root = d_obtain_alias(req->r_target_inode);
+ req->r_target_inode = NULL;
+ dout("open_root_inode success, root dentry is %p\n", root);
+ } else {
+ root = ERR_PTR(err);
+ }
+ ceph_mdsc_put_request(req);
+ return root;
+}
+
+/*
+ * mount: join the ceph cluster, and open root directory.
+ */
+static int ceph_mount(struct ceph_client *client, struct vfsmount *mnt,
+ const char *path)
+{
+ struct ceph_entity_addr *myaddr = NULL;
+ int err;
+ unsigned long timeout = client->mount_args->mount_timeout * HZ;
+ unsigned long started = jiffies; /* note the start time */
+ struct dentry *root;
+
+ dout("mount start\n");
+ mutex_lock(&client->mount_mutex);
+
+ /* initialize the messenger */
+ if (client->msgr == NULL) {
+ if (ceph_test_opt(client, MYIP))
+ myaddr = &client->mount_args->my_addr;
+ client->msgr = ceph_messenger_create(myaddr);
+ if (IS_ERR(client->msgr)) {
+ err = PTR_ERR(client->msgr);
+ client->msgr = NULL;
+ goto out;
+ }
+ client->msgr->nocrc = ceph_test_opt(client, NOCRC);
+ }
+
+ /* open session, and wait for mon, mds, and osd maps */
+ err = ceph_monc_open_session(&client->monc);
+ if (err < 0)
+ goto out;
+
+ while (!have_mon_map(client)) {
+ err = -EIO;
+ if (timeout && time_after_eq(jiffies, started + timeout))
+ goto out;
+
+ /* wait */
+ dout("mount waiting for mon_map\n");
+ err = wait_event_interruptible_timeout(client->auth_wq,
+ have_mon_map(client) || (client->auth_err < 0),
+ timeout);
+ if (err == -EINTR || err == -ERESTARTSYS)
+ goto out;
+ if (client->auth_err < 0) {
+ err = client->auth_err;

+ goto out;
+ }
+ }
+

+ dout("mount opening root\n");
+ root = open_root_dentry(client, "", started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ goto out;
+ }
+ if (client->sb->s_root)
+ dput(root);
+ else
+ client->sb->s_root = root;
+
+ if (path[0] == 0) {
+ dget(root);
+ } else {
+ dout("mount opening base mountpoint\n");
+ root = open_root_dentry(client, path, started);
+ if (IS_ERR(root)) {
+ err = PTR_ERR(root);
+ dput(client->sb->s_root);
+ client->sb->s_root = NULL;

+ goto out;
+ }
+ }
+

+ mnt->mnt_root = root;
+ mnt->mnt_sb = client->sb;
+
+ client->mount_state = CEPH_MOUNT_MOUNTED;
+ dout("mount success\n");

+ err = 0;
+

+out:
+ mutex_unlock(&client->mount_mutex);

+ return err;
+}
+

+static int ceph_set_super(struct super_block *s, void *data)
+{
+ struct ceph_client *client = data;
+ int ret;
+
+ dout("set_super %p data %p\n", s, data);
+
+ s->s_flags = client->mount_args->sb_flags;
+ s->s_maxbytes = 1ULL << 40; /* temp value until we get mdsmap */
+
+ s->s_fs_info = client;
+ client->sb = s;
+
+ s->s_op = &ceph_super_ops;
+ s->s_export_op = &ceph_export_ops;
+
+ s->s_time_gran = 1000; /* 1000 ns == 1 us */
+
+ ret = set_anon_super(s, NULL); /* what is that second arg for? */
+ if (ret != 0)
+ goto fail;
+
+ return ret;
+
+fail:
+ s->s_fs_info = NULL;
+ client->sb = NULL;
+ return ret;
+}
+
+/*
+ * share superblock if same fs AND options
+ */
+static int ceph_compare_super(struct super_block *sb, void *data)
+{
+ struct ceph_client *new = data;
+ struct ceph_mount_args *args = new->mount_args;
+ struct ceph_client *other = ceph_sb_to_client(sb);
+ int i;
+
+ dout("ceph_compare_super %p\n", sb);
+ if (args->flags & CEPH_OPT_FSID) {
+ if (ceph_fsid_compare(&args->fsid, &other->fsid)) {
+ dout("fsid doesn't match\n");
+ return 0;
+ }
+ } else {
+ /* do we share (a) monitor? */
+ for (i = 0; i < new->monc.monmap->num_mon; i++)
+ if (ceph_monmap_contains(other->monc.monmap,
+ &new->monc.monmap->mon_inst[i].addr))
+ break;
+ if (i == new->monc.monmap->num_mon) {
+ dout("mon ip not part of monmap\n");
+ return 0;
+ }
+ dout("mon ip matches existing sb %p\n", sb);
+ }
+ if (args->sb_flags != other->mount_args->sb_flags) {
+ dout("flags differ\n");

+ return 0;
+ }
+ return 1;
+}
+
+/*

+ * construct our own bdi so we can control readahead, etc.
+ */
+static int ceph_register_bdi(struct super_block *sb, struct ceph_client *client)

+{
+ int err;
+

+ sb->s_bdi = &client->backing_dev_info;
+
+ /* set ra_pages based on rsize mount option? */
+ if (client->mount_args->rsize >= PAGE_CACHE_SIZE)
+ client->backing_dev_info.ra_pages =
+ (client->mount_args->rsize + PAGE_CACHE_SIZE - 1)
+ >> PAGE_SHIFT;
+ err = bdi_register_dev(&client->backing_dev_info, sb->s_dev);

+ return err;
+}
+

+static int ceph_get_sb(struct file_system_type *fs_type,
+ int flags, const char *dev_name, void *data,
+ struct vfsmount *mnt)
+{
+ struct super_block *sb;
+ struct ceph_client *client;
+ int err;
+ int (*compare_super)(struct super_block *, void *) = ceph_compare_super;
+ const char *path = NULL;
+ struct ceph_mount_args *args;
+
+ dout("ceph_get_sb\n");
+ args = parse_mount_args(flags, data, dev_name, &path);
+ if (IS_ERR(args)) {
+ err = PTR_ERR(args);

+ goto out_final;
+ }
+

+ /* create client (which we may/may not use) */
+ client = ceph_create_client(args);
+ if (IS_ERR(client)) {
+ err = PTR_ERR(client);

+ goto out_final;
+ }
+

+ if (client->mount_args->flags & CEPH_OPT_NOSHARE)
+ compare_super = NULL;
+ sb = sget(fs_type, compare_super, ceph_set_super, client);
+ if (IS_ERR(sb)) {
+ err = PTR_ERR(sb);

+ goto out;
+ }
+

+ if (ceph_client(sb) != client) {
+ ceph_destroy_client(client);
+ client = ceph_client(sb);
+ dout("get_sb got existing client %p\n", client);
+ } else {
+ dout("get_sb using new client %p\n", client);
+ err = ceph_register_bdi(sb, client);
+ if (err < 0)
+ goto out_splat;
+ }
+
+ err = ceph_mount(client, mnt, path);
+ if (err < 0)
+ goto out_splat;
+ dout("root %p inode %p ino %llx.%llx\n", mnt->mnt_root,
+ mnt->mnt_root->d_inode, ceph_vinop(mnt->mnt_root->d_inode));
+ return 0;
+
+out_splat:
+ ceph_mdsc_close_sessions(&client->mdsc);
+ up_write(&sb->s_umount);
+ deactivate_super(sb);
+ goto out_final;
+
+out:
+ ceph_destroy_client(client);
+out_final:
+ dout("ceph_get_sb fail %d\n", err);

+ return err;
+}
+

+static void ceph_kill_sb(struct super_block *s)
+{
+ struct ceph_client *client = ceph_sb_to_client(s);
+ dout("kill_sb %p\n", s);
+ ceph_mdsc_pre_umount(&client->mdsc);
+ kill_anon_super(s); /* will call put_super after sb is r/o */
+ if (s->s_bdi == &client->backing_dev_info)
+ bdi_unregister(&client->backing_dev_info);
+ bdi_destroy(&client->backing_dev_info);
+ ceph_destroy_client(client);
+}
+
+static struct file_system_type ceph_fs_type = {
+ .owner = THIS_MODULE,
+ .name = "ceph",
+ .get_sb = ceph_get_sb,
+ .kill_sb = ceph_kill_sb,
+ .fs_flags = FS_RENAME_DOES_D_MOVE,
+};
+
+#define _STRINGIFY(x) #x
+#define STRINGIFY(x) _STRINGIFY(x)
+
+static int __init init_ceph(void)
+{
+ int ret = 0;
+
+ ret = ceph_debugfs_init();
+ if (ret < 0)
+ goto out;
+
+ ret = ceph_msgr_init();
+ if (ret < 0)
+ goto out_debugfs;
+
+ ret = init_caches();
+ if (ret)
+ goto out_msgr;
+
+ ceph_caps_init();
+
+ ret = register_filesystem(&ceph_fs_type);
+ if (ret)
+ goto out_icache;
+
+ pr_info("loaded %d.%d.%d (mon/mds/osd proto %d/%d/%d)\n",
+ CEPH_VERSION_MAJOR, CEPH_VERSION_MINOR, CEPH_VERSION_PATCH,
+ CEPH_MONC_PROTOCOL, CEPH_MDSC_PROTOCOL, CEPH_OSDC_PROTOCOL);
+ return 0;
+
+out_icache:
+ destroy_caches();
+out_msgr:
+ ceph_msgr_exit();
+out_debugfs:
+ ceph_debugfs_cleanup();
+out:

+ return ret;
+}
+

+static void __exit exit_ceph(void)
+{
+ dout("exit_ceph\n");
+ unregister_filesystem(&ceph_fs_type);
+ ceph_caps_finalize();
+ destroy_caches();
+ ceph_msgr_exit();
+ ceph_debugfs_cleanup();
+}
+
+module_init(init_ceph);
+module_exit(exit_ceph);
+
+MODULE_AUTHOR("Sage Weil <sa...@newdream.net>");
+MODULE_AUTHOR("Yehuda Sadeh <yeh...@hq.newdream.net>");
+MODULE_AUTHOR("Patience Warnick <pati...@newdream.net>");
+MODULE_DESCRIPTION("Ceph filesystem for Linux");
+MODULE_LICENSE("GPL");

Message has been deleted

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

The monitor cluster is responsible for managing cluster membership
and state. The monitor client handles what minimal interaction
the Ceph client has with it: checking for updated versions of the
MDS and OSD maps, getting statfs() information, and unmounting.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/mon_client.c | 834 ++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/mon_client.h | 119 +++++++
2 files changed, 953 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/mon_client.c
create mode 100644 fs/ceph/mon_client.h

diff --git a/fs/ceph/mon_client.c b/fs/ceph/mon_client.c
new file mode 100644
index 0000000..890597c
--- /dev/null
+++ b/fs/ceph/mon_client.c
@@ -0,0 +1,834 @@
+#include "ceph_debug.h"
+
+#include <linux/types.h>
+#include <linux/random.h>
+#include <linux/sched.h>
+
+#include "mon_client.h"
+#include "super.h"
+#include "auth.h"
+#include "decode.h"
+
+/*
+ * Interact with Ceph monitor cluster. Handle requests for new map
+ * versions, and periodically resend as needed. Also implement
+ * statfs() and umount().
+ *
+ * A small cluster of Ceph "monitors" are responsible for managing critical
+ * cluster configuration and state information. An odd number (e.g., 3, 5)
+ * of cmon daemons use a modified version of the Paxos part-time parliament
+ * algorithm to manage the MDS map (mds cluster membership), OSD map, and
+ * list of clients who have mounted the file system.
+ *
+ * We maintain an open, active session with a monitor at all times in order to
+ * receive timely MDSMap updates. We periodically send a keepalive byte on the
+ * TCP socket to ensure we detect a failure. If the connection does break, we
+ * randomly hunt for a new monitor. Once the connection is reestablished, we
+ * resend any outstanding requests.
+ */
+
+const static struct ceph_connection_operations mon_con_ops;
+
+static int __validate_auth(struct ceph_mon_client *monc);
+
+/*
+ * Decode a monmap blob (e.g., during mount).
+ */
+struct ceph_monmap *ceph_monmap_decode(void *p, void *end)
+{
+ struct ceph_monmap *m = NULL;
+ int i, err = -EINVAL;
+ struct ceph_fsid fsid;
+ u32 epoch, num_mon;
+ u16 version;
+ u32 len;
+
+ ceph_decode_32_safe(&p, end, len, bad);
+ ceph_decode_need(&p, end, len, bad);
+
+ dout("monmap_decode %p %p len %d\n", p, end, (int)(end-p));
+
+ ceph_decode_16_safe(&p, end, version, bad);
+
+ ceph_decode_need(&p, end, sizeof(fsid) + 2*sizeof(u32), bad);
+ ceph_decode_copy(&p, &fsid, sizeof(fsid));
+ epoch = ceph_decode_32(&p);
+
+ num_mon = ceph_decode_32(&p);
+ ceph_decode_need(&p, end, num_mon*sizeof(m->mon_inst[0]), bad);
+
+ if (num_mon >= CEPH_MAX_MON)
+ goto bad;
+ m = kmalloc(sizeof(*m) + sizeof(m->mon_inst[0])*num_mon, GFP_NOFS);
+ if (m == NULL)
+ return ERR_PTR(-ENOMEM);
+ m->fsid = fsid;
+ m->epoch = epoch;
+ m->num_mon = num_mon;
+ ceph_decode_copy(&p, m->mon_inst, num_mon*sizeof(m->mon_inst[0]));
+ for (i = 0; i < num_mon; i++)
+ ceph_decode_addr(&m->mon_inst[i].addr);
+
+ dout("monmap_decode epoch %d, num_mon %d\n", m->epoch,
+ m->num_mon);
+ for (i = 0; i < m->num_mon; i++)
+ dout("monmap_decode mon%d is %s\n", i,
+ pr_addr(&m->mon_inst[i].addr.in_addr));
+ return m;
+
+bad:
+ dout("monmap_decode failed with %d\n", err);
+ kfree(m);

+ return ERR_PTR(err);
+}
+

+/*
+ * return true if *addr is included in the monmap.
+ */
+int ceph_monmap_contains(struct ceph_monmap *m, struct ceph_entity_addr *addr)
+{
+ int i;
+
+ for (i = 0; i < m->num_mon; i++)
+ if (memcmp(addr, &m->mon_inst[i].addr, sizeof(*addr)) == 0)
+ return 1;

+ return 0;
+}
+
+/*

+ * Send an auth request.
+ */
+static void __send_prepared_auth_request(struct ceph_mon_client *monc, int len)
+{
+ monc->pending_auth = 1;
+ monc->m_auth->front.iov_len = len;
+ monc->m_auth->hdr.front_len = cpu_to_le32(len);
+ ceph_msg_get(monc->m_auth); /* keep our ref */
+ ceph_con_send(monc->con, monc->m_auth);
+}
+
+/*
+ * Close monitor session, if any.
+ */
+static void __close_session(struct ceph_mon_client *monc)
+{
+ if (monc->con) {
+ dout("__close_session closing mon%d\n", monc->cur_mon);
+ ceph_con_revoke(monc->con, monc->m_auth);
+ ceph_con_close(monc->con);
+ monc->cur_mon = -1;
+ monc->pending_auth = 0;
+ ceph_auth_reset(monc->auth);
+ }
+}
+
+/*
+ * Open a session with a (new) monitor.
+ */
+static int __open_session(struct ceph_mon_client *monc)
+{
+ char r;
+ int ret;
+
+ if (monc->cur_mon < 0) {
+ get_random_bytes(&r, 1);
+ monc->cur_mon = r % monc->monmap->num_mon;
+ dout("open_session num=%d r=%d -> mon%d\n",
+ monc->monmap->num_mon, r, monc->cur_mon);
+ monc->sub_sent = 0;
+ monc->sub_renew_after = jiffies; /* i.e., expired */
+ monc->want_next_osdmap = !!monc->want_next_osdmap;
+
+ dout("open_session mon%d opening\n", monc->cur_mon);
+ monc->con->peer_name.type = CEPH_ENTITY_TYPE_MON;
+ monc->con->peer_name.num = cpu_to_le64(monc->cur_mon);
+ ceph_con_open(monc->con,
+ &monc->monmap->mon_inst[monc->cur_mon].addr);
+
+ /* initiatiate authentication handshake */
+ ret = ceph_auth_build_hello(monc->auth,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ __send_prepared_auth_request(monc, ret);
+ } else {
+ dout("open_session mon%d already open\n", monc->cur_mon);

+ }
+ return 0;
+}
+

+static bool __sub_expired(struct ceph_mon_client *monc)
+{
+ return time_after_eq(jiffies, monc->sub_renew_after);
+}
+
+/*
+ * Reschedule delayed work timer.
+ */
+static void __schedule_delayed(struct ceph_mon_client *monc)
+{
+ unsigned delay;
+
+ if (monc->cur_mon < 0 || __sub_expired(monc))
+ delay = 10 * HZ;
+ else
+ delay = 20 * HZ;
+ dout("__schedule_delayed after %u\n", delay);
+ schedule_delayed_work(&monc->delayed_work, delay);
+}
+
+/*
+ * Send subscribe request for mdsmap and/or osdmap.
+ */
+static void __send_subscribe(struct ceph_mon_client *monc)
+{
+ dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
+ (unsigned)monc->sub_sent, __sub_expired(monc),
+ monc->want_next_osdmap);
+ if ((__sub_expired(monc) && !monc->sub_sent) ||
+ monc->want_next_osdmap == 1) {
+ struct ceph_msg *msg;
+ struct ceph_mon_subscribe_item *i;

+ void *p, *end;
+

+ msg = ceph_msg_new(CEPH_MSG_MON_SUBSCRIBE, 96, 0, 0, NULL);
+ if (!msg)
+ return;
+

+ p = msg->front.iov_base;

+ end = p + msg->front.iov_len;
+
+ dout("__send_subscribe to 'mdsmap' %u+\n",
+ (unsigned)monc->have_mdsmap);
+ if (monc->want_next_osdmap) {
+ dout("__send_subscribe to 'osdmap' %u\n",
+ (unsigned)monc->have_osdmap);
+ ceph_encode_32(&p, 3);
+ ceph_encode_string(&p, end, "osdmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_osdmap);
+ i->onetime = 1;
+ p += sizeof(*i);
+ monc->want_next_osdmap = 2; /* requested */
+ } else {
+ ceph_encode_32(&p, 2);
+ }
+ ceph_encode_string(&p, end, "mdsmap", 6);
+ i = p;
+ i->have = cpu_to_le64(monc->have_mdsmap);
+ i->onetime = 0;
+ p += sizeof(*i);
+ ceph_encode_string(&p, end, "monmap", 6);
+ i = p;
+ i->have = 0;
+ i->onetime = 0;
+ p += sizeof(*i);
+
+ msg->front.iov_len = p - msg->front.iov_base;
+ msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
+ ceph_con_send(monc->con, msg);
+
+ monc->sub_sent = jiffies | 1; /* never 0 */
+ }
+}
+
+static void handle_subscribe_ack(struct ceph_mon_client *monc,

+ struct ceph_msg *msg)
+{

+ unsigned seconds;
+ struct ceph_mon_subscribe_ack *h = msg->front.iov_base;
+

+ if (msg->front.iov_len < sizeof(*h))
+ goto bad;

+ seconds = le32_to_cpu(h->duration);
+
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ pr_info("mon%d %s session established\n",
+ monc->cur_mon, pr_addr(&monc->con->peer_addr.in_addr));
+ monc->hunting = false;
+ }
+ dout("handle_subscribe_ack after %d seconds\n", seconds);
+ monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
+ monc->sub_sent = 0;
+ mutex_unlock(&monc->mutex);
+ return;
+bad:
+ pr_err("got corrupt subscribe-ack msg\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * Keep track of which maps we have
+ */
+int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_mdsmap = got;
+ mutex_unlock(&monc->mutex);

+ return 0;
+}
+

+int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got)
+{
+ mutex_lock(&monc->mutex);
+ monc->have_osdmap = got;
+ monc->want_next_osdmap = 0;
+ mutex_unlock(&monc->mutex);

+ return 0;
+}
+
+/*

+ * Register interest in the next osdmap
+ */
+void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
+{
+ dout("request_next_osdmap have %u\n", monc->have_osdmap);
+ mutex_lock(&monc->mutex);
+ if (!monc->want_next_osdmap)
+ monc->want_next_osdmap = 1;
+ if (monc->want_next_osdmap < 2)
+ __send_subscribe(monc);
+ mutex_unlock(&monc->mutex);

+}
+
+/*
+ *

+ */
+int ceph_monc_open_session(struct ceph_mon_client *monc)
+{
+ if (!monc->con) {
+ monc->con = kmalloc(sizeof(*monc->con), GFP_KERNEL);
+ if (!monc->con)
+ return -ENOMEM;
+ ceph_con_init(monc->client->msgr, monc->con);
+ monc->con->private = monc;
+ monc->con->ops = &mon_con_ops;
+ }
+
+ mutex_lock(&monc->mutex);
+ __open_session(monc);
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);

+ return 0;
+}
+
+/*

+ * The monitor responds with mount ack indicate mount success. The
+ * included client ticket allows the client to talk to MDSs and OSDs.
+ */
+static void ceph_monc_handle_map(struct ceph_mon_client *monc,
+ struct ceph_msg *msg)
+{
+ struct ceph_client *client = monc->client;
+ struct ceph_monmap *monmap = NULL, *old = monc->monmap;

+ void *p, *end;
+

+ mutex_lock(&monc->mutex);
+
+ dout("handle_monmap\n");

+ p = msg->front.iov_base;

+ end = p + msg->front.iov_len;
+
+ monmap = ceph_monmap_decode(p, end);
+ if (IS_ERR(monmap)) {
+ pr_err("problem decoding monmap, %d\n",
+ (int)PTR_ERR(monmap));

+ goto out;
+ }
+

+ if (ceph_check_fsid(monc->client, &monmap->fsid) < 0) {
+ kfree(monmap);

+ goto out;
+ }
+

+ client->monc.monmap = monmap;
+ kfree(old);
+
+out:
+ mutex_unlock(&monc->mutex);
+ wake_up(&client->auth_wq);
+}
+
+/*
+ * statfs
+ */
+static struct ceph_mon_statfs_request *__lookup_statfs(
+ struct ceph_mon_client *monc, u64 tid)
+{
+ struct ceph_mon_statfs_request *req;
+ struct rb_node *n = monc->statfs_request_tree.rb_node;
+
+ while (n) {
+ req = rb_entry(n, struct ceph_mon_statfs_request, node);
+ if (tid < req->tid)

+ n = n->rb_left;

+ else if (tid > req->tid)

+ n = n->rb_right;
+ else

+ return req;

+ }
+ return NULL;
+}
+

+static void __insert_statfs(struct ceph_mon_client *monc,
+ struct ceph_mon_statfs_request *new)
+{
+ struct rb_node **p = &monc->statfs_request_tree.rb_node;

+ struct rb_node *parent = NULL;

+ struct ceph_mon_statfs_request *req = NULL;

+
+ while (*p) {
+ parent = *p;

+ req = rb_entry(parent, struct ceph_mon_statfs_request, node);
+ if (new->tid < req->tid)

+ p = &(*p)->rb_left;

+ else if (new->tid > req->tid)

+ p = &(*p)->rb_right;
+ else
+ BUG();
+ }
+
+ rb_link_node(&new->node, parent, p);

+ rb_insert_color(&new->node, &monc->statfs_request_tree);
+}
+
+static void handle_statfs_reply(struct ceph_mon_client *monc,

+ struct ceph_msg *msg)
+{

+ struct ceph_mon_statfs_request *req;
+ struct ceph_mon_statfs_reply *reply = msg->front.iov_base;
+ u64 tid;
+
+ if (msg->front.iov_len != sizeof(*reply))
+ goto bad;

+ tid = le64_to_cpu(msg->hdr.tid);

+ dout("handle_statfs_reply %p tid %llu\n", msg, tid);
+
+ mutex_lock(&monc->mutex);
+ req = __lookup_statfs(monc, tid);
+ if (req) {
+ *req->buf = reply->st;
+ req->result = 0;
+ }
+ mutex_unlock(&monc->mutex);
+ if (req)
+ complete(&req->completion);
+ return;
+
+bad:
+ pr_err("corrupt statfs reply, no tid\n");
+ ceph_msg_dump(msg);
+}
+
+/*
+ * (re)send a statfs request
+ */
+static int send_statfs(struct ceph_mon_client *monc,
+ struct ceph_mon_statfs_request *req)

+{
+ struct ceph_msg *msg;

+ struct ceph_mon_statfs *h;
+
+ dout("send_statfs tid %llu\n", req->tid);
+ msg = ceph_msg_new(CEPH_MSG_STATFS, sizeof(*h), 0, 0, NULL);

+ if (IS_ERR(msg))
+ return PTR_ERR(msg);

+ req->request = msg;
+ msg->hdr.tid = cpu_to_le64(req->tid);

+ h = msg->front.iov_base;

+ h->monhdr.have_version = 0;
+ h->monhdr.session_mon = cpu_to_le16(-1);
+ h->monhdr.session_mon_tid = 0;
+ h->fsid = monc->monmap->fsid;
+ ceph_con_send(monc->con, msg);

+ return 0;
+}
+
+/*

+ * Do a synchronous statfs().
+ */
+int ceph_monc_do_statfs(struct ceph_mon_client *monc, struct ceph_statfs *buf)
+{
+ struct ceph_mon_statfs_request req;
+ int err;
+
+ req.buf = buf;
+ init_completion(&req.completion);
+
+ /* allocate memory for reply */
+ err = ceph_msgpool_resv(&monc->msgpool_statfs_reply, 1);
+ if (err)
+ return err;
+
+ /* register request */
+ mutex_lock(&monc->mutex);
+ req.tid = ++monc->last_tid;
+ req.last_attempt = jiffies;
+ req.delay = BASE_DELAY_INTERVAL;
+ __insert_statfs(monc, &req);
+ monc->num_statfs_requests++;
+ mutex_unlock(&monc->mutex);
+
+ /* send request and wait */
+ err = send_statfs(monc, &req);
+ if (!err)
+ err = wait_for_completion_interruptible(&req.completion);
+
+ mutex_lock(&monc->mutex);
+ rb_erase(&req.node, &monc->statfs_request_tree);
+ monc->num_statfs_requests--;
+ ceph_msgpool_resv(&monc->msgpool_statfs_reply, -1);
+ mutex_unlock(&monc->mutex);
+
+ if (!err)
+ err = req.result;

+ return err;
+}
+

+/*
+ * Resend pending statfs requests.
+ */
+static void __resend_statfs(struct ceph_mon_client *monc)
+{
+ struct ceph_mon_statfs_request *req;

+ struct rb_node *p;
+

+ for (p = rb_first(&monc->statfs_request_tree); p; p = rb_next(p)) {
+ req = rb_entry(p, struct ceph_mon_statfs_request, node);
+ send_statfs(monc, req);
+ }
+}
+
+/*
+ * Delayed work. If we haven't mounted yet, retry. Otherwise,
+ * renew/retry subscription as needed (in case it is timing out, or we
+ * got an ENOMEM). And keep the monitor connection alive.
+ */
+static void delayed_work(struct work_struct *work)
+{
+ struct ceph_mon_client *monc =
+ container_of(work, struct ceph_mon_client, delayed_work.work);
+
+ dout("monc delayed_work\n");
+ mutex_lock(&monc->mutex);
+ if (monc->hunting) {
+ __close_session(monc);
+ __open_session(monc); /* continue hunting */
+ } else {
+ ceph_con_keepalive(monc->con);
+
+ __validate_auth(monc);
+
+ if (monc->auth->ops->is_authenticated(monc->auth))
+ __send_subscribe(monc);
+ }
+ __schedule_delayed(monc);
+ mutex_unlock(&monc->mutex);
+}
+
+/*
+ * On startup, we build a temporary monmap populated with the IPs
+ * provided by mount(2).
+ */
+static int build_initial_monmap(struct ceph_mon_client *monc)
+{
+ struct ceph_mount_args *args = monc->client->mount_args;
+ struct ceph_entity_addr *mon_addr = args->mon_addr;
+ int num_mon = args->num_mon;
+ int i;
+
+ /* build initial monmap */
+ monc->monmap = kzalloc(sizeof(*monc->monmap) +
+ num_mon*sizeof(monc->monmap->mon_inst[0]),
+ GFP_KERNEL);
+ if (!monc->monmap)
+ return -ENOMEM;
+ for (i = 0; i < num_mon; i++) {
+ monc->monmap->mon_inst[i].addr = mon_addr[i];
+ monc->monmap->mon_inst[i].addr.nonce = 0;
+ monc->monmap->mon_inst[i].name.type =
+ CEPH_ENTITY_TYPE_MON;
+ monc->monmap->mon_inst[i].name.num = cpu_to_le64(i);
+ }
+ monc->monmap->num_mon = num_mon;
+ monc->have_fsid = false;
+
+ /* release addr memory */
+ kfree(args->mon_addr);
+ args->mon_addr = NULL;
+ args->num_mon = 0;

+ return 0;
+}
+

+int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
+{

+ int err = 0;
+

+ dout("init\n");

+ memset(monc, 0, sizeof(*monc));
+ monc->client = cl;
+ monc->monmap = NULL;
+ mutex_init(&monc->mutex);
+
+ err = build_initial_monmap(monc);
+ if (err)
+ goto out;
+
+ monc->con = NULL;
+
+ /* authentication */
+ monc->auth = ceph_auth_init(cl->mount_args->name,
+ cl->mount_args->secret);
+ if (IS_ERR(monc->auth))
+ return PTR_ERR(monc->auth);
+ monc->auth->want_keys =
+ CEPH_ENTITY_TYPE_AUTH | CEPH_ENTITY_TYPE_MON |
+ CEPH_ENTITY_TYPE_OSD | CEPH_ENTITY_TYPE_MDS;
+
+ /* msg pools */
+ err = ceph_msgpool_init(&monc->msgpool_subscribe_ack,
+ sizeof(struct ceph_mon_subscribe_ack), 1, false);
+ if (err < 0)
+ goto out_monmap;
+ err = ceph_msgpool_init(&monc->msgpool_statfs_reply,
+ sizeof(struct ceph_mon_statfs_reply), 0, false);
+ if (err < 0)
+ goto out_pool1;
+ err = ceph_msgpool_init(&monc->msgpool_auth_reply, 4096, 1, false);
+ if (err < 0)
+ goto out_pool2;
+
+ monc->m_auth = ceph_msg_new(CEPH_MSG_AUTH, 4096, 0, 0, NULL);
+ monc->pending_auth = 0;
+ if (IS_ERR(monc->m_auth)) {
+ err = PTR_ERR(monc->m_auth);
+ monc->m_auth = NULL;
+ goto out_pool3;
+ }
+
+ monc->cur_mon = -1;
+ monc->hunting = true;
+ monc->sub_renew_after = jiffies;
+ monc->sub_sent = 0;
+
+ INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
+ monc->statfs_request_tree = RB_ROOT;
+ monc->num_statfs_requests = 0;
+ monc->last_tid = 0;
+
+ monc->have_mdsmap = 0;
+ monc->have_osdmap = 0;
+ monc->want_next_osdmap = 1;
+ return 0;
+
+out_pool3:
+ ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+out_pool2:
+ ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+out_pool1:
+ ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+out_monmap:
+ kfree(monc->monmap);
+out:

+ return err;
+}
+

+void ceph_monc_stop(struct ceph_mon_client *monc)
+{
+ dout("stop\n");
+ cancel_delayed_work_sync(&monc->delayed_work);
+
+ mutex_lock(&monc->mutex);
+ __close_session(monc);
+ if (monc->con) {
+ monc->con->private = NULL;
+ monc->con->ops->put(monc->con);
+ monc->con = NULL;

+ }
+ mutex_unlock(&monc->mutex);
+

+ ceph_auth_destroy(monc->auth);
+
+ ceph_msg_put(monc->m_auth);
+ ceph_msgpool_destroy(&monc->msgpool_subscribe_ack);
+ ceph_msgpool_destroy(&monc->msgpool_statfs_reply);
+ ceph_msgpool_destroy(&monc->msgpool_auth_reply);
+
+ kfree(monc->monmap);
+}
+
+static void handle_auth_reply(struct ceph_mon_client *monc,

+ struct ceph_msg *msg)
+{

+ int ret;
+
+ mutex_lock(&monc->mutex);
+ monc->pending_auth = 0;
+ ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
+ msg->front.iov_len,
+ monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);
+ if (ret < 0) {
+ monc->client->auth_err = ret;
+ wake_up(&monc->client->auth_wq);
+ } else if (ret > 0) {
+ __send_prepared_auth_request(monc, ret);
+ } else if (monc->auth->ops->is_authenticated(monc->auth)) {
+ dout("authenticated, starting session\n");
+
+ monc->client->msgr->inst.name.type = CEPH_ENTITY_TYPE_CLIENT;
+ monc->client->msgr->inst.name.num = monc->auth->global_id;
+
+ __send_subscribe(monc);
+ __resend_statfs(monc);
+ }
+ mutex_unlock(&monc->mutex);
+}
+
+static int __validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ if (monc->pending_auth)
+ return 0;
+
+ ret = ceph_build_auth(monc->auth, monc->m_auth->front.iov_base,
+ monc->m_auth->front_max);

+ if (ret <= 0)

+ return ret; /* either an error, or no need to authenticate */
+ __send_prepared_auth_request(monc, ret);

+ return 0;
+}
+

+int ceph_monc_validate_auth(struct ceph_mon_client *monc)
+{
+ int ret;
+
+ mutex_lock(&monc->mutex);
+ ret = __validate_auth(monc);
+ mutex_unlock(&monc->mutex);

+ return ret;
+}
+
+/*

+ * handle incoming message
+ */

+static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
+{
+ struct ceph_mon_client *monc = con->private;

+ int type = le16_to_cpu(msg->hdr.type);
+

+ if (!monc)
+ return;
+
+ switch (type) {
+ case CEPH_MSG_AUTH_REPLY:
+ handle_auth_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ handle_subscribe_ack(monc, msg);
+ break;
+
+ case CEPH_MSG_STATFS_REPLY:
+ handle_statfs_reply(monc, msg);
+ break;
+
+ case CEPH_MSG_MON_MAP:
+ ceph_monc_handle_map(monc, msg);
+ break;
+
+ case CEPH_MSG_MDS_MAP:
+ ceph_mdsc_handle_map(&monc->client->mdsc, msg);
+ break;
+
+ case CEPH_MSG_OSD_MAP:
+ ceph_osdc_handle_map(&monc->client->osdc, msg);

+ break;
+
+ default:

+ pr_err("received unknown message type %d %s\n", type,
+ ceph_msg_type_name(type));
+ }
+ ceph_msg_put(msg);

+}
+
+/*
+ * Allocate memory for incoming message
+ */
+static struct ceph_msg *mon_alloc_msg(struct ceph_connection *con,

+ struct ceph_msg_header *hdr,
+ int *skip)
+{

+ struct ceph_mon_client *monc = con->private;

+ int type = le16_to_cpu(hdr->type);
+ int front_len = le32_to_cpu(hdr->front_len);

+ struct ceph_msg *m = NULL;

+
+ *skip = 0;

+
+ switch (type) {
+ case CEPH_MSG_MON_SUBSCRIBE_ACK:
+ m = ceph_msgpool_get(&monc->msgpool_subscribe_ack, front_len);
+ break;
+ case CEPH_MSG_STATFS_REPLY:
+ m = ceph_msgpool_get(&monc->msgpool_statfs_reply, front_len);
+ break;
+ case CEPH_MSG_AUTH_REPLY:
+ m = ceph_msgpool_get(&monc->msgpool_auth_reply, front_len);
+ break;
+ case CEPH_MSG_MON_MAP:
+ case CEPH_MSG_MDS_MAP:
+ case CEPH_MSG_OSD_MAP:
+ m = ceph_msg_new(type, front_len, 0, 0, NULL);
+ break;
+ }
+
+ if (!m) {
+ pr_info("alloc_msg unknown type %d\n", type);

+ *skip = 1;
+ }

+ return m;
+}
+
+/*
+ * If the monitor connection resets, pick a new monitor and resubmit
+ * any pending requests.
+ */
+static void mon_fault(struct ceph_connection *con)
+{
+ struct ceph_mon_client *monc = con->private;
+
+ if (!monc)
+ return;
+
+ dout("mon_fault\n");
+ mutex_lock(&monc->mutex);
+ if (!con->private)
+ goto out;
+
+ if (monc->con && !monc->hunting)
+ pr_info("mon%d %s session lost, "
+ "hunting for new mon\n", monc->cur_mon,
+ pr_addr(&monc->con->peer_addr.in_addr));
+
+ __close_session(monc);
+ if (!monc->hunting) {
+ /* start hunting */
+ monc->hunting = true;
+ __open_session(monc);
+ } else {
+ /* already hunting, let's wait a bit */
+ __schedule_delayed(monc);
+ }
+out:
+ mutex_unlock(&monc->mutex);
+}
+
+const static struct ceph_connection_operations mon_con_ops = {
+ .get = ceph_con_get,
+ .put = ceph_con_put,
+ .dispatch = dispatch,
+ .fault = mon_fault,
+ .alloc_msg = mon_alloc_msg,
+};
diff --git a/fs/ceph/mon_client.h b/fs/ceph/mon_client.h
new file mode 100644
index 0000000..b958ad5
--- /dev/null
+++ b/fs/ceph/mon_client.h
@@ -0,0 +1,119 @@
+#ifndef _FS_CEPH_MON_CLIENT_H
+#define _FS_CEPH_MON_CLIENT_H
+
+#include <linux/completion.h>
+#include <linux/rbtree.h>
+
+#include "messenger.h"
+#include "msgpool.h"
+
+struct ceph_client;
+struct ceph_mount_args;
+struct ceph_auth_client;
+
+/*
+ * The monitor map enumerates the set of all monitors.
+ */
+struct ceph_monmap {

+ struct ceph_fsid fsid;
+ u32 epoch;

+ u32 num_mon;
+ struct ceph_entity_inst mon_inst[0];
+};
+
+struct ceph_mon_client;
+struct ceph_mon_statfs_request;
+
+
+/*
+ * Generic mechanism for resending monitor requests.
+ */
+typedef void (*ceph_monc_request_func_t)(struct ceph_mon_client *monc,
+ int newmon);
+
+/* a pending monitor request */
+struct ceph_mon_request {
+ struct ceph_mon_client *monc;
+ struct delayed_work delayed_work;
+ unsigned long delay;
+ ceph_monc_request_func_t do_request;
+};
+
+/*
+ * statfs() is done a bit differently because we need to get data back
+ * to the caller
+ */
+struct ceph_mon_statfs_request {
+ u64 tid;
+ struct rb_node node;
+ int result;
+ struct ceph_statfs *buf;
+ struct completion completion;
+ unsigned long last_attempt, delay; /* jiffies */
+ struct ceph_msg *request; /* original request */
+};
+
+struct ceph_mon_client {
+ struct ceph_client *client;
+ struct ceph_monmap *monmap;

+
+ struct mutex mutex;

+ struct delayed_work delayed_work;
+
+ struct ceph_auth_client *auth;
+ struct ceph_msg *m_auth;
+ int pending_auth;
+
+ bool hunting;
+ int cur_mon; /* last monitor i contacted */
+ unsigned long sub_sent, sub_renew_after;
+ struct ceph_connection *con;
+ bool have_fsid;
+
+ /* msg pools */
+ struct ceph_msgpool msgpool_subscribe_ack;
+ struct ceph_msgpool msgpool_statfs_reply;
+ struct ceph_msgpool msgpool_auth_reply;
+
+ /* pending statfs requests */
+ struct rb_root statfs_request_tree;
+ int num_statfs_requests;
+ u64 last_tid;
+
+ /* mds/osd map */
+ int want_next_osdmap; /* 1 = want, 2 = want+asked */
+ u32 have_osdmap, have_mdsmap;
+

+#ifdef CONFIG_DEBUG_FS
+ struct dentry *debugfs_file;
+#endif
+};
+

+extern struct ceph_monmap *ceph_monmap_decode(void *p, void *end);
+extern int ceph_monmap_contains(struct ceph_monmap *m,
+ struct ceph_entity_addr *addr);
+
+extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
+extern void ceph_monc_stop(struct ceph_mon_client *monc);
+
+/*
+ * The model here is to indicate that we need a new map of at least
+ * epoch @want, and also call in when we receive a map. We will
+ * periodically rerequest the map from the monitor cluster until we
+ * get what we want.
+ */
+extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have);
+extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have);
+
+extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
+
+extern int ceph_monc_do_statfs(struct ceph_mon_client *monc,
+ struct ceph_statfs *buf);
+
+extern int ceph_monc_open_session(struct ceph_mon_client *monc);
+
+extern int ceph_monc_validate_auth(struct ceph_mon_client *monc);
+
+
+
+#endif

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

The msgpool is a basic mempool_t-like structure to preallocate
messages we expect to receive over the wire. This ensures we have the
necessary memory preallocated to process replies to requests, or to
process unsolicited messages from various servers.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/msgpool.c | 186 +++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/msgpool.h | 27 ++++++++
2 files changed, 213 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/msgpool.c
create mode 100644 fs/ceph/msgpool.h

diff --git a/fs/ceph/msgpool.c b/fs/ceph/msgpool.c
new file mode 100644
index 0000000..ca3b44a
--- /dev/null
+++ b/fs/ceph/msgpool.c
@@ -0,0 +1,186 @@
+#include "ceph_debug.h"
+
+#include <linux/err.h>
+#include <linux/sched.h>
+#include <linux/types.h>
+#include <linux/vmalloc.h>
+
+#include "msgpool.h"
+
+/*
+ * We use msg pools to preallocate memory for messages we expect to
+ * receive over the wire, to avoid getting ourselves into OOM
+ * conditions at unexpected times. We take use a few different
+ * strategies:
+ *
+ * - for request/response type interactions, we preallocate the
+ * memory needed for the response when we generate the request.
+ *
+ * - for messages we can receive at any time from the MDS, we preallocate
+ * a pool of messages we can re-use.
+ *
+ * - for writeback, we preallocate some number of messages to use for
+ * requests and their replies, so that we always make forward
+ * progress.
+ *
+ * The msgpool behaves like a mempool_t, but keeps preallocated
+ * ceph_msgs strung together on a list_head instead of using a pointer
+ * vector. This avoids vector reallocation when we adjust the number
+ * of preallocated items (which happens frequently).
+ */
+
+
+/*
+ * Allocate or release as necessary to meet our target pool size.
+ */
+static int __fill_msgpool(struct ceph_msgpool *pool)

+{
+ struct ceph_msg *msg;
+

+ while (pool->num < pool->min) {
+ dout("fill_msgpool %p %d/%d allocating\n", pool, pool->num,
+ pool->min);
+ spin_unlock(&pool->lock);
+ msg = ceph_msg_new(0, pool->front_len, 0, 0, NULL);
+ spin_lock(&pool->lock);

+ if (IS_ERR(msg))
+ return PTR_ERR(msg);

+ msg->pool = pool;
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ }
+ while (pool->num > pool->min) {
+ msg = list_first_entry(&pool->msgs, struct ceph_msg, list_head);
+ dout("fill_msgpool %p %d/%d releasing %p\n", pool, pool->num,
+ pool->min, msg);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ ceph_msg_kfree(msg);

+ }
+ return 0;
+}
+

+int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int min, bool blocking)

+{
+ int ret;
+

+ dout("msgpool_init %p front_len %d min %d\n", pool, front_len, min);
+ spin_lock_init(&pool->lock);
+ pool->front_len = front_len;
+ INIT_LIST_HEAD(&pool->msgs);
+ pool->num = 0;
+ pool->min = min;
+ pool->blocking = blocking;
+ init_waitqueue_head(&pool->wait);
+
+ spin_lock(&pool->lock);
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);

+ return ret;
+}
+

+void ceph_msgpool_destroy(struct ceph_msgpool *pool)
+{
+ dout("msgpool_destroy %p\n", pool);
+ spin_lock(&pool->lock);
+ pool->min = 0;
+ __fill_msgpool(pool);
+ spin_unlock(&pool->lock);
+}
+
+int ceph_msgpool_resv(struct ceph_msgpool *pool, int delta)

+{
+ int ret;
+

+ spin_lock(&pool->lock);
+ dout("msgpool_resv %p delta %d\n", pool, delta);
+ pool->min += delta;
+ ret = __fill_msgpool(pool);
+ spin_unlock(&pool->lock);

+ return ret;
+}
+

+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len)
+{
+ wait_queue_t wait;

+ struct ceph_msg *msg;
+

+ if (front_len && front_len > pool->front_len) {
+ pr_err("msgpool_get pool %p need front %d, pool size is %d\n",
+ pool, front_len, pool->front_len);
+ WARN_ON(1);
+
+ /* try to alloc a fresh message */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))

+ return msg;
+ }
+

+ if (!front_len)
+ front_len = pool->front_len;
+
+ if (pool->blocking) {
+ /* mempool_t behavior; first try to alloc */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))

+ return msg;
+ }
+

+ while (1) {
+ spin_lock(&pool->lock);
+ if (likely(pool->num)) {
+ msg = list_entry(pool->msgs.next, struct ceph_msg,
+ list_head);
+ list_del_init(&msg->list_head);
+ pool->num--;
+ dout("msgpool_get %p got %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ return msg;
+ }
+ pr_err("msgpool_get %p now %d/%d, %s\n", pool, pool->num,
+ pool->min, pool->blocking ? "waiting" : "may fail");
+ spin_unlock(&pool->lock);
+
+ if (!pool->blocking) {
+ WARN_ON(1);
+
+ /* maybe we can allocate it now? */
+ msg = ceph_msg_new(0, front_len, 0, 0, NULL);
+ if (!IS_ERR(msg))
+ return msg;
+
+ pr_err("msgpool_get %p empty + alloc failed\n", pool);

+ return ERR_PTR(-ENOMEM);
+ }
+

+ init_wait(&wait);
+ prepare_to_wait(&pool->wait, &wait, TASK_UNINTERRUPTIBLE);
+ schedule();
+ finish_wait(&pool->wait, &wait);
+ }
+}
+
+void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
+{
+ spin_lock(&pool->lock);
+ if (pool->num < pool->min) {
+ /* reset msg front_len; user may have changed it */
+ msg->front.iov_len = pool->front_len;
+ msg->hdr.front_len = cpu_to_le32(pool->front_len);
+
+ kref_set(&msg->kref, 1); /* retake a single ref */
+ list_add(&msg->list_head, &pool->msgs);
+ pool->num++;
+ dout("msgpool_put %p reclaim %p, now %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ wake_up(&pool->wait);
+ } else {
+ dout("msgpool_put %p drop %p, at %d/%d\n", pool, msg,
+ pool->num, pool->min);
+ spin_unlock(&pool->lock);
+ ceph_msg_kfree(msg);
+ }
+}
diff --git a/fs/ceph/msgpool.h b/fs/ceph/msgpool.h
new file mode 100644
index 0000000..bc834bf
--- /dev/null
+++ b/fs/ceph/msgpool.h
@@ -0,0 +1,27 @@
+#ifndef _FS_CEPH_MSGPOOL
+#define _FS_CEPH_MSGPOOL
+
+#include "messenger.h"
+
+/*
+ * we use memory pools for preallocating messages we may receive, to
+ * avoid unexpected OOM conditions.
+ */
+struct ceph_msgpool {
+ spinlock_t lock;
+ int front_len; /* preallocated payload size */
+ struct list_head msgs; /* msgs in the pool; each has 1 ref */
+ int num, min; /* cur, min # msgs in the pool */
+ bool blocking;
+ wait_queue_head_t wait;
+};
+
+extern int ceph_msgpool_init(struct ceph_msgpool *pool,
+ int front_len, int size, bool blocking);
+extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
+extern int ceph_msgpool_resv(struct ceph_msgpool *, int delta);
+extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
+ int front_len);
+extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

struct ceph_buffer is a simple ref-counted buffer. We transparently
choose between kmalloc for small buffers and vmalloc for large ones.

This is currently used only for allocating memory for xattr data.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/buffer.c | 78 ++++++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/buffer.h | 39 +++++++++++++++++++++++++++
2 files changed, 117 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/buffer.c
create mode 100644 fs/ceph/buffer.h

diff --git a/fs/ceph/buffer.c b/fs/ceph/buffer.c
new file mode 100644
index 0000000..b98086c
--- /dev/null
+++ b/fs/ceph/buffer.c
@@ -0,0 +1,78 @@
+
+#include "ceph_debug.h"
+#include "buffer.h"
+#include "decode.h"
+
+struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp)
+{
+ struct ceph_buffer *b;
+
+ b = kmalloc(sizeof(*b), gfp);
+ if (!b)
+ return NULL;
+
+ b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+ if (b->vec.iov_base) {
+ b->is_vmalloc = false;
+ } else {
+ b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+ if (!b->vec.iov_base) {
+ kfree(b);
+ return NULL;
+ }
+ b->is_vmalloc = true;
+ }
+
+ kref_init(&b->kref);
+ b->alloc_len = len;
+ b->vec.iov_len = len;
+ dout("buffer_new %p\n", b);

+ return b;
+}
+

+void ceph_buffer_release(struct kref *kref)
+{
+ struct ceph_buffer *b = container_of(kref, struct ceph_buffer, kref);
+
+ dout("buffer_release %p\n", b);
+ if (b->vec.iov_base) {
+ if (b->is_vmalloc)
+ vfree(b->vec.iov_base);
+ else
+ kfree(b->vec.iov_base);
+ }
+ kfree(b);
+}
+
+int ceph_buffer_alloc(struct ceph_buffer *b, int len, gfp_t gfp)
+{
+ b->vec.iov_base = kmalloc(len, gfp | __GFP_NOWARN);
+ if (b->vec.iov_base) {
+ b->is_vmalloc = false;
+ } else {
+ b->vec.iov_base = __vmalloc(len, gfp, PAGE_KERNEL);
+ b->is_vmalloc = true;
+ }
+ if (!b->vec.iov_base)
+ return -ENOMEM;
+ b->alloc_len = len;
+ b->vec.iov_len = len;

+ return 0;
+}
+

+int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end)
+{
+ size_t len;
+

+ ceph_decode_need(p, end, sizeof(u32), bad);

+ len = ceph_decode_32(p);
+ dout("decode_buffer len %d\n", (int)len);

+ ceph_decode_need(p, end, len, bad);

+ *b = ceph_buffer_new(len, GFP_NOFS);
+ if (!*b)
+ return -ENOMEM;
+ ceph_decode_copy(p, (*b)->vec.iov_base, len);
+ return 0;
+bad:
+ return -EINVAL;
+}
diff --git a/fs/ceph/buffer.h b/fs/ceph/buffer.h
new file mode 100644
index 0000000..58d1901
--- /dev/null
+++ b/fs/ceph/buffer.h
@@ -0,0 +1,39 @@
+#ifndef __FS_CEPH_BUFFER_H
+#define __FS_CEPH_BUFFER_H
+
+#include <linux/kref.h>
+#include <linux/mm.h>
+#include <linux/vmalloc.h>
+#include <linux/types.h>
+#include <linux/uio.h>
+
+/*
+ * a simple reference counted buffer.
+ *
+ * use kmalloc for small sizes (<= one page), vmalloc for larger
+ * sizes.
+ */
+struct ceph_buffer {
+ struct kref kref;
+ struct kvec vec;
+ size_t alloc_len;
+ bool is_vmalloc;
+};
+
+extern struct ceph_buffer *ceph_buffer_new(size_t len, gfp_t gfp);
+extern void ceph_buffer_release(struct kref *kref);
+
+static inline struct ceph_buffer *ceph_buffer_get(struct ceph_buffer *b)
+{
+ kref_get(&b->kref);
+ return b;
+}
+
+static inline void ceph_buffer_put(struct ceph_buffer *b)
+{
+ kref_put(&b->kref, ceph_buffer_release);
+}
+
+extern int ceph_decode_buffer(struct ceph_buffer **b, void **p, void *end);

Sage Weil

unread,

Mar 2, 2010, 8:00:03 PM3/2/10

This implements a trivial authentication scheme that provides
no actual authentication.

Signed-off-by: Sage Weil <sa...@newdream.net>
---

fs/ceph/auth_none.c | 121 +++++++++++++++++++++++++++++++++++++++++++++++++++
fs/ceph/auth_none.h | 28 ++++++++++++
2 files changed, 149 insertions(+), 0 deletions(-)
create mode 100644 fs/ceph/auth_none.c
create mode 100644 fs/ceph/auth_none.h

diff --git a/fs/ceph/auth_none.c b/fs/ceph/auth_none.c
new file mode 100644
index 0000000..b4ef6f0
--- /dev/null
+++ b/fs/ceph/auth_none.c
@@ -0,0 +1,121 @@
+

+#include "ceph_debug.h"
+
+#include <linux/err.h>

+#include <linux/module.h>
+#include <linux/random.h>
+

+#include "auth_none.h"

+#include "auth.h"
+#include "decode.h"
+

+static void reset(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+

+ xi->starting = true;

+ xi->built_authorizer = false;
+}
+
+static void destroy(struct ceph_auth_client *ac)

+{
+ kfree(ac->private);
+ ac->private = NULL;
+}
+

+static int is_authenticated(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+
+ return !xi->starting;
+}
+
+/*
+ * the generic auth code decode the global_id, and we carry no actual
+ * authenticate state, so nothing happens here.
+ */
+static int handle_reply(struct ceph_auth_client *ac, int result,
+ void *buf, void *end)
+{
+ struct ceph_auth_none_info *xi = ac->private;
+

+ xi->starting = false;

+ return result;
+}
+
+/*
+ * build an 'authorizer' with our entity_name and global_id. we can
+ * reuse a single static copy since it is identical for all services
+ * we connect to.
+ */
+static int ceph_auth_none_create_authorizer(

+ struct ceph_auth_client *ac, int peer_type,
+ struct ceph_authorizer **a,
+ void **buf, size_t *len,
+ void **reply_buf, size_t *reply_len)
+{

+ struct ceph_auth_none_info *ai = ac->private;
+ struct ceph_none_authorizer *au = &ai->au;
+ void *p, *end;
+ int ret;
+
+ if (!ai->built_authorizer) {
+ p = au->buf;
+ end = p + sizeof(au->buf);
+ ceph_encode_8(&p, 1);
+ ret = ceph_entity_name_encode(ac->name, &p, end - 8);
+ if (ret < 0)
+ goto bad;
+ ceph_decode_need(&p, end, sizeof(u64), bad2);
+ ceph_encode_64(&p, ac->global_id);
+ au->buf_len = p - (void *)au->buf;
+ ai->built_authorizer = true;
+ dout("built authorizer len %d\n", au->buf_len);

+ }
+
+ *a = (struct ceph_authorizer *)au;

+ *buf = au->buf;
+ *len = au->buf_len;

+ *reply_buf = au->reply_buf;
+ *reply_len = sizeof(au->reply_buf);
+ return 0;
+

+bad2:
+ ret = -ERANGE;
+bad:

+ return ret;
+}
+

+static void ceph_auth_none_destroy_authorizer(struct ceph_auth_client *ac,

+ struct ceph_authorizer *a)
+{

+ /* nothing to do */
+}
+
+static const struct ceph_auth_client_ops ceph_auth_none_ops = {
+ .reset = reset,
+ .destroy = destroy,
+ .is_authenticated = is_authenticated,
+ .handle_reply = handle_reply,
+ .create_authorizer = ceph_auth_none_create_authorizer,
+ .destroy_authorizer = ceph_auth_none_destroy_authorizer,
+};
+
+int ceph_auth_none_init(struct ceph_auth_client *ac)
+{
+ struct ceph_auth_none_info *xi;
+
+ dout("ceph_auth_none_init %p\n", ac);

+ xi = kzalloc(sizeof(*xi), GFP_NOFS);
+ if (!xi)

+ return -ENOMEM;
+

+ xi->starting = true;

+ xi->built_authorizer = false;
+
+ ac->protocol = CEPH_AUTH_NONE;

+ ac->private = xi;

+ ac->ops = &ceph_auth_none_ops;

+ return 0;
+}
+

diff --git a/fs/ceph/auth_none.h b/fs/ceph/auth_none.h
new file mode 100644
index 0000000..56c0553
--- /dev/null
+++ b/fs/ceph/auth_none.h
@@ -0,0 +1,28 @@
+#ifndef _FS_CEPH_AUTH_NONE_H
+#define _FS_CEPH_AUTH_NONE_H
+
+#include "auth.h"
+
+/*
+ * null security mode.
+ *
+ * we use a single static authorizer that simply encodes our entity name
+ * and global id.
+ */
+
+struct ceph_none_authorizer {
+ char buf[128];
+ int buf_len;
+ char reply_buf[0];
+};
+
+struct ceph_auth_none_info {
+ bool starting;
+ bool built_authorizer;
+ struct ceph_none_authorizer au; /* we only need one; it's static */
+};
+
+extern int ceph_auth_none_init(struct ceph_auth_client *ac);
+
+#endif
+

0 new messages