Commit 85c7000f authored by Linus Torvalds
Browse files

Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - several changes to how snap context and snap realms are tracked
     (Xiubo Li). In particular, this should resolve a long-standing
     issue of high kworker CPU usage and various stalls caused by
     needless iteration over all inodes in the snap realm.

   - async create fixes to address hangs in some edge cases (Jeff
     Layton)

   - support for getvxattr MDS op for querying server-side xattrs, such
     as file/directory layouts and ephemeral pins (Milind Changire)

   - average latency is now maintained for all metrics (Venky Shankar)

   - some tweaks around handling inline data to make it fit better with
     netfs helper library (David Howells)

  Also a couple of memory leaks got plugged along with a few assorted
  fixups. Last but not least, Xiubo has stepped up to serve as a CephFS
  co-maintainer"

* tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits)
  ceph: fix memory leak in ceph_readdir when note_last_dentry returns error
  ceph: uninitialized variable in debug output
  ceph: use tracked average r/w/m latencies to display metrics in debugfs
  ceph: include average/stdev r/w/m latency in mds metrics
  ceph: track average r/w/m latency
  ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64()
  ceph: assign the ci only when the inode isn't NULL
  ceph: fix inode reference leakage in ceph_get_snapdir()
  ceph: misc fix for code style and logs
  ceph: allocate capsnap memory outside of ceph_queue_cap_snap()
  ceph: do not release the global snaprealm until unmounting
  ceph: remove incorrect and unused CEPH_INO_DOTDOT macro
  MAINTAINERS: add Xiubo Li as cephfs co-maintainer
  ceph: eliminate the recursion when rebuilding the snap context
  ceph: do not update snapshot context when there is no new snapshot
  ceph: zero the dir_entries memory when allocating it
  ceph: move to a dedicated slabcache for ceph_cap_snap
  ceph: add getvxattr op
  libceph: drop else branches in prepare_read_data{,_cont}
  ceph: fix comments mentioning i_mutex
  ...
parents b1b07ba3 f639d986
Loading
Loading
Loading
Loading
+2 −0
Original line number Diff line number Diff line
@@ -4456,6 +4456,7 @@ F: drivers/power/supply/cw2015_battery.c
CEPH COMMON CODE (LIBCEPH)
M:	Ilya Dryomov <idryomov@gmail.com>
M:	Jeff Layton <jlayton@kernel.org>
M:	Xiubo Li <xiubli@redhat.com>
L:	ceph-devel@vger.kernel.org
S:	Supported
W:	http://ceph.com/
@@ -4466,6 +4467,7 @@ F: net/ceph/
CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
M:	Jeff Layton <jlayton@kernel.org>
M:	Xiubo Li <xiubli@redhat.com>
M:	Ilya Dryomov <idryomov@gmail.com>
L:	ceph-devel@vger.kernel.org
S:	Supported
+112 −128
Original line number Diff line number Diff line
@@ -184,7 +184,7 @@ static int ceph_releasepage(struct page *page, gfp_t gfp)

static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)
{
	struct inode *inode = rreq->mapping->host;
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_file_layout *lo = &ci->i_layout;
	u32 blockoff;
@@ -201,7 +201,7 @@ static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq)

static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq)
{
	struct inode *inode = subreq->rreq->mapping->host;
	struct inode *inode = subreq->rreq->inode;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_inode_info *ci = ceph_inode(inode);
	u64 objno, objoff;
@@ -244,10 +244,63 @@ static void finish_netfs_read(struct ceph_osd_request *req)
	iput(req->r_inode);
}

/*
 * Service a netfs read subrequest from an inode's MDS-held inline data.
 *
 * Issues a GETATTR to the MDS asking for CEPH_STAT_CAP_INLINE_DATA and
 * copies the requested range out of the reply into the request's page
 * cache pages.  Returns true if the subrequest was completed here (i.e.
 * netfs_subreq_terminated() was called), or false if the reply shows the
 * data has since been uninlined, in which case the caller falls back to
 * a normal OSD read (see ceph_netfs_issue_op).
 */
static bool ceph_netfs_issue_op_inline(struct netfs_read_subrequest *subreq)
{
	struct netfs_read_request *rreq = subreq->rreq;
	struct inode *inode = rreq->inode;
	struct ceph_mds_reply_info_parsed *rinfo;
	struct ceph_mds_reply_info_in *iinfo;
	struct ceph_mds_request *req;
	struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;

	/* Inline data is at most one object; anything short is zero-filled. */
	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	__clear_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags);

	/* Range entirely beyond EOF: terminate immediately with err == 0. */
	if (subreq->start >= inode->i_size)
		goto out;

	/* We need to fetch the inline data. */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
	}
	req->r_ino1 = ci->i_vino;
	req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INLINE_DATA);
	req->r_num_caps = 2;

	err = ceph_mdsc_do_request(mdsc, NULL, req);
	if (err < 0)
		goto out;

	rinfo = &req->r_reply_info;
	iinfo = &rinfo->targeti;
	if (iinfo->inline_version == CEPH_INLINE_NONE) {
		/* The data got uninlined */
		ceph_mdsc_put_request(req);
		return false;
	}

	/*
	 * Copy only what the subrequest covers; err ends up holding the
	 * number of bytes transferred, which is what the netfs layer
	 * expects from netfs_subreq_terminated() on success.
	 */
	len = min_t(size_t, iinfo->inline_len - subreq->start, subreq->len);
	iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len);
	err = copy_to_iter(iinfo->inline_data + subreq->start, len, &iter);
	if (err == 0)
		/* copy_to_iter() copied nothing: report it as a fault */
		err = -EFAULT;

	ceph_mdsc_put_request(req);
out:
	netfs_subreq_terminated(subreq, err, false);
	return true;
}

static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
{
	struct netfs_read_request *rreq = subreq->rreq;
	struct inode *inode = rreq->mapping->host;
	struct inode *inode = rreq->inode;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_request *req;
@@ -258,6 +311,10 @@ static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq)
	int err = 0;
	u64 len = subreq->len;

	if (ci->i_inline_version != CEPH_INLINE_NONE &&
	    ceph_netfs_issue_op_inline(subreq))
		return;

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
			0, 1, CEPH_OSD_OP_READ,
			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
@@ -326,23 +383,9 @@ static int ceph_readpage(struct file *file, struct page *subpage)
	size_t len = folio_size(folio);
	u64 off = folio_file_pos(folio);

	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		/*
		 * Uptodate inline data should have been added
		 * into page cache while getting Fcr caps.
		 */
		if (off == 0) {
			folio_unlock(folio);
			return -EINVAL;
		}
		zero_user_segment(&folio->page, 0, folio_size(folio));
		folio_mark_uptodate(folio);
		folio_unlock(folio);
		return 0;
	}

	dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n",
	     vino.ino, vino.snap, file, off, len, folio, folio_index(folio));
	dout("readpage ino %llx.%llx file %p off %llu len %zu folio %p index %lu\n inline %d",
	     vino.ino, vino.snap, file, off, len, folio, folio_index(folio),
	     ci->i_inline_version != CEPH_INLINE_NONE);

	return netfs_readpage(file, folio, &ceph_netfs_read_ops, NULL);
}
@@ -1281,45 +1324,11 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping,
			    struct page **pagep, void **fsdata)
{
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct folio *folio = NULL;
	pgoff_t index = pos >> PAGE_SHIFT;
	int r;

	/*
	 * Uninlining should have already been done and everything updated, EXCEPT
	 * for inline_version sent to the MDS.
	 */
	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		unsigned int fgp_flags = FGP_LOCK | FGP_WRITE | FGP_CREAT | FGP_STABLE;
		if (aop_flags & AOP_FLAG_NOFS)
			fgp_flags |= FGP_NOFS;
		folio = __filemap_get_folio(mapping, index, fgp_flags,
					    mapping_gfp_mask(mapping));
		if (!folio)
			return -ENOMEM;

		/*
		 * The inline_version on a new inode is set to 1. If that's the
		 * case, then the folio is brand new and isn't yet Uptodate.
		 */
		r = 0;
		if (index == 0 && ci->i_inline_version != 1) {
			if (!folio_test_uptodate(folio)) {
				WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n",
					  ci->i_inline_version);
				r = -EINVAL;
			}
			goto out;
		}
		zero_user_segment(&folio->page, 0, folio_size(folio));
		folio_mark_uptodate(folio);
		goto out;
	}

	r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &folio, NULL,
			      &ceph_netfs_read_ops, NULL);
out:
	if (r == 0)
		folio_wait_fscache(folio);
	if (r < 0) {
@@ -1515,19 +1524,6 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
	sb_start_pagefault(inode->i_sb);
	ceph_block_sigs(&oldset);

	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		struct page *locked_page = NULL;
		if (off == 0) {
			lock_page(page);
			locked_page = page;
		}
		err = ceph_uninline_data(vma->vm_file, locked_page);
		if (locked_page)
			unlock_page(locked_page);
		if (err < 0)
			goto out_free;
	}

	if (off + thp_size(page) <= size)
		len = thp_size(page);
	else
@@ -1584,11 +1580,9 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf)
		ceph_put_snap_context(snapc);
	} while (err == 0);

	if (ret == VM_FAULT_LOCKED ||
	    ci->i_inline_version != CEPH_INLINE_NONE) {
	if (ret == VM_FAULT_LOCKED) {
		int dirty;
		spin_lock(&ci->i_ceph_lock);
		ci->i_inline_version = CEPH_INLINE_NONE;
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
					       &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
@@ -1652,16 +1646,30 @@ void ceph_fill_inline_data(struct inode *inode, struct page *locked_page,
	}
}

int ceph_uninline_data(struct file *filp, struct page *locked_page)
int ceph_uninline_data(struct file *file)
{
	struct inode *inode = file_inode(filp);
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_request *req;
	struct page *page = NULL;
	u64 len, inline_version;
	struct ceph_cap_flush *prealloc_cf;
	struct folio *folio = NULL;
	u64 inline_version = CEPH_INLINE_NONE;
	struct page *pages[1];
	int err = 0;
	bool from_pagecache = false;
	u64 len;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	folio = read_mapping_folio(inode->i_mapping, 0, file);
	if (IS_ERR(folio)) {
		err = PTR_ERR(folio);
		goto out;
	}

	folio_lock(folio);

	spin_lock(&ci->i_ceph_lock);
	inline_version = ci->i_inline_version;
@@ -1672,45 +1680,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)

	if (inline_version == 1 || /* initial version, no data */
	    inline_version == CEPH_INLINE_NONE)
		goto out;

	if (locked_page) {
		page = locked_page;
		WARN_ON(!PageUptodate(page));
	} else if (ceph_caps_issued(ci) &
		   (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) {
		page = find_get_page(inode->i_mapping, 0);
		if (page) {
			if (PageUptodate(page)) {
				from_pagecache = true;
				lock_page(page);
			} else {
				put_page(page);
				page = NULL;
			}
		}
	}
		goto out_unlock;

	if (page) {
	len = i_size_read(inode);
		if (len > PAGE_SIZE)
			len = PAGE_SIZE;
	} else {
		page = __page_cache_alloc(GFP_NOFS);
		if (!page) {
			err = -ENOMEM;
			goto out;
		}
		err = __ceph_do_getattr(inode, page,
					CEPH_STAT_CAP_INLINE_DATA, true);
		if (err < 0) {
			/* no inline data */
			if (err == -ENODATA)
				err = 0;
			goto out;
		}
		len = err;
	}
	if (len > folio_size(folio))
		len = folio_size(folio);

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode), 0, &len, 0, 1,
@@ -1718,7 +1692,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
				    NULL, 0, 0, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
		goto out_unlock;
	}

	req->r_mtime = inode->i_mtime;
@@ -1727,7 +1701,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
		err = ceph_osdc_wait_request(&fsc->client->osdc, req);
	ceph_osdc_put_request(req);
	if (err < 0)
		goto out;
		goto out_unlock;

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout,
				    ceph_vino(inode), 0, &len, 1, 3,
@@ -1736,10 +1710,11 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
				    ci->i_truncate_size, false);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
		goto out_unlock;
	}

	osd_req_op_extent_osd_data_pages(req, 1, &page, len, 0, false, false);
	pages[0] = folio_page(folio, 0);
	osd_req_op_extent_osd_data_pages(req, 1, pages, len, 0, false, false);

	{
		__le64 xattr_buf = cpu_to_le64(inline_version);
@@ -1749,7 +1724,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
					    CEPH_OSD_CMPXATTR_OP_GT,
					    CEPH_OSD_CMPXATTR_MODE_U64);
		if (err)
			goto out_put;
			goto out_put_req;
	}

	{
@@ -1760,7 +1735,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
					    "inline_version",
					    xattr_buf, xattr_len, 0, 0);
		if (err)
			goto out_put;
			goto out_put_req;
	}

	req->r_mtime = inode->i_mtime;
@@ -1771,19 +1746,28 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);

out_put:
	if (!err) {
		int dirty;

		/* Set to CAP_INLINE_NONE and dirty the caps */
		down_read(&fsc->mdsc->snap_rwsem);
		spin_lock(&ci->i_ceph_lock);
		ci->i_inline_version = CEPH_INLINE_NONE;
		dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR, &prealloc_cf);
		spin_unlock(&ci->i_ceph_lock);
		up_read(&fsc->mdsc->snap_rwsem);
		if (dirty)
			__mark_inode_dirty(inode, dirty);
	}
out_put_req:
	ceph_osdc_put_request(req);
	if (err == -ECANCELED)
		err = 0;
out_unlock:
	folio_unlock(folio);
	folio_put(folio);
out:
	if (page && page != locked_page) {
		if (from_pagecache) {
			unlock_page(page);
			put_page(page);
		} else
			__free_pages(page, 0);
	}

	ceph_free_cap_flush(prealloc_cf);
	dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
	     inode, ceph_vinop(inode), inline_version, err);
	return err;
+15 −1
Original line number Diff line number Diff line
@@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
		ceph_get_mds_session(session);

	spin_lock(&ci->i_ceph_lock);
	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
		/* Don't send messages until we get async create reply */
		spin_unlock(&ci->i_ceph_lock);
		ceph_put_mds_session(session);
		return;
	}

	if (ci->i_ceph_flags & CEPH_I_FLUSH)
		flags |= CHECK_CAPS_FLUSH;
retry:
@@ -2409,6 +2416,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
	dout("write_inode %p wait=%d\n", inode, wait);
	ceph_fscache_unpin_writeback(inode, wbc);
	if (wait) {
		err = ceph_wait_on_async_create(inode);
		if (err)
			return err;
		dirty = try_flush_caps(inode, &flush_tid);
		if (dirty)
			err = wait_event_interruptible(ci->i_cap_wq,
@@ -2439,6 +2449,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
	u64 first_tid = 0;
	u64 last_snap_flush = 0;

	/* Don't do anything until create reply comes in */
	if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
		return;

	ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;

	list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
@@ -4152,7 +4166,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,

	/* lookup ino */
	inode = ceph_find_inode(mdsc->fsc->sb, vino);
	ci = ceph_inode(inode);
	dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
	     vino.snap, inode);

@@ -4178,6 +4191,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
		}
		goto flush_cap_releases;
	}
	ci = ceph_inode(inode);

	/* these will work even if we don't have a cap yet */
	switch (op) {
+2 −3
Original line number Diff line number Diff line
@@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
	struct ceph_fs_client *fsc = s->private;
	struct ceph_client_metric *cm = &fsc->mdsc->metric;
	struct ceph_metric *m;
	s64 total, sum, avg, min, max, sq;
	s64 total, avg, min, max, sq;
	int i;

	seq_printf(s, "item          total       avg_lat(us)     min_lat(us)     max_lat(us)     stdev(us)\n");
@@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
		m = &cm->metric[i];
		spin_lock(&m->lock);
		total = m->total;
		sum = m->latency_sum;
		avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
		avg = m->latency_avg;
		min = m->latency_min;
		max = m->latency_max;
		sq = m->latency_sq_sum;
+13 −4
Original line number Diff line number Diff line
@@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
			return ERR_PTR(-EAGAIN);
		}
		/* reading/filling the cache are serialized by
		   i_mutex, no need to use page lock */
		   i_rwsem, no need to use page lock */
		unlock_page(cache_ctl->page);
		cache_ctl->dentries = kmap(cache_ctl->page);
	}
@@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
	rcu_read_lock();
	spin_lock(&parent->d_lock);
	/* check i_size again here, because empty directory can be
	 * marked as complete while not holding the i_mutex. */
	 * marked as complete while not holding the i_rwsem. */
	if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
		dentry = cache_ctl->dentries[cache_ctl->index];
	else
@@ -478,8 +478,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
					2 : (fpos_off(rde->offset) + 1);
			err = note_last_dentry(dfi, rde->name, rde->name_len,
					       next_offset);
			if (err)
			if (err) {
				ceph_mdsc_put_request(dfi->last_readdir);
				dfi->last_readdir = NULL;
				return err;
			}
		} else if (req->r_reply_info.dir_end) {
			dfi->next_offset = 2;
			/* keep last name */
@@ -520,6 +523,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
			/*
			 * NOTE: Here no need to put the 'dfi->last_readdir',
			 * because when dir_emit stops us it's most likely
			 * doesn't have enough memory, etc. So for next readdir
			 * it will continue.
			 */
			dout("filldir stopping us...\n");
			return 0;
		}
@@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
				   struct dentry *dentry)
{
	struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
	struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */

	/* .snap dir? */
	if (ceph_snap(parent) == CEPH_NOSNAP &&
Loading