Commit 17d8e3d9 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ceph-for-5.19-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "A big pile of assorted fixes and improvements for the filesystem with
  nothing in particular standing out, except perhaps that the fact that
  the MDS never really maintained atime was made official and thus it's
  no longer updated on the client either.

  We also have a MAINTAINERS update: Jeff is transitioning his
  filesystem maintainership duties to Xiubo"

* tag 'ceph-for-5.19-rc1' of https://github.com/ceph/ceph-client: (23 commits)
  MAINTAINERS: move myself from ceph "Maintainer" to "Reviewer"
  ceph: fix decoding of client session messages flags
  ceph: switch TASK_INTERRUPTIBLE to TASK_KILLABLE
  ceph: remove redundant variable ino
  ceph: try to queue a writeback if revoking fails
  ceph: fix statfs for subdir mounts
  ceph: fix possible deadlock when holding Fwb to get inline_data
  ceph: redirty the page for writepage on failure
  ceph: try to choose the auth MDS if possible for getattr
  ceph: disable updating the atime since cephfs won't maintain it
  ceph: flush the mdlog for filesystem sync
  ceph: rename unsafe_request_wait()
  libceph: use swap() macro instead of taking tmp variable
  ceph: fix statx AT_STATX_DONT_SYNC vs AT_STATX_FORCE_SYNC check
  ceph: no need to invalidate the fscache twice
  ceph: replace usage of found with dedicated list iterator variable
  ceph: use dedicated list iterator variable
  ceph: update the dlease for the hashed dentry when removing
  ceph: stop retrying the request when exceeding 256 times
  ceph: stop forwarding the request when exceeding 256 times
  ...
parents 7c9e960c af7dc8e5
Loading
Loading
Loading
Loading
+2 −2
Original line number Diff line number Diff line
@@ -4566,8 +4566,8 @@ F: drivers/power/supply/cw2015_battery.c
CEPH COMMON CODE (LIBCEPH)
M:	Ilya Dryomov <idryomov@gmail.com>
M:	Jeff Layton <jlayton@kernel.org>
M:	Xiubo Li <xiubli@redhat.com>
R:	Jeff Layton <jlayton@kernel.org>
L:	ceph-devel@vger.kernel.org
S:	Supported
W:	http://ceph.com/
@@ -4577,9 +4577,9 @@ F: include/linux/crush/
F:	net/ceph/
CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
M:	Jeff Layton <jlayton@kernel.org>
M:	Xiubo Li <xiubli@redhat.com>
M:	Ilya Dryomov <idryomov@gmail.com>
R:	Jeff Layton <jlayton@kernel.org>
L:	ceph-devel@vger.kernel.org
S:	Supported
W:	http://ceph.com/
+6 −7
Original line number Diff line number Diff line
@@ -756,24 +756,23 @@ static struct rbd_client *__rbd_get_client(struct rbd_client *rbdc)
 */
static struct rbd_client *rbd_client_find(struct ceph_options *ceph_opts)
{
	struct rbd_client *client_node;
	bool found = false;
	struct rbd_client *rbdc = NULL, *iter;

	if (ceph_opts->flags & CEPH_OPT_NOSHARE)
		return NULL;

	spin_lock(&rbd_client_list_lock);
	list_for_each_entry(client_node, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, client_node->client)) {
			__rbd_get_client(client_node);
	list_for_each_entry(iter, &rbd_client_list, node) {
		if (!ceph_compare_options(ceph_opts, iter->client)) {
			__rbd_get_client(iter);

			found = true;
			rbdc = iter;
			break;
		}
	}
	spin_unlock(&rbd_client_list_lock);

	return found ? client_node : NULL;
	return rbdc;
}

/*
+25 −17
Original line number Diff line number Diff line
@@ -256,6 +256,7 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
	struct iov_iter iter;
	ssize_t err = 0;
	size_t len;
	int mode;

	__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
	__clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags);
@@ -264,7 +265,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
		goto out;

	/* We need to fetch the inline data. */
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, USE_ANY_MDS);
	mode = ceph_try_to_choose_auth_mds(inode, CEPH_STAT_CAP_INLINE_DATA);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
	if (IS_ERR(req)) {
		err = PTR_ERR(req);
		goto out;
@@ -604,8 +606,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
				    ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
				    true);
	if (IS_ERR(req))
	if (IS_ERR(req)) {
		redirty_page_for_writepage(wbc, page);
		return PTR_ERR(req);
	}

	set_page_writeback(page);
	if (caching)
@@ -1644,7 +1648,7 @@ int ceph_uninline_data(struct file *file)
	struct inode *inode = file_inode(file);
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_request *req;
	struct ceph_osd_request *req = NULL;
	struct ceph_cap_flush *prealloc_cf;
	struct folio *folio = NULL;
	u64 inline_version = CEPH_INLINE_NONE;
@@ -1652,10 +1656,23 @@ int ceph_uninline_data(struct file *file)
	int err = 0;
	u64 len;

	spin_lock(&ci->i_ceph_lock);
	inline_version = ci->i_inline_version;
	spin_unlock(&ci->i_ceph_lock);

	dout("uninline_data %p %llx.%llx inline_version %llu\n",
	     inode, ceph_vinop(inode), inline_version);

	if (inline_version == CEPH_INLINE_NONE)
		return 0;

	prealloc_cf = ceph_alloc_cap_flush();
	if (!prealloc_cf)
		return -ENOMEM;

	if (inline_version == 1) /* initial version, no data */
		goto out_uninline;

	folio = read_mapping_folio(inode->i_mapping, 0, file);
	if (IS_ERR(folio)) {
		err = PTR_ERR(folio);
@@ -1664,17 +1681,6 @@ int ceph_uninline_data(struct file *file)

	folio_lock(folio);

	spin_lock(&ci->i_ceph_lock);
	inline_version = ci->i_inline_version;
	spin_unlock(&ci->i_ceph_lock);

	dout("uninline_data %p %llx.%llx inline_version %llu\n",
	     inode, ceph_vinop(inode), inline_version);

	if (inline_version == 1 || /* initial version, no data */
	    inline_version == CEPH_INLINE_NONE)
		goto out_unlock;

	len = i_size_read(inode);
	if (len > folio_size(folio))
		len = folio_size(folio);
@@ -1739,6 +1745,7 @@ int ceph_uninline_data(struct file *file)
	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);

out_uninline:
	if (!err) {
		int dirty;

@@ -1757,8 +1764,10 @@ int ceph_uninline_data(struct file *file)
	if (err == -ECANCELED)
		err = 0;
out_unlock:
	if (folio) {
		folio_unlock(folio);
		folio_put(folio);
	}
out:
	ceph_free_cap_flush(prealloc_cf);
	dout("uninline_data %p %llx.%llx inline_version %llu = %d\n",
@@ -1777,7 +1786,6 @@ int ceph_mmap(struct file *file, struct vm_area_struct *vma)

	if (!mapping->a_ops->read_folio)
		return -ENOEXEC;
	file_accessed(file);
	vma->vm_ops = &ceph_vmops;
	return 0;
}
+47 −28
Original line number Diff line number Diff line
@@ -1577,7 +1577,7 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,

	while (first_tid <= last_tid) {
		struct ceph_cap *cap = ci->i_auth_cap;
		struct ceph_cap_flush *cf;
		struct ceph_cap_flush *cf = NULL, *iter;
		int ret;

		if (!(cap && cap->session == session)) {
@@ -1587,8 +1587,9 @@ static void __ceph_flush_snaps(struct ceph_inode_info *ci,
		}

		ret = -ENOENT;
		list_for_each_entry(cf, &ci->i_cap_flush_list, i_list) {
			if (cf->tid >= first_tid) {
		list_for_each_entry(iter, &ci->i_cap_flush_list, i_list) {
			if (iter->tid >= first_tid) {
				cf = iter;
				ret = 0;
				break;
			}
@@ -1910,6 +1911,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
	struct rb_node *p;
	bool queue_invalidate = false;
	bool tried_invalidate = false;
	bool queue_writeback = false;

	if (session)
		ceph_get_mds_session(session);
@@ -2062,12 +2064,29 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
		}

		/* completed revocation? going down and there are no caps? */
		if (revoking && (revoking & cap_used) == 0) {
		if (revoking) {
			if ((revoking & cap_used) == 0) {
				dout("completed revocation of %s\n",
				      ceph_cap_string(cap->implemented & ~cap->issued));
				goto ack;
			}

			/*
			 * If the "i_wrbuffer_ref" was increased by mmap or generic
			 * cache write just before the ceph_check_caps() is called,
			 * the Fb capability revoking will fail this time. Then we
			 * must wait for the BDI's delayed work to flush the dirty
			 * pages and to release the "i_wrbuffer_ref", which will cost
			 * at most 5 seconds. That means the MDS needs to wait at
			 * most 5 seconds to finish the Fb capability's revocation.
			 *
			 * Let's queue a writeback for it.
			 */
			if (S_ISREG(inode->i_mode) && ci->i_wrbuffer_ref &&
			    (revoking & CEPH_CAP_FILE_BUFFER))
				queue_writeback = true;
		}

		/* want more caps from mds? */
		if (want & ~cap->mds_wanted) {
			if (want & ~(cap->mds_wanted | cap->issued))
@@ -2135,6 +2154,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
	spin_unlock(&ci->i_ceph_lock);

	ceph_put_mds_session(session);
	if (queue_writeback)
		ceph_queue_writeback(inode);
	if (queue_invalidate)
		ceph_queue_invalidate(inode);
}
@@ -2218,9 +2239,9 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
}

/*
 * wait for any unsafe requests to complete.
 * flush the mdlog and wait for any unsafe requests to complete.
 */
static int unsafe_request_wait(struct inode *inode)
static int flush_mdlog_and_wait_inode_unsafe_requests(struct inode *inode)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
@@ -2336,7 +2357,7 @@ static int unsafe_request_wait(struct inode *inode)
		kfree(sessions);
	}

	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
	dout("%s %p wait on tid %llu %llu\n", __func__,
	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
	if (req1) {
		ret = !wait_for_completion_timeout(&req1->r_safe_completion,
@@ -2380,7 +2401,7 @@ int ceph_fsync(struct file *file, loff_t start, loff_t end, int datasync)
	dirty = try_flush_caps(inode, &flush_tid);
	dout("fsync dirty caps are %s\n", ceph_cap_string(dirty));

	err = unsafe_request_wait(inode);
	err = flush_mdlog_and_wait_inode_unsafe_requests(inode);

	/*
	 * only wait on non-file metadata writeback (the mds
@@ -3182,10 +3203,9 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				struct ceph_snap_context *snapc)
{
	struct inode *inode = &ci->vfs_inode;
	struct ceph_cap_snap *capsnap = NULL;
	struct ceph_cap_snap *capsnap = NULL, *iter;
	int put = 0;
	bool last = false;
	bool found = false;
	bool flush_snaps = false;
	bool complete_capsnap = false;

@@ -3212,14 +3232,14 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
		     ci->i_wrbuffer_ref, ci->i_wrbuffer_ref_head,
		     last ? " LAST" : "");
	} else {
		list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
			if (capsnap->context == snapc) {
				found = true;
		list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
			if (iter->context == snapc) {
				capsnap = iter;
				break;
			}
		}

		if (!found) {
		if (!capsnap) {
			/*
			 * The capsnap should already be removed when removing
			 * auth cap in the case of a forced unmount.
@@ -3769,8 +3789,7 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	u64 follows = le64_to_cpu(m->snap_follows);
	struct ceph_cap_snap *capsnap;
	bool flushed = false;
	struct ceph_cap_snap *capsnap = NULL, *iter;
	bool wake_ci = false;
	bool wake_mdsc = false;

@@ -3778,26 +3797,26 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
	     inode, ci, session->s_mds, follows);

	spin_lock(&ci->i_ceph_lock);
	list_for_each_entry(capsnap, &ci->i_cap_snaps, ci_item) {
		if (capsnap->follows == follows) {
			if (capsnap->cap_flush.tid != flush_tid) {
	list_for_each_entry(iter, &ci->i_cap_snaps, ci_item) {
		if (iter->follows == follows) {
			if (iter->cap_flush.tid != flush_tid) {
				dout(" cap_snap %p follows %lld tid %lld !="
				     " %lld\n", capsnap, follows,
				     flush_tid, capsnap->cap_flush.tid);
				     " %lld\n", iter, follows,
				     flush_tid, iter->cap_flush.tid);
				break;
			}
			flushed = true;
			capsnap = iter;
			break;
		} else {
			dout(" skipping cap_snap %p follows %lld\n",
			     capsnap, capsnap->follows);
			     iter, iter->follows);
		}
	}
	if (flushed)
	if (capsnap)
		ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
	spin_unlock(&ci->i_ceph_lock);

	if (flushed) {
	if (capsnap) {
		ceph_put_snap_context(capsnap->context);
		ceph_put_cap_snap(capsnap);
		if (wake_ci)
+30 −5
Original line number Diff line number Diff line
@@ -578,7 +578,7 @@ void ceph_evict_inode(struct inode *inode)

	__ceph_remove_caps(ci);

	if (__ceph_has_any_quota(ci))
	if (__ceph_has_quota(ci, QUOTA_GET_ANY))
		ceph_adjust_quota_realms_count(inode, false);

	/*
@@ -1466,10 +1466,12 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
			} else if (have_lease) {
				if (d_unhashed(dn))
					d_add(dn, NULL);
			}

			if (!d_unhashed(dn) && have_lease)
				update_dentry_lease(dir, dn,
						    rinfo->dlease, session,
						    req->r_request_started);
			}
			goto done;
		}

@@ -1884,7 +1886,6 @@ static void ceph_do_invalidate_pages(struct inode *inode)
	orig_gen = ci->i_rdcache_gen;
	spin_unlock(&ci->i_ceph_lock);

	ceph_fscache_invalidate(inode, false);
	if (invalidate_inode_pages2(inode->i_mapping) < 0) {
		pr_err("invalidate_inode_pages2 %llx.%llx failed\n",
		       ceph_vinop(inode));
@@ -2258,6 +2259,30 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry,
	return err;
}

int ceph_try_to_choose_auth_mds(struct inode *inode, int mask)
{
	int issued = ceph_caps_issued(ceph_inode(inode));

	/*
	 * If any 'x' caps are issued we can just choose the auth MDS
	 * instead of the random replica MDSes. Because only when the
	 * Locker is in LOCK_EXEC state can the loner client get the
	 * 'x' caps. And if we send the getattr requests to any replica
	 * MDS it must auth pin and try to rdlock from the auth MDS,
	 * and then the auth MDS needs to do the Locker state transition
	 * to LOCK_SYNC. And after that the lock state will change back.
	 *
	 * This is costly when doing the Locker state transition and
	 * usually requires caps to be revoked from clients.
	 */
	if (((mask & CEPH_CAP_ANY_SHARED) && (issued & CEPH_CAP_ANY_EXCL))
	    || (mask & CEPH_STAT_RSTAT))
		return USE_AUTH_MDS;
	else
		return USE_ANY_MDS;
}

/*
 * Verify that we have a lease on the given mask.  If not,
 * do a getattr against an mds.
@@ -2281,7 +2306,7 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
	if (!force && ceph_caps_issued_mask_metric(ceph_inode(inode), mask, 1))
			return 0;

	mode = (mask & CEPH_STAT_RSTAT) ? USE_AUTH_MDS : USE_ANY_MDS;
	mode = ceph_try_to_choose_auth_mds(inode, mask);
	req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETATTR, mode);
	if (IS_ERR(req))
		return PTR_ERR(req);
@@ -2423,7 +2448,7 @@ int ceph_getattr(struct user_namespace *mnt_userns, const struct path *path,
		return -ESTALE;

	/* Skip the getattr altogether if we're asked not to sync */
	if (!(flags & AT_STATX_DONT_SYNC)) {
	if ((flags & AT_STATX_SYNC_TYPE) != AT_STATX_DONT_SYNC) {
		err = ceph_do_getattr(inode,
				statx_to_caps(request_mask, inode->i_mode),
				flags & AT_STATX_FORCE_SYNC);
Loading