Commit 8a05abd0 authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ceph-for-5.15-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:

 - a set of patches to address fsync stalls caused by depending on
   periodic rather than triggered MDS journal flushes in some cases
   (Xiubo Li)

 - a fix for mtime effectively not getting updated in case of competing
   writers (Jeff Layton)

 - a couple of fixes for inode reference leaks and various WARNs after
   "umount -f" (Xiubo Li)

 - a new ceph.auth_mds extended attribute (Jeff Layton)

 - a smattering of fixups and cleanups from Jeff, Xiubo and Colin.

* tag 'ceph-for-5.15-rc1' of git://github.com/ceph/ceph-client:
  ceph: fix dereference of null pointer cf
  ceph: drop the mdsc_get_session/put_session dout messages
  ceph: lockdep annotations for try_nonblocking_invalidate
  ceph: don't WARN if we're forcibly removing the session caps
  ceph: don't WARN if we're force umounting
  ceph: remove the capsnaps when removing caps
  ceph: request Fw caps before updating the mtime in ceph_write_iter
  ceph: reconnect to the export targets on new mdsmaps
  ceph: print more information when we can't find snaprealm
  ceph: add ceph_change_snap_realm() helper
  ceph: remove redundant initializations from mdsc and session
  ceph: cancel delayed work instead of flushing on mdsc teardown
  ceph: add a new vxattr to return auth mds for an inode
  ceph: remove some defunct forward declarations
  ceph: flush the mdlog before waiting on unsafe reqs
  ceph: flush mdlog before umounting
  ceph: make iterate_sessions a global symbol
  ceph: make ceph_create_session_msg a global symbol
  ceph: fix comment about short copies in ceph_write_end
  ceph: fix memory leak on decode error in ceph_handle_caps
parents 34c59da4 05a444d3
Loading
Loading
Loading
Loading
+1 −1
Original line number Diff line number Diff line
@@ -1281,8 +1281,8 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
	dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
	     inode, page, (int)pos, (int)copied, (int)len);

	/* zero the stale part of the page if we did a short copy */
	if (!PageUptodate(page)) {
		/* just return that nothing was copied on a short copy */
		if (copied < len) {
			copied = 0;
			goto out;
+0 −6
Original line number Diff line number Diff line
@@ -26,12 +26,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci);
void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp);
void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci);

int ceph_readpage_from_fscache(struct inode *inode, struct page *page);
int ceph_readpages_from_fscache(struct inode *inode,
				struct address_space *mapping,
				struct list_head *pages,
				unsigned *nr_pages);

static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci)
{
	ci->fscache = NULL;
+174 −92
Original line number Diff line number Diff line
@@ -703,29 +703,12 @@ void ceph_add_cap(struct inode *inode,
		 */
		struct ceph_snap_realm *realm = ceph_lookup_snap_realm(mdsc,
							       realmino);
		if (realm) {
			struct ceph_snap_realm *oldrealm = ci->i_snap_realm;
			if (oldrealm) {
				spin_lock(&oldrealm->inodes_with_caps_lock);
				list_del_init(&ci->i_snap_realm_item);
				spin_unlock(&oldrealm->inodes_with_caps_lock);
			}

			spin_lock(&realm->inodes_with_caps_lock);
			list_add(&ci->i_snap_realm_item,
				 &realm->inodes_with_caps);
			ci->i_snap_realm = realm;
			if (realm->ino == ci->i_vino.ino)
				realm->inode = inode;
			spin_unlock(&realm->inodes_with_caps_lock);

			if (oldrealm)
				ceph_put_snap_realm(mdsc, oldrealm);
		} else {
			pr_err("ceph_add_cap: couldn't find snap realm %llx\n",
			       realmino);
			WARN_ON(!realm);
		}
		if (realm)
			ceph_change_snap_realm(inode, realm);
		else
			WARN(1, "%s: couldn't find snap realm 0x%llx (ino 0x%llx oldrealm 0x%llx)\n",
			     __func__, realmino, ci->i_vino.ino,
			     ci->i_snap_realm ? ci->i_snap_realm->ino : 0);
	}

	__check_cap_issue(ci, cap, issued);
@@ -1112,20 +1095,6 @@ int ceph_is_any_caps(struct inode *inode)
	return ret;
}

static void drop_inode_snap_realm(struct ceph_inode_info *ci)
{
	struct ceph_snap_realm *realm = ci->i_snap_realm;
	spin_lock(&realm->inodes_with_caps_lock);
	list_del_init(&ci->i_snap_realm_item);
	ci->i_snap_realm_counter++;
	ci->i_snap_realm = NULL;
	if (realm->ino == ci->i_vino.ino)
		realm->inode = NULL;
	spin_unlock(&realm->inodes_with_caps_lock);
	ceph_put_snap_realm(ceph_sb_to_client(ci->vfs_inode.i_sb)->mdsc,
			    realm);
}

/*
 * Remove a cap.  Take steps to deal with a racing iterate_session_caps.
 *
@@ -1145,17 +1114,16 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("__ceph_remove_cap %p from %p\n", cap, &ci->vfs_inode);

	mdsc = ceph_inode_to_client(&ci->vfs_inode)->mdsc;

	/* remove from inode's cap rbtree, and clear auth cap */
	rb_erase(&cap->ci_node, &ci->i_caps);
	if (ci->i_auth_cap == cap) {
		WARN_ON_ONCE(!list_empty(&ci->i_dirty_item) &&
			     !mdsc->fsc->blocklisted);
	if (ci->i_auth_cap == cap)
		ci->i_auth_cap = NULL;
	}

	/* remove from session list */
	spin_lock(&session->s_cap_lock);
@@ -1201,12 +1169,34 @@ void __ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
		 * keep i_snap_realm.
		 */
		if (ci->i_wr_ref == 0 && ci->i_snap_realm)
			drop_inode_snap_realm(ci);
			ceph_change_snap_realm(&ci->vfs_inode, NULL);

		__cap_delay_cancel(mdsc, ci);
	}
}

void ceph_remove_cap(struct ceph_cap *cap, bool queue_release)
{
	struct ceph_inode_info *ci = cap->ci;
	struct ceph_fs_client *fsc;

	/* 'ci' being NULL means the remove have already occurred */
	if (!ci) {
		dout("%s: cap inode is NULL\n", __func__);
		return;
	}

	lockdep_assert_held(&ci->i_ceph_lock);

	fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
	WARN_ON_ONCE(ci->i_auth_cap == cap &&
		     !list_empty(&ci->i_dirty_item) &&
		     !fsc->blocklisted &&
		     READ_ONCE(fsc->mount_state) != CEPH_MOUNT_SHUTDOWN);

	__ceph_remove_cap(cap, queue_release);
}

struct cap_msg_args {
	struct ceph_mds_session	*session;
	u64			ino, cid, follows;
@@ -1335,7 +1325,7 @@ void __ceph_remove_caps(struct ceph_inode_info *ci)
	while (p) {
		struct ceph_cap *cap = rb_entry(p, struct ceph_cap, ci_node);
		p = rb_next(p);
		__ceph_remove_cap(cap, true);
		ceph_remove_cap(cap, true);
	}
	spin_unlock(&ci->i_ceph_lock);
}
@@ -1746,6 +1736,9 @@ struct ceph_cap_flush *ceph_alloc_cap_flush(void)
	struct ceph_cap_flush *cf;

	cf = kmem_cache_alloc(ceph_cap_flush_cachep, GFP_KERNEL);
	if (!cf)
		return NULL;

	cf->is_capsnap = false;
	return cf;
}
@@ -1856,6 +1849,8 @@ static u64 __mark_caps_flushing(struct inode *inode,
 * try to invalidate mapping pages without blocking.
 */
static int try_nonblocking_invalidate(struct inode *inode)
	__releases(ci->i_ceph_lock)
	__acquires(ci->i_ceph_lock)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	u32 invalidating_gen = ci->i_rdcache_gen;
@@ -2219,6 +2214,7 @@ static int caps_are_flushed(struct inode *inode, u64 flush_tid)
 */
static int unsafe_request_wait(struct inode *inode)
{
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_request *req1 = NULL, *req2 = NULL;
	int ret, err = 0;
@@ -2238,6 +2234,81 @@ static int unsafe_request_wait(struct inode *inode)
	}
	spin_unlock(&ci->i_unsafe_lock);

	/*
	 * Trigger to flush the journal logs in all the relevant MDSes
	 * manually, or in the worst case we must wait at most 5 seconds
	 * to wait the journal logs to be flushed by the MDSes periodically.
	 */
	if (req1 || req2) {
		struct ceph_mds_session **sessions = NULL;
		struct ceph_mds_session *s;
		struct ceph_mds_request *req;
		unsigned int max;
		int i;

		/*
		 * The mdsc->max_sessions is unlikely to be changed
		 * mostly, here we will retry it by reallocating the
		 * sessions arrary memory to get rid of the mdsc->mutex
		 * lock.
		 */
retry:
		max = mdsc->max_sessions;
		sessions = krealloc(sessions, max * sizeof(s), __GFP_ZERO);
		if (!sessions)
			return -ENOMEM;

		spin_lock(&ci->i_unsafe_lock);
		if (req1) {
			list_for_each_entry(req, &ci->i_unsafe_dirops,
					    r_unsafe_dir_item) {
				s = req->r_session;
				if (unlikely(s->s_mds > max)) {
					spin_unlock(&ci->i_unsafe_lock);
					goto retry;
				}
				if (!sessions[s->s_mds]) {
					s = ceph_get_mds_session(s);
					sessions[s->s_mds] = s;
				}
			}
		}
		if (req2) {
			list_for_each_entry(req, &ci->i_unsafe_iops,
					    r_unsafe_target_item) {
				s = req->r_session;
				if (unlikely(s->s_mds > max)) {
					spin_unlock(&ci->i_unsafe_lock);
					goto retry;
				}
				if (!sessions[s->s_mds]) {
					s = ceph_get_mds_session(s);
					sessions[s->s_mds] = s;
				}
			}
		}
		spin_unlock(&ci->i_unsafe_lock);

		/* the auth MDS */
		spin_lock(&ci->i_ceph_lock);
		if (ci->i_auth_cap) {
		      s = ci->i_auth_cap->session;
		      if (!sessions[s->s_mds])
			      sessions[s->s_mds] = ceph_get_mds_session(s);
		}
		spin_unlock(&ci->i_ceph_lock);

		/* send flush mdlog request to MDSes */
		for (i = 0; i < max; i++) {
			s = sessions[i];
			if (s) {
				send_flush_mdlog(s);
				ceph_put_mds_session(s);
			}
		}
		kfree(sessions);
	}

	dout("unsafe_request_wait %p wait on tid %llu %llu\n",
	     inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
	if (req1) {
@@ -3008,7 +3079,7 @@ static void __ceph_put_cap_refs(struct ceph_inode_info *ci, int had,
			}
			/* see comment in __ceph_remove_cap() */
			if (!__ceph_is_any_real_caps(ci) && ci->i_snap_realm)
				drop_inode_snap_realm(ci);
				ceph_change_snap_realm(inode, NULL);
		}
	}
	if (check_flushsnaps && __ceph_have_pending_cap_snap(ci)) {
@@ -3114,7 +3185,16 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
				break;
			}
		}
		BUG_ON(!found);

		if (!found) {
			/*
			 * The capsnap should already be removed when removing
			 * auth cap in the case of a forced unmount.
			 */
			WARN_ON_ONCE(ci->i_auth_cap);
			goto unlock;
		}

		capsnap->dirty_pages -= nr;
		if (capsnap->dirty_pages == 0) {
			complete_capsnap = true;
@@ -3136,6 +3216,7 @@ void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
		     complete_capsnap ? " (complete capsnap)" : "");
	}

unlock:
	spin_unlock(&ci->i_ceph_lock);

	if (last) {
@@ -3606,6 +3687,43 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
		iput(inode);
}

void __ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
			   bool *wake_ci, bool *wake_mdsc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
	bool ret;

	lockdep_assert_held(&ci->i_ceph_lock);

	dout("removing capsnap %p, inode %p ci %p\n", capsnap, inode, ci);

	list_del_init(&capsnap->ci_item);
	ret = __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);
	if (wake_ci)
		*wake_ci = ret;

	spin_lock(&mdsc->cap_dirty_lock);
	if (list_empty(&ci->i_cap_flush_list))
		list_del_init(&ci->i_flushing_item);

	ret = __detach_cap_flush_from_mdsc(mdsc, &capsnap->cap_flush);
	if (wake_mdsc)
		*wake_mdsc = ret;
	spin_unlock(&mdsc->cap_dirty_lock);
}

void ceph_remove_capsnap(struct inode *inode, struct ceph_cap_snap *capsnap,
			 bool *wake_ci, bool *wake_mdsc)
{
	struct ceph_inode_info *ci = ceph_inode(inode);

	lockdep_assert_held(&ci->i_ceph_lock);

	WARN_ON_ONCE(capsnap->dirty_pages || capsnap->writing);
	__ceph_remove_capsnap(inode, capsnap, wake_ci, wake_mdsc);
}

/*
 * Handle FLUSHSNAP_ACK.  MDS has flushed snap data to disk and we can
 * throw away our cap_snap.
@@ -3643,23 +3761,10 @@ static void handle_cap_flushsnap_ack(struct inode *inode, u64 flush_tid,
			     capsnap, capsnap->follows);
		}
	}
	if (flushed) {
		WARN_ON(capsnap->dirty_pages || capsnap->writing);
		dout(" removing %p cap_snap %p follows %lld\n",
		     inode, capsnap, follows);
		list_del(&capsnap->ci_item);
		wake_ci |= __detach_cap_flush_from_ci(ci, &capsnap->cap_flush);

		spin_lock(&mdsc->cap_dirty_lock);

		if (list_empty(&ci->i_cap_flush_list))
			list_del_init(&ci->i_flushing_item);

		wake_mdsc |= __detach_cap_flush_from_mdsc(mdsc,
							  &capsnap->cap_flush);
		spin_unlock(&mdsc->cap_dirty_lock);
	}
	if (flushed)
		ceph_remove_capsnap(inode, capsnap, &wake_ci, &wake_mdsc);
	spin_unlock(&ci->i_ceph_lock);

	if (flushed) {
		ceph_put_snap_context(capsnap->context);
		ceph_put_cap_snap(capsnap);
@@ -3743,7 +3848,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
		goto out_unlock;

	if (target < 0) {
		__ceph_remove_cap(cap, false);
		ceph_remove_cap(cap, false);
		goto out_unlock;
	}

@@ -3778,7 +3883,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
				change_auth_cap_ses(ci, tcap->session);
			}
		}
		__ceph_remove_cap(cap, false);
		ceph_remove_cap(cap, false);
		goto out_unlock;
	} else if (tsession) {
		/* add placeholder for the export tagert */
@@ -3795,7 +3900,7 @@ static void handle_cap_export(struct inode *inode, struct ceph_mds_caps *ex,
			spin_unlock(&mdsc->cap_dirty_lock);
		}

		__ceph_remove_cap(cap, false);
		ceph_remove_cap(cap, false);
		goto out_unlock;
	}

@@ -3906,7 +4011,7 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
					ocap->mseq, mds, le32_to_cpu(ph->seq),
					le32_to_cpu(ph->mseq));
		}
		__ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
		ceph_remove_cap(ocap, (ph->flags & CEPH_CAP_FLAG_RELEASE));
	}

	*old_issued = issued;
@@ -4134,8 +4239,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
done:
	mutex_unlock(&session->s_mutex);
done_unlocked:
	ceph_put_string(extra_info.pool_ns);
	iput(inode);
out:
	ceph_put_string(extra_info.pool_ns);
	return;

flush_cap_releases:
@@ -4150,7 +4256,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
bad:
	pr_err("ceph_handle_caps: corrupt message\n");
	ceph_msg_dump(msg);
	return;
	goto out;
}

/*
@@ -4225,33 +4331,9 @@ static void flush_dirty_session_caps(struct ceph_mds_session *s)
	dout("flush_dirty_caps done\n");
}

static void iterate_sessions(struct ceph_mds_client *mdsc,
			     void (*cb)(struct ceph_mds_session *))
{
	int mds;

	mutex_lock(&mdsc->mutex);
	for (mds = 0; mds < mdsc->max_sessions; ++mds) {
		struct ceph_mds_session *s;

		if (!mdsc->sessions[mds])
			continue;

		s = ceph_get_mds_session(mdsc->sessions[mds]);
		if (!s)
			continue;

		mutex_unlock(&mdsc->mutex);
		cb(s);
		ceph_put_mds_session(s);
		mutex_lock(&mdsc->mutex);
	}
	mutex_unlock(&mdsc->mutex);
}

void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
	iterate_sessions(mdsc, flush_dirty_session_caps);
	ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}

void __ceph_touch_fmode(struct ceph_inode_info *ci,
+17 −15
Original line number Diff line number Diff line
@@ -1722,32 +1722,26 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
		goto out;
	}

	err = file_remove_privs(file);
	if (err)
	down_read(&osdc->lock);
	map_flags = osdc->osdmap->flags;
	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
	up_read(&osdc->lock);
	if ((map_flags & CEPH_OSDMAP_FULL) ||
	    (pool_flags & CEPH_POOL_FLAG_FULL)) {
		err = -ENOSPC;
		goto out;
	}

	err = file_update_time(file);
	err = file_remove_privs(file);
	if (err)
		goto out;

	inode_inc_iversion_raw(inode);

	if (ci->i_inline_version != CEPH_INLINE_NONE) {
		err = ceph_uninline_data(file, NULL);
		if (err < 0)
			goto out;
	}

	down_read(&osdc->lock);
	map_flags = osdc->osdmap->flags;
	pool_flags = ceph_pg_pool_flags(osdc->osdmap, ci->i_layout.pool_id);
	up_read(&osdc->lock);
	if ((map_flags & CEPH_OSDMAP_FULL) ||
	    (pool_flags & CEPH_POOL_FLAG_FULL)) {
		err = -ENOSPC;
		goto out;
	}

	dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
	     inode, ceph_vinop(inode), pos, count, i_size_read(inode));
	if (fi->fmode & CEPH_FILE_MODE_LAZY)
@@ -1759,6 +1753,12 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
	if (err < 0)
		goto out;

	err = file_update_time(file);
	if (err)
		goto out_caps;

	inode_inc_iversion_raw(inode);

	dout("aio_write %p %llx.%llx %llu~%zd got cap refs on %s\n",
	     inode, ceph_vinop(inode), pos, count, ceph_cap_string(got));

@@ -1842,6 +1842,8 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
	}

	goto out_unlocked;
out_caps:
	ceph_put_cap_refs(ci, got);
out:
	if (direct_lock)
		ceph_end_io_direct(inode);
+2 −9
Original line number Diff line number Diff line
@@ -581,16 +581,9 @@ void ceph_evict_inode(struct inode *inode)
	 */
	if (ci->i_snap_realm) {
		if (ceph_snap(inode) == CEPH_NOSNAP) {
			struct ceph_snap_realm *realm = ci->i_snap_realm;
			dout(" dropping residual ref to snap realm %p\n",
			     realm);
			spin_lock(&realm->inodes_with_caps_lock);
			list_del_init(&ci->i_snap_realm_item);
			ci->i_snap_realm = NULL;
			if (realm->ino == ci->i_vino.ino)
				realm->inode = NULL;
			spin_unlock(&realm->inodes_with_caps_lock);
			ceph_put_snap_realm(mdsc, realm);
			     ci->i_snap_realm);
			ceph_change_snap_realm(inode, NULL);
		} else {
			ceph_put_snapid_map(mdsc, ci->i_snapid_map);
			ci->i_snap_realm = NULL;
Loading