Commit 555dbf1a authored by Trond Myklebust, committed by Chuck Lever
Browse files

nfsd: Replace use of rwsem with errseq_t



The nfsd_file nf_rwsem is currently being used to separate file write
and commit instances to ensure that we catch errors and apply them to
the correct write/commit.
We can improve scalability at the expense of a little accuracy (some
extra false positives) by replacing the nf_rwsem with more careful
use of the errseq_t mechanism to track errors across the different
operations.

Signed-off-by: Trond Myklebust <trond.myklebust@hammerspace.com>
Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
[ cel: rebased on zero-verifier fix ]
parent f11ad7aa
Loading
Loading
Loading
Loading
+0 −1
Original line number Original line Diff line number Diff line
@@ -189,7 +189,6 @@ nfsd_file_alloc(struct inode *inode, unsigned int may, unsigned int hashval,
				__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
				__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
		}
		}
		nf->nf_mark = NULL;
		nf->nf_mark = NULL;
		init_rwsem(&nf->nf_rwsem);
		trace_nfsd_file_alloc(nf);
		trace_nfsd_file_alloc(nf);
	}
	}
	return nf;
	return nf;
+0 −1
Original line number Original line Diff line number Diff line
@@ -46,7 +46,6 @@ struct nfsd_file {
	refcount_t		nf_ref;
	refcount_t		nf_ref;
	unsigned char		nf_may;
	unsigned char		nf_may;
	struct nfsd_file_mark	*nf_mark;
	struct nfsd_file_mark	*nf_mark;
	struct rw_semaphore	nf_rwsem;
};
};


int nfsd_file_cache_init(void);
int nfsd_file_cache_init(void);
+9 −7
Original line number Original line Diff line number Diff line
@@ -1510,6 +1510,9 @@ static void nfsd4_init_copy_res(struct nfsd4_copy *copy, bool sync)


static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
{
{
	struct file *dst = copy->nf_dst->nf_file;
	struct file *src = copy->nf_src->nf_file;
	errseq_t since;
	ssize_t bytes_copied = 0;
	ssize_t bytes_copied = 0;
	u64 bytes_total = copy->cp_count;
	u64 bytes_total = copy->cp_count;
	u64 src_pos = copy->cp_src_pos;
	u64 src_pos = copy->cp_src_pos;
@@ -1522,8 +1525,7 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
	do {
	do {
		if (kthread_should_stop())
		if (kthread_should_stop())
			break;
			break;
		bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
		bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
				src_pos, copy->nf_dst->nf_file, dst_pos,
						    bytes_total);
						    bytes_total);
		if (bytes_copied <= 0)
		if (bytes_copied <= 0)
			break;
			break;
@@ -1534,11 +1536,11 @@ static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
	} while (bytes_total > 0 && !copy->cp_synchronous);
	} while (bytes_total > 0 && !copy->cp_synchronous);
	/* for a non-zero asynchronous copy do a commit of data */
	/* for a non-zero asynchronous copy do a commit of data */
	if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) {
	if (!copy->cp_synchronous && copy->cp_res.wr_bytes_written > 0) {
		down_write(&copy->nf_dst->nf_rwsem);
		since = READ_ONCE(dst->f_wb_err);
		status = vfs_fsync_range(copy->nf_dst->nf_file,
		status = vfs_fsync_range(dst, copy->cp_dst_pos,
					 copy->cp_dst_pos,
					 copy->cp_res.wr_bytes_written, 0);
					 copy->cp_res.wr_bytes_written, 0);
		up_write(&copy->nf_dst->nf_rwsem);
		if (!status)
			status = filemap_check_wb_err(dst->f_mapping, since);
		if (!status)
		if (!status)
			copy->committed = true;
			copy->committed = true;
	}
	}
+15 −25
Original line number Original line Diff line number Diff line
@@ -522,10 +522,11 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
{
{
	struct file *src = nf_src->nf_file;
	struct file *src = nf_src->nf_file;
	struct file *dst = nf_dst->nf_file;
	struct file *dst = nf_dst->nf_file;
	errseq_t since;
	loff_t cloned;
	loff_t cloned;
	__be32 ret = 0;
	__be32 ret = 0;


	down_write(&nf_dst->nf_rwsem);
	since = READ_ONCE(dst->f_wb_err);
	cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
	cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
	if (cloned < 0) {
	if (cloned < 0) {
		ret = nfserrno(cloned);
		ret = nfserrno(cloned);
@@ -539,6 +540,8 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
		loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX;
		loff_t dst_end = count ? dst_pos + count - 1 : LLONG_MAX;
		int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
		int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);


		if (!status)
			status = filemap_check_wb_err(dst->f_mapping, since);
		if (!status)
		if (!status)
			status = commit_inode_metadata(file_inode(src));
			status = commit_inode_metadata(file_inode(src));
		if (status < 0) {
		if (status < 0) {
@@ -548,7 +551,6 @@ __be32 nfsd4_clone_file_range(struct nfsd_file *nf_src, u64 src_pos,
		}
		}
	}
	}
out_err:
out_err:
	up_write(&nf_dst->nf_rwsem);
	return ret;
	return ret;
}
}


@@ -956,6 +958,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
	struct super_block	*sb = file_inode(file)->i_sb;
	struct super_block	*sb = file_inode(file)->i_sb;
	struct svc_export	*exp;
	struct svc_export	*exp;
	struct iov_iter		iter;
	struct iov_iter		iter;
	errseq_t		since;
	__be32			nfserr;
	__be32			nfserr;
	int			host_err;
	int			host_err;
	int			use_wgather;
	int			use_wgather;
@@ -993,8 +996,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
		flags |= RWF_SYNC;
		flags |= RWF_SYNC;


	iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
	iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
	since = READ_ONCE(file->f_wb_err);
	if (flags & RWF_SYNC) {
	if (flags & RWF_SYNC) {
		down_write(&nf->nf_rwsem);
		if (verf)
		if (verf)
			nfsd_copy_boot_verifier(verf,
			nfsd_copy_boot_verifier(verf,
					net_generic(SVC_NET(rqstp),
					net_generic(SVC_NET(rqstp),
@@ -1003,15 +1006,12 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
		if (host_err < 0)
		if (host_err < 0)
			nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
			nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
						 nfsd_net_id));
						 nfsd_net_id));
		up_write(&nf->nf_rwsem);
	} else {
	} else {
		down_read(&nf->nf_rwsem);
		if (verf)
		if (verf)
			nfsd_copy_boot_verifier(verf,
			nfsd_copy_boot_verifier(verf,
					net_generic(SVC_NET(rqstp),
					net_generic(SVC_NET(rqstp),
					nfsd_net_id));
					nfsd_net_id));
		host_err = vfs_iter_write(file, &iter, &pos, flags);
		host_err = vfs_iter_write(file, &iter, &pos, flags);
		up_read(&nf->nf_rwsem);
	}
	}
	if (host_err < 0) {
	if (host_err < 0) {
		nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
		nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
@@ -1021,6 +1021,9 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
	*cnt = host_err;
	*cnt = host_err;
	nfsd_stats_io_write_add(exp, *cnt);
	nfsd_stats_io_write_add(exp, *cnt);
	fsnotify_modify(file);
	fsnotify_modify(file);
	host_err = filemap_check_wb_err(file->f_mapping, since);
	if (host_err < 0)
		goto out_nfserr;


	if (stable && use_wgather) {
	if (stable && use_wgather) {
		host_err = wait_for_concurrent_writes(file);
		host_err = wait_for_concurrent_writes(file);
@@ -1101,19 +1104,6 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
}
}


#ifdef CONFIG_NFSD_V3
#ifdef CONFIG_NFSD_V3
/*
 * Call filemap_fdatawrite_range() and then
 * filemap_fdatawait_range_keep_errors() on the mapping of nf->nf_file,
 * both with the same offset and end arguments.
 *
 * Returns the result of filemap_fdatawrite_range() when it is non-zero
 * (the wait step is then skipped); otherwise returns 0. The return value
 * of the wait step is not examined here.
 *
 * NOTE(review): the "_keep_errors" suffix suggests writeback errors are
 * left recorded for a later check rather than consumed here - confirm
 * against the helper's definition.
 */
static int
nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset,
				  loff_t end)
{
	struct address_space *mapping = nf->nf_file->f_mapping;
	int ret = filemap_fdatawrite_range(mapping, offset, end);

	/* Non-zero from the write step: hand it straight back to the caller. */
	if (ret)
		return ret;
	/* Result deliberately not captured; this path always reports 0. */
	filemap_fdatawait_range_keep_errors(mapping, offset, end);
	return 0;
}

/*
/*
 * Commit all pending writes to stable storage.
 * Commit all pending writes to stable storage.
 *
 *
@@ -1144,25 +1134,25 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp,
	if (err)
	if (err)
		goto out;
		goto out;
	if (EX_ISSYNC(fhp->fh_export)) {
	if (EX_ISSYNC(fhp->fh_export)) {
		int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end);
		errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
		int err2;


		down_write(&nf->nf_rwsem);
		if (!err2)
		err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
		err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
		switch (err2) {
		switch (err2) {
		case 0:
		case 0:
			nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
			nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
						nfsd_net_id));
						nfsd_net_id));
			err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
						    since);
			break;
			break;
		case -EINVAL:
		case -EINVAL:
			err = nfserr_notsupp;
			err = nfserr_notsupp;
			break;
			break;
		default:
		default:
			err = nfserrno(err2);
			nfsd_reset_boot_verifier(net_generic(nf->nf_net,
			nfsd_reset_boot_verifier(net_generic(nf->nf_net,
						 nfsd_net_id));
						 nfsd_net_id));
		}
		}
		up_write(&nf->nf_rwsem);
		err = nfserrno(err2);
	} else
	} else
		nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
		nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
					nfsd_net_id));
					nfsd_net_id));