Commit 7ba2090c authored by Linus Torvalds's avatar Linus Torvalds
Browse files

Merge tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "Mixed with some fixes and cleanups, this brings in reasonably complete
  fscrypt support to CephFS! The list of things which don't work with
  encryption should be fairly short, mostly around the edges: fallocate
  (not supported well in CephFS to begin with), copy_file_range
  (requires re-encryption), non-default striping patterns.

  This was a multi-year effort principally by Jeff Layton with
  assistance from Xiubo Li, Luís Henriques and others, including several
  dependent changes in the MDS, netfs helper library and fscrypt
  framework itself"

* tag 'ceph-for-6.6-rc1' of https://github.com/ceph/ceph-client: (53 commits)
  ceph: make num_fwd and num_retry to __u32
  ceph: make members in struct ceph_mds_request_args_ext a union
  rbd: use list_for_each_entry() helper
  libceph: do not include crypto/algapi.h
  ceph: switch ceph_lookup/atomic_open() to use new fscrypt helper
  ceph: fix updating i_truncate_pagecache_size for fscrypt
  ceph: wait for OSD requests' callbacks to finish when unmounting
  ceph: drop messages from MDS when unmounting
  ceph: update documentation regarding snapshot naming limitations
  ceph: prevent snapshot creation in encrypted locked directories
  ceph: add support for encrypted snapshot names
  ceph: invalidate pages when doing direct/sync writes
  ceph: plumb in decryption during reads
  ceph: add encryption support to writepage and writepages
  ceph: add read/modify/write to ceph_sync_write
  ceph: align data in pages in ceph_sync_write
  ceph: don't use special DIO path for encrypted inodes
  ceph: add truncate size handling support for fscrypt
  ceph: add object version support for sync read
  libceph: allow ceph_osdc_new_request to accept a multi-op read
  ...
parents 744a7594 ce0d5bd3
Loading
Loading
Loading
Loading
+10 −0
Original line number Diff line number Diff line
@@ -57,6 +57,16 @@ a snapshot on any subdirectory (and its nested contents) in the
system.  Snapshot creation and deletion are as simple as 'mkdir
.snap/foo' and 'rmdir .snap/foo'.

Snapshot names have two limitations:

* They cannot start with an underscore ('_'), as these names are reserved
  for internal usage by the MDS.
* They cannot exceed 240 characters in size.  This is because the MDS makes
  use of long snapshot names internally, which follow the format:
  `_<SNAPSHOT-NAME>_<INODE-NUMBER>`.  Since filenames in general can't have
  more than 255 characters, and `<INODE-NUMBER>` takes 13 characters, the
  user-visible snapshot name can take at most 255 - 1 - 1 - 13 = 240
  characters.

Ceph also provides some recursive accounting on directories for nested
files and bytes.  That is, a 'getfattr -d foo' on any directory in the
system will reveal the total number of nested regular files and
+1 −3
Original line number Diff line number Diff line
@@ -7199,7 +7199,6 @@ static void rbd_dev_remove_parent(struct rbd_device *rbd_dev)
static ssize_t do_rbd_remove(const char *buf, size_t count)
{
	struct rbd_device *rbd_dev = NULL;
	struct list_head *tmp;
	int dev_id;
	char opt_buf[6];
	bool force = false;
@@ -7226,8 +7225,7 @@ static ssize_t do_rbd_remove(const char *buf, size_t count)

	ret = -ENOENT;
	spin_lock(&rbd_dev_list_lock);
	list_for_each(tmp, &rbd_dev_list) {
		rbd_dev = list_entry(tmp, struct rbd_device, node);
	list_for_each_entry(rbd_dev, &rbd_dev_list, node) {
		if (rbd_dev->dev_id == dev_id) {
			ret = 0;
			break;
+1 −0
Original line number Diff line number Diff line
@@ -12,3 +12,4 @@ ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \

ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
ceph-$(CONFIG_FS_ENCRYPTION) += crypto.o
+2 −2
Original line number Diff line number Diff line
@@ -140,7 +140,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
		newattrs.ia_ctime = current_time(inode);
		newattrs.ia_mode = new_mode;
		newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
		ret = __ceph_setattr(inode, &newattrs);
		ret = __ceph_setattr(inode, &newattrs, NULL);
		if (ret)
			goto out_free;
	}
@@ -151,7 +151,7 @@ int ceph_set_acl(struct mnt_idmap *idmap, struct dentry *dentry,
			newattrs.ia_ctime = old_ctime;
			newattrs.ia_mode = old_mode;
			newattrs.ia_valid = ATTR_MODE | ATTR_CTIME;
			__ceph_setattr(inode, &newattrs);
			__ceph_setattr(inode, &newattrs, NULL);
		}
		goto out_free;
	}
+155 −41
Original line number Diff line number Diff line
@@ -18,6 +18,7 @@
#include "mds_client.h"
#include "cache.h"
#include "metric.h"
#include "crypto.h"
#include <linux/ceph/osd_client.h>
#include <linux/ceph/striper.h>

@@ -242,11 +243,13 @@ static bool ceph_netfs_clamp_length(struct netfs_io_subrequest *subreq)

static void finish_netfs_read(struct ceph_osd_request *req)
{
	struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode);
	struct inode *inode = req->r_inode;
	struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
	struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0);
	struct netfs_io_subrequest *subreq = req->r_priv;
	int num_pages;
	struct ceph_osd_req_op *op = &req->r_ops[0];
	int err = req->r_result;
	bool sparse = (op->op == CEPH_OSD_OP_SPARSE_READ);

	ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency,
				 req->r_end_latency, osd_data->length, err);
@@ -260,14 +263,29 @@ static void finish_netfs_read(struct ceph_osd_request *req)
	else if (err == -EBLOCKLISTED)
		fsc->blocklisted = true;

	if (err >= 0 && err < subreq->len)
	if (err >= 0) {
		if (sparse && err > 0)
			err = ceph_sparse_ext_map_end(op);
		if (err < subreq->len)
			__set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
		if (IS_ENCRYPTED(inode) && err > 0) {
			err = ceph_fscrypt_decrypt_extents(inode,
					osd_data->pages, subreq->start,
					op->extent.sparse_ext,
					op->extent.sparse_ext_cnt);
			if (err > subreq->len)
				err = subreq->len;
		}
	}

	if (osd_data->type == CEPH_OSD_DATA_TYPE_PAGES) {
		ceph_put_page_vector(osd_data->pages,
				     calc_pages_for(osd_data->alignment,
					osd_data->length), false);
	}
	netfs_subreq_terminated(subreq, err, false);

	num_pages = calc_pages_for(osd_data->alignment, osd_data->length);
	ceph_put_page_vector(osd_data->pages, num_pages, false);
	iput(req->r_inode);
	ceph_dec_osd_stopping_blocker(fsc->mdsc);
}

static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq)
@@ -334,10 +352,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
	struct ceph_osd_request *req = NULL;
	struct ceph_vino vino = ceph_vino(inode);
	struct iov_iter iter;
	struct page **pages;
	size_t page_off;
	int err = 0;
	u64 len = subreq->len;
	bool sparse = IS_ENCRYPTED(inode) || ceph_test_mount_opt(fsc, SPARSEREAD);
	u64 off = subreq->start;

	if (ceph_inode_is_shutdown(inode)) {
		err = -EIO;
@@ -347,8 +365,10 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
	if (ceph_has_inline_data(ci) && ceph_netfs_issue_op_inline(subreq))
		return;

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len,
			0, 1, CEPH_OSD_OP_READ,
	ceph_fscrypt_adjust_off_and_len(inode, &off, &len);

	req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino,
			off, &len, 0, 1, sparse ? CEPH_OSD_OP_SPARSE_READ : CEPH_OSD_OP_READ,
			CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica,
			NULL, ci->i_truncate_seq, ci->i_truncate_size, false);
	if (IS_ERR(req)) {
@@ -357,11 +377,31 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
		goto out;
	}

	if (sparse) {
		err = ceph_alloc_sparse_ext_map(&req->r_ops[0]);
		if (err)
			goto out;
	}

	dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len);

	iov_iter_xarray(&iter, ITER_DEST, &rreq->mapping->i_pages, subreq->start, len);

	/*
	 * FIXME: For now, use CEPH_OSD_DATA_TYPE_PAGES instead of _ITER for
	 * encrypted inodes. We'd need infrastructure that handles an iov_iter
	 * instead of page arrays, and we don't have that as of yet. Once the
	 * dust settles on the write helpers and encrypt/decrypt routines for
	 * netfs, we should be able to rework this.
	 */
	if (IS_ENCRYPTED(inode)) {
		struct page **pages;
		size_t page_off;

		err = iov_iter_get_pages_alloc2(&iter, &pages, len, &page_off);
		if (err < 0) {
		dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err);
			dout("%s: iov_ter_get_pages_alloc returned %d\n",
			     __func__, err);
			goto out;
		}

@@ -370,7 +410,15 @@ static void ceph_netfs_issue_read(struct netfs_io_subrequest *subreq)
		len = err;
		err = 0;

	osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false);
		osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false,
						 false);
	} else {
		osd_req_op_extent_osd_iter(req, 0, &iter);
	}
	if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
		err = -EIO;
		goto out;
	}
	req->r_callback = finish_netfs_read;
	req->r_priv = subreq;
	req->r_inode = inode;
@@ -571,10 +619,12 @@ static u64 get_writepages_data_length(struct inode *inode,
				      struct page *page, u64 start)
{
	struct ceph_inode_info *ci = ceph_inode(inode);
	struct ceph_snap_context *snapc = page_snap_context(page);
	struct ceph_snap_context *snapc;
	struct ceph_cap_snap *capsnap = NULL;
	u64 end = i_size_read(inode);
	u64 ret;

	snapc = page_snap_context(ceph_fscrypt_pagecache_page(page));
	if (snapc != ci->i_head_snapc) {
		bool found = false;
		spin_lock(&ci->i_ceph_lock);
@@ -589,9 +639,12 @@ static u64 get_writepages_data_length(struct inode *inode,
		spin_unlock(&ci->i_ceph_lock);
		WARN_ON(!found);
	}
	if (end > page_offset(page) + thp_size(page))
		end = page_offset(page) + thp_size(page);
	return end > start ? end - start : 0;
	if (end > ceph_fscrypt_page_offset(page) + thp_size(page))
		end = ceph_fscrypt_page_offset(page) + thp_size(page);
	ret = end > start ? end - start : 0;
	if (ret && fscrypt_is_bounce_page(page))
		ret = round_up(ret, CEPH_FSCRYPT_BLOCK_SIZE);
	return ret;
}

/*
@@ -610,10 +663,12 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
	loff_t page_off = page_offset(page);
	int err;
	loff_t len = thp_size(page);
	loff_t wlen;
	struct ceph_writeback_ctl ceph_wbc;
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	struct ceph_osd_request *req;
	bool caching = ceph_is_cache_enabled(inode);
	struct page *bounce_page = NULL;

	dout("writepage %p idx %lu\n", page, page->index);

@@ -649,31 +704,51 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
	if (ceph_wbc.i_size < page_off + len)
		len = ceph_wbc.i_size - page_off;

	wlen = IS_ENCRYPTED(inode) ? round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len;
	dout("writepage %p page %p index %lu on %llu~%llu snapc %p seq %lld\n",
	     inode, page, page->index, page_off, len, snapc, snapc->seq);
	     inode, page, page->index, page_off, wlen, snapc, snapc->seq);

	if (atomic_long_inc_return(&fsc->writeback_count) >
	    CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb))
		fsc->write_congested = true;

	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode), page_off, &len, 0, 1,
				    CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, snapc,
				    ceph_wbc.truncate_seq, ceph_wbc.truncate_size,
				    true);
	req = ceph_osdc_new_request(osdc, &ci->i_layout, ceph_vino(inode),
				    page_off, &wlen, 0, 1, CEPH_OSD_OP_WRITE,
				    CEPH_OSD_FLAG_WRITE, snapc,
				    ceph_wbc.truncate_seq,
				    ceph_wbc.truncate_size, true);
	if (IS_ERR(req)) {
		redirty_page_for_writepage(wbc, page);
		return PTR_ERR(req);
	}

	if (wlen < len)
		len = wlen;

	set_page_writeback(page);
	if (caching)
		ceph_set_page_fscache(page);
	ceph_fscache_write_to_cache(inode, page_off, len, caching);

	if (IS_ENCRYPTED(inode)) {
		bounce_page = fscrypt_encrypt_pagecache_blocks(page,
						    CEPH_FSCRYPT_BLOCK_SIZE, 0,
						    GFP_NOFS);
		if (IS_ERR(bounce_page)) {
			redirty_page_for_writepage(wbc, page);
			end_page_writeback(page);
			ceph_osdc_put_request(req);
			return PTR_ERR(bounce_page);
		}
	}

	/* it may be a short write due to an object boundary */
	WARN_ON_ONCE(len > thp_size(page));
	osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false);
	dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len);
	osd_req_op_extent_osd_data_pages(req, 0,
			bounce_page ? &bounce_page : &page, wlen, 0,
			false, false);
	dout("writepage %llu~%llu (%llu bytes, %sencrypted)\n",
	     page_off, len, wlen, IS_ENCRYPTED(inode) ? "" : "not ");

	req->r_mtime = inode->i_mtime;
	ceph_osdc_start_request(osdc, req);
@@ -681,7 +756,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)

	ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency,
				  req->r_end_latency, len, err);

	fscrypt_free_bounce_page(bounce_page);
	ceph_osdc_put_request(req);
	if (err == 0)
		err = len;
@@ -800,6 +875,11 @@ static void writepages_finish(struct ceph_osd_request *req)
		total_pages += num_pages;
		for (j = 0; j < num_pages; j++) {
			page = osd_data->pages[j];
			if (fscrypt_is_bounce_page(page)) {
				page = fscrypt_pagecache_page(page);
				fscrypt_free_bounce_page(osd_data->pages[j]);
				osd_data->pages[j] = page;
			}
			BUG_ON(!page);
			WARN_ON(!PageUptodate(page));

@@ -835,6 +915,7 @@ static void writepages_finish(struct ceph_osd_request *req)
	else
		kfree(osd_data->pages);
	ceph_osdc_put_request(req);
	ceph_dec_osd_stopping_blocker(fsc->mdsc);
}

/*
@@ -1070,9 +1151,28 @@ static int ceph_writepages_start(struct address_space *mapping,
				    fsc->mount_options->congestion_kb))
				fsc->write_congested = true;

			if (IS_ENCRYPTED(inode)) {
				pages[locked_pages] =
					fscrypt_encrypt_pagecache_blocks(page,
						PAGE_SIZE, 0,
						locked_pages ? GFP_NOWAIT : GFP_NOFS);
				if (IS_ERR(pages[locked_pages])) {
					if (PTR_ERR(pages[locked_pages]) == -EINVAL)
						pr_err("%s: inode->i_blkbits=%hhu\n",
							__func__, inode->i_blkbits);
					/* better not fail on first page! */
					BUG_ON(locked_pages == 0);
					pages[locked_pages] = NULL;
					redirty_page_for_writepage(wbc, page);
					unlock_page(page);
					break;
				}
				++locked_pages;
			} else {
				pages[locked_pages++] = page;
			fbatch.folios[i] = NULL;
			}

			fbatch.folios[i] = NULL;
			len += thp_size(page);
		}

@@ -1100,7 +1200,7 @@ static int ceph_writepages_start(struct address_space *mapping,
		}

new_request:
		offset = page_offset(pages[0]);
		offset = ceph_fscrypt_page_offset(pages[0]);
		len = wsize;

		req = ceph_osdc_new_request(&fsc->client->osdc,
@@ -1121,9 +1221,13 @@ static int ceph_writepages_start(struct address_space *mapping,
						ceph_wbc.truncate_size, true);
			BUG_ON(IS_ERR(req));
		}
		BUG_ON(len < page_offset(pages[locked_pages - 1]) +
			     thp_size(page) - offset);
		BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) +
			     thp_size(pages[locked_pages - 1]) - offset);

		if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) {
			rc = -EIO;
			goto release_folios;
		}
		req->r_callback = writepages_finish;
		req->r_inode = inode;

@@ -1132,7 +1236,9 @@ static int ceph_writepages_start(struct address_space *mapping,
		data_pages = pages;
		op_idx = 0;
		for (i = 0; i < locked_pages; i++) {
			u64 cur_offset = page_offset(pages[i]);
			struct page *page = ceph_fscrypt_pagecache_page(pages[i]);

			u64 cur_offset = page_offset(page);
			/*
			 * Discontinuity in page range? Ceph can handle that by just passing
			 * multiple extents in the write op.
@@ -1161,9 +1267,9 @@ static int ceph_writepages_start(struct address_space *mapping,
				op_idx++;
			}

			set_page_writeback(pages[i]);
			set_page_writeback(page);
			if (caching)
				ceph_set_page_fscache(pages[i]);
				ceph_set_page_fscache(page);
			len += thp_size(page);
		}
		ceph_fscache_write_to_cache(inode, offset, len, caching);
@@ -1179,8 +1285,16 @@ static int ceph_writepages_start(struct address_space *mapping,
							 offset);
			len = max(len, min_len);
		}
		if (IS_ENCRYPTED(inode))
			len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE);

		dout("writepages got pages at %llu~%llu\n", offset, len);

		if (IS_ENCRYPTED(inode) &&
		    ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK))
			pr_warn("%s: bad encrypted write offset=%lld len=%llu\n",
				__func__, offset, len);

		osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
						 0, from_pool, false);
		osd_req_op_extent_update(req, op_idx, len);
Loading