Commit ebce3eb2 authored by Jeff Layton, committed by Ilya Dryomov
Browse files

ceph: fix inode number handling on arches with 32-bit ino_t

Tuan and Ulrich mentioned that they were hitting a problem on s390x,
which has a 32-bit ino_t value, even though it's a 64-bit arch (for
historical reasons).

I think the current handling of inode numbers in the ceph driver is
wrong. It tries to use 32-bit inode numbers on 32-bit arches, but that
isn't actually necessary: 32-bit arches can deal with 64-bit inode
numbers just fine when userland code is compiled with LFS support (the
common case these days).

What we really want to do is just use 64-bit numbers everywhere, unless
someone has mounted with the ino32 mount option. In that case, we want
to ensure that we hash the inode number down to something that will fit
in 32 bits before presenting the value to userland.

Add new helper functions that do this, and only do the conversion before
presenting these values to userland in getattr and readdir.

The inode table hashvalue is changed to just cast the inode number to
unsigned long, as low-order bits are the most likely to vary anyway.

While it's not strictly required, we do want to put something in
inode->i_ino. Instead of basing it on BITS_PER_LONG, however, base it on
the size of the ino_t type.

NOTE: This is a user-visible change on 32-bit arches:

1/ inode numbers will be seen to have changed between kernel versions.
   32-bit arches will see large inode numbers now instead of the hashed
   ones they saw before.

2/ any really old software not built with LFS support may start failing
   stat() calls with -EOVERFLOW on inode numbers >= 2^32. Nothing much we
   can do about these, but hopefully the intersection of people running
   such code on ceph will be very small.

The workaround for both problems is to mount with "-o ino32".

[ idryomov: changelog tweak ]

URL: https://tracker.ceph.com/issues/46828


Reported-by: Ulrich Weigand <Ulrich.Weigand@de.ibm.com>
Reported-and-Tested-by: Tuan Hoang1 <Tuan.Hoang1@ibm.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: "Yan, Zheng" <zyan@redhat.com>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent f062f025
Loading
Loading
Loading
Loading
+7 −7
Original line number Diff line number Diff line
@@ -887,8 +887,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
	int have = ci->i_snap_caps;

	if ((have & mask) == mask) {
		dout("__ceph_caps_issued_mask ino 0x%lx snap issued %s"
		     " (mask %s)\n", ci->vfs_inode.i_ino,
		dout("__ceph_caps_issued_mask ino 0x%llx snap issued %s"
		     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
		     ceph_cap_string(have),
		     ceph_cap_string(mask));
		return 1;
@@ -899,8 +899,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
		if (!__cap_is_valid(cap))
			continue;
		if ((cap->issued & mask) == mask) {
			dout("__ceph_caps_issued_mask ino 0x%lx cap %p issued %s"
			     " (mask %s)\n", ci->vfs_inode.i_ino, cap,
			dout("__ceph_caps_issued_mask ino 0x%llx cap %p issued %s"
			     " (mask %s)\n", ceph_ino(&ci->vfs_inode), cap,
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch)
@@ -911,8 +911,8 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
		/* does a combination of caps satisfy mask? */
		have |= cap->issued;
		if ((have & mask) == mask) {
			dout("__ceph_caps_issued_mask ino 0x%lx combo issued %s"
			     " (mask %s)\n", ci->vfs_inode.i_ino,
			dout("__ceph_caps_issued_mask ino 0x%llx combo issued %s"
			     " (mask %s)\n", ceph_ino(&ci->vfs_inode),
			     ceph_cap_string(cap->issued),
			     ceph_cap_string(mask));
			if (touch) {
@@ -2872,7 +2872,7 @@ int ceph_get_caps(struct file *filp, int need, int want,
			struct cap_wait cw;
			DEFINE_WAIT_FUNC(wait, woken_wake_function);

			cw.ino = inode->i_ino;
			cw.ino = ceph_ino(inode);
			cw.tgid = current->tgid;
			cw.need = need;
			cw.want = want;
+2 −2
Original line number Diff line number Diff line
@@ -202,7 +202,7 @@ static int caps_show_cb(struct inode *inode, struct ceph_cap *cap, void *p)
{
	struct seq_file *s = p;

	seq_printf(s, "0x%-17lx%-17s%-17s\n", inode->i_ino,
	seq_printf(s, "0x%-17llx%-17s%-17s\n", ceph_ino(inode),
		   ceph_cap_string(cap->issued),
		   ceph_cap_string(cap->implemented));
	return 0;
@@ -247,7 +247,7 @@ static int caps_show(struct seq_file *s, void *p)

	spin_lock(&mdsc->caps_list_lock);
	list_for_each_entry(cw, &mdsc->cap_wait_list, list) {
		seq_printf(s, "%-13d0x%-17lx%-17s%-17s\n", cw->tgid, cw->ino,
		seq_printf(s, "%-13d0x%-17llx%-17s%-17s\n", cw->tgid, cw->ino,
				ceph_cap_string(cw->need),
				ceph_cap_string(cw->want));
	}
+13 −18
Original line number Diff line number Diff line
@@ -259,9 +259,7 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
			     dentry, dentry, d_inode(dentry));
			ctx->pos = di->offset;
			if (!dir_emit(ctx, dentry->d_name.name,
				      dentry->d_name.len,
				      ceph_translate_ino(dentry->d_sb,
							 d_inode(dentry)->i_ino),
				      dentry->d_name.len, ceph_present_inode(d_inode(dentry)),
				      d_inode(dentry)->i_mode >> 12)) {
				dput(dentry);
				err = 0;
@@ -324,18 +322,21 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	/* always start with . and .. */
	if (ctx->pos == 0) {
		dout("readdir off 0 -> '.'\n");
		if (!dir_emit(ctx, ".", 1, 
			    ceph_translate_ino(inode->i_sb, inode->i_ino),
		if (!dir_emit(ctx, ".", 1, ceph_present_inode(inode),
			    inode->i_mode >> 12))
			return 0;
		ctx->pos = 1;
	}
	if (ctx->pos == 1) {
		ino_t ino = parent_ino(file->f_path.dentry);
		u64 ino;
		struct dentry *dentry = file->f_path.dentry;

		spin_lock(&dentry->d_lock);
		ino = ceph_present_inode(dentry->d_parent->d_inode);
		spin_unlock(&dentry->d_lock);

		dout("readdir off 1 -> '..'\n");
		if (!dir_emit(ctx, "..", 2,
			    ceph_translate_ino(inode->i_sb, ino),
			    inode->i_mode >> 12))
		if (!dir_emit(ctx, "..", 2, ino, inode->i_mode >> 12))
			return 0;
		ctx->pos = 2;
	}
@@ -507,9 +508,6 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
	}
	for (; i < rinfo->dir_nr; i++) {
		struct ceph_mds_reply_dir_entry *rde = rinfo->dir_entries + i;
		struct ceph_vino vino;
		ino_t ino;
		u32 ftype;

		BUG_ON(rde->offset < ctx->pos);

@@ -519,13 +517,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
		     rde->name_len, rde->name, &rde->inode.in);

		BUG_ON(!rde->inode.in);
		ftype = le32_to_cpu(rde->inode.in->mode) >> 12;
		vino.ino = le64_to_cpu(rde->inode.in->ino);
		vino.snap = le64_to_cpu(rde->inode.in->snapid);
		ino = ceph_vino_to_ino(vino);

		if (!dir_emit(ctx, rde->name, rde->name_len,
			      ceph_translate_ino(inode->i_sb, ino), ftype)) {
			      ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
			      le32_to_cpu(rde->inode.in->mode) >> 12)) {
			dout("filldir stopping us...\n");
			return 0;
		}
@@ -1161,7 +1156,7 @@ static int ceph_unlink(struct inode *dir, struct dentry *dentry)

	if (try_async && op == CEPH_MDS_OP_UNLINK &&
	    (req->r_dir_caps = get_caps_for_async_unlink(dir, dentry))) {
		dout("async unlink on %lu/%.*s caps=%s", dir->i_ino,
		dout("async unlink on %llu/%.*s caps=%s", ceph_ino(dir),
		     dentry->d_name.len, dentry->d_name.name,
		     ceph_cap_string(req->r_dir_caps));
		set_bit(CEPH_MDS_R_ASYNC, &req->r_req_flags);
+2 −2
Original line number Diff line number Diff line
@@ -630,8 +630,8 @@ static int ceph_finish_async_create(struct inode *dir, struct dentry *dentry,
	} else {
		struct dentry *dn;

		dout("%s d_adding new inode 0x%llx to 0x%lx/%s\n", __func__,
			vino.ino, dir->i_ino, dentry->d_name.name);
		dout("%s d_adding new inode 0x%llx to 0x%llx/%s\n", __func__,
			vino.ino, ceph_ino(dir), dentry->d_name.name);
		ceph_dir_clear_ordered(dir);
		ceph_init_inode_acls(inode, as_ctx);
		if (inode->i_state & I_NEW) {
+9 −10
Original line number Diff line number Diff line
@@ -41,8 +41,10 @@ static void ceph_inode_work(struct work_struct *work);
 */
static int ceph_set_ino_cb(struct inode *inode, void *data)
{
	ceph_inode(inode)->i_vino = *(struct ceph_vino *)data;
	inode->i_ino = ceph_vino_to_ino(*(struct ceph_vino *)data);
	struct ceph_inode_info *ci = ceph_inode(inode);

	ci->i_vino = *(struct ceph_vino *)data;
	inode->i_ino = ceph_vino_to_ino_t(ci->i_vino);
	inode_set_iversion_raw(inode, 0);
	return 0;
}
@@ -50,17 +52,14 @@ static int ceph_set_ino_cb(struct inode *inode, void *data)
struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
{
	struct inode *inode;
	ino_t t = ceph_vino_to_ino(vino);

	inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
	inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare,
			     ceph_set_ino_cb, &vino);
	if (!inode)
		return ERR_PTR(-ENOMEM);
	if (inode->i_state & I_NEW)
		dout("get_inode created new inode %p %llx.%llx ino %llx\n",
		     inode, ceph_vinop(inode), (u64)inode->i_ino);

	dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
	     vino.snap, inode);
	dout("get_inode on %llu=%llx.%llx got %p new %d\n", ceph_present_inode(inode),
	     ceph_vinop(inode), inode, !!(inode->i_state & I_NEW));
	return inode;
}

@@ -2378,7 +2377,7 @@ int ceph_getattr(const struct path *path, struct kstat *stat,
	}

	generic_fillattr(inode, stat);
	stat->ino = ceph_translate_ino(inode->i_sb, inode->i_ino);
	stat->ino = ceph_present_inode(inode);

	/*
	 * btime on newly-allocated inodes is 0, so if this is still set to
Loading