Commit 9631abdb authored by Mariusz Tkaczyk's avatar Mariusz Tkaczyk Committed by Song Liu
Browse files

md: Set MD_BROKEN for RAID1 and RAID10



There is no direct mechanism to determine raid failure outside
personality. It is done by checking rdev->flags after executing
md_error(). If "faulty" flag is not set then -EBUSY is returned to
userspace. -EBUSY means that array will be failed after drive removal.

Mdadm has special routine to handle the array failure and it is executed
if -EBUSY is returned by md.

There are at least two known reasons to not consider this mechanism
as correct:
1. drive can be removed even if array will be failed[1].
2. -EBUSY seems to be wrong status. Array is not busy, but removal
   process cannot proceed safe.

-EBUSY expectation cannot be removed without breaking compatibility
with userspace. In this patch first issue is resolved by adding support
for MD_BROKEN flag for RAID1 and RAID10. Support for RAID456 is added in
next commit.

The idea is to set the MD_BROKEN if we are sure that raid is in failed
state now. This is done in each error_handler(). In md_error() MD_BROKEN
flag is checked. If is set, then -EBUSY is returned to userspace.

As in previous commit, it causes that #mdadm --set-faulty is able to
fail array. Previously proposed workaround is valid if optional
functionality[1] is disabled.

[1] commit 9a567843("md: allow last device to be forcibly removed from
    RAID1/RAID10.")

Reviewd-by: default avatarXiao Ni <xni@redhat.com>
Signed-off-by: default avatarMariusz Tkaczyk <mariusz.tkaczyk@linux.intel.com>
Signed-off-by: default avatarSong Liu <song@kernel.org>
parent 5ea7c133
Loading
Loading
Loading
Loading
+15 −12
Original line number Diff line number Diff line
@@ -2984,10 +2984,11 @@ state_store(struct md_rdev *rdev, const char *buf, size_t len)

	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
		md_error(rdev->mddev, rdev);
		if (test_bit(Faulty, &rdev->flags))
			err = 0;
		else

		if (test_bit(MD_BROKEN, &rdev->mddev->flags))
			err = -EBUSY;
		else
			err = 0;
	} else if (cmd_match(buf, "remove")) {
		if (rdev->mddev->pers) {
			clear_bit(Blocked, &rdev->flags);
@@ -4353,10 +4354,9 @@ __ATTR_PREALLOC(resync_start, S_IRUGO|S_IWUSR,
 *     like active, but no writes have been seen for a while (100msec).
 *
 * broken
 *     RAID0/LINEAR-only: same as clean, but array is missing a member.
 *     It's useful because RAID0/LINEAR mounted-arrays aren't stopped
 *     when a member is gone, so this state will at least alert the
 *     user that something is wrong.
*     Array is failed. It's useful because mounted-arrays aren't stopped
*     when array is failed, so this state will at least alert the user that
*     something is wrong.
 */
enum array_state { clear, inactive, suspended, readonly, read_auto, clean, active,
		   write_pending, active_idle, broken, bad_word};
@@ -7443,7 +7443,7 @@ static int set_disk_faulty(struct mddev *mddev, dev_t dev)
		err =  -ENODEV;
	else {
		md_error(mddev, rdev);
		if (!test_bit(Faulty, &rdev->flags))
		if (test_bit(MD_BROKEN, &mddev->flags))
			err = -EBUSY;
	}
	rcu_read_unlock();
@@ -7985,12 +7985,15 @@ void md_error(struct mddev *mddev, struct md_rdev *rdev)
	if (!mddev->pers || !mddev->pers->error_handler)
		return;
	mddev->pers->error_handler(mddev, rdev);
	if (mddev->degraded)

	if (mddev->degraded && !test_bit(MD_BROKEN, &mddev->flags))
		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
	sysfs_notify_dirent_safe(rdev->sysfs_state);
	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	if (!test_bit(MD_BROKEN, &mddev->flags)) {
		set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
		md_wakeup_thread(mddev->thread);
	}
	if (mddev->event_work.func)
		queue_work(md_misc_wq, &mddev->event_work);
	md_new_event();
+35 −27
Original line number Diff line number Diff line
@@ -234,34 +234,42 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
				int is_new);
struct md_cluster_info;

/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */
enum mddev_flags {
	MD_ARRAY_FIRST_USE,	/* First use of array, needs initialization */
	MD_CLOSING,		/* If set, we are closing the array, do not open
				 * it then */
	MD_JOURNAL_CLEAN,	/* A raid with journal is already clean */
	MD_HAS_JOURNAL,		/* The raid array has journal feature set */
	MD_CLUSTER_RESYNC_LOCKED, /* cluster raid only, which means node
				   * already took resync lock, need to
				   * release the lock */
	MD_FAILFAST_SUPPORTED,	/* Using MD_FAILFAST on metadata writes is
				 * supported as calls to md_error() will
				 * never cause the array to become failed.
				 */
	MD_HAS_PPL,		/* The raid array has PPL feature set */
	MD_HAS_MULTIPLE_PPLS,	/* The raid array has multiple PPLs feature set */
	MD_ALLOW_SB_UPDATE,	/* md_check_recovery is allowed to update
				 * the metadata without taking reconfig_mutex.
				 */
	MD_UPDATING_SB,		/* md_check_recovery is updating the metadata
				 * without explicitly holding reconfig_mutex.
				 */
	MD_NOT_READY,		/* do_md_run() is active, so 'array_state'
				 * must not report that array is ready yet
				 */
	MD_BROKEN,              /* This is used in RAID-0/LINEAR only, to stop
				 * I/O in case an array member is gone/failed.
/**
 * enum mddev_flags - md device flags.
 * @MD_ARRAY_FIRST_USE: First use of array, needs initialization.
 * @MD_CLOSING: If set, we are closing the array, do not open it then.
 * @MD_JOURNAL_CLEAN: A raid with journal is already clean.
 * @MD_HAS_JOURNAL: The raid array has journal feature set.
 * @MD_CLUSTER_RESYNC_LOCKED: cluster raid only, which means node, already took
 *			       resync lock, need to release the lock.
 * @MD_FAILFAST_SUPPORTED: Using MD_FAILFAST on metadata writes is supported as
 *			    calls to md_error() will never cause the array to
 *			    become failed.
 * @MD_HAS_PPL:  The raid array has PPL feature set.
 * @MD_HAS_MULTIPLE_PPLS: The raid array has multiple PPLs feature set.
 * @MD_ALLOW_SB_UPDATE: md_check_recovery is allowed to update the metadata
 *			 without taking reconfig_mutex.
 * @MD_UPDATING_SB: md_check_recovery is updating the metadata without
 *		     explicitly holding reconfig_mutex.
 * @MD_NOT_READY: do_md_run() is active, so 'array_state', ust not report that
 *		   array is ready yet.
 * @MD_BROKEN: This is used to stop writes and mark array as failed.
 *
 * change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added
 */
enum mddev_flags {
	MD_ARRAY_FIRST_USE,
	MD_CLOSING,
	MD_JOURNAL_CLEAN,
	MD_HAS_JOURNAL,
	MD_CLUSTER_RESYNC_LOCKED,
	MD_FAILFAST_SUPPORTED,
	MD_HAS_PPL,
	MD_HAS_MULTIPLE_PPLS,
	MD_ALLOW_SB_UPDATE,
	MD_UPDATING_SB,
	MD_NOT_READY,
	MD_BROKEN,
};

enum mddev_sb_flags {
+26 −17
Original line number Diff line number Diff line
@@ -1641,31 +1641,40 @@ static void raid1_status(struct seq_file *seq, struct mddev *mddev)
	seq_printf(seq, "]");
}

/**
 * raid1_error() - RAID1 error handler.
 * @mddev: affected md device.
 * @rdev: member device to fail.
 *
 * The routine acknowledges &rdev failure and determines new @mddev state.
 * If it failed, then:
 *	- &MD_BROKEN flag is set in &mddev->flags.
 *	- recovery is disabled.
 * Otherwise, it must be degraded:
 *	- recovery is interrupted.
 *	- &mddev->degraded is bumped.
 *
 * @rdev is marked as &Faulty excluding case when array is failed and
 * &mddev->fail_last_dev is off.
 */
static void raid1_error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r1conf *conf = mddev->private;
	unsigned long flags;

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disks with "fail_last_dev == false",
	 * ignore the error, let the next level up know.
	 * else mark the drive as failed
	 */
	spin_lock_irqsave(&conf->device_lock, flags);
	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
	    && (conf->raid_disks - mddev->degraded) == 1) {
		/*
		 * Don't fail the drive, act as though we were just a
		 * normal single drive.
		 * However don't try a recovery from this drive as
		 * it is very likely to fail.
		 */

	if (test_bit(In_sync, &rdev->flags) &&
	    (conf->raid_disks - mddev->degraded) == 1) {
		set_bit(MD_BROKEN, &mddev->flags);

		if (!mddev->fail_last_dev) {
			conf->recovery_disabled = mddev->recovery_disabled;
			spin_unlock_irqrestore(&conf->device_lock, flags);
			return;
		}
	}
	set_bit(Blocked, &rdev->flags);
	if (test_and_clear_bit(In_sync, &rdev->flags))
		mddev->degraded++;
+24 −16
Original line number Diff line number Diff line
@@ -1970,32 +1970,40 @@ static int enough(struct r10conf *conf, int ignore)
		_enough(conf, 1, ignore);
}

/**
 * raid10_error() - RAID10 error handler.
 * @mddev: affected md device.
 * @rdev: member device to fail.
 *
 * The routine acknowledges &rdev failure and determines new @mddev state.
 * If it failed, then:
 *	- &MD_BROKEN flag is set in &mddev->flags.
 * Otherwise, it must be degraded:
 *	- recovery is interrupted.
 *	- &mddev->degraded is bumped.

 * @rdev is marked as &Faulty excluding case when array is failed and
 * &mddev->fail_last_dev is off.
 */
static void raid10_error(struct mddev *mddev, struct md_rdev *rdev)
{
	char b[BDEVNAME_SIZE];
	struct r10conf *conf = mddev->private;
	unsigned long flags;

	/*
	 * If it is not operational, then we have already marked it as dead
	 * else if it is the last working disks with "fail_last_dev == false",
	 * ignore the error, let the next level up know.
	 * else mark the drive as failed
	 */
	spin_lock_irqsave(&conf->device_lock, flags);
	if (test_bit(In_sync, &rdev->flags) && !mddev->fail_last_dev
	    && !enough(conf, rdev->raid_disk)) {
		/*
		 * Don't fail the drive, just return an IO error.
		 */

	if (test_bit(In_sync, &rdev->flags) && !enough(conf, rdev->raid_disk)) {
		set_bit(MD_BROKEN, &mddev->flags);

		if (!mddev->fail_last_dev) {
			spin_unlock_irqrestore(&conf->device_lock, flags);
			return;
		}
	}
	if (test_and_clear_bit(In_sync, &rdev->flags))
		mddev->degraded++;
	/*
	 * If recovery is running, make sure it aborts.
	 */

	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
	set_bit(Blocked, &rdev->flags);
	set_bit(Faulty, &rdev->flags);