Commit 6bfe0b49 authored by Dan Williams, committed by Linus Torvalds
Browse files

md: support blocking writes to an array on device failure



Allows a userspace metadata handler to take action upon detecting a device
failure.

Based on an original patch by Neil Brown.

Changes:
-added blocked_wait waitqueue to rdev
-don't qualify Blocked with Faulty; always let userspace block writes
-added md_wait_for_blocked_rdev to wait for the block device to be clear, if
 userspace misses the notification another one is sent every 5 seconds
-set MD_RECOVERY_NEEDED after clearing "blocked"
-kill DoBlock flag, just test mddev->external

Signed-off-by: Dan Williams <dan.j.williams@intel.com>
Signed-off-by: Neil Brown <neilb@suse.de>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 11e2ede0
Loading
Loading
Loading
Loading
+32 −1
Original line number Original line Diff line number Diff line
@@ -1828,6 +1828,10 @@ state_show(mdk_rdev_t *rdev, char *page)
		len += sprintf(page+len, "%swrite_mostly",sep);
		len += sprintf(page+len, "%swrite_mostly",sep);
		sep = ",";
		sep = ",";
	}
	}
	if (test_bit(Blocked, &rdev->flags)) {
		len += sprintf(page+len, "%sblocked", sep);
		sep = ",";
	}
	if (!test_bit(Faulty, &rdev->flags) &&
	if (!test_bit(Faulty, &rdev->flags) &&
	    !test_bit(In_sync, &rdev->flags)) {
	    !test_bit(In_sync, &rdev->flags)) {
		len += sprintf(page+len, "%sspare", sep);
		len += sprintf(page+len, "%sspare", sep);
@@ -1844,6 +1848,8 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
	 *  remove  - disconnects the device
	 *  remove  - disconnects the device
	 *  writemostly - sets write_mostly
	 *  writemostly - sets write_mostly
	 *  -writemostly - clears write_mostly
	 *  -writemostly - clears write_mostly
	 *  blocked - sets the Blocked flag
	 *  -blocked - clears the Blocked flag
	 */
	 */
	int err = -EINVAL;
	int err = -EINVAL;
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
	if (cmd_match(buf, "faulty") && rdev->mddev->pers) {
@@ -1865,6 +1871,16 @@ state_store(mdk_rdev_t *rdev, const char *buf, size_t len)
		err = 0;
		err = 0;
	} else if (cmd_match(buf, "-writemostly")) {
	} else if (cmd_match(buf, "-writemostly")) {
		clear_bit(WriteMostly, &rdev->flags);
		clear_bit(WriteMostly, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "blocked")) {
		set_bit(Blocked, &rdev->flags);
		err = 0;
	} else if (cmd_match(buf, "-blocked")) {
		clear_bit(Blocked, &rdev->flags);
		wake_up(&rdev->blocked_wait);
		set_bit(MD_RECOVERY_NEEDED, &rdev->mddev->recovery);
		md_wakeup_thread(rdev->mddev->thread);

		err = 0;
		err = 0;
	}
	}
	return err ? err : len;
	return err ? err : len;
@@ -2194,7 +2210,9 @@ static mdk_rdev_t *md_import_device(dev_t newdev, int super_format, int super_mi
			goto abort_free;
			goto abort_free;
		}
		}
	}
	}

	INIT_LIST_HEAD(&rdev->same_set);
	INIT_LIST_HEAD(&rdev->same_set);
	init_waitqueue_head(&rdev->blocked_wait);


	return rdev;
	return rdev;


@@ -4958,6 +4976,9 @@ void md_error(mddev_t *mddev, mdk_rdev_t *rdev)


	if (!rdev || test_bit(Faulty, &rdev->flags))
	if (!rdev || test_bit(Faulty, &rdev->flags))
		return;
		return;

	if (mddev->external)
		set_bit(Blocked, &rdev->flags);
/*
/*
	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
	dprintk("md_error dev:%s, rdev:(%d:%d), (caller: %p,%p,%p,%p).\n",
		mdname(mddev),
		mdname(mddev),
@@ -5760,7 +5781,7 @@ static int remove_and_add_spares(mddev_t *mddev)


	rdev_for_each(rdev, rtmp, mddev)
	rdev_for_each(rdev, rtmp, mddev)
		if (rdev->raid_disk >= 0 &&
		if (rdev->raid_disk >= 0 &&
		    !mddev->external &&
		    !test_bit(Blocked, &rdev->flags) &&
		    (test_bit(Faulty, &rdev->flags) ||
		    (test_bit(Faulty, &rdev->flags) ||
		     ! test_bit(In_sync, &rdev->flags)) &&
		     ! test_bit(In_sync, &rdev->flags)) &&
		    atomic_read(&rdev->nr_pending)==0) {
		    atomic_read(&rdev->nr_pending)==0) {
@@ -5959,6 +5980,16 @@ void md_check_recovery(mddev_t *mddev)
	}
	}
}
}


void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev)
{
	sysfs_notify(&rdev->kobj, NULL, "state");
	wait_event_timeout(rdev->blocked_wait,
			   !test_bit(Blocked, &rdev->flags),
			   msecs_to_jiffies(5000));
	rdev_dec_pending(rdev, mddev);
}
EXPORT_SYMBOL(md_wait_for_blocked_rdev);

static int md_notify_reboot(struct notifier_block *this,
static int md_notify_reboot(struct notifier_block *this,
			    unsigned long code, void *x)
			    unsigned long code, void *x)
{
{
+24 −3
Original line number Original line Diff line number Diff line
@@ -773,7 +773,6 @@ static int make_request(struct request_queue *q, struct bio * bio)
	r1bio_t *r1_bio;
	r1bio_t *r1_bio;
	struct bio *read_bio;
	struct bio *read_bio;
	int i, targets = 0, disks;
	int i, targets = 0, disks;
	mdk_rdev_t *rdev;
	struct bitmap *bitmap = mddev->bitmap;
	struct bitmap *bitmap = mddev->bitmap;
	unsigned long flags;
	unsigned long flags;
	struct bio_list bl;
	struct bio_list bl;
@@ -781,6 +780,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
	const int rw = bio_data_dir(bio);
	const int rw = bio_data_dir(bio);
	const int do_sync = bio_sync(bio);
	const int do_sync = bio_sync(bio);
	int do_barriers;
	int do_barriers;
	mdk_rdev_t *blocked_rdev;


	/*
	/*
	 * Register the new request and wait if the reconstruction
	 * Register the new request and wait if the reconstruction
@@ -862,10 +862,17 @@ static int make_request(struct request_queue *q, struct bio * bio)
	first = 0;
	first = 0;
	}
	}
#endif
#endif
 retry_write:
	blocked_rdev = NULL;
	rcu_read_lock();
	rcu_read_lock();
	for (i = 0;  i < disks; i++) {
	for (i = 0;  i < disks; i++) {
		if ((rdev=rcu_dereference(conf->mirrors[i].rdev)) != NULL &&
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[i].rdev);
		    !test_bit(Faulty, &rdev->flags)) {
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			if (test_bit(Faulty, &rdev->flags)) {
			if (test_bit(Faulty, &rdev->flags)) {
				rdev_dec_pending(rdev, mddev);
				rdev_dec_pending(rdev, mddev);
@@ -878,6 +885,20 @@ static int make_request(struct request_queue *q, struct bio * bio)
	}
	}
	rcu_read_unlock();
	rcu_read_unlock();


	if (unlikely(blocked_rdev)) {
		/* Wait for this device to become unblocked */
		int j;

		for (j = 0; j < i; j++)
			if (r1_bio->bios[j])
				rdev_dec_pending(conf->mirrors[j].rdev, mddev);

		allow_barrier(conf);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf);
		goto retry_write;
	}

	BUG_ON(targets == 0); /* we never fail the last device */
	BUG_ON(targets == 0); /* we never fail the last device */


	if (targets < conf->raid_disks) {
	if (targets < conf->raid_disks) {
+26 −3
Original line number Original line Diff line number Diff line
@@ -790,6 +790,7 @@ static int make_request(struct request_queue *q, struct bio * bio)
	const int do_sync = bio_sync(bio);
	const int do_sync = bio_sync(bio);
	struct bio_list bl;
	struct bio_list bl;
	unsigned long flags;
	unsigned long flags;
	mdk_rdev_t *blocked_rdev;


	if (unlikely(bio_barrier(bio))) {
	if (unlikely(bio_barrier(bio))) {
		bio_endio(bio, -EOPNOTSUPP);
		bio_endio(bio, -EOPNOTSUPP);
@@ -879,17 +880,23 @@ static int make_request(struct request_queue *q, struct bio * bio)
	/*
	/*
	 * WRITE:
	 * WRITE:
	 */
	 */
	/* first select target devices under spinlock and
	/* first select target devices under rcu_lock and
	 * inc refcount on their rdev.  Record them by setting
	 * inc refcount on their rdev.  Record them by setting
	 * bios[x] to bio
	 * bios[x] to bio
	 */
	 */
	raid10_find_phys(conf, r10_bio);
	raid10_find_phys(conf, r10_bio);
 retry_write:
	blocked_rdev = 0;
	rcu_read_lock();
	rcu_read_lock();
	for (i = 0;  i < conf->copies; i++) {
	for (i = 0;  i < conf->copies; i++) {
		int d = r10_bio->devs[i].devnum;
		int d = r10_bio->devs[i].devnum;
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
		mdk_rdev_t *rdev = rcu_dereference(conf->mirrors[d].rdev);
		if (rdev &&
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
		    !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			blocked_rdev = rdev;
			break;
		}
		if (rdev && !test_bit(Faulty, &rdev->flags)) {
			atomic_inc(&rdev->nr_pending);
			atomic_inc(&rdev->nr_pending);
			r10_bio->devs[i].bio = bio;
			r10_bio->devs[i].bio = bio;
		} else {
		} else {
@@ -899,6 +906,22 @@ static int make_request(struct request_queue *q, struct bio * bio)
	}
	}
	rcu_read_unlock();
	rcu_read_unlock();


	if (unlikely(blocked_rdev)) {
		/* Have to wait for this device to get unblocked, then retry */
		int j;
		int d;

		for (j = 0; j < i; j++)
			if (r10_bio->devs[j].bio) {
				d = r10_bio->devs[j].devnum;
				rdev_dec_pending(conf->mirrors[d].rdev, mddev);
			}
		allow_barrier(conf);
		md_wait_for_blocked_rdev(blocked_rdev, mddev);
		wait_barrier(conf);
		goto retry_write;
	}

	atomic_set(&r10_bio->remaining, 0);
	atomic_set(&r10_bio->remaining, 0);


	bio_list_init(&bl);
	bio_list_init(&bl);
+33 −0
Original line number Original line Diff line number Diff line
@@ -2607,6 +2607,7 @@ static void handle_stripe_expansion(raid5_conf_t *conf, struct stripe_head *sh,
	}
	}
}
}



/*
/*
 * handle_stripe - do things to a stripe.
 * handle_stripe - do things to a stripe.
 *
 *
@@ -2632,6 +2633,7 @@ static void handle_stripe5(struct stripe_head *sh)
	struct stripe_head_state s;
	struct stripe_head_state s;
	struct r5dev *dev;
	struct r5dev *dev;
	unsigned long pending = 0;
	unsigned long pending = 0;
	mdk_rdev_t *blocked_rdev = NULL;


	memset(&s, 0, sizeof(s));
	memset(&s, 0, sizeof(s));
	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
	pr_debug("handling stripe %llu, state=%#lx cnt=%d, pd_idx=%d "
@@ -2691,6 +2693,11 @@ static void handle_stripe5(struct stripe_head *sh)
		if (dev->written)
		if (dev->written)
			s.written++;
			s.written++;
		rdev = rcu_dereference(conf->disks[i].rdev);
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			blocked_rdev = rdev;
			atomic_inc(&rdev->nr_pending);
			break;
		}
		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
			/* The ReadError flag will just be confusing now */
			/* The ReadError flag will just be confusing now */
			clear_bit(R5_ReadError, &dev->flags);
			clear_bit(R5_ReadError, &dev->flags);
@@ -2705,6 +2712,11 @@ static void handle_stripe5(struct stripe_head *sh)
	}
	}
	rcu_read_unlock();
	rcu_read_unlock();


	if (unlikely(blocked_rdev)) {
		set_bit(STRIPE_HANDLE, &sh->state);
		goto unlock;
	}

	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
	if (s.to_fill && !test_and_set_bit(STRIPE_OP_BIOFILL, &sh->ops.pending))
		sh->ops.count++;
		sh->ops.count++;


@@ -2894,8 +2906,13 @@ static void handle_stripe5(struct stripe_head *sh)
	if (sh->ops.count)
	if (sh->ops.count)
		pending = get_stripe_work(sh);
		pending = get_stripe_work(sh);


 unlock:
	spin_unlock(&sh->lock);
	spin_unlock(&sh->lock);


	/* wait for this device to become unblocked */
	if (unlikely(blocked_rdev))
		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);

	if (pending)
	if (pending)
		raid5_run_ops(sh, pending);
		raid5_run_ops(sh, pending);


@@ -2912,6 +2929,7 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
	struct stripe_head_state s;
	struct stripe_head_state s;
	struct r6_state r6s;
	struct r6_state r6s;
	struct r5dev *dev, *pdev, *qdev;
	struct r5dev *dev, *pdev, *qdev;
	mdk_rdev_t *blocked_rdev = NULL;


	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
	r6s.qd_idx = raid6_next_disk(pd_idx, disks);
	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
	pr_debug("handling stripe %llu, state=%#lx cnt=%d, "
@@ -2975,6 +2993,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
		if (dev->written)
		if (dev->written)
			s.written++;
			s.written++;
		rdev = rcu_dereference(conf->disks[i].rdev);
		rdev = rcu_dereference(conf->disks[i].rdev);
		if (rdev && unlikely(test_bit(Blocked, &rdev->flags))) {
			blocked_rdev = rdev;
			atomic_inc(&rdev->nr_pending);
			break;
		}
		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
		if (!rdev || !test_bit(In_sync, &rdev->flags)) {
			/* The ReadError flag will just be confusing now */
			/* The ReadError flag will just be confusing now */
			clear_bit(R5_ReadError, &dev->flags);
			clear_bit(R5_ReadError, &dev->flags);
@@ -2989,6 +3012,11 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
			set_bit(R5_Insync, &dev->flags);
			set_bit(R5_Insync, &dev->flags);
	}
	}
	rcu_read_unlock();
	rcu_read_unlock();

	if (unlikely(blocked_rdev)) {
		set_bit(STRIPE_HANDLE, &sh->state);
		goto unlock;
	}
	pr_debug("locked=%d uptodate=%d to_read=%d"
	pr_debug("locked=%d uptodate=%d to_read=%d"
	       " to_write=%d failed=%d failed_num=%d,%d\n",
	       " to_write=%d failed=%d failed_num=%d,%d\n",
	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
	       s.locked, s.uptodate, s.to_read, s.to_write, s.failed,
@@ -3094,8 +3122,13 @@ static void handle_stripe6(struct stripe_head *sh, struct page *tmp_page)
	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
	    !test_bit(STRIPE_OP_COMPUTE_BLK, &sh->ops.pending))
		handle_stripe_expansion(conf, sh, &r6s);
		handle_stripe_expansion(conf, sh, &r6s);


 unlock:
	spin_unlock(&sh->lock);
	spin_unlock(&sh->lock);


	/* wait for this device to become unblocked */
	if (unlikely(blocked_rdev))
		md_wait_for_blocked_rdev(blocked_rdev, conf->mddev);

	return_io(return_bi);
	return_io(return_bi);


	for (i=disks; i-- ;) {
	for (i=disks; i-- ;) {
+1 −0
Original line number Original line Diff line number Diff line
@@ -94,6 +94,7 @@ extern int sync_page_io(struct block_device *bdev, sector_t sector, int size,
extern void md_do_sync(mddev_t *mddev);
extern void md_do_sync(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern void md_new_event(mddev_t *mddev);
extern void md_allow_write(mddev_t *mddev);
extern void md_allow_write(mddev_t *mddev);
extern void md_wait_for_blocked_rdev(mdk_rdev_t *rdev, mddev_t *mddev);


#endif /* CONFIG_MD */
#endif /* CONFIG_MD */
#endif 
#endif 
Loading