Commit 93723095 authored by Qu Wenruo, committed by David Sterba

btrfs: raid56: switch write path to rmw_rbio()



This includes the following changes:

- Implement a new raid_unplug() function
  Now we don't need a workqueue to run the plug, as all our
  work is just to queue the rmw_rbio_work() call, which can be
  executed without sleeping (see the call-flow sketch after this
  list).

- Implement a rmw_rbio_work_locked() helper
  This is for unlock_stripe(), which is already holding the full stripe
  lock.

- Remove all the old functions
  This should already show how complex the old functions are, as we
  ended up removing the following functions:

  * rmw_work()
  * validate_rbio_for_rmw()
  * raid56_rmw_end_io_work()
  * raid56_rmw_stripe()
  * full_stripe_write()
  * partial_stripe_write()
  * __raid56_parity_write()
  * run_plug()
  * unplug_work()
  * btrfs_raid_unplug()
  * __raid56_parity_recover()
  * raid_recover_end_io_work()

- Unexport rmw_rbio()

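A condensed sketch of the resulting write path (an illustrative summary
of the diff below, not literal code; error handling, tracing and the
scrub path are omitted):

    raid56_parity_write()
      -> full rbio, or no plug active:
             start_async_work(rbio, rmw_rbio_work)
      -> partial write while plugged:
             add rbio to plug->rbio_list, merged at unplug time

    raid_unplug()                    /* blk plug callback */
      -> sort and merge the plugged rbios
      -> start_async_work(rbio, rmw_rbio_work) on each remaining rbio

    rmw_rbio_work()                  /* must take the full stripe lock */
      -> if (lock_stripe_add(rbio) == 0)
             rmw_rbio(rbio)

    rmw_rbio_work_locked()           /* caller already holds the lock */
      -> rmw_rbio(rbio)

unlock_stripe() queues the next pending write rbio with
rmw_rbio_work_locked(), since the full stripe lock is still held at
that point.
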
Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
parent 5eb30ee2
fs/btrfs/raid56.c: +42 −308
@@ -64,9 +64,9 @@ struct sector_ptr {
	unsigned int uptodate:8;
};

static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
static void rmw_work(struct work_struct *work);
static void rmw_rbio_work(struct work_struct *work);
static void rmw_rbio_work_locked(struct work_struct *work);
static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
static void index_rbio_pages(struct btrfs_raid_bio *rbio);
@@ -816,7 +816,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
				start_async_work(next, recover_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_WRITE) {
				steal_rbio(rbio, next);
				start_async_work(next, rmw_work);
				start_async_work(next, rmw_rbio_work_locked);
			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
				steal_rbio(rbio, next);
				start_async_work(next, scrub_parity_work);
@@ -1108,23 +1108,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
	return 0;
}

/*
 * while we're doing the read/modify/write cycle, we could
 * have errors in reading pages off the disk.  This checks
 * for errors and if we're not able to read the page it'll
 * trigger parity reconstruction.  The rmw will be finished
 * after we've reconstructed the failed stripes
 */
static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
{
	if (rbio->faila >= 0 || rbio->failb >= 0) {
		BUG_ON(rbio->faila == rbio->real_stripes - 1);
		__raid56_parity_recover(rbio);
	} else {
		finish_rmw(rbio);
	}
}

static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
{
	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1601,31 +1584,6 @@ static void raid56_bio_end_io(struct bio *bio)
			   &rbio->end_io_work);
}

/*
 * End io handler for the read phase of the RMW cycle.  All the bios here are
 * physical stripe bios we've read from the disk so we can recalculate the
 * parity of the stripe.
 *
 * This will usually kick off finish_rmw once all the bios are read in, but it
 * may trigger parity reconstruction if we had any errors along the way
 */
static void raid56_rmw_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
		return;
	}

	/*
	 * This will normally call finish_rmw to start our write but if there
	 * are any failed stripes we'll reconstruct from parity first.
	 */
	validate_rbio_for_rmw(rbio);
}

static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
				  struct bio_list *bio_list)
{
@@ -1686,122 +1644,6 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
	return 0;
}

/*
 * the stripe must be locked by the caller.  It will
 * unlock after all the writes are done
 */
static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	index_rbio_pages(rbio);

	atomic_set(&rbio->error, 0);

	ret = rmw_assemble_read_bios(rbio, &bio_list);
	if (ret < 0)
		goto cleanup;

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * this can happen if others have merged with
		 * us, it means there is nothing left to read.
		 * But if there are missing devices it may not be
		 * safe to do the full stripe write yet.
		 */
		goto finish;
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_read_partial_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_read_partial(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}
	/* the actual write will happen once the reads are done */
	return 0;

cleanup:
	rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;

finish:
	validate_rbio_for_rmw(rbio);
	return 0;
}

/*
 * if the upper layers pass in a full stripe, we thank them by only allocating
 * enough pages to hold the parity, and sending it all down quickly.
 */
static int full_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = alloc_rbio_parity_pages(rbio);
	if (ret)
		return ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		finish_rmw(rbio);
	return 0;
}

/*
 * partial stripe writes get handed over to async helpers.
 * We're really hoping to merge a few more writes into this
 * rbio before calculating new parity
 */
static int partial_stripe_write(struct btrfs_raid_bio *rbio)
{
	int ret;

	ret = lock_stripe_add(rbio);
	if (ret == 0)
		start_async_work(rbio, rmw_work);
	return 0;
}

/*
 * sometimes while we were reading from the drive to
 * recalculate parity, enough new bios come into create
 * a full stripe.  So we do a check here to see if we can
 * go directly to finish_rmw
 */
static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
{
	/* head off into rmw land if we don't have a full stripe */
	if (!rbio_is_full(rbio))
		return partial_stripe_write(rbio);
	return full_stripe_write(rbio);
}

/*
 * We use plugging call backs to collect full stripes.
 * Any time we get a partial stripe write while plugged
@@ -1836,28 +1678,22 @@ static int plug_cmp(void *priv, const struct list_head *a,
	return 0;
}

static void run_plug(struct btrfs_plug_cb *plug)
static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
	struct btrfs_raid_bio *cur;
	struct btrfs_raid_bio *last = NULL;

	/*
	 * sort our plug list then try to merge
	 * everything we can in hopes of creating full
	 * stripes.
	 */
	list_sort(NULL, &plug->rbio_list, plug_cmp);

	while (!list_empty(&plug->rbio_list)) {
		cur = list_entry(plug->rbio_list.next,
				 struct btrfs_raid_bio, plug_list);
		list_del_init(&cur->plug_list);

		if (rbio_is_full(cur)) {
			int ret;

			/* we have a full stripe, send it down */
			ret = full_stripe_write(cur);
			BUG_ON(ret);
			/* We have a full stripe, queue it down. */
			start_async_work(cur, rmw_rbio_work);
			continue;
		}
		if (last) {
@@ -1865,42 +1701,16 @@ static void run_plug(struct btrfs_plug_cb *plug)
				merge_rbio(last, cur);
				free_raid_bio(cur);
				continue;

			}
			__raid56_parity_write(last);
			start_async_work(last, rmw_rbio_work);
		}
		last = cur;
	}
	if (last) {
		__raid56_parity_write(last);
	}
	if (last)
		start_async_work(last, rmw_rbio_work);
	kfree(plug);
}

/*
 * if the unplug comes from schedule, we have to push the
 * work off to a helper thread
 */
static void unplug_work(struct work_struct *work)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(work, struct btrfs_plug_cb, work);
	run_plug(plug);
}

static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
{
	struct btrfs_plug_cb *plug;
	plug = container_of(cb, struct btrfs_plug_cb, cb);

	if (from_schedule) {
		INIT_WORK(&plug->work, unplug_work);
		queue_work(plug->info->rmw_workers, &plug->work);
		return;
	}
	run_plug(plug);
}

/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
{
@@ -1948,19 +1758,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
	rbio_add_bio(rbio, bio);

	/*
	 * don't plug on full rbios, just get them out the door
	 * Don't plug on full rbios, just get them out the door
	 * as quickly as we can
	 */
	if (rbio_is_full(rbio)) {
		ret = full_stripe_write(rbio);
		if (ret) {
			free_raid_bio(rbio);
			goto fail;
		}
		return;
	}
	if (rbio_is_full(rbio))
		goto queue_rbio;

	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
	cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
	if (cb) {
		plug = container_of(cb, struct btrfs_plug_cb, cb);
		if (!plug->info) {
@@ -1968,13 +1772,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
			INIT_LIST_HEAD(&plug->rbio_list);
		}
		list_add_tail(&rbio->plug_list, &plug->rbio_list);
	} else {
		ret = __raid56_parity_write(rbio);
		if (ret) {
			free_raid_bio(rbio);
			goto fail;
		}
		return;
	}
queue_rbio:
	/*
	 * Either we don't have any existing plug, or we're doing a full stripe,
	 * can queue the rmw work now.
	 */
	start_async_work(rbio, rmw_rbio_work);

	return;

@@ -2217,21 +2022,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
	}
}

/*
 * This is called only for stripes we've read from disk to reconstruct the
 * parity.
 */
static void raid_recover_end_io_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio =
		container_of(work, struct btrfs_raid_bio, end_io_work);

	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);
	else
		__raid_recover_end_io(rbio);
}

static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
				      struct bio_list *bio_list)
{
@@ -2348,79 +2138,6 @@ static void recover_rbio_work_locked(struct work_struct *work)
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

/*
 * reads everything we need off the disk to reconstruct
 * the parity. endio handlers trigger final reconstruction
 * when the IO is done.
 *
 * This is used both for reads from the higher layers and for
 * parity construction required to finish a rmw cycle.
 */
static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
{
	int bios_to_read = 0;
	struct bio_list bio_list;
	int ret;
	struct bio *bio;

	bio_list_init(&bio_list);

	ret = alloc_rbio_pages(rbio);
	if (ret)
		goto cleanup;

	atomic_set(&rbio->error, 0);

	ret = recover_assemble_read_bios(rbio, &bio_list);
	if (ret < 0)
		goto cleanup;

	bios_to_read = bio_list_size(&bio_list);
	if (!bios_to_read) {
		/*
		 * we might have no bios to read just because the pages
		 * were up to date, or we might have no bios to read because
		 * the devices were gone.
		 */
		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
			__raid_recover_end_io(rbio);
			return 0;
		} else {
			goto cleanup;
		}
	}

	/*
	 * The bioc may be freed once we submit the last bio. Make sure not to
	 * touch it after that.
	 */
	atomic_set(&rbio->stripes_pending, bios_to_read);
	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
	while ((bio = bio_list_pop(&bio_list))) {
		bio->bi_end_io = raid56_bio_end_io;

		if (trace_raid56_scrub_read_recover_enabled()) {
			struct raid56_bio_trace_info trace_info = { 0 };

			bio_get_trace_info(rbio, bio, &trace_info);
			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
		}
		submit_bio(bio);
	}

	return 0;

cleanup:
	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
		rbio_orig_end_io(rbio, BLK_STS_IOERR);

	while ((bio = bio_list_pop(&bio_list)))
		bio_put(bio);

	return -EIO;
}

/*
 * the main entry point for reads from the higher layers.  This
 * is really only called when the normal read path had a failure,
@@ -2529,7 +2246,7 @@ static void submit_write_bios(struct btrfs_raid_bio *rbio,
	}
}

int rmw_rbio(struct btrfs_raid_bio *rbio)
static int rmw_rbio(struct btrfs_raid_bio *rbio)
{
	struct bio_list bio_list;
	int sectornr;
@@ -2615,12 +2332,29 @@ int rmw_rbio(struct btrfs_raid_bio *rbio)
	return ret;
}

static void rmw_work(struct work_struct *work)
static void rmw_rbio_work(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = container_of(work, struct btrfs_raid_bio, work);
	raid56_rmw_stripe(rbio);

	ret = lock_stripe_add(rbio);
	if (ret == 0) {
		ret = rmw_rbio(rbio);
		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
	}
}

static void rmw_rbio_work_locked(struct work_struct *work)
{
	struct btrfs_raid_bio *rbio;
	int ret;

	rbio = container_of(work, struct btrfs_raid_bio, work);

	ret = rmw_rbio(rbio);
	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
}

/*
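A note on start_async_work(), which the new write path leans on but
which is not part of this diff: assuming it matches the helper in the
surrounding raid56.c, it is nothing more than an INIT_WORK() plus
queue_work() pair. queue_work() does not sleep, which is what lets
raid_unplug() queue the rmw work directly even when called from
schedule, and why the old unplug_work()/btrfs_raid_unplug()
from_schedule detour could be removed. A sketch, reconstructed as an
assumption rather than quoted from this patch:

	/* Not part of this diff: reconstructed from the surrounding raid56.c. */
	static void start_async_work(struct btrfs_raid_bio *rbio,
				     work_func_t work_func)
	{
		INIT_WORK(&rbio->work, work_func);
		queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
	}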
fs/btrfs/raid56.h: +0 −5
@@ -185,9 +185,4 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);

/*
 * Placeholder definition to avoid warning, will be removed when
 * the full write path is migrated.
 */
int rmw_rbio(struct btrfs_raid_bio *rbio);
#endif