btrfs: raid56: migrate recovery and scrub recovery path to use error_bitmap (75b47033) · Commits · EulixOS / Software / Kernel

fs/btrfs/raid56.c

+193 −86

Original line number	Diff line number	Diff line
		@@ -1013,6 +1013,36 @@ static int alloc_rbio_parity_pages(struct btrfs_raid_bio *rbio)
		return 0;
		}

		/*
		 * Return the total number of errors found in the vertical stripe of
		 * @sector_nr.
		 *
		 * @faila and @failb are output parameters and must both be non-NULL.
		 * They are set to the stripe numbers of the first and second errors
		 * found, or left at -1 when there is no such error. Any further errors
		 * are still counted in the return value but not recorded.
		 */
		static int get_rbio_veritical_errors(struct btrfs_raid_bio *rbio, int sector_nr,
						     int *faila, int *failb)
		{
			int stripe_nr;
			int found_errors = 0;

			ASSERT(faila && failb);
			*faila = -1;
			*failb = -1;

			for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
				/* Bit index of this stripe's sector in the error bitmap. */
				int total_sector_nr = stripe_nr * rbio->stripe_nsectors + sector_nr;

				if (test_bit(total_sector_nr, rbio->error_bitmap)) {
					found_errors++;
					if (*faila < 0)
						*faila = stripe_nr;
					else if (*failb < 0)
						*failb = stripe_nr;
				}
			}
			return found_errors;
		}

		/*
		* Add a single sector @sector into our list of bios for IO.
		*
		@@ -1740,14 +1770,15 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
		* @*pointers are the pre-allocated pointers by the caller, so we don't
		* need to allocate/free the pointers again and again.
		*/
		static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
		static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
		void **pointers, void **unmap_array)
		{
		struct btrfs_fs_info *fs_info = rbio->bioc->fs_info;
		struct sector_ptr *sector;
		const u32 sectorsize = fs_info->sectorsize;
		const int faila = rbio->faila;
		const int failb = rbio->failb;
		int found_errors;
		int faila;
		int failb;
		int stripe_nr;

		/*
		@@ -1756,7 +1787,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
		*/
		if (rbio->operation == BTRFS_RBIO_PARITY_SCRUB &&
		!test_bit(sector_nr, &rbio->dbitmap))
		return;
		return 0;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr, &faila,
		&failb);
		/*
		* No errors in the vertical stripe, skip it. This can happen for a
		* recovery where only part of a stripe failed the csum check.
		*/
		if (!found_errors)
		return 0;

		if (found_errors > rbio->bioc->max_errors)
		return -EIO;

		/*
		* Setup our array of pointers with sectors from each stripe
		@@ -1766,12 +1809,11 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
		*/
		for (stripe_nr = 0; stripe_nr < rbio->real_stripes; stripe_nr++) {
		/*
		* If we're rebuilding a read, we have to use
		* pages from the bio list
		* If we're rebuilding a read, we have to use pages from the
		* bio list if possible.
		*/
		if ((rbio->operation == BTRFS_RBIO_READ_REBUILD ||
		rbio->operation == BTRFS_RBIO_REBUILD_MISSING) &&
		(stripe_nr == faila || stripe_nr == failb)) {
		rbio->operation == BTRFS_RBIO_REBUILD_MISSING)) {
		sector = sector_in_rbio(rbio, stripe_nr, sector_nr, 0);
		} else {
		sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr);
		@@ -1859,18 +1901,19 @@ static void recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr,
		* Especially if we determine to cache the rbio, we need to
		* have at least all data sectors uptodate.
		*/
		if (rbio->faila >= 0) {
		sector = rbio_stripe_sector(rbio, rbio->faila, sector_nr);
		if (faila >= 0) {
		sector = rbio_stripe_sector(rbio, faila, sector_nr);
		sector->uptodate = 1;
		}
		if (rbio->failb >= 0) {
		sector = rbio_stripe_sector(rbio, rbio->failb, sector_nr);
		if (failb >= 0) {
		sector = rbio_stripe_sector(rbio, failb, sector_nr);
		sector->uptodate = 1;
		}

		cleanup:
		for (stripe_nr = rbio->real_stripes - 1; stripe_nr >= 0; stripe_nr--)
		kunmap_local(unmap_array[stripe_nr]);
		return 0;
		}

		static int recover_sectors(struct btrfs_raid_bio *rbio)
		@@ -1893,10 +1936,6 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)
		goto out;
		}

		/* Make sure faila and fail b are in order. */
		if (rbio->faila >= 0 && rbio->failb >= 0 && rbio->faila > rbio->failb)
		swap(rbio->faila, rbio->failb);

		if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
		rbio->operation == BTRFS_RBIO_REBUILD_MISSING) {
		spin_lock_irq(&rbio->bio_list_lock);
		@@ -1906,8 +1945,11 @@ static int recover_sectors(struct btrfs_raid_bio *rbio)

		index_rbio_pages(rbio);

		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++)
		recover_vertical(rbio, sectornr, pointers, unmap_array);
		for (sectornr = 0; sectornr < rbio->stripe_nsectors; sectornr++) {
		ret = recover_vertical(rbio, sectornr, pointers, unmap_array);
		if (ret < 0)
		break;
		}

		out:
		kfree(pointers);
		@@ -1937,13 +1979,21 @@ static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
		int sectornr = total_sector_nr % rbio->stripe_nsectors;
		struct sector_ptr *sector;

		if (rbio->faila == stripe || rbio->failb == stripe) {
		/* Skip the current stripe. */
		ASSERT(sectornr == 0);
		total_sector_nr += rbio->stripe_nsectors - 1;
		atomic_inc(&rbio->error);
		/*
		* Skip the range which has error. It can be a range which is
		* marked error (for csum mismatch), or it can be a missing
		* device.
		*/
		if (!rbio->bioc->stripes[stripe].dev->bdev ||
		test_bit(total_sector_nr, rbio->error_bitmap)) {
		/*
		* Also set the error bit for missing device, which
		* may not yet have its error bit set.
		*/
		set_bit(total_sector_nr, rbio->error_bitmap);
		continue;
		}

		sector = rbio_stripe_sector(rbio, stripe, sectornr);
		ret = rbio_add_io_sector(rbio, bio_list, sector, stripe,
		sectornr, REQ_OP_READ);
		@@ -1966,9 +2016,8 @@ static int recover_rbio(struct btrfs_raid_bio *rbio)

		/*
		* Either we're doing recover for a read failure or degraded write,
		* caller should have set faila/b and error bitmap correctly.
		* caller should have set error bitmap correctly.
		*/
		ASSERT(rbio->faila >= 0 || rbio->failb >= 0);
		ASSERT(bitmap_weight(rbio->error_bitmap, rbio->nr_sectors));
		bio_list_init(&bio_list);

		@@ -1992,12 +2041,6 @@ static int recover_rbio(struct btrfs_raid_bio *rbio)
		submit_read_bios(rbio, &bio_list);
		wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

		/* We have more errors than our tolerance during the read. */
		if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
		ret = -EIO;
		goto out;
		}

		ret = recover_sectors(rbio);

		out:
		@@ -2032,6 +2075,51 @@ static void recover_rbio_work_locked(struct work_struct *work)
		rbio_orig_end_io(rbio, errno_to_blk_status(ret));
		}

		/*
		 * Mark one additional stripe as failed for a RAID6 extra recovery try.
		 *
		 * For every vertical stripe that already has an error recorded, select a
		 * second stripe based on @mirror_num and set its bit in the error bitmap.
		 */
		static void set_rbio_raid6_extra_error(struct btrfs_raid_bio *rbio, int mirror_num)
		{
			bool has_error = false;
			int nr;

			/*
			 * Only the extra RAID6 retries come here, so the mirror number must
			 * be larger than 2: mirror 1 means reading from the data stripes and
			 * mirror 2 means rebuilding using RAID5 methods.
			 */
			ASSERT(mirror_num > 2);
			for (nr = 0; nr < rbio->stripe_nsectors; nr++) {
				int first_fail;
				int second_fail;
				int extra_fail;
				int nr_errors;

				nr_errors = get_rbio_veritical_errors(rbio, nr, &first_fail,
								      &second_fail);
				/* Nothing is marked in this vertical stripe, move on. */
				if (nr_errors == 0)
					continue;

				/*
				 * A vertical stripe with errors is expected to carry exactly
				 * one, marked by a previous set_rbio_range_error().
				 */
				ASSERT(nr_errors == 1);
				has_error = true;

				/* Pick another stripe to fail, stepping past the existing one. */
				extra_fail = rbio->real_stripes - (mirror_num - 1);
				if (extra_fail <= first_fail)
					extra_fail--;

				/* No valid stripe left to mark for this vertical stripe. */
				if (extra_fail < 0)
					continue;

				/* Record the extra failure in the error bitmap. */
				set_bit(extra_fail * rbio->stripe_nsectors + nr,
					rbio->error_bitmap);
			}

			/* At least one vertical stripe must have had an error. */
			ASSERT(has_error);
		}

		/*
		* the main entry point for reads from the higher layers. This
		* is really only called when the normal read path had a failure,
		@@ -2074,11 +2162,7 @@ void raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
		* for 'mirror_num > 2', select a stripe to fail on every retry.
		*/
		if (mirror_num > 2) {
		/*
		* 'mirror == 3' is to fail the p stripe and
		* reconstruct from the q stripe. 'mirror > 3' is to
		* fail a data stripe and reconstruct from p+q stripe.
		*/
		set_rbio_raid6_extra_error(rbio, mirror_num);
		rbio->failb = rbio->real_stripes - (mirror_num - 1);
		ASSERT(rbio->failb > 0);
		if (rbio->failb <= rbio->faila)
		@@ -2507,48 +2591,85 @@ static inline int is_data_stripe(struct btrfs_raid_bio *rbio, int stripe)

		static int recover_scrub_rbio(struct btrfs_raid_bio *rbio)
		{
		int dfail = 0, failp = -1;
		void **pointers = NULL;
		void **unmap_array = NULL;
		int sector_nr;
		int ret;

		/* No error case should be already handled by the caller. */
		ASSERT(rbio->faila >= 0 || rbio->failb >= 0);
		/*
		* @pointers array stores the pointer for each sector.
		*
		* @unmap_array stores copy of pointers that does not get reordered
		* during reconstruction so that kunmap_local works.
		*/
		pointers = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
		unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS);
		if (!pointers || !unmap_array) {
		ret = -ENOMEM;
		goto out;
		}

		if (is_data_stripe(rbio, rbio->faila))
		dfail++;
		else if (is_parity_stripe(rbio->faila))
		failp = rbio->faila;
		for (sector_nr = 0; sector_nr < rbio->stripe_nsectors; sector_nr++) {
		int dfail = 0, failp = -1;
		int faila;
		int failb;
		int found_errors;

		found_errors = get_rbio_veritical_errors(rbio, sector_nr,
		&faila, &failb);
		if (found_errors > rbio->bioc->max_errors) {
		ret = -EIO;
		goto out;
		}
		if (found_errors == 0)
		continue;

		/* We should have at least one error here. */
		ASSERT(faila >= 0 || failb >= 0);

		if (is_data_stripe(rbio, rbio->failb))
		if (is_data_stripe(rbio, faila))
		dfail++;
		else if (is_parity_stripe(rbio->failb))
		failp = rbio->failb;
		else if (is_parity_stripe(faila))
		failp = faila;

		if (is_data_stripe(rbio, failb))
		dfail++;
		else if (is_parity_stripe(failb))
		failp = failb;
		/*
		* Because we can not use a scrubbing parity to repair
		* the data, so the capability of the repair is declined.
		* (In the case of RAID5, we can not repair anything)
		* Because we can not use a scrubbing parity to repair the
		* data, so the capability of the repair is declined. (In the
		* case of RAID5, we can not repair anything.)
		*/
		if (dfail > rbio->bioc->max_errors - 1)
		return -EIO;

		if (dfail > rbio->bioc->max_errors - 1) {
		ret = -EIO;
		goto out;
		}
		/*
		* If all data is good, only parity is correctly, just
		* repair the parity.
		* If all data is good, only parity is correctly, just repair
		* the parity, no need to recover data stripes.
		*/
		if (dfail == 0)
		return 0;
		continue;

		/*
		* Here means we got one corrupted data stripe and one
		* corrupted parity on RAID6, if the corrupted parity
		* is scrubbing parity, luckily, use the other one to repair
		* the data, or we can not repair the data stripe.
		* corrupted parity on RAID6, if the corrupted parity is
		* scrubbing parity, luckily, use the other one to repair the
		* data, or we can not repair the data stripe.
		*/
		if (failp != rbio->scrubp)
		return -EIO;
		if (failp != rbio->scrubp) {
		ret = -EIO;
		goto out;
		}

		/* We have some corrupted sectors, need to repair them. */
		ret = recover_sectors(rbio);
		ret = recover_vertical(rbio, sector_nr, pointers, unmap_array);
		if (ret < 0)
		goto out;
		}
		out:
		kfree(pointers);
		kfree(unmap_array);
		return ret;
		}

		@@ -2624,25 +2745,11 @@ static int scrub_rbio(struct btrfs_raid_bio *rbio)
		submit_read_bios(rbio, &bio_list);
		wait_event(rbio->io_wait, atomic_read(&rbio->stripes_pending) == 0);

		if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
		ret = -EIO;
		goto cleanup;
		}
		/*
		* No error during read, can finish the scrub and need to verify the
		* P/Q sectors;
		*/
		if (atomic_read(&rbio->error) == 0) {
		need_check = true;
		goto finish;
		}

		/* We have some failures, need to recover the failed sectors first. */
		/* We may have some failures, recover the failed sectors first. */
		ret = recover_scrub_rbio(rbio);
		if (ret < 0)
		goto cleanup;

		finish:
		/*
		* We have every sector properly prepared. Can finish the scrub
		* and writeback the good content.