
Commit b42d33b

Merge pull request openzfs#18 from bgly/newfixes

Newfixes

Bryant G. Ly authored Dec 11, 2019
2 parents bb451b1 + 69c252d commit b42d33b
Showing 8 changed files with 215 additions and 217 deletions.
19 changes: 13 additions & 6 deletions contrib/initramfs/scripts/zfs.in
@@ -884,20 +884,27 @@ mountroot()
 		ZFS_RPOOL="${pool}"
 	fi
 
-	# Set elevator=noop on the root pool's vdevs' disks. ZFS already
-	# does this for wholedisk vdevs (for all pools), so this is only
-	# important for partitions.
+	# Set the no-op scheduler on the disks containing the vdevs of
+	# the root pool. For single-queue devices, this scheduler is
+	# "noop", for multi-queue devices, it is "none".
+	# ZFS already does this for wholedisk vdevs (for all pools), so this
+	# is only important for partitions.
 	"${ZPOOL}" status -L "${ZFS_RPOOL}" 2> /dev/null |
 	    awk '/^\t / && !/(mirror|raidz)/ {
 		dev=$1;
 		sub(/[0-9]+$/, "", dev);
 		print dev
 	    }' |
-	while read i
+	while read -r i
 	do
-		if grep -sq noop /sys/block/$i/queue/scheduler
+		SCHEDULER=/sys/block/$i/queue/scheduler
+		if [ -e "${SCHEDULER}" ]
 		then
-			echo noop > "/sys/block/$i/queue/scheduler"
+			# Query to see what schedulers are available
+			case "$(cat "${SCHEDULER}")" in
+				*noop*) echo noop > "${SCHEDULER}" ;;
+				*none*) echo none > "${SCHEDULER}" ;;
+			esac
 		fi
 	done

13 changes: 11 additions & 2 deletions include/sys/abd.h
@@ -20,7 +20,7 @@
  */
 /*
  * Copyright (c) 2014 by Chunwei Chen. All rights reserved.
- * Copyright (c) 2016 by Delphix. All rights reserved.
+ * Copyright (c) 2016, 2019 by Delphix. All rights reserved.
  */
 
 #ifndef	_ABD_H
@@ -44,7 +44,8 @@ typedef enum abd_flags {
 	ABD_FLAG_OWNER		= 1 << 1,	/* does it own its data buffers? */
 	ABD_FLAG_META		= 1 << 2,	/* does this represent FS metadata? */
 	ABD_FLAG_MULTI_ZONE	= 1 << 3,	/* pages split over memory zones */
-	ABD_FLAG_MULTI_CHUNK	= 1 << 4	/* pages split over multiple chunks */
+	ABD_FLAG_MULTI_CHUNK	= 1 << 4,	/* pages split over multiple chunks */
+	ABD_FLAG_LINEAR_PAGE	= 1 << 5,	/* linear but allocd from page */
 } abd_flags_t;
 
 typedef struct abd {
@@ -60,6 +61,7 @@ typedef struct abd {
 		} abd_scatter;
 		struct abd_linear {
 			void		*abd_buf;
+			struct scatterlist *abd_sgl; /* for LINEAR_PAGE */
 		} abd_linear;
 	} abd_u;
 } abd_t;
@@ -75,6 +77,13 @@
 	return ((abd->abd_flags & ABD_FLAG_LINEAR) != 0 ? B_TRUE : B_FALSE);
 }
 
+static inline boolean_t
+abd_is_linear_page(abd_t *abd)
+{
+	return ((abd->abd_flags & ABD_FLAG_LINEAR_PAGE) != 0 ?
+	    B_TRUE : B_FALSE);
+}
+
 /*
  * Allocations and deallocations
  */
120 changes: 79 additions & 41 deletions module/zfs/abd.c
@@ -72,17 +72,19 @@
  * (2) Fragmentation is less of an issue since when we are at the limit of
  *     allocatable space, we won't have to search around for a long free
  *     hole in the VA space for large ARC allocations. Each chunk is mapped in
- *     individually, so even if we weren't using segkpm (see next point) we
+ *     individually, so even if we are using HIGHMEM (see next point) we
  *     wouldn't need to worry about finding a contiguous address range.
  *
- * (3) Use of segkpm will avoid the need for map / unmap / TLB shootdown costs
- *     on each ABD access. (If segkpm isn't available then we use all linear
- *     ABDs to avoid this penalty.) See seg_kpm.c for more details.
+ * (3) If we are not using HIGHMEM, then all physical memory is always
+ *     mapped into the kernel's address space, so we also avoid the map /
+ *     unmap costs on each ABD access.
 *
+ * If we are not using HIGHMEM, scattered buffers which have only one chunk
+ * can be treated as linear buffers, because they are contiguous in the
+ * kernel's virtual address space. See abd_alloc_pages() for details.
+ *
 * It is possible to make all ABDs linear by setting zfs_abd_scatter_enabled to
- * B_FALSE. However, it is not possible to use scattered ABDs if segkpm is not
- * available, which is the case on all 32-bit systems and any 64-bit systems
- * where kpm_enable is turned off.
+ * B_FALSE.
 *
 * In addition to directly allocating a linear or scattered ABD, it is also
 * possible to create an ABD by requesting the "sub-ABD" starting at an offset
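The HIGHMEM distinction above can be made concrete: on Linux, lowmem pages have a permanent kernel mapping whose address page_address() returns directly, while highmem pages must be mapped on demand with kmap()/kunmap(). A minimal kernel-side sketch of that difference (illustrative only, not code from this commit):

    #include <linux/highmem.h>
    #include <linux/mm.h>

    static void *
    map_for_cpu_access(struct page *page)
    {
    	/*
    	 * Lowmem pages are always mapped into the kernel's address
    	 * space, so their virtual address is free to compute -- this
    	 * is what lets a one-entry scatter ABD masquerade as linear.
    	 */
    	if (!PageHighMem(page))
    		return (page_address(page));

    	/*
    	 * Highmem pages need a temporary mapping; this per-access
    	 * map/unmap cost is what the comment above is avoiding.
    	 */
    	return (kmap(page));	/* caller must kunmap(page) later */
    }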
@@ -249,18 +251,6 @@ abd_chunkcnt_for_bytes(size_t size)
 #define	__GFP_RECLAIM		__GFP_WAIT
 #endif
 
-static unsigned long
-abd_alloc_chunk(int nid, gfp_t gfp, unsigned int order)
-{
-	struct page *page;
-
-	page = alloc_pages_node(nid, gfp, order);
-	if (!page)
-		return (0);
-
-	return ((unsigned long) page_address(page));
-}
-
 /*
  * The goal is to minimize fragmentation by preferentially populating ABDs
  * with higher order compound pages from a single zone. Allocation size is
@@ -283,19 +273,18 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	size_t remaining_size;
 	int nid = NUMA_NO_NODE;
 	int alloc_pages = 0;
+	int order;
 
 	INIT_LIST_HEAD(&pages);
 
 	while (alloc_pages < nr_pages) {
-		unsigned long paddr;
 		unsigned chunk_pages;
-		int order;
 
 		order = MIN(highbit64(nr_pages - alloc_pages) - 1, max_order);
 		chunk_pages = (1U << order);
 
-		paddr = abd_alloc_chunk(nid, order ? gfp_comp : gfp, order);
-		if (paddr == 0) {
+		page = alloc_pages_node(nid, order ? gfp_comp : gfp, order);
+		if (page == NULL) {
 			if (order == 0) {
 				ABDSTAT_BUMP(abdstat_scatter_page_alloc_retry);
 				schedule_timeout_interruptible(1);
@@ -305,7 +294,6 @@ abd_alloc_pages(abd_t *abd, size_t size)
 			continue;
 		}
 
-		page = virt_to_page(paddr);
 		list_add_tail(&page->lru, &pages);
 
 		if ((nid != NUMA_NO_NODE) && (page_to_nid(page) != nid))
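A worked example of the sizing logic in this loop: for a 192K ABD with 4K pages, nr_pages is 48, so the first pass computes order = MIN(highbit64(48) - 1, max_order) = 5 and requests one 32-page compound allocation; the second pass computes order 4 and requests the remaining 16 pages. A standalone sketch reproducing just the arithmetic (highbit64 and MIN re-implemented here to match their ZFS semantics; assumes every allocation succeeds):

    #include <stdio.h>

    /* Position of the highest set bit, 1-indexed; mirrors ZFS's highbit64(). */
    static int
    highbit64(unsigned long long v)
    {
    	int bit = 0;
    	while (v != 0) {
    		bit++;
    		v >>= 1;
    	}
    	return (bit);
    }

    #define	MIN(a, b)	((a) < (b) ? (a) : (b))

    int
    main(void)
    {
    	int nr_pages = 48, alloc_pages = 0, max_order = 10;

    	while (alloc_pages < nr_pages) {
    		int order = MIN(highbit64(nr_pages - alloc_pages) - 1,
    		    max_order);
    		int chunk_pages = 1 << order;

    		/* Prints: order 5 -> 32 pages, then order 4 -> 16 pages */
    		printf("order %d -> %d pages\n", order, chunk_pages);
    		alloc_pages += chunk_pages;
    	}
    	return (0);
    }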
@@ -336,18 +324,52 @@ abd_alloc_pages(abd_t *abd, size_t size)
 		list_del(&page->lru);
 	}
 
-	if (chunks > 1) {
+	/*
+	 * These conditions ensure that a possible transformation to a linear
+	 * ABD would be valid.
+	 */
+	ASSERT(!PageHighMem(sg_page(table.sgl)));
+	ASSERT0(ABD_SCATTER(abd).abd_offset);
+
+	if (table.nents == 1) {
+		/*
+		 * Since there is only one entry, this ABD can be represented
+		 * as a linear buffer. All single-page (4K) ABD's can be
+		 * represented this way. Some multi-page ABD's can also be
+		 * represented this way, if we were able to allocate a single
+		 * "chunk" (higher-order "page" which represents a power-of-2
+		 * series of physically-contiguous pages). This is often the
+		 * case for 2-page (8K) ABD's.
+		 *
+		 * Representing a single-entry scatter ABD as a linear ABD
+		 * has the performance advantage of avoiding the copy (and
+		 * allocation) in abd_borrow_buf_copy / abd_return_buf_copy.
+		 * A performance increase of around 5% has been observed for
+		 * ARC-cached reads (of small blocks which can take advantage
+		 * of this).
+		 *
+		 * Note that this optimization is only possible because the
+		 * pages are always mapped into the kernel's address space.
+		 * This is not the case for highmem pages, so the
+		 * optimization can not be made there.
+		 */
+		abd->abd_flags |= ABD_FLAG_LINEAR;
+		abd->abd_flags |= ABD_FLAG_LINEAR_PAGE;
+		abd->abd_u.abd_linear.abd_sgl = table.sgl;
+		abd->abd_u.abd_linear.abd_buf =
+		    page_address(sg_page(table.sgl));
+	} else if (table.nents > 1) {
 		ABDSTAT_BUMP(abdstat_scatter_page_multi_chunk);
 		abd->abd_flags |= ABD_FLAG_MULTI_CHUNK;
 
 		if (zones) {
 			ABDSTAT_BUMP(abdstat_scatter_page_multi_zone);
 			abd->abd_flags |= ABD_FLAG_MULTI_ZONE;
 		}
-	}
 
-	ABD_SCATTER(abd).abd_sgl = table.sgl;
-	ABD_SCATTER(abd).abd_nents = table.nents;
+		ABD_SCATTER(abd).abd_sgl = table.sgl;
+		ABD_SCATTER(abd).abd_nents = table.nents;
+	}
 }
 #else
 /*
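The copy that the single-entry optimization removes is the one abd_borrow_buf_copy() performs for scatter ABDs. A hedged sketch of the caller-visible difference (these are real ABD API names, but the flow shown is illustrative and not part of this commit):

    /* A consumer that needs a flat, virtually contiguous view: */
    void *buf = abd_borrow_buf_copy(abd, size);
    /* ... read the data through buf ... */
    abd_return_buf_copy(abd, buf, size);

    /*
     * For a multi-entry scatter ABD, abd_borrow_buf_copy() must
     * allocate a temporary buffer and copy the data into it, and
     * abd_return_buf_copy() copies it back and frees it.  For any
     * ABD flagged ABD_FLAG_LINEAR -- now including the one-entry
     * LINEAR_PAGE case -- the borrow simply returns abd_to_buf(abd),
     * so both the allocation and the copies disappear.
     */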
@@ -427,10 +449,6 @@ abd_free_pages(abd_t *abd)

 struct page;
 
-#define	kpm_enable			1
-#define	abd_alloc_chunk(o) \
-	((struct page *)umem_alloc_aligned(PAGESIZE << (o), 64, KM_SLEEP))
-#define	abd_free_chunk(chunk, o)	umem_free(chunk, PAGESIZE << (o))
 #define	zfs_kmap_atomic(chunk, km)	((void *)chunk)
 #define	zfs_kunmap_atomic(addr, km)	do { (void)(addr); } while (0)
 #define	local_irq_save(flags)		do { (void)(flags); } while (0)
@@ -491,7 +509,7 @@ abd_alloc_pages(abd_t *abd, size_t size)
 	sg_init_table(ABD_SCATTER(abd).abd_sgl, nr_pages);
 
 	abd_for_each_sg(abd, sg, nr_pages, i) {
-		struct page *p = abd_alloc_chunk(0);
+		struct page *p = umem_alloc_aligned(PAGESIZE, 64, KM_SLEEP);
 		sg_set_page(sg, p, PAGESIZE, 0);
 	}
 	ABD_SCATTER(abd).abd_nents = nr_pages;
@@ -502,12 +520,11 @@ abd_free_pages(abd_t *abd)
 {
 	int i, n = ABD_SCATTER(abd).abd_nents;
 	struct scatterlist *sg;
-	int j;
 
 	abd_for_each_sg(abd, sg, n, i) {
-		for (j = 0; j < sg->length; j += PAGESIZE) {
-			struct page *p = nth_page(sg_page(sg), j>>PAGE_SHIFT);
-			abd_free_chunk(p, 0);
+		for (int j = 0; j < sg->length; j += PAGESIZE) {
+			struct page *p = nth_page(sg_page(sg), j >> PAGE_SHIFT);
+			umem_free(p, PAGESIZE);
 		}
 	}
 
@@ -560,7 +577,7 @@ abd_verify(abd_t *abd)
 	ASSERT3U(abd->abd_size, <=, SPA_MAXBLOCKSIZE);
 	ASSERT3U(abd->abd_flags, ==, abd->abd_flags & (ABD_FLAG_LINEAR |
 	    ABD_FLAG_OWNER | ABD_FLAG_META | ABD_FLAG_MULTI_ZONE |
-	    ABD_FLAG_MULTI_CHUNK));
+	    ABD_FLAG_MULTI_CHUNK | ABD_FLAG_LINEAR_PAGE));
 	IMPLY(abd->abd_parent != NULL, !(abd->abd_flags & ABD_FLAG_OWNER));
 	IMPLY(abd->abd_flags & ABD_FLAG_META, abd->abd_flags & ABD_FLAG_OWNER);
 	if (abd_is_linear(abd)) {
@@ -613,6 +630,7 @@ abd_alloc(size_t size, boolean_t is_metadata)

 	abd_t *abd = abd_alloc_struct();
 	abd->abd_flags = ABD_FLAG_OWNER;
+	abd->abd_u.abd_scatter.abd_offset = 0;
 	abd_alloc_pages(abd, size);
 
 	if (is_metadata) {
@@ -622,8 +640,6 @@ abd_alloc(size_t size, boolean_t is_metadata)
 	abd->abd_parent = NULL;
 	zfs_refcount_create(&abd->abd_children);
 
-	abd->abd_u.abd_scatter.abd_offset = 0;
-
 	ABDSTAT_BUMP(abdstat_scatter_cnt);
 	ABDSTAT_INCR(abdstat_scatter_data_size, size);
 	ABDSTAT_INCR(abdstat_scatter_chunk_waste,
@@ -681,6 +697,17 @@ abd_alloc_linear(size_t size, boolean_t is_metadata)
 static void
 abd_free_linear(abd_t *abd)
 {
+	if (abd_is_linear_page(abd)) {
+		/* Transform it back into a scatter ABD for freeing */
+		struct scatterlist *sg = abd->abd_u.abd_linear.abd_sgl;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR;
+		abd->abd_flags &= ~ABD_FLAG_LINEAR_PAGE;
+		ABD_SCATTER(abd).abd_nents = 1;
+		ABD_SCATTER(abd).abd_offset = 0;
+		ABD_SCATTER(abd).abd_sgl = sg;
+		abd_free_scatter(abd);
+		return;
+	}
 	if (abd->abd_flags & ABD_FLAG_META) {
 		zio_buf_free(abd->abd_u.abd_linear.abd_buf, abd->abd_size);
 	} else {
@@ -718,7 +745,8 @@ abd_t *
 abd_alloc_sametype(abd_t *sabd, size_t size)
 {
 	boolean_t is_metadata = (sabd->abd_flags & ABD_FLAG_META) != 0;
-	if (abd_is_linear(sabd)) {
+	if (abd_is_linear(sabd) &&
+	    !abd_is_linear_page(sabd)) {
 		return (abd_alloc_linear(size, is_metadata));
 	} else {
 		return (abd_alloc(size, is_metadata));
@@ -966,6 +994,16 @@ abd_release_ownership_of_buf(abd_t *abd)
 {
 	ASSERT(abd_is_linear(abd));
 	ASSERT(abd->abd_flags & ABD_FLAG_OWNER);
+
+	/*
+	 * abd_free() needs to handle LINEAR_PAGE ABD's specially.
+	 * Since that flag does not survive the
+	 * abd_release_ownership_of_buf() -> abd_get_from_buf() ->
+	 * abd_take_ownership_of_buf() sequence, we don't allow releasing
+	 * these "linear but not zio_[data_]buf_alloc()'ed" ABD's.
+	 */
+	ASSERT(!abd_is_linear_page(abd));
+
 	abd_verify(abd);
 
 	abd->abd_flags &= ~ABD_FLAG_OWNER;
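The sequence that the new comment describes, sketched end to end (real ABD API names; the flow is illustrative, not code from this commit):

    void *buf = abd_to_buf(abd);
    abd_release_ownership_of_buf(abd);	/* abd no longer frees buf */
    /* ... the original abd is torn down elsewhere ... */

    abd_t *nabd = abd_get_from_buf(buf, size);	/* fresh linear ABD */
    abd_take_ownership_of_buf(nabd, B_TRUE);	/* nabd now frees buf */

    /*
     * nabd carries ABD_FLAG_LINEAR but never ABD_FLAG_LINEAR_PAGE, so
     * its buffer will eventually be freed via zio_[data_]buf_free().
     * If buf had really come from the page allocator (a LINEAR_PAGE
     * ABD), that would hand it to the wrong allocator -- hence the
     * new ASSERT forbids releasing ownership of such ABD's entirely.
     */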
19 changes: 13 additions & 6 deletions module/zfs/arc.c
@@ -2917,18 +2917,26 @@ arc_buf_alloc_impl(arc_buf_hdr_t *hdr, spa_t *spa, const zbookmark_phys_t *zb,
 	/*
 	 * If the hdr's data can be shared then we share the data buffer and
 	 * set the appropriate bit in the hdr's b_flags to indicate the hdr is
-	 * allocate a new buffer to store the buf's data.
+	 * sharing it's b_pabd with the arc_buf_t. Otherwise, we allocate a new
+	 * buffer to store the buf's data.
 	 *
 	 * There are two additional restrictions here because we're sharing
 	 * hdr -> buf instead of the usual buf -> hdr. First, the hdr can't be
 	 * actively involved in an L2ARC write, because if this buf is used by
 	 * an arc_write() then the hdr's data buffer will be released when the
 	 * write completes, even though the L2ARC write might still be using it.
 	 * Second, the hdr's ABD must be linear so that the buf's user doesn't
-	 * need to be ABD-aware.
-	 */
-	boolean_t can_share = arc_can_share(hdr, buf) && !HDR_L2_WRITING(hdr) &&
-	    hdr->b_l1hdr.b_pabd != NULL && abd_is_linear(hdr->b_l1hdr.b_pabd);
+	 * need to be ABD-aware. It must be allocated via
+	 * zio_[data_]buf_alloc(), not as a page, because we need to be able
+	 * to abd_release_ownership_of_buf(), which isn't allowed on "linear
+	 * page" buffers because the ABD code needs to handle freeing them
+	 * specially.
+	 */
+	boolean_t can_share = arc_can_share(hdr, buf) &&
+	    !HDR_L2_WRITING(hdr) &&
+	    hdr->b_l1hdr.b_pabd != NULL &&
+	    abd_is_linear(hdr->b_l1hdr.b_pabd) &&
+	    !abd_is_linear_page(hdr->b_l1hdr.b_pabd);
 
 	/* Set up b_data and sharing */
 	if (can_share) {
@@ -3731,7 +3739,6 @@ arc_alloc_compressed_buf(spa_t *spa, void *tag, uint64_t psize, uint64_t lsize,
 		 * disk, it's easiest if we just set up sharing between the
 		 * buf and the hdr.
 		 */
-		ASSERT(!abd_is_linear(hdr->b_l1hdr.b_pabd));
 		arc_hdr_free_abd(hdr, B_FALSE);
 		arc_share_buf(hdr, buf);
 	}