xfs: convert xfarray_pagesort to deal with large folios

Convert xfarray_pagesort to handle large folios by introducing a new
xfile_get_folio routine that can return a folio of arbitrary size, and
using heapsort on the full folio.  This also corrects an off-by-one bug
in the calculation of len in xfarray_pagesort that was papered over by
xfarray_want_pagesort.

Signed-off-by: "Darrick J. Wong" <djwong@kernel.org>
Signed-off-by: Christoph Hellwig <hch@lst.de>
Reviewed-by: Kent Overstreet <kent.overstreet@linux.dev>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>
This commit is contained in:
Darrick J. Wong 2024-02-19 07:27:29 +01:00 committed by Chandan Babu R
parent b2fdfe19df
commit ee13fc6720
3 changed files with 143 additions and 111 deletions

View File

@ -956,7 +956,7 @@ TRACE_EVENT(xfarray_isort,
__entry->hi - __entry->lo)
);
TRACE_EVENT(xfarray_pagesort,
TRACE_EVENT(xfarray_foliosort,
TP_PROTO(struct xfarray_sortinfo *si, uint64_t lo, uint64_t hi),
TP_ARGS(si, lo, hi),
TP_STRUCT__entry(
@ -1027,6 +1027,47 @@ TRACE_EVENT(xfarray_sort,
__entry->bytes)
);
/*
 * Tracepoint recording one xfarray sort scan step for array index @idx.
 *
 * Each record carries the inode number of the file backing the array's xfile,
 * the array's record count (nr) and per-record size (obj_size), and the index
 * being examined.  When the sortinfo has a folio cached, the folio's file
 * position and size plus the sortinfo's first/last folio index bounds are
 * captured as well; otherwise those four fields are logged as zero.
 */
TRACE_EVENT(xfarray_sort_scan,
TP_PROTO(struct xfarray_sortinfo *si, unsigned long long idx),
TP_ARGS(si, idx),
TP_STRUCT__entry(
__field(unsigned long, ino)
__field(unsigned long long, nr)
__field(size_t, obj_size)
__field(unsigned long long, idx)
__field(unsigned long long, folio_pos)
__field(unsigned long, folio_bytes)
__field(unsigned long long, first_idx)
__field(unsigned long long, last_idx)
),
TP_fast_assign(
__entry->nr = si->array->nr;
__entry->obj_size = si->array->obj_size;
__entry->ino = file_inode(si->array->xfile->file)->i_ino;
__entry->idx = idx;
/* Folio details can only be sampled while a folio is cached in @si. */
if (si->folio) {
__entry->folio_pos = folio_pos(si->folio);
__entry->folio_bytes = folio_size(si->folio);
__entry->first_idx = si->first_folio_idx;
__entry->last_idx = si->last_folio_idx;
} else {
/* No cached folio: report the folio-related fields as zero. */
__entry->folio_pos = 0;
__entry->folio_bytes = 0;
__entry->first_idx = 0;
__entry->last_idx = 0;
}
),
TP_printk("xfino 0x%lx nr %llu objsz %zu idx %llu folio_pos 0x%llx folio_bytes 0x%lx first_idx %llu last_idx %llu",
__entry->ino,
__entry->nr,
__entry->obj_size,
__entry->idx,
__entry->folio_pos,
__entry->folio_bytes,
__entry->first_idx,
__entry->last_idx)
);
TRACE_EVENT(xfarray_sort_stats,
TP_PROTO(struct xfarray_sortinfo *si, int error),
TP_ARGS(si, error),

View File

@ -563,70 +563,42 @@ xfarray_isort(
return xfile_store(si->array->xfile, scratch, len, lo_pos);
}
/*
 * Ask the xfile for the page backing @len bytes at @pos and stash it in the
 * sortinfo's page slot (si->xfpage).  Returns whatever xfile_get_page()
 * returns.
 */
static inline int
xfarray_sort_get_page(
	struct xfarray_sortinfo	*si,
	loff_t			pos,
	uint64_t		len)
{
	struct xfile_page	*xfpage = &si->xfpage;

	return xfile_get_page(si->array->xfile, pos, len, xfpage);
}
/*
 * Give back the page held in the sortinfo's page slot, if there is one.
 * Returns 0 when nothing is cached, otherwise the result of xfile_put_page().
 */
static inline int
xfarray_sort_put_page(
	struct xfarray_sortinfo	*si)
{
	struct xfile_page	*xfpage = &si->xfpage;

	/* Only hand the page back if one is actually cached. */
	if (xfile_page_cached(xfpage))
		return xfile_put_page(si->array->xfile, xfpage);

	return 0;
}
/*
 * Report whether records @lo through @hi (inclusive) are eligible for in-page
 * sorting, i.e. whether the first byte of record @lo and the last byte of
 * record @hi fall within the same page index.
 */
static inline bool
xfarray_want_pagesort(
	struct xfarray_sortinfo	*si,
	xfarray_idx_t		lo,
	xfarray_idx_t		hi)
{
	/* Page index holding the first byte of the lowest record. */
	pgoff_t			first_page = xfarray_pos(si->array, lo) >> PAGE_SHIFT;
	/* Offset of the final byte of the highest record. */
	loff_t			last_byte = xfarray_pos(si->array, hi) +
					    si->array->obj_size - 1;
	pgoff_t			last_page = last_byte >> PAGE_SHIFT;

	/* We can only map one page at a time. */
	return first_page == last_page;
}
/* Sort a bunch of records that all live in the same memory page. */
/*
* Sort the records from lo to hi (inclusive) if they are all backed by the
* same memory folio. Returns 1 if it sorted, 0 if it did not, or a negative
* errno.
*/
STATIC int
xfarray_pagesort(
xfarray_foliosort(
struct xfarray_sortinfo *si,
xfarray_idx_t lo,
xfarray_idx_t hi)
{
struct folio *folio;
void *startp;
loff_t lo_pos = xfarray_pos(si->array, lo);
uint64_t len = xfarray_pos(si->array, hi - lo);
int error = 0;
uint64_t len = xfarray_pos(si->array, hi - lo + 1);
trace_xfarray_pagesort(si, lo, hi);
/* No single folio could back this many records. */
if (len > XFILE_MAX_FOLIO_SIZE)
return 0;
xfarray_sort_bump_loads(si);
error = xfarray_sort_get_page(si, lo_pos, len);
if (error)
return error;
folio = xfile_get_folio(si->array->xfile, lo_pos, len, XFILE_ALLOC);
if (IS_ERR(folio))
return PTR_ERR(folio);
if (!folio)
return 0;
trace_xfarray_foliosort(si, lo, hi);
xfarray_sort_bump_heapsorts(si);
startp = page_address(si->xfpage.page) + offset_in_page(lo_pos);
startp = folio_address(folio) + offset_in_folio(folio, lo_pos);
sort(startp, hi - lo + 1, si->array->obj_size, si->cmp_fn, NULL);
xfarray_sort_bump_stores(si);
return xfarray_sort_put_page(si);
xfile_put_folio(si->array->xfile, folio);
return 1;
}
/* Return a pointer to the xfarray pivot record within the sortinfo struct. */
@ -814,63 +786,78 @@ xfarray_qsort_push(
return 0;
}
/*
 * Finish a scan: if the sortinfo is caching a folio, hand it back to the
 * xfile and clear the cached pointer.
 */
static inline void
xfarray_sort_scan_done(
	struct xfarray_sortinfo	*si)
{
	/* Nothing cached, nothing to release. */
	if (!si->folio)
		return;

	xfile_put_folio(si->array->xfile, si->folio);
	si->folio = NULL;
}
/*
* Load an element from the array into the first scratchpad and cache the page,
* if possible.
* Cache the folio backing the start of the given array element. If the array
* element is contained entirely within the folio, return a pointer to the
* cached folio. Otherwise, load the element into the scratchpad and return a
* pointer to the scratchpad.
*/
static inline int
xfarray_sort_load_cached(
xfarray_sort_scan(
struct xfarray_sortinfo *si,
xfarray_idx_t idx,
void *ptr)
void **ptrp)
{
loff_t idx_pos = xfarray_pos(si->array, idx);
pgoff_t startpage;
pgoff_t endpage;
int error = 0;
/*
* If this load would split a page, release the cached page, if any,
* and perform a traditional read.
*/
startpage = idx_pos >> PAGE_SHIFT;
endpage = (idx_pos + si->array->obj_size - 1) >> PAGE_SHIFT;
if (startpage != endpage) {
error = xfarray_sort_put_page(si);
if (error)
return error;
if (xfarray_sort_terminated(si, &error))
return error;
if (xfarray_sort_terminated(si, &error))
return error;
trace_xfarray_sort_scan(si, idx);
return xfile_load(si->array->xfile, ptr,
si->array->obj_size, idx_pos);
}
/* If the cached folio doesn't cover this index, release it. */
if (si->folio &&
(idx < si->first_folio_idx || idx > si->last_folio_idx))
xfarray_sort_scan_done(si);
/* If the cached page is not the one we want, release it. */
if (xfile_page_cached(&si->xfpage) &&
xfile_page_index(&si->xfpage) != startpage) {
error = xfarray_sort_put_page(si);
if (error)
return error;
/* Grab the first folio that backs this array element. */
if (!si->folio) {
loff_t next_pos;
si->folio = xfile_get_folio(si->array->xfile, idx_pos,
si->array->obj_size, XFILE_ALLOC);
if (IS_ERR(si->folio))
return PTR_ERR(si->folio);
si->first_folio_idx = xfarray_idx(si->array,
folio_pos(si->folio) + si->array->obj_size - 1);
next_pos = folio_pos(si->folio) + folio_size(si->folio);
si->last_folio_idx = xfarray_idx(si->array, next_pos - 1);
if (xfarray_pos(si->array, si->last_folio_idx + 1) > next_pos)
si->last_folio_idx--;
trace_xfarray_sort_scan(si, idx);
}
/*
* If we don't have a cached page (and we know the load is contained
* in a single page) then grab it.
* If this folio still doesn't cover the desired element, it must cross
* a folio boundary. Read into the scratchpad and we're done.
*/
if (!xfile_page_cached(&si->xfpage)) {
if (xfarray_sort_terminated(si, &error))
return error;
if (idx < si->first_folio_idx || idx > si->last_folio_idx) {
void *temp = xfarray_scratch(si->array);
error = xfarray_sort_get_page(si, startpage << PAGE_SHIFT,
PAGE_SIZE);
error = xfile_load(si->array->xfile, temp, si->array->obj_size,
idx_pos);
if (error)
return error;
*ptrp = temp;
return 0;
}
memcpy(ptr, page_address(si->xfpage.page) + offset_in_page(idx_pos),
si->array->obj_size);
/* Otherwise return a pointer to the array element in the folio. */
*ptrp = folio_address(si->folio) + offset_in_folio(si->folio, idx_pos);
return 0;
}
@ -937,6 +924,8 @@ xfarray_sort(
pivot = xfarray_sortinfo_pivot(si);
while (si->stack_depth >= 0) {
int ret;
lo = si_lo[si->stack_depth];
hi = si_hi[si->stack_depth];
@ -949,13 +938,13 @@ xfarray_sort(
}
/*
* If directly mapping the page and sorting can solve our
* If directly mapping the folio and sorting can solve our
* problems, we're done.
*/
if (xfarray_want_pagesort(si, lo, hi)) {
error = xfarray_pagesort(si, lo, hi);
if (error)
goto out_free;
ret = xfarray_foliosort(si, lo, hi);
if (ret < 0)
goto out_free;
if (ret == 1) {
si->stack_depth--;
continue;
}
@ -980,25 +969,24 @@ xfarray_sort(
* than the pivot is on the right side of the range.
*/
while (lo < hi) {
void *p;
/*
* Decrement hi until it finds an a[hi] less than the
* pivot value.
*/
error = xfarray_sort_load_cached(si, hi, scratch);
error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
while (xfarray_sort_cmp(si, scratch, pivot) >= 0 &&
lo < hi) {
while (xfarray_sort_cmp(si, p, pivot) >= 0 && lo < hi) {
hi--;
error = xfarray_sort_load_cached(si, hi,
scratch);
error = xfarray_sort_scan(si, hi, &p);
if (error)
goto out_free;
}
error = xfarray_sort_put_page(si);
if (error)
goto out_free;
if (p != scratch)
memcpy(scratch, p, si->array->obj_size);
xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;
@ -1013,21 +1001,18 @@ xfarray_sort(
* Increment lo until it finds an a[lo] greater than
* the pivot value.
*/
error = xfarray_sort_load_cached(si, lo, scratch);
error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
while (xfarray_sort_cmp(si, scratch, pivot) <= 0 &&
lo < hi) {
while (xfarray_sort_cmp(si, p, pivot) <= 0 && lo < hi) {
lo++;
error = xfarray_sort_load_cached(si, lo,
scratch);
error = xfarray_sort_scan(si, lo, &p);
if (error)
goto out_free;
}
error = xfarray_sort_put_page(si);
if (error)
goto out_free;
if (p != scratch)
memcpy(scratch, p, si->array->obj_size);
xfarray_sort_scan_done(si);
if (xfarray_sort_terminated(si, &error))
goto out_free;

View File

@ -105,8 +105,14 @@ struct xfarray_sortinfo {
/* XFARRAY_SORT_* flags; see below. */
unsigned int flags;
/* Cache a page here for faster access. */
struct xfile_page xfpage;
/* Cache a folio here for faster scanning for pivots */
struct folio *folio;
/* First array index in folio that is completely readable */
xfarray_idx_t first_folio_idx;
/* Last array index in folio that is completely readable */
xfarray_idx_t last_folio_idx;
#ifdef DEBUG
/* Performance statistics. */