staging: erofs: introduce VLE decompression support

This patch introduces the basic in-place VLE decompression
implementation for the erofs file system.

Compared with fixed-sized input compression, it implements
what we call 'variable-length extent compression', which
fixes the output size of each compression block in order to
make full use of IO bandwidth (almost all data read from the
block device can be used directly for decompression), to
improve real random read performance (rather than relying on
data caching, which costs more memory), and to keep relatively
low compression ratios (it saves more storage space than
fixed-sized input compression configured with the same input
block size), as illustrated below:

        |---  variable-length extent ---|------ VLE ------|---  VLE ---|
         /> clusterofs                  /> clusterofs     /> clusterofs /> clusterofs
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
...||   |       ||           ||         | ||           || |         || | ... original data
   ++---|-------++-----------++---------|-++-----------++-|---------++-|
   ++->cluster<-++->cluster<-++->cluster<-++->cluster<-++->cluster<-++
        size         size         size         size         size
         \                             /                 /            /
          \                      /              /            /
           \               /            /            /
            ++-----------++-----------++-----------++
        ... ||           ||           ||           || ... compressed clusters
            ++-----------++-----------++-----------++
            ++->cluster<-++->cluster<-++->cluster<-++
                 size         size         size

'In-place' refers to the decompression mode: instead of
allocating independent compressed pages and data structures,
it reuses the already-allocated file cache pages as much as
possible to store both the compressed data and the
corresponding pagevec, in a time-sharing fashion, by default.
This is useful in low-memory scenarios.

Finally, other filesystems with (de)compression support use a
relatively large compression block size: they read and decompress
>= 128KB at once and thereby report better-looking random read
numbers (in fact they turn small random reads into large
sequential reads and cache all the decompressed data in memory,
which is unacceptable especially for embedded devices with
limited memory, and is not real random read). In contrast, we
select a universal small-sized 4KB compressed cluster, which is
the smallest page size on most architectures; every compressed
cluster can be read and decompressed independently, which
guarantees random read performance for all use cases.

Signed-off-by: Gao Xiang <gaoxiang25@huawei.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
This commit is contained in:
Gao Xiang 2018-07-26 20:22:06 +08:00 committed by Greg Kroah-Hartman
parent e7e9a307be
commit 3883a79abd
6 changed files with 1418 additions and 2 deletions

View File

@ -210,7 +210,12 @@ static int fill_inode(struct inode *inode, int isdir)
}
if (is_inode_layout_compression(inode)) {
#ifdef CONFIG_EROFS_FS_ZIP
inode->i_mapping->a_ops =
&z_erofs_vle_normalaccess_aops;
#else
err = -ENOTSUPP;
#endif
goto out_unlock;
}

View File

@ -262,6 +262,9 @@ static inline void erofs_workstation_cleanup_all(struct super_block *sb)
#ifdef CONFIG_EROFS_FS_ZIP
/* hard limit of pages per compressed cluster */
#define Z_EROFS_CLUSTER_MAX_PAGES (CONFIG_EROFS_FS_CLUSTER_PAGE_LIMIT)
/* page count of a compressed cluster */
#define erofs_clusterpages(sbi) ((1 << (sbi)->clusterbits) / PAGE_SIZE)
#endif
typedef u64 erofs_off_t;
@ -340,6 +343,9 @@ extern const struct inode_operations erofs_dir_iops;
extern const struct file_operations erofs_dir_fops;
extern const struct address_space_operations erofs_raw_access_aops;
#ifdef CONFIG_EROFS_FS_ZIP
extern const struct address_space_operations z_erofs_vle_normalaccess_aops;
#endif
/*
* Logical to physical block mapping, used by erofs_map_blocks()

View File

@ -115,6 +115,13 @@ static int superblock_read(struct super_block *sb)
sbi->xattr_blkaddr = le32_to_cpu(layout->xattr_blkaddr);
#endif
sbi->islotbits = ffs(sizeof(struct erofs_inode_v1)) - 1;
#ifdef CONFIG_EROFS_FS_ZIP
sbi->clusterbits = 12;
if (1 << (sbi->clusterbits - 12) > Z_EROFS_CLUSTER_MAX_PAGES)
errln("clusterbits %u is not supported on this kernel",
sbi->clusterbits);
#endif
sbi->root_nid = le16_to_cpu(layout->root_nid);
sbi->inos = le64_to_cpu(layout->inos);
@ -441,6 +448,11 @@ static struct file_system_type erofs_fs_type = {
};
MODULE_ALIAS_FS("erofs");
#ifdef CONFIG_EROFS_FS_ZIP
extern int z_erofs_init_zip_subsystem(void);
extern void z_erofs_exit_zip_subsystem(void);
#endif
static int __init erofs_module_init(void)
{
int err;
@ -456,6 +468,12 @@ static int __init erofs_module_init(void)
if (err)
goto shrinker_err;
#ifdef CONFIG_EROFS_FS_ZIP
err = z_erofs_init_zip_subsystem();
if (err)
goto zip_err;
#endif
err = register_filesystem(&erofs_fs_type);
if (err)
goto fs_err;
@ -464,6 +482,10 @@ static int __init erofs_module_init(void)
return 0;
fs_err:
#ifdef CONFIG_EROFS_FS_ZIP
z_erofs_exit_zip_subsystem();
zip_err:
#endif
unregister_shrinker(&erofs_shrinker_info);
shrinker_err:
erofs_exit_inode_cache();
@ -474,6 +496,9 @@ icache_err:
static void __exit erofs_module_exit(void)
{
unregister_filesystem(&erofs_fs_type);
#ifdef CONFIG_EROFS_FS_ZIP
z_erofs_exit_zip_subsystem();
#endif
unregister_shrinker(&erofs_shrinker_info);
erofs_exit_inode_cache();
infoln("successfully finalize erofs");

File diff suppressed because it is too large Load Diff

View File

@ -14,9 +14,213 @@
#define __EROFS_FS_UNZIP_VLE_H
#include "internal.h"
#include "unzip_pagevec.h"
/*
* - 0x5A110C8D ('sallocated', Z_EROFS_MAPPING_STAGING) -
* used for temporary allocated pages (via erofs_allocpage),
* in order to separate those from NULL mapping (eg. truncated pages)
*/
#define Z_EROFS_MAPPING_STAGING ((void *)0x5A110C8D)
#define z_erofs_is_stagingpage(page) \
((page)->mapping == Z_EROFS_MAPPING_STAGING)
/*
 * If @page is a staging page (z_erofs_is_stagingpage() is true for it),
 * queue it on @page_pool through its ->lru list head.
 *
 * Returns true when the page was queued, false when it was left untouched.
 */
static inline bool z_erofs_gather_if_stagingpage(struct list_head *page_pool,
						 struct page *page)
{
	if (!z_erofs_is_stagingpage(page))
		return false;

	list_add(&page->lru, page_pool);
	return true;
}
/*
* Structure fields follow one of the following exclusion rules.
*
* I: Modifiable by initialization/destruction paths and read-only
* for everyone else.
*
*/
#define Z_EROFS_VLE_INLINE_PAGEVECS 3
/*
 * Per-work decompression state; one instance is embedded in every
 * struct z_erofs_vle_workgroup as its ->work member.
 *
 * Field tags: 'I' is described in the exclusion-rule comment preceding
 * this structure. 'L' is used below but is not defined there.
 * NOTE(review): 'L' presumably means "protected by ->lock" - confirm.
 */
struct z_erofs_vle_work {
/* struct z_erofs_vle_work *left, *right; */
#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
/* only present in multiref builds */
struct list_head list;
atomic_t refcount;
#endif
struct mutex lock;
/* I: decompression offset in page */
unsigned short pageofs;
/* NOTE(review): untagged; exact meaning is not visible here - confirm */
unsigned short nr_pages;
/* L: queued pages in pagevec[] */
unsigned vcnt;
/* pagevec[] and rcu share storage, so only one of them is valid at a time */
union {
/* L: pagevec */
erofs_vtptr_t pagevec[Z_EROFS_VLE_INLINE_PAGEVECS];
/* NOTE(review): presumably used only once the work is being freed - confirm */
struct rcu_head rcu;
};
};
#define Z_EROFS_VLE_WORKGRP_FMT_PLAIN 0
#define Z_EROFS_VLE_WORKGRP_FMT_LZ4 1
#define Z_EROFS_VLE_WORKGRP_FMT_MASK 1
typedef struct z_erofs_vle_workgroup *z_erofs_vle_owned_workgrp_t;
/*
 * A VLE workgroup: the generic erofs workgroup object plus the
 * VLE-specific decompression state of one compressed cluster.
 */
struct z_erofs_vle_workgroup {
/* generic workgroup object */
struct erofs_workgroup obj;
/* embedded work; this is what z_erofs_vle_grab_primary_work() returns */
struct z_erofs_vle_work work;
/* next owned workgroup */
z_erofs_vle_owned_workgrp_t next;
/* compressed pages (including multi-usage pages) */
struct page *compressed_pages[Z_EROFS_CLUSTER_MAX_PAGES];
/*
 * flags: bits under Z_EROFS_VLE_WORKGRP_FMT_MASK hold the format
 * (Z_EROFS_VLE_WORKGRP_FMT_*).
 * NOTE(review): llen is presumably the logical (decompressed) length -
 * confirm against the users in unzip_vle.c.
 */
unsigned int llen, flags;
};
/* let's avoid the valid 32-bit kernel addresses */
/* the chained workgroup hasn't submitted io (still open) */
#define Z_EROFS_VLE_WORKGRP_TAIL ((void *)0x5F0ECAFE)
/* the chained workgroup has already submitted io */
#define Z_EROFS_VLE_WORKGRP_TAIL_CLOSED ((void *)0x5F0EDEAD)
#define Z_EROFS_VLE_WORKGRP_NIL (NULL)
#define z_erofs_vle_workgrp_fmt(grp) \
((grp)->flags & Z_EROFS_VLE_WORKGRP_FMT_MASK)
/*
 * Store @fmt into the format bits of @grp->flags, leaving every bit
 * outside Z_EROFS_VLE_WORKGRP_FMT_MASK untouched.
 *
 * @fmt is OR-ed in as-is; it is not masked here.
 */
static inline void z_erofs_vle_set_workgrp_fmt(
	struct z_erofs_vle_workgroup *grp,
	unsigned int fmt)
{
	const unsigned int preserved =
		grp->flags & ~Z_EROFS_VLE_WORKGRP_FMT_MASK;

	grp->flags = fmt | preserved;
}
/*
 * Work <-> workgroup accessors.
 *
 * Multiref builds are rejected at compile time; in the supported
 * configuration every workgroup has exactly one embedded work.
 */
#ifdef CONFIG_EROFS_FS_ZIP_MULTIREF
#error multiref decompression is unimplemented yet
#else
/* both accessors return the single embedded work; pageofs is ignored */
#define z_erofs_vle_grab_primary_work(grp) (&(grp)->work)
#define z_erofs_vle_grab_work(grp, pageofs) (&(grp)->work)
/*
 * Map a work back to the workgroup that embeds it. Only a primary work
 * can be mapped; passing primary == false hits BUG().
 */
#define z_erofs_vle_work_workgroup(wrk, primary) \
((primary) ? container_of(wrk, \
struct z_erofs_vle_workgroup, work) : \
({ BUG(); (void *)NULL; }))
#endif
#define Z_EROFS_WORKGROUP_SIZE sizeof(struct z_erofs_vle_workgroup)
/*
 * State of one decompression I/O submission.
 */
struct z_erofs_vle_unzip_io {
/* NOTE(review): presumably the number of bios still in flight - confirm */
atomic_t pending_bios;
/* head of the owned workgroup chain (see z_erofs_vle_workgroup.next) */
z_erofs_vle_owned_workgrp_t head;
/*
 * wait and work share storage, so only one of them is in use.
 * NOTE(review): presumably wait is for synchronous completion and work
 * for deferred completion - confirm against the submitter.
 */
union {
wait_queue_head_t wait;
struct work_struct work;
} u;
};
/*
 * A z_erofs_vle_unzip_io bundled with the super block it belongs to.
 * io is the first member, so both structures start at the same address.
 */
struct z_erofs_vle_unzip_io_sb {
struct z_erofs_vle_unzip_io io;
struct super_block *sb;
};
#define Z_EROFS_ONLINEPAGE_COUNT_BITS 2
#define Z_EROFS_ONLINEPAGE_COUNT_MASK ((1 << Z_EROFS_ONLINEPAGE_COUNT_BITS) - 1)
#define Z_EROFS_ONLINEPAGE_INDEX_SHIFT (Z_EROFS_ONLINEPAGE_COUNT_BITS)
/*
* waiters (aka. ongoing_packs): # to unlock the page
* sub-index: 0 - for partial page, >= 1 full page sub-index
*/
typedef atomic_t z_erofs_onlinepage_t;
/* type punning */
/*
 * Lets one address be used either as a z_erofs_onlinepage_t (atomic_t)
 * pointer or as an unsigned long pointer. The online-page helpers assign
 * &page_private(page) to ->v and then operate on it through ->o.
 */
union z_erofs_onlinepage_converter {
/* atomic view */
z_erofs_onlinepage_t *o;
/* raw unsigned long view */
unsigned long *v;
};
static inline unsigned z_erofs_onlinepage_index(struct page *page)
{
union z_erofs_onlinepage_converter u;
BUG_ON(!PagePrivate(page));
u.v = &page_private(page);
return atomic_read(u.o) >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
}
/*
 * Turn @page into an "online page": store an atomic value of 1 in its
 * private word and then set PagePrivate.
 */
static inline void z_erofs_onlinepage_init(struct page *page)
{
/*
 * Build the initial private value through the atomic view.
 * NOTE(review): only .o is initialized; where unsigned long is wider
 * than atomic_t the remaining bytes of v rely on the compiler zeroing
 * the rest of the union - confirm.
 */
union {
z_erofs_onlinepage_t o;
unsigned long v;
/* keep from being unlocked in advance */
} u = { .o = ATOMIC_INIT(1) };
set_page_private(page, u.v);
/* write barrier: the private value is stored before PagePrivate is set */
smp_wmb();
SetPagePrivate(page);
}
/*
 * Record @index as the sub-index of an online page and, when @down is
 * true, add one to its waiter count.
 *
 * @page:  online page whose private word is updated
 * @index: sub-index to record (0 means "no index")
 * @down:  true to increment the waiter count by one
 *
 * If an index is already recorded, a zero @index returns without changing
 * anything (the count is not incremented either), and a non-zero @index
 * must match the recorded one or this BUG()s.
 *
 * The private word is accessed through the same atomic_t view that
 * z_erofs_onlinepage_init(), z_erofs_onlinepage_index() and
 * z_erofs_onlinepage_endio() use. Doing cmpxchg() on the full
 * unsigned long instead would touch a different half of the word than
 * those helpers on 64-bit big-endian machines.
 */
static inline void z_erofs_onlinepage_fixup(struct page *page,
	uintptr_t index, bool down)
{
	union z_erofs_onlinepage_converter u = { .v = &page_private(page) };
	unsigned int orig, orig_index, val;

repeat:
	orig = atomic_read(u.o);
	orig_index = orig >> Z_EROFS_ONLINEPAGE_INDEX_SHIFT;
	if (orig_index) {
		/* an index is already recorded: index 0 never overrides it */
		if (!index)
			return;

		BUG_ON(orig_index != index);
	}

	val = (index << Z_EROFS_ONLINEPAGE_INDEX_SHIFT) |
		((orig & Z_EROFS_ONLINEPAGE_COUNT_MASK) + (unsigned int)down);

	/* lost a race with a concurrent update: reload and retry */
	if ((unsigned int)atomic_cmpxchg(u.o, orig, val) != orig)
		goto repeat;
}
static inline void z_erofs_onlinepage_endio(struct page *page)
{
union z_erofs_onlinepage_converter u;
unsigned v;
BUG_ON(!PagePrivate(page));
u.v = &page_private(page);
v = atomic_dec_return(u.o);
if (!(v & Z_EROFS_ONLINEPAGE_COUNT_MASK)) {
ClearPagePrivate(page);
if (!PageError(page))
SetPageUptodate(page);
unlock_page(page);
}
debugln("%s, page %p value %x", __func__, page, atomic_read(u.o));
}
#define Z_EROFS_VLE_VMAP_ONSTACK_PAGES \
min(THREAD_SIZE / 8 / sizeof(struct page *), 96UL)
#define Z_EROFS_VLE_VMAP_GLOBAL_PAGES 2048
/* unzip_vle_lz4.c */
extern int z_erofs_vle_plain_copy(struct page **compressed_pages,
unsigned clusterpages, struct page **pages,

View File

@ -12,6 +12,7 @@
*/
#include "internal.h"
#include <linux/pagevec.h>
struct page *erofs_allocpage(struct list_head *pool, gfp_t gfp)
{
@ -98,11 +99,69 @@ int erofs_register_workgroup(struct super_block *sb,
return err;
}
extern void erofs_workgroup_free_rcu(struct erofs_workgroup *grp);
/*
 * Drop one reference on @grp.
 *
 * When the count falls to 1, erofs_global_shrink_cnt is incremented.
 * When it falls to 0, erofs_global_shrink_cnt is decremented and the
 * group is handed to erofs_workgroup_free_rcu().
 *
 * Returns the reference count after the decrement.
 */
int erofs_workgroup_put(struct erofs_workgroup *grp)
{
	const int remaining = atomic_dec_return(&grp->refcount);

	switch (remaining) {
	case 1:
		atomic_long_inc(&erofs_global_shrink_cnt);
		break;
	case 0:
		atomic_long_dec(&erofs_global_shrink_cnt);
		erofs_workgroup_free_rcu(grp);
		break;
	default:
		break;
	}

	return remaining;
}
/*
 * Walk @sbi's workstation radix tree and release workgroups whose
 * reference count is exactly 1.
 *
 * @sbi:       super block info owning the workstn_tree to scan
 * @nr_shrink: maximum number of workgroups to release; it is decremented
 *             before being tested, so callers must pass a non-zero value
 * @cleanup:   when true, every workgroup found is required to have a
 *             reference count of 1 (BUG otherwise)
 *
 * The tree is scanned in batches of PAGEVEC_SIZE entries, each batch
 * under erofs_workstn_lock(); the lock is dropped between batches.
 *
 * Returns the number of workgroups removed from the tree.
 */
unsigned long erofs_shrink_workstation(struct erofs_sb_info *sbi,
				       unsigned long nr_shrink,
				       bool cleanup)
{
	pgoff_t first_index = 0;
	void *batch[PAGEVEC_SIZE];
	unsigned freed = 0;
	int i, found;

repeat:
	erofs_workstn_lock(sbi);

	found = radix_tree_gang_lookup(&sbi->workstn_tree,
		batch, first_index, PAGEVEC_SIZE);

	for (i = 0; i < found; ++i) {
		int cnt;
		/* strip the exceptional-entry tag bit to get the pointer */
		struct erofs_workgroup *grp = (void *)
			((unsigned long)batch[i] &
				~RADIX_TREE_EXCEPTIONAL_ENTRY);

		/* the next lookup resumes right after this entry */
		first_index = grp->index + 1;

		cnt = atomic_read(&grp->refcount);
		BUG_ON(cnt <= 0);

		if (cleanup)
			BUG_ON(cnt != 1);
		else if (cnt > 1)
			continue;	/* referenced by someone else: skip */

		if (radix_tree_delete(&sbi->workstn_tree,
			grp->index) != grp)
			continue;

		/* (rarely) grabbed again when freeing */
		erofs_workgroup_put(grp);

		++freed;
		if (unlikely(!--nr_shrink))
			break;
	}
	erofs_workstn_unlock(sbi);

	/* this batch had entries and the budget is not used up: scan on */
	if (i && nr_shrink)
		goto repeat;
	return freed;
}
#endif