diff --git a/Documentation/ABI/testing/sysfs-fs-erofs b/Documentation/ABI/testing/sysfs-fs-erofs index bb4681a01811..284224d1b56f 100644 --- a/Documentation/ABI/testing/sysfs-fs-erofs +++ b/Documentation/ABI/testing/sysfs-fs-erofs @@ -4,7 +4,8 @@ Contact: "Huang Jianan" Description: Shows all enabled kernel features. Supported features: zero_padding, compr_cfgs, big_pcluster, chunked_file, - device_table, compr_head2, sb_chksum. + device_table, compr_head2, sb_chksum, ztailpacking, + dedupe, fragments. What: /sys/fs/erofs//sync_decompress Date: November 2021 diff --git a/MAINTAINERS b/MAINTAINERS index b602d07c6752..82938ca70466 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7747,6 +7747,7 @@ R: Jeffle Xu L: linux-erofs@lists.ozlabs.org S: Maintained T: git git://git.kernel.org/pub/scm/linux/kernel/git/xiang/erofs.git +F: Documentation/ABI/testing/sysfs-fs-erofs F: Documentation/filesystems/erofs.rst F: fs/erofs/ F: include/trace/events/erofs.h diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 85490370e0ca..704fb59577e0 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -108,3 +108,21 @@ config EROFS_FS_ONDEMAND read support. If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD + bool "EROFS per-cpu decompression kthread workers" + depends on EROFS_FS_ZIP + help + Saying Y here enables per-CPU kthread workers pool to carry out + async decompression for low latencies on some architectures. + + If unsure, say N. + +config EROFS_FS_PCPU_KTHREAD_HIPRI + bool "EROFS high priority per-CPU kthread workers" + depends on EROFS_FS_ZIP && EROFS_FS_PCPU_KTHREAD + help + This permits EROFS to configure per-CPU kthread workers to run + at higher priority. + + If unsure, say N. diff --git a/fs/erofs/data.c b/fs/erofs/data.c index f57f921683d7..032e12dccb84 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -74,8 +74,7 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, } static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map, - int flags) + struct erofs_map_blocks *map) { erofs_blk_t nblocks, lastblk; u64 offset = map->m_la; @@ -91,11 +90,8 @@ static int erofs_map_blocks_flatmode(struct inode *inode, map->m_pa = blknr_to_addr(vi->raw_blkaddr) + map->m_la; map->m_plen = blknr_to_addr(lastblk) - offset; } else if (tailendpacking) { - /* 2 - inode inline B: inode, [xattrs], inline last blk... 
*/ - struct erofs_sb_info *sbi = EROFS_SB(inode->i_sb); - - map->m_pa = iloc(sbi, vi->nid) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(map->m_la); + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(offset); map->m_plen = inode->i_size - offset; /* inline data should be located in the same meta block */ @@ -117,8 +113,7 @@ static int erofs_map_blocks_flatmode(struct inode *inode, return 0; } -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags) +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { struct super_block *sb = inode->i_sb; struct erofs_inode *vi = EROFS_I(inode); @@ -130,7 +125,7 @@ int erofs_map_blocks(struct inode *inode, void *kaddr; int err = 0; - trace_erofs_map_blocks_enter(inode, map, flags); + trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; if (map->m_la >= inode->i_size) { /* leave out-of-bound access unmapped */ @@ -140,7 +135,7 @@ int erofs_map_blocks(struct inode *inode, } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map, flags); + err = erofs_map_blocks_flatmode(inode, map); goto out; } @@ -150,7 +145,7 @@ int erofs_map_blocks(struct inode *inode, unit = EROFS_BLOCK_MAP_ENTRY_SIZE; /* block map */ chunknr = map->m_la >> vi->chunkbits; - pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize + + pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP); @@ -192,7 +187,7 @@ out_unlock: out: if (!err) map->m_llen = map->m_plen; - trace_erofs_map_blocks_exit(inode, map, flags, 0); + trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -255,7 +250,7 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_la = offset; map.m_llen = length; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret < 0) return ret; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index ecf28f66b97d..6970b09b8307 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -6,21 +6,6 @@ */ #include "internal.h" -static void debug_one_dentry(unsigned char d_type, const char *de_name, - unsigned int de_namelen) -{ -#ifdef CONFIG_EROFS_FS_DEBUG - /* since the on-disk name could not have the trailing '\0' */ - unsigned char dbg_namebuf[EROFS_NAME_LEN + 1]; - - memcpy(dbg_namebuf, de_name, de_namelen); - dbg_namebuf[de_namelen] = '\0'; - - erofs_dbg("found dirent %s de_len %u d_type %d", dbg_namebuf, - de_namelen, d_type); -#endif -} - static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, void *dentry_blk, struct erofs_dirent *de, unsigned int nameoff, unsigned int maxsize) @@ -52,10 +37,8 @@ static int erofs_fill_dentries(struct inode *dir, struct dir_context *ctx, return -EFSCORRUPTED; } - debug_one_dentry(d_type, de_name, de_namelen); if (!dir_emit(ctx, de_name, de_namelen, le64_to_cpu(de->nid), d_type)) - /* stopped by some reason */ return 1; ++de; ctx->pos += sizeof(struct erofs_dirent); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 014e20962376..96a87c023128 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -9,6 +9,7 @@ static DEFINE_MUTEX(erofs_domain_list_lock); static DEFINE_MUTEX(erofs_domain_cookies_lock); static LIST_HEAD(erofs_domain_list); +static LIST_HEAD(erofs_domain_cookies_list); static struct vfsmount *erofs_pseudo_mnt; struct erofs_fscache_request { @@ -164,18 +165,8 @@ static int 
erofs_fscache_read_folios_async(struct fscache_cookie *cookie, static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) { int ret; - struct super_block *sb = folio_mapping(folio)->host->i_sb; + struct erofs_fscache *ctx = folio_mapping(folio)->host->i_private; struct erofs_fscache_request *req; - struct erofs_map_dev mdev = { - .m_deviceid = 0, - .m_pa = folio_pos(folio), - }; - - ret = erofs_map_dev(sb, &mdev); - if (ret) { - folio_unlock(folio); - return ret; - } req = erofs_fscache_req_alloc(folio_mapping(folio), folio_pos(folio), folio_size(folio)); @@ -184,8 +175,8 @@ static int erofs_fscache_meta_read_folio(struct file *data, struct folio *folio) return PTR_ERR(req); } - ret = erofs_fscache_read_folios_async(mdev.m_fscache->cookie, - req, mdev.m_pa, folio_size(folio)); + ret = erofs_fscache_read_folios_async(ctx->cookie, req, + folio_pos(folio), folio_size(folio)); if (ret) req->error = ret; @@ -207,7 +198,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_request *primary) int ret; map.m_la = pos; - ret = erofs_map_blocks(inode, &map, EROFS_GET_BLOCKS_RAW); + ret = erofs_map_blocks(inode, &map); if (ret) return ret; @@ -328,8 +319,6 @@ const struct address_space_operations erofs_fscache_access_aops = { static void erofs_fscache_domain_put(struct erofs_domain *domain) { - if (!domain) - return; mutex_lock(&erofs_domain_list_lock); if (refcount_dec_and_test(&domain->ref)) { list_del(&domain->list); @@ -337,8 +326,8 @@ static void erofs_fscache_domain_put(struct erofs_domain *domain) kern_unmount(erofs_pseudo_mnt); erofs_pseudo_mnt = NULL; } - mutex_unlock(&erofs_domain_list_lock); fscache_relinquish_volume(domain->volume, NULL, false); + mutex_unlock(&erofs_domain_list_lock); kfree(domain->domain_id); kfree(domain); return; @@ -431,19 +420,21 @@ static int erofs_fscache_register_domain(struct super_block *sb) return err; } -static -struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, + char *name, unsigned int flags) { struct fscache_volume *volume = EROFS_SB(sb)->volume; struct erofs_fscache *ctx; struct fscache_cookie *cookie; + struct super_block *isb; + struct inode *inode; int ret; ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); if (!ctx) return ERR_PTR(-ENOMEM); + INIT_LIST_HEAD(&ctx->node); + refcount_set(&ctx->ref, 1); cookie = fscache_acquire_cookie(volume, FSCACHE_ADV_WANT_CACHE_SIZE, name, strlen(name), NULL, 0, 0); @@ -452,32 +443,32 @@ struct erofs_fscache *erofs_fscache_acquire_cookie(struct super_block *sb, ret = -EINVAL; goto err; } - fscache_use_cookie(cookie, false); - ctx->cookie = cookie; - if (flags & EROFS_REG_COOKIE_NEED_INODE) { - struct inode *const inode = new_inode(sb); - - if (!inode) { - erofs_err(sb, "failed to get anon inode for %s", name); - ret = -ENOMEM; - goto err_cookie; - } - - set_nlink(inode, 1); - inode->i_size = OFFSET_MAX; - inode->i_mapping->a_ops = &erofs_fscache_meta_aops; - mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - - ctx->inode = inode; + /* + * Allocate anonymous inode in global pseudo mount for shareable blobs, + * so that they are accessible among erofs fs instances. + */ + isb = flags & EROFS_REG_COOKIE_SHARE ? 
erofs_pseudo_mnt->mnt_sb : sb; + inode = new_inode(isb); + if (!inode) { + erofs_err(sb, "failed to get anon inode for %s", name); + ret = -ENOMEM; + goto err_cookie; } + inode->i_size = OFFSET_MAX; + inode->i_mapping->a_ops = &erofs_fscache_meta_aops; + mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); + inode->i_private = ctx; + + ctx->cookie = cookie; + ctx->inode = inode; return ctx; err_cookie: - fscache_unuse_cookie(ctx->cookie, NULL, NULL); - fscache_relinquish_cookie(ctx->cookie, false); + fscache_unuse_cookie(cookie, NULL, NULL); + fscache_relinquish_cookie(cookie, false); err: kfree(ctx); return ERR_PTR(ret); @@ -492,13 +483,9 @@ static void erofs_fscache_relinquish_cookie(struct erofs_fscache *ctx) kfree(ctx); } -static -struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_init_cookie(struct super_block *sb, + char *name, unsigned int flags) { - int err; - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; @@ -508,55 +495,38 @@ struct erofs_fscache *erofs_fscache_domain_init_cookie(struct super_block *sb, ctx->name = kstrdup(name, GFP_KERNEL); if (!ctx->name) { - err = -ENOMEM; - goto out; + erofs_fscache_relinquish_cookie(ctx); + return ERR_PTR(-ENOMEM); } - inode = new_inode(erofs_pseudo_mnt->mnt_sb); - if (!inode) { - err = -ENOMEM; - goto out; - } - - ctx->domain = domain; - ctx->anon_inode = inode; - inode->i_private = ctx; refcount_inc(&domain->ref); + ctx->domain = domain; + list_add(&ctx->node, &erofs_domain_cookies_list); return ctx; -out: - erofs_fscache_relinquish_cookie(ctx); - return ERR_PTR(err); } -static -struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) +static struct erofs_fscache *erofs_domain_register_cookie(struct super_block *sb, + char *name, unsigned int flags) { - struct inode *inode; struct erofs_fscache *ctx; struct erofs_domain *domain = EROFS_SB(sb)->domain; - struct super_block *psb = erofs_pseudo_mnt->mnt_sb; + flags |= EROFS_REG_COOKIE_SHARE; mutex_lock(&erofs_domain_cookies_lock); - spin_lock(&psb->s_inode_list_lock); - list_for_each_entry(inode, &psb->s_inodes, i_sb_list) { - ctx = inode->i_private; - if (!ctx || ctx->domain != domain || strcmp(ctx->name, name)) + list_for_each_entry(ctx, &erofs_domain_cookies_list, node) { + if (ctx->domain != domain || strcmp(ctx->name, name)) continue; if (!(flags & EROFS_REG_COOKIE_NEED_NOEXIST)) { - igrab(inode); + refcount_inc(&ctx->ref); } else { erofs_err(sb, "%s already exists in domain %s", name, domain->domain_id); ctx = ERR_PTR(-EEXIST); } - spin_unlock(&psb->s_inode_list_lock); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } - spin_unlock(&psb->s_inode_list_lock); - ctx = erofs_fscache_domain_init_cookie(sb, name, flags); + ctx = erofs_domain_init_cookie(sb, name, flags); mutex_unlock(&erofs_domain_cookies_lock); return ctx; } @@ -572,23 +542,22 @@ struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, void erofs_fscache_unregister_cookie(struct erofs_fscache *ctx) { - bool drop; - struct erofs_domain *domain; + struct erofs_domain *domain = NULL; if (!ctx) return; - domain = ctx->domain; - if (domain) { - mutex_lock(&erofs_domain_cookies_lock); - drop = atomic_read(&ctx->anon_inode->i_count) == 1; - iput(ctx->anon_inode); - mutex_unlock(&erofs_domain_cookies_lock); - if (!drop) - return; - } + if (!ctx->domain) + return erofs_fscache_relinquish_cookie(ctx); - 
erofs_fscache_relinquish_cookie(ctx); - erofs_fscache_domain_put(domain); + mutex_lock(&erofs_domain_cookies_lock); + if (refcount_dec_and_test(&ctx->ref)) { + domain = ctx->domain; + list_del(&ctx->node); + erofs_fscache_relinquish_cookie(ctx); + } + mutex_unlock(&erofs_domain_cookies_lock); + if (domain) + erofs_fscache_domain_put(domain); } int erofs_fscache_register_fs(struct super_block *sb) @@ -596,7 +565,7 @@ int erofs_fscache_register_fs(struct super_block *sb) int ret; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_fscache *fscache; - unsigned int flags; + unsigned int flags = 0; if (sbi->domain_id) ret = erofs_fscache_register_domain(sb); @@ -615,7 +584,6 @@ int erofs_fscache_register_fs(struct super_block *sb) * * Acquired domain/volume will be relinquished in kill_sb() on error. */ - flags = EROFS_REG_COOKIE_NEED_INODE; if (sbi->domain_id) flags |= EROFS_REG_COOKIE_NEED_NOEXIST; fscache = erofs_fscache_register_cookie(sb, sbi->fsid, flags); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index a194e8ee5861..4be7dda3cd24 100644 --- a/fs/erofs/inode.c +++ b/fs/erofs/inode.c @@ -14,7 +14,7 @@ static void *erofs_read_inode(struct erofs_buf *buf, struct super_block *sb = inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = iloc(sbi, vi->nid); + const erofs_off_t inode_loc = erofs_iloc(inode); erofs_blk_t blkaddr, nblks = 0; void *kaddr; @@ -308,47 +308,49 @@ out_unlock: } /* - * erofs nid is 64bits, but i_ino is 'unsigned long', therefore - * we should do more for 32-bit platform to find the right inode. + * ino_t is 32-bits on 32-bit arch. We have to squash the 64-bit value down + * so that it will fit. */ -static int erofs_ilookup_test_actor(struct inode *inode, void *opaque) +static ino_t erofs_squash_ino(erofs_nid_t nid) { - const erofs_nid_t nid = *(erofs_nid_t *)opaque; + ino_t ino = (ino_t)nid; - return EROFS_I(inode)->nid == nid; + if (sizeof(ino_t) < sizeof(erofs_nid_t)) + ino ^= nid >> (sizeof(erofs_nid_t) - sizeof(ino_t)) * 8; + return ino; } -static int erofs_iget_set_actor(struct inode *inode, void *opaque) +static int erofs_iget5_eq(struct inode *inode, void *opaque) +{ + return EROFS_I(inode)->nid == *(erofs_nid_t *)opaque; +} + +static int erofs_iget5_set(struct inode *inode, void *opaque) { const erofs_nid_t nid = *(erofs_nid_t *)opaque; - inode->i_ino = erofs_inode_hash(nid); + inode->i_ino = erofs_squash_ino(nid); + EROFS_I(inode)->nid = nid; return 0; } struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid) { - const unsigned long hashval = erofs_inode_hash(nid); struct inode *inode; - inode = iget5_locked(sb, hashval, erofs_ilookup_test_actor, - erofs_iget_set_actor, &nid); + inode = iget5_locked(sb, erofs_squash_ino(nid), erofs_iget5_eq, + erofs_iget5_set, &nid); if (!inode) return ERR_PTR(-ENOMEM); if (inode->i_state & I_NEW) { - int err; - struct erofs_inode *vi = EROFS_I(inode); + int err = erofs_fill_inode(inode); - vi->nid = nid; - - err = erofs_fill_inode(inode); - if (!err) { - unlock_new_inode(inode); - } else { + if (err) { iget_failed(inode); - inode = ERR_PTR(err); + return ERR_PTR(err); } + unlock_new_inode(inode); } return inode; } diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index e05ae61069e8..3f3561d37d1b 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -12,7 +12,6 @@ #include #include #include -#include #include #include #include @@ -108,9 +107,12 @@ struct erofs_domain { struct erofs_fscache { struct fscache_cookie *cookie; - 
struct inode *inode; - struct inode *anon_inode; + struct inode *inode; /* anonymous inode for the blob */ + + /* used for share domain mode */ struct erofs_domain *domain; + struct list_head node; + refcount_t ref; char *name; }; @@ -271,11 +273,6 @@ struct erofs_buf { #define erofs_blkoff(addr) ((addr) % EROFS_BLKSIZ) #define blknr_to_addr(nr) ((erofs_off_t)(nr) * EROFS_BLKSIZ) -static inline erofs_off_t iloc(struct erofs_sb_info *sbi, erofs_nid_t nid) -{ - return blknr_to_addr(sbi->meta_blkaddr) + (nid << sbi->islotbits); -} - #define EROFS_FEATURE_FUNCS(name, compat, feature) \ static inline bool erofs_sb_has_##name(struct erofs_sb_info *sbi) \ { \ @@ -340,13 +337,14 @@ struct erofs_inode { struct inode vfs_inode; }; -#define EROFS_I(ptr) \ - container_of(ptr, struct erofs_inode, vfs_inode) +#define EROFS_I(ptr) container_of(ptr, struct erofs_inode, vfs_inode) -static inline unsigned long erofs_inode_datablocks(struct inode *inode) +static inline erofs_off_t erofs_iloc(struct inode *inode) { - /* since i_size cannot be changed */ - return DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ); + struct erofs_sb_info *sbi = EROFS_I_SB(inode); + + return blknr_to_addr(sbi->meta_blkaddr) + + (EROFS_I(inode)->nid << sbi->islotbits); } static inline unsigned int erofs_bitrange(unsigned int value, unsigned int bit, @@ -382,31 +380,18 @@ struct page *erofs_grab_cache_page_nowait(struct address_space *mapping, readahead_gfp_mask(mapping) & ~__GFP_RECLAIM); } -extern const struct super_operations erofs_sops; -extern struct file_system_type erofs_fs_type; - -extern const struct address_space_operations erofs_raw_access_aops; -extern const struct address_space_operations z_erofs_aops; - -enum { - BH_Encoded = BH_PrivateStart, - BH_FullMapped, - BH_Fragment, - BH_Partialref, -}; - /* Has a disk mapping */ -#define EROFS_MAP_MAPPED (1 << BH_Mapped) +#define EROFS_MAP_MAPPED 0x0001 /* Located in metadata (could be copied from bd_inode) */ -#define EROFS_MAP_META (1 << BH_Meta) +#define EROFS_MAP_META 0x0002 /* The extent is encoded */ -#define EROFS_MAP_ENCODED (1 << BH_Encoded) +#define EROFS_MAP_ENCODED 0x0004 /* The length of extent is full */ -#define EROFS_MAP_FULL_MAPPED (1 << BH_FullMapped) +#define EROFS_MAP_FULL_MAPPED 0x0008 /* Located in the special packed inode */ -#define EROFS_MAP_FRAGMENT (1 << BH_Fragment) +#define EROFS_MAP_FRAGMENT 0x0010 /* The extent refers to partial decompressed data */ -#define EROFS_MAP_PARTIAL_REF (1 << BH_Partialref) +#define EROFS_MAP_PARTIAL_REF 0x0020 struct erofs_map_blocks { struct erofs_buf buf; @@ -419,17 +404,15 @@ struct erofs_map_blocks { unsigned int m_flags; }; -/* Flags used by erofs_map_blocks_flatmode() */ -#define EROFS_GET_BLOCKS_RAW 0x0001 /* * Used to get the exact decompressed length, e.g. fiemap (consider lookback * approach instead if possible since it's more metadata lightweight.) 
*/ -#define EROFS_GET_BLOCKS_FIEMAP 0x0002 +#define EROFS_GET_BLOCKS_FIEMAP 0x0001 /* Used to map the whole extent if non-negligible data is requested for LZMA */ -#define EROFS_GET_BLOCKS_READMORE 0x0004 +#define EROFS_GET_BLOCKS_READMORE 0x0002 /* Used to map tail extent for tailpacking inline or fragment pcluster */ -#define EROFS_GET_BLOCKS_FINDTAIL 0x0008 +#define EROFS_GET_BLOCKS_FINDTAIL 0x0004 enum { Z_EROFS_COMPRESSION_SHIFTED = Z_EROFS_COMPRESSION_MAX, @@ -437,24 +420,6 @@ enum { Z_EROFS_COMPRESSION_RUNTIME_MAX }; -/* zmap.c */ -extern const struct iomap_ops z_erofs_iomap_report_ops; - -#ifdef CONFIG_EROFS_FS_ZIP -int z_erofs_fill_inode(struct inode *inode); -int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags); -#else -static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } -static inline int z_erofs_map_blocks_iter(struct inode *inode, - struct erofs_map_blocks *map, - int flags) -{ - return -EOPNOTSUPP; -} -#endif /* !CONFIG_EROFS_FS_ZIP */ - struct erofs_map_dev { struct erofs_fscache *m_fscache; struct block_device *m_bdev; @@ -465,8 +430,27 @@ struct erofs_map_dev { unsigned int m_deviceid; }; -/* data.c */ +extern struct file_system_type erofs_fs_type; +extern const struct super_operations erofs_sops; + +extern const struct address_space_operations erofs_raw_access_aops; +extern const struct address_space_operations z_erofs_aops; +extern const struct address_space_operations erofs_fscache_access_aops; + +extern const struct inode_operations erofs_generic_iops; +extern const struct inode_operations erofs_symlink_iops; +extern const struct inode_operations erofs_fast_symlink_iops; +extern const struct inode_operations erofs_dir_iops; + extern const struct file_operations erofs_file_fops; +extern const struct file_operations erofs_dir_fops; + +extern const struct iomap_ops z_erofs_iomap_report_ops; + +/* flags for erofs_fscache_register_cookie() */ +#define EROFS_REG_COOKIE_SHARE 0x0001 +#define EROFS_REG_COOKIE_NEED_NOEXIST 0x0002 + void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); void *erofs_bread(struct erofs_buf *buf, struct inode *inode, @@ -476,37 +460,14 @@ void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); -int erofs_map_blocks(struct inode *inode, - struct erofs_map_blocks *map, int flags); - -/* inode.c */ -static inline unsigned long erofs_inode_hash(erofs_nid_t nid) -{ -#if BITS_PER_LONG == 32 - return (nid >> 32) ^ (nid & 0xffffffff); -#else - return nid; -#endif -} - -extern const struct inode_operations erofs_generic_iops; -extern const struct inode_operations erofs_symlink_iops; -extern const struct inode_operations erofs_fast_symlink_iops; - +int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map); struct inode *erofs_iget(struct super_block *sb, erofs_nid_t nid); int erofs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags); - -/* namei.c */ -extern const struct inode_operations erofs_dir_iops; - int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, unsigned int *d_type); -/* dir.c */ -extern const struct file_operations erofs_dir_fops; - static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) { int retried = 0; @@ -522,23 +483,19 @@ 
static inline void *erofs_vm_map_ram(struct page **pages, unsigned int count) return NULL; } -/* pcpubuf.c */ void *erofs_get_pcpubuf(unsigned int requiredpages); void erofs_put_pcpubuf(void *ptr); int erofs_pcpubuf_growsize(unsigned int nrpages); void erofs_pcpubuf_init(void); void erofs_pcpubuf_exit(void); -/* sysfs.c */ int erofs_register_sysfs(struct super_block *sb); void erofs_unregister_sysfs(struct super_block *sb); int __init erofs_init_sysfs(void); void erofs_exit_sysfs(void); -/* utils.c / zdata.c */ struct page *erofs_allocpage(struct page **pagepool, gfp_t gfp); -static inline void erofs_pagepool_add(struct page **pagepool, - struct page *page) +static inline void erofs_pagepool_add(struct page **pagepool, struct page *page) { set_page_private(page, (unsigned long)*pagepool); *pagepool = page; @@ -564,6 +521,9 @@ int erofs_try_to_free_cached_page(struct page *page); int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, struct z_erofs_lz4_cfgs *lz4, int len); +int z_erofs_fill_inode(struct inode *inode); +int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, + int flags); #else static inline void erofs_shrinker_register(struct super_block *sb) {} static inline void erofs_shrinker_unregister(struct super_block *sb) {} @@ -581,6 +541,7 @@ static inline int z_erofs_load_lz4_config(struct super_block *sb, } return 0; } +static inline int z_erofs_fill_inode(struct inode *inode) { return -EOPNOTSUPP; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_ZIP_LZMA @@ -601,23 +562,15 @@ static inline int z_erofs_load_lzma_config(struct super_block *sb, } return 0; } -#endif /* !CONFIG_EROFS_FS_ZIP */ +#endif /* !CONFIG_EROFS_FS_ZIP_LZMA */ -/* flags for erofs_fscache_register_cookie() */ -#define EROFS_REG_COOKIE_NEED_INODE 1 -#define EROFS_REG_COOKIE_NEED_NOEXIST 2 - -/* fscache.c */ #ifdef CONFIG_EROFS_FS_ONDEMAND int erofs_fscache_register_fs(struct super_block *sb); void erofs_fscache_unregister_fs(struct super_block *sb); struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags); + char *name, unsigned int flags); void erofs_fscache_unregister_cookie(struct erofs_fscache *fscache); - -extern const struct address_space_operations erofs_fscache_access_aops; #else static inline int erofs_fscache_register_fs(struct super_block *sb) { @@ -627,8 +580,7 @@ static inline void erofs_fscache_unregister_fs(struct super_block *sb) {} static inline struct erofs_fscache *erofs_fscache_register_cookie(struct super_block *sb, - char *name, - unsigned int flags) + char *name, unsigned int flags) { return ERR_PTR(-EOPNOTSUPP); } diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index b64a108fac92..966eabc61c13 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -5,7 +5,6 @@ * Copyright (C) 2022, Alibaba Cloud */ #include "xattr.h" - #include struct erofs_qstr { @@ -87,19 +86,13 @@ static struct erofs_dirent *find_target_dirent(struct erofs_qstr *name, return ERR_PTR(-ENOENT); } -static void *find_target_block_classic(struct erofs_buf *target, - struct inode *dir, - struct erofs_qstr *name, - int *_ndirents) +static void *erofs_find_target_block(struct erofs_buf *target, + struct inode *dir, struct erofs_qstr *name, int *_ndirents) { - unsigned int startprfx, endprfx; - int head, back; + int head = 0, back = DIV_ROUND_UP(dir->i_size, EROFS_BLKSIZ) - 1; + unsigned int startprfx = 0, endprfx = 0; void *candidate = ERR_PTR(-ENOENT); - startprfx = endprfx = 0; - head = 0; - back = 
erofs_inode_datablocks(dir) - 1; - while (head <= back) { const int mid = head + (back - head) / 2; struct erofs_buf buf = __EROFS_BUF_INITIALIZER; @@ -180,8 +173,7 @@ int erofs_namei(struct inode *dir, const struct qstr *name, erofs_nid_t *nid, qn.end = name->name + name->len; ndirents = 0; - - de = find_target_block_classic(&buf, dir, &qn, &ndirents); + de = erofs_find_target_block(&buf, dir, &qn, &ndirents); if (IS_ERR(de)) return PTR_ERR(de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 626a615dafc2..19b1ae79cec4 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -5,7 +5,6 @@ * Copyright (C) 2021, Alibaba Cloud */ #include -#include #include #include #include @@ -969,6 +968,8 @@ static void erofs_put_super(struct super_block *sb) iput(sbi->packed_inode); sbi->packed_inode = NULL; #endif + erofs_free_dev_context(sbi->devs); + sbi->devs = NULL; erofs_fscache_unregister_fs(sb); } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index fd476961f742..435e515c0792 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -179,13 +179,13 @@ static const struct sysfs_ops erofs_attr_ops = { .store = erofs_attr_store, }; -static struct kobj_type erofs_sb_ktype = { +static const struct kobj_type erofs_sb_ktype = { .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; -static struct kobj_type erofs_ktype = { +static const struct kobj_type erofs_ktype = { .sysfs_ops = &erofs_attr_ops, }; @@ -193,7 +193,7 @@ static struct kset erofs_root = { .kobj = {.ktype = &erofs_ktype}, }; -static struct kobj_type erofs_feat_ktype = { +static const struct kobj_type erofs_feat_ktype = { .default_groups = erofs_feat_groups, .sysfs_ops = &erofs_attr_ops, }; diff --git a/fs/erofs/tagptr.h b/fs/erofs/tagptr.h deleted file mode 100644 index 64ceb7270b5c..000000000000 --- a/fs/erofs/tagptr.h +++ /dev/null @@ -1,107 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * A tagged pointer implementation - */ -#ifndef __EROFS_FS_TAGPTR_H -#define __EROFS_FS_TAGPTR_H - -#include -#include - -/* - * the name of tagged pointer types are tagptr{1, 2, 3...}_t - * avoid directly using the internal structs __tagptr{1, 2, 3...} - */ -#define __MAKE_TAGPTR(n) \ -typedef struct __tagptr##n { \ - uintptr_t v; \ -} tagptr##n##_t; - -__MAKE_TAGPTR(1) -__MAKE_TAGPTR(2) -__MAKE_TAGPTR(3) -__MAKE_TAGPTR(4) - -#undef __MAKE_TAGPTR - -extern void __compiletime_error("bad tagptr tags") - __bad_tagptr_tags(void); - -extern void __compiletime_error("bad tagptr type") - __bad_tagptr_type(void); - -/* fix the broken usage of "#define tagptr2_t tagptr3_t" by users */ -#define __tagptr_mask_1(ptr, n) \ - __builtin_types_compatible_p(typeof(ptr), struct __tagptr##n) ? \ - (1UL << (n)) - 1 : - -#define __tagptr_mask(ptr) (\ - __tagptr_mask_1(ptr, 1) ( \ - __tagptr_mask_1(ptr, 2) ( \ - __tagptr_mask_1(ptr, 3) ( \ - __tagptr_mask_1(ptr, 4) ( \ - __bad_tagptr_type(), 0))))) - -/* generate a tagged pointer from a raw value */ -#define tagptr_init(type, val) \ - ((typeof(type)){ .v = (uintptr_t)(val) }) - -/* - * directly cast a tagged pointer to the native pointer type, which - * could be used for backward compatibility of existing code. 
- */ -#define tagptr_cast_ptr(tptr) ((void *)(tptr).v) - -/* encode tagged pointers */ -#define tagptr_fold(type, ptr, _tags) ({ \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(type))) \ - __bad_tagptr_tags(); \ -tagptr_init(type, (uintptr_t)(ptr) | tags); }) - -/* decode tagged pointers */ -#define tagptr_unfold_ptr(tptr) \ - ((void *)((tptr).v & ~__tagptr_mask(tptr))) - -#define tagptr_unfold_tags(tptr) \ - ((tptr).v & __tagptr_mask(tptr)) - -/* operations for the tagger pointer */ -#define tagptr_eq(_tptr1, _tptr2) ({ \ - typeof(_tptr1) tptr1 = (_tptr1); \ - typeof(_tptr2) tptr2 = (_tptr2); \ - (void)(&tptr1 == &tptr2); \ -(tptr1).v == (tptr2).v; }) - -/* lock-free CAS operation */ -#define tagptr_cmpxchg(_ptptr, _o, _n) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - typeof(_o) o = (_o); \ - typeof(_n) n = (_n); \ - (void)(&o == &n); \ - (void)(&o == ptptr); \ -tagptr_init(o, cmpxchg(&ptptr->v, o.v, n.v)); }) - -/* wrap WRITE_ONCE if atomic update is needed */ -#define tagptr_replace_tags(_ptptr, tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - *ptptr = tagptr_fold(*ptptr, tagptr_unfold_ptr(*ptptr), tags); \ -*ptptr; }) - -#define tagptr_set_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v |= tags; \ -*ptptr; }) - -#define tagptr_clear_tags(_ptptr, _tags) ({ \ - typeof(_ptptr) ptptr = (_ptptr); \ - const typeof(_tags) tags = (_tags); \ - if (__builtin_constant_p(tags) && (tags & ~__tagptr_mask(*ptptr))) \ - __bad_tagptr_tags(); \ - ptptr->v &= ~tags; \ -*ptptr; }) - -#endif /* __EROFS_FS_TAGPTR_H */ diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index a62fb8a3318a..60729b1220b6 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -22,8 +22,7 @@ static int init_inode_xattrs(struct inode *inode) struct xattr_iter it; unsigned int i; struct erofs_xattr_ibody_header *ih; - struct super_block *sb; - struct erofs_sb_info *sbi; + struct super_block *sb = inode->i_sb; int ret = 0; /* the most case is that xattrs of this inode are initialized. */ @@ -52,15 +51,14 @@ static int init_inode_xattrs(struct inode *inode) * undefined right now (maybe use later with some new sb feature). 
*/ if (vi->xattr_isize == sizeof(struct erofs_xattr_ibody_header)) { - erofs_err(inode->i_sb, + erofs_err(sb, "xattr_isize %d of nid %llu is not supported yet", vi->xattr_isize, vi->nid); ret = -EOPNOTSUPP; goto out_unlock; } else if (vi->xattr_isize < sizeof(struct erofs_xattr_ibody_header)) { if (vi->xattr_isize) { - erofs_err(inode->i_sb, - "bogus xattr ibody @ nid %llu", vi->nid); + erofs_err(sb, "bogus xattr ibody @ nid %llu", vi->nid); DBG_BUGON(1); ret = -EFSCORRUPTED; goto out_unlock; /* xattr ondisk layout error */ @@ -69,11 +67,9 @@ static int init_inode_xattrs(struct inode *inode) goto out_unlock; } - sb = inode->i_sb; - sbi = EROFS_SB(sb); it.buf = __EROFS_BUF_INITIALIZER; - it.blkaddr = erofs_blknr(iloc(sbi, vi->nid) + vi->inode_isize); - it.ofs = erofs_blkoff(iloc(sbi, vi->nid) + vi->inode_isize); + it.blkaddr = erofs_blknr(erofs_iloc(inode) + vi->inode_isize); + it.ofs = erofs_blkoff(erofs_iloc(inode) + vi->inode_isize); /* read in shared xattr array (non-atomic, see kmalloc below) */ it.kaddr = erofs_read_metabuf(&it.buf, sb, it.blkaddr, EROFS_KMAP); @@ -159,7 +155,6 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); - struct erofs_sb_info *const sbi = EROFS_SB(inode->i_sb); unsigned int xattr_header_sz, inline_xattr_ofs; xattr_header_sz = inlinexattr_header_size(inode); @@ -170,9 +165,8 @@ static int inline_xattr_iter_begin(struct xattr_iter *it, inline_xattr_ofs = vi->inode_isize + xattr_header_sz; - it->blkaddr = erofs_blknr(iloc(sbi, vi->nid) + inline_xattr_ofs); - it->ofs = erofs_blkoff(iloc(sbi, vi->nid) + inline_xattr_ofs); - + it->blkaddr = erofs_blknr(erofs_iloc(inode) + inline_xattr_ofs); + it->ofs = erofs_blkoff(erofs_iloc(inode) + inline_xattr_ofs); it->kaddr = erofs_read_metabuf(&it->buf, inode->i_sb, it->blkaddr, EROFS_KMAP); if (IS_ERR(it->kaddr)) diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index 5200bb86e264..3247d2422bea 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -4,13 +4,178 @@ * https://www.huawei.com/ * Copyright (C) 2022 Alibaba Cloud */ -#include "zdata.h" #include "compress.h" #include #include - +#include #include +#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) +#define Z_EROFS_INLINE_BVECS 2 + +/* + * let's leave a type here in case of introducing + * another tagged pointer later. + */ +typedef void *z_erofs_next_pcluster_t; + +struct z_erofs_bvec { + struct page *page; + int offset; + unsigned int end; +}; + +#define __Z_EROFS_BVSET(name, total) \ +struct name { \ + /* point to the next page which contains the following bvecs */ \ + struct page *nextpage; \ + struct z_erofs_bvec bvec[total]; \ +} +__Z_EROFS_BVSET(z_erofs_bvset,); +__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); + +/* + * Structure fields follow one of the following exclusion rules. + * + * I: Modifiable by initialization/destruction paths and read-only + * for everyone else; + * + * L: Field should be protected by the pcluster lock; + * + * A: Field should be accessed / updated in atomic for parallelized code. 
+ */ +struct z_erofs_pcluster { + struct erofs_workgroup obj; + struct mutex lock; + + /* A: point to next chained pcluster or TAILs */ + z_erofs_next_pcluster_t next; + + /* L: the maximum decompression size of this round */ + unsigned int length; + + /* L: total number of bvecs */ + unsigned int vcnt; + + /* I: page offset of start position of decompression */ + unsigned short pageofs_out; + + /* I: page offset of inline compressed data */ + unsigned short pageofs_in; + + union { + /* L: inline a certain number of bvec for bootstrap */ + struct z_erofs_bvset_inline bvset; + + /* I: can be used to free the pcluster by RCU. */ + struct rcu_head rcu; + }; + + union { + /* I: physical cluster size in pages */ + unsigned short pclusterpages; + + /* I: tailpacking inline compressed size */ + unsigned short tailpacking_size; + }; + + /* I: compression algorithm format */ + unsigned char algorithmformat; + + /* L: whether partial decompression or not */ + bool partial; + + /* L: indicate several pageofs_outs or not */ + bool multibases; + + /* A: compressed bvecs (can be cached or inplaced pages) */ + struct z_erofs_bvec compressed_bvecs[]; +}; + +/* let's avoid the valid 32-bit kernel addresses */ + +/* the chained workgroup has't submitted io (still open) */ +#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) +/* the chained workgroup has already submitted io */ +#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) + +#define Z_EROFS_PCLUSTER_NIL (NULL) + +struct z_erofs_decompressqueue { + struct super_block *sb; + atomic_t pending_bios; + z_erofs_next_pcluster_t head; + + union { + struct completion done; + struct work_struct work; + struct kthread_work kthread_work; + } u; + bool eio, sync; +}; + +static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) +{ + return !pcl->obj.index; +} + +static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) +{ + if (z_erofs_is_inline_pcluster(pcl)) + return 1; + return pcl->pclusterpages; +} + +/* + * bit 30: I/O error occurred on this page + * bit 0 - 29: remaining parts to complete this page + */ +#define Z_EROFS_PAGE_EIO (1 << 30) + +static inline void z_erofs_onlinepage_init(struct page *page) +{ + union { + atomic_t o; + unsigned long v; + } u = { .o = ATOMIC_INIT(1) }; + + set_page_private(page, u.v); + smp_wmb(); + SetPagePrivate(page); +} + +static inline void z_erofs_onlinepage_split(struct page *page) +{ + atomic_inc((atomic_t *)&page->private); +} + +static inline void z_erofs_page_mark_eio(struct page *page) +{ + int orig; + + do { + orig = atomic_read((atomic_t *)&page->private); + } while (atomic_cmpxchg((atomic_t *)&page->private, orig, + orig | Z_EROFS_PAGE_EIO) != orig); +} + +static inline void z_erofs_onlinepage_endio(struct page *page) +{ + unsigned int v; + + DBG_BUGON(!PagePrivate(page)); + v = atomic_dec_return((atomic_t *)&page->private); + if (!(v & ~Z_EROFS_PAGE_EIO)) { + set_page_private(page, 0); + ClearPagePrivate(page); + if (!(v & Z_EROFS_PAGE_EIO)) + SetPageUptodate(page); + unlock_page(page); + } +} + +#define Z_EROFS_ONSTACK_PAGES 32 + /* * since pclustersize is variable for big pcluster feature, introduce slab * pools implementation for different pcluster sizes. 
@@ -175,46 +340,162 @@ static void z_erofs_free_pcluster(struct z_erofs_pcluster *pcl) DBG_BUGON(1); } -/* - * tagged pointer with 1-bit tag for all compressed pages - * tag 0 - the page is just found with an extra page reference - */ -typedef tagptr1_t compressed_page_t; - -#define tag_compressed_page_justfound(page) \ - tagptr_fold(compressed_page_t, page, 1) - static struct workqueue_struct *z_erofs_workqueue __read_mostly; +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static struct kthread_worker __rcu **z_erofs_pcpu_workers; + +static void erofs_destroy_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + for_each_possible_cpu(cpu) { + worker = rcu_dereference_protected( + z_erofs_pcpu_workers[cpu], 1); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + if (worker) + kthread_destroy_worker(worker); + } + kfree(z_erofs_pcpu_workers); +} + +static struct kthread_worker *erofs_init_percpu_worker(int cpu) +{ + struct kthread_worker *worker = + kthread_create_worker_on_cpu(cpu, 0, "erofs_worker/%u", cpu); + + if (IS_ERR(worker)) + return worker; + if (IS_ENABLED(CONFIG_EROFS_FS_PCPU_KTHREAD_HIPRI)) + sched_set_fifo_low(worker->task); + else + sched_set_normal(worker->task, 0); + return worker; +} + +static int erofs_init_percpu_workers(void) +{ + struct kthread_worker *worker; + unsigned int cpu; + + z_erofs_pcpu_workers = kcalloc(num_possible_cpus(), + sizeof(struct kthread_worker *), GFP_ATOMIC); + if (!z_erofs_pcpu_workers) + return -ENOMEM; + + for_each_online_cpu(cpu) { /* could miss cpu{off,on}line? */ + worker = erofs_init_percpu_worker(cpu); + if (!IS_ERR(worker)) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + } + return 0; +} +#else +static inline void erofs_destroy_percpu_workers(void) {} +static inline int erofs_init_percpu_workers(void) { return 0; } +#endif + +#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); +static enum cpuhp_state erofs_cpuhp_state; + +static int erofs_cpu_online(unsigned int cpu) +{ + struct kthread_worker *worker, *old; + + worker = erofs_init_percpu_worker(cpu); + if (IS_ERR(worker)) + return PTR_ERR(worker); + + spin_lock(&z_erofs_pcpu_worker_lock); + old = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + if (!old) + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], worker); + spin_unlock(&z_erofs_pcpu_worker_lock); + if (old) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_offline(unsigned int cpu) +{ + struct kthread_worker *worker; + + spin_lock(&z_erofs_pcpu_worker_lock); + worker = rcu_dereference_protected(z_erofs_pcpu_workers[cpu], + lockdep_is_held(&z_erofs_pcpu_worker_lock)); + rcu_assign_pointer(z_erofs_pcpu_workers[cpu], NULL); + spin_unlock(&z_erofs_pcpu_worker_lock); + + synchronize_rcu(); + if (worker) + kthread_destroy_worker(worker); + return 0; +} + +static int erofs_cpu_hotplug_init(void) +{ + int state; + + state = cpuhp_setup_state_nocalls(CPUHP_AP_ONLINE_DYN, + "fs/erofs:online", erofs_cpu_online, erofs_cpu_offline); + if (state < 0) + return state; + + erofs_cpuhp_state = state; + return 0; +} + +static void erofs_cpu_hotplug_destroy(void) +{ + if (erofs_cpuhp_state) + cpuhp_remove_state_nocalls(erofs_cpuhp_state); +} +#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int erofs_cpu_hotplug_init(void) { return 0; } +static inline void erofs_cpu_hotplug_destroy(void) {} +#endif + void z_erofs_exit_zip_subsystem(void) { + 
erofs_cpu_hotplug_destroy(); + erofs_destroy_percpu_workers(); destroy_workqueue(z_erofs_workqueue); z_erofs_destroy_pcluster_pool(); } -static inline int z_erofs_init_workqueue(void) -{ - const unsigned int onlinecpus = num_possible_cpus(); - - /* - * no need to spawn too many threads, limiting threads could minimum - * scheduling overhead, perhaps per-CPU threads should be better? - */ - z_erofs_workqueue = alloc_workqueue("erofs_unzipd", - WQ_UNBOUND | WQ_HIGHPRI, - onlinecpus + onlinecpus / 4); - return z_erofs_workqueue ? 0 : -ENOMEM; -} - int __init z_erofs_init_zip_subsystem(void) { int err = z_erofs_create_pcluster_pool(); if (err) - return err; - err = z_erofs_init_workqueue(); + goto out_error_pcluster_pool; + + z_erofs_workqueue = alloc_workqueue("erofs_worker", + WQ_UNBOUND | WQ_HIGHPRI, num_possible_cpus()); + if (!z_erofs_workqueue) { + err = -ENOMEM; + goto out_error_workqueue_init; + } + + err = erofs_init_percpu_workers(); if (err) - z_erofs_destroy_pcluster_pool(); + goto out_error_pcpu_worker; + + err = erofs_cpu_hotplug_init(); + if (err < 0) + goto out_error_cpuhp_init; + return err; + +out_error_cpuhp_init: + erofs_destroy_percpu_workers(); +out_error_pcpu_worker: + destroy_workqueue(z_erofs_workqueue); +out_error_workqueue_init: + z_erofs_destroy_pcluster_pool(); +out_error_pcluster_pool: return err; } @@ -319,7 +600,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, for (i = 0; i < pcl->pclusterpages; ++i) { struct page *page; - compressed_page_t t; + void *t; /* mark pages just found for debugging */ struct page *newpage = NULL; /* the compressed page was loaded before */ @@ -329,7 +610,7 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, page = find_get_page(mc, pcl->obj.index + i); if (page) { - t = tag_compressed_page_justfound(page); + t = (void *)((unsigned long)page | 1); } else { /* I/O is needed, no possible to decompress directly */ standalone = false; @@ -345,11 +626,10 @@ static void z_erofs_bind_cache(struct z_erofs_decompress_frontend *fe, if (!newpage) continue; set_page_private(newpage, Z_EROFS_PREALLOCATED_PAGE); - t = tag_compressed_page_justfound(newpage); + t = (void *)((unsigned long)newpage | 1); } - if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, - tagptr_cast_ptr(t))) + if (!cmpxchg_relaxed(&pcl->compressed_bvecs[i].page, NULL, t)) continue; if (page) @@ -1151,18 +1431,24 @@ static void z_erofs_decompressqueue_work(struct work_struct *work) DBG_BUGON(bgq->head == Z_EROFS_PCLUSTER_TAIL_CLOSED); z_erofs_decompress_queue(bgq, &pagepool); - erofs_release_pages(&pagepool); kvfree(bgq); } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD +static void z_erofs_decompressqueue_kthread_work(struct kthread_work *work) +{ + z_erofs_decompressqueue_work((struct work_struct *)work); +} +#endif + static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, - bool sync, int bios) + int bios) { struct erofs_sb_info *const sbi = EROFS_SB(io->sb); /* wake up the caller thread for sync decompression */ - if (sync) { + if (io->sync) { if (!atomic_add_return(bios, &io->pending_bios)) complete(&io->u.done); return; @@ -1170,9 +1456,24 @@ static void z_erofs_decompress_kickoff(struct z_erofs_decompressqueue *io, if (atomic_add_return(bios, &io->pending_bios)) return; - /* Use workqueue and sync decompression for atomic contexts only */ + /* Use (kthread_)work and sync decompression for atomic contexts only */ if (in_atomic() || irqs_disabled()) { +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + struct kthread_worker 
*worker; + + rcu_read_lock(); + worker = rcu_dereference( + z_erofs_pcpu_workers[raw_smp_processor_id()]); + if (!worker) { + INIT_WORK(&io->u.work, z_erofs_decompressqueue_work); + queue_work(z_erofs_workqueue, &io->u.work); + } else { + kthread_queue_work(worker, &io->u.kthread_work); + } + rcu_read_unlock(); +#else queue_work(z_erofs_workqueue, &io->u.work); +#endif /* enable sync decompression for readahead */ if (sbi->opt.sync_decompress == EROFS_SYNC_DECOMPRESS_AUTO) sbi->opt.sync_decompress = EROFS_SYNC_DECOMPRESS_FORCE_ON; @@ -1192,8 +1493,6 @@ static struct page *pickup_page_for_submission(struct z_erofs_pcluster *pcl, struct address_space *mapping; struct page *oldpage, *page; - - compressed_page_t t; int justfound; repeat: @@ -1203,10 +1502,8 @@ repeat: if (!page) goto out_allocpage; - /* process the target tagged pointer */ - t = tagptr_init(compressed_page_t, page); - justfound = tagptr_unfold_tags(t); - page = tagptr_unfold_ptr(t); + justfound = (unsigned long)page & 1UL; + page = (struct page *)((unsigned long)page & ~1UL); /* * preallocated cached pages, which is used to avoid direct reclaim @@ -1294,9 +1591,8 @@ out: /* the only exit (for tracing and debugging) */ return page; } -static struct z_erofs_decompressqueue * -jobqueue_init(struct super_block *sb, - struct z_erofs_decompressqueue *fgq, bool *fg) +static struct z_erofs_decompressqueue *jobqueue_init(struct super_block *sb, + struct z_erofs_decompressqueue *fgq, bool *fg) { struct z_erofs_decompressqueue *q; @@ -1306,13 +1602,19 @@ jobqueue_init(struct super_block *sb, *fg = true; goto fg_out; } +#ifdef CONFIG_EROFS_FS_PCPU_KTHREAD + kthread_init_work(&q->u.kthread_work, + z_erofs_decompressqueue_kthread_work); +#else INIT_WORK(&q->u.work, z_erofs_decompressqueue_work); +#endif } else { fg_out: q = fgq; init_completion(&fgq->u.done); atomic_set(&fgq->pending_bios, 0); q->eio = false; + q->sync = true; } q->sb = sb; q->head = Z_EROFS_PCLUSTER_TAIL_CLOSED; @@ -1326,20 +1628,6 @@ enum { NR_JOBQUEUES, }; -static void *jobqueueset_init(struct super_block *sb, - struct z_erofs_decompressqueue *q[], - struct z_erofs_decompressqueue *fgq, bool *fg) -{ - /* - * if managed cache is enabled, bypass jobqueue is needed, - * no need to read from device for all pclusters in this queue. 
- */ - q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); - q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, fg); - - return tagptr_cast_ptr(tagptr_fold(tagptr1_t, q[JQ_SUBMIT], *fg)); -} - static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, z_erofs_next_pcluster_t qtail[], z_erofs_next_pcluster_t owned_head) @@ -1361,8 +1649,7 @@ static void move_to_bypass_jobqueue(struct z_erofs_pcluster *pcl, static void z_erofs_decompressqueue_endio(struct bio *bio) { - tagptr1_t t = tagptr_init(tagptr1_t, bio->bi_private); - struct z_erofs_decompressqueue *q = tagptr_unfold_ptr(t); + struct z_erofs_decompressqueue *q = bio->bi_private; blk_status_t err = bio->bi_status; struct bio_vec *bvec; struct bvec_iter_all iter_all; @@ -1381,7 +1668,7 @@ static void z_erofs_decompressqueue_endio(struct bio *bio) } if (err) q->eio = true; - z_erofs_decompress_kickoff(q, tagptr_unfold_tags(t), -1); + z_erofs_decompress_kickoff(q, -1); bio_put(bio); } @@ -1394,7 +1681,6 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, struct address_space *mc = MNGD_MAPPING(EROFS_SB(sb)); z_erofs_next_pcluster_t qtail[NR_JOBQUEUES]; struct z_erofs_decompressqueue *q[NR_JOBQUEUES]; - void *bi_private; z_erofs_next_pcluster_t owned_head = f->owned_head; /* bio is NULL initially, so no need to initialize last_{index,bdev} */ pgoff_t last_index; @@ -1404,7 +1690,13 @@ static void z_erofs_submit_queue(struct z_erofs_decompress_frontend *f, unsigned long pflags; int memstall = 0; - bi_private = jobqueueset_init(sb, q, fgq, force_fg); + /* + * if managed cache is enabled, bypass jobqueue is needed, + * no need to read from device for all pclusters in this queue. + */ + q[JQ_BYPASS] = jobqueue_init(sb, fgq + JQ_BYPASS, NULL); + q[JQ_SUBMIT] = jobqueue_init(sb, fgq + JQ_SUBMIT, force_fg); + qtail[JQ_BYPASS] = &q[JQ_BYPASS]->head; qtail[JQ_SUBMIT] = &q[JQ_SUBMIT]->head; @@ -1473,7 +1765,7 @@ submit_bio_retry: last_bdev = mdev.m_bdev; bio->bi_iter.bi_sector = (sector_t)cur << LOG_SECTORS_PER_BLOCK; - bio->bi_private = bi_private; + bio->bi_private = q[JQ_SUBMIT]; if (f->readahead) bio->bi_opf |= REQ_RAHEAD; ++nr_bios; @@ -1500,13 +1792,13 @@ submit_bio_retry: /* * although background is preferred, no one is pending for submission. - * don't issue workqueue for decompression but drop it directly instead. + * don't issue decompression but drop it directly instead. */ if (!*force_fg && !nr_bios) { kvfree(q[JQ_SUBMIT]); return; } - z_erofs_decompress_kickoff(q[JQ_SUBMIT], *force_fg, nr_bios); + z_erofs_decompress_kickoff(q[JQ_SUBMIT], nr_bios); } static void z_erofs_runqueue(struct z_erofs_decompress_frontend *f, diff --git a/fs/erofs/zdata.h b/fs/erofs/zdata.h deleted file mode 100644 index d98c95212985..000000000000 --- a/fs/erofs/zdata.h +++ /dev/null @@ -1,178 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0-only */ -/* - * Copyright (C) 2018 HUAWEI, Inc. - * https://www.huawei.com/ - */ -#ifndef __EROFS_FS_ZDATA_H -#define __EROFS_FS_ZDATA_H - -#include "internal.h" -#include "tagptr.h" - -#define Z_EROFS_PCLUSTER_MAX_PAGES (Z_EROFS_PCLUSTER_MAX_SIZE / PAGE_SIZE) -#define Z_EROFS_INLINE_BVECS 2 - -/* - * let's leave a type here in case of introducing - * another tagged pointer later. 
- */ -typedef void *z_erofs_next_pcluster_t; - -struct z_erofs_bvec { - struct page *page; - int offset; - unsigned int end; -}; - -#define __Z_EROFS_BVSET(name, total) \ -struct name { \ - /* point to the next page which contains the following bvecs */ \ - struct page *nextpage; \ - struct z_erofs_bvec bvec[total]; \ -} -__Z_EROFS_BVSET(z_erofs_bvset,); -__Z_EROFS_BVSET(z_erofs_bvset_inline, Z_EROFS_INLINE_BVECS); - -/* - * Structure fields follow one of the following exclusion rules. - * - * I: Modifiable by initialization/destruction paths and read-only - * for everyone else; - * - * L: Field should be protected by the pcluster lock; - * - * A: Field should be accessed / updated in atomic for parallelized code. - */ -struct z_erofs_pcluster { - struct erofs_workgroup obj; - struct mutex lock; - - /* A: point to next chained pcluster or TAILs */ - z_erofs_next_pcluster_t next; - - /* L: the maximum decompression size of this round */ - unsigned int length; - - /* L: total number of bvecs */ - unsigned int vcnt; - - /* I: page offset of start position of decompression */ - unsigned short pageofs_out; - - /* I: page offset of inline compressed data */ - unsigned short pageofs_in; - - union { - /* L: inline a certain number of bvec for bootstrap */ - struct z_erofs_bvset_inline bvset; - - /* I: can be used to free the pcluster by RCU. */ - struct rcu_head rcu; - }; - - union { - /* I: physical cluster size in pages */ - unsigned short pclusterpages; - - /* I: tailpacking inline compressed size */ - unsigned short tailpacking_size; - }; - - /* I: compression algorithm format */ - unsigned char algorithmformat; - - /* L: whether partial decompression or not */ - bool partial; - - /* L: indicate several pageofs_outs or not */ - bool multibases; - - /* A: compressed bvecs (can be cached or inplaced pages) */ - struct z_erofs_bvec compressed_bvecs[]; -}; - -/* let's avoid the valid 32-bit kernel addresses */ - -/* the chained workgroup has't submitted io (still open) */ -#define Z_EROFS_PCLUSTER_TAIL ((void *)0x5F0ECAFE) -/* the chained workgroup has already submitted io */ -#define Z_EROFS_PCLUSTER_TAIL_CLOSED ((void *)0x5F0EDEAD) - -#define Z_EROFS_PCLUSTER_NIL (NULL) - -struct z_erofs_decompressqueue { - struct super_block *sb; - atomic_t pending_bios; - z_erofs_next_pcluster_t head; - - union { - struct completion done; - struct work_struct work; - } u; - - bool eio; -}; - -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->obj.index; -} - -static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) -{ - if (z_erofs_is_inline_pcluster(pcl)) - return 1; - return pcl->pclusterpages; -} - -/* - * bit 30: I/O error occurred on this page - * bit 0 - 29: remaining parts to complete this page - */ -#define Z_EROFS_PAGE_EIO (1 << 30) - -static inline void z_erofs_onlinepage_init(struct page *page) -{ - union { - atomic_t o; - unsigned long v; - } u = { .o = ATOMIC_INIT(1) }; - - set_page_private(page, u.v); - smp_wmb(); - SetPagePrivate(page); -} - -static inline void z_erofs_onlinepage_split(struct page *page) -{ - atomic_inc((atomic_t *)&page->private); -} - -static inline void z_erofs_page_mark_eio(struct page *page) -{ - int orig; - - do { - orig = atomic_read((atomic_t *)&page->private); - } while (atomic_cmpxchg((atomic_t *)&page->private, orig, - orig | Z_EROFS_PAGE_EIO) != orig); -} - -static inline void z_erofs_onlinepage_endio(struct page *page) -{ - unsigned int v; - - DBG_BUGON(!PagePrivate(page)); - v = 
atomic_dec_return((atomic_t *)&page->private);
-	if (!(v & ~Z_EROFS_PAGE_EIO)) {
-		set_page_private(page, 0);
-		ClearPagePrivate(page);
-		if (!(v & Z_EROFS_PAGE_EIO))
-			SetPageUptodate(page);
-		unlock_page(page);
-	}
-}
-
-#define Z_EROFS_ONSTACK_PAGES		32
-
-#endif
diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c
index 98fb90b9af71..8bf6d30518b6 100644
--- a/fs/erofs/zmap.c
+++ b/fs/erofs/zmap.c
@@ -7,10 +7,6 @@
 #include <asm/unaligned.h>
 #include <trace/events/erofs.h>
 
-static int z_erofs_do_map_blocks(struct inode *inode,
-				 struct erofs_map_blocks *map,
-				 int flags);
-
 int z_erofs_fill_inode(struct inode *inode)
 {
 	struct erofs_inode *const vi = EROFS_I(inode);
@@ -29,126 +25,6 @@ int z_erofs_fill_inode(struct inode *inode)
 	return 0;
 }
 
-static int z_erofs_fill_inode_lazy(struct inode *inode)
-{
-	struct erofs_inode *const vi = EROFS_I(inode);
-	struct super_block *const sb = inode->i_sb;
-	int err, headnr;
-	erofs_off_t pos;
-	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
-	void *kaddr;
-	struct z_erofs_map_header *h;
-
-	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
-		/*
-		 * paired with smp_mb() at the end of the function to ensure
-		 * fields will only be observed after the bit is set.
-		 */
-		smp_mb();
-		return 0;
-	}
-
-	if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
-		return -ERESTARTSYS;
-
-	err = 0;
-	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
-		goto out_unlock;
-
-	pos = ALIGN(iloc(EROFS_SB(sb), vi->nid) + vi->inode_isize +
-		    vi->xattr_isize, 8);
-	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
-	if (IS_ERR(kaddr)) {
-		err = PTR_ERR(kaddr);
-		goto out_unlock;
-	}
-
-	h = kaddr + erofs_blkoff(pos);
-	/*
-	 * if the highest bit of the 8-byte map header is set, the whole file
-	 * is stored in the packed inode. The rest bits keeps z_fragmentoff.
-	 */
-	if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
-		vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
-		vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
-		vi->z_tailextent_headlcn = 0;
-		goto done;
-	}
-	vi->z_advise = le16_to_cpu(h->h_advise);
-	vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
-	vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
-
-	headnr = 0;
-	if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
-	    vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
-		erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
-			  headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
-		err = -EOPNOTSUPP;
-		goto out_put_metabuf;
-	}
-
-	vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
-	if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
-	    vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
-			    Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
-		erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
-			  vi->nid);
-		err = -EFSCORRUPTED;
-		goto out_put_metabuf;
-	}
-	if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
-	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
-	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
-		erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
-			  vi->nid);
-		err = -EFSCORRUPTED;
-		goto out_put_metabuf;
-	}
-
-	if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
-		struct erofs_map_blocks map = {
-			.buf = __EROFS_BUF_INITIALIZER
-		};
-
-		vi->z_idata_size = le16_to_cpu(h->h_idata_size);
-		err = z_erofs_do_map_blocks(inode, &map,
-					    EROFS_GET_BLOCKS_FINDTAIL);
-		erofs_put_metabuf(&map.buf);
-
-		if (!map.m_plen ||
-		    erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) {
-			erofs_err(sb, "invalid tail-packing pclustersize %llu",
-				  map.m_plen);
-			err = -EFSCORRUPTED;
-		}
-		if (err < 0)
-			goto out_put_metabuf;
-	}
-
-	if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
-	    !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
-		struct erofs_map_blocks map = {
-			.buf = __EROFS_BUF_INITIALIZER
-		};
-
-		vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
-		err = z_erofs_do_map_blocks(inode, &map,
-					    EROFS_GET_BLOCKS_FINDTAIL);
-		erofs_put_metabuf(&map.buf);
-		if (err < 0)
-			goto out_put_metabuf;
-	}
-done:
-	/* paired with smp_mb() at the beginning of the function */
-	smp_mb();
-	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
-out_put_metabuf:
-	erofs_put_metabuf(&buf);
-out_unlock:
-	clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
-	return err;
-}
-
 struct z_erofs_maprecorder {
 	struct inode *inode;
 	struct erofs_map_blocks *map;
@@ -169,10 +45,9 @@ static int legacy_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 {
 	struct inode *const inode = m->inode;
 	struct erofs_inode *const vi = EROFS_I(inode);
-	const erofs_off_t ibase = iloc(EROFS_I_SB(inode), vi->nid);
 	const erofs_off_t pos =
-		Z_EROFS_VLE_LEGACY_INDEX_ALIGN(ibase + vi->inode_isize +
-					       vi->xattr_isize) +
+		Z_EROFS_VLE_LEGACY_INDEX_ALIGN(erofs_iloc(inode) +
+				vi->inode_isize + vi->xattr_isize) +
 		lcn * sizeof(struct z_erofs_vle_decompressed_index);
 	struct z_erofs_vle_decompressed_index *di;
 	unsigned int advise, type;
@@ -372,9 +247,8 @@ static int compacted_load_cluster_from_disk(struct z_erofs_maprecorder *m,
 	struct inode *const inode = m->inode;
 	struct erofs_inode *const vi = EROFS_I(inode);
 	const unsigned int lclusterbits = vi->z_logical_clusterbits;
-	const erofs_off_t ebase = ALIGN(iloc(EROFS_I_SB(inode), vi->nid) +
-					vi->inode_isize + vi->xattr_isize, 8) +
-		sizeof(struct z_erofs_map_header);
+	const erofs_off_t ebase = sizeof(struct z_erofs_map_header) +
+		ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
 	const unsigned int totalidx = DIV_ROUND_UP(inode->i_size, EROFS_BLKSIZ);
 	unsigned int compacted_4b_initial, compacted_2b;
 	unsigned int amortizedshift;
@@ -732,6 +606,125 @@ unmap_out:
 	return err;
 }
 
+static int z_erofs_fill_inode_lazy(struct inode *inode)
+{
+	struct erofs_inode *const vi = EROFS_I(inode);
+	struct super_block *const sb = inode->i_sb;
+	int err, headnr;
+	erofs_off_t pos;
+	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
+	void *kaddr;
+	struct z_erofs_map_header *h;
+
+	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags)) {
+		/*
+		 * paired with smp_mb() at the end of the function to ensure
+		 * fields will only be observed after the bit is set.
+		 */
+		smp_mb();
+		return 0;
+	}
+
+	if (wait_on_bit_lock(&vi->flags, EROFS_I_BL_Z_BIT, TASK_KILLABLE))
+		return -ERESTARTSYS;
+
+	err = 0;
+	if (test_bit(EROFS_I_Z_INITED_BIT, &vi->flags))
+		goto out_unlock;
+
+	pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8);
+	kaddr = erofs_read_metabuf(&buf, sb, erofs_blknr(pos), EROFS_KMAP);
+	if (IS_ERR(kaddr)) {
+		err = PTR_ERR(kaddr);
+		goto out_unlock;
+	}
+
+	h = kaddr + erofs_blkoff(pos);
+	/*
+	 * if the highest bit of the 8-byte map header is set, the whole file
+	 * is stored in the packed inode. The rest bits keeps z_fragmentoff.
+	 */
+	if (h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT) {
+		vi->z_advise = Z_EROFS_ADVISE_FRAGMENT_PCLUSTER;
+		vi->z_fragmentoff = le64_to_cpu(*(__le64 *)h) ^ (1ULL << 63);
+		vi->z_tailextent_headlcn = 0;
+		goto done;
+	}
+	vi->z_advise = le16_to_cpu(h->h_advise);
+	vi->z_algorithmtype[0] = h->h_algorithmtype & 15;
+	vi->z_algorithmtype[1] = h->h_algorithmtype >> 4;
+
+	headnr = 0;
+	if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX ||
+	    vi->z_algorithmtype[++headnr] >= Z_EROFS_COMPRESSION_MAX) {
+		erofs_err(sb, "unknown HEAD%u format %u for nid %llu, please upgrade kernel",
+			  headnr + 1, vi->z_algorithmtype[headnr], vi->nid);
+		err = -EOPNOTSUPP;
+		goto out_put_metabuf;
+	}
+
+	vi->z_logical_clusterbits = LOG_BLOCK_SIZE + (h->h_clusterbits & 7);
+	if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) &&
+	    vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 |
+			    Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+		erofs_err(sb, "per-inode big pcluster without sb feature for nid %llu",
+			  vi->nid);
+		err = -EFSCORRUPTED;
+		goto out_put_metabuf;
+	}
+	if (vi->datalayout == EROFS_INODE_FLAT_COMPRESSION &&
+	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_1) ^
+	    !(vi->z_advise & Z_EROFS_ADVISE_BIG_PCLUSTER_2)) {
+		erofs_err(sb, "big pcluster head1/2 of compact indexes should be consistent for nid %llu",
+			  vi->nid);
+		err = -EFSCORRUPTED;
+		goto out_put_metabuf;
+	}
+
+	if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) {
+		struct erofs_map_blocks map = {
+			.buf = __EROFS_BUF_INITIALIZER
+		};
+
+		vi->z_idata_size = le16_to_cpu(h->h_idata_size);
+		err = z_erofs_do_map_blocks(inode, &map,
+					    EROFS_GET_BLOCKS_FINDTAIL);
+		erofs_put_metabuf(&map.buf);
+
+		if (!map.m_plen ||
+		    erofs_blkoff(map.m_pa) + map.m_plen > EROFS_BLKSIZ) {
+			erofs_err(sb, "invalid tail-packing pclustersize %llu",
+				  map.m_plen);
+			err = -EFSCORRUPTED;
+		}
+		if (err < 0)
+			goto out_put_metabuf;
+	}
+
+	if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER &&
+	    !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) {
+		struct erofs_map_blocks map = {
+			.buf = __EROFS_BUF_INITIALIZER
+		};
+
+		vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff);
+		err = z_erofs_do_map_blocks(inode, &map,
+					    EROFS_GET_BLOCKS_FINDTAIL);
+		erofs_put_metabuf(&map.buf);
+		if (err < 0)
+			goto out_put_metabuf;
+	}
+done:
+	/* paired with smp_mb() at the beginning of the function */
+	smp_mb();
+	set_bit(EROFS_I_Z_INITED_BIT, &vi->flags);
+out_put_metabuf:
+	erofs_put_metabuf(&buf);
+out_unlock:
+	clear_and_wake_up_bit(EROFS_I_BL_Z_BIT, &vi->flags);
+	return err;
+}
+
 int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map,
 			    int flags)
 {
diff --git a/include/trace/events/erofs.h b/include/trace/events/erofs.h
index 4f4c44ea3a65..cf4a0d28b178 100644
--- a/include/trace/events/erofs.h
+++ b/include/trace/events/erofs.h
@@ -19,12 +19,17 @@ struct erofs_map_blocks;
 		{ 1,		"DIR" })
 
 #define show_map_flags(flags) __print_flags(flags, "|",	\
-	{ EROFS_GET_BLOCKS_RAW,	"RAW" })
+	{ EROFS_GET_BLOCKS_FIEMAP,	"FIEMAP" },	\
+	{ EROFS_GET_BLOCKS_READMORE,	"READMORE" },	\
+	{ EROFS_GET_BLOCKS_FINDTAIL,	"FINDTAIL" })
 
 #define show_mflags(flags) __print_flags(flags, "",	\
-	{ EROFS_MAP_MAPPED,	"M" },			\
-	{ EROFS_MAP_META,	"I" },			\
-	{ EROFS_MAP_ENCODED,	"E" })
+	{ EROFS_MAP_MAPPED,	"M" },			\
+	{ EROFS_MAP_META,	"I" },			\
+	{ EROFS_MAP_ENCODED,	"E" },			\
+	{ EROFS_MAP_FULL_MAPPED,	"F" },		\
+	{ EROFS_MAP_FRAGMENT,	"R" },			\
+	{ EROFS_MAP_PARTIAL_REF,	"P" })
 
 TRACE_EVENT(erofs_lookup,
 
@@ -66,8 +71,8 @@ TRACE_EVENT(erofs_fill_inode,
 	TP_fast_assign(
 		__entry->dev		= inode->i_sb->s_dev;
 		__entry->nid		= EROFS_I(inode)->nid;
-		__entry->blkaddr	= erofs_blknr(iloc(EROFS_I_SB(inode), __entry->nid));
-		__entry->ofs		= erofs_blkoff(iloc(EROFS_I_SB(inode), __entry->nid));
+		__entry->blkaddr	= erofs_blknr(erofs_iloc(inode));
+		__entry->ofs		= erofs_blkoff(erofs_iloc(inode));
 	),
 
 	TP_printk("dev = (%d,%d), nid = %llu, blkaddr %u ofs %u",