mirror of
https://github.com/torvalds/linux.git
synced 2024-12-31 23:31:29 +00:00
Convert content from the ext4 wiki to Documentation rst files so it is
more likely to be updated as we add new features to ext4. Add 64-bit timestamp support to ext4's superblock fields. And the usual bug fixes and cleanups, including a Spectre gadget fixup and some hardening against maliciously corrupted file systems. -----BEGIN PGP SIGNATURE----- iQEzBAABCAAdFiEEK2m5VNv+CHkogTfJ8vlZVpUNgaMFAltx3CAACgkQ8vlZVpUN gaOfkAgApkNU46yPWWFqJpeTR76Go4zW00x9Piaita+KpQcTCUokK0BDXXeydOEo zhtNjCRl00vbhnYJ90T75Y3ce9oGQDkWpXHFZBdl+kmMBaIHQJYEECCLR2zrvxVk LhsixvLozQd/VFenwzmXLqYshiZjhjBZOKo7/xsoy4waer1vNidA9/SR3hGWTMOk 9IsEVqw1h6pZt7ZB/leZmZ348H9QZZbYOUWl2ArvdmNeg2t8pWH0MD0bos+mj5lW xihyJEIS/OjWZRJhM2WEPMO7U4jD/aX3zmDMGePp/vC5KSqA4FGwk671xIyD+LXR Hd8Rn8chRe4/iyhBqHpJNkqHQRUV6g== =SA3p -----END PGP SIGNATURE----- Merge tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4 Pull ext4 updates from Ted Ts'o: - Convert content from the ext4 wiki to Documentation rst files so it is more likely to be updated as we add new features to ext4. - Add 64-bit timestamp support to ext4's superblock fields. - ... and the usual bug fixes and cleanups, including a Spectre gadget fixup and some hardening against maliciously corrupted file systems. * tag 'ext4_for_linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tytso/ext4: (34 commits) ext4: remove unneeded variable "err" in ext4_mb_release_inode_pa() ext4: improve code readability in ext4_iget() ext4: fix spectre gadget in ext4_mb_regular_allocator() ext4: check for NUL characters in extended attribute's name ext4: use ext4_warning() for sb_getblk failure ext4: fix race when setting the bitmap corrupted flag ext4: reset error code in ext4_find_entry in fallback ext4: handle layout changes to pinned DAX mappings dax: dax_layout_busy_page() warn on !exceptional docs: fix up the obviously obsolete bits in the new ext4 documentation docs: add new ext4 superblock time extension fields docs: create filesystem internal section ext4: use swap macro in mext_page_double_lock ext4: check allocation failure when duplicating "data" in ext4_remount() ext4: fix warning message in ext4_enable_quotas() ext4: super: extend timestamps to 40 bits jbd2: replace current_kernel_time64 with ktime equivalent ext4: use timespec64 for all inode times ext4: use ktime_get_real_seconds for i_dtime ext4: use 64-bit timestamps for mmp_time ...
This commit is contained in:
commit
10f3e23f07
@ -34,7 +34,7 @@ needs_sphinx = '1.3'
|
||||
# Add any Sphinx extension module names here, as strings. They can be
|
||||
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
||||
# ones.
|
||||
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure']
|
||||
extensions = ['kerneldoc', 'rstFlatTable', 'kernel_include', 'cdomain', 'kfigure', 'sphinx.ext.ifconfig']
|
||||
|
||||
# The name of the math extension changed on Sphinx 1.4
|
||||
if major == 1 and minor > 3:
|
||||
|
@ -1,6 +1,8 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Ext4 Filesystem
|
||||
===============
|
||||
========================
|
||||
General Information
|
||||
========================
|
||||
|
||||
Ext4 is an advanced level of the ext3 filesystem which incorporates
|
||||
scalability and reliability enhancements for supporting large filesystems
|
||||
@ -11,37 +13,30 @@ Mailing list: linux-ext4@vger.kernel.org
|
||||
Web site: http://ext4.wiki.kernel.org
|
||||
|
||||
|
||||
1. Quick usage instructions:
|
||||
===========================
|
||||
Quick usage instructions
|
||||
========================
|
||||
|
||||
Note: More extensive information for getting started with ext4 can be
|
||||
found at the ext4 wiki site at the URL:
|
||||
http://ext4.wiki.kernel.org/index.php/Ext4_Howto
|
||||
found at the ext4 wiki site at the URL:
|
||||
http://ext4.wiki.kernel.org/index.php/Ext4_Howto
|
||||
|
||||
- Compile and install the latest version of e2fsprogs (as of this
|
||||
writing version 1.41.3) from:
|
||||
|
||||
http://sourceforge.net/project/showfiles.php?group_id=2406
|
||||
|
||||
or
|
||||
- The latest version of e2fsprogs can be found at:
|
||||
|
||||
https://www.kernel.org/pub/linux/kernel/people/tytso/e2fsprogs/
|
||||
|
||||
or
|
||||
|
||||
http://sourceforge.net/project/showfiles.php?group_id=2406
|
||||
|
||||
or grab the latest git repository from:
|
||||
|
||||
git://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
|
||||
|
||||
- Note that it is highly important to install the mke2fs.conf file
|
||||
that comes with the e2fsprogs 1.41.x sources in /etc/mke2fs.conf. If
|
||||
you have edited the /etc/mke2fs.conf file installed on your system,
|
||||
you will need to merge your changes with the version from e2fsprogs
|
||||
1.41.x.
|
||||
https://git.kernel.org/pub/scm/fs/ext2/e2fsprogs.git
|
||||
|
||||
- Create a new filesystem using the ext4 filesystem type:
|
||||
|
||||
# mke2fs -t ext4 /dev/hda1
|
||||
# mke2fs -t ext4 /dev/hda1
|
||||
|
||||
Or to configure an existing ext3 filesystem to support extents:
|
||||
Or to configure an existing ext3 filesystem to support extents:
|
||||
|
||||
# tune2fs -O extents /dev/hda1
|
||||
|
||||
@ -50,10 +45,6 @@ Note: More extensive information for getting started with ext4 can be
|
||||
|
||||
# tune2fs -I 256 /dev/hda1
|
||||
|
||||
(Note: we currently do not have tools to convert an ext4
|
||||
filesystem back to ext3; so please do not do try this on production
|
||||
filesystems.)
|
||||
|
||||
- Mounting:
|
||||
|
||||
# mount -t ext4 /dev/hda1 /wherever
|
||||
@ -75,10 +66,11 @@ Note: More extensive information for getting started with ext4 can be
|
||||
the filesystem with a large journal can also be helpful for
|
||||
metadata-intensive workloads.
|
||||
|
||||
2. Features
|
||||
===========
|
||||
Features
|
||||
========
|
||||
|
||||
2.1 Currently available
|
||||
Currently Available
|
||||
-------------------
|
||||
|
||||
* ability to use filesystems > 16TB (e2fsprogs support not available yet)
|
||||
* extent format reduces metadata overhead (RAM, IO for access, transactions)
|
||||
@ -103,31 +95,15 @@ Note: More extensive information for getting started with ext4 can be
|
||||
[1] Filesystems with a block size of 1k may see a limit imposed by the
|
||||
directory hash tree having a maximum depth of two.
|
||||
|
||||
2.2 Candidate features for future inclusion
|
||||
|
||||
* online defrag (patches available but not well tested)
|
||||
* reduced mke2fs time via lazy itable initialization in conjunction with
|
||||
the uninit_bg feature (capability to do this is available in e2fsprogs
|
||||
but a kernel thread to do lazy zeroing of unused inode table blocks
|
||||
after filesystem is first mounted is required for safety)
|
||||
|
||||
There are several others under discussion, whether they all make it in is
|
||||
partly a function of how much time everyone has to work on them. Features like
|
||||
metadata checksumming have been discussed and planned for a bit but no patches
|
||||
exist yet so I'm not sure they're in the near-term roadmap.
|
||||
|
||||
The big performance win will come with mballoc, delalloc and flex_bg
|
||||
grouping of bitmaps and inode tables. Some test results available here:
|
||||
|
||||
- http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-write-2.6.27-rc1.html
|
||||
- http://www.bullopensource.org/ext4/20080818-ffsb/ffsb-readwrite-2.6.27-rc1.html
|
||||
|
||||
3. Options
|
||||
==========
|
||||
Options
|
||||
=======
|
||||
|
||||
When mounting an ext4 filesystem, the following option are accepted:
|
||||
(*) == default
|
||||
|
||||
======================= =======================================================
|
||||
Mount Option Description
|
||||
======================= =======================================================
|
||||
ro Mount filesystem read only. Note that ext4 will
|
||||
replay the journal (and thus write to the
|
||||
partition) even when mounted "read only". The
|
||||
@ -387,33 +363,38 @@ i_version Enable 64-bit inode version support. This option is
|
||||
dax Use direct access (no page cache). See
|
||||
Documentation/filesystems/dax.txt. Note that
|
||||
this option is incompatible with data=journal.
|
||||
======================= =======================================================
|
||||
|
||||
Data Mode
|
||||
=========
|
||||
There are 3 different data modes:
|
||||
|
||||
* writeback mode
|
||||
In data=writeback mode, ext4 does not journal data at all. This mode provides
|
||||
a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
|
||||
mode - metadata journaling. A crash+recovery can cause incorrect data to
|
||||
appear in files which were written shortly before the crash. This mode will
|
||||
typically provide the best ext4 performance.
|
||||
|
||||
In data=writeback mode, ext4 does not journal data at all. This mode provides
|
||||
a similar level of journaling as that of XFS, JFS, and ReiserFS in its default
|
||||
mode - metadata journaling. A crash+recovery can cause incorrect data to
|
||||
appear in files which were written shortly before the crash. This mode will
|
||||
typically provide the best ext4 performance.
|
||||
|
||||
* ordered mode
|
||||
In data=ordered mode, ext4 only officially journals metadata, but it logically
|
||||
groups metadata information related to data changes with the data blocks into a
|
||||
single unit called a transaction. When it's time to write the new metadata
|
||||
out to disk, the associated data blocks are written first. In general,
|
||||
this mode performs slightly slower than writeback but significantly faster than journal mode.
|
||||
|
||||
In data=ordered mode, ext4 only officially journals metadata, but it logically
|
||||
groups metadata information related to data changes with the data blocks into
|
||||
a single unit called a transaction. When it's time to write the new metadata
|
||||
out to disk, the associated data blocks are written first. In general, this
|
||||
mode performs slightly slower than writeback but significantly faster than
|
||||
journal mode.
|
||||
|
||||
* journal mode
|
||||
data=journal mode provides full data and metadata journaling. All new data is
|
||||
written to the journal first, and then to its final location.
|
||||
In the event of a crash, the journal can be replayed, bringing both data and
|
||||
metadata into a consistent state. This mode is the slowest except when data
|
||||
needs to be read from and written to disk at the same time where it
|
||||
outperforms all others modes. Enabling this mode will disable delayed
|
||||
allocation and O_DIRECT support.
|
||||
|
||||
data=journal mode provides full data and metadata journaling. All new data is
|
||||
written to the journal first, and then to its final location. In the event of
|
||||
a crash, the journal can be replayed, bringing both data and metadata into a
|
||||
consistent state. This mode is the slowest except when data needs to be read
|
||||
from and written to disk at the same time where it outperforms all others
|
||||
modes. Enabling this mode will disable delayed allocation and O_DIRECT
|
||||
support.
|
||||
|
||||
/proc entries
|
||||
=============
|
||||
@ -425,10 +406,12 @@ Information about mounted ext4 file systems can be found in
|
||||
in table below.
|
||||
|
||||
Files in /proc/fs/ext4/<devname>
|
||||
..............................................................................
|
||||
|
||||
================ =======
|
||||
File Content
|
||||
================ =======
|
||||
mb_groups details of multiblock allocator buddy cache of free blocks
|
||||
..............................................................................
|
||||
================ =======
|
||||
|
||||
/sys entries
|
||||
============
|
||||
@ -439,28 +422,30 @@ Information about mounted ext4 file systems can be found in
|
||||
/sys/fs/ext4/dm-0). The files in each per-device directory are shown
|
||||
in table below.
|
||||
|
||||
Files in /sys/fs/ext4/<devname>
|
||||
(see also Documentation/ABI/testing/sysfs-fs-ext4)
|
||||
..............................................................................
|
||||
File Content
|
||||
Files in /sys/fs/ext4/<devname>:
|
||||
|
||||
(see also Documentation/ABI/testing/sysfs-fs-ext4)
|
||||
|
||||
============================= =================================================
|
||||
File Content
|
||||
============================= =================================================
|
||||
delayed_allocation_blocks This file is read-only and shows the number of
|
||||
blocks that are dirty in the page cache, but
|
||||
which do not have their location in the
|
||||
filesystem allocated yet.
|
||||
|
||||
inode_goal Tuning parameter which (if non-zero) controls
|
||||
inode_goal Tuning parameter which (if non-zero) controls
|
||||
the goal inode used by the inode allocator in
|
||||
preference to all other allocation heuristics.
|
||||
This is intended for debugging use only, and
|
||||
should be 0 on production systems.
|
||||
|
||||
inode_readahead_blks Tuning parameter which controls the maximum
|
||||
inode_readahead_blks Tuning parameter which controls the maximum
|
||||
number of inode table blocks that ext4's inode
|
||||
table readahead algorithm will pre-read into
|
||||
the buffer cache
|
||||
|
||||
lifetime_write_kbytes This file is read-only and shows the number of
|
||||
lifetime_write_kbytes This file is read-only and shows the number of
|
||||
kilobytes of data that have been written to this
|
||||
filesystem since it was created.
|
||||
|
||||
@ -508,7 +493,7 @@ Files in /sys/fs/ext4/<devname>
|
||||
in the file system. If there is not enough space
|
||||
for the reserved space when mounting the file
|
||||
mount will _not_ fail.
|
||||
..............................................................................
|
||||
============================= =================================================
|
||||
|
||||
Ioctls
|
||||
======
|
||||
@ -518,8 +503,10 @@ through the system call interfaces. The list of all Ext4 specific ioctls are
|
||||
shown in the table below.
|
||||
|
||||
Table of Ext4 specific ioctls
|
||||
..............................................................................
|
||||
Ioctl Description
|
||||
|
||||
============================= =================================================
|
||||
Ioctl Description
|
||||
============================= =================================================
|
||||
EXT4_IOC_GETFLAGS Get additional attributes associated with inode.
|
||||
The ioctl argument is an integer bitfield, with
|
||||
bit values described in ext4.h. This ioctl is an
|
||||
@ -610,8 +597,7 @@ Table of Ext4 specific ioctls
|
||||
normal user by accident.
|
||||
The data blocks of the previous boot loader
|
||||
will be associated with the given inode.
|
||||
|
||||
..............................................................................
|
||||
============================= =================================================
|
||||
|
||||
References
|
||||
==========
|
17
Documentation/filesystems/ext4/index.rst
Normal file
17
Documentation/filesystems/ext4/index.rst
Normal file
@ -0,0 +1,17 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
===============
|
||||
ext4 Filesystem
|
||||
===============
|
||||
|
||||
General usage and on-disk artifacts writen by ext4. More documentation may
|
||||
be ported from the wiki as time permits. This should be considered the
|
||||
canonical source of information as the details here have been reviewed by
|
||||
the ext4 community.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 5
|
||||
:numbered:
|
||||
|
||||
ext4
|
||||
ondisk/index
|
44
Documentation/filesystems/ext4/ondisk/about.rst
Normal file
44
Documentation/filesystems/ext4/ondisk/about.rst
Normal file
@ -0,0 +1,44 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
About this Book
|
||||
===============
|
||||
|
||||
This document attempts to describe the on-disk format for ext4
|
||||
filesystems. The same general ideas should apply to ext2/3 filesystems
|
||||
as well, though they do not support all the features that ext4 supports,
|
||||
and the fields will be shorter.
|
||||
|
||||
**NOTE**: This is a work in progress, based on notes that the author
|
||||
(djwong) made while picking apart a filesystem by hand. The data
|
||||
structure definitions should be current as of Linux 4.18 and
|
||||
e2fsprogs-1.44. All comments and corrections are welcome, since there is
|
||||
undoubtedly plenty of lore that might not be reflected in freshly
|
||||
created demonstration filesystems.
|
||||
|
||||
License
|
||||
-------
|
||||
This book is licensed under the terms of the GNU Public License, v2.
|
||||
|
||||
Terminology
|
||||
-----------
|
||||
|
||||
ext4 divides a storage device into an array of logical blocks both to
|
||||
reduce bookkeeping overhead and to increase throughput by forcing larger
|
||||
transfer sizes. Generally, the block size will be 4KiB (the same size as
|
||||
pages on x86 and the block layer's default block size), though the
|
||||
actual size is calculated as 2 ^ (10 + ``sb.s_log_block_size``) bytes.
|
||||
Throughout this document, disk locations are given in terms of these
|
||||
logical blocks, not raw LBAs, and not 1024-byte blocks. For the sake of
|
||||
convenience, the logical block size will be referred to as
|
||||
``$block_size`` throughout the rest of the document.
|
||||
|
||||
When referenced in ``preformatted text`` blocks, ``sb`` refers to fields
|
||||
in the super block, and ``inode`` refers to fields in an inode table
|
||||
entry.
|
||||
|
||||
Other References
|
||||
----------------
|
||||
|
||||
Also see http://www.nongnu.org/ext2-doc/ for quite a collection of
|
||||
information about ext2/3. Here's another old reference:
|
||||
http://wiki.osdev.org/Ext2
|
56
Documentation/filesystems/ext4/ondisk/allocators.rst
Normal file
56
Documentation/filesystems/ext4/ondisk/allocators.rst
Normal file
@ -0,0 +1,56 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Block and Inode Allocation Policy
|
||||
---------------------------------
|
||||
|
||||
ext4 recognizes (better than ext3, anyway) that data locality is
|
||||
generally a desirably quality of a filesystem. On a spinning disk,
|
||||
keeping related blocks near each other reduces the amount of movement
|
||||
that the head actuator and disk must perform to access a data block,
|
||||
thus speeding up disk IO. On an SSD there of course are no moving parts,
|
||||
but locality can increase the size of each transfer request while
|
||||
reducing the total number of requests. This locality may also have the
|
||||
effect of concentrating writes on a single erase block, which can speed
|
||||
up file rewrites significantly. Therefore, it is useful to reduce
|
||||
fragmentation whenever possible.
|
||||
|
||||
The first tool that ext4 uses to combat fragmentation is the multi-block
|
||||
allocator. When a file is first created, the block allocator
|
||||
speculatively allocates 8KiB of disk space to the file on the assumption
|
||||
that the space will get written soon. When the file is closed, the
|
||||
unused speculative allocations are of course freed, but if the
|
||||
speculation is correct (typically the case for full writes of small
|
||||
files) then the file data gets written out in a single multi-block
|
||||
extent. A second related trick that ext4 uses is delayed allocation.
|
||||
Under this scheme, when a file needs more blocks to absorb file writes,
|
||||
the filesystem defers deciding the exact placement on the disk until all
|
||||
the dirty buffers are being written out to disk. By not committing to a
|
||||
particular placement until it's absolutely necessary (the commit timeout
|
||||
is hit, or sync() is called, or the kernel runs out of memory), the hope
|
||||
is that the filesystem can make better location decisions.
|
||||
|
||||
The third trick that ext4 (and ext3) uses is that it tries to keep a
|
||||
file's data blocks in the same block group as its inode. This cuts down
|
||||
on the seek penalty when the filesystem first has to read a file's inode
|
||||
to learn where the file's data blocks live and then seek over to the
|
||||
file's data blocks to begin I/O operations.
|
||||
|
||||
The fourth trick is that all the inodes in a directory are placed in the
|
||||
same block group as the directory, when feasible. The working assumption
|
||||
here is that all the files in a directory might be related, therefore it
|
||||
is useful to try to keep them all together.
|
||||
|
||||
The fifth trick is that the disk volume is cut up into 128MB block
|
||||
groups; these mini-containers are used as outlined above to try to
|
||||
maintain data locality. However, there is a deliberate quirk -- when a
|
||||
directory is created in the root directory, the inode allocator scans
|
||||
the block groups and puts that directory into the least heavily loaded
|
||||
block group that it can find. This encourages directories to spread out
|
||||
over a disk; as the top-level directory/file blobs fill up one block
|
||||
group, the allocators simply move on to the next block group. Allegedly
|
||||
this scheme evens out the loading on the block groups, though the author
|
||||
suspects that the directories which are so unlucky as to land towards
|
||||
the end of a spinning drive get a raw deal performance-wise.
|
||||
|
||||
Of course if all of these mechanisms fail, one can always use e4defrag
|
||||
to defragment files.
|
191
Documentation/filesystems/ext4/ondisk/attributes.rst
Normal file
191
Documentation/filesystems/ext4/ondisk/attributes.rst
Normal file
@ -0,0 +1,191 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Extended Attributes
|
||||
-------------------
|
||||
|
||||
Extended attributes (xattrs) are typically stored in a separate data
|
||||
block on the disk and referenced from inodes via ``inode.i_file_acl*``.
|
||||
The first use of extended attributes seems to have been for storing file
|
||||
ACLs and other security data (selinux). With the ``user_xattr`` mount
|
||||
option it is possible for users to store extended attributes so long as
|
||||
all attribute names begin with “user”; this restriction seems to have
|
||||
disappeared as of Linux 3.0.
|
||||
|
||||
There are two places where extended attributes can be found. The first
|
||||
place is between the end of each inode entry and the beginning of the
|
||||
next inode entry. For example, if inode.i\_extra\_isize = 28 and
|
||||
sb.inode\_size = 256, then there are 256 - (128 + 28) = 100 bytes
|
||||
available for in-inode extended attribute storage. The second place
|
||||
where extended attributes can be found is in the block pointed to by
|
||||
``inode.i_file_acl``. As of Linux 3.11, it is not possible for this
|
||||
block to contain a pointer to a second extended attribute block (or even
|
||||
the remaining blocks of a cluster). In theory it is possible for each
|
||||
attribute's value to be stored in a separate data block, though as of
|
||||
Linux 3.11 the code does not permit this.
|
||||
|
||||
Keys are generally assumed to be ASCIIZ strings, whereas values can be
|
||||
strings or binary data.
|
||||
|
||||
Extended attributes, when stored after the inode, have a header
|
||||
``ext4_xattr_ibody_header`` that is 4 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- h\_magic
|
||||
- Magic number for identification, 0xEA020000. This value is set by the
|
||||
Linux driver, though e2fsprogs doesn't seem to check it(?)
|
||||
|
||||
The beginning of an extended attribute block is in
|
||||
``struct ext4_xattr_header``, which is 32 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- h\_magic
|
||||
- Magic number for identification, 0xEA020000.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- h\_refcount
|
||||
- Reference count.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- h\_blocks
|
||||
- Number of disk blocks used.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- h\_hash
|
||||
- Hash value of all attributes.
|
||||
* - 0x10
|
||||
- \_\_le32
|
||||
- h\_checksum
|
||||
- Checksum of the extended attribute block.
|
||||
* - 0x14
|
||||
- \_\_u32
|
||||
- h\_reserved[2]
|
||||
- Zero.
|
||||
|
||||
The checksum is calculated against the FS UUID, the 64-bit block number
|
||||
of the extended attribute block, and the entire block (header +
|
||||
entries).
|
||||
|
||||
Following the ``struct ext4_xattr_header`` or
|
||||
``struct ext4_xattr_ibody_header`` is an array of
|
||||
``struct ext4_xattr_entry``; each of these entries is at least 16 bytes
|
||||
long. When stored in an external block, the ``struct ext4_xattr_entry``
|
||||
entries must be stored in sorted order. The sort order is
|
||||
``e_name_index``, then ``e_name_len``, and finally ``e_name``.
|
||||
Attributes stored inside an inode do not need be stored in sorted order.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_u8
|
||||
- e\_name\_len
|
||||
- Length of name.
|
||||
* - 0x1
|
||||
- \_\_u8
|
||||
- e\_name\_index
|
||||
- Attribute name index. There is a discussion of this below.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- e\_value\_offs
|
||||
- Location of this attribute's value on the disk block where it is stored.
|
||||
Multiple attributes can share the same value. For an inode attribute
|
||||
this value is relative to the start of the first entry; for a block this
|
||||
value is relative to the start of the block (i.e. the header).
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- e\_value\_inum
|
||||
- The inode where the value is stored. Zero indicates the value is in the
|
||||
same block as this entry. This field is only used if the
|
||||
INCOMPAT\_EA\_INODE feature is enabled.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- e\_value\_size
|
||||
- Length of attribute value.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- e\_hash
|
||||
- Hash value of attribute name and attribute value. The kernel doesn't
|
||||
update the hash for in-inode attributes, so for that case this value
|
||||
must be zero, because e2fsck validates any non-zero hash regardless of
|
||||
where the xattr lives.
|
||||
* - 0x10
|
||||
- char
|
||||
- e\_name[e\_name\_len]
|
||||
- Attribute name. Does not include trailing NULL.
|
||||
|
||||
Attribute values can follow the end of the entry table. There appears to
|
||||
be a requirement that they be aligned to 4-byte boundaries. The values
|
||||
are stored starting at the end of the block and grow towards the
|
||||
xattr\_header/xattr\_entry table. When the two collide, the overflow is
|
||||
put into a separate disk block. If the disk block fills up, the
|
||||
filesystem returns -ENOSPC.
|
||||
|
||||
The first four fields of the ``ext4_xattr_entry`` are set to zero to
|
||||
mark the end of the key list.
|
||||
|
||||
Attribute Name Indices
|
||||
~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
Logically speaking, extended attributes are a series of key=value pairs.
|
||||
The keys are assumed to be NULL-terminated strings. To reduce the amount
|
||||
of on-disk space that the keys consume, the beginning of the key string
|
||||
is matched against the attribute name index. If a match is found, the
|
||||
attribute name index field is set, and matching string is removed from
|
||||
the key name. Here is a map of name index values to key prefixes:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Name Index
|
||||
- Key Prefix
|
||||
* - 0
|
||||
- (no prefix)
|
||||
* - 1
|
||||
- “user.”
|
||||
* - 2
|
||||
- “system.posix\_acl\_access”
|
||||
* - 3
|
||||
- “system.posix\_acl\_default”
|
||||
* - 4
|
||||
- “trusted.”
|
||||
* - 6
|
||||
- “security.”
|
||||
* - 7
|
||||
- “system.” (inline\_data only?)
|
||||
* - 8
|
||||
- “system.richacl” (SuSE kernels only?)
|
||||
|
||||
For example, if the attribute key is “user.fubar”, the attribute name
|
||||
index is set to 1 and the “fubar” name is recorded on disk.
|
||||
|
||||
POSIX ACLs
|
||||
~~~~~~~~~~
|
||||
|
||||
POSIX ACLs are stored in a reduced version of the Linux kernel (and
|
||||
libacl's) internal ACL format. The key difference is that the version
|
||||
number is different (1) and the ``e_id`` field is only stored for named
|
||||
user and group ACLs.
|
22
Documentation/filesystems/ext4/ondisk/bigalloc.rst
Normal file
22
Documentation/filesystems/ext4/ondisk/bigalloc.rst
Normal file
@ -0,0 +1,22 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Bigalloc
|
||||
--------
|
||||
|
||||
At the moment, the default size of a block is 4KiB, which is a commonly
|
||||
supported page size on most MMU-capable hardware. This is fortunate, as
|
||||
ext4 code is not prepared to handle the case where the block size
|
||||
exceeds the page size. However, for a filesystem of mostly huge files,
|
||||
it is desirable to be able to allocate disk blocks in units of multiple
|
||||
blocks to reduce both fragmentation and metadata overhead. The
|
||||
`bigalloc <Bigalloc>`__ feature provides exactly this ability. The
|
||||
administrator can set a block cluster size at mkfs time (which is stored
|
||||
in the s\_log\_cluster\_size field in the superblock); from then on, the
|
||||
block bitmaps track clusters, not individual blocks. This means that
|
||||
block groups can be several gigabytes in size (instead of just 128MiB);
|
||||
however, the minimum allocation unit becomes a cluster, not a block,
|
||||
even for directories. TaoBao had a patchset to extend the “use units of
|
||||
clusters instead of blocks” to the extent tree, though it is not clear
|
||||
where those patches went-- they eventually morphed into “extent tree v2”
|
||||
but that code has not landed as of May 2015.
|
||||
|
28
Documentation/filesystems/ext4/ondisk/bitmaps.rst
Normal file
28
Documentation/filesystems/ext4/ondisk/bitmaps.rst
Normal file
@ -0,0 +1,28 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Block and inode Bitmaps
|
||||
-----------------------
|
||||
|
||||
The data block bitmap tracks the usage of data blocks within the block
|
||||
group.
|
||||
|
||||
The inode bitmap records which entries in the inode table are in use.
|
||||
|
||||
As with most bitmaps, one bit represents the usage status of one data
|
||||
block or inode table entry. This implies a block group size of 8 \*
|
||||
number\_of\_bytes\_in\_a\_logical\_block.
|
||||
|
||||
NOTE: If ``BLOCK_UNINIT`` is set for a given block group, various parts
|
||||
of the kernel and e2fsprogs code pretends that the block bitmap contains
|
||||
zeros (i.e. all blocks in the group are free). However, it is not
|
||||
necessarily the case that no blocks are in use -- if ``meta_bg`` is set,
|
||||
the bitmaps and group descriptor live inside the group. Unfortunately,
|
||||
ext2fs\_test\_block\_bitmap2() will return '0' for those locations,
|
||||
which produces confusing debugfs output.
|
||||
|
||||
Inode Table
|
||||
-----------
|
||||
Inode tables are statically allocated at mkfs time. Each block group
|
||||
descriptor points to the start of the table, and the superblock records
|
||||
the number of inodes per group. See the section on inodes for more
|
||||
information.
|
135
Documentation/filesystems/ext4/ondisk/blockgroup.rst
Normal file
135
Documentation/filesystems/ext4/ondisk/blockgroup.rst
Normal file
@ -0,0 +1,135 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Layout
|
||||
------
|
||||
|
||||
The layout of a standard block group is approximately as follows (each
|
||||
of these fields is discussed in a separate section below):
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 1 1 1 1 1
|
||||
:header-rows: 1
|
||||
|
||||
* - Group 0 Padding
|
||||
- ext4 Super Block
|
||||
- Group Descriptors
|
||||
- Reserved GDT Blocks
|
||||
- Data Block Bitmap
|
||||
- inode Bitmap
|
||||
- inode Table
|
||||
- Data Blocks
|
||||
* - 1024 bytes
|
||||
- 1 block
|
||||
- many blocks
|
||||
- many blocks
|
||||
- 1 block
|
||||
- 1 block
|
||||
- many blocks
|
||||
- many more blocks
|
||||
|
||||
For the special case of block group 0, the first 1024 bytes are unused,
|
||||
to allow for the installation of x86 boot sectors and other oddities.
|
||||
The superblock will start at offset 1024 bytes, whichever block that
|
||||
happens to be (usually 0). However, if for some reason the block size =
|
||||
1024, then block 0 is marked in use and the superblock goes in block 1.
|
||||
For all other block groups, there is no padding.
|
||||
|
||||
The ext4 driver primarily works with the superblock and the group
|
||||
descriptors that are found in block group 0. Redundant copies of the
|
||||
superblock and group descriptors are written to some of the block groups
|
||||
across the disk in case the beginning of the disk gets trashed, though
|
||||
not all block groups necessarily host a redundant copy (see following
|
||||
paragraph for more details). If the group does not have a redundant
|
||||
copy, the block group begins with the data block bitmap. Note also that
|
||||
when the filesystem is freshly formatted, mkfs will allocate “reserve
|
||||
GDT block” space after the block group descriptors and before the start
|
||||
of the block bitmaps to allow for future expansion of the filesystem. By
|
||||
default, a filesystem is allowed to increase in size by a factor of
|
||||
1024x over the original filesystem size.
|
||||
|
||||
The location of the inode table is given by ``grp.bg_inode_table_*``. It
|
||||
is continuous range of blocks large enough to contain
|
||||
``sb.s_inodes_per_group * sb.s_inode_size`` bytes.
|
||||
|
||||
As for the ordering of items in a block group, it is generally
|
||||
established that the super block and the group descriptor table, if
|
||||
present, will be at the beginning of the block group. The bitmaps and
|
||||
the inode table can be anywhere, and it is quite possible for the
|
||||
bitmaps to come after the inode table, or for both to be in different
|
||||
groups (flex\_bg). Leftover space is used for file data blocks, indirect
|
||||
block maps, extent tree blocks, and extended attributes.
|
||||
|
||||
Flexible Block Groups
|
||||
---------------------
|
||||
|
||||
Starting in ext4, there is a new feature called flexible block groups
|
||||
(flex\_bg). In a flex\_bg, several block groups are tied together as one
|
||||
logical block group; the bitmap spaces and the inode table space in the
|
||||
first block group of the flex\_bg are expanded to include the bitmaps
|
||||
and inode tables of all other block groups in the flex\_bg. For example,
|
||||
if the flex\_bg size is 4, then group 0 will contain (in order) the
|
||||
superblock, group descriptors, data block bitmaps for groups 0-3, inode
|
||||
bitmaps for groups 0-3, inode tables for groups 0-3, and the remaining
|
||||
space in group 0 is for file data. The effect of this is to group the
|
||||
block metadata close together for faster loading, and to enable large
|
||||
files to be continuous on disk. Backup copies of the superblock and
|
||||
group descriptors are always at the beginning of block groups, even if
|
||||
flex\_bg is enabled. The number of block groups that make up a flex\_bg
|
||||
is given by 2 ^ ``sb.s_log_groups_per_flex``.
|
||||
|
||||
Meta Block Groups
|
||||
-----------------
|
||||
|
||||
Without the option META\_BG, for safety concerns, all block group
|
||||
descriptors copies are kept in the first block group. Given the default
|
||||
128MiB(2^27 bytes) block group size and 64-byte group descriptors, ext4
|
||||
can have at most 2^27/64 = 2^21 block groups. This limits the entire
|
||||
filesystem size to 2^21 ∗ 2^27 = 2^48bytes or 256TiB.
|
||||
|
||||
The solution to this problem is to use the metablock group feature
|
||||
(META\_BG), which is already in ext3 for all 2.6 releases. With the
|
||||
META\_BG feature, ext4 filesystems are partitioned into many metablock
|
||||
groups. Each metablock group is a cluster of block groups whose group
|
||||
descriptor structures can be stored in a single disk block. For ext4
|
||||
filesystems with 4 KB block size, a single metablock group partition
|
||||
includes 64 block groups, or 8 GiB of disk space. The metablock group
|
||||
feature moves the location of the group descriptors from the congested
|
||||
first block group of the whole filesystem into the first group of each
|
||||
metablock group itself. The backups are in the second and last group of
|
||||
each metablock group. This increases the 2^21 maximum block groups limit
|
||||
to the hard limit 2^32, allowing support for a 512PiB filesystem.
|
||||
|
||||
The change in the filesystem format replaces the current scheme where
|
||||
the superblock is followed by a variable-length set of block group
|
||||
descriptors. Instead, the superblock and a single block group descriptor
|
||||
block is placed at the beginning of the first, second, and last block
|
||||
groups in a meta-block group. A meta-block group is a collection of
|
||||
block groups which can be described by a single block group descriptor
|
||||
block. Since the size of the block group descriptor structure is 32
|
||||
bytes, a meta-block group contains 32 block groups for filesystems with
|
||||
a 1KB block size, and 128 block groups for filesystems with a 4KB
|
||||
blocksize. Filesystems can either be created using this new block group
|
||||
descriptor layout, or existing filesystems can be resized on-line, and
|
||||
the field s\_first\_meta\_bg in the superblock will indicate the first
|
||||
block group using this new layout.
|
||||
|
||||
Please see an important note about ``BLOCK_UNINIT`` in the section about
|
||||
block and inode bitmaps.
|
||||
|
||||
Lazy Block Group Initialization
|
||||
-------------------------------
|
||||
|
||||
A new feature for ext4 are three block group descriptor flags that
|
||||
enable mkfs to skip initializing other parts of the block group
|
||||
metadata. Specifically, the INODE\_UNINIT and BLOCK\_UNINIT flags mean
|
||||
that the inode and block bitmaps for that group can be calculated and
|
||||
therefore the on-disk bitmap blocks are not initialized. This is
|
||||
generally the case for an empty block group or a block group containing
|
||||
only fixed-location block group metadata. The INODE\_ZEROED flag means
|
||||
that the inode table has been initialized; mkfs will unset this flag and
|
||||
rely on the kernel to initialize the inode tables in the background.
|
||||
|
||||
By not writing zeroes to the bitmaps and inode table, mkfs time is
|
||||
reduced considerably. Note the feature flag is RO\_COMPAT\_GDT\_CSUM,
|
||||
but the dumpe2fs output prints this as “uninit\_bg”. They are the same
|
||||
thing.
|
49
Documentation/filesystems/ext4/ondisk/blockmap.rst
Normal file
49
Documentation/filesystems/ext4/ondisk/blockmap.rst
Normal file
@ -0,0 +1,49 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| i.i\_block Offset | Where It Points |
|
||||
+=====================+==============================================================================================================================================================================================================================+
|
||||
| 0 to 11 | Direct map to file blocks 0 to 11. |
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 12 | Indirect block: (file blocks 12 to (``$block_size`` / 4) + 11, or 12 to 1035 if 4KiB blocks) |
|
||||
| | |
|
||||
| | +------------------------------+--------------------------------------------------------------------+ |
|
||||
| | | Indirect Block Offset | Where It Points | |
|
||||
| | +==============================+====================================================================+ |
|
||||
| | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | |
|
||||
| | +------------------------------+--------------------------------------------------------------------+ |
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 13 | Double-indirect block: (file blocks ``$block_size``/4 + 12 to (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 11, or 1036 to 1049611 if 4KiB blocks) |
|
||||
| | |
|
||||
| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ |
|
||||
| | | Double Indirect Block Offset | Where It Points | |
|
||||
| | +================================+=========================================================================================================+ |
|
||||
| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | |
|
||||
| | | | | |
|
||||
| | | | +------------------------------+--------------------------------------------------------------------+ | |
|
||||
| | | | | Indirect Block Offset | Where It Points | | |
|
||||
| | | | +==============================+====================================================================+ | |
|
||||
| | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | |
|
||||
| | | | +------------------------------+--------------------------------------------------------------------+ | |
|
||||
| | +--------------------------------+---------------------------------------------------------------------------------------------------------+ |
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
||||
| 14 | Triple-indirect block: (file blocks (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12 to (``$block_size`` / 4) ^ 3 + (``$block_size`` / 4) ^ 2 + (``$block_size`` / 4) + 12, or 1049612 to 1074791436 if 4KiB blocks) |
|
||||
| | |
|
||||
| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ |
|
||||
| | | Triple Indirect Block Offset | Where It Points | |
|
||||
| | +================================+================================================================================================================================================+ |
|
||||
| | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) double indirect blocks (1024 if 4KiB blocks) | |
|
||||
| | | | | |
|
||||
| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | |
|
||||
| | | | | Double Indirect Block Offset | Where It Points | | |
|
||||
| | | | +================================+=========================================================================================================+ | |
|
||||
| | | | | 0 to (``$block_size`` / 4) | Map to (``$block_size`` / 4) indirect blocks (1024 if 4KiB blocks) | | |
|
||||
| | | | | | | | |
|
||||
| | | | | | +------------------------------+--------------------------------------------------------------------+ | | |
|
||||
| | | | | | | Indirect Block Offset | Where It Points | | | |
|
||||
| | | | | | +==============================+====================================================================+ | | |
|
||||
| | | | | | | 0 to (``$block_size`` / 4) | Direct map to (``$block_size`` / 4) blocks (1024 if 4KiB blocks) | | | |
|
||||
| | | | | | +------------------------------+--------------------------------------------------------------------+ | | |
|
||||
| | | | +--------------------------------+---------------------------------------------------------------------------------------------------------+ | |
|
||||
| | +--------------------------------+------------------------------------------------------------------------------------------------------------------------------------------------+ |
|
||||
+---------------------+------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------+
|
142
Documentation/filesystems/ext4/ondisk/blocks.rst
Normal file
142
Documentation/filesystems/ext4/ondisk/blocks.rst
Normal file
@ -0,0 +1,142 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Blocks
|
||||
------
|
||||
|
||||
ext4 allocates storage space in units of “blocks”. A block is a group of
|
||||
sectors between 1KiB and 64KiB, and the number of sectors must be an
|
||||
integral power of 2. Blocks are in turn grouped into larger units called
|
||||
block groups. Block size is specified at mkfs time and typically is
|
||||
4KiB. You may experience mounting problems if block size is greater than
|
||||
page size (i.e. 64KiB blocks on a i386 which only has 4KiB memory
|
||||
pages). By default a filesystem can contain 2^32 blocks; if the '64bit'
|
||||
feature is enabled, then a filesystem can have 2^64 blocks.
|
||||
|
||||
For 32-bit filesystems, limits are as follows:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 1 1
|
||||
:header-rows: 1
|
||||
|
||||
* - Item
|
||||
- 1KiB
|
||||
- 2KiB
|
||||
- 4KiB
|
||||
- 64KiB
|
||||
* - Blocks
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
* - Inodes
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
* - File System Size
|
||||
- 4TiB
|
||||
- 8TiB
|
||||
- 16TiB
|
||||
- 256PiB
|
||||
* - Blocks Per Block Group
|
||||
- 8,192
|
||||
- 16,384
|
||||
- 32,768
|
||||
- 524,288
|
||||
* - Inodes Per Block Group
|
||||
- 8,192
|
||||
- 16,384
|
||||
- 32,768
|
||||
- 524,288
|
||||
* - Block Group Size
|
||||
- 8MiB
|
||||
- 32MiB
|
||||
- 128MiB
|
||||
- 32GiB
|
||||
* - Blocks Per File, Extents
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
* - Blocks Per File, Block Maps
|
||||
- 16,843,020
|
||||
- 134,480,396
|
||||
- 1,074,791,436
|
||||
- 4,398,314,962,956 (really 2^32 due to field size limitations)
|
||||
* - File Size, Extents
|
||||
- 4TiB
|
||||
- 8TiB
|
||||
- 16TiB
|
||||
- 256TiB
|
||||
* - File Size, Block Maps
|
||||
- 16GiB
|
||||
- 256GiB
|
||||
- 4TiB
|
||||
- 256TiB
|
||||
|
||||
For 64-bit filesystems, limits are as follows:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 1 1
|
||||
:header-rows: 1
|
||||
|
||||
* - Item
|
||||
- 1KiB
|
||||
- 2KiB
|
||||
- 4KiB
|
||||
- 64KiB
|
||||
* - Blocks
|
||||
- 2^64
|
||||
- 2^64
|
||||
- 2^64
|
||||
- 2^64
|
||||
* - Inodes
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
* - File System Size
|
||||
- 16ZiB
|
||||
- 32ZiB
|
||||
- 64ZiB
|
||||
- 1YiB
|
||||
* - Blocks Per Block Group
|
||||
- 8,192
|
||||
- 16,384
|
||||
- 32,768
|
||||
- 524,288
|
||||
* - Inodes Per Block Group
|
||||
- 8,192
|
||||
- 16,384
|
||||
- 32,768
|
||||
- 524,288
|
||||
* - Block Group Size
|
||||
- 8MiB
|
||||
- 32MiB
|
||||
- 128MiB
|
||||
- 32GiB
|
||||
* - Blocks Per File, Extents
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
- 2^32
|
||||
* - Blocks Per File, Block Maps
|
||||
- 16,843,020
|
||||
- 134,480,396
|
||||
- 1,074,791,436
|
||||
- 4,398,314,962,956 (really 2^32 due to field size limitations)
|
||||
* - File Size, Extents
|
||||
- 4TiB
|
||||
- 8TiB
|
||||
- 16TiB
|
||||
- 256TiB
|
||||
* - File Size, Block Maps
|
||||
- 16GiB
|
||||
- 256GiB
|
||||
- 4TiB
|
||||
- 256TiB
|
||||
|
||||
Note: Files not using extents (i.e. files using block maps) must be
|
||||
placed within the first 2^32 blocks of a filesystem. Files with extents
|
||||
must be placed within the first 2^48 blocks of a filesystem. It's not
|
||||
clear what happens with larger filesystems.
|
73
Documentation/filesystems/ext4/ondisk/checksums.rst
Normal file
73
Documentation/filesystems/ext4/ondisk/checksums.rst
Normal file
@ -0,0 +1,73 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Checksums
|
||||
---------
|
||||
|
||||
Starting in early 2012, metadata checksums were added to all major ext4
|
||||
and jbd2 data structures. The associated feature flag is metadata\_csum.
|
||||
The desired checksum algorithm is indicated in the superblock, though as
|
||||
of October 2012 the only supported algorithm is crc32c. Some data
|
||||
structures did not have space to fit a full 32-bit checksum, so only the
|
||||
lower 16 bits are stored. Enabling the 64bit feature increases the data
|
||||
structure size so that full 32-bit checksums can be stored for many data
|
||||
structures. However, existing 32-bit filesystems cannot be extended to
|
||||
enable 64bit mode, at least not without the experimental resize2fs
|
||||
patches to do so.
|
||||
|
||||
Existing filesystems can have checksumming added by running
|
||||
``tune2fs -O metadata_csum`` against the underlying device. If tune2fs
|
||||
encounters directory blocks that lack sufficient empty space to add a
|
||||
checksum, it will request that you run ``e2fsck -D`` to have the
|
||||
directories rebuilt with checksums. This has the added benefit of
|
||||
removing slack space from the directory files and rebalancing the htree
|
||||
indexes. If you \_ignore\_ this step, your directories will not be
|
||||
protected by a checksum!
|
||||
|
||||
The following table describes the data elements that go into each type
|
||||
of checksum. The checksum function is whatever the superblock describes
|
||||
(crc32c as of October 2013) unless noted otherwise.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 4
|
||||
:header-rows: 1
|
||||
|
||||
* - Metadata
|
||||
- Length
|
||||
- Ingredients
|
||||
* - Superblock
|
||||
- \_\_le32
|
||||
- The entire superblock up to the checksum field. The UUID lives inside
|
||||
the superblock.
|
||||
* - MMP
|
||||
- \_\_le32
|
||||
- UUID + the entire MMP block up to the checksum field.
|
||||
* - Extended Attributes
|
||||
- \_\_le32
|
||||
- UUID + the entire extended attribute block. The checksum field is set to
|
||||
zero.
|
||||
* - Directory Entries
|
||||
- \_\_le32
|
||||
- UUID + inode number + inode generation + the directory block up to the
|
||||
fake entry enclosing the checksum field.
|
||||
* - HTREE Nodes
|
||||
- \_\_le32
|
||||
- UUID + inode number + inode generation + all valid extents + HTREE tail.
|
||||
The checksum field is set to zero.
|
||||
* - Extents
|
||||
- \_\_le32
|
||||
- UUID + inode number + inode generation + the entire extent block up to
|
||||
the checksum field.
|
||||
* - Bitmaps
|
||||
- \_\_le32 or \_\_le16
|
||||
- UUID + the entire bitmap. Checksums are stored in the group descriptor,
|
||||
and truncated if the group descriptor size is 32 bytes (i.e. ^64bit)
|
||||
* - Inodes
|
||||
- \_\_le32
|
||||
- UUID + inode number + inode generation + the entire inode. The checksum
|
||||
field is set to zero. Each inode has its own checksum.
|
||||
* - Group Descriptors
|
||||
- \_\_le16
|
||||
- If metadata\_csum, then UUID + group number + the entire descriptor;
|
||||
else if gdt\_csum, then crc16(UUID + group number + the entire
|
||||
descriptor). In all cases, only the lower 16 bits are stored.
|
||||
|
426
Documentation/filesystems/ext4/ondisk/directory.rst
Normal file
426
Documentation/filesystems/ext4/ondisk/directory.rst
Normal file
@ -0,0 +1,426 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Directory Entries
|
||||
-----------------
|
||||
|
||||
In an ext4 filesystem, a directory is more or less a flat file that maps
|
||||
an arbitrary byte string (usually ASCII) to an inode number on the
|
||||
filesystem. There can be many directory entries across the filesystem
|
||||
that reference the same inode number--these are known as hard links, and
|
||||
that is why hard links cannot reference files on other filesystems. As
|
||||
such, directory entries are found by reading the data block(s)
|
||||
associated with a directory file for the particular directory entry that
|
||||
is desired.
|
||||
|
||||
Linear (Classic) Directories
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
By default, each directory lists its entries in an “almost-linear”
|
||||
array. I write “almost” because it's not a linear array in the memory
|
||||
sense because directory entries are not split across filesystem blocks.
|
||||
Therefore, it is more accurate to say that a directory is a series of
|
||||
data blocks and that each block contains a linear array of directory
|
||||
entries. The end of each per-block array is signified by reaching the
|
||||
end of the block; the last entry in the block has a record length that
|
||||
takes it all the way to the end of the block. The end of the entire
|
||||
directory is of course signified by reaching the end of the file. Unused
|
||||
directory entries are signified by inode = 0. By default the filesystem
|
||||
uses ``struct ext4_dir_entry_2`` for directory entries unless the
|
||||
“filetype” feature flag is not set, in which case it uses
|
||||
``struct ext4_dir_entry``.
|
||||
|
||||
The original directory entry format is ``struct ext4_dir_entry``, which
|
||||
is at most 263 bytes long, though on disk you'll need to reference
|
||||
``dirent.rec_len`` to know for sure.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- inode
|
||||
- Number of the inode that this directory entry points to.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- rec\_len
|
||||
- Length of this directory entry. Must be a multiple of 4.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- name\_len
|
||||
- Length of the file name.
|
||||
* - 0x8
|
||||
- char
|
||||
- name[EXT4\_NAME\_LEN]
|
||||
- File name.
|
||||
|
||||
Since file names cannot be longer than 255 bytes, the new directory
|
||||
entry format shortens the rec\_len field and uses the space for a file
|
||||
type flag, probably to avoid having to load every inode during directory
|
||||
tree traversal. This format is ``ext4_dir_entry_2``, which is at most
|
||||
263 bytes long, though on disk you'll need to reference
|
||||
``dirent.rec_len`` to know for sure.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- inode
|
||||
- Number of the inode that this directory entry points to.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- rec\_len
|
||||
- Length of this directory entry.
|
||||
* - 0x6
|
||||
- \_\_u8
|
||||
- name\_len
|
||||
- Length of the file name.
|
||||
* - 0x7
|
||||
- \_\_u8
|
||||
- file\_type
|
||||
- File type code, see ftype_ table below.
|
||||
* - 0x8
|
||||
- char
|
||||
- name[EXT4\_NAME\_LEN]
|
||||
- File name.
|
||||
|
||||
.. _ftype:
|
||||
|
||||
The directory file type is one of the following values:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0
|
||||
- Unknown.
|
||||
* - 0x1
|
||||
- Regular file.
|
||||
* - 0x2
|
||||
- Directory.
|
||||
* - 0x3
|
||||
- Character device file.
|
||||
* - 0x4
|
||||
- Block device file.
|
||||
* - 0x5
|
||||
- FIFO.
|
||||
* - 0x6
|
||||
- Socket.
|
||||
* - 0x7
|
||||
- Symbolic link.
|
||||
|
||||
In order to add checksums to these classic directory blocks, a phony
|
||||
``struct ext4_dir_entry`` is placed at the end of each leaf block to
|
||||
hold the checksum. The directory entry is 12 bytes long. The inode
|
||||
number and name\_len fields are set to zero to fool old software into
|
||||
ignoring an apparently empty directory entry, and the checksum is stored
|
||||
in the place where the name normally goes. The structure is
|
||||
``struct ext4_dir_entry_tail``:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- det\_reserved\_zero1
|
||||
- Inode number, which must be zero.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- det\_rec\_len
|
||||
- Length of this directory entry, which must be 12.
|
||||
* - 0x6
|
||||
- \_\_u8
|
||||
- det\_reserved\_zero2
|
||||
- Length of the file name, which must be zero.
|
||||
* - 0x7
|
||||
- \_\_u8
|
||||
- det\_reserved\_ft
|
||||
- File type, which must be 0xDE.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- det\_checksum
|
||||
- Directory leaf block checksum.
|
||||
|
||||
The leaf directory block checksum is calculated against the FS UUID, the
|
||||
directory's inode number, the directory's inode generation number, and
|
||||
the entire directory entry block up to (but not including) the fake
|
||||
directory entry.
|
||||
|
||||
Hash Tree Directories
|
||||
~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
A linear array of directory entries isn't great for performance, so a
|
||||
new feature was added to ext3 to provide a faster (but peculiar)
|
||||
balanced tree keyed off a hash of the directory entry name. If the
|
||||
EXT4\_INDEX\_FL (0x1000) flag is set in the inode, this directory uses a
|
||||
hashed btree (htree) to organize and find directory entries. For
|
||||
backwards read-only compatibility with ext2, this tree is actually
|
||||
hidden inside the directory file, masquerading as “empty” directory data
|
||||
blocks! It was stated previously that the end of the linear directory
|
||||
entry table was signified with an entry pointing to inode 0; this is
|
||||
(ab)used to fool the old linear-scan algorithm into thinking that the
|
||||
rest of the directory block is empty so that it moves on.
|
||||
|
||||
The root of the tree always lives in the first data block of the
|
||||
directory. By ext2 custom, the '.' and '..' entries must appear at the
|
||||
beginning of this first block, so they are put here as two
|
||||
``struct ext4_dir_entry_2``\ s and not stored in the tree. The rest of
|
||||
the root node contains metadata about the tree and finally a hash->block
|
||||
map to find nodes that are lower in the htree. If
|
||||
``dx_root.info.indirect_levels`` is non-zero then the htree has two
|
||||
levels; the data block pointed to by the root node's map is an interior
|
||||
node, which is indexed by a minor hash. Interior nodes in this tree
|
||||
contains a zeroed out ``struct ext4_dir_entry_2`` followed by a
|
||||
minor\_hash->block map to find leafe nodes. Leaf nodes contain a linear
|
||||
array of all ``struct ext4_dir_entry_2``; all of these entries
|
||||
(presumably) hash to the same value. If there is an overflow, the
|
||||
entries simply overflow into the next leaf node, and the
|
||||
least-significant bit of the hash (in the interior node map) that gets
|
||||
us to this next leaf node is set.
|
||||
|
||||
To traverse the directory as a htree, the code calculates the hash of
|
||||
the desired file name and uses it to find the corresponding block
|
||||
number. If the tree is flat, the block is a linear array of directory
|
||||
entries that can be searched; otherwise, the minor hash of the file name
|
||||
is computed and used against this second block to find the corresponding
|
||||
third block number. That third block number will be a linear array of
|
||||
directory entries.
|
||||
|
||||
To traverse the directory as a linear array (such as the old code does),
|
||||
the code simply reads every data block in the directory. The blocks used
|
||||
for the htree will appear to have no entries (aside from '.' and '..')
|
||||
and so only the leaf nodes will appear to have any interesting content.
|
||||
|
||||
The root of the htree is in ``struct dx_root``, which is the full length
|
||||
of a data block:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- dot.inode
|
||||
- inode number of this directory.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- dot.rec\_len
|
||||
- Length of this record, 12.
|
||||
* - 0x6
|
||||
- u8
|
||||
- dot.name\_len
|
||||
- Length of the name, 1.
|
||||
* - 0x7
|
||||
- u8
|
||||
- dot.file\_type
|
||||
- File type of this entry, 0x2 (directory) (if the feature flag is set).
|
||||
* - 0x8
|
||||
- char
|
||||
- dot.name[4]
|
||||
- “.\\0\\0\\0”
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- dotdot.inode
|
||||
- inode number of parent directory.
|
||||
* - 0x10
|
||||
- \_\_le16
|
||||
- dotdot.rec\_len
|
||||
- block\_size - 12. The record length is long enough to cover all htree
|
||||
data.
|
||||
* - 0x12
|
||||
- u8
|
||||
- dotdot.name\_len
|
||||
- Length of the name, 2.
|
||||
* - 0x13
|
||||
- u8
|
||||
- dotdot.file\_type
|
||||
- File type of this entry, 0x2 (directory) (if the feature flag is set).
|
||||
* - 0x14
|
||||
- char
|
||||
- dotdot\_name[4]
|
||||
- “..\\0\\0”
|
||||
* - 0x18
|
||||
- \_\_le32
|
||||
- struct dx\_root\_info.reserved\_zero
|
||||
- Zero.
|
||||
* - 0x1C
|
||||
- u8
|
||||
- struct dx\_root\_info.hash\_version
|
||||
- Hash type, see dirhash_ table below.
|
||||
* - 0x1D
|
||||
- u8
|
||||
- struct dx\_root\_info.info\_length
|
||||
- Length of the tree information, 0x8.
|
||||
* - 0x1E
|
||||
- u8
|
||||
- struct dx\_root\_info.indirect\_levels
|
||||
- Depth of the htree. Cannot be larger than 3 if the INCOMPAT\_LARGEDIR
|
||||
feature is set; cannot be larger than 2 otherwise.
|
||||
* - 0x1F
|
||||
- u8
|
||||
- struct dx\_root\_info.unused\_flags
|
||||
-
|
||||
* - 0x20
|
||||
- \_\_le16
|
||||
- limit
|
||||
- Maximum number of dx\_entries that can follow this header, plus 1 for
|
||||
the header itself.
|
||||
* - 0x22
|
||||
- \_\_le16
|
||||
- count
|
||||
- Actual number of dx\_entries that follow this header, plus 1 for the
|
||||
header itself.
|
||||
* - 0x24
|
||||
- \_\_le32
|
||||
- block
|
||||
- The block number (within the directory file) that goes with hash=0.
|
||||
* - 0x28
|
||||
- struct dx\_entry
|
||||
- entries[0]
|
||||
- As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
|
||||
|
||||
.. _dirhash:
|
||||
|
||||
The directory hash is one of the following values:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0
|
||||
- Legacy.
|
||||
* - 0x1
|
||||
- Half MD4.
|
||||
* - 0x2
|
||||
- Tea.
|
||||
* - 0x3
|
||||
- Legacy, unsigned.
|
||||
* - 0x4
|
||||
- Half MD4, unsigned.
|
||||
* - 0x5
|
||||
- Tea, unsigned.
|
||||
|
||||
Interior nodes of an htree are recorded as ``struct dx_node``, which is
|
||||
also the full length of a data block:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- fake.inode
|
||||
- Zero, to make it look like this entry is not in use.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- fake.rec\_len
|
||||
- The size of the block, in order to hide all of the dx\_node data.
|
||||
* - 0x6
|
||||
- u8
|
||||
- name\_len
|
||||
- Zero. There is no name for this “unused” directory entry.
|
||||
* - 0x7
|
||||
- u8
|
||||
- file\_type
|
||||
- Zero. There is no file type for this “unused” directory entry.
|
||||
* - 0x8
|
||||
- \_\_le16
|
||||
- limit
|
||||
- Maximum number of dx\_entries that can follow this header, plus 1 for
|
||||
the header itself.
|
||||
* - 0xA
|
||||
- \_\_le16
|
||||
- count
|
||||
- Actual number of dx\_entries that follow this header, plus 1 for the
|
||||
header itself.
|
||||
* - 0xE
|
||||
- \_\_le32
|
||||
- block
|
||||
- The block number (within the directory file) that goes with the lowest
|
||||
hash value of this block. This value is stored in the parent block.
|
||||
* - 0x12
|
||||
- struct dx\_entry
|
||||
- entries[0]
|
||||
- As many 8-byte ``struct dx_entry`` as fits in the rest of the data block.
|
||||
|
||||
The hash maps that exist in both ``struct dx_root`` and
|
||||
``struct dx_node`` are recorded as ``struct dx_entry``, which is 8 bytes
|
||||
long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- hash
|
||||
- Hash code.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- block
|
||||
- Block number (within the directory file, not filesystem blocks) of the
|
||||
next node in the htree.
|
||||
|
||||
(If you think this is all quite clever and peculiar, so does the
|
||||
author.)
|
||||
|
||||
If metadata checksums are enabled, the last 8 bytes of the directory
|
||||
block (precisely the length of one dx\_entry) are used to store a
|
||||
``struct dx_tail``, which contains the checksum. The ``limit`` and
|
||||
``count`` entries in the dx\_root/dx\_node structures are adjusted as
|
||||
necessary to fit the dx\_tail into the block. If there is no space for
|
||||
the dx\_tail, the user is notified to run e2fsck -D to rebuild the
|
||||
directory index (which will ensure that there's space for the checksum.
|
||||
The dx\_tail structure is 8 bytes long and looks like this:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- u32
|
||||
- dt\_reserved
|
||||
- Zero.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- dt\_checksum
|
||||
- Checksum of the htree directory block.
|
||||
|
||||
The checksum is calculated against the FS UUID, the htree index header
|
||||
(dx\_root or dx\_node), all of the htree indices (dx\_entry) that are in
|
||||
use, and the tail block (dx\_tail).
|
12
Documentation/filesystems/ext4/ondisk/dynamic.rst
Normal file
12
Documentation/filesystems/ext4/ondisk/dynamic.rst
Normal file
@ -0,0 +1,12 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Dynamic Structures
|
||||
==================
|
||||
|
||||
Dynamic metadata are created on the fly when files and blocks are
|
||||
allocated to files.
|
||||
|
||||
.. include:: inodes.rst
|
||||
.. include:: ifork.rst
|
||||
.. include:: directory.rst
|
||||
.. include:: attributes.rst
|
18
Documentation/filesystems/ext4/ondisk/eainode.rst
Normal file
18
Documentation/filesystems/ext4/ondisk/eainode.rst
Normal file
@ -0,0 +1,18 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Large Extended Attribute Values
|
||||
-------------------------------
|
||||
|
||||
To enable ext4 to store extended attribute values that do not fit in the
|
||||
inode or in the single extended attribute block attached to an inode,
|
||||
the EA\_INODE feature allows us to store the value in the data blocks of
|
||||
a regular file inode. This “EA inode” is linked only from the extended
|
||||
attribute name index and must not appear in a directory entry. The
|
||||
inode's i\_atime field is used to store a checksum of the xattr value;
|
||||
and i\_ctime/i\_version store a 64-bit reference count, which enables
|
||||
sharing of large xattr values between multiple owning inodes. For
|
||||
backward compatibility with older versions of this feature, the
|
||||
i\_mtime/i\_generation *may* store a back-reference to the inode number
|
||||
and i\_generation of the **one** owning inode (in cases where the EA
|
||||
inode is not referenced by multiple inodes) to verify that the EA inode
|
||||
is the correct one being accessed.
|
13
Documentation/filesystems/ext4/ondisk/globals.rst
Normal file
13
Documentation/filesystems/ext4/ondisk/globals.rst
Normal file
@ -0,0 +1,13 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Global Structures
|
||||
=================
|
||||
|
||||
The filesystem is sharded into a number of block groups, each of which
|
||||
have static metadata at fixed locations.
|
||||
|
||||
.. include:: super.rst
|
||||
.. include:: group_descr.rst
|
||||
.. include:: bitmaps.rst
|
||||
.. include:: mmp.rst
|
||||
.. include:: journal.rst
|
170
Documentation/filesystems/ext4/ondisk/group_descr.rst
Normal file
170
Documentation/filesystems/ext4/ondisk/group_descr.rst
Normal file
@ -0,0 +1,170 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Block Group Descriptors
|
||||
-----------------------
|
||||
|
||||
Each block group on the filesystem has one of these descriptors
|
||||
associated with it. As noted in the Layout section above, the group
|
||||
descriptors (if present) are the second item in the block group. The
|
||||
standard configuration is for each block group to contain a full copy of
|
||||
the block group descriptor table unless the sparse\_super feature flag
|
||||
is set.
|
||||
|
||||
Notice how the group descriptor records the location of both bitmaps and
|
||||
the inode table (i.e. they can float). This means that within a block
|
||||
group, the only data structures with fixed locations are the superblock
|
||||
and the group descriptor table. The flex\_bg mechanism uses this
|
||||
property to group several block groups into a flex group and lay out all
|
||||
of the groups' bitmaps and inode tables into one long run in the first
|
||||
group of the flex group.
|
||||
|
||||
If the meta\_bg feature flag is set, then several block groups are
|
||||
grouped together into a meta group. Note that in the meta\_bg case,
|
||||
however, the first and last two block groups within the larger meta
|
||||
group contain only group descriptors for the groups inside the meta
|
||||
group.
|
||||
|
||||
flex\_bg and meta\_bg do not appear to be mutually exclusive features.
|
||||
|
||||
In ext2, ext3, and ext4 (when the 64bit feature is not enabled), the
|
||||
block group descriptor was only 32 bytes long and therefore ends at
|
||||
bg\_checksum. On an ext4 filesystem with the 64bit feature enabled, the
|
||||
block group descriptor expands to at least the 64 bytes described below;
|
||||
the size is stored in the superblock.
|
||||
|
||||
If gdt\_csum is set and metadata\_csum is not set, the block group
|
||||
checksum is the crc16 of the FS UUID, the group number, and the group
|
||||
descriptor structure. If metadata\_csum is set, then the block group
|
||||
checksum is the lower 16 bits of the checksum of the FS UUID, the group
|
||||
number, and the group descriptor structure. Both block and inode bitmap
|
||||
checksums are calculated against the FS UUID, the group number, and the
|
||||
entire bitmap.
|
||||
|
||||
The block group descriptor is laid out in ``struct ext4_group_desc``.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- bg\_block\_bitmap\_lo
|
||||
- Lower 32-bits of location of block bitmap.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- bg\_inode\_bitmap\_lo
|
||||
- Lower 32-bits of location of inode bitmap.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- bg\_inode\_table\_lo
|
||||
- Lower 32-bits of location of inode table.
|
||||
* - 0xC
|
||||
- \_\_le16
|
||||
- bg\_free\_blocks\_count\_lo
|
||||
- Lower 16-bits of free block count.
|
||||
* - 0xE
|
||||
- \_\_le16
|
||||
- bg\_free\_inodes\_count\_lo
|
||||
- Lower 16-bits of free inode count.
|
||||
* - 0x10
|
||||
- \_\_le16
|
||||
- bg\_used\_dirs\_count\_lo
|
||||
- Lower 16-bits of directory count.
|
||||
* - 0x12
|
||||
- \_\_le16
|
||||
- bg\_flags
|
||||
- Block group flags. See the bgflags_ table below.
|
||||
* - 0x14
|
||||
- \_\_le32
|
||||
- bg\_exclude\_bitmap\_lo
|
||||
- Lower 32-bits of location of snapshot exclusion bitmap.
|
||||
* - 0x18
|
||||
- \_\_le16
|
||||
- bg\_block\_bitmap\_csum\_lo
|
||||
- Lower 16-bits of the block bitmap checksum.
|
||||
* - 0x1A
|
||||
- \_\_le16
|
||||
- bg\_inode\_bitmap\_csum\_lo
|
||||
- Lower 16-bits of the inode bitmap checksum.
|
||||
* - 0x1C
|
||||
- \_\_le16
|
||||
- bg\_itable\_unused\_lo
|
||||
- Lower 16-bits of unused inode count. If set, we needn't scan past the
|
||||
``(sb.s_inodes_per_group - gdt.bg_itable_unused)``\ th entry in the
|
||||
inode table for this group.
|
||||
* - 0x1E
|
||||
- \_\_le16
|
||||
- bg\_checksum
|
||||
- Group descriptor checksum; crc16(sb\_uuid+group+desc) if the
|
||||
RO\_COMPAT\_GDT\_CSUM feature is set, or crc32c(sb\_uuid+group\_desc) &
|
||||
0xFFFF if the RO\_COMPAT\_METADATA\_CSUM feature is set.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- These fields only exist if the 64bit feature is enabled and s_desc_size
|
||||
> 32.
|
||||
* - 0x20
|
||||
- \_\_le32
|
||||
- bg\_block\_bitmap\_hi
|
||||
- Upper 32-bits of location of block bitmap.
|
||||
* - 0x24
|
||||
- \_\_le32
|
||||
- bg\_inode\_bitmap\_hi
|
||||
- Upper 32-bits of location of inodes bitmap.
|
||||
* - 0x28
|
||||
- \_\_le32
|
||||
- bg\_inode\_table\_hi
|
||||
- Upper 32-bits of location of inodes table.
|
||||
* - 0x2C
|
||||
- \_\_le16
|
||||
- bg\_free\_blocks\_count\_hi
|
||||
- Upper 16-bits of free block count.
|
||||
* - 0x2E
|
||||
- \_\_le16
|
||||
- bg\_free\_inodes\_count\_hi
|
||||
- Upper 16-bits of free inode count.
|
||||
* - 0x30
|
||||
- \_\_le16
|
||||
- bg\_used\_dirs\_count\_hi
|
||||
- Upper 16-bits of directory count.
|
||||
* - 0x32
|
||||
- \_\_le16
|
||||
- bg\_itable\_unused\_hi
|
||||
- Upper 16-bits of unused inode count.
|
||||
* - 0x34
|
||||
- \_\_le32
|
||||
- bg\_exclude\_bitmap\_hi
|
||||
- Upper 32-bits of location of snapshot exclusion bitmap.
|
||||
* - 0x38
|
||||
- \_\_le16
|
||||
- bg\_block\_bitmap\_csum\_hi
|
||||
- Upper 16-bits of the block bitmap checksum.
|
||||
* - 0x3A
|
||||
- \_\_le16
|
||||
- bg\_inode\_bitmap\_csum\_hi
|
||||
- Upper 16-bits of the inode bitmap checksum.
|
||||
* - 0x3C
|
||||
- \_\_u32
|
||||
- bg\_reserved
|
||||
- Padding to 64 bytes.
|
||||
|
||||
.. _bgflags:
|
||||
|
||||
Block group flags can be any combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- inode table and bitmap are not initialized (EXT4\_BG\_INODE\_UNINIT).
|
||||
* - 0x2
|
||||
- block bitmap is not initialized (EXT4\_BG\_BLOCK\_UNINIT).
|
||||
* - 0x4
|
||||
- inode table is zeroed (EXT4\_BG\_INODE\_ZEROED).
|
194
Documentation/filesystems/ext4/ondisk/ifork.rst
Normal file
194
Documentation/filesystems/ext4/ondisk/ifork.rst
Normal file
@ -0,0 +1,194 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
The Contents of inode.i\_block
|
||||
------------------------------
|
||||
|
||||
Depending on the type of file an inode describes, the 60 bytes of
|
||||
storage in ``inode.i_block`` can be used in different ways. In general,
|
||||
regular files and directories will use it for file block indexing
|
||||
information, and special files will use it for special purposes.
|
||||
|
||||
Symbolic Links
|
||||
~~~~~~~~~~~~~~
|
||||
|
||||
The target of a symbolic link will be stored in this field if the target
|
||||
string is less than 60 bytes long. Otherwise, either extents or block
|
||||
maps will be used to allocate data blocks to store the link target.
|
||||
|
||||
Direct/Indirect Block Addressing
|
||||
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
|
||||
|
||||
In ext2/3, file block numbers were mapped to logical block numbers by
|
||||
means of an (up to) three level 1-1 block map. To find the logical block
|
||||
that stores a particular file block, the code would navigate through
|
||||
this increasingly complicated structure. Notice that there is neither a
|
||||
magic number nor a checksum to provide any level of confidence that the
|
||||
block isn't full of garbage.
|
||||
|
||||
.. ifconfig:: builder != 'latex'
|
||||
|
||||
.. include:: blockmap.rst
|
||||
|
||||
.. ifconfig:: builder == 'latex'
|
||||
|
||||
[Table omitted because LaTeX doesn't support nested tables.]
|
||||
|
||||
Note that with this block mapping scheme, it is necessary to fill out a
|
||||
lot of mapping data even for a large contiguous file! This inefficiency
|
||||
led to the creation of the extent mapping scheme, discussed below.
|
||||
|
||||
Notice also that a file using this mapping scheme cannot be placed
|
||||
higher than 2^32 blocks.
|
||||
|
||||
Extent Tree
|
||||
~~~~~~~~~~~
|
||||
|
||||
In ext4, the file to logical block map has been replaced with an extent
|
||||
tree. Under the old scheme, allocating a contiguous run of 1,000 blocks
|
||||
requires an indirect block to map all 1,000 entries; with extents, the
|
||||
mapping is reduced to a single ``struct ext4_extent`` with
|
||||
``ee_len = 1000``. If flex\_bg is enabled, it is possible to allocate
|
||||
very large files with a single extent, at a considerable reduction in
|
||||
metadata block use, and some improvement in disk efficiency. The inode
|
||||
must have the extents flag (0x80000) flag set for this feature to be in
|
||||
use.
|
||||
|
||||
Extents are arranged as a tree. Each node of the tree begins with a
|
||||
``struct ext4_extent_header``. If the node is an interior node
|
||||
(``eh.eh_depth`` > 0), the header is followed by ``eh.eh_entries``
|
||||
instances of ``struct ext4_extent_idx``; each of these index entries
|
||||
points to a block containing more nodes in the extent tree. If the node
|
||||
is a leaf node (``eh.eh_depth == 0``), then the header is followed by
|
||||
``eh.eh_entries`` instances of ``struct ext4_extent``; these instances
|
||||
point to the file's data blocks. The root node of the extent tree is
|
||||
stored in ``inode.i_block``, which allows for the first four extents to
|
||||
be recorded without the use of extra metadata blocks.
|
||||
|
||||
The extent tree header is recorded in ``struct ext4_extent_header``,
|
||||
which is 12 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- eh\_magic
|
||||
- Magic number, 0xF30A.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- eh\_entries
|
||||
- Number of valid entries following the header.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- eh\_max
|
||||
- Maximum number of entries that could follow the header.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- eh\_depth
|
||||
- Depth of this extent node in the extent tree. 0 = this extent node
|
||||
points to data blocks; otherwise, this extent node points to other
|
||||
extent nodes. The extent tree can be at most 5 levels deep: a logical
|
||||
block number can be at most ``2^32``, and the smallest ``n`` that
|
||||
satisfies ``4*(((blocksize - 12)/12)^n) >= 2^32`` is 5.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- eh\_generation
|
||||
- Generation of the tree. (Used by Lustre, but not standard ext4).
|
||||
|
||||
Internal nodes of the extent tree, also known as index nodes, are
|
||||
recorded as ``struct ext4_extent_idx``, and are 12 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- ei\_block
|
||||
- This index node covers file blocks from 'block' onward.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- ei\_leaf\_lo
|
||||
- Lower 32-bits of the block number of the extent node that is the next
|
||||
level lower in the tree. The tree node pointed to can be either another
|
||||
internal node or a leaf node, described below.
|
||||
* - 0x8
|
||||
- \_\_le16
|
||||
- ei\_leaf\_hi
|
||||
- Upper 16-bits of the previous field.
|
||||
* - 0xA
|
||||
- \_\_u16
|
||||
- ei\_unused
|
||||
-
|
||||
|
||||
Leaf nodes of the extent tree are recorded as ``struct ext4_extent``,
|
||||
and are also 12 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- ee\_block
|
||||
- First file block number that this extent covers.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- ee\_len
|
||||
- Number of blocks covered by extent. If the value of this field is <=
|
||||
32768, the extent is initialized. If the value of the field is > 32768,
|
||||
the extent is uninitialized and the actual extent length is ``ee_len`` -
|
||||
32768. Therefore, the maximum length of a initialized extent is 32768
|
||||
blocks, and the maximum length of an uninitialized extent is 32767.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- ee\_start\_hi
|
||||
- Upper 16-bits of the block number to which this extent points.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- ee\_start\_lo
|
||||
- Lower 32-bits of the block number to which this extent points.
|
||||
|
||||
Prior to the introduction of metadata checksums, the extent header +
|
||||
extent entries always left at least 4 bytes of unallocated space at the
|
||||
end of each extent tree data block (because (2^x % 12) >= 4). Therefore,
|
||||
the 32-bit checksum is inserted into this space. The 4 extents in the
|
||||
inode do not need checksumming, since the inode is already checksummed.
|
||||
The checksum is calculated against the FS UUID, the inode number, the
|
||||
inode generation, and the entire extent block leading up to (but not
|
||||
including) the checksum itself.
|
||||
|
||||
``struct ext4_extent_tail`` is 4 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- eb\_checksum
|
||||
- Checksum of the extent block, crc32c(uuid+inum+igeneration+extentblock)
|
||||
|
||||
Inline Data
|
||||
~~~~~~~~~~~
|
||||
|
||||
If the inline data feature is enabled for the filesystem and the flag is
|
||||
set for the inode, it is possible that the first 60 bytes of the file
|
||||
data are stored here.
|
9
Documentation/filesystems/ext4/ondisk/index.rst
Normal file
9
Documentation/filesystems/ext4/ondisk/index.rst
Normal file
@ -0,0 +1,9 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
==============================
|
||||
Data Structures and Algorithms
|
||||
==============================
|
||||
.. include:: about.rst
|
||||
.. include:: overview.rst
|
||||
.. include:: globals.rst
|
||||
.. include:: dynamic.rst
|
37
Documentation/filesystems/ext4/ondisk/inlinedata.rst
Normal file
37
Documentation/filesystems/ext4/ondisk/inlinedata.rst
Normal file
@ -0,0 +1,37 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Inline Data
|
||||
-----------
|
||||
|
||||
The inline data feature was designed to handle the case that a file's
|
||||
data is so tiny that it readily fits inside the inode, which
|
||||
(theoretically) reduces disk block consumption and reduces seeks. If the
|
||||
file is smaller than 60 bytes, then the data are stored inline in
|
||||
``inode.i_block``. If the rest of the file would fit inside the extended
|
||||
attribute space, then it might be found as an extended attribute
|
||||
“system.data” within the inode body (“ibody EA”). This of course
|
||||
constrains the amount of extended attributes one can attach to an inode.
|
||||
If the data size increases beyond i\_block + ibody EA, a regular block
|
||||
is allocated and the contents moved to that block.
|
||||
|
||||
Pending a change to compact the extended attribute key used to store
|
||||
inline data, one ought to be able to store 160 bytes of data in a
|
||||
256-byte inode (as of June 2015, when i\_extra\_isize is 28). Prior to
|
||||
that, the limit was 156 bytes due to inefficient use of inode space.
|
||||
|
||||
The inline data feature requires the presence of an extended attribute
|
||||
for “system.data”, even if the attribute value is zero length.
|
||||
|
||||
Inline Directories
|
||||
~~~~~~~~~~~~~~~~~~
|
||||
|
||||
The first four bytes of i\_block are the inode number of the parent
|
||||
directory. Following that is a 56-byte space for an array of directory
|
||||
entries; see ``struct ext4_dir_entry``. If there is a “system.data”
|
||||
attribute in the inode body, the EA value is an array of
|
||||
``struct ext4_dir_entry`` as well. Note that for inline directories, the
|
||||
i\_block and EA space are treated as separate dirent blocks; directory
|
||||
entries cannot span the two.
|
||||
|
||||
Inline directory entries are not checksummed, as the inode checksum
|
||||
should protect all inline data contents.
|
575
Documentation/filesystems/ext4/ondisk/inodes.rst
Normal file
575
Documentation/filesystems/ext4/ondisk/inodes.rst
Normal file
@ -0,0 +1,575 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Index Nodes
|
||||
-----------
|
||||
|
||||
In a regular UNIX filesystem, the inode stores all the metadata
|
||||
pertaining to the file (time stamps, block maps, extended attributes,
|
||||
etc), not the directory entry. To find the information associated with a
|
||||
file, one must traverse the directory files to find the directory entry
|
||||
associated with a file, then load the inode to find the metadata for
|
||||
that file. ext4 appears to cheat (for performance reasons) a little bit
|
||||
by storing a copy of the file type (normally stored in the inode) in the
|
||||
directory entry. (Compare all this to FAT, which stores all the file
|
||||
information directly in the directory entry, but does not support hard
|
||||
links and is in general more seek-happy than ext4 due to its simpler
|
||||
block allocator and extensive use of linked lists.)
|
||||
|
||||
The inode table is a linear array of ``struct ext4_inode``. The table is
|
||||
sized to have enough blocks to store at least
|
||||
``sb.s_inode_size * sb.s_inodes_per_group`` bytes. The number of the
|
||||
block group containing an inode can be calculated as
|
||||
``(inode_number - 1) / sb.s_inodes_per_group``, and the offset into the
|
||||
group's table is ``(inode_number - 1) % sb.s_inodes_per_group``. There
|
||||
is no inode 0.
|
||||
|
||||
The inode checksum is calculated against the FS UUID, the inode number,
|
||||
and the inode structure itself.
|
||||
|
||||
The inode table entry is laid out in ``struct ext4_inode``.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- i\_mode
|
||||
- File mode. See the table i_mode_ below.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- i\_uid
|
||||
- Lower 16-bits of Owner UID.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- i\_size\_lo
|
||||
- Lower 32-bits of size in bytes.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- i\_atime
|
||||
- Last access time, in seconds since the epoch. However, if the EA\_INODE
|
||||
inode flag is set, this inode stores an extended attribute value and
|
||||
this field contains the checksum of the value.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- i\_ctime
|
||||
- Last inode change time, in seconds since the epoch. However, if the
|
||||
EA\_INODE inode flag is set, this inode stores an extended attribute
|
||||
value and this field contains the lower 32 bits of the attribute value's
|
||||
reference count.
|
||||
* - 0x10
|
||||
- \_\_le32
|
||||
- i\_mtime
|
||||
- Last data modification time, in seconds since the epoch. However, if the
|
||||
EA\_INODE inode flag is set, this inode stores an extended attribute
|
||||
value and this field contains the number of the inode that owns the
|
||||
extended attribute.
|
||||
* - 0x14
|
||||
- \_\_le32
|
||||
- i\_dtime
|
||||
- Deletion Time, in seconds since the epoch.
|
||||
* - 0x18
|
||||
- \_\_le16
|
||||
- i\_gid
|
||||
- Lower 16-bits of GID.
|
||||
* - 0x1A
|
||||
- \_\_le16
|
||||
- i\_links\_count
|
||||
- Hard link count. Normally, ext4 does not permit an inode to have more
|
||||
than 65,000 hard links. This applies to files as well as directories,
|
||||
which means that there cannot be more than 64,998 subdirectories in a
|
||||
directory (each subdirectory's '..' entry counts as a hard link, as does
|
||||
the '.' entry in the directory itself). With the DIR\_NLINK feature
|
||||
enabled, ext4 supports more than 64,998 subdirectories by setting this
|
||||
field to 1 to indicate that the number of hard links is not known.
|
||||
* - 0x1C
|
||||
- \_\_le32
|
||||
- i\_blocks\_lo
|
||||
- Lower 32-bits of “block” count. If the huge\_file feature flag is not
|
||||
set on the filesystem, the file consumes ``i_blocks_lo`` 512-byte blocks
|
||||
on disk. If huge\_file is set and EXT4\_HUGE\_FILE\_FL is NOT set in
|
||||
``inode.i_flags``, then the file consumes ``i_blocks_lo + (i_blocks_hi
|
||||
<< 32)`` 512-byte blocks on disk. If huge\_file is set and
|
||||
EXT4\_HUGE\_FILE\_FL IS set in ``inode.i_flags``, then this file
|
||||
consumes (``i_blocks_lo + i_blocks_hi`` << 32) filesystem blocks on
|
||||
disk.
|
||||
* - 0x20
|
||||
- \_\_le32
|
||||
- i\_flags
|
||||
- Inode flags. See the table i_flags_ below.
|
||||
* - 0x24
|
||||
- 4 bytes
|
||||
- i\_osd1
|
||||
- See the table i_osd1_ for more details.
|
||||
* - 0x28
|
||||
- 60 bytes
|
||||
- i\_block[EXT4\_N\_BLOCKS=15]
|
||||
- Block map or extent tree. See the section “The Contents of inode.i\_block”.
|
||||
* - 0x64
|
||||
- \_\_le32
|
||||
- i\_generation
|
||||
- File version (for NFS).
|
||||
* - 0x68
|
||||
- \_\_le32
|
||||
- i\_file\_acl\_lo
|
||||
- Lower 32-bits of extended attribute block. ACLs are of course one of
|
||||
many possible extended attributes; I think the name of this field is a
|
||||
result of the first use of extended attributes being for ACLs.
|
||||
* - 0x6C
|
||||
- \_\_le32
|
||||
- i\_size\_high / i\_dir\_acl
|
||||
- Upper 32-bits of file/directory size. In ext2/3 this field was named
|
||||
i\_dir\_acl, though it was usually set to zero and never used.
|
||||
* - 0x70
|
||||
- \_\_le32
|
||||
- i\_obso\_faddr
|
||||
- (Obsolete) fragment address.
|
||||
* - 0x74
|
||||
- 12 bytes
|
||||
- i\_osd2
|
||||
- See the table i_osd2_ for more details.
|
||||
* - 0x80
|
||||
- \_\_le16
|
||||
- i\_extra\_isize
|
||||
- Size of this inode - 128. Alternately, the size of the extended inode
|
||||
fields beyond the original ext2 inode, including this field.
|
||||
* - 0x82
|
||||
- \_\_le16
|
||||
- i\_checksum\_hi
|
||||
- Upper 16-bits of the inode checksum.
|
||||
* - 0x84
|
||||
- \_\_le32
|
||||
- i\_ctime\_extra
|
||||
- Extra change time bits. This provides sub-second precision. See Inode
|
||||
Timestamps section.
|
||||
* - 0x88
|
||||
- \_\_le32
|
||||
- i\_mtime\_extra
|
||||
- Extra modification time bits. This provides sub-second precision.
|
||||
* - 0x8C
|
||||
- \_\_le32
|
||||
- i\_atime\_extra
|
||||
- Extra access time bits. This provides sub-second precision.
|
||||
* - 0x90
|
||||
- \_\_le32
|
||||
- i\_crtime
|
||||
- File creation time, in seconds since the epoch.
|
||||
* - 0x94
|
||||
- \_\_le32
|
||||
- i\_crtime\_extra
|
||||
- Extra file creation time bits. This provides sub-second precision.
|
||||
* - 0x98
|
||||
- \_\_le32
|
||||
- i\_version\_hi
|
||||
- Upper 32-bits for version number.
|
||||
* - 0x9C
|
||||
- \_\_le32
|
||||
- i\_projid
|
||||
- Project ID.
|
||||
|
||||
.. _i_mode:
|
||||
|
||||
The ``i_mode`` value is a combination of the following flags:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- S\_IXOTH (Others may execute)
|
||||
* - 0x2
|
||||
- S\_IWOTH (Others may write)
|
||||
* - 0x4
|
||||
- S\_IROTH (Others may read)
|
||||
* - 0x8
|
||||
- S\_IXGRP (Group members may execute)
|
||||
* - 0x10
|
||||
- S\_IWGRP (Group members may write)
|
||||
* - 0x20
|
||||
- S\_IRGRP (Group members may read)
|
||||
* - 0x40
|
||||
- S\_IXUSR (Owner may execute)
|
||||
* - 0x80
|
||||
- S\_IWUSR (Owner may write)
|
||||
* - 0x100
|
||||
- S\_IRUSR (Owner may read)
|
||||
* - 0x200
|
||||
- S\_ISVTX (Sticky bit)
|
||||
* - 0x400
|
||||
- S\_ISGID (Set GID)
|
||||
* - 0x800
|
||||
- S\_ISUID (Set UID)
|
||||
* -
|
||||
- These are mutually-exclusive file types:
|
||||
* - 0x1000
|
||||
- S\_IFIFO (FIFO)
|
||||
* - 0x2000
|
||||
- S\_IFCHR (Character device)
|
||||
* - 0x4000
|
||||
- S\_IFDIR (Directory)
|
||||
* - 0x6000
|
||||
- S\_IFBLK (Block device)
|
||||
* - 0x8000
|
||||
- S\_IFREG (Regular file)
|
||||
* - 0xA000
|
||||
- S\_IFLNK (Symbolic link)
|
||||
* - 0xC000
|
||||
- S\_IFSOCK (Socket)
|
||||
|
||||
.. _i_flags:
|
||||
|
||||
The ``i_flags`` field is a combination of these values:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- This file requires secure deletion (EXT4\_SECRM\_FL). (not implemented)
|
||||
* - 0x2
|
||||
- This file should be preserved, should undeletion be desired
|
||||
(EXT4\_UNRM\_FL). (not implemented)
|
||||
* - 0x4
|
||||
- File is compressed (EXT4\_COMPR\_FL). (not really implemented)
|
||||
* - 0x8
|
||||
- All writes to the file must be synchronous (EXT4\_SYNC\_FL).
|
||||
* - 0x10
|
||||
- File is immutable (EXT4\_IMMUTABLE\_FL).
|
||||
* - 0x20
|
||||
- File can only be appended (EXT4\_APPEND\_FL).
|
||||
* - 0x40
|
||||
- The dump(1) utility should not dump this file (EXT4\_NODUMP\_FL).
|
||||
* - 0x80
|
||||
- Do not update access time (EXT4\_NOATIME\_FL).
|
||||
* - 0x100
|
||||
- Dirty compressed file (EXT4\_DIRTY\_FL). (not used)
|
||||
* - 0x200
|
||||
- File has one or more compressed clusters (EXT4\_COMPRBLK\_FL). (not used)
|
||||
* - 0x400
|
||||
- Do not compress file (EXT4\_NOCOMPR\_FL). (not used)
|
||||
* - 0x800
|
||||
- Encrypted inode (EXT4\_ENCRYPT\_FL). This bit value previously was
|
||||
EXT4\_ECOMPR\_FL (compression error), which was never used.
|
||||
* - 0x1000
|
||||
- Directory has hashed indexes (EXT4\_INDEX\_FL).
|
||||
* - 0x2000
|
||||
- AFS magic directory (EXT4\_IMAGIC\_FL).
|
||||
* - 0x4000
|
||||
- File data must always be written through the journal
|
||||
(EXT4\_JOURNAL\_DATA\_FL).
|
||||
* - 0x8000
|
||||
- File tail should not be merged (EXT4\_NOTAIL\_FL). (not used by ext4)
|
||||
* - 0x10000
|
||||
- All directory entry data should be written synchronously (see
|
||||
``dirsync``) (EXT4\_DIRSYNC\_FL).
|
||||
* - 0x20000
|
||||
- Top of directory hierarchy (EXT4\_TOPDIR\_FL).
|
||||
* - 0x40000
|
||||
- This is a huge file (EXT4\_HUGE\_FILE\_FL).
|
||||
* - 0x80000
|
||||
- Inode uses extents (EXT4\_EXTENTS\_FL).
|
||||
* - 0x200000
|
||||
- Inode stores a large extended attribute value in its data blocks
|
||||
(EXT4\_EA\_INODE\_FL).
|
||||
* - 0x400000
|
||||
- This file has blocks allocated past EOF (EXT4\_EOFBLOCKS\_FL).
|
||||
(deprecated)
|
||||
* - 0x01000000
|
||||
- Inode is a snapshot (``EXT4_SNAPFILE_FL``). (not in mainline)
|
||||
* - 0x04000000
|
||||
- Snapshot is being deleted (``EXT4_SNAPFILE_DELETED_FL``). (not in
|
||||
mainline)
|
||||
* - 0x08000000
|
||||
- Snapshot shrink has completed (``EXT4_SNAPFILE_SHRUNK_FL``). (not in
|
||||
mainline)
|
||||
* - 0x10000000
|
||||
- Inode has inline data (EXT4\_INLINE\_DATA\_FL).
|
||||
* - 0x20000000
|
||||
- Create children with the same project ID (EXT4\_PROJINHERIT\_FL).
|
||||
* - 0x80000000
|
||||
- Reserved for ext4 library (EXT4\_RESERVED\_FL).
|
||||
* -
|
||||
- Aggregate flags:
|
||||
* - 0x4BDFFF
|
||||
- User-visible flags.
|
||||
* - 0x4B80FF
|
||||
- User-modifiable flags. Note that while EXT4\_JOURNAL\_DATA\_FL and
|
||||
EXT4\_EXTENTS\_FL can be set with setattr, they are not in the kernel's
|
||||
EXT4\_FL\_USER\_MODIFIABLE mask, since it needs to handle the setting of
|
||||
these flags in a special manner and they are masked out of the set of
|
||||
flags that are saved directly to i\_flags.
|
||||
|
||||
.. _i_osd1:
|
||||
|
||||
The ``osd1`` field has multiple meanings depending on the creator:
|
||||
|
||||
Linux:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- l\_i\_version
|
||||
- Inode version. However, if the EA\_INODE inode flag is set, this inode
|
||||
stores an extended attribute value and this field contains the upper 32
|
||||
bits of the attribute value's reference count.
|
||||
|
||||
Hurd:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- h\_i\_translator
|
||||
- ??
|
||||
|
||||
Masix:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- m\_i\_reserved
|
||||
- ??
|
||||
|
||||
.. _i_osd2:
|
||||
|
||||
The ``osd2`` field has multiple meanings depending on the filesystem creator:
|
||||
|
||||
Linux:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- l\_i\_blocks\_high
|
||||
- Upper 16-bits of the block count. Please see the note attached to
|
||||
i\_blocks\_lo.
|
||||
* - 0x2
|
||||
- \_\_le16
|
||||
- l\_i\_file\_acl\_high
|
||||
- Upper 16-bits of the extended attribute block (historically, the file
|
||||
ACL location). See the Extended Attributes section below.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- l\_i\_uid\_high
|
||||
- Upper 16-bits of the Owner UID.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- l\_i\_gid\_high
|
||||
- Upper 16-bits of the GID.
|
||||
* - 0x8
|
||||
- \_\_le16
|
||||
- l\_i\_checksum\_lo
|
||||
- Lower 16-bits of the inode checksum.
|
||||
* - 0xA
|
||||
- \_\_le16
|
||||
- l\_i\_reserved
|
||||
- Unused.
|
||||
|
||||
Hurd:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- h\_i\_reserved1
|
||||
- ??
|
||||
* - 0x2
|
||||
- \_\_u16
|
||||
- h\_i\_mode\_high
|
||||
- Upper 16-bits of the file mode.
|
||||
* - 0x4
|
||||
- \_\_le16
|
||||
- h\_i\_uid\_high
|
||||
- Upper 16-bits of the Owner UID.
|
||||
* - 0x6
|
||||
- \_\_le16
|
||||
- h\_i\_gid\_high
|
||||
- Upper 16-bits of the GID.
|
||||
* - 0x8
|
||||
- \_\_u32
|
||||
- h\_i\_author
|
||||
- Author code?
|
||||
|
||||
Masix:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le16
|
||||
- h\_i\_reserved1
|
||||
- ??
|
||||
* - 0x2
|
||||
- \_\_u16
|
||||
- m\_i\_file\_acl\_high
|
||||
- Upper 16-bits of the extended attribute block (historically, the file
|
||||
ACL location).
|
||||
* - 0x4
|
||||
- \_\_u32
|
||||
- m\_i\_reserved2[2]
|
||||
- ??
|
||||
|
||||
Inode Size
|
||||
~~~~~~~~~~
|
||||
|
||||
In ext2 and ext3, the inode structure size was fixed at 128 bytes
|
||||
(``EXT2_GOOD_OLD_INODE_SIZE``) and each inode had a disk record size of
|
||||
128 bytes. Starting with ext4, it is possible to allocate a larger
|
||||
on-disk inode at format time for all inodes in the filesystem to provide
|
||||
space beyond the end of the original ext2 inode. The on-disk inode
|
||||
record size is recorded in the superblock as ``s_inode_size``. The
|
||||
number of bytes actually used by struct ext4\_inode beyond the original
|
||||
128-byte ext2 inode is recorded in the ``i_extra_isize`` field for each
|
||||
inode, which allows struct ext4\_inode to grow for a new kernel without
|
||||
having to upgrade all of the on-disk inodes. Access to fields beyond
|
||||
EXT2\_GOOD\_OLD\_INODE\_SIZE should be verified to be within
|
||||
``i_extra_isize``. By default, ext4 inode records are 256 bytes, and (as
|
||||
of October 2013) the inode structure is 156 bytes
|
||||
(``i_extra_isize = 28``). The extra space between the end of the inode
|
||||
structure and the end of the inode record can be used to store extended
|
||||
attributes. Each inode record can be as large as the filesystem block
|
||||
size, though this is not terribly efficient.
|
||||
|
||||
Finding an Inode
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
Each block group contains ``sb->s_inodes_per_group`` inodes. Because
|
||||
inode 0 is defined not to exist, this formula can be used to find the
|
||||
block group that an inode lives in:
|
||||
``bg = (inode_num - 1) / sb->s_inodes_per_group``. The particular inode
|
||||
can be found within the block group's inode table at
|
||||
``index = (inode_num - 1) % sb->s_inodes_per_group``. To get the byte
|
||||
address within the inode table, use
|
||||
``offset = index * sb->s_inode_size``.
|
||||
|
||||
Inode Timestamps
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
Four timestamps are recorded in the lower 128 bytes of the inode
|
||||
structure -- inode change time (ctime), access time (atime), data
|
||||
modification time (mtime), and deletion time (dtime). The four fields
|
||||
are 32-bit signed integers that represent seconds since the Unix epoch
|
||||
(1970-01-01 00:00:00 GMT), which means that the fields will overflow in
|
||||
January 2038. For inodes that are not linked from any directory but are
|
||||
still open (orphan inodes), the dtime field is overloaded for use with
|
||||
the orphan list. The superblock field ``s_last_orphan`` points to the
|
||||
first inode in the orphan list; dtime is then the number of the next
|
||||
orphaned inode, or zero if there are no more orphans.
|
||||
|
||||
If the inode structure size ``sb->s_inode_size`` is larger than 128
|
||||
bytes and the ``i_inode_extra`` field is large enough to encompass the
|
||||
respective ``i_[cma]time_extra`` field, the ctime, atime, and mtime
|
||||
inode fields are widened to 64 bits. Within this “extra” 32-bit field,
|
||||
the lower two bits are used to extend the 32-bit seconds field to be 34
|
||||
bit wide; the upper 30 bits are used to provide nanosecond timestamp
|
||||
accuracy. Therefore, timestamps should not overflow until May 2446.
|
||||
dtime was not widened. There is also a fifth timestamp to record inode
|
||||
creation time (crtime); this field is 64-bits wide and decoded in the
|
||||
same manner as 64-bit [cma]time. Neither crtime nor dtime are accessible
|
||||
through the regular stat() interface, though debugfs will report them.
|
||||
|
||||
We use the 32-bit signed time value plus (2^32 \* (extra epoch bits)).
|
||||
In other words:
|
||||
|
||||
.. list-table::
|
||||
:widths: 20 20 20 20 20
|
||||
:header-rows: 1
|
||||
|
||||
* - Extra epoch bits
|
||||
- MSB of 32-bit time
|
||||
- Adjustment for signed 32-bit to 64-bit tv\_sec
|
||||
- Decoded 64-bit tv\_sec
|
||||
- valid time range
|
||||
* - 0 0
|
||||
- 1
|
||||
- 0
|
||||
- ``-0x80000000 - -0x00000001``
|
||||
- 1901-12-13 to 1969-12-31
|
||||
* - 0 0
|
||||
- 0
|
||||
- 0
|
||||
- ``0x000000000 - 0x07fffffff``
|
||||
- 1970-01-01 to 2038-01-19
|
||||
* - 0 1
|
||||
- 1
|
||||
- 0x100000000
|
||||
- ``0x080000000 - 0x0ffffffff``
|
||||
- 2038-01-19 to 2106-02-07
|
||||
* - 0 1
|
||||
- 0
|
||||
- 0x100000000
|
||||
- ``0x100000000 - 0x17fffffff``
|
||||
- 2106-02-07 to 2174-02-25
|
||||
* - 1 0
|
||||
- 1
|
||||
- 0x200000000
|
||||
- ``0x180000000 - 0x1ffffffff``
|
||||
- 2174-02-25 to 2242-03-16
|
||||
* - 1 0
|
||||
- 0
|
||||
- 0x200000000
|
||||
- ``0x200000000 - 0x27fffffff``
|
||||
- 2242-03-16 to 2310-04-04
|
||||
* - 1 1
|
||||
- 1
|
||||
- 0x300000000
|
||||
- ``0x280000000 - 0x2ffffffff``
|
||||
- 2310-04-04 to 2378-04-22
|
||||
* - 1 1
|
||||
- 0
|
||||
- 0x300000000
|
||||
- ``0x300000000 - 0x37fffffff``
|
||||
- 2378-04-22 to 2446-05-10
|
||||
|
||||
This is a somewhat odd encoding since there are effectively seven times
|
||||
as many positive values as negative values. There have also been
|
||||
long-standing bugs decoding and encoding dates beyond 2038, which don't
|
||||
seem to be fixed as of kernel 3.12 and e2fsprogs 1.42.8. 64-bit kernels
|
||||
incorrectly use the extra epoch bits 1,1 for dates between 1901 and
|
||||
1970. At some point the kernel will be fixed and e2fsck will fix this
|
||||
situation, assuming that it is run before 2310.
|
611
Documentation/filesystems/ext4/ondisk/journal.rst
Normal file
611
Documentation/filesystems/ext4/ondisk/journal.rst
Normal file
@ -0,0 +1,611 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Journal (jbd2)
|
||||
--------------
|
||||
|
||||
Introduced in ext3, the ext4 filesystem employs a journal to protect the
|
||||
filesystem against corruption in the case of a system crash. A small
|
||||
continuous region of disk (default 128MiB) is reserved inside the
|
||||
filesystem as a place to land “important” data writes on-disk as quickly
|
||||
as possible. Once the important data transaction is fully written to the
|
||||
disk and flushed from the disk write cache, a record of the data being
|
||||
committed is also written to the journal. At some later point in time,
|
||||
the journal code writes the transactions to their final locations on
|
||||
disk (this could involve a lot of seeking or a lot of small
|
||||
read-write-erases) before erasing the commit record. Should the system
|
||||
crash during the second slow write, the journal can be replayed all the
|
||||
way to the latest commit record, guaranteeing the atomicity of whatever
|
||||
gets written through the journal to the disk. The effect of this is to
|
||||
guarantee that the filesystem does not become stuck midway through a
|
||||
metadata update.
|
||||
|
||||
For performance reasons, ext4 by default only writes filesystem metadata
|
||||
through the journal. This means that file data blocks are /not/
|
||||
guaranteed to be in any consistent state after a crash. If this default
|
||||
guarantee level (``data=ordered``) is not satisfactory, there is a mount
|
||||
option to control journal behavior. If ``data=journal``, all data and
|
||||
metadata are written to disk through the journal. This is slower but
|
||||
safest. If ``data=writeback``, dirty data blocks are not flushed to the
|
||||
disk before the metadata are written to disk through the journal.
|
||||
|
||||
The journal inode is typically inode 8. The first 68 bytes of the
|
||||
journal inode are replicated in the ext4 superblock. The journal itself
|
||||
is normal (but hidden) file within the filesystem. The file usually
|
||||
consumes an entire block group, though mke2fs tries to put it in the
|
||||
middle of the disk.
|
||||
|
||||
All fields in jbd2 are written to disk in big-endian order. This is the
|
||||
opposite of ext4.
|
||||
|
||||
NOTE: Both ext4 and ocfs2 use jbd2.
|
||||
|
||||
The maximum size of a journal embedded in an ext4 filesystem is 2^32
|
||||
blocks. jbd2 itself does not seem to care.
|
||||
|
||||
Layout
|
||||
~~~~~~
|
||||
|
||||
Generally speaking, the journal has this format:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 78
|
||||
:header-rows: 1
|
||||
|
||||
* - Superblock
|
||||
- descriptor\_block (data\_blocks or revocation\_block) [more data or
|
||||
revocations] commmit\_block
|
||||
- [more transactions...]
|
||||
* -
|
||||
- One transaction
|
||||
-
|
||||
|
||||
Notice that a transaction begins with either a descriptor and some data,
|
||||
or a block revocation list. A finished transaction always ends with a
|
||||
commit. If there is no commit record (or the checksums don't match), the
|
||||
transaction will be discarded during replay.
|
||||
|
||||
External Journal
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
Optionally, an ext4 filesystem can be created with an external journal
|
||||
device (as opposed to an internal journal, which uses a reserved inode).
|
||||
In this case, on the filesystem device, ``s_journal_inum`` should be
|
||||
zero and ``s_journal_uuid`` should be set. On the journal device there
|
||||
will be an ext4 super block in the usual place, with a matching UUID.
|
||||
The journal superblock will be in the next full block after the
|
||||
superblock.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 1 76
|
||||
:header-rows: 1
|
||||
|
||||
* - 1024 bytes of padding
|
||||
- ext4 Superblock
|
||||
- Journal Superblock
|
||||
- descriptor\_block (data\_blocks or revocation\_block) [more data or
|
||||
revocations] commmit\_block
|
||||
- [more transactions...]
|
||||
* -
|
||||
-
|
||||
-
|
||||
- One transaction
|
||||
-
|
||||
|
||||
Block Header
|
||||
~~~~~~~~~~~~
|
||||
|
||||
Every block in the journal starts with a common 12-byte header
|
||||
``struct journal_header_s``:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- h\_magic
|
||||
- jbd2 magic number, 0xC03B3998.
|
||||
* - 0x4
|
||||
- \_\_be32
|
||||
- h\_blocktype
|
||||
- Description of what this block contains. See the jbd2_blocktype_ table
|
||||
below.
|
||||
* - 0x8
|
||||
- \_\_be32
|
||||
- h\_sequence
|
||||
- The transaction ID that goes with this block.
|
||||
|
||||
.. _jbd2_blocktype:
|
||||
|
||||
The journal block type can be any one of:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 1
|
||||
- Descriptor. This block precedes a series of data blocks that were
|
||||
written through the journal during a transaction.
|
||||
* - 2
|
||||
- Block commit record. This block signifies the completion of a
|
||||
transaction.
|
||||
* - 3
|
||||
- Journal superblock, v1.
|
||||
* - 4
|
||||
- Journal superblock, v2.
|
||||
* - 5
|
||||
- Block revocation records. This speeds up recovery by enabling the
|
||||
journal to skip writing blocks that were subsequently rewritten.
|
||||
|
||||
Super Block
|
||||
~~~~~~~~~~~
|
||||
|
||||
The super block for the journal is much simpler as compared to ext4's.
|
||||
The key data kept within are size of the journal, and where to find the
|
||||
start of the log of transactions.
|
||||
|
||||
The journal superblock is recorded as ``struct journal_superblock_s``,
|
||||
which is 1024 bytes long:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* -
|
||||
-
|
||||
-
|
||||
- Static information describing the journal.
|
||||
* - 0x0
|
||||
- journal\_header\_t (12 bytes)
|
||||
- s\_header
|
||||
- Common header identifying this as a superblock.
|
||||
* - 0xC
|
||||
- \_\_be32
|
||||
- s\_blocksize
|
||||
- Journal device block size.
|
||||
* - 0x10
|
||||
- \_\_be32
|
||||
- s\_maxlen
|
||||
- Total number of blocks in this journal.
|
||||
* - 0x14
|
||||
- \_\_be32
|
||||
- s\_first
|
||||
- First block of log information.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- Dynamic information describing the current state of the log.
|
||||
* - 0x18
|
||||
- \_\_be32
|
||||
- s\_sequence
|
||||
- First commit ID expected in log.
|
||||
* - 0x1C
|
||||
- \_\_be32
|
||||
- s\_start
|
||||
- Block number of the start of log. Contrary to the comments, this field
|
||||
being zero does not imply that the journal is clean!
|
||||
* - 0x20
|
||||
- \_\_be32
|
||||
- s\_errno
|
||||
- Error value, as set by jbd2\_journal\_abort().
|
||||
* -
|
||||
-
|
||||
-
|
||||
- The remaining fields are only valid in a v2 superblock.
|
||||
* - 0x24
|
||||
- \_\_be32
|
||||
- s\_feature\_compat;
|
||||
- Compatible feature set. See the table jbd2_compat_ below.
|
||||
* - 0x28
|
||||
- \_\_be32
|
||||
- s\_feature\_incompat
|
||||
- Incompatible feature set. See the table jbd2_incompat_ below.
|
||||
* - 0x2C
|
||||
- \_\_be32
|
||||
- s\_feature\_ro\_compat
|
||||
- Read-only compatible feature set. There aren't any of these currently.
|
||||
* - 0x30
|
||||
- \_\_u8
|
||||
- s\_uuid[16]
|
||||
- 128-bit uuid for journal. This is compared against the copy in the ext4
|
||||
super block at mount time.
|
||||
* - 0x40
|
||||
- \_\_be32
|
||||
- s\_nr\_users
|
||||
- Number of file systems sharing this journal.
|
||||
* - 0x44
|
||||
- \_\_be32
|
||||
- s\_dynsuper
|
||||
- Location of dynamic super block copy. (Not used?)
|
||||
* - 0x48
|
||||
- \_\_be32
|
||||
- s\_max\_transaction
|
||||
- Limit of journal blocks per transaction. (Not used?)
|
||||
* - 0x4C
|
||||
- \_\_be32
|
||||
- s\_max\_trans\_data
|
||||
- Limit of data blocks per transaction. (Not used?)
|
||||
* - 0x50
|
||||
- \_\_u8
|
||||
- s\_checksum\_type
|
||||
- Checksum algorithm used for the journal. See jbd2_checksum_type_ for
|
||||
more info.
|
||||
* - 0x51
|
||||
- \_\_u8[3]
|
||||
- s\_padding2
|
||||
-
|
||||
* - 0x54
|
||||
- \_\_u32
|
||||
- s\_padding[42]
|
||||
-
|
||||
* - 0xFC
|
||||
- \_\_be32
|
||||
- s\_checksum
|
||||
- Checksum of the entire superblock, with this field set to zero.
|
||||
* - 0x100
|
||||
- \_\_u8
|
||||
- s\_users[16\*48]
|
||||
- ids of all file systems sharing the log. e2fsprogs/Linux don't allow
|
||||
shared external journals, but I imagine Lustre (or ocfs2?), which use
|
||||
the jbd2 code, might.
|
||||
|
||||
.. _jbd2_compat:
|
||||
|
||||
The journal compat features are any combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Journal maintains checksums on the data blocks.
|
||||
(JBD2\_FEATURE\_COMPAT\_CHECKSUM)
|
||||
|
||||
.. _jbd2_incompat:
|
||||
|
||||
The journal incompat features are any combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Journal has block revocation records. (JBD2\_FEATURE\_INCOMPAT\_REVOKE)
|
||||
* - 0x2
|
||||
- Journal can deal with 64-bit block numbers.
|
||||
(JBD2\_FEATURE\_INCOMPAT\_64BIT)
|
||||
* - 0x4
|
||||
- Journal commits asynchronously. (JBD2\_FEATURE\_INCOMPAT\_ASYNC\_COMMIT)
|
||||
* - 0x8
|
||||
- This journal uses v2 of the checksum on-disk format. Each journal
|
||||
metadata block gets its own checksum, and the block tags in the
|
||||
descriptor table contain checksums for each of the data blocks in the
|
||||
journal. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2)
|
||||
* - 0x10
|
||||
- This journal uses v3 of the checksum on-disk format. This is the same as
|
||||
v2, but the journal block tag size is fixed regardless of the size of
|
||||
block numbers. (JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3)
|
||||
|
||||
.. _jbd2_checksum_type:
|
||||
|
||||
Journal checksum type codes are one of the following. crc32 or crc32c are the
|
||||
most likely choices.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 1
|
||||
- CRC32
|
||||
* - 2
|
||||
- MD5
|
||||
* - 3
|
||||
- SHA1
|
||||
* - 4
|
||||
- CRC32C
|
||||
|
||||
Descriptor Block
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
The descriptor block contains an array of journal block tags that
|
||||
describe the final locations of the data blocks that follow in the
|
||||
journal. Descriptor blocks are open-coded instead of being completely
|
||||
described by a data structure, but here is the block structure anyway.
|
||||
Descriptor blocks consume at least 36 bytes, but use a full block:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- journal\_header\_t
|
||||
- (open coded)
|
||||
- Common block header.
|
||||
* - 0xC
|
||||
- struct journal\_block\_tag\_s
|
||||
- open coded array[]
|
||||
- Enough tags either to fill up the block or to describe all the data
|
||||
blocks that follow this descriptor block.
|
||||
|
||||
Journal block tags have any of the following formats, depending on which
|
||||
journal feature and block tag flags are set.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is set, the journal block tag is
|
||||
defined as ``struct journal_block_tag3_s``, which looks like the
|
||||
following. The size is 16 or 32 bytes.
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- t\_blocknr
|
||||
- Lower 32-bits of the location of where the corresponding data block
|
||||
should end up on disk.
|
||||
* - 0x4
|
||||
- \_\_be32
|
||||
- t\_flags
|
||||
- Flags that go with the descriptor. See the table jbd2_tag_flags_ for
|
||||
more info.
|
||||
* - 0x8
|
||||
- \_\_be32
|
||||
- t\_blocknr\_high
|
||||
- Upper 32-bits of the location of where the corresponding data block
|
||||
should end up on disk. This is zero if JBD2\_FEATURE\_INCOMPAT\_64BIT is
|
||||
not enabled.
|
||||
* - 0xC
|
||||
- \_\_be32
|
||||
- t\_checksum
|
||||
- Checksum of the journal UUID, the sequence number, and the data block.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- This field appears to be open coded. It always comes at the end of the
|
||||
tag, after t_checksum. This field is not present if the "same UUID" flag
|
||||
is set.
|
||||
* - 0x8 or 0xC
|
||||
- char
|
||||
- uuid[16]
|
||||
- A UUID to go with this tag. This field appears to be copied from the
|
||||
``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
|
||||
field.
|
||||
|
||||
.. _jbd2_tag_flags:
|
||||
|
||||
The journal tag flags are any combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- On-disk block is escaped. The first four bytes of the data block just
|
||||
happened to match the jbd2 magic number.
|
||||
* - 0x2
|
||||
- This block has the same UUID as previous, therefore the UUID field is
|
||||
omitted.
|
||||
* - 0x4
|
||||
- The data block was deleted by the transaction. (Not used?)
|
||||
* - 0x8
|
||||
- This is the last tag in this descriptor block.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 is NOT set, the journal block tag
|
||||
is defined as ``struct journal_block_tag_s``, which looks like the
|
||||
following. The size is 8, 12, 24, or 28 bytes:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- t\_blocknr
|
||||
- Lower 32-bits of the location of where the corresponding data block
|
||||
should end up on disk.
|
||||
* - 0x4
|
||||
- \_\_be16
|
||||
- t\_checksum
|
||||
- Checksum of the journal UUID, the sequence number, and the data block.
|
||||
Note that only the lower 16 bits are stored.
|
||||
* - 0x6
|
||||
- \_\_be16
|
||||
- t\_flags
|
||||
- Flags that go with the descriptor. See the table jbd2_tag_flags_ for
|
||||
more info.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- This next field is only present if the super block indicates support for
|
||||
64-bit block numbers.
|
||||
* - 0x8
|
||||
- \_\_be32
|
||||
- t\_blocknr\_high
|
||||
- Upper 32-bits of the location of where the corresponding data block
|
||||
should end up on disk.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- This field appears to be open coded. It always comes at the end of the
|
||||
tag, after t_flags or t_blocknr_high. This field is not present if the
|
||||
"same UUID" flag is set.
|
||||
* - 0x8 or 0xC
|
||||
- char
|
||||
- uuid[16]
|
||||
- A UUID to go with this tag. This field appears to be copied from the
|
||||
``j_uuid`` field in ``struct journal_s``, but only tune2fs touches that
|
||||
field.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
|
||||
JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the block is a
|
||||
``struct jbd2_journal_block_tail``, which looks like this:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- t\_checksum
|
||||
- Checksum of the journal UUID + the descriptor block, with this field set
|
||||
to zero.
|
||||
|
||||
Data Block
|
||||
~~~~~~~~~~
|
||||
|
||||
In general, the data blocks being written to disk through the journal
|
||||
are written verbatim into the journal file after the descriptor block.
|
||||
However, if the first four bytes of the block match the jbd2 magic
|
||||
number then those four bytes are replaced with zeroes and the “escaped”
|
||||
flag is set in the descriptor block tag.
|
||||
|
||||
Revocation Block
|
||||
~~~~~~~~~~~~~~~~
|
||||
|
||||
A revocation block is used to prevent replay of a block in an earlier
|
||||
transaction. This is used to mark blocks that were journalled at one
|
||||
time but are no longer journalled. Typically this happens if a metadata
|
||||
block is freed and re-allocated as a file data block; in this case, a
|
||||
journal replay after the file block was written to disk will cause
|
||||
corruption.
|
||||
|
||||
**NOTE**: This mechanism is NOT used to express “this journal block is
|
||||
superseded by this other journal block”, as the author (djwong)
|
||||
mistakenly thought. Any block being added to a transaction will cause
|
||||
the removal of all existing revocation records for that block.
|
||||
|
||||
Revocation blocks are described in
|
||||
``struct jbd2_journal_revoke_header_s``, are at least 16 bytes in
|
||||
length, but use a full block:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- journal\_header\_t
|
||||
- r\_header
|
||||
- Common block header.
|
||||
* - 0xC
|
||||
- \_\_be32
|
||||
- r\_count
|
||||
- Number of bytes used in this block.
|
||||
* - 0x10
|
||||
- \_\_be32 or \_\_be64
|
||||
- blocks[0]
|
||||
- Blocks to revoke.
|
||||
|
||||
After r\_count is a linear array of block numbers that are effectively
|
||||
revoked by this transaction. The size of each block number is 8 bytes if
|
||||
the superblock advertises 64-bit block number support, or 4 bytes
|
||||
otherwise.
|
||||
|
||||
If JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or
|
||||
JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3 are set, the end of the revocation
|
||||
block is a ``struct jbd2_journal_revoke_tail``, which has this format:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_be32
|
||||
- r\_checksum
|
||||
- Checksum of the journal UUID + revocation block
|
||||
|
||||
Commit Block
|
||||
~~~~~~~~~~~~
|
||||
|
||||
The commit block is a sentry that indicates that a transaction has been
|
||||
completely written to the journal. Once this commit block reaches the
|
||||
journal, the data stored with this transaction can be written to their
|
||||
final locations on disk.
|
||||
|
||||
The commit block is described by ``struct commit_header``, which is 32
|
||||
bytes long (but uses a full block):
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Descriptor
|
||||
* - 0x0
|
||||
- journal\_header\_s
|
||||
- (open coded)
|
||||
- Common block header.
|
||||
* - 0xC
|
||||
- unsigned char
|
||||
- h\_chksum\_type
|
||||
- The type of checksum to use to verify the integrity of the data blocks
|
||||
in the transaction. See jbd2_checksum_type_ for more info.
|
||||
* - 0xD
|
||||
- unsigned char
|
||||
- h\_chksum\_size
|
||||
- The number of bytes used by the checksum. Most likely 4.
|
||||
* - 0xE
|
||||
- unsigned char
|
||||
- h\_padding[2]
|
||||
-
|
||||
* - 0x10
|
||||
- \_\_be32
|
||||
- h\_chksum[JBD2\_CHECKSUM\_BYTES]
|
||||
- 32 bytes of space to store checksums. If
|
||||
JBD2\_FEATURE\_INCOMPAT\_CSUM\_V2 or JBD2\_FEATURE\_INCOMPAT\_CSUM\_V3
|
||||
are set, the first ``__be32`` is the checksum of the journal UUID and
|
||||
the entire commit block, with this field zeroed. If
|
||||
JBD2\_FEATURE\_COMPAT\_CHECKSUM is set, the first ``__be32`` is the
|
||||
crc32 of all the blocks already written to the transaction.
|
||||
* - 0x30
|
||||
- \_\_be64
|
||||
- h\_commit\_sec
|
||||
- The time that the transaction was committed, in seconds since the epoch.
|
||||
* - 0x38
|
||||
- \_\_be32
|
||||
- h\_commit\_nsec
|
||||
- Nanoseconds component of the above timestamp.
|
||||
|
77
Documentation/filesystems/ext4/ondisk/mmp.rst
Normal file
77
Documentation/filesystems/ext4/ondisk/mmp.rst
Normal file
@ -0,0 +1,77 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Multiple Mount Protection
|
||||
-------------------------
|
||||
|
||||
Multiple mount protection (MMP) is a feature that protects the
|
||||
filesystem against multiple hosts trying to use the filesystem
|
||||
simultaneously. When a filesystem is opened (for mounting, or fsck,
|
||||
etc.), the MMP code running on the node (call it node A) checks a
|
||||
sequence number. If the sequence number is EXT4\_MMP\_SEQ\_CLEAN, the
|
||||
open continues. If the sequence number is EXT4\_MMP\_SEQ\_FSCK, then
|
||||
fsck is (hopefully) running, and open fails immediately. Otherwise, the
|
||||
open code will wait for twice the specified MMP check interval and check
|
||||
the sequence number again. If the sequence number has changed, then the
|
||||
filesystem is active on another machine and the open fails. If the MMP
|
||||
code passes all of those checks, a new MMP sequence number is generated
|
||||
and written to the MMP block, and the mount proceeds.
|
||||
|
||||
While the filesystem is live, the kernel sets up a timer to re-check the
|
||||
MMP block at the specified MMP check interval. To perform the re-check,
|
||||
the MMP sequence number is re-read; if it does not match the in-memory
|
||||
MMP sequence number, then another node (node B) has mounted the
|
||||
filesystem, and node A remounts the filesystem read-only. If the
|
||||
sequence numbers match, the sequence number is incremented both in
|
||||
memory and on disk, and the re-check is complete.
|
||||
|
||||
The hostname and device filename are written into the MMP block whenever
|
||||
an open operation succeeds. The MMP code does not use these values; they
|
||||
are provided purely for informational purposes.
|
||||
|
||||
The checksum is calculated against the FS UUID and the MMP structure.
|
||||
The MMP structure (``struct mmp_struct``) is as follows:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Type
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- mmp\_magic
|
||||
- Magic number for MMP, 0x004D4D50 (“MMP”).
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- mmp\_seq
|
||||
- Sequence number, updated periodically.
|
||||
* - 0x8
|
||||
- \_\_le64
|
||||
- mmp\_time
|
||||
- Time that the MMP block was last updated.
|
||||
* - 0x10
|
||||
- char[64]
|
||||
- mmp\_nodename
|
||||
- Hostname of the node that opened the filesystem.
|
||||
* - 0x50
|
||||
- char[32]
|
||||
- mmp\_bdevname
|
||||
- Block device name of the filesystem.
|
||||
* - 0x70
|
||||
- \_\_le16
|
||||
- mmp\_check\_interval
|
||||
- The MMP re-check interval, in seconds.
|
||||
* - 0x72
|
||||
- \_\_le16
|
||||
- mmp\_pad1
|
||||
- Zero.
|
||||
* - 0x74
|
||||
- \_\_le32[226]
|
||||
- mmp\_pad2
|
||||
- Zero.
|
||||
* - 0x3FC
|
||||
- \_\_le32
|
||||
- mmp\_checksum
|
||||
- Checksum of the MMP block.
|
26
Documentation/filesystems/ext4/ondisk/overview.rst
Normal file
26
Documentation/filesystems/ext4/ondisk/overview.rst
Normal file
@ -0,0 +1,26 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
High Level Design
|
||||
=================
|
||||
|
||||
An ext4 file system is split into a series of block groups. To reduce
|
||||
performance difficulties due to fragmentation, the block allocator tries
|
||||
very hard to keep each file's blocks within the same group, thereby
|
||||
reducing seek times. The size of a block group is specified in
|
||||
``sb.s_blocks_per_group`` blocks, though it can also calculated as 8 \*
|
||||
``block_size_in_bytes``. With the default block size of 4KiB, each group
|
||||
will contain 32,768 blocks, for a length of 128MiB. The number of block
|
||||
groups is the size of the device divided by the size of a block group.
|
||||
|
||||
All fields in ext4 are written to disk in little-endian order. HOWEVER,
|
||||
all fields in jbd2 (the journal) are written to disk in big-endian
|
||||
order.
|
||||
|
||||
.. include:: blocks.rst
|
||||
.. include:: blockgroup.rst
|
||||
.. include:: special_inodes.rst
|
||||
.. include:: allocators.rst
|
||||
.. include:: checksums.rst
|
||||
.. include:: bigalloc.rst
|
||||
.. include:: inlinedata.rst
|
||||
.. include:: eainode.rst
|
38
Documentation/filesystems/ext4/ondisk/special_inodes.rst
Normal file
38
Documentation/filesystems/ext4/ondisk/special_inodes.rst
Normal file
@ -0,0 +1,38 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Special inodes
|
||||
--------------
|
||||
|
||||
ext4 reserves some inode for special features, as follows:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - inode Number
|
||||
- Purpose
|
||||
* - 0
|
||||
- Doesn't exist; there is no inode 0.
|
||||
* - 1
|
||||
- List of defective blocks.
|
||||
* - 2
|
||||
- Root directory.
|
||||
* - 3
|
||||
- User quota.
|
||||
* - 4
|
||||
- Group quota.
|
||||
* - 5
|
||||
- Boot loader.
|
||||
* - 6
|
||||
- Undelete directory.
|
||||
* - 7
|
||||
- Reserved group descriptors inode. (“resize inode”)
|
||||
* - 8
|
||||
- Journal inode.
|
||||
* - 9
|
||||
- The “exclude” inode, for snapshots(?)
|
||||
* - 10
|
||||
- Replica inode, used for some non-upstream feature?
|
||||
* - 11
|
||||
- Traditional first non-reserved inode. Usually this is the lost+found directory. See s\_first\_ino in the superblock.
|
||||
|
801
Documentation/filesystems/ext4/ondisk/super.rst
Normal file
801
Documentation/filesystems/ext4/ondisk/super.rst
Normal file
@ -0,0 +1,801 @@
|
||||
.. SPDX-License-Identifier: GPL-2.0
|
||||
|
||||
Super Block
|
||||
-----------
|
||||
|
||||
The superblock records various information about the enclosing
|
||||
filesystem, such as block counts, inode counts, supported features,
|
||||
maintenance information, and more.
|
||||
|
||||
If the sparse\_super feature flag is set, redundant copies of the
|
||||
superblock and group descriptors are kept only in the groups whose group
|
||||
number is either 0 or a power of 3, 5, or 7. If the flag is not set,
|
||||
redundant copies are kept in all groups.
|
||||
|
||||
The superblock checksum is calculated against the superblock structure,
|
||||
which includes the FS UUID.
|
||||
|
||||
The ext4 superblock is laid out as follows in
|
||||
``struct ext4_super_block``:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 1 1 77
|
||||
:header-rows: 1
|
||||
|
||||
* - Offset
|
||||
- Size
|
||||
- Name
|
||||
- Description
|
||||
* - 0x0
|
||||
- \_\_le32
|
||||
- s\_inodes\_count
|
||||
- Total inode count.
|
||||
* - 0x4
|
||||
- \_\_le32
|
||||
- s\_blocks\_count\_lo
|
||||
- Total block count.
|
||||
* - 0x8
|
||||
- \_\_le32
|
||||
- s\_r\_blocks\_count\_lo
|
||||
- This number of blocks can only be allocated by the super-user.
|
||||
* - 0xC
|
||||
- \_\_le32
|
||||
- s\_free\_blocks\_count\_lo
|
||||
- Free block count.
|
||||
* - 0x10
|
||||
- \_\_le32
|
||||
- s\_free\_inodes\_count
|
||||
- Free inode count.
|
||||
* - 0x14
|
||||
- \_\_le32
|
||||
- s\_first\_data\_block
|
||||
- First data block. This must be at least 1 for 1k-block filesystems and
|
||||
is typically 0 for all other block sizes.
|
||||
* - 0x18
|
||||
- \_\_le32
|
||||
- s\_log\_block\_size
|
||||
- Block size is 2 ^ (10 + s\_log\_block\_size).
|
||||
* - 0x1C
|
||||
- \_\_le32
|
||||
- s\_log\_cluster\_size
|
||||
- Cluster size is (2 ^ s\_log\_cluster\_size) blocks if bigalloc is
|
||||
enabled. Otherwise s\_log\_cluster\_size must equal s\_log\_block\_size.
|
||||
* - 0x20
|
||||
- \_\_le32
|
||||
- s\_blocks\_per\_group
|
||||
- Blocks per group.
|
||||
* - 0x24
|
||||
- \_\_le32
|
||||
- s\_clusters\_per\_group
|
||||
- Clusters per group, if bigalloc is enabled. Otherwise
|
||||
s\_clusters\_per\_group must equal s\_blocks\_per\_group.
|
||||
* - 0x28
|
||||
- \_\_le32
|
||||
- s\_inodes\_per\_group
|
||||
- Inodes per group.
|
||||
* - 0x2C
|
||||
- \_\_le32
|
||||
- s\_mtime
|
||||
- Mount time, in seconds since the epoch.
|
||||
* - 0x30
|
||||
- \_\_le32
|
||||
- s\_wtime
|
||||
- Write time, in seconds since the epoch.
|
||||
* - 0x34
|
||||
- \_\_le16
|
||||
- s\_mnt\_count
|
||||
- Number of mounts since the last fsck.
|
||||
* - 0x36
|
||||
- \_\_le16
|
||||
- s\_max\_mnt\_count
|
||||
- Number of mounts beyond which a fsck is needed.
|
||||
* - 0x38
|
||||
- \_\_le16
|
||||
- s\_magic
|
||||
- Magic signature, 0xEF53
|
||||
* - 0x3A
|
||||
- \_\_le16
|
||||
- s\_state
|
||||
- File system state. See super_state_ for more info.
|
||||
* - 0x3C
|
||||
- \_\_le16
|
||||
- s\_errors
|
||||
- Behaviour when detecting errors. See super_errors_ for more info.
|
||||
* - 0x3E
|
||||
- \_\_le16
|
||||
- s\_minor\_rev\_level
|
||||
- Minor revision level.
|
||||
* - 0x40
|
||||
- \_\_le32
|
||||
- s\_lastcheck
|
||||
- Time of last check, in seconds since the epoch.
|
||||
* - 0x44
|
||||
- \_\_le32
|
||||
- s\_checkinterval
|
||||
- Maximum time between checks, in seconds.
|
||||
* - 0x48
|
||||
- \_\_le32
|
||||
- s\_creator\_os
|
||||
- Creator OS. See the table super_creator_ for more info.
|
||||
* - 0x4C
|
||||
- \_\_le32
|
||||
- s\_rev\_level
|
||||
- Revision level. See the table super_revision_ for more info.
|
||||
* - 0x50
|
||||
- \_\_le16
|
||||
- s\_def\_resuid
|
||||
- Default uid for reserved blocks.
|
||||
* - 0x52
|
||||
- \_\_le16
|
||||
- s\_def\_resgid
|
||||
- Default gid for reserved blocks.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- These fields are for EXT4_DYNAMIC_REV superblocks only.
|
||||
|
||||
Note: the difference between the compatible feature set and the
|
||||
incompatible feature set is that if there is a bit set in the
|
||||
incompatible feature set that the kernel doesn't know about, it should
|
||||
refuse to mount the filesystem.
|
||||
|
||||
e2fsck's requirements are more strict; if it doesn't know
|
||||
about a feature in either the compatible or incompatible feature set, it
|
||||
must abort and not try to meddle with things it doesn't understand...
|
||||
* - 0x54
|
||||
- \_\_le32
|
||||
- s\_first\_ino
|
||||
- First non-reserved inode.
|
||||
* - 0x58
|
||||
- \_\_le16
|
||||
- s\_inode\_size
|
||||
- Size of inode structure, in bytes.
|
||||
* - 0x5A
|
||||
- \_\_le16
|
||||
- s\_block\_group\_nr
|
||||
- Block group # of this superblock.
|
||||
* - 0x5C
|
||||
- \_\_le32
|
||||
- s\_feature\_compat
|
||||
- Compatible feature set flags. Kernel can still read/write this fs even
|
||||
if it doesn't understand a flag; fsck should not do that. See the
|
||||
super_compat_ table for more info.
|
||||
* - 0x60
|
||||
- \_\_le32
|
||||
- s\_feature\_incompat
|
||||
- Incompatible feature set. If the kernel or fsck doesn't understand one
|
||||
of these bits, it should stop. See the super_incompat_ table for more
|
||||
info.
|
||||
* - 0x64
|
||||
- \_\_le32
|
||||
- s\_feature\_ro\_compat
|
||||
- Readonly-compatible feature set. If the kernel doesn't understand one of
|
||||
these bits, it can still mount read-only. See the super_rocompat_ table
|
||||
for more info.
|
||||
* - 0x68
|
||||
- \_\_u8
|
||||
- s\_uuid[16]
|
||||
- 128-bit UUID for volume.
|
||||
* - 0x78
|
||||
- char
|
||||
- s\_volume\_name[16]
|
||||
- Volume label.
|
||||
* - 0x88
|
||||
- char
|
||||
- s\_last\_mounted[64]
|
||||
- Directory where filesystem was last mounted.
|
||||
* - 0xC8
|
||||
- \_\_le32
|
||||
- s\_algorithm\_usage\_bitmap
|
||||
- For compression (Not used in e2fsprogs/Linux)
|
||||
* -
|
||||
-
|
||||
-
|
||||
- Performance hints. Directory preallocation should only happen if the
|
||||
EXT4_FEATURE_COMPAT_DIR_PREALLOC flag is on.
|
||||
* - 0xCC
|
||||
- \_\_u8
|
||||
- s\_prealloc\_blocks
|
||||
- #. of blocks to try to preallocate for ... files? (Not used in
|
||||
e2fsprogs/Linux)
|
||||
* - 0xCD
|
||||
- \_\_u8
|
||||
- s\_prealloc\_dir\_blocks
|
||||
- #. of blocks to preallocate for directories. (Not used in
|
||||
e2fsprogs/Linux)
|
||||
* - 0xCE
|
||||
- \_\_le16
|
||||
- s\_reserved\_gdt\_blocks
|
||||
- Number of reserved GDT entries for future filesystem expansion.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- Journalling support is valid only if EXT4_FEATURE_COMPAT_HAS_JOURNAL is
|
||||
set.
|
||||
* - 0xD0
|
||||
- \_\_u8
|
||||
- s\_journal\_uuid[16]
|
||||
- UUID of journal superblock
|
||||
* - 0xE0
|
||||
- \_\_le32
|
||||
- s\_journal\_inum
|
||||
- inode number of journal file.
|
||||
* - 0xE4
|
||||
- \_\_le32
|
||||
- s\_journal\_dev
|
||||
- Device number of journal file, if the external journal feature flag is
|
||||
set.
|
||||
* - 0xE8
|
||||
- \_\_le32
|
||||
- s\_last\_orphan
|
||||
- Start of list of orphaned inodes to delete.
|
||||
* - 0xEC
|
||||
- \_\_le32
|
||||
- s\_hash\_seed[4]
|
||||
- HTREE hash seed.
|
||||
* - 0xFC
|
||||
- \_\_u8
|
||||
- s\_def\_hash\_version
|
||||
- Default hash algorithm to use for directory hashes. See super_def_hash_
|
||||
for more info.
|
||||
* - 0xFD
|
||||
- \_\_u8
|
||||
- s\_jnl\_backup\_type
|
||||
- If this value is 0 or EXT3\_JNL\_BACKUP\_BLOCKS (1), then the
|
||||
``s_jnl_blocks`` field contains a duplicate copy of the inode's
|
||||
``i_block[]`` array and ``i_size``.
|
||||
* - 0xFE
|
||||
- \_\_le16
|
||||
- s\_desc\_size
|
||||
- Size of group descriptors, in bytes, if the 64bit incompat feature flag
|
||||
is set.
|
||||
* - 0x100
|
||||
- \_\_le32
|
||||
- s\_default\_mount\_opts
|
||||
- Default mount options. See the super_mountopts_ table for more info.
|
||||
* - 0x104
|
||||
- \_\_le32
|
||||
- s\_first\_meta\_bg
|
||||
- First metablock block group, if the meta\_bg feature is enabled.
|
||||
* - 0x108
|
||||
- \_\_le32
|
||||
- s\_mkfs\_time
|
||||
- When the filesystem was created, in seconds since the epoch.
|
||||
* - 0x10C
|
||||
- \_\_le32
|
||||
- s\_jnl\_blocks[17]
|
||||
- Backup copy of the journal inode's ``i_block[]`` array in the first 15
|
||||
elements and i\_size\_high and i\_size in the 16th and 17th elements,
|
||||
respectively.
|
||||
* -
|
||||
-
|
||||
-
|
||||
- 64bit support is valid only if EXT4_FEATURE_COMPAT_64BIT is set.
|
||||
* - 0x150
|
||||
- \_\_le32
|
||||
- s\_blocks\_count\_hi
|
||||
- High 32-bits of the block count.
|
||||
* - 0x154
|
||||
- \_\_le32
|
||||
- s\_r\_blocks\_count\_hi
|
||||
- High 32-bits of the reserved block count.
|
||||
* - 0x158
|
||||
- \_\_le32
|
||||
- s\_free\_blocks\_count\_hi
|
||||
- High 32-bits of the free block count.
|
||||
* - 0x15C
|
||||
- \_\_le16
|
||||
- s\_min\_extra\_isize
|
||||
- All inodes have at least # bytes.
|
||||
* - 0x15E
|
||||
- \_\_le16
|
||||
- s\_want\_extra\_isize
|
||||
- New inodes should reserve # bytes.
|
||||
* - 0x160
|
||||
- \_\_le32
|
||||
- s\_flags
|
||||
- Miscellaneous flags. See the super_flags_ table for more info.
|
||||
* - 0x164
|
||||
- \_\_le16
|
||||
- s\_raid\_stride
|
||||
- RAID stride. This is the number of logical blocks read from or written
|
||||
to the disk before moving to the next disk. This affects the placement
|
||||
of filesystem metadata, which will hopefully make RAID storage faster.
|
||||
* - 0x166
|
||||
- \_\_le16
|
||||
- s\_mmp\_interval
|
||||
- #. seconds to wait in multi-mount prevention (MMP) checking. In theory,
|
||||
MMP is a mechanism to record in the superblock which host and device
|
||||
have mounted the filesystem, in order to prevent multiple mounts. This
|
||||
feature does not seem to be implemented...
|
||||
* - 0x168
|
||||
- \_\_le64
|
||||
- s\_mmp\_block
|
||||
- Block # for multi-mount protection data.
|
||||
* - 0x170
|
||||
- \_\_le32
|
||||
- s\_raid\_stripe\_width
|
||||
- RAID stripe width. This is the number of logical blocks read from or
|
||||
written to the disk before coming back to the current disk. This is used
|
||||
by the block allocator to try to reduce the number of read-modify-write
|
||||
operations in a RAID5/6.
|
||||
* - 0x174
|
||||
- \_\_u8
|
||||
- s\_log\_groups\_per\_flex
|
||||
- Size of a flexible block group is 2 ^ ``s_log_groups_per_flex``.
|
||||
* - 0x175
|
||||
- \_\_u8
|
||||
- s\_checksum\_type
|
||||
- Metadata checksum algorithm type. The only valid value is 1 (crc32c).
|
||||
* - 0x176
|
||||
- \_\_le16
|
||||
- s\_reserved\_pad
|
||||
-
|
||||
* - 0x178
|
||||
- \_\_le64
|
||||
- s\_kbytes\_written
|
||||
- Number of KiB written to this filesystem over its lifetime.
|
||||
* - 0x180
|
||||
- \_\_le32
|
||||
- s\_snapshot\_inum
|
||||
- inode number of active snapshot. (Not used in e2fsprogs/Linux.)
|
||||
* - 0x184
|
||||
- \_\_le32
|
||||
- s\_snapshot\_id
|
||||
- Sequential ID of active snapshot. (Not used in e2fsprogs/Linux.)
|
||||
* - 0x188
|
||||
- \_\_le64
|
||||
- s\_snapshot\_r\_blocks\_count
|
||||
- Number of blocks reserved for active snapshot's future use. (Not used in
|
||||
e2fsprogs/Linux.)
|
||||
* - 0x190
|
||||
- \_\_le32
|
||||
- s\_snapshot\_list
|
||||
- inode number of the head of the on-disk snapshot list. (Not used in
|
||||
e2fsprogs/Linux.)
|
||||
* - 0x194
|
||||
- \_\_le32
|
||||
- s\_error\_count
|
||||
- Number of errors seen.
|
||||
* - 0x198
|
||||
- \_\_le32
|
||||
- s\_first\_error\_time
|
||||
- First time an error happened, in seconds since the epoch.
|
||||
* - 0x19C
|
||||
- \_\_le32
|
||||
- s\_first\_error\_ino
|
||||
- inode involved in first error.
|
||||
* - 0x1A0
|
||||
- \_\_le64
|
||||
- s\_first\_error\_block
|
||||
- Number of block involved of first error.
|
||||
* - 0x1A8
|
||||
- \_\_u8
|
||||
- s\_first\_error\_func[32]
|
||||
- Name of function where the error happened.
|
||||
* - 0x1C8
|
||||
- \_\_le32
|
||||
- s\_first\_error\_line
|
||||
- Line number where error happened.
|
||||
* - 0x1CC
|
||||
- \_\_le32
|
||||
- s\_last\_error\_time
|
||||
- Time of most recent error, in seconds since the epoch.
|
||||
* - 0x1D0
|
||||
- \_\_le32
|
||||
- s\_last\_error\_ino
|
||||
- inode involved in most recent error.
|
||||
* - 0x1D4
|
||||
- \_\_le32
|
||||
- s\_last\_error\_line
|
||||
- Line number where most recent error happened.
|
||||
* - 0x1D8
|
||||
- \_\_le64
|
||||
- s\_last\_error\_block
|
||||
- Number of block involved in most recent error.
|
||||
* - 0x1E0
|
||||
- \_\_u8
|
||||
- s\_last\_error\_func[32]
|
||||
- Name of function where the most recent error happened.
|
||||
* - 0x200
|
||||
- \_\_u8
|
||||
- s\_mount\_opts[64]
|
||||
- ASCIIZ string of mount options.
|
||||
* - 0x240
|
||||
- \_\_le32
|
||||
- s\_usr\_quota\_inum
|
||||
- Inode number of user `quota <quota>`__ file.
|
||||
* - 0x244
|
||||
- \_\_le32
|
||||
- s\_grp\_quota\_inum
|
||||
- Inode number of group `quota <quota>`__ file.
|
||||
* - 0x248
|
||||
- \_\_le32
|
||||
- s\_overhead\_blocks
|
||||
- Overhead blocks/clusters in fs. (Huh? This field is always zero, which
|
||||
means that the kernel calculates it dynamically.)
|
||||
* - 0x24C
|
||||
- \_\_le32
|
||||
- s\_backup\_bgs[2]
|
||||
- Block groups containing superblock backups (if sparse\_super2)
|
||||
* - 0x254
|
||||
- \_\_u8
|
||||
- s\_encrypt\_algos[4]
|
||||
- Encryption algorithms in use. There can be up to four algorithms in use
|
||||
at any time; valid algorithm codes are given in the super_encrypt_ table
|
||||
below.
|
||||
* - 0x258
|
||||
- \_\_u8
|
||||
- s\_encrypt\_pw\_salt[16]
|
||||
- Salt for the string2key algorithm for encryption.
|
||||
* - 0x268
|
||||
- \_\_le32
|
||||
- s\_lpf\_ino
|
||||
- Inode number of lost+found
|
||||
* - 0x26C
|
||||
- \_\_le32
|
||||
- s\_prj\_quota\_inum
|
||||
- Inode that tracks project quotas.
|
||||
* - 0x270
|
||||
- \_\_le32
|
||||
- s\_checksum\_seed
|
||||
- Checksum seed used for metadata\_csum calculations. This value is
|
||||
crc32c(~0, $orig\_fs\_uuid).
|
||||
* - 0x274
|
||||
- \_\_u8
|
||||
- s\_wtime_hi
|
||||
- Upper 8 bits of the s_wtime field.
|
||||
* - 0x275
|
||||
- \_\_u8
|
||||
- s\_wtime_hi
|
||||
- Upper 8 bits of the s_mtime field.
|
||||
* - 0x276
|
||||
- \_\_u8
|
||||
- s\_mkfs_time_hi
|
||||
- Upper 8 bits of the s_mkfs_time field.
|
||||
* - 0x277
|
||||
- \_\_u8
|
||||
- s\_lastcheck_hi
|
||||
- Upper 8 bits of the s_lastcheck_hi field.
|
||||
* - 0x278
|
||||
- \_\_u8
|
||||
- s\_first_error_time_hi
|
||||
- Upper 8 bits of the s_first_error_time_hi field.
|
||||
* - 0x279
|
||||
- \_\_u8
|
||||
- s\_last_error_time_hi
|
||||
- Upper 8 bits of the s_last_error_time_hi field.
|
||||
* - 0x27A
|
||||
- \_\_u8[2]
|
||||
- s\_pad
|
||||
- Zero padding.
|
||||
* - 0x27C
|
||||
- \_\_le32
|
||||
- s\_reserved[96]
|
||||
- Padding to the end of the block.
|
||||
* - 0x3FC
|
||||
- \_\_le32
|
||||
- s\_checksum
|
||||
- Superblock checksum.
|
||||
|
||||
.. _super_state:
|
||||
|
||||
The superblock state is some combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0001
|
||||
- Cleanly umounted
|
||||
* - 0x0002
|
||||
- Errors detected
|
||||
* - 0x0004
|
||||
- Orphans being recovered
|
||||
|
||||
.. _super_errors:
|
||||
|
||||
The superblock error policy is one of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 1
|
||||
- Continue
|
||||
* - 2
|
||||
- Remount read-only
|
||||
* - 3
|
||||
- Panic
|
||||
|
||||
.. _super_creator:
|
||||
|
||||
The filesystem creator is one of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0
|
||||
- Linux
|
||||
* - 1
|
||||
- Hurd
|
||||
* - 2
|
||||
- Masix
|
||||
* - 3
|
||||
- FreeBSD
|
||||
* - 4
|
||||
- Lites
|
||||
|
||||
.. _super_revision:
|
||||
|
||||
The superblock revision is one of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0
|
||||
- Original format
|
||||
* - 1
|
||||
- v2 format w/ dynamic inode sizes
|
||||
|
||||
Note that ``EXT4_DYNAMIC_REV`` refers to a revision 1 or newer filesystem.
|
||||
|
||||
.. _super_compat:
|
||||
|
||||
The superblock compatible features field is a combination of any of the
|
||||
following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Directory preallocation (COMPAT\_DIR\_PREALLOC).
|
||||
* - 0x2
|
||||
- “imagic inodes”. Not clear from the code what this does
|
||||
(COMPAT\_IMAGIC\_INODES).
|
||||
* - 0x4
|
||||
- Has a journal (COMPAT\_HAS\_JOURNAL).
|
||||
* - 0x8
|
||||
- Supports extended attributes (COMPAT\_EXT\_ATTR).
|
||||
* - 0x10
|
||||
- Has reserved GDT blocks for filesystem expansion
|
||||
(COMPAT\_RESIZE\_INODE). Requires RO\_COMPAT\_SPARSE\_SUPER.
|
||||
* - 0x20
|
||||
- Has directory indices (COMPAT\_DIR\_INDEX).
|
||||
* - 0x40
|
||||
- “Lazy BG”. Not in Linux kernel, seems to have been for uninitialized
|
||||
block groups? (COMPAT\_LAZY\_BG)
|
||||
* - 0x80
|
||||
- “Exclude inode”. Not used. (COMPAT\_EXCLUDE\_INODE).
|
||||
* - 0x100
|
||||
- “Exclude bitmap”. Seems to be used to indicate the presence of
|
||||
snapshot-related exclude bitmaps? Not defined in kernel or used in
|
||||
e2fsprogs (COMPAT\_EXCLUDE\_BITMAP).
|
||||
* - 0x200
|
||||
- Sparse Super Block, v2. If this flag is set, the SB field s\_backup\_bgs
|
||||
points to the two block groups that contain backup superblocks
|
||||
(COMPAT\_SPARSE\_SUPER2).
|
||||
|
||||
.. _super_incompat:
|
||||
|
||||
The superblock incompatible features field is a combination of any of the
|
||||
following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Compression (INCOMPAT\_COMPRESSION).
|
||||
* - 0x2
|
||||
- Directory entries record the file type. See ext4\_dir\_entry\_2 below
|
||||
(INCOMPAT\_FILETYPE).
|
||||
* - 0x4
|
||||
- Filesystem needs recovery (INCOMPAT\_RECOVER).
|
||||
* - 0x8
|
||||
- Filesystem has a separate journal device (INCOMPAT\_JOURNAL\_DEV).
|
||||
* - 0x10
|
||||
- Meta block groups. See the earlier discussion of this feature
|
||||
(INCOMPAT\_META\_BG).
|
||||
* - 0x40
|
||||
- Files in this filesystem use extents (INCOMPAT\_EXTENTS).
|
||||
* - 0x80
|
||||
- Enable a filesystem size of 2^64 blocks (INCOMPAT\_64BIT).
|
||||
* - 0x100
|
||||
- Multiple mount protection. Not implemented (INCOMPAT\_MMP).
|
||||
* - 0x200
|
||||
- Flexible block groups. See the earlier discussion of this feature
|
||||
(INCOMPAT\_FLEX\_BG).
|
||||
* - 0x400
|
||||
- Inodes can be used to store large extended attribute values
|
||||
(INCOMPAT\_EA\_INODE).
|
||||
* - 0x1000
|
||||
- Data in directory entry (INCOMPAT\_DIRDATA). (Not implemented?)
|
||||
* - 0x2000
|
||||
- Metadata checksum seed is stored in the superblock. This feature enables
|
||||
the administrator to change the UUID of a metadata\_csum filesystem
|
||||
while the filesystem is mounted; without it, the checksum definition
|
||||
requires all metadata blocks to be rewritten (INCOMPAT\_CSUM\_SEED).
|
||||
* - 0x4000
|
||||
- Large directory >2GB or 3-level htree (INCOMPAT\_LARGEDIR). Prior to
|
||||
this feature, directories could not be larger than 4GiB and could not
|
||||
have an htree more than 2 levels deep. If this feature is enabled,
|
||||
directories can be larger than 4GiB and have a maximum htree depth of 3.
|
||||
* - 0x8000
|
||||
- Data in inode (INCOMPAT\_INLINE\_DATA).
|
||||
* - 0x10000
|
||||
- Encrypted inodes are present on the filesystem. (INCOMPAT\_ENCRYPT).
|
||||
|
||||
.. _super_rocompat:
|
||||
|
||||
The superblock read-only compatible features field is a combination of any of
|
||||
the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x1
|
||||
- Sparse superblocks. See the earlier discussion of this feature
|
||||
(RO\_COMPAT\_SPARSE\_SUPER).
|
||||
* - 0x2
|
||||
- This filesystem has been used to store a file greater than 2GiB
|
||||
(RO\_COMPAT\_LARGE\_FILE).
|
||||
* - 0x4
|
||||
- Not used in kernel or e2fsprogs (RO\_COMPAT\_BTREE\_DIR).
|
||||
* - 0x8
|
||||
- This filesystem has files whose sizes are represented in units of
|
||||
logical blocks, not 512-byte sectors. This implies a very large file
|
||||
indeed! (RO\_COMPAT\_HUGE\_FILE)
|
||||
* - 0x10
|
||||
- Group descriptors have checksums. In addition to detecting corruption,
|
||||
this is useful for lazy formatting with uninitialized groups
|
||||
(RO\_COMPAT\_GDT\_CSUM).
|
||||
* - 0x20
|
||||
- Indicates that the old ext3 32,000 subdirectory limit no longer applies
|
||||
(RO\_COMPAT\_DIR\_NLINK). A directory's i\_links\_count will be set to 1
|
||||
if it is incremented past 64,999.
|
||||
* - 0x40
|
||||
- Indicates that large inodes exist on this filesystem
|
||||
(RO\_COMPAT\_EXTRA\_ISIZE).
|
||||
* - 0x80
|
||||
- This filesystem has a snapshot (RO\_COMPAT\_HAS\_SNAPSHOT).
|
||||
* - 0x100
|
||||
- `Quota <Quota>`__ (RO\_COMPAT\_QUOTA).
|
||||
* - 0x200
|
||||
- This filesystem supports “bigalloc”, which means that file extents are
|
||||
tracked in units of clusters (of blocks) instead of blocks
|
||||
(RO\_COMPAT\_BIGALLOC).
|
||||
* - 0x400
|
||||
- This filesystem supports metadata checksumming.
|
||||
(RO\_COMPAT\_METADATA\_CSUM; implies RO\_COMPAT\_GDT\_CSUM, though
|
||||
GDT\_CSUM must not be set)
|
||||
* - 0x800
|
||||
- Filesystem supports replicas. This feature is neither in the kernel nor
|
||||
e2fsprogs. (RO\_COMPAT\_REPLICA)
|
||||
* - 0x1000
|
||||
- Read-only filesystem image; the kernel will not mount this image
|
||||
read-write and most tools will refuse to write to the image.
|
||||
(RO\_COMPAT\_READONLY)
|
||||
* - 0x2000
|
||||
- Filesystem tracks project quotas. (RO\_COMPAT\_PROJECT)
|
||||
|
||||
.. _super_def_hash:
|
||||
|
||||
The ``s_def_hash_version`` field is one of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0
|
||||
- Legacy.
|
||||
* - 0x1
|
||||
- Half MD4.
|
||||
* - 0x2
|
||||
- Tea.
|
||||
* - 0x3
|
||||
- Legacy, unsigned.
|
||||
* - 0x4
|
||||
- Half MD4, unsigned.
|
||||
* - 0x5
|
||||
- Tea, unsigned.
|
||||
|
||||
.. _super_mountopts:
|
||||
|
||||
The ``s_default_mount_opts`` field is any combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0001
|
||||
- Print debugging info upon (re)mount. (EXT4\_DEFM\_DEBUG)
|
||||
* - 0x0002
|
||||
- New files take the gid of the containing directory (instead of the fsgid
|
||||
of the current process). (EXT4\_DEFM\_BSDGROUPS)
|
||||
* - 0x0004
|
||||
- Support userspace-provided extended attributes. (EXT4\_DEFM\_XATTR\_USER)
|
||||
* - 0x0008
|
||||
- Support POSIX access control lists (ACLs). (EXT4\_DEFM\_ACL)
|
||||
* - 0x0010
|
||||
- Do not support 32-bit UIDs. (EXT4\_DEFM\_UID16)
|
||||
* - 0x0020
|
||||
- All data and metadata are commited to the journal.
|
||||
(EXT4\_DEFM\_JMODE\_DATA)
|
||||
* - 0x0040
|
||||
- All data are flushed to the disk before metadata are committed to the
|
||||
journal. (EXT4\_DEFM\_JMODE\_ORDERED)
|
||||
* - 0x0060
|
||||
- Data ordering is not preserved; data may be written after the metadata
|
||||
has been written. (EXT4\_DEFM\_JMODE\_WBACK)
|
||||
* - 0x0100
|
||||
- Disable write flushes. (EXT4\_DEFM\_NOBARRIER)
|
||||
* - 0x0200
|
||||
- Track which blocks in a filesystem are metadata and therefore should not
|
||||
be used as data blocks. This option will be enabled by default on 3.18,
|
||||
hopefully. (EXT4\_DEFM\_BLOCK\_VALIDITY)
|
||||
* - 0x0400
|
||||
- Enable DISCARD support, where the storage device is told about blocks
|
||||
becoming unused. (EXT4\_DEFM\_DISCARD)
|
||||
* - 0x0800
|
||||
- Disable delayed allocation. (EXT4\_DEFM\_NODELALLOC)
|
||||
|
||||
.. _super_flags:
|
||||
|
||||
The ``s_flags`` field is any combination of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0x0001
|
||||
- Signed directory hash in use.
|
||||
* - 0x0002
|
||||
- Unsigned directory hash in use.
|
||||
* - 0x0004
|
||||
- To test development code.
|
||||
|
||||
.. _super_encrypt:
|
||||
|
||||
The ``s_encrypt_algos`` list can contain any of the following:
|
||||
|
||||
.. list-table::
|
||||
:widths: 1 79
|
||||
:header-rows: 1
|
||||
|
||||
* - Value
|
||||
- Description
|
||||
* - 0
|
||||
- Invalid algorithm (ENCRYPTION\_MODE\_INVALID).
|
||||
* - 1
|
||||
- 256-bit AES in XTS mode (ENCRYPTION\_MODE\_AES\_256\_XTS).
|
||||
* - 2
|
||||
- 256-bit AES in GCM mode (ENCRYPTION\_MODE\_AES\_256\_GCM).
|
||||
* - 3
|
||||
- 256-bit AES in CBC mode (ENCRYPTION\_MODE\_AES\_256\_CBC).
|
||||
|
||||
Total size of the superblock is 1024 bytes.
|
@ -102,6 +102,17 @@ implementation.
|
||||
|
||||
sh/index
|
||||
|
||||
Filesystem Documentation
|
||||
------------------------
|
||||
|
||||
The documentation in this section are provided by specific filesystem
|
||||
subprojects.
|
||||
|
||||
.. toctree::
|
||||
:maxdepth: 2
|
||||
|
||||
filesystems/ext4/index
|
||||
|
||||
Korean translations
|
||||
-------------------
|
||||
|
||||
|
10
fs/dax.c
10
fs/dax.c
@ -566,7 +566,8 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
|
||||
if (index >= end)
|
||||
break;
|
||||
|
||||
if (!radix_tree_exceptional_entry(pvec_ent))
|
||||
if (WARN_ON_ONCE(
|
||||
!radix_tree_exceptional_entry(pvec_ent)))
|
||||
continue;
|
||||
|
||||
xa_lock_irq(&mapping->i_pages);
|
||||
@ -578,6 +579,13 @@ struct page *dax_layout_busy_page(struct address_space *mapping)
|
||||
if (page)
|
||||
break;
|
||||
}
|
||||
|
||||
/*
|
||||
* We don't expect normal struct page entries to exist in our
|
||||
* tree, but we keep these pagevec calls so that this code is
|
||||
* consistent with the common pattern for handling pagevecs
|
||||
* throughout the kernel.
|
||||
*/
|
||||
pagevec_remove_exceptionals(&pvec);
|
||||
pagevec_release(&pvec);
|
||||
index++;
|
||||
|
@ -426,9 +426,9 @@ ext4_read_block_bitmap_nowait(struct super_block *sb, ext4_group_t block_group)
|
||||
}
|
||||
bh = sb_getblk(sb, bitmap_blk);
|
||||
if (unlikely(!bh)) {
|
||||
ext4_error(sb, "Cannot get buffer for block bitmap - "
|
||||
"block_group = %u, block_bitmap = %llu",
|
||||
block_group, bitmap_blk);
|
||||
ext4_warning(sb, "Cannot get buffer for block bitmap - "
|
||||
"block_group = %u, block_bitmap = %llu",
|
||||
block_group, bitmap_blk);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
|
||||
|
@ -789,17 +789,16 @@ struct move_extent {
|
||||
* affected filesystem before 2242.
|
||||
*/
|
||||
|
||||
static inline __le32 ext4_encode_extra_time(struct timespec *time)
|
||||
static inline __le32 ext4_encode_extra_time(struct timespec64 *time)
|
||||
{
|
||||
u32 extra = sizeof(time->tv_sec) > 4 ?
|
||||
((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK : 0;
|
||||
u32 extra =((time->tv_sec - (s32)time->tv_sec) >> 32) & EXT4_EPOCH_MASK;
|
||||
return cpu_to_le32(extra | (time->tv_nsec << EXT4_EPOCH_BITS));
|
||||
}
|
||||
|
||||
static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
|
||||
static inline void ext4_decode_extra_time(struct timespec64 *time,
|
||||
__le32 extra)
|
||||
{
|
||||
if (unlikely(sizeof(time->tv_sec) > 4 &&
|
||||
(extra & cpu_to_le32(EXT4_EPOCH_MASK)))) {
|
||||
if (unlikely(extra & cpu_to_le32(EXT4_EPOCH_MASK))) {
|
||||
|
||||
#if 1
|
||||
/* Handle legacy encoding of pre-1970 dates with epoch
|
||||
@ -821,9 +820,8 @@ static inline void ext4_decode_extra_time(struct timespec *time, __le32 extra)
|
||||
do { \
|
||||
(raw_inode)->xtime = cpu_to_le32((inode)->xtime.tv_sec); \
|
||||
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) {\
|
||||
struct timespec ts = timespec64_to_timespec((inode)->xtime); \
|
||||
(raw_inode)->xtime ## _extra = \
|
||||
ext4_encode_extra_time(&ts); \
|
||||
ext4_encode_extra_time(&(inode)->xtime); \
|
||||
} \
|
||||
} while (0)
|
||||
|
||||
@ -840,10 +838,8 @@ do { \
|
||||
do { \
|
||||
(inode)->xtime.tv_sec = (signed)le32_to_cpu((raw_inode)->xtime); \
|
||||
if (EXT4_FITS_IN_INODE(raw_inode, EXT4_I(inode), xtime ## _extra)) { \
|
||||
struct timespec ts = timespec64_to_timespec((inode)->xtime); \
|
||||
ext4_decode_extra_time(&ts, \
|
||||
ext4_decode_extra_time(&(inode)->xtime, \
|
||||
raw_inode->xtime ## _extra); \
|
||||
(inode)->xtime = timespec_to_timespec64(ts); \
|
||||
} \
|
||||
else \
|
||||
(inode)->xtime.tv_nsec = 0; \
|
||||
@ -993,9 +989,9 @@ struct ext4_inode_info {
|
||||
|
||||
/*
|
||||
* File creation time. Its function is same as that of
|
||||
* struct timespec i_{a,c,m}time in the generic inode.
|
||||
* struct timespec64 i_{a,c,m}time in the generic inode.
|
||||
*/
|
||||
struct timespec i_crtime;
|
||||
struct timespec64 i_crtime;
|
||||
|
||||
/* mballoc */
|
||||
struct list_head i_prealloc_list;
|
||||
@ -1299,7 +1295,14 @@ struct ext4_super_block {
|
||||
__le32 s_lpf_ino; /* Location of the lost+found inode */
|
||||
__le32 s_prj_quota_inum; /* inode for tracking project quota */
|
||||
__le32 s_checksum_seed; /* crc32c(uuid) if csum_seed set */
|
||||
__le32 s_reserved[98]; /* Padding to the end of the block */
|
||||
__u8 s_wtime_hi;
|
||||
__u8 s_mtime_hi;
|
||||
__u8 s_mkfs_time_hi;
|
||||
__u8 s_lastcheck_hi;
|
||||
__u8 s_first_error_time_hi;
|
||||
__u8 s_last_error_time_hi;
|
||||
__u8 s_pad[2];
|
||||
__le32 s_reserved[96]; /* Padding to the end of the block */
|
||||
__le32 s_checksum; /* crc32c(superblock) */
|
||||
};
|
||||
|
||||
@ -2456,6 +2459,7 @@ extern int ext4_get_inode_loc(struct inode *, struct ext4_iloc *);
|
||||
extern int ext4_inode_attach_jinode(struct inode *inode);
|
||||
extern int ext4_can_truncate(struct inode *inode);
|
||||
extern int ext4_truncate(struct inode *);
|
||||
extern int ext4_break_layouts(struct inode *);
|
||||
extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
|
||||
extern int ext4_truncate_restart_trans(handle_t *, struct inode *, int nblocks);
|
||||
extern void ext4_set_inode_flags(struct inode *);
|
||||
|
@ -4826,6 +4826,13 @@ static long ext4_zero_range(struct file *file, loff_t offset,
|
||||
* released from page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
goto out_mutex;
|
||||
}
|
||||
|
||||
ret = ext4_update_disksize_before_punch(inode, offset, len);
|
||||
if (ret) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
@ -5499,6 +5506,11 @@ int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret)
|
||||
goto out_mmap;
|
||||
|
||||
/*
|
||||
* Need to round down offset to be aligned with page size boundary
|
||||
* for page size > block size.
|
||||
@ -5647,6 +5659,11 @@ int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret)
|
||||
goto out_mmap;
|
||||
|
||||
/*
|
||||
* Need to round down to align start offset to page size boundary
|
||||
* for page size > block size.
|
||||
|
@ -138,9 +138,9 @@ ext4_read_inode_bitmap(struct super_block *sb, ext4_group_t block_group)
|
||||
}
|
||||
bh = sb_getblk(sb, bitmap_blk);
|
||||
if (unlikely(!bh)) {
|
||||
ext4_error(sb, "Cannot read inode bitmap - "
|
||||
"block_group = %u, inode_bitmap = %llu",
|
||||
block_group, bitmap_blk);
|
||||
ext4_warning(sb, "Cannot read inode bitmap - "
|
||||
"block_group = %u, inode_bitmap = %llu",
|
||||
block_group, bitmap_blk);
|
||||
return ERR_PTR(-ENOMEM);
|
||||
}
|
||||
if (bitmap_uptodate(bh))
|
||||
@ -1086,7 +1086,7 @@ got:
|
||||
/* This is the optimal IO size (for stat), not the fs block size */
|
||||
inode->i_blocks = 0;
|
||||
inode->i_mtime = inode->i_atime = inode->i_ctime = current_time(inode);
|
||||
ei->i_crtime = timespec64_to_timespec(inode->i_mtime);
|
||||
ei->i_crtime = inode->i_mtime;
|
||||
|
||||
memset(ei->i_data, 0, sizeof(ei->i_data));
|
||||
ei->i_dir_start_lookup = 0;
|
||||
|
@ -317,7 +317,7 @@ stop_handle:
|
||||
* (Well, we could do this if we need to, but heck - it works)
|
||||
*/
|
||||
ext4_orphan_del(handle, inode);
|
||||
EXT4_I(inode)->i_dtime = get_seconds();
|
||||
EXT4_I(inode)->i_dtime = (__u32)ktime_get_real_seconds();
|
||||
|
||||
/*
|
||||
* One subtle ordering requirement: if anything has gone wrong
|
||||
@ -4191,6 +4191,39 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset,
|
||||
return 0;
|
||||
}
|
||||
|
||||
static void ext4_wait_dax_page(struct ext4_inode_info *ei, bool *did_unlock)
|
||||
{
|
||||
*did_unlock = true;
|
||||
up_write(&ei->i_mmap_sem);
|
||||
schedule();
|
||||
down_write(&ei->i_mmap_sem);
|
||||
}
|
||||
|
||||
int ext4_break_layouts(struct inode *inode)
|
||||
{
|
||||
struct ext4_inode_info *ei = EXT4_I(inode);
|
||||
struct page *page;
|
||||
bool retry;
|
||||
int error;
|
||||
|
||||
if (WARN_ON_ONCE(!rwsem_is_locked(&ei->i_mmap_sem)))
|
||||
return -EINVAL;
|
||||
|
||||
do {
|
||||
retry = false;
|
||||
page = dax_layout_busy_page(inode->i_mapping);
|
||||
if (!page)
|
||||
return 0;
|
||||
|
||||
error = ___wait_var_event(&page->_refcount,
|
||||
atomic_read(&page->_refcount) == 1,
|
||||
TASK_INTERRUPTIBLE, 0, 0,
|
||||
ext4_wait_dax_page(ei, &retry));
|
||||
} while (error == 0 && retry);
|
||||
|
||||
return error;
|
||||
}
|
||||
|
||||
/*
|
||||
* ext4_punch_hole: punches a hole in a file by releasing the blocks
|
||||
* associated with the given offset and length
|
||||
@ -4264,6 +4297,11 @@ int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
|
||||
* page cache.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
|
||||
ret = ext4_break_layouts(inode);
|
||||
if (ret)
|
||||
goto out_dio;
|
||||
|
||||
first_block_offset = round_up(offset, sb->s_blocksize);
|
||||
last_block_offset = round_down((offset + length), sb->s_blocksize) - 1;
|
||||
|
||||
@ -4944,17 +4982,14 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
|
||||
ret = -EFSCORRUPTED;
|
||||
goto bad_inode;
|
||||
} else if (!ext4_has_inline_data(inode)) {
|
||||
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) {
|
||||
if ((S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
|
||||
(S_ISLNK(inode->i_mode) &&
|
||||
!ext4_inode_is_fast_symlink(inode))))
|
||||
/* Validate extent which is part of inode */
|
||||
/* validate the block references in the inode */
|
||||
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
|
||||
(S_ISLNK(inode->i_mode) &&
|
||||
!ext4_inode_is_fast_symlink(inode))) {
|
||||
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
|
||||
ret = ext4_ext_check_inode(inode);
|
||||
} else if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
|
||||
(S_ISLNK(inode->i_mode) &&
|
||||
!ext4_inode_is_fast_symlink(inode))) {
|
||||
/* Validate block references which are part of inode */
|
||||
ret = ext4_ind_check_inode(inode);
|
||||
else
|
||||
ret = ext4_ind_check_inode(inode);
|
||||
}
|
||||
}
|
||||
if (ret)
|
||||
@ -5553,6 +5588,14 @@ int ext4_setattr(struct dentry *dentry, struct iattr *attr)
|
||||
ext4_wait_for_tail_page_commit(inode);
|
||||
}
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
|
||||
rc = ext4_break_layouts(inode);
|
||||
if (rc) {
|
||||
up_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
error = rc;
|
||||
goto err_out;
|
||||
}
|
||||
|
||||
/*
|
||||
* Truncate pagecache after we've waited for commit
|
||||
* in data=journal mode to make pages freeable.
|
||||
|
@ -14,6 +14,7 @@
|
||||
#include <linux/log2.h>
|
||||
#include <linux/module.h>
|
||||
#include <linux/slab.h>
|
||||
#include <linux/nospec.h>
|
||||
#include <linux/backing-dev.h>
|
||||
#include <trace/events/ext4.h>
|
||||
|
||||
@ -2140,7 +2141,8 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac)
|
||||
* This should tell if fe_len is exactly power of 2
|
||||
*/
|
||||
if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0)
|
||||
ac->ac_2order = i - 1;
|
||||
ac->ac_2order = array_index_nospec(i - 1,
|
||||
sb->s_blocksize_bits + 2);
|
||||
}
|
||||
|
||||
/* if stream allocation is enabled, use global goal */
|
||||
@ -3799,7 +3801,6 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
|
||||
ext4_group_t group;
|
||||
ext4_grpblk_t bit;
|
||||
unsigned long long grp_blk_start;
|
||||
int err = 0;
|
||||
int free = 0;
|
||||
|
||||
BUG_ON(pa->pa_deleted == 0);
|
||||
@ -3840,7 +3841,7 @@ ext4_mb_release_inode_pa(struct ext4_buddy *e4b, struct buffer_head *bitmap_bh,
|
||||
}
|
||||
atomic_add(free, &sbi->s_mb_discarded);
|
||||
|
||||
return err;
|
||||
return 0;
|
||||
}
|
||||
|
||||
static noinline_for_stack int
|
||||
|
@ -147,7 +147,7 @@ static int kmmpd(void *data)
|
||||
|
||||
mmp_block = le64_to_cpu(es->s_mmp_block);
|
||||
mmp = (struct mmp_struct *)(bh->b_data);
|
||||
mmp->mmp_time = cpu_to_le64(get_seconds());
|
||||
mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
|
||||
/*
|
||||
* Start with the higher mmp_check_interval and reduce it if
|
||||
* the MMP block is being updated on time.
|
||||
@ -165,7 +165,7 @@ static int kmmpd(void *data)
|
||||
seq = 1;
|
||||
|
||||
mmp->mmp_seq = cpu_to_le32(seq);
|
||||
mmp->mmp_time = cpu_to_le64(get_seconds());
|
||||
mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
|
||||
last_update_time = jiffies;
|
||||
|
||||
retval = write_mmp_block(sb, bh);
|
||||
@ -241,7 +241,7 @@ static int kmmpd(void *data)
|
||||
* Unmount seems to be clean.
|
||||
*/
|
||||
mmp->mmp_seq = cpu_to_le32(EXT4_MMP_SEQ_CLEAN);
|
||||
mmp->mmp_time = cpu_to_le64(get_seconds());
|
||||
mmp->mmp_time = cpu_to_le64(ktime_get_real_seconds());
|
||||
|
||||
retval = write_mmp_block(sb, bh);
|
||||
|
||||
|
@ -134,9 +134,7 @@ mext_page_double_lock(struct inode *inode1, struct inode *inode2,
|
||||
mapping[0] = inode1->i_mapping;
|
||||
mapping[1] = inode2->i_mapping;
|
||||
} else {
|
||||
pgoff_t tmp = index1;
|
||||
index1 = index2;
|
||||
index2 = tmp;
|
||||
swap(index1, index2);
|
||||
mapping[0] = inode2->i_mapping;
|
||||
mapping[1] = inode1->i_mapping;
|
||||
}
|
||||
|
@ -1398,6 +1398,7 @@ static struct buffer_head * ext4_find_entry (struct inode *dir,
|
||||
goto cleanup_and_exit;
|
||||
dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, "
|
||||
"falling back\n"));
|
||||
ret = NULL;
|
||||
}
|
||||
nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb);
|
||||
if (!nblocks) {
|
||||
|
@ -312,6 +312,24 @@ void ext4_itable_unused_set(struct super_block *sb,
|
||||
bg->bg_itable_unused_hi = cpu_to_le16(count >> 16);
|
||||
}
|
||||
|
||||
static void __ext4_update_tstamp(__le32 *lo, __u8 *hi)
|
||||
{
|
||||
time64_t now = ktime_get_real_seconds();
|
||||
|
||||
now = clamp_val(now, 0, (1ull << 40) - 1);
|
||||
|
||||
*lo = cpu_to_le32(lower_32_bits(now));
|
||||
*hi = upper_32_bits(now);
|
||||
}
|
||||
|
||||
static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi)
|
||||
{
|
||||
return ((time64_t)(*hi) << 32) + le32_to_cpu(*lo);
|
||||
}
|
||||
#define ext4_update_tstamp(es, tstamp) \
|
||||
__ext4_update_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
|
||||
#define ext4_get_tstamp(es, tstamp) \
|
||||
__ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi)
|
||||
|
||||
static void __save_error_info(struct super_block *sb, const char *func,
|
||||
unsigned int line)
|
||||
@ -322,11 +340,12 @@ static void __save_error_info(struct super_block *sb, const char *func,
|
||||
if (bdev_read_only(sb->s_bdev))
|
||||
return;
|
||||
es->s_state |= cpu_to_le16(EXT4_ERROR_FS);
|
||||
es->s_last_error_time = cpu_to_le32(get_seconds());
|
||||
ext4_update_tstamp(es, s_last_error_time);
|
||||
strncpy(es->s_last_error_func, func, sizeof(es->s_last_error_func));
|
||||
es->s_last_error_line = cpu_to_le32(line);
|
||||
if (!es->s_first_error_time) {
|
||||
es->s_first_error_time = es->s_last_error_time;
|
||||
es->s_first_error_time_hi = es->s_last_error_time_hi;
|
||||
strncpy(es->s_first_error_func, func,
|
||||
sizeof(es->s_first_error_func));
|
||||
es->s_first_error_line = cpu_to_le32(line);
|
||||
@ -776,26 +795,26 @@ void ext4_mark_group_bitmap_corrupted(struct super_block *sb,
|
||||
struct ext4_sb_info *sbi = EXT4_SB(sb);
|
||||
struct ext4_group_info *grp = ext4_get_group_info(sb, group);
|
||||
struct ext4_group_desc *gdp = ext4_get_group_desc(sb, group, NULL);
|
||||
int ret;
|
||||
|
||||
if ((flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) &&
|
||||
!EXT4_MB_GRP_BBITMAP_CORRUPT(grp)) {
|
||||
percpu_counter_sub(&sbi->s_freeclusters_counter,
|
||||
grp->bb_free);
|
||||
set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
|
||||
&grp->bb_state);
|
||||
if (flags & EXT4_GROUP_INFO_BBITMAP_CORRUPT) {
|
||||
ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_BBITMAP_CORRUPT_BIT,
|
||||
&grp->bb_state);
|
||||
if (!ret)
|
||||
percpu_counter_sub(&sbi->s_freeclusters_counter,
|
||||
grp->bb_free);
|
||||
}
|
||||
|
||||
if ((flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) &&
|
||||
!EXT4_MB_GRP_IBITMAP_CORRUPT(grp)) {
|
||||
if (gdp) {
|
||||
if (flags & EXT4_GROUP_INFO_IBITMAP_CORRUPT) {
|
||||
ret = ext4_test_and_set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
|
||||
&grp->bb_state);
|
||||
if (!ret && gdp) {
|
||||
int count;
|
||||
|
||||
count = ext4_free_inodes_count(sb, gdp);
|
||||
percpu_counter_sub(&sbi->s_freeinodes_counter,
|
||||
count);
|
||||
}
|
||||
set_bit(EXT4_GROUP_INFO_IBITMAP_CORRUPT_BIT,
|
||||
&grp->bb_state);
|
||||
}
|
||||
}
|
||||
|
||||
@ -2174,8 +2193,8 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
|
||||
"warning: maximal mount count reached, "
|
||||
"running e2fsck is recommended");
|
||||
else if (le32_to_cpu(es->s_checkinterval) &&
|
||||
(le32_to_cpu(es->s_lastcheck) +
|
||||
le32_to_cpu(es->s_checkinterval) <= get_seconds()))
|
||||
(ext4_get_tstamp(es, s_lastcheck) +
|
||||
le32_to_cpu(es->s_checkinterval) <= ktime_get_real_seconds()))
|
||||
ext4_msg(sb, KERN_WARNING,
|
||||
"warning: checktime reached, "
|
||||
"running e2fsck is recommended");
|
||||
@ -2184,7 +2203,7 @@ static int ext4_setup_super(struct super_block *sb, struct ext4_super_block *es,
|
||||
if (!(__s16) le16_to_cpu(es->s_max_mnt_count))
|
||||
es->s_max_mnt_count = cpu_to_le16(EXT4_DFL_MAX_MNT_COUNT);
|
||||
le16_add_cpu(&es->s_mnt_count, 1);
|
||||
es->s_mtime = cpu_to_le32(get_seconds());
|
||||
ext4_update_tstamp(es, s_mtime);
|
||||
ext4_update_dynamic_rev(sb);
|
||||
if (sbi->s_journal)
|
||||
ext4_set_feature_journal_needs_recovery(sb);
|
||||
@ -2875,8 +2894,9 @@ static void print_daily_error_info(struct timer_list *t)
|
||||
ext4_msg(sb, KERN_NOTICE, "error count since last fsck: %u",
|
||||
le32_to_cpu(es->s_error_count));
|
||||
if (es->s_first_error_time) {
|
||||
printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %u: %.*s:%d",
|
||||
sb->s_id, le32_to_cpu(es->s_first_error_time),
|
||||
printk(KERN_NOTICE "EXT4-fs (%s): initial error at time %llu: %.*s:%d",
|
||||
sb->s_id,
|
||||
ext4_get_tstamp(es, s_first_error_time),
|
||||
(int) sizeof(es->s_first_error_func),
|
||||
es->s_first_error_func,
|
||||
le32_to_cpu(es->s_first_error_line));
|
||||
@ -2889,8 +2909,9 @@ static void print_daily_error_info(struct timer_list *t)
|
||||
printk(KERN_CONT "\n");
|
||||
}
|
||||
if (es->s_last_error_time) {
|
||||
printk(KERN_NOTICE "EXT4-fs (%s): last error at time %u: %.*s:%d",
|
||||
sb->s_id, le32_to_cpu(es->s_last_error_time),
|
||||
printk(KERN_NOTICE "EXT4-fs (%s): last error at time %llu: %.*s:%d",
|
||||
sb->s_id,
|
||||
ext4_get_tstamp(es, s_last_error_time),
|
||||
(int) sizeof(es->s_last_error_func),
|
||||
es->s_last_error_func,
|
||||
le32_to_cpu(es->s_last_error_line));
|
||||
@ -4813,7 +4834,7 @@ static int ext4_commit_super(struct super_block *sb, int sync)
|
||||
* to complain and force a full file system check.
|
||||
*/
|
||||
if (!(sb->s_flags & SB_RDONLY))
|
||||
es->s_wtime = cpu_to_le32(get_seconds());
|
||||
ext4_update_tstamp(es, s_wtime);
|
||||
if (sb->s_bdev->bd_part)
|
||||
es->s_kbytes_written =
|
||||
cpu_to_le64(EXT4_SB(sb)->s_kbytes_written +
|
||||
@ -5080,6 +5101,9 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data)
|
||||
#endif
|
||||
char *orig_data = kstrdup(data, GFP_KERNEL);
|
||||
|
||||
if (data && !orig_data)
|
||||
return -ENOMEM;
|
||||
|
||||
/* Store the original options */
|
||||
old_sb_flags = sb->s_flags;
|
||||
old_opts.s_mount_opt = sbi->s_mount_opt;
|
||||
@ -5665,13 +5689,13 @@ static int ext4_enable_quotas(struct super_block *sb)
|
||||
DQUOT_USAGE_ENABLED |
|
||||
(quota_mopt[type] ? DQUOT_LIMITS_ENABLED : 0));
|
||||
if (err) {
|
||||
for (type--; type >= 0; type--)
|
||||
dquot_quota_off(sb, type);
|
||||
|
||||
ext4_warning(sb,
|
||||
"Failed to enable quota tracking "
|
||||
"(type=%d, err=%d). Please run "
|
||||
"e2fsck to fix.", type, err);
|
||||
for (type--; type >= 0; type--)
|
||||
dquot_quota_off(sb, type);
|
||||
|
||||
return err;
|
||||
}
|
||||
}
|
||||
|
@ -25,6 +25,8 @@ typedef enum {
|
||||
attr_reserved_clusters,
|
||||
attr_inode_readahead,
|
||||
attr_trigger_test_error,
|
||||
attr_first_error_time,
|
||||
attr_last_error_time,
|
||||
attr_feature,
|
||||
attr_pointer_ui,
|
||||
attr_pointer_atomic,
|
||||
@ -182,8 +184,8 @@ EXT4_RW_ATTR_SBI_UI(warning_ratelimit_burst, s_warning_ratelimit_state.burst);
|
||||
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_interval_ms, s_msg_ratelimit_state.interval);
|
||||
EXT4_RW_ATTR_SBI_UI(msg_ratelimit_burst, s_msg_ratelimit_state.burst);
|
||||
EXT4_RO_ATTR_ES_UI(errors_count, s_error_count);
|
||||
EXT4_RO_ATTR_ES_UI(first_error_time, s_first_error_time);
|
||||
EXT4_RO_ATTR_ES_UI(last_error_time, s_last_error_time);
|
||||
EXT4_ATTR(first_error_time, 0444, first_error_time);
|
||||
EXT4_ATTR(last_error_time, 0444, last_error_time);
|
||||
|
||||
static unsigned int old_bump_val = 128;
|
||||
EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val);
|
||||
@ -249,6 +251,15 @@ static void *calc_ptr(struct ext4_attr *a, struct ext4_sb_info *sbi)
|
||||
return NULL;
|
||||
}
|
||||
|
||||
static ssize_t __print_tstamp(char *buf, __le32 lo, __u8 hi)
|
||||
{
|
||||
return snprintf(buf, PAGE_SIZE, "%lld",
|
||||
((time64_t)hi << 32) + le32_to_cpu(lo));
|
||||
}
|
||||
|
||||
#define print_tstamp(buf, es, tstamp) \
|
||||
__print_tstamp(buf, (es)->tstamp, (es)->tstamp ## _hi)
|
||||
|
||||
static ssize_t ext4_attr_show(struct kobject *kobj,
|
||||
struct attribute *attr, char *buf)
|
||||
{
|
||||
@ -274,8 +285,12 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
|
||||
case attr_pointer_ui:
|
||||
if (!ptr)
|
||||
return 0;
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n",
|
||||
*((unsigned int *) ptr));
|
||||
if (a->attr_ptr == ptr_ext4_super_block_offset)
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n",
|
||||
le32_to_cpup(ptr));
|
||||
else
|
||||
return snprintf(buf, PAGE_SIZE, "%u\n",
|
||||
*((unsigned int *) ptr));
|
||||
case attr_pointer_atomic:
|
||||
if (!ptr)
|
||||
return 0;
|
||||
@ -283,6 +298,10 @@ static ssize_t ext4_attr_show(struct kobject *kobj,
|
||||
atomic_read((atomic_t *) ptr));
|
||||
case attr_feature:
|
||||
return snprintf(buf, PAGE_SIZE, "supported\n");
|
||||
case attr_first_error_time:
|
||||
return print_tstamp(buf, sbi->s_es, s_first_error_time);
|
||||
case attr_last_error_time:
|
||||
return print_tstamp(buf, sbi->s_es, s_last_error_time);
|
||||
}
|
||||
|
||||
return 0;
|
||||
@ -308,7 +327,10 @@ static ssize_t ext4_attr_store(struct kobject *kobj,
|
||||
ret = kstrtoul(skip_spaces(buf), 0, &t);
|
||||
if (ret)
|
||||
return ret;
|
||||
*((unsigned int *) ptr) = t;
|
||||
if (a->attr_ptr == ptr_ext4_super_block_offset)
|
||||
*((__le32 *) ptr) = cpu_to_le32(t);
|
||||
else
|
||||
*((unsigned int *) ptr) = t;
|
||||
return len;
|
||||
case attr_inode_readahead:
|
||||
return inode_readahead_blks_store(sbi, buf, len);
|
||||
|
@ -11,6 +11,10 @@
|
||||
*/
|
||||
static inline void ext4_truncate_failed_write(struct inode *inode)
|
||||
{
|
||||
/*
|
||||
* We don't need to call ext4_break_layouts() because the blocks we
|
||||
* are truncating were never visible to userspace.
|
||||
*/
|
||||
down_write(&EXT4_I(inode)->i_mmap_sem);
|
||||
truncate_inode_pages(inode->i_mapping, inode->i_size);
|
||||
ext4_truncate(inode);
|
||||
|
@ -190,6 +190,8 @@ ext4_xattr_check_entries(struct ext4_xattr_entry *entry, void *end,
|
||||
struct ext4_xattr_entry *next = EXT4_XATTR_NEXT(e);
|
||||
if ((void *)next >= end)
|
||||
return -EFSCORRUPTED;
|
||||
if (strnlen(e->e_name, e->e_name_len) != e->e_name_len)
|
||||
return -EFSCORRUPTED;
|
||||
e = next;
|
||||
}
|
||||
|
||||
|
@ -121,7 +121,7 @@ static int journal_submit_commit_record(journal_t *journal,
|
||||
struct commit_header *tmp;
|
||||
struct buffer_head *bh;
|
||||
int ret;
|
||||
struct timespec64 now = current_kernel_time64();
|
||||
struct timespec64 now;
|
||||
|
||||
*cbh = NULL;
|
||||
|
||||
@ -134,6 +134,7 @@ static int journal_submit_commit_record(journal_t *journal,
|
||||
return 1;
|
||||
|
||||
tmp = (struct commit_header *)bh->b_data;
|
||||
ktime_get_coarse_real_ts64(&now);
|
||||
tmp->h_commit_sec = cpu_to_be64(now.tv_sec);
|
||||
tmp->h_commit_nsec = cpu_to_be32(now.tv_nsec);
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user