libnvdimm for 4.15

 * Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
   'userspace flush' of persistent memory updates via filesystem-dax
   mappings. It arranges for any filesystem metadata updates that may be
   required to satisfy a write fault to also be flushed ("on disk")
   before the kernel returns to userspace from the fault handler.
   Effectively, every write-fault that dirties metadata completes an
   fsync() before returning from the fault handler. The new
   MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag is
   validated as supported by the filesystem's ->mmap() file operation.
   (An illustrative usage sketch follows this list.)
 
 * Add support for the standard ACPI 6.2 label access methods that
   replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods. This
   enables interoperability with environments that only implement the
   standardized methods.
 
 * Add support for the ACPI 6.2 NVDIMM media error injection methods.
 
 * Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for latch
   last shutdown status, firmware update, SMART error injection, and
   SMART alarm threshold control.
 
 * Cleanup physical address information disclosures to be root-only.
 
 * Fix revalidation of the DIMM "locked label area" status to support
   dynamic unlock of the label area.
 
 * Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
   (system-physical-address) command and error injection commands.
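
 Illustration (not from the patch set): a minimal userspace sketch of how
 an application might request a synchronous DAX mapping with the new
 flags. The file path is hypothetical, the file is assumed to be at least
 one page long on a filesystem-dax (-o dax) mount, and the fallback
 #define values are the MAP_SHARED_VALIDATE value from the uapi headers
 touched here plus the asm-generic/mman.h value assumed for MAP_SYNC
 (architecture-dependent):

   #include <fcntl.h>
   #include <stdio.h>
   #include <string.h>
   #include <sys/mman.h>
   #include <unistd.h>

   #ifndef MAP_SHARED_VALIDATE
   #define MAP_SHARED_VALIDATE 0x03    /* share + validate extension flags */
   #endif
   #ifndef MAP_SYNC
   #define MAP_SYNC 0x80000            /* assumed asm-generic/mman.h value */
   #endif

   int main(void)
   {
       /* hypothetical, pre-sized file on a DAX-capable filesystem */
       int fd = open("/mnt/pmem/log", O_RDWR);
       char *p;

       if (fd < 0) {
           perror("open");
           return 1;
       }

       /*
        * MAP_SHARED_VALIDATE makes the kernel reject unknown or
        * unsupported flags instead of silently ignoring them, so a
        * failure here (e.g. EOPNOTSUPP) means MAP_SYNC is unavailable.
        */
       p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
                MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);
       if (p == MAP_FAILED) {
           perror("mmap");
           close(fd);
           return 1;
       }

       /*
        * Once the write fault completes, the filesystem metadata needed
        * to reach this block is durable; only CPU cache flushes remain
        * to make the data itself persistent.
        */
       strcpy(p, "hello, persistent world");

       munmap(p, 4096);
       close(fd);
       return 0;
   }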
 
 Acknowledgements that came after the commits were pushed to -next:
 
 957ac8c421 dax: fix PMD faults on zero-length files
 Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>
 
 a39e596baa xfs: support for synchronous DAX faults
 Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
 
 7b565c9f96 xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
 Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>
 -----BEGIN PGP SIGNATURE-----
 
 iQIcBAABAgAGBQJaDfvcAAoJEB7SkWpmfYgCk7sP/2qJhBH+VTTdg2osDnhAdAhI
 co/AGEmsHFlUCMBb/Ek7UnMAmhBYiJU2q4ywPsNFBpusXpMlqNy5Iwo7k4/wQHE/
 SJcIM0g4zg0ViFuUhwV+C2T0R5UzFR8JLd9EYWj/YS6aJpurtotm5l4UStaM0Hzo
 AhxSXJLrBDuqCpbOxbctfiGEmdRL7aRfBEAARTNRKBn/iXxJUcYHlp62rtXQS+t4
 I6LC/URCWTNTTMGmzW6TRsgSD9WMfd19xKcGzN3qL6ee0KFccxN4ctFqHA/sFGOh
 iYLeR0XJUjJxyp+PkWGteXPVZL0Kj3bD/lSTG+Co5bm/ra8a/sh3TSFfgFyoBZD1
 EqMN8Ryf80hGp3FabeH2Iw2SviYPZpHSWgjddjxLD0RA6OmpzINc+Wm8eqApjMME
 sbZDTOijiab4QMQ0XamF4GuDHyQtawv5Y/w2Ehhl1tmiqW+5tKhsKqxkQt+/V3Yt
 RTVSRe2Pkway66b+cD64IdQ6L2tyonPnmi5IzgkKOhlOEGomy+4/U2Jt2bMbhzq6
 ymszKmXp2XI8P06wU8sHrIUeXO5I9qoKn/fZA73Eb8aIzgJe3tBE/5+Ab7RG6HB9
 1OVfcMWoXU1gNgNktTs63X1Lsg4aW9kt/K4fPHHcqUcaliEJpJTlAbg9GLF2buoW
 nQ+0fTRgMRihE3ZA0Fs3
 =h2vZ
 -----END PGP SIGNATURE-----

Merge tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm

Pull libnvdimm and dax updates from Dan Williams:
 "Save for a few late fixes, all of these commits have shipped in -next
  releases since before the merge window opened, and 0day has given a
  build success notification.

  The ext4 touches came from Jan, and the xfs touches have Darrick's
   reviewed-by. An xfstest for the MAP_SYNC feature has been through a
   few rounds of review and is on track to be merged.

   - Introduce MAP_SYNC and MAP_SHARED_VALIDATE, a mechanism to enable
     'userspace flush' of persistent memory updates via filesystem-dax
     mappings. It arranges for any filesystem metadata updates that may
     be required to satisfy a write fault to also be flushed ("on disk")
     before the kernel returns to userspace from the fault handler.
     Effectively every write-fault that dirties metadata completes an
     fsync() before returning from the fault handler. The new
     MAP_SHARED_VALIDATE mapping type guarantees that the MAP_SYNC flag
     is validated as supported by the filesystem's ->mmap() file
     operation.

   - Add support for the standard ACPI 6.2 label access methods that
     replace the NVDIMM_FAMILY_INTEL (vendor specific) label methods.
     This enables interoperability with environments that only implement
     the standardized methods.

   - Add support for the ACPI 6.2 NVDIMM media error injection methods.

   - Add support for the NVDIMM_FAMILY_INTEL v1.6 DIMM commands for
     latch last shutdown status, firmware update, SMART error injection,
     and SMART alarm threshold control.

   - Cleanup physical address information disclosures to be root-only.

   - Fix revalidation of the DIMM "locked label area" status to support
     dynamic unlock of the label area.

   - Expand unit test infrastructure to mock the ACPI 6.2 Translate SPA
     (system-physical-address) command and error injection commands.

  Acknowledgements that came after the commits were pushed to -next:

   - 957ac8c421 ("dax: fix PMD faults on zero-length files"):
       Reviewed-by: Ross Zwisler <ross.zwisler@linux.intel.com>

   - a39e596baa ("xfs: support for synchronous DAX faults") and
     7b565c9f96 ("xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()")
        Reviewed-by: Darrick J. Wong <darrick.wong@oracle.com>"

* tag 'libnvdimm-for-4.15' of git://git.kernel.org/pub/scm/linux/kernel/git/nvdimm/nvdimm: (49 commits)
  acpi, nfit: add 'Enable Latch System Shutdown Status' command support
  dax: fix general protection fault in dax_alloc_inode
  dax: fix PMD faults on zero-length files
  dax: stop requiring a live device for dax_flush()
  brd: remove dax support
  dax: quiet bdev_dax_supported()
  fs, dax: unify IOMAP_F_DIRTY read vs write handling policy in the dax core
  tools/testing/nvdimm: unit test clear-error commands
  acpi, nfit: validate commands against the device type
  tools/testing/nvdimm: stricter bounds checking for error injection commands
  xfs: support for synchronous DAX faults
  xfs: Implement xfs_filemap_pfn_mkwrite() using __xfs_filemap_fault()
  ext4: Support for synchronous DAX faults
  ext4: Simplify error handling in ext4_dax_huge_fault()
  dax: Implement dax_finish_sync_fault()
  dax, iomap: Add support for synchronous faults
  mm: Define MAP_SYNC and VM_SYNC flags
  dax: Allow tuning whether dax_insert_mapping_entry() dirties entry
  dax: Allow dax_iomap_fault() to return pfn
  dax: Fix comment describing dax_iomap_fault()
  ...
Commit: a3841f94c7
Author: Linus Torvalds
Date:   2017-11-17 09:51:57 -08:00

48 files changed, 1407 insertions(+), 562 deletions(-)

@@ -4208,7 +4208,7 @@ L: linux-i2c@vger.kernel.org
 S: Maintained
 F: drivers/i2c/busses/i2c-diolan-u2c.c

-DIRECT ACCESS (DAX)
+FILESYSTEM DIRECT ACCESS (DAX)
 M: Matthew Wilcox <mawilcox@microsoft.com>
 M: Ross Zwisler <ross.zwisler@linux.intel.com>
 L: linux-fsdevel@vger.kernel.org
@@ -4217,6 +4217,12 @@ F: fs/dax.c
 F: include/linux/dax.h
 F: include/trace/events/fs_dax.h

+DEVICE DIRECT ACCESS (DAX)
+M: Dan Williams <dan.j.williams@intel.com>
+L: linux-nvdimm@lists.01.org
+S: Supported
+F: drivers/dax/
+
 DIRECTORY NOTIFICATION (DNOTIFY)
 M: Jan Kara <jack@suse.cz>
 R: Amir Goldstein <amir73il@gmail.com>


@@ -12,6 +12,7 @@
 #define MAP_SHARED 0x01 /* Share changes */
 #define MAP_PRIVATE 0x02 /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
 #define MAP_TYPE 0x0f /* Mask for type of mapping (OSF/1 is _wrong_) */
 #define MAP_FIXED 0x100 /* Interpret addr exactly */
 #define MAP_ANONYMOUS 0x10 /* don't use a file */


@@ -29,6 +29,7 @@
  */
 #define MAP_SHARED 0x001 /* Share changes */
 #define MAP_PRIVATE 0x002 /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */
 #define MAP_TYPE 0x00f /* Mask for type of mapping */
 #define MAP_FIXED 0x010 /* Interpret addr exactly */


@@ -12,6 +12,7 @@
 #define MAP_SHARED 0x01 /* Share changes */
 #define MAP_PRIVATE 0x02 /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
 #define MAP_TYPE 0x03 /* Mask for type of mapping */
 #define MAP_FIXED 0x04 /* Interpret addr exactly */
 #define MAP_ANONYMOUS 0x10 /* don't use a file */


@@ -36,6 +36,7 @@
  */
 #define MAP_SHARED 0x001 /* Share changes */
 #define MAP_PRIVATE 0x002 /* Changes are private */
+#define MAP_SHARED_VALIDATE 0x003 /* share + validate extension flags */
 #define MAP_TYPE 0x00f /* Mask for type of mapping */
 #define MAP_FIXED 0x010 /* Interpret addr exactly */


@ -183,13 +183,33 @@ static int xlat_bus_status(void *buf, unsigned int cmd, u32 status)
return 0; return 0;
} }
static int xlat_nvdimm_status(void *buf, unsigned int cmd, u32 status) #define ACPI_LABELS_LOCKED 3
static int xlat_nvdimm_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
u32 status)
{ {
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
switch (cmd) { switch (cmd) {
case ND_CMD_GET_CONFIG_SIZE: case ND_CMD_GET_CONFIG_SIZE:
/*
* In the _LSI, _LSR, _LSW case the locked status is
* communicated via the read/write commands
*/
if (nfit_mem->has_lsi)
break;
if (status >> 16 & ND_CONFIG_LOCKED) if (status >> 16 & ND_CONFIG_LOCKED)
return -EACCES; return -EACCES;
break; break;
case ND_CMD_GET_CONFIG_DATA:
if (nfit_mem->has_lsr && status == ACPI_LABELS_LOCKED)
return -EACCES;
break;
case ND_CMD_SET_CONFIG_DATA:
if (nfit_mem->has_lsw && status == ACPI_LABELS_LOCKED)
return -EACCES;
break;
default: default:
break; break;
} }
@ -205,13 +225,182 @@ static int xlat_status(struct nvdimm *nvdimm, void *buf, unsigned int cmd,
{ {
if (!nvdimm) if (!nvdimm)
return xlat_bus_status(buf, cmd, status); return xlat_bus_status(buf, cmd, status);
return xlat_nvdimm_status(buf, cmd, status); return xlat_nvdimm_status(nvdimm, buf, cmd, status);
}
/* convert _LS{I,R} packages to the buffer object acpi_nfit_ctl expects */
static union acpi_object *pkg_to_buf(union acpi_object *pkg)
{
int i;
void *dst;
size_t size = 0;
union acpi_object *buf = NULL;
if (pkg->type != ACPI_TYPE_PACKAGE) {
WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
pkg->type);
goto err;
}
for (i = 0; i < pkg->package.count; i++) {
union acpi_object *obj = &pkg->package.elements[i];
if (obj->type == ACPI_TYPE_INTEGER)
size += 4;
else if (obj->type == ACPI_TYPE_BUFFER)
size += obj->buffer.length;
else {
WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
obj->type);
goto err;
}
}
buf = ACPI_ALLOCATE(sizeof(*buf) + size);
if (!buf)
goto err;
dst = buf + 1;
buf->type = ACPI_TYPE_BUFFER;
buf->buffer.length = size;
buf->buffer.pointer = dst;
for (i = 0; i < pkg->package.count; i++) {
union acpi_object *obj = &pkg->package.elements[i];
if (obj->type == ACPI_TYPE_INTEGER) {
memcpy(dst, &obj->integer.value, 4);
dst += 4;
} else if (obj->type == ACPI_TYPE_BUFFER) {
memcpy(dst, obj->buffer.pointer, obj->buffer.length);
dst += obj->buffer.length;
}
}
err:
ACPI_FREE(pkg);
return buf;
}
static union acpi_object *int_to_buf(union acpi_object *integer)
{
union acpi_object *buf = ACPI_ALLOCATE(sizeof(*buf) + 4);
void *dst = NULL;
if (!buf)
goto err;
if (integer->type != ACPI_TYPE_INTEGER) {
WARN_ONCE(1, "BIOS bug, unexpected element type: %d\n",
integer->type);
goto err;
}
dst = buf + 1;
buf->type = ACPI_TYPE_BUFFER;
buf->buffer.length = 4;
buf->buffer.pointer = dst;
memcpy(dst, &integer->integer.value, 4);
err:
ACPI_FREE(integer);
return buf;
}
static union acpi_object *acpi_label_write(acpi_handle handle, u32 offset,
u32 len, void *data)
{
acpi_status rc;
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
struct acpi_object_list input = {
.count = 3,
.pointer = (union acpi_object []) {
[0] = {
.integer.type = ACPI_TYPE_INTEGER,
.integer.value = offset,
},
[1] = {
.integer.type = ACPI_TYPE_INTEGER,
.integer.value = len,
},
[2] = {
.buffer.type = ACPI_TYPE_BUFFER,
.buffer.pointer = data,
.buffer.length = len,
},
},
};
rc = acpi_evaluate_object(handle, "_LSW", &input, &buf);
if (ACPI_FAILURE(rc))
return NULL;
return int_to_buf(buf.pointer);
}
static union acpi_object *acpi_label_read(acpi_handle handle, u32 offset,
u32 len)
{
acpi_status rc;
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
struct acpi_object_list input = {
.count = 2,
.pointer = (union acpi_object []) {
[0] = {
.integer.type = ACPI_TYPE_INTEGER,
.integer.value = offset,
},
[1] = {
.integer.type = ACPI_TYPE_INTEGER,
.integer.value = len,
},
},
};
rc = acpi_evaluate_object(handle, "_LSR", &input, &buf);
if (ACPI_FAILURE(rc))
return NULL;
return pkg_to_buf(buf.pointer);
}
static union acpi_object *acpi_label_info(acpi_handle handle)
{
acpi_status rc;
struct acpi_buffer buf = { ACPI_ALLOCATE_BUFFER, NULL };
rc = acpi_evaluate_object(handle, "_LSI", NULL, &buf);
if (ACPI_FAILURE(rc))
return NULL;
return pkg_to_buf(buf.pointer);
}
static u8 nfit_dsm_revid(unsigned family, unsigned func)
{
static const u8 revid_table[NVDIMM_FAMILY_MAX+1][32] = {
[NVDIMM_FAMILY_INTEL] = {
[NVDIMM_INTEL_GET_MODES] = 2,
[NVDIMM_INTEL_GET_FWINFO] = 2,
[NVDIMM_INTEL_START_FWUPDATE] = 2,
[NVDIMM_INTEL_SEND_FWUPDATE] = 2,
[NVDIMM_INTEL_FINISH_FWUPDATE] = 2,
[NVDIMM_INTEL_QUERY_FWUPDATE] = 2,
[NVDIMM_INTEL_SET_THRESHOLD] = 2,
[NVDIMM_INTEL_INJECT_ERROR] = 2,
},
};
u8 id;
if (family > NVDIMM_FAMILY_MAX)
return 0;
if (func > 31)
return 0;
id = revid_table[family][func];
if (id == 0)
return 1; /* default */
return id;
} }
int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm, int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc) unsigned int cmd, void *buf, unsigned int buf_len, int *cmd_rc)
{ {
struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc); struct acpi_nfit_desc *acpi_desc = to_acpi_nfit_desc(nd_desc);
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
union acpi_object in_obj, in_buf, *out_obj; union acpi_object in_obj, in_buf, *out_obj;
const struct nd_cmd_desc *desc = NULL; const struct nd_cmd_desc *desc = NULL;
struct device *dev = acpi_desc->dev; struct device *dev = acpi_desc->dev;
@ -235,7 +424,6 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
} }
if (nvdimm) { if (nvdimm) {
struct nfit_mem *nfit_mem = nvdimm_provider_data(nvdimm);
struct acpi_device *adev = nfit_mem->adev; struct acpi_device *adev = nfit_mem->adev;
if (!adev) if (!adev)
@ -294,7 +482,29 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
in_buf.buffer.pointer, in_buf.buffer.pointer,
min_t(u32, 256, in_buf.buffer.length), true); min_t(u32, 256, in_buf.buffer.length), true);
out_obj = acpi_evaluate_dsm(handle, guid, 1, func, &in_obj); /* call the BIOS, prefer the named methods over _DSM if available */
if (nvdimm && cmd == ND_CMD_GET_CONFIG_SIZE && nfit_mem->has_lsi)
out_obj = acpi_label_info(handle);
else if (nvdimm && cmd == ND_CMD_GET_CONFIG_DATA && nfit_mem->has_lsr) {
struct nd_cmd_get_config_data_hdr *p = buf;
out_obj = acpi_label_read(handle, p->in_offset, p->in_length);
} else if (nvdimm && cmd == ND_CMD_SET_CONFIG_DATA
&& nfit_mem->has_lsw) {
struct nd_cmd_set_config_hdr *p = buf;
out_obj = acpi_label_write(handle, p->in_offset, p->in_length,
p->in_buf);
} else {
u8 revid;
if (nvdimm)
revid = nfit_dsm_revid(nfit_mem->family, func);
else
revid = 1;
out_obj = acpi_evaluate_dsm(handle, guid, revid, func, &in_obj);
}
if (!out_obj) { if (!out_obj) {
dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name, dev_dbg(dev, "%s:%s _DSM failed cmd: %s\n", __func__, dimm_name,
cmd_name); cmd_name);
@ -356,8 +566,10 @@ int acpi_nfit_ctl(struct nvdimm_bus_descriptor *nd_desc, struct nvdimm *nvdimm,
* Set fw_status for all the commands with a known format to be * Set fw_status for all the commands with a known format to be
* later interpreted by xlat_status(). * later interpreted by xlat_status().
*/ */
if (i >= 1 && ((cmd >= ND_CMD_ARS_CAP && cmd <= ND_CMD_CLEAR_ERROR) if (i >= 1 && ((!nvdimm && cmd >= ND_CMD_ARS_CAP
|| (cmd >= ND_CMD_SMART && cmd <= ND_CMD_VENDOR))) && cmd <= ND_CMD_CLEAR_ERROR)
|| (nvdimm && cmd >= ND_CMD_SMART
&& cmd <= ND_CMD_VENDOR)))
fw_status = *(u32 *) out_obj->buffer.pointer; fw_status = *(u32 *) out_obj->buffer.pointer;
if (offset + in_buf.buffer.length < buf_len) { if (offset + in_buf.buffer.length < buf_len) {
@ -1431,6 +1643,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
{ {
struct acpi_device *adev, *adev_dimm; struct acpi_device *adev, *adev_dimm;
struct device *dev = acpi_desc->dev; struct device *dev = acpi_desc->dev;
union acpi_object *obj;
unsigned long dsm_mask; unsigned long dsm_mask;
const guid_t *guid; const guid_t *guid;
int i; int i;
@ -1463,7 +1676,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
* different command sets. Note, that checking for function0 (bit0) * different command sets. Note, that checking for function0 (bit0)
* tells us if any commands are reachable through this GUID. * tells us if any commands are reachable through this GUID.
*/ */
for (i = NVDIMM_FAMILY_INTEL; i <= NVDIMM_FAMILY_MSFT; i++) for (i = 0; i <= NVDIMM_FAMILY_MAX; i++)
if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1)) if (acpi_check_dsm(adev_dimm->handle, to_nfit_uuid(i), 1, 1))
if (family < 0 || i == default_dsm_family) if (family < 0 || i == default_dsm_family)
family = i; family = i;
@ -1473,7 +1686,7 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
if (override_dsm_mask && !disable_vendor_specific) if (override_dsm_mask && !disable_vendor_specific)
dsm_mask = override_dsm_mask; dsm_mask = override_dsm_mask;
else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) { else if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
dsm_mask = 0x3fe; dsm_mask = NVDIMM_INTEL_CMDMASK;
if (disable_vendor_specific) if (disable_vendor_specific)
dsm_mask &= ~(1 << ND_CMD_VENDOR); dsm_mask &= ~(1 << ND_CMD_VENDOR);
} else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) { } else if (nfit_mem->family == NVDIMM_FAMILY_HPE1) {
@ -1493,9 +1706,32 @@ static int acpi_nfit_add_dimm(struct acpi_nfit_desc *acpi_desc,
guid = to_nfit_uuid(nfit_mem->family); guid = to_nfit_uuid(nfit_mem->family);
for_each_set_bit(i, &dsm_mask, BITS_PER_LONG) for_each_set_bit(i, &dsm_mask, BITS_PER_LONG)
if (acpi_check_dsm(adev_dimm->handle, guid, 1, 1ULL << i)) if (acpi_check_dsm(adev_dimm->handle, guid,
nfit_dsm_revid(nfit_mem->family, i),
1ULL << i))
set_bit(i, &nfit_mem->dsm_mask); set_bit(i, &nfit_mem->dsm_mask);
obj = acpi_label_info(adev_dimm->handle);
if (obj) {
ACPI_FREE(obj);
nfit_mem->has_lsi = 1;
dev_dbg(dev, "%s: has _LSI\n", dev_name(&adev_dimm->dev));
}
obj = acpi_label_read(adev_dimm->handle, 0, 0);
if (obj) {
ACPI_FREE(obj);
nfit_mem->has_lsr = 1;
dev_dbg(dev, "%s: has _LSR\n", dev_name(&adev_dimm->dev));
}
obj = acpi_label_write(adev_dimm->handle, 0, 0, NULL);
if (obj) {
ACPI_FREE(obj);
nfit_mem->has_lsw = 1;
dev_dbg(dev, "%s: has _LSW\n", dev_name(&adev_dimm->dev));
}
return 0; return 0;
} }
@ -1571,8 +1807,21 @@ static int acpi_nfit_register_dimms(struct acpi_nfit_desc *acpi_desc)
* userspace interface. * userspace interface.
*/ */
cmd_mask = 1UL << ND_CMD_CALL; cmd_mask = 1UL << ND_CMD_CALL;
if (nfit_mem->family == NVDIMM_FAMILY_INTEL) if (nfit_mem->family == NVDIMM_FAMILY_INTEL) {
cmd_mask |= nfit_mem->dsm_mask; /*
* These commands have a 1:1 correspondence
* between DSM payload and libnvdimm ioctl
* payload format.
*/
cmd_mask |= nfit_mem->dsm_mask & NVDIMM_STANDARD_CMDMASK;
}
if (nfit_mem->has_lsi)
set_bit(ND_CMD_GET_CONFIG_SIZE, &cmd_mask);
if (nfit_mem->has_lsr)
set_bit(ND_CMD_GET_CONFIG_DATA, &cmd_mask);
if (nfit_mem->has_lsw)
set_bit(ND_CMD_SET_CONFIG_DATA, &cmd_mask);
flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush flush = nfit_mem->nfit_flush ? nfit_mem->nfit_flush->flush
: NULL; : NULL;
@ -1645,6 +1894,7 @@ static void acpi_nfit_init_dsms(struct acpi_nfit_desc *acpi_desc)
int i; int i;
nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en; nd_desc->cmd_mask = acpi_desc->bus_cmd_force_en;
nd_desc->bus_dsm_mask = acpi_desc->bus_nfit_cmd_force_en;
adev = to_acpi_dev(acpi_desc); adev = to_acpi_dev(acpi_desc);
if (!adev) if (!adev)
return; return;
@ -2239,7 +2489,7 @@ static int ars_status_process_records(struct acpi_nfit_desc *acpi_desc,
if (ars_status->out_length if (ars_status->out_length
< 44 + sizeof(struct nd_ars_record) * (i + 1)) < 44 + sizeof(struct nd_ars_record) * (i + 1))
break; break;
rc = nvdimm_bus_add_poison(nvdimm_bus, rc = nvdimm_bus_add_badrange(nvdimm_bus,
ars_status->records[i].err_address, ars_status->records[i].err_address,
ars_status->records[i].length); ars_status->records[i].length);
if (rc) if (rc)


@ -67,7 +67,7 @@ static int nfit_handle_mce(struct notifier_block *nb, unsigned long val,
continue; continue;
/* If this fails due to an -ENOMEM, there is little we can do */ /* If this fails due to an -ENOMEM, there is little we can do */
nvdimm_bus_add_poison(acpi_desc->nvdimm_bus, nvdimm_bus_add_badrange(acpi_desc->nvdimm_bus,
ALIGN(mce->addr, L1_CACHE_BYTES), ALIGN(mce->addr, L1_CACHE_BYTES),
L1_CACHE_BYTES); L1_CACHE_BYTES);
nvdimm_region_notify(nfit_spa->nd_region, nvdimm_region_notify(nfit_spa->nd_region,


@ -24,7 +24,7 @@
/* ACPI 6.1 */ /* ACPI 6.1 */
#define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba" #define UUID_NFIT_BUS "2f10e7a4-9e91-11e4-89d3-123b93f75cba"
/* http://pmem.io/documents/NVDIMM_DSM_Interface_Example.pdf */ /* http://pmem.io/documents/NVDIMM_DSM_Interface-V1.6.pdf */
#define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66" #define UUID_NFIT_DIMM "4309ac30-0d11-11e4-9191-0800200c9a66"
/* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */ /* https://github.com/HewlettPackard/hpe-nvm/blob/master/Documentation/ */
@ -38,6 +38,37 @@
| ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \ | ACPI_NFIT_MEM_RESTORE_FAILED | ACPI_NFIT_MEM_FLUSH_FAILED \
| ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED) | ACPI_NFIT_MEM_NOT_ARMED | ACPI_NFIT_MEM_MAP_FAILED)
#define NVDIMM_FAMILY_MAX NVDIMM_FAMILY_MSFT
#define NVDIMM_STANDARD_CMDMASK \
(1 << ND_CMD_SMART | 1 << ND_CMD_SMART_THRESHOLD | 1 << ND_CMD_DIMM_FLAGS \
| 1 << ND_CMD_GET_CONFIG_SIZE | 1 << ND_CMD_GET_CONFIG_DATA \
| 1 << ND_CMD_SET_CONFIG_DATA | 1 << ND_CMD_VENDOR_EFFECT_LOG_SIZE \
| 1 << ND_CMD_VENDOR_EFFECT_LOG | 1 << ND_CMD_VENDOR)
/*
* Command numbers that the kernel needs to know about to handle
* non-default DSM revision ids
*/
enum nvdimm_family_cmds {
NVDIMM_INTEL_LATCH_SHUTDOWN = 10,
NVDIMM_INTEL_GET_MODES = 11,
NVDIMM_INTEL_GET_FWINFO = 12,
NVDIMM_INTEL_START_FWUPDATE = 13,
NVDIMM_INTEL_SEND_FWUPDATE = 14,
NVDIMM_INTEL_FINISH_FWUPDATE = 15,
NVDIMM_INTEL_QUERY_FWUPDATE = 16,
NVDIMM_INTEL_SET_THRESHOLD = 17,
NVDIMM_INTEL_INJECT_ERROR = 18,
};
#define NVDIMM_INTEL_CMDMASK \
(NVDIMM_STANDARD_CMDMASK | 1 << NVDIMM_INTEL_GET_MODES \
| 1 << NVDIMM_INTEL_GET_FWINFO | 1 << NVDIMM_INTEL_START_FWUPDATE \
| 1 << NVDIMM_INTEL_SEND_FWUPDATE | 1 << NVDIMM_INTEL_FINISH_FWUPDATE \
| 1 << NVDIMM_INTEL_QUERY_FWUPDATE | 1 << NVDIMM_INTEL_SET_THRESHOLD \
| 1 << NVDIMM_INTEL_INJECT_ERROR | 1 << NVDIMM_INTEL_LATCH_SHUTDOWN)
enum nfit_uuids { enum nfit_uuids {
/* for simplicity alias the uuid index with the family id */ /* for simplicity alias the uuid index with the family id */
NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL, NFIT_DEV_DIMM = NVDIMM_FAMILY_INTEL,
@ -140,6 +171,9 @@ struct nfit_mem {
struct resource *flush_wpq; struct resource *flush_wpq;
unsigned long dsm_mask; unsigned long dsm_mask;
int family; int family;
u32 has_lsi:1;
u32 has_lsr:1;
u32 has_lsw:1;
}; };
struct acpi_nfit_desc { struct acpi_nfit_desc {
@ -167,6 +201,7 @@ struct acpi_nfit_desc {
unsigned int init_complete:1; unsigned int init_complete:1;
unsigned long dimm_cmd_force_en; unsigned long dimm_cmd_force_en;
unsigned long bus_cmd_force_en; unsigned long bus_cmd_force_en;
unsigned long bus_nfit_cmd_force_en;
int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa, int (*blk_do_io)(struct nd_blk_region *ndbr, resource_size_t dpa,
void *iobuf, u64 len, int rw); void *iobuf, u64 len, int rw);
}; };


@ -302,7 +302,6 @@ config BLK_DEV_SX8
config BLK_DEV_RAM config BLK_DEV_RAM
tristate "RAM block device support" tristate "RAM block device support"
select DAX if BLK_DEV_RAM_DAX
---help--- ---help---
Saying Y here will allow you to use a portion of your RAM memory as Saying Y here will allow you to use a portion of your RAM memory as
a block device, so that you can make file systems on it, read and a block device, so that you can make file systems on it, read and
@ -338,17 +337,6 @@ config BLK_DEV_RAM_SIZE
The default value is 4096 kilobytes. Only change this if you know The default value is 4096 kilobytes. Only change this if you know
what you are doing. what you are doing.
config BLK_DEV_RAM_DAX
bool "Support Direct Access (DAX) to RAM block devices"
depends on BLK_DEV_RAM && FS_DAX
default n
help
Support filesystems using DAX to access RAM block devices. This
avoids double-buffering data in the page cache before copying it
to the block device. Answering Y will slightly enlarge the kernel,
and will prevent RAM block device backing store memory from being
allocated from highmem (only a problem for highmem systems).
config CDROM_PKTCDVD config CDROM_PKTCDVD
tristate "Packet writing on CD/DVD media (DEPRECATED)" tristate "Packet writing on CD/DVD media (DEPRECATED)"
depends on !UML depends on !UML


@ -21,11 +21,6 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#ifdef CONFIG_BLK_DEV_RAM_DAX
#include <linux/pfn_t.h>
#include <linux/dax.h>
#include <linux/uio.h>
#endif
#include <linux/uaccess.h> #include <linux/uaccess.h>
@ -45,9 +40,6 @@ struct brd_device {
struct request_queue *brd_queue; struct request_queue *brd_queue;
struct gendisk *brd_disk; struct gendisk *brd_disk;
#ifdef CONFIG_BLK_DEV_RAM_DAX
struct dax_device *dax_dev;
#endif
struct list_head brd_list; struct list_head brd_list;
/* /*
@ -112,9 +104,6 @@ static struct page *brd_insert_page(struct brd_device *brd, sector_t sector)
* restriction might be able to be lifted. * restriction might be able to be lifted.
*/ */
gfp_flags = GFP_NOIO | __GFP_ZERO; gfp_flags = GFP_NOIO | __GFP_ZERO;
#ifndef CONFIG_BLK_DEV_RAM_DAX
gfp_flags |= __GFP_HIGHMEM;
#endif
page = alloc_page(gfp_flags); page = alloc_page(gfp_flags);
if (!page) if (!page)
return NULL; return NULL;
@ -334,43 +323,6 @@ static int brd_rw_page(struct block_device *bdev, sector_t sector,
return err; return err;
} }
#ifdef CONFIG_BLK_DEV_RAM_DAX
static long __brd_direct_access(struct brd_device *brd, pgoff_t pgoff,
long nr_pages, void **kaddr, pfn_t *pfn)
{
struct page *page;
if (!brd)
return -ENODEV;
page = brd_insert_page(brd, (sector_t)pgoff << PAGE_SECTORS_SHIFT);
if (!page)
return -ENOSPC;
*kaddr = page_address(page);
*pfn = page_to_pfn_t(page);
return 1;
}
static long brd_dax_direct_access(struct dax_device *dax_dev,
pgoff_t pgoff, long nr_pages, void **kaddr, pfn_t *pfn)
{
struct brd_device *brd = dax_get_private(dax_dev);
return __brd_direct_access(brd, pgoff, nr_pages, kaddr, pfn);
}
static size_t brd_dax_copy_from_iter(struct dax_device *dax_dev, pgoff_t pgoff,
void *addr, size_t bytes, struct iov_iter *i)
{
return copy_from_iter(addr, bytes, i);
}
static const struct dax_operations brd_dax_ops = {
.direct_access = brd_dax_direct_access,
.copy_from_iter = brd_dax_copy_from_iter,
};
#endif
static const struct block_device_operations brd_fops = { static const struct block_device_operations brd_fops = {
.owner = THIS_MODULE, .owner = THIS_MODULE,
.rw_page = brd_rw_page, .rw_page = brd_rw_page,
@ -451,21 +403,8 @@ static struct brd_device *brd_alloc(int i)
set_capacity(disk, rd_size * 2); set_capacity(disk, rd_size * 2);
disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO; disk->queue->backing_dev_info->capabilities |= BDI_CAP_SYNCHRONOUS_IO;
#ifdef CONFIG_BLK_DEV_RAM_DAX
queue_flag_set_unlocked(QUEUE_FLAG_DAX, brd->brd_queue);
brd->dax_dev = alloc_dax(brd, disk->disk_name, &brd_dax_ops);
if (!brd->dax_dev)
goto out_free_inode;
#endif
return brd; return brd;
#ifdef CONFIG_BLK_DEV_RAM_DAX
out_free_inode:
kill_dax(brd->dax_dev);
put_dax(brd->dax_dev);
#endif
out_free_queue: out_free_queue:
blk_cleanup_queue(brd->brd_queue); blk_cleanup_queue(brd->brd_queue);
out_free_dev: out_free_dev:
@ -505,10 +444,6 @@ out:
static void brd_del_one(struct brd_device *brd) static void brd_del_one(struct brd_device *brd)
{ {
list_del(&brd->brd_list); list_del(&brd->brd_list);
#ifdef CONFIG_BLK_DEV_RAM_DAX
kill_dax(brd->dax_dev);
put_dax(brd->dax_dev);
#endif
del_gendisk(brd->brd_disk); del_gendisk(brd->brd_disk);
brd_free(brd); brd_free(brd);
} }


@ -222,7 +222,8 @@ __weak phys_addr_t dax_pgoff_to_phys(struct dev_dax *dev_dax, pgoff_t pgoff,
unsigned long size) unsigned long size)
{ {
struct resource *res; struct resource *res;
phys_addr_t phys; /* gcc-4.6.3-nolibc for i386 complains that this is uninitialized */
phys_addr_t uninitialized_var(phys);
int i; int i;
for (i = 0; i < dev_dax->num_resources; i++) { for (i = 0; i < dev_dax->num_resources; i++) {


@ -92,21 +92,21 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
long len; long len;
if (blocksize != PAGE_SIZE) { if (blocksize != PAGE_SIZE) {
pr_err("VFS (%s): error: unsupported blocksize for dax\n", pr_debug("VFS (%s): error: unsupported blocksize for dax\n",
sb->s_id); sb->s_id);
return -EINVAL; return -EINVAL;
} }
err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff); err = bdev_dax_pgoff(bdev, 0, PAGE_SIZE, &pgoff);
if (err) { if (err) {
pr_err("VFS (%s): error: unaligned partition for dax\n", pr_debug("VFS (%s): error: unaligned partition for dax\n",
sb->s_id); sb->s_id);
return err; return err;
} }
dax_dev = dax_get_by_host(bdev->bd_disk->disk_name); dax_dev = dax_get_by_host(bdev->bd_disk->disk_name);
if (!dax_dev) { if (!dax_dev) {
pr_err("VFS (%s): error: device does not support dax\n", pr_debug("VFS (%s): error: device does not support dax\n",
sb->s_id); sb->s_id);
return -EOPNOTSUPP; return -EOPNOTSUPP;
} }
@ -118,7 +118,7 @@ int __bdev_dax_supported(struct super_block *sb, int blocksize)
put_dax(dax_dev); put_dax(dax_dev);
if (len < 1) { if (len < 1) {
pr_err("VFS (%s): error: dax access failed (%ld)", pr_debug("VFS (%s): error: dax access failed (%ld)\n",
sb->s_id, len); sb->s_id, len);
return len < 0 ? len : -EIO; return len < 0 ? len : -EIO;
} }
@ -273,9 +273,6 @@ EXPORT_SYMBOL_GPL(dax_copy_from_iter);
void arch_wb_cache_pmem(void *addr, size_t size); void arch_wb_cache_pmem(void *addr, size_t size);
void dax_flush(struct dax_device *dax_dev, void *addr, size_t size) void dax_flush(struct dax_device *dax_dev, void *addr, size_t size)
{ {
if (unlikely(!dax_alive(dax_dev)))
return;
if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags))) if (unlikely(!test_bit(DAXDEV_WRITE_CACHE, &dax_dev->flags)))
return; return;
@ -344,6 +341,9 @@ static struct inode *dax_alloc_inode(struct super_block *sb)
struct inode *inode; struct inode *inode;
dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL); dax_dev = kmem_cache_alloc(dax_cache, GFP_KERNEL);
if (!dax_dev)
return NULL;
inode = &dax_dev->inode; inode = &dax_dev->inode;
inode->i_rdev = 0; inode->i_rdev = 0;
return inode; return inode;


@ -21,6 +21,7 @@ libnvdimm-y += region_devs.o
libnvdimm-y += region.o libnvdimm-y += region.o
libnvdimm-y += namespace_devs.o libnvdimm-y += namespace_devs.o
libnvdimm-y += label.o libnvdimm-y += label.o
libnvdimm-y += badrange.o
libnvdimm-$(CONFIG_ND_CLAIM) += claim.o libnvdimm-$(CONFIG_ND_CLAIM) += claim.o
libnvdimm-$(CONFIG_BTT) += btt_devs.o libnvdimm-$(CONFIG_BTT) += btt_devs.o
libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += pfn_devs.o

drivers/nvdimm/badrange.c (new file, 293 lines)

@ -0,0 +1,293 @@
/*
* Copyright(c) 2017 Intel Corporation. All rights reserved.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of version 2 of the GNU General Public License as
* published by the Free Software Foundation.
*
* This program is distributed in the hope that it will be useful, but
* WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* General Public License for more details.
*/
#include <linux/libnvdimm.h>
#include <linux/badblocks.h>
#include <linux/export.h>
#include <linux/module.h>
#include <linux/blkdev.h>
#include <linux/device.h>
#include <linux/ctype.h>
#include <linux/ndctl.h>
#include <linux/mutex.h>
#include <linux/slab.h>
#include <linux/io.h>
#include "nd-core.h"
#include "nd.h"
void badrange_init(struct badrange *badrange)
{
INIT_LIST_HEAD(&badrange->list);
spin_lock_init(&badrange->lock);
}
EXPORT_SYMBOL_GPL(badrange_init);
static void append_badrange_entry(struct badrange *badrange,
struct badrange_entry *bre, u64 addr, u64 length)
{
lockdep_assert_held(&badrange->lock);
bre->start = addr;
bre->length = length;
list_add_tail(&bre->list, &badrange->list);
}
static int alloc_and_append_badrange_entry(struct badrange *badrange,
u64 addr, u64 length, gfp_t flags)
{
struct badrange_entry *bre;
bre = kzalloc(sizeof(*bre), flags);
if (!bre)
return -ENOMEM;
append_badrange_entry(badrange, bre, addr, length);
return 0;
}
static int add_badrange(struct badrange *badrange, u64 addr, u64 length)
{
struct badrange_entry *bre, *bre_new;
spin_unlock(&badrange->lock);
bre_new = kzalloc(sizeof(*bre_new), GFP_KERNEL);
spin_lock(&badrange->lock);
if (list_empty(&badrange->list)) {
if (!bre_new)
return -ENOMEM;
append_badrange_entry(badrange, bre_new, addr, length);
return 0;
}
/*
* There is a chance this is a duplicate, check for those first.
* This will be the common case as ARS_STATUS returns all known
* errors in the SPA space, and we can't query it per region
*/
list_for_each_entry(bre, &badrange->list, list)
if (bre->start == addr) {
/* If length has changed, update this list entry */
if (bre->length != length)
bre->length = length;
kfree(bre_new);
return 0;
}
/*
* If not a duplicate or a simple length update, add the entry as is,
* as any overlapping ranges will get resolved when the list is consumed
* and converted to badblocks
*/
if (!bre_new)
return -ENOMEM;
append_badrange_entry(badrange, bre_new, addr, length);
return 0;
}
int badrange_add(struct badrange *badrange, u64 addr, u64 length)
{
int rc;
spin_lock(&badrange->lock);
rc = add_badrange(badrange, addr, length);
spin_unlock(&badrange->lock);
return rc;
}
EXPORT_SYMBOL_GPL(badrange_add);
void badrange_forget(struct badrange *badrange, phys_addr_t start,
unsigned int len)
{
struct list_head *badrange_list = &badrange->list;
u64 clr_end = start + len - 1;
struct badrange_entry *bre, *next;
spin_lock(&badrange->lock);
/*
* [start, clr_end] is the badrange interval being cleared.
* [bre->start, bre_end] is the badrange_list entry we're comparing
* the above interval against. The badrange list entry may need
* to be modified (update either start or length), deleted, or
* split into two based on the overlap characteristics
*/
list_for_each_entry_safe(bre, next, badrange_list, list) {
u64 bre_end = bre->start + bre->length - 1;
/* Skip intervals with no intersection */
if (bre_end < start)
continue;
if (bre->start > clr_end)
continue;
/* Delete completely overlapped badrange entries */
if ((bre->start >= start) && (bre_end <= clr_end)) {
list_del(&bre->list);
kfree(bre);
continue;
}
/* Adjust start point of partially cleared entries */
if ((start <= bre->start) && (clr_end > bre->start)) {
bre->length -= clr_end - bre->start + 1;
bre->start = clr_end + 1;
continue;
}
/* Adjust bre->length for partial clearing at the tail end */
if ((bre->start < start) && (bre_end <= clr_end)) {
/* bre->start remains the same */
bre->length = start - bre->start;
continue;
}
/*
* If clearing in the middle of an entry, we split it into
* two by modifying the current entry to represent one half of
* the split, and adding a new entry for the second half.
*/
if ((bre->start < start) && (bre_end > clr_end)) {
u64 new_start = clr_end + 1;
u64 new_len = bre_end - new_start + 1;
/* Add new entry covering the right half */
alloc_and_append_badrange_entry(badrange, new_start,
new_len, GFP_NOWAIT);
/* Adjust this entry to cover the left half */
bre->length = start - bre->start;
continue;
}
}
spin_unlock(&badrange->lock);
}
EXPORT_SYMBOL_GPL(badrange_forget);
static void set_badblock(struct badblocks *bb, sector_t s, int num)
{
dev_dbg(bb->dev, "Found a bad range (0x%llx, 0x%llx)\n",
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
if (badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
}
/**
* __add_badblock_range() - Convert a physical address range to bad sectors
* @bb: badblocks instance to populate
* @ns_offset: namespace offset where the error range begins (in bytes)
* @len: number of bytes of badrange to be added
*
* This assumes that the range provided with (ns_offset, len) is within
* the bounds of physical addresses for this namespace, i.e. lies in the
* interval [ns_start, ns_start + ns_size)
*/
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
{
const unsigned int sector_size = 512;
sector_t start_sector, end_sector;
u64 num_sectors;
u32 rem;
start_sector = div_u64(ns_offset, sector_size);
end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
if (rem)
end_sector++;
num_sectors = end_sector - start_sector;
if (unlikely(num_sectors > (u64)INT_MAX)) {
u64 remaining = num_sectors;
sector_t s = start_sector;
while (remaining) {
int done = min_t(u64, remaining, INT_MAX);
set_badblock(bb, s, done);
remaining -= done;
s += done;
}
} else
set_badblock(bb, start_sector, num_sectors);
}
static void badblocks_populate(struct badrange *badrange,
struct badblocks *bb, const struct resource *res)
{
struct badrange_entry *bre;
if (list_empty(&badrange->list))
return;
list_for_each_entry(bre, &badrange->list, list) {
u64 bre_end = bre->start + bre->length - 1;
/* Discard intervals with no intersection */
if (bre_end < res->start)
continue;
if (bre->start > res->end)
continue;
/* Deal with any overlap after start of the namespace */
if (bre->start >= res->start) {
u64 start = bre->start;
u64 len;
if (bre_end <= res->end)
len = bre->length;
else
len = res->start + resource_size(res)
- bre->start;
__add_badblock_range(bb, start - res->start, len);
continue;
}
/*
* Deal with overlap for badrange starting before
* the namespace.
*/
if (bre->start < res->start) {
u64 len;
if (bre_end < res->end)
len = bre->start + bre->length - res->start;
else
len = resource_size(res);
__add_badblock_range(bb, 0, len);
}
}
}
/**
* nvdimm_badblocks_populate() - Convert a list of badranges to badblocks
* @region: parent region of the range to interrogate
* @bb: badblocks instance to populate
* @res: resource range to consider
*
* The badrange list generated during bus initialization may contain
* multiple, possibly overlapping physical address ranges. Compare each
* of these ranges to the resource range currently being initialized,
* and add badblocks entries for all matching sub-ranges
*/
void nvdimm_badblocks_populate(struct nd_region *nd_region,
struct badblocks *bb, const struct resource *res)
{
struct nvdimm_bus *nvdimm_bus;
if (!is_memory(&nd_region->dev)) {
dev_WARN_ONCE(&nd_region->dev, 1,
"%s only valid for pmem regions\n", __func__);
return;
}
nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
nvdimm_bus_lock(&nvdimm_bus->dev);
badblocks_populate(&nvdimm_bus->badrange, bb, res);
nvdimm_bus_unlock(&nvdimm_bus->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);


@ -11,6 +11,7 @@
* General Public License for more details. * General Public License for more details.
*/ */
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
#include <linux/libnvdimm.h>
#include <linux/sched/mm.h> #include <linux/sched/mm.h>
#include <linux/vmalloc.h> #include <linux/vmalloc.h>
#include <linux/uaccess.h> #include <linux/uaccess.h>
@ -221,7 +222,7 @@ static void nvdimm_account_cleared_poison(struct nvdimm_bus *nvdimm_bus,
phys_addr_t phys, u64 cleared) phys_addr_t phys, u64 cleared)
{ {
if (cleared > 0) if (cleared > 0)
nvdimm_forget_poison(nvdimm_bus, phys, cleared); badrange_forget(&nvdimm_bus->badrange, phys, cleared);
if (cleared > 0 && cleared / 512) if (cleared > 0 && cleared / 512)
nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared); nvdimm_clear_badblocks_regions(nvdimm_bus, phys, cleared);
@ -344,11 +345,10 @@ struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
return NULL; return NULL;
INIT_LIST_HEAD(&nvdimm_bus->list); INIT_LIST_HEAD(&nvdimm_bus->list);
INIT_LIST_HEAD(&nvdimm_bus->mapping_list); INIT_LIST_HEAD(&nvdimm_bus->mapping_list);
INIT_LIST_HEAD(&nvdimm_bus->poison_list);
init_waitqueue_head(&nvdimm_bus->probe_wait); init_waitqueue_head(&nvdimm_bus->probe_wait);
nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL); nvdimm_bus->id = ida_simple_get(&nd_ida, 0, 0, GFP_KERNEL);
mutex_init(&nvdimm_bus->reconfig_mutex); mutex_init(&nvdimm_bus->reconfig_mutex);
spin_lock_init(&nvdimm_bus->poison_lock); badrange_init(&nvdimm_bus->badrange);
if (nvdimm_bus->id < 0) { if (nvdimm_bus->id < 0) {
kfree(nvdimm_bus); kfree(nvdimm_bus);
return NULL; return NULL;
@ -395,15 +395,15 @@ static int child_unregister(struct device *dev, void *data)
return 0; return 0;
} }
static void free_poison_list(struct list_head *poison_list) static void free_badrange_list(struct list_head *badrange_list)
{ {
struct nd_poison *pl, *next; struct badrange_entry *bre, *next;
list_for_each_entry_safe(pl, next, poison_list, list) { list_for_each_entry_safe(bre, next, badrange_list, list) {
list_del(&pl->list); list_del(&bre->list);
kfree(pl); kfree(bre);
} }
list_del_init(poison_list); list_del_init(badrange_list);
} }
static int nd_bus_remove(struct device *dev) static int nd_bus_remove(struct device *dev)
@ -417,9 +417,9 @@ static int nd_bus_remove(struct device *dev)
nd_synchronize(); nd_synchronize();
device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister); device_for_each_child(&nvdimm_bus->dev, NULL, child_unregister);
spin_lock(&nvdimm_bus->poison_lock); spin_lock(&nvdimm_bus->badrange.lock);
free_poison_list(&nvdimm_bus->poison_list); free_badrange_list(&nvdimm_bus->badrange.list);
spin_unlock(&nvdimm_bus->poison_lock); spin_unlock(&nvdimm_bus->badrange.lock);
nvdimm_bus_destroy_ndctl(nvdimm_bus); nvdimm_bus_destroy_ndctl(nvdimm_bus);


@ -398,265 +398,11 @@ struct attribute_group nvdimm_bus_attribute_group = {
}; };
EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group); EXPORT_SYMBOL_GPL(nvdimm_bus_attribute_group);
static void set_badblock(struct badblocks *bb, sector_t s, int num) int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{ {
dev_dbg(bb->dev, "Found a poison range (0x%llx, 0x%llx)\n", return badrange_add(&nvdimm_bus->badrange, addr, length);
(u64) s * 512, (u64) num * 512);
/* this isn't an error as the hardware will still throw an exception */
if (badblocks_set(bb, s, num, 1))
dev_info_once(bb->dev, "%s: failed for sector %llx\n",
__func__, (u64) s);
} }
EXPORT_SYMBOL_GPL(nvdimm_bus_add_badrange);
/**
* __add_badblock_range() - Convert a physical address range to bad sectors
* @bb: badblocks instance to populate
* @ns_offset: namespace offset where the error range begins (in bytes)
* @len: number of bytes of poison to be added
*
* This assumes that the range provided with (ns_offset, len) is within
* the bounds of physical addresses for this namespace, i.e. lies in the
* interval [ns_start, ns_start + ns_size)
*/
static void __add_badblock_range(struct badblocks *bb, u64 ns_offset, u64 len)
{
const unsigned int sector_size = 512;
sector_t start_sector, end_sector;
u64 num_sectors;
u32 rem;
start_sector = div_u64(ns_offset, sector_size);
end_sector = div_u64_rem(ns_offset + len, sector_size, &rem);
if (rem)
end_sector++;
num_sectors = end_sector - start_sector;
if (unlikely(num_sectors > (u64)INT_MAX)) {
u64 remaining = num_sectors;
sector_t s = start_sector;
while (remaining) {
int done = min_t(u64, remaining, INT_MAX);
set_badblock(bb, s, done);
remaining -= done;
s += done;
}
} else
set_badblock(bb, start_sector, num_sectors);
}
static void badblocks_populate(struct list_head *poison_list,
struct badblocks *bb, const struct resource *res)
{
struct nd_poison *pl;
if (list_empty(poison_list))
return;
list_for_each_entry(pl, poison_list, list) {
u64 pl_end = pl->start + pl->length - 1;
/* Discard intervals with no intersection */
if (pl_end < res->start)
continue;
if (pl->start > res->end)
continue;
/* Deal with any overlap after start of the namespace */
if (pl->start >= res->start) {
u64 start = pl->start;
u64 len;
if (pl_end <= res->end)
len = pl->length;
else
len = res->start + resource_size(res)
- pl->start;
__add_badblock_range(bb, start - res->start, len);
continue;
}
/* Deal with overlap for poison starting before the namespace */
if (pl->start < res->start) {
u64 len;
if (pl_end < res->end)
len = pl->start + pl->length - res->start;
else
len = resource_size(res);
__add_badblock_range(bb, 0, len);
}
}
}
/**
* nvdimm_badblocks_populate() - Convert a list of poison ranges to badblocks
* @region: parent region of the range to interrogate
* @bb: badblocks instance to populate
* @res: resource range to consider
*
* The poison list generated during bus initialization may contain
* multiple, possibly overlapping physical address ranges. Compare each
* of these ranges to the resource range currently being initialized,
* and add badblocks entries for all matching sub-ranges
*/
void nvdimm_badblocks_populate(struct nd_region *nd_region,
struct badblocks *bb, const struct resource *res)
{
struct nvdimm_bus *nvdimm_bus;
struct list_head *poison_list;
if (!is_memory(&nd_region->dev)) {
dev_WARN_ONCE(&nd_region->dev, 1,
"%s only valid for pmem regions\n", __func__);
return;
}
nvdimm_bus = walk_to_nvdimm_bus(&nd_region->dev);
poison_list = &nvdimm_bus->poison_list;
nvdimm_bus_lock(&nvdimm_bus->dev);
badblocks_populate(poison_list, bb, res);
nvdimm_bus_unlock(&nvdimm_bus->dev);
}
EXPORT_SYMBOL_GPL(nvdimm_badblocks_populate);
static void append_poison_entry(struct nvdimm_bus *nvdimm_bus,
struct nd_poison *pl, u64 addr, u64 length)
{
lockdep_assert_held(&nvdimm_bus->poison_lock);
pl->start = addr;
pl->length = length;
list_add_tail(&pl->list, &nvdimm_bus->poison_list);
}
static int add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length,
gfp_t flags)
{
struct nd_poison *pl;
pl = kzalloc(sizeof(*pl), flags);
if (!pl)
return -ENOMEM;
append_poison_entry(nvdimm_bus, pl, addr, length);
return 0;
}
static int bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
struct nd_poison *pl, *pl_new;
spin_unlock(&nvdimm_bus->poison_lock);
pl_new = kzalloc(sizeof(*pl_new), GFP_KERNEL);
spin_lock(&nvdimm_bus->poison_lock);
if (list_empty(&nvdimm_bus->poison_list)) {
if (!pl_new)
return -ENOMEM;
append_poison_entry(nvdimm_bus, pl_new, addr, length);
return 0;
}
/*
* There is a chance this is a duplicate, check for those first.
* This will be the common case as ARS_STATUS returns all known
* errors in the SPA space, and we can't query it per region
*/
list_for_each_entry(pl, &nvdimm_bus->poison_list, list)
if (pl->start == addr) {
/* If length has changed, update this list entry */
if (pl->length != length)
pl->length = length;
kfree(pl_new);
return 0;
}
/*
* If not a duplicate or a simple length update, add the entry as is,
* as any overlapping ranges will get resolved when the list is consumed
* and converted to badblocks
*/
if (!pl_new)
return -ENOMEM;
append_poison_entry(nvdimm_bus, pl_new, addr, length);
return 0;
}
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length)
{
int rc;
spin_lock(&nvdimm_bus->poison_lock);
rc = bus_add_poison(nvdimm_bus, addr, length);
spin_unlock(&nvdimm_bus->poison_lock);
return rc;
}
EXPORT_SYMBOL_GPL(nvdimm_bus_add_poison);
void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, phys_addr_t start,
unsigned int len)
{
struct list_head *poison_list = &nvdimm_bus->poison_list;
u64 clr_end = start + len - 1;
struct nd_poison *pl, *next;
spin_lock(&nvdimm_bus->poison_lock);
WARN_ON_ONCE(list_empty(poison_list));
/*
* [start, clr_end] is the poison interval being cleared.
* [pl->start, pl_end] is the poison_list entry we're comparing
* the above interval against. The poison list entry may need
* to be modified (update either start or length), deleted, or
* split into two based on the overlap characteristics
*/
list_for_each_entry_safe(pl, next, poison_list, list) {
u64 pl_end = pl->start + pl->length - 1;
/* Skip intervals with no intersection */
if (pl_end < start)
continue;
if (pl->start > clr_end)
continue;
/* Delete completely overlapped poison entries */
if ((pl->start >= start) && (pl_end <= clr_end)) {
list_del(&pl->list);
kfree(pl);
continue;
}
/* Adjust start point of partially cleared entries */
if ((start <= pl->start) && (clr_end > pl->start)) {
pl->length -= clr_end - pl->start + 1;
pl->start = clr_end + 1;
continue;
}
/* Adjust pl->length for partial clearing at the tail end */
if ((pl->start < start) && (pl_end <= clr_end)) {
/* pl->start remains the same */
pl->length = start - pl->start;
continue;
}
/*
* If clearing in the middle of an entry, we split it into
* two by modifying the current entry to represent one half of
* the split, and adding a new entry for the second half.
*/
if ((pl->start < start) && (pl_end > clr_end)) {
u64 new_start = clr_end + 1;
u64 new_len = pl_end - new_start + 1;
/* Add new entry covering the right half */
add_poison(nvdimm_bus, new_start, new_len, GFP_NOWAIT);
/* Adjust this entry to cover the left half */
pl->length = start - pl->start;
continue;
}
}
spin_unlock(&nvdimm_bus->poison_lock);
}
EXPORT_SYMBOL_GPL(nvdimm_forget_poison);
#ifdef CONFIG_BLK_DEV_INTEGRITY #ifdef CONFIG_BLK_DEV_INTEGRITY
int nd_integrity_init(struct gendisk *disk, unsigned long meta_size) int nd_integrity_init(struct gendisk *disk, unsigned long meta_size)


@ -55,6 +55,8 @@ static int nvdimm_probe(struct device *dev)
goto err; goto err;
rc = nvdimm_init_config_data(ndd); rc = nvdimm_init_config_data(ndd);
if (rc == -EACCES)
nvdimm_set_locked(dev);
if (rc) if (rc)
goto err; goto err;
@ -68,6 +70,7 @@ static int nvdimm_probe(struct device *dev)
rc = nd_label_reserve_dpa(ndd); rc = nd_label_reserve_dpa(ndd);
if (ndd->ns_current >= 0) if (ndd->ns_current >= 0)
nvdimm_set_aliasing(dev); nvdimm_set_aliasing(dev);
nvdimm_clear_locked(dev);
nvdimm_bus_unlock(dev); nvdimm_bus_unlock(dev);
if (rc) if (rc)


@ -200,6 +200,13 @@ void nvdimm_set_locked(struct device *dev)
set_bit(NDD_LOCKED, &nvdimm->flags); set_bit(NDD_LOCKED, &nvdimm->flags);
} }
void nvdimm_clear_locked(struct device *dev)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
clear_bit(NDD_LOCKED, &nvdimm->flags);
}
static void nvdimm_release(struct device *dev) static void nvdimm_release(struct device *dev)
{ {
struct nvdimm *nvdimm = to_nvdimm(dev); struct nvdimm *nvdimm = to_nvdimm(dev);
@ -324,6 +331,17 @@ static ssize_t commands_show(struct device *dev,
} }
static DEVICE_ATTR_RO(commands); static DEVICE_ATTR_RO(commands);
static ssize_t flags_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct nvdimm *nvdimm = to_nvdimm(dev);
return sprintf(buf, "%s%s\n",
test_bit(NDD_ALIASING, &nvdimm->flags) ? "alias " : "",
test_bit(NDD_LOCKED, &nvdimm->flags) ? "lock " : "");
}
static DEVICE_ATTR_RO(flags);
static ssize_t state_show(struct device *dev, struct device_attribute *attr, static ssize_t state_show(struct device *dev, struct device_attribute *attr,
char *buf) char *buf)
{ {
@ -365,6 +383,7 @@ static DEVICE_ATTR_RO(available_slots);
static struct attribute *nvdimm_attributes[] = { static struct attribute *nvdimm_attributes[] = {
&dev_attr_state.attr, &dev_attr_state.attr,
&dev_attr_flags.attr,
&dev_attr_commands.attr, &dev_attr_commands.attr,
&dev_attr_available_slots.attr, &dev_attr_available_slots.attr,
NULL, NULL,


@ -1050,7 +1050,7 @@ static int init_labels(struct nd_mapping *nd_mapping, int num_labels)
nsindex = to_namespace_index(ndd, 0); nsindex = to_namespace_index(ndd, 0);
memset(nsindex, 0, ndd->nsarea.config_size); memset(nsindex, 0, ndd->nsarea.config_size);
for (i = 0; i < 2; i++) { for (i = 0; i < 2; i++) {
int rc = nd_label_write_index(ndd, i, i*2, ND_NSINDEX_INIT); int rc = nd_label_write_index(ndd, i, 3 - i, ND_NSINDEX_INIT);
if (rc) if (rc)
return rc; return rc;


@ -1620,7 +1620,7 @@ static umode_t namespace_visible(struct kobject *kobj,
if (a == &dev_attr_resource.attr) { if (a == &dev_attr_resource.attr) {
if (is_namespace_blk(dev)) if (is_namespace_blk(dev))
return 0; return 0;
return a->mode; return 0400;
} }
if (is_namespace_pmem(dev) || is_namespace_blk(dev)) { if (is_namespace_pmem(dev) || is_namespace_blk(dev)) {
@ -1875,7 +1875,7 @@ static int select_pmem_id(struct nd_region *nd_region, u8 *pmem_id)
* @nspm: target namespace to create * @nspm: target namespace to create
* @nd_label: target pmem namespace label to evaluate * @nd_label: target pmem namespace label to evaluate
*/ */
struct device *create_namespace_pmem(struct nd_region *nd_region, static struct device *create_namespace_pmem(struct nd_region *nd_region,
struct nd_namespace_index *nsindex, struct nd_namespace_index *nsindex,
struct nd_namespace_label *nd_label) struct nd_namespace_label *nd_label)
{ {
@ -2186,7 +2186,7 @@ static int add_namespace_resource(struct nd_region *nd_region,
return i; return i;
} }
struct device *create_namespace_blk(struct nd_region *nd_region, static struct device *create_namespace_blk(struct nd_region *nd_region,
struct nd_namespace_label *nd_label, int count) struct nd_namespace_label *nd_label, int count)
{ {


@ -29,10 +29,9 @@ struct nvdimm_bus {
struct list_head list; struct list_head list;
struct device dev; struct device dev;
int id, probe_active; int id, probe_active;
struct list_head poison_list;
struct list_head mapping_list; struct list_head mapping_list;
struct mutex reconfig_mutex; struct mutex reconfig_mutex;
spinlock_t poison_lock; struct badrange badrange;
}; };
struct nvdimm { struct nvdimm {


@ -34,12 +34,6 @@ enum {
NVDIMM_IO_ATOMIC = 1, NVDIMM_IO_ATOMIC = 1,
}; };
struct nd_poison {
u64 start;
u64 length;
struct list_head list;
};
struct nvdimm_drvdata { struct nvdimm_drvdata {
struct device *dev; struct device *dev;
int nslabel_size; int nslabel_size;
@ -254,6 +248,7 @@ long nvdimm_clear_poison(struct device *dev, phys_addr_t phys,
unsigned int len); unsigned int len);
void nvdimm_set_aliasing(struct device *dev); void nvdimm_set_aliasing(struct device *dev);
void nvdimm_set_locked(struct device *dev); void nvdimm_set_locked(struct device *dev);
void nvdimm_clear_locked(struct device *dev);
struct nd_btt *to_nd_btt(struct device *dev); struct nd_btt *to_nd_btt(struct device *dev);
struct nd_gen_sb { struct nd_gen_sb {


@ -282,8 +282,16 @@ static struct attribute *nd_pfn_attributes[] = {
NULL, NULL,
}; };
static umode_t pfn_visible(struct kobject *kobj, struct attribute *a, int n)
{
if (a == &dev_attr_resource.attr)
return 0400;
return a->mode;
}
struct attribute_group nd_pfn_attribute_group = { struct attribute_group nd_pfn_attribute_group = {
.attrs = nd_pfn_attributes, .attrs = nd_pfn_attributes,
.is_visible = pfn_visible,
}; };
static const struct attribute_group *nd_pfn_attribute_groups[] = { static const struct attribute_group *nd_pfn_attribute_groups[] = {


@ -562,8 +562,12 @@ static umode_t region_visible(struct kobject *kobj, struct attribute *a, int n)
if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr) if (!is_nd_pmem(dev) && a == &dev_attr_badblocks.attr)
return 0; return 0;
if (!is_nd_pmem(dev) && a == &dev_attr_resource.attr) if (a == &dev_attr_resource.attr) {
return 0; if (is_nd_pmem(dev))
return 0400;
else
return 0;
}
if (a == &dev_attr_deep_flush.attr) { if (a == &dev_attr_deep_flush.attr) {
int has_flush = nvdimm_has_flush(nd_region); int has_flush = nvdimm_has_flush(nd_region);

fs/dax.c

@ -526,13 +526,13 @@ static int copy_user_dax(struct block_device *bdev, struct dax_device *dax_dev,
static void *dax_insert_mapping_entry(struct address_space *mapping, static void *dax_insert_mapping_entry(struct address_space *mapping,
struct vm_fault *vmf, struct vm_fault *vmf,
void *entry, sector_t sector, void *entry, sector_t sector,
unsigned long flags) unsigned long flags, bool dirty)
{ {
struct radix_tree_root *page_tree = &mapping->page_tree; struct radix_tree_root *page_tree = &mapping->page_tree;
void *new_entry; void *new_entry;
pgoff_t index = vmf->pgoff; pgoff_t index = vmf->pgoff;
if (vmf->flags & FAULT_FLAG_WRITE) if (dirty)
__mark_inode_dirty(mapping->host, I_DIRTY_PAGES); __mark_inode_dirty(mapping->host, I_DIRTY_PAGES);
if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) { if (dax_is_zero_entry(entry) && !(flags & RADIX_DAX_ZERO_PAGE)) {
@ -569,7 +569,7 @@ static void *dax_insert_mapping_entry(struct address_space *mapping,
entry = new_entry; entry = new_entry;
} }
if (vmf->flags & FAULT_FLAG_WRITE) if (dirty)
radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY);
spin_unlock_irq(&mapping->tree_lock); spin_unlock_irq(&mapping->tree_lock);
@ -825,38 +825,42 @@ out:
} }
EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); EXPORT_SYMBOL_GPL(dax_writeback_mapping_range);
static int dax_insert_mapping(struct address_space *mapping, static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
struct block_device *bdev, struct dax_device *dax_dev,
sector_t sector, size_t size, void *entry,
struct vm_area_struct *vma, struct vm_fault *vmf)
{ {
unsigned long vaddr = vmf->address; return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
void *ret, *kaddr; }
pgoff_t pgoff;
int id, rc;
pfn_t pfn;
rc = bdev_dax_pgoff(bdev, sector, size, &pgoff); static int dax_iomap_pfn(struct iomap *iomap, loff_t pos, size_t size,
pfn_t *pfnp)
{
const sector_t sector = dax_iomap_sector(iomap, pos);
pgoff_t pgoff;
void *kaddr;
int id, rc;
long length;
rc = bdev_dax_pgoff(iomap->bdev, sector, size, &pgoff);
if (rc) if (rc)
return rc; return rc;
id = dax_read_lock(); id = dax_read_lock();
rc = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn); length = dax_direct_access(iomap->dax_dev, pgoff, PHYS_PFN(size),
if (rc < 0) { &kaddr, pfnp);
dax_read_unlock(id); if (length < 0) {
return rc; rc = length;
goto out;
} }
rc = -EINVAL;
if (PFN_PHYS(length) < size)
goto out;
if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1))
goto out;
/* For larger pages we need devmap */
if (length > 1 && !pfn_t_devmap(*pfnp))
goto out;
rc = 0;
out:
dax_read_unlock(id); dax_read_unlock(id);
return rc;
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector, 0);
if (IS_ERR(ret))
return PTR_ERR(ret);
trace_dax_insert_mapping(mapping->host, vmf, ret);
if (vmf->flags & FAULT_FLAG_WRITE)
return vm_insert_mixed_mkwrite(vma, vaddr, pfn);
else
return vm_insert_mixed(vma, vaddr, pfn);
} }
/* /*
@ -882,7 +886,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry,
} }
entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0, entry2 = dax_insert_mapping_entry(mapping, vmf, entry, 0,
RADIX_DAX_ZERO_PAGE); RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(entry2)) { if (IS_ERR(entry2)) {
ret = VM_FAULT_SIGBUS; ret = VM_FAULT_SIGBUS;
goto out; goto out;
@ -941,11 +945,6 @@ int __dax_zero_page_range(struct block_device *bdev,
} }
EXPORT_SYMBOL_GPL(__dax_zero_page_range); EXPORT_SYMBOL_GPL(__dax_zero_page_range);
static sector_t dax_iomap_sector(struct iomap *iomap, loff_t pos)
{
return (iomap->addr + (pos & PAGE_MASK) - iomap->offset) >> 9;
}
static loff_t static loff_t
dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data, dax_iomap_actor(struct inode *inode, loff_t pos, loff_t length, void *data,
struct iomap *iomap) struct iomap *iomap)
@ -1085,19 +1084,33 @@ static int dax_fault_return(int error)
return VM_FAULT_SIGBUS; return VM_FAULT_SIGBUS;
} }
static int dax_iomap_pte_fault(struct vm_fault *vmf, /*
* MAP_SYNC on a dax mapping guarantees dirty metadata is
* flushed on write-faults (non-cow), but not read-faults.
*/
static bool dax_fault_is_synchronous(unsigned long flags,
struct vm_area_struct *vma, struct iomap *iomap)
{
return (flags & IOMAP_WRITE) && (vma->vm_flags & VM_SYNC)
&& (iomap->flags & IOMAP_F_DIRTY);
}
static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops) const struct iomap_ops *ops)
{ {
struct address_space *mapping = vmf->vma->vm_file->f_mapping; struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
unsigned long vaddr = vmf->address; unsigned long vaddr = vmf->address;
loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT; loff_t pos = (loff_t)vmf->pgoff << PAGE_SHIFT;
sector_t sector;
struct iomap iomap = { 0 }; struct iomap iomap = { 0 };
unsigned flags = IOMAP_FAULT; unsigned flags = IOMAP_FAULT;
int error, major = 0; int error, major = 0;
bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
int vmf_ret = 0; int vmf_ret = 0;
void *entry; void *entry;
pfn_t pfn;
trace_dax_pte_fault(inode, vmf, vmf_ret); trace_dax_pte_fault(inode, vmf, vmf_ret);
/* /*
@ -1110,7 +1123,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto out; goto out;
} }
if ((vmf->flags & FAULT_FLAG_WRITE) && !vmf->cow_page) if (write && !vmf->cow_page)
flags |= IOMAP_WRITE; flags |= IOMAP_WRITE;
entry = grab_mapping_entry(mapping, vmf->pgoff, 0); entry = grab_mapping_entry(mapping, vmf->pgoff, 0);
@ -1145,9 +1158,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto error_finish_iomap; goto error_finish_iomap;
} }
sector = dax_iomap_sector(&iomap, pos);
if (vmf->cow_page) { if (vmf->cow_page) {
sector_t sector = dax_iomap_sector(&iomap, pos);
switch (iomap.type) { switch (iomap.type) {
case IOMAP_HOLE: case IOMAP_HOLE:
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
@ -1173,22 +1186,55 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
goto finish_iomap; goto finish_iomap;
} }
sync = dax_fault_is_synchronous(flags, vma, &iomap);
switch (iomap.type) { switch (iomap.type) {
case IOMAP_MAPPED: case IOMAP_MAPPED:
if (iomap.flags & IOMAP_F_NEW) { if (iomap.flags & IOMAP_F_NEW) {
count_vm_event(PGMAJFAULT); count_vm_event(PGMAJFAULT);
count_memcg_event_mm(vmf->vma->vm_mm, PGMAJFAULT); count_memcg_event_mm(vma->vm_mm, PGMAJFAULT);
major = VM_FAULT_MAJOR; major = VM_FAULT_MAJOR;
} }
error = dax_insert_mapping(mapping, iomap.bdev, iomap.dax_dev, error = dax_iomap_pfn(&iomap, pos, PAGE_SIZE, &pfn);
sector, PAGE_SIZE, entry, vmf->vma, vmf); if (error < 0)
goto error_finish_iomap;
entry = dax_insert_mapping_entry(mapping, vmf, entry,
dax_iomap_sector(&iomap, pos),
0, write && !sync);
if (IS_ERR(entry)) {
error = PTR_ERR(entry);
goto error_finish_iomap;
}
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PTE into page tables only after that happens.
* Skip insertion for now and return the pfn so that caller can
* insert it after fsync is done.
*/
if (sync) {
if (WARN_ON_ONCE(!pfnp)) {
error = -EIO;
goto error_finish_iomap;
}
*pfnp = pfn;
vmf_ret = VM_FAULT_NEEDDSYNC | major;
goto finish_iomap;
}
trace_dax_insert_mapping(inode, vmf, entry);
if (write)
error = vm_insert_mixed_mkwrite(vma, vaddr, pfn);
else
error = vm_insert_mixed(vma, vaddr, pfn);
/* -EBUSY is fine, somebody else faulted on the same PTE */ /* -EBUSY is fine, somebody else faulted on the same PTE */
if (error == -EBUSY) if (error == -EBUSY)
error = 0; error = 0;
break; break;
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
if (!(vmf->flags & FAULT_FLAG_WRITE)) { if (!write) {
vmf_ret = dax_load_hole(mapping, entry, vmf); vmf_ret = dax_load_hole(mapping, entry, vmf);
goto finish_iomap; goto finish_iomap;
} }
@ -1223,53 +1269,11 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf,
} }
#ifdef CONFIG_FS_DAX_PMD #ifdef CONFIG_FS_DAX_PMD
static int dax_pmd_insert_mapping(struct vm_fault *vmf, struct iomap *iomap, /*
loff_t pos, void *entry) * The 'colour' (ie low bits) within a PMD of a page offset. This comes up
{ * more often than one might expect in the below functions.
struct address_space *mapping = vmf->vma->vm_file->f_mapping; */
const sector_t sector = dax_iomap_sector(iomap, pos); #define PG_PMD_COLOUR ((PMD_SIZE >> PAGE_SHIFT) - 1)
struct dax_device *dax_dev = iomap->dax_dev;
struct block_device *bdev = iomap->bdev;
struct inode *inode = mapping->host;
const size_t size = PMD_SIZE;
void *ret = NULL, *kaddr;
long length = 0;
pgoff_t pgoff;
pfn_t pfn = {};
int id;
if (bdev_dax_pgoff(bdev, sector, size, &pgoff) != 0)
goto fallback;
id = dax_read_lock();
length = dax_direct_access(dax_dev, pgoff, PHYS_PFN(size), &kaddr, &pfn);
if (length < 0)
goto unlock_fallback;
length = PFN_PHYS(length);
if (length < size)
goto unlock_fallback;
if (pfn_t_to_pfn(pfn) & PG_PMD_COLOUR)
goto unlock_fallback;
if (!pfn_t_devmap(pfn))
goto unlock_fallback;
dax_read_unlock(id);
ret = dax_insert_mapping_entry(mapping, vmf, entry, sector,
RADIX_DAX_PMD);
if (IS_ERR(ret))
goto fallback;
trace_dax_pmd_insert_mapping(inode, vmf, length, pfn, ret);
return vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, vmf->flags & FAULT_FLAG_WRITE);
unlock_fallback:
dax_read_unlock(id);
fallback:
trace_dax_pmd_insert_mapping_fallback(inode, vmf, length, pfn, ret);
return VM_FAULT_FALLBACK;
}
static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
void *entry) void *entry)
@ -1288,7 +1292,7 @@ static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap,
goto fallback; goto fallback;
ret = dax_insert_mapping_entry(mapping, vmf, entry, 0, ret = dax_insert_mapping_entry(mapping, vmf, entry, 0,
RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE); RADIX_DAX_PMD | RADIX_DAX_ZERO_PAGE, false);
if (IS_ERR(ret)) if (IS_ERR(ret))
goto fallback; goto fallback;
@ -1310,13 +1314,14 @@ fallback:
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
static int dax_iomap_pmd_fault(struct vm_fault *vmf, static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops) const struct iomap_ops *ops)
{ {
struct vm_area_struct *vma = vmf->vma; struct vm_area_struct *vma = vmf->vma;
struct address_space *mapping = vma->vm_file->f_mapping; struct address_space *mapping = vma->vm_file->f_mapping;
unsigned long pmd_addr = vmf->address & PMD_MASK; unsigned long pmd_addr = vmf->address & PMD_MASK;
bool write = vmf->flags & FAULT_FLAG_WRITE; bool write = vmf->flags & FAULT_FLAG_WRITE;
bool sync;
unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT; unsigned int iomap_flags = (write ? IOMAP_WRITE : 0) | IOMAP_FAULT;
struct inode *inode = mapping->host; struct inode *inode = mapping->host;
int result = VM_FAULT_FALLBACK; int result = VM_FAULT_FALLBACK;
@ -1325,6 +1330,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
void *entry; void *entry;
loff_t pos; loff_t pos;
int error; int error;
pfn_t pfn;
/* /*
* Check whether offset isn't beyond end of file now. Caller is * Check whether offset isn't beyond end of file now. Caller is
@ -1332,7 +1338,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* this is a reliable test. * this is a reliable test.
*/ */
pgoff = linear_page_index(vma, pmd_addr); pgoff = linear_page_index(vma, pmd_addr);
max_pgoff = (i_size_read(inode) - 1) >> PAGE_SHIFT; max_pgoff = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE);
trace_dax_pmd_fault(inode, vmf, max_pgoff, 0); trace_dax_pmd_fault(inode, vmf, max_pgoff, 0);
@ -1356,13 +1362,13 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if ((pmd_addr + PMD_SIZE) > vma->vm_end) if ((pmd_addr + PMD_SIZE) > vma->vm_end)
goto fallback; goto fallback;
if (pgoff > max_pgoff) { if (pgoff >= max_pgoff) {
result = VM_FAULT_SIGBUS; result = VM_FAULT_SIGBUS;
goto out; goto out;
} }
/* If the PMD would extend beyond the file size */ /* If the PMD would extend beyond the file size */
if ((pgoff | PG_PMD_COLOUR) > max_pgoff) if ((pgoff | PG_PMD_COLOUR) >= max_pgoff)
goto fallback; goto fallback;
/* /*
@ -1400,9 +1406,37 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
if (iomap.offset + iomap.length < pos + PMD_SIZE) if (iomap.offset + iomap.length < pos + PMD_SIZE)
goto finish_iomap; goto finish_iomap;
sync = dax_fault_is_synchronous(iomap_flags, vma, &iomap);
switch (iomap.type) { switch (iomap.type) {
case IOMAP_MAPPED: case IOMAP_MAPPED:
result = dax_pmd_insert_mapping(vmf, &iomap, pos, entry); error = dax_iomap_pfn(&iomap, pos, PMD_SIZE, &pfn);
if (error < 0)
goto finish_iomap;
entry = dax_insert_mapping_entry(mapping, vmf, entry,
dax_iomap_sector(&iomap, pos),
RADIX_DAX_PMD, write && !sync);
if (IS_ERR(entry))
goto finish_iomap;
/*
* If we are doing synchronous page fault and inode needs fsync,
* we can insert PMD into page tables only after that happens.
* Skip insertion for now and return the pfn so that caller can
* insert it after fsync is done.
*/
if (sync) {
if (WARN_ON_ONCE(!pfnp))
goto finish_iomap;
*pfnp = pfn;
result = VM_FAULT_NEEDDSYNC;
goto finish_iomap;
}
trace_dax_pmd_insert_mapping(inode, vmf, PMD_SIZE, pfn, entry);
result = vmf_insert_pfn_pmd(vma, vmf->address, vmf->pmd, pfn,
write);
break; break;
case IOMAP_UNWRITTEN: case IOMAP_UNWRITTEN:
case IOMAP_HOLE: case IOMAP_HOLE:
@ -1442,7 +1476,7 @@ out:
return result; return result;
} }
#else #else
static int dax_iomap_pmd_fault(struct vm_fault *vmf, static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp,
const struct iomap_ops *ops) const struct iomap_ops *ops)
{ {
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
@ -1452,7 +1486,9 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
/** /**
* dax_iomap_fault - handle a page fault on a DAX file * dax_iomap_fault - handle a page fault on a DAX file
* @vmf: The description of the fault * @vmf: The description of the fault
* @ops: iomap ops passed from the file system * @pe_size: Size of the page to fault in
* @pfnp: PFN to insert for synchronous faults if fsync is required
* @ops: Iomap ops passed from the file system
* *
* When a page fault occurs, filesystems may call this helper in * When a page fault occurs, filesystems may call this helper in
* their fault handler for DAX files. dax_iomap_fault() assumes the caller * their fault handler for DAX files. dax_iomap_fault() assumes the caller
@ -1460,15 +1496,98 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf,
* successfully. * successfully.
*/ */
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops) pfn_t *pfnp, const struct iomap_ops *ops)
{ {
switch (pe_size) { switch (pe_size) {
case PE_SIZE_PTE: case PE_SIZE_PTE:
return dax_iomap_pte_fault(vmf, ops); return dax_iomap_pte_fault(vmf, pfnp, ops);
case PE_SIZE_PMD: case PE_SIZE_PMD:
return dax_iomap_pmd_fault(vmf, ops); return dax_iomap_pmd_fault(vmf, pfnp, ops);
default: default:
return VM_FAULT_FALLBACK; return VM_FAULT_FALLBACK;
} }
} }
EXPORT_SYMBOL_GPL(dax_iomap_fault); EXPORT_SYMBOL_GPL(dax_iomap_fault);
/**
* dax_insert_pfn_mkwrite - insert PTE or PMD entry into page tables
* @vmf: The description of the fault
* @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
*
* This function inserts writeable PTE or PMD entry into page tables for mmaped
* DAX file. It takes care of marking corresponding radix tree entry as dirty
* as well.
*/
static int dax_insert_pfn_mkwrite(struct vm_fault *vmf,
enum page_entry_size pe_size,
pfn_t pfn)
{
struct address_space *mapping = vmf->vma->vm_file->f_mapping;
void *entry, **slot;
pgoff_t index = vmf->pgoff;
int vmf_ret, error;
spin_lock_irq(&mapping->tree_lock);
entry = get_unlocked_mapping_entry(mapping, index, &slot);
/* Did we race with someone splitting entry or so? */
if (!entry ||
(pe_size == PE_SIZE_PTE && !dax_is_pte_entry(entry)) ||
(pe_size == PE_SIZE_PMD && !dax_is_pmd_entry(entry))) {
put_unlocked_mapping_entry(mapping, index, entry);
spin_unlock_irq(&mapping->tree_lock);
trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf,
VM_FAULT_NOPAGE);
return VM_FAULT_NOPAGE;
}
radix_tree_tag_set(&mapping->page_tree, index, PAGECACHE_TAG_DIRTY);
entry = lock_slot(mapping, slot);
spin_unlock_irq(&mapping->tree_lock);
switch (pe_size) {
case PE_SIZE_PTE:
error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn);
vmf_ret = dax_fault_return(error);
break;
#ifdef CONFIG_FS_DAX_PMD
case PE_SIZE_PMD:
vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd,
pfn, true);
break;
#endif
default:
vmf_ret = VM_FAULT_FALLBACK;
}
put_locked_mapping_entry(mapping, index);
trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret);
return vmf_ret;
}
/**
* dax_finish_sync_fault - finish synchronous page fault
* @vmf: The description of the fault
* @pe_size: Size of entry to be inserted
* @pfn: PFN to insert
*
* This function ensures that the file range touched by the page fault is
* stored persistently on the media and handles inserting of appropriate page
* table entry.
*/
int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t pfn)
{
int err;
loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT;
size_t len = 0;
if (pe_size == PE_SIZE_PTE)
len = PAGE_SIZE;
else if (pe_size == PE_SIZE_PMD)
len = PMD_SIZE;
else
WARN_ON_ONCE(1);
err = vfs_fsync_range(vmf->vma->vm_file, start, start + len - 1, 1);
if (err)
return VM_FAULT_SIGBUS;
return dax_insert_pfn_mkwrite(vmf, pe_size, pfn);
}
EXPORT_SYMBOL_GPL(dax_finish_sync_fault);
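
As a usage illustration (not part of this pull request), a minimal userspace consumer of the new flags might look as follows; the filesystem path is made up, and the fallback #defines simply mirror the values added by this series:

/* minimal MAP_SYNC consumer sketch; path and size are illustrative */
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/mman.h>
#include <unistd.h>

#ifndef MAP_SHARED_VALIDATE
#define MAP_SHARED_VALIDATE 0x03	/* value added by this series */
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0x80000		/* value added by this series */
#endif

int main(void)
{
	int fd = open("/mnt/pmem/data", O_RDWR);

	if (fd < 0)
		return 1;

	char *p = mmap(NULL, 4096, PROT_READ | PROT_WRITE,
			MAP_SHARED_VALIDATE | MAP_SYNC, fd, 0);

	if (p == MAP_FAILED) {
		/* EOPNOTSUPP: no DAX, or the fs does not support MAP_SYNC */
		fprintf(stderr, "MAP_SYNC mmap failed: %s\n", strerror(errno));
		close(fd);
		return 1;
	}

	/*
	 * The write fault below returns only after any metadata needed to
	 * reach this block is durable; the application still has to flush
	 * its own stores out of the CPU cache (e.g. via libpmem) before
	 * treating the data as persistent.
	 */
	p[0] = 1;

	munmap(p, 4096);
	close(fd);
	return 0;
}
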


@ -100,7 +100,7 @@ static int ext2_dax_fault(struct vm_fault *vmf)
} }
down_read(&ei->dax_sem); down_read(&ei->dax_sem);
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &ext2_iomap_ops); ret = dax_iomap_fault(vmf, PE_SIZE_PTE, NULL, &ext2_iomap_ops);
up_read(&ei->dax_sem); up_read(&ei->dax_sem);
if (vmf->flags & FAULT_FLAG_WRITE) if (vmf->flags & FAULT_FLAG_WRITE)


@ -28,6 +28,7 @@
#include <linux/quotaops.h> #include <linux/quotaops.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/uio.h> #include <linux/uio.h>
#include <linux/mman.h>
#include "ext4.h" #include "ext4.h"
#include "ext4_jbd2.h" #include "ext4_jbd2.h"
#include "xattr.h" #include "xattr.h"
@ -297,6 +298,7 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
*/ */
bool write = (vmf->flags & FAULT_FLAG_WRITE) && bool write = (vmf->flags & FAULT_FLAG_WRITE) &&
(vmf->vma->vm_flags & VM_SHARED); (vmf->vma->vm_flags & VM_SHARED);
pfn_t pfn;
if (write) { if (write) {
sb_start_pagefault(sb); sb_start_pagefault(sb);
@ -304,16 +306,20 @@ static int ext4_dax_huge_fault(struct vm_fault *vmf,
down_read(&EXT4_I(inode)->i_mmap_sem); down_read(&EXT4_I(inode)->i_mmap_sem);
handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE, handle = ext4_journal_start_sb(sb, EXT4_HT_WRITE_PAGE,
EXT4_DATA_TRANS_BLOCKS(sb)); EXT4_DATA_TRANS_BLOCKS(sb));
if (IS_ERR(handle)) {
up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb);
return VM_FAULT_SIGBUS;
}
} else { } else {
down_read(&EXT4_I(inode)->i_mmap_sem); down_read(&EXT4_I(inode)->i_mmap_sem);
} }
if (!IS_ERR(handle)) result = dax_iomap_fault(vmf, pe_size, &pfn, &ext4_iomap_ops);
result = dax_iomap_fault(vmf, pe_size, &ext4_iomap_ops);
else
result = VM_FAULT_SIGBUS;
if (write) { if (write) {
if (!IS_ERR(handle)) ext4_journal_stop(handle);
ext4_journal_stop(handle); /* Handling synchronous page fault? */
if (result & VM_FAULT_NEEDDSYNC)
result = dax_finish_sync_fault(vmf, pe_size, pfn);
up_read(&EXT4_I(inode)->i_mmap_sem); up_read(&EXT4_I(inode)->i_mmap_sem);
sb_end_pagefault(sb); sb_end_pagefault(sb);
} else { } else {
@ -351,6 +357,13 @@ static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma)
if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb)))) if (unlikely(ext4_forced_shutdown(EXT4_SB(inode->i_sb))))
return -EIO; return -EIO;
/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(file)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
file_accessed(file); file_accessed(file);
if (IS_DAX(file_inode(file))) { if (IS_DAX(file_inode(file))) {
vma->vm_ops = &ext4_dax_vm_ops; vma->vm_ops = &ext4_dax_vm_ops;
@ -469,6 +482,7 @@ const struct file_operations ext4_file_operations = {
.compat_ioctl = ext4_compat_ioctl, .compat_ioctl = ext4_compat_ioctl,
#endif #endif
.mmap = ext4_file_mmap, .mmap = ext4_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = ext4_file_open, .open = ext4_file_open,
.release = ext4_release_file, .release = ext4_release_file,
.fsync = ext4_sync_file, .fsync = ext4_sync_file,


@ -3384,6 +3384,19 @@ static int ext4_releasepage(struct page *page, gfp_t wait)
return try_to_free_buffers(page); return try_to_free_buffers(page);
} }
static bool ext4_inode_datasync_dirty(struct inode *inode)
{
journal_t *journal = EXT4_SB(inode->i_sb)->s_journal;
if (journal)
return !jbd2_transaction_committed(journal,
EXT4_I(inode)->i_datasync_tid);
/* Any metadata buffers to write? */
if (!list_empty(&inode->i_mapping->private_list))
return true;
return inode->i_state & I_DIRTY_DATASYNC;
}
static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
unsigned flags, struct iomap *iomap) unsigned flags, struct iomap *iomap)
{ {
@ -3497,6 +3510,8 @@ retry:
} }
iomap->flags = 0; iomap->flags = 0;
if (ext4_inode_datasync_dirty(inode))
iomap->flags |= IOMAP_F_DIRTY;
iomap->bdev = inode->i_sb->s_bdev; iomap->bdev = inode->i_sb->s_bdev;
iomap->dax_dev = sbi->s_daxdev; iomap->dax_dev = sbi->s_daxdev;
iomap->offset = first_block << blkbits; iomap->offset = first_block << blkbits;


@ -737,6 +737,23 @@ int jbd2_log_wait_commit(journal_t *journal, tid_t tid)
return err; return err;
} }
/* Return 1 when transaction with given tid has already committed. */
int jbd2_transaction_committed(journal_t *journal, tid_t tid)
{
int ret = 1;
read_lock(&journal->j_state_lock);
if (journal->j_running_transaction &&
journal->j_running_transaction->t_tid == tid)
ret = 0;
if (journal->j_committing_transaction &&
journal->j_committing_transaction->t_tid == tid)
ret = 0;
read_unlock(&journal->j_state_lock);
return ret;
}
EXPORT_SYMBOL(jbd2_transaction_committed);
/* /*
* When this function returns the transaction corresponding to tid * When this function returns the transaction corresponding to tid
* will be completed. If the transaction has currently running, start * will be completed. If the transaction has currently running, start


@ -661,6 +661,7 @@ static void show_smap_vma_flags(struct seq_file *m, struct vm_area_struct *vma)
[ilog2(VM_ACCOUNT)] = "ac", [ilog2(VM_ACCOUNT)] = "ac",
[ilog2(VM_NORESERVE)] = "nr", [ilog2(VM_NORESERVE)] = "nr",
[ilog2(VM_HUGETLB)] = "ht", [ilog2(VM_HUGETLB)] = "ht",
[ilog2(VM_SYNC)] = "sf",
[ilog2(VM_ARCH_1)] = "ar", [ilog2(VM_ARCH_1)] = "ar",
[ilog2(VM_WIPEONFORK)] = "wf", [ilog2(VM_WIPEONFORK)] = "wf",
[ilog2(VM_DONTDUMP)] = "dd", [ilog2(VM_DONTDUMP)] = "dd",


@ -44,6 +44,7 @@
#include <linux/falloc.h> #include <linux/falloc.h>
#include <linux/pagevec.h> #include <linux/pagevec.h>
#include <linux/backing-dev.h> #include <linux/backing-dev.h>
#include <linux/mman.h>
static const struct vm_operations_struct xfs_file_vm_ops; static const struct vm_operations_struct xfs_file_vm_ops;
@ -1045,7 +1046,11 @@ __xfs_filemap_fault(
xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED); xfs_ilock(XFS_I(inode), XFS_MMAPLOCK_SHARED);
if (IS_DAX(inode)) { if (IS_DAX(inode)) {
ret = dax_iomap_fault(vmf, pe_size, &xfs_iomap_ops); pfn_t pfn;
ret = dax_iomap_fault(vmf, pe_size, &pfn, &xfs_iomap_ops);
if (ret & VM_FAULT_NEEDDSYNC)
ret = dax_finish_sync_fault(vmf, pe_size, pfn);
} else { } else {
if (write_fault) if (write_fault)
ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops); ret = iomap_page_mkwrite(vmf, &xfs_iomap_ops);
@ -1090,37 +1095,16 @@ xfs_filemap_page_mkwrite(
} }
/* /*
* pfn_mkwrite was originally inteneded to ensure we capture time stamp * pfn_mkwrite was originally intended to ensure we capture time stamp updates
* updates on write faults. In reality, it's need to serialise against * on write faults. In reality, it needs to serialise against truncate and
* truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED * prepare memory for writing so handle is as standard write fault.
* to ensure we serialise the fault barrier in place.
*/ */
static int static int
xfs_filemap_pfn_mkwrite( xfs_filemap_pfn_mkwrite(
struct vm_fault *vmf) struct vm_fault *vmf)
{ {
struct inode *inode = file_inode(vmf->vma->vm_file); return __xfs_filemap_fault(vmf, PE_SIZE_PTE, true);
struct xfs_inode *ip = XFS_I(inode);
int ret = VM_FAULT_NOPAGE;
loff_t size;
trace_xfs_filemap_pfn_mkwrite(ip);
sb_start_pagefault(inode->i_sb);
file_update_time(vmf->vma->vm_file);
/* check if the faulting page hasn't raced with truncate */
xfs_ilock(ip, XFS_MMAPLOCK_SHARED);
size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT;
if (vmf->pgoff >= size)
ret = VM_FAULT_SIGBUS;
else if (IS_DAX(inode))
ret = dax_iomap_fault(vmf, PE_SIZE_PTE, &xfs_iomap_ops);
xfs_iunlock(ip, XFS_MMAPLOCK_SHARED);
sb_end_pagefault(inode->i_sb);
return ret;
} }
static const struct vm_operations_struct xfs_file_vm_ops = { static const struct vm_operations_struct xfs_file_vm_ops = {
@ -1136,6 +1120,13 @@ xfs_file_mmap(
struct file *filp, struct file *filp,
struct vm_area_struct *vma) struct vm_area_struct *vma)
{ {
/*
* We don't support synchronous mappings for non-DAX files. At least
* until someone comes with a sensible use case.
*/
if (!IS_DAX(file_inode(filp)) && (vma->vm_flags & VM_SYNC))
return -EOPNOTSUPP;
file_accessed(filp); file_accessed(filp);
vma->vm_ops = &xfs_file_vm_ops; vma->vm_ops = &xfs_file_vm_ops;
if (IS_DAX(file_inode(filp))) if (IS_DAX(file_inode(filp)))
@ -1154,6 +1145,7 @@ const struct file_operations xfs_file_operations = {
.compat_ioctl = xfs_file_compat_ioctl, .compat_ioctl = xfs_file_compat_ioctl,
#endif #endif
.mmap = xfs_file_mmap, .mmap = xfs_file_mmap,
.mmap_supported_flags = MAP_SYNC,
.open = xfs_file_open, .open = xfs_file_open,
.release = xfs_file_release, .release = xfs_file_release,
.fsync = xfs_file_fsync, .fsync = xfs_file_fsync,


@ -34,6 +34,7 @@
#include "xfs_error.h" #include "xfs_error.h"
#include "xfs_trans.h" #include "xfs_trans.h"
#include "xfs_trans_space.h" #include "xfs_trans_space.h"
#include "xfs_inode_item.h"
#include "xfs_iomap.h" #include "xfs_iomap.h"
#include "xfs_trace.h" #include "xfs_trace.h"
#include "xfs_icache.h" #include "xfs_icache.h"
@ -1089,6 +1090,10 @@ xfs_file_iomap_begin(
trace_xfs_iomap_found(ip, offset, length, 0, &imap); trace_xfs_iomap_found(ip, offset, length, 0, &imap);
} }
if (xfs_ipincount(ip) && (ip->i_itemp->ili_fsync_fields
& ~XFS_ILOG_TIMESTAMP))
iomap->flags |= IOMAP_F_DIRTY;
xfs_bmbt_to_iomap(ip, iomap, &imap); xfs_bmbt_to_iomap(ip, iomap, &imap);
if (shared) if (shared)


@ -654,8 +654,6 @@ DEFINE_INODE_EVENT(xfs_inode_set_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag); DEFINE_INODE_EVENT(xfs_inode_clear_cowblocks_tag);
DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid); DEFINE_INODE_EVENT(xfs_inode_free_cowblocks_invalid);
DEFINE_INODE_EVENT(xfs_filemap_pfn_mkwrite);
TRACE_EVENT(xfs_filemap_fault, TRACE_EVENT(xfs_filemap_fault,
TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size, TP_PROTO(struct xfs_inode *ip, enum page_entry_size pe_size,
bool write_fault), bool write_fault),


@ -96,7 +96,9 @@ bool dax_write_cache_enabled(struct dax_device *dax_dev);
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops); const struct iomap_ops *ops);
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
const struct iomap_ops *ops); pfn_t *pfnp, const struct iomap_ops *ops);
int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
pfn_t pfn);
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
int dax_invalidate_mapping_entry_sync(struct address_space *mapping, int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
pgoff_t index); pgoff_t index);


@ -1702,6 +1702,7 @@ struct file_operations {
long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long); long (*unlocked_ioctl) (struct file *, unsigned int, unsigned long);
long (*compat_ioctl) (struct file *, unsigned int, unsigned long); long (*compat_ioctl) (struct file *, unsigned int, unsigned long);
int (*mmap) (struct file *, struct vm_area_struct *); int (*mmap) (struct file *, struct vm_area_struct *);
unsigned long mmap_supported_flags;
int (*open) (struct inode *, struct file *); int (*open) (struct inode *, struct file *);
int (*flush) (struct file *, fl_owner_t id); int (*flush) (struct file *, fl_owner_t id);
int (*release) (struct inode *, struct file *); int (*release) (struct inode *, struct file *);


@ -21,9 +21,13 @@ struct vm_fault;
/* /*
* Flags for all iomap mappings: * Flags for all iomap mappings:
*
* IOMAP_F_DIRTY indicates the inode has uncommitted metadata needed to access
* written data and requires fdatasync to commit them to persistent storage.
*/ */
#define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */ #define IOMAP_F_NEW 0x01 /* blocks have been newly allocated */
#define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */ #define IOMAP_F_BOUNDARY 0x02 /* mapping ends at metadata boundary */
#define IOMAP_F_DIRTY 0x04 /* uncommitted metadata */
/* /*
* Flags that only need to be reported for IOMAP_REPORT requests: * Flags that only need to be reported for IOMAP_REPORT requests:


@ -1367,6 +1367,7 @@ int jbd2_log_start_commit(journal_t *journal, tid_t tid);
int __jbd2_log_start_commit(journal_t *journal, tid_t tid); int __jbd2_log_start_commit(journal_t *journal, tid_t tid);
int jbd2_journal_start_commit(journal_t *journal, tid_t *tid); int jbd2_journal_start_commit(journal_t *journal, tid_t *tid);
int jbd2_log_wait_commit(journal_t *journal, tid_t tid); int jbd2_log_wait_commit(journal_t *journal, tid_t tid);
int jbd2_transaction_committed(journal_t *journal, tid_t tid);
int jbd2_complete_transaction(journal_t *journal, tid_t tid); int jbd2_complete_transaction(journal_t *journal, tid_t tid);
int jbd2_log_do_checkpoint(journal_t *journal); int jbd2_log_do_checkpoint(journal_t *journal);
int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid); int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid);


@ -18,6 +18,18 @@
#include <linux/sizes.h> #include <linux/sizes.h>
#include <linux/types.h> #include <linux/types.h>
#include <linux/uuid.h> #include <linux/uuid.h>
#include <linux/spinlock.h>
struct badrange_entry {
u64 start;
u64 length;
struct list_head list;
};
struct badrange {
struct list_head list;
spinlock_t lock;
};
enum { enum {
/* when a dimm supports both PMEM and BLK access a label is required */ /* when a dimm supports both PMEM and BLK access a label is required */
@ -129,9 +141,12 @@ static inline struct nd_blk_region_desc *to_blk_region_desc(
} }
int nvdimm_bus_add_poison(struct nvdimm_bus *nvdimm_bus, u64 addr, u64 length); void badrange_init(struct badrange *badrange);
void nvdimm_forget_poison(struct nvdimm_bus *nvdimm_bus, int badrange_add(struct badrange *badrange, u64 addr, u64 length);
phys_addr_t start, unsigned int len); void badrange_forget(struct badrange *badrange, phys_addr_t start,
unsigned int len);
int nvdimm_bus_add_badrange(struct nvdimm_bus *nvdimm_bus, u64 addr,
u64 length);
struct nvdimm_bus *nvdimm_bus_register(struct device *parent, struct nvdimm_bus *nvdimm_bus_register(struct device *parent,
struct nvdimm_bus_descriptor *nfit_desc); struct nvdimm_bus_descriptor *nfit_desc);
void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus); void nvdimm_bus_unregister(struct nvdimm_bus *nvdimm_bus);


@ -199,6 +199,7 @@ extern unsigned int kobjsize(const void *objp);
#define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */ #define VM_ACCOUNT 0x00100000 /* Is a VM accounted object */
#define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */ #define VM_NORESERVE 0x00200000 /* should the VM suppress accounting */
#define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */ #define VM_HUGETLB 0x00400000 /* Huge TLB Page VM */
#define VM_SYNC 0x00800000 /* Synchronous page faults */
#define VM_ARCH_1 0x01000000 /* Architecture-specific flag */ #define VM_ARCH_1 0x01000000 /* Architecture-specific flag */
#define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */ #define VM_WIPEONFORK 0x02000000 /* Wipe VMA contents in child. */
#define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */ #define VM_DONTDUMP 0x04000000 /* Do not include in the core dump */
@ -1191,8 +1192,9 @@ static inline void clear_page_pfmemalloc(struct page *page)
#define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */ #define VM_FAULT_RETRY 0x0400 /* ->fault blocked, must retry */
#define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */ #define VM_FAULT_FALLBACK 0x0800 /* huge page fault failed, fall back to small */
#define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */ #define VM_FAULT_DONE_COW 0x1000 /* ->fault has fully handled COW */
#define VM_FAULT_NEEDDSYNC 0x2000 /* ->fault did not modify page tables
* and needs fsync() to complete (for
* synchronous page faults in DAX) */
#define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */ #define VM_FAULT_HWPOISON_LARGE_MASK 0xf000 /* encodes hpage index for large hwpoison */
#define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \ #define VM_FAULT_ERROR (VM_FAULT_OOM | VM_FAULT_SIGBUS | VM_FAULT_SIGSEGV | \
VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \ VM_FAULT_HWPOISON | VM_FAULT_HWPOISON_LARGE | \
@ -1210,7 +1212,8 @@ static inline void clear_page_pfmemalloc(struct page *page)
{ VM_FAULT_LOCKED, "LOCKED" }, \ { VM_FAULT_LOCKED, "LOCKED" }, \
{ VM_FAULT_RETRY, "RETRY" }, \ { VM_FAULT_RETRY, "RETRY" }, \
{ VM_FAULT_FALLBACK, "FALLBACK" }, \ { VM_FAULT_FALLBACK, "FALLBACK" }, \
{ VM_FAULT_DONE_COW, "DONE_COW" } { VM_FAULT_DONE_COW, "DONE_COW" }, \
{ VM_FAULT_NEEDDSYNC, "NEEDDSYNC" }
/* Encode hstate index for a hwpoisoned large page */ /* Encode hstate index for a hwpoisoned large page */
#define VM_FAULT_SET_HINDEX(x) ((x) << 12) #define VM_FAULT_SET_HINDEX(x) ((x) << 12)


@ -8,6 +8,48 @@
#include <linux/atomic.h> #include <linux/atomic.h>
#include <uapi/linux/mman.h> #include <uapi/linux/mman.h>
/*
* Arrange for legacy / undefined architecture specific flags to be
* ignored by mmap handling code.
*/
#ifndef MAP_32BIT
#define MAP_32BIT 0
#endif
#ifndef MAP_HUGE_2MB
#define MAP_HUGE_2MB 0
#endif
#ifndef MAP_HUGE_1GB
#define MAP_HUGE_1GB 0
#endif
#ifndef MAP_UNINITIALIZED
#define MAP_UNINITIALIZED 0
#endif
#ifndef MAP_SYNC
#define MAP_SYNC 0
#endif
/*
* The historical set of flags that all mmap implementations implicitly
* support when a ->mmap_validate() op is not provided in file_operations.
*/
#define LEGACY_MAP_MASK (MAP_SHARED \
| MAP_PRIVATE \
| MAP_FIXED \
| MAP_ANONYMOUS \
| MAP_DENYWRITE \
| MAP_EXECUTABLE \
| MAP_UNINITIALIZED \
| MAP_GROWSDOWN \
| MAP_LOCKED \
| MAP_NORESERVE \
| MAP_POPULATE \
| MAP_NONBLOCK \
| MAP_STACK \
| MAP_HUGETLB \
| MAP_32BIT \
| MAP_HUGE_2MB \
| MAP_HUGE_1GB)
extern int sysctl_overcommit_memory; extern int sysctl_overcommit_memory;
extern int sysctl_overcommit_ratio; extern int sysctl_overcommit_ratio;
extern unsigned long sysctl_overcommit_kbytes; extern unsigned long sysctl_overcommit_kbytes;
@ -64,8 +106,9 @@ static inline bool arch_validate_prot(unsigned long prot)
* ("bit1" and "bit2" must be single bits) * ("bit1" and "bit2" must be single bits)
*/ */
#define _calc_vm_trans(x, bit1, bit2) \ #define _calc_vm_trans(x, bit1, bit2) \
((!(bit1) || !(bit2)) ? 0 : \
((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \ ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
: ((x) & (bit1)) / ((bit1) / (bit2))) : ((x) & (bit1)) / ((bit1) / (bit2))))
/* /*
* Combine the mmap "prot" argument into "vm_flags" used internally. * Combine the mmap "prot" argument into "vm_flags" used internally.
@ -87,7 +130,8 @@ calc_vm_flag_bits(unsigned long flags)
{ {
return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) | return _calc_vm_trans(flags, MAP_GROWSDOWN, VM_GROWSDOWN ) |
_calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) | _calc_vm_trans(flags, MAP_DENYWRITE, VM_DENYWRITE ) |
_calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ); _calc_vm_trans(flags, MAP_LOCKED, VM_LOCKED ) |
_calc_vm_trans(flags, MAP_SYNC, VM_SYNC );
} }
unsigned long vm_commit_limit(void); unsigned long vm_commit_limit(void);
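
A worked example, not from the patch, of what the updated macro computes for the new flag, using the MAP_SYNC and VM_SYNC values introduced elsewhere in this series:

/* standalone check of the MAP_SYNC -> VM_SYNC translation */
#include <assert.h>
#include <stdio.h>

#define DEMO_MAP_SYNC	0x80000UL	/* bit 19, from the uapi hunk */
#define DEMO_VM_SYNC	0x00800000UL	/* bit 23, from the mm.h hunk */

/* same shape as the kernel macro after this patch, zero guard included */
#define demo_calc_vm_trans(x, bit1, bit2) \
	((!(bit1) || !(bit2)) ? 0 : \
	 ((bit1) <= (bit2) ? ((x) & (bit1)) * ((bit2) / (bit1)) \
			   : ((x) & (bit1)) / ((bit1) / (bit2))))

int main(void)
{
	unsigned long flags = DEMO_MAP_SYNC;

	/* bit 19 is multiplied by 0x800000 / 0x80000 == 16, landing on bit 23 */
	assert(demo_calc_vm_trans(flags, DEMO_MAP_SYNC, DEMO_VM_SYNC) == DEMO_VM_SYNC);

	/*
	 * The new "(!(bit1) || !(bit2)) ? 0" guard is what lets architectures
	 * that #define MAP_SYNC to 0 (via the new <linux/mman.h> fallbacks)
	 * use the macro without dividing by zero.
	 */
	printf("MAP_SYNC translates to 0x%lx\n",
	       demo_calc_vm_trans(flags, DEMO_MAP_SYNC, DEMO_VM_SYNC));
	return 0;
}
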


@ -149,7 +149,6 @@ DEFINE_EVENT(dax_pmd_insert_mapping_class, name, \
TP_ARGS(inode, vmf, length, pfn, radix_entry)) TP_ARGS(inode, vmf, length, pfn, radix_entry))
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping); DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping);
DEFINE_PMD_INSERT_MAPPING_EVENT(dax_pmd_insert_mapping_fallback);
DECLARE_EVENT_CLASS(dax_pte_fault_class, DECLARE_EVENT_CLASS(dax_pte_fault_class,
TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result), TP_PROTO(struct inode *inode, struct vm_fault *vmf, int result),
@ -192,6 +191,8 @@ DEFINE_EVENT(dax_pte_fault_class, name, \
DEFINE_PTE_FAULT_EVENT(dax_pte_fault); DEFINE_PTE_FAULT_EVENT(dax_pte_fault);
DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done); DEFINE_PTE_FAULT_EVENT(dax_pte_fault_done);
DEFINE_PTE_FAULT_EVENT(dax_load_hole); DEFINE_PTE_FAULT_EVENT(dax_load_hole);
DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite_no_entry);
DEFINE_PTE_FAULT_EVENT(dax_insert_pfn_mkwrite);
TRACE_EVENT(dax_insert_mapping, TRACE_EVENT(dax_insert_mapping,
TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry), TP_PROTO(struct inode *inode, struct vm_fault *vmf, void *radix_entry),


@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */ #define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */ #define MAP_ANONYMOUS 0x20 /* don't use a file */


@ -13,6 +13,7 @@
#define MAP_NONBLOCK 0x10000 /* do not block on IO */ #define MAP_NONBLOCK 0x10000 /* do not block on IO */
#define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */ #define MAP_STACK 0x20000 /* give out an address that is best suited for process/thread stacks */
#define MAP_HUGETLB 0x40000 /* create a huge page mapping */ #define MAP_HUGETLB 0x40000 /* create a huge page mapping */
#define MAP_SYNC 0x80000 /* perform synchronous page faults for the mapping */
/* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */ /* Bits [26:31] are reserved, see mman-common.h for MAP_HUGETLB usage */


@ -1387,9 +1387,24 @@ unsigned long do_mmap(struct file *file, unsigned long addr,
if (file) { if (file) {
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
unsigned long flags_mask;
flags_mask = LEGACY_MAP_MASK | file->f_op->mmap_supported_flags;
switch (flags & MAP_TYPE) { switch (flags & MAP_TYPE) {
case MAP_SHARED: case MAP_SHARED:
/*
* Force use of MAP_SHARED_VALIDATE with non-legacy
* flags. E.g. MAP_SYNC is dangerous to use with
* MAP_SHARED as you don't know which consistency model
* you will get. We silently ignore unsupported flags
* with MAP_SHARED to preserve backward compatibility.
*/
flags &= LEGACY_MAP_MASK;
/* fall through */
case MAP_SHARED_VALIDATE:
if (flags & ~flags_mask)
return -EOPNOTSUPP;
if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE)) if ((prot&PROT_WRITE) && !(file->f_mode&FMODE_WRITE))
return -EACCES; return -EACCES;


@ -17,6 +17,7 @@
#define MAP_SHARED 0x01 /* Share changes */ #define MAP_SHARED 0x01 /* Share changes */
#define MAP_PRIVATE 0x02 /* Changes are private */ #define MAP_PRIVATE 0x02 /* Changes are private */
#define MAP_SHARED_VALIDATE 0x03 /* share + validate extension flags */
#define MAP_TYPE 0x0f /* Mask for type of mapping */ #define MAP_TYPE 0x0f /* Mask for type of mapping */
#define MAP_FIXED 0x10 /* Interpret addr exactly */ #define MAP_FIXED 0x10 /* Interpret addr exactly */
#define MAP_ANONYMOUS 0x20 /* don't use a file */ #define MAP_ANONYMOUS 0x20 /* don't use a file */


@ -70,6 +70,7 @@ libnvdimm-y += $(NVDIMM_SRC)/region_devs.o
libnvdimm-y += $(NVDIMM_SRC)/region.o libnvdimm-y += $(NVDIMM_SRC)/region.o
libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o libnvdimm-y += $(NVDIMM_SRC)/namespace_devs.o
libnvdimm-y += $(NVDIMM_SRC)/label.o libnvdimm-y += $(NVDIMM_SRC)/label.o
libnvdimm-y += $(NVDIMM_SRC)/badrange.o
libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o libnvdimm-$(CONFIG_ND_CLAIM) += $(NVDIMM_SRC)/claim.o
libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o libnvdimm-$(CONFIG_BTT) += $(NVDIMM_SRC)/btt_devs.o
libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o libnvdimm-$(CONFIG_NVDIMM_PFN) += $(NVDIMM_SRC)/pfn_devs.o


@ -168,8 +168,12 @@ struct nfit_test {
spinlock_t lock; spinlock_t lock;
} ars_state; } ars_state;
struct device *dimm_dev[NUM_DCR]; struct device *dimm_dev[NUM_DCR];
struct badrange badrange;
struct work_struct work;
}; };
static struct workqueue_struct *nfit_wq;
static struct nfit_test *to_nfit_test(struct device *dev) static struct nfit_test *to_nfit_test(struct device *dev)
{ {
struct platform_device *pdev = to_platform_device(dev); struct platform_device *pdev = to_platform_device(dev);
@ -234,48 +238,68 @@ static int nfit_test_cmd_set_config_data(struct nd_cmd_set_config_hdr *nd_cmd,
return rc; return rc;
} }
#define NFIT_TEST_ARS_RECORDS 4
#define NFIT_TEST_CLEAR_ERR_UNIT 256 #define NFIT_TEST_CLEAR_ERR_UNIT 256
static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd, static int nfit_test_cmd_ars_cap(struct nd_cmd_ars_cap *nd_cmd,
unsigned int buf_len) unsigned int buf_len)
{ {
int ars_recs;
if (buf_len < sizeof(*nd_cmd)) if (buf_len < sizeof(*nd_cmd))
return -EINVAL; return -EINVAL;
/* for testing, only store up to n records that fit within 4k */
ars_recs = SZ_4K / sizeof(struct nd_ars_record);
nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status) nd_cmd->max_ars_out = sizeof(struct nd_cmd_ars_status)
+ NFIT_TEST_ARS_RECORDS * sizeof(struct nd_ars_record); + ars_recs * sizeof(struct nd_ars_record);
nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16; nd_cmd->status = (ND_ARS_PERSISTENT | ND_ARS_VOLATILE) << 16;
nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT; nd_cmd->clear_err_unit = NFIT_TEST_CLEAR_ERR_UNIT;
return 0; return 0;
} }
/* static void post_ars_status(struct ars_state *ars_state,
* Initialize the ars_state to return an ars_result 1 second in the future with struct badrange *badrange, u64 addr, u64 len)
* a 4K error range in the middle of the requested address range.
*/
static void post_ars_status(struct ars_state *ars_state, u64 addr, u64 len)
{ {
struct nd_cmd_ars_status *ars_status; struct nd_cmd_ars_status *ars_status;
struct nd_ars_record *ars_record; struct nd_ars_record *ars_record;
struct badrange_entry *be;
u64 end = addr + len - 1;
int i = 0;
ars_state->deadline = jiffies + 1*HZ; ars_state->deadline = jiffies + 1*HZ;
ars_status = ars_state->ars_status; ars_status = ars_state->ars_status;
ars_status->status = 0; ars_status->status = 0;
ars_status->out_length = sizeof(struct nd_cmd_ars_status)
+ sizeof(struct nd_ars_record);
ars_status->address = addr; ars_status->address = addr;
ars_status->length = len; ars_status->length = len;
ars_status->type = ND_ARS_PERSISTENT; ars_status->type = ND_ARS_PERSISTENT;
ars_status->num_records = 1;
ars_record = &ars_status->records[0]; spin_lock(&badrange->lock);
ars_record->handle = 0; list_for_each_entry(be, &badrange->list, list) {
ars_record->err_address = addr + len / 2; u64 be_end = be->start + be->length - 1;
ars_record->length = SZ_4K; u64 rstart, rend;
/* skip entries outside the range */
if (be_end < addr || be->start > end)
continue;
rstart = (be->start < addr) ? addr : be->start;
rend = (be_end < end) ? be_end : end;
ars_record = &ars_status->records[i];
ars_record->handle = 0;
ars_record->err_address = rstart;
ars_record->length = rend - rstart + 1;
i++;
}
spin_unlock(&badrange->lock);
ars_status->num_records = i;
ars_status->out_length = sizeof(struct nd_cmd_ars_status)
+ i * sizeof(struct nd_ars_record);
} }
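
As an illustration of the clipping above (arithmetic only, not part of the patch): an injected bad range 0x1000-0x13ff intersected with a scrub request covering 0x1200-0x21ff yields one record with err_address 0x1200 and length 0x200, as the snippet below reproduces.

/* standalone illustration of the range clipping in post_ars_status() */
#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t be_start = 0x1000, be_len = 0x400;	/* injected bad range */
	uint64_t addr = 0x1200, len = 0x1000;		/* range being scrubbed */

	uint64_t end = addr + len - 1;
	uint64_t be_end = be_start + be_len - 1;
	uint64_t rstart = (be_start < addr) ? addr : be_start;
	uint64_t rend = (be_end < end) ? be_end : end;

	/* prints: err_address=0x1200 length=0x200 */
	printf("err_address=0x%" PRIx64 " length=0x%" PRIx64 "\n",
	       rstart, rend - rstart + 1);
	return 0;
}
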
static int nfit_test_cmd_ars_start(struct ars_state *ars_state, static int nfit_test_cmd_ars_start(struct nfit_test *t,
struct ars_state *ars_state,
struct nd_cmd_ars_start *ars_start, unsigned int buf_len, struct nd_cmd_ars_start *ars_start, unsigned int buf_len,
int *cmd_rc) int *cmd_rc)
{ {
@ -289,7 +313,7 @@ static int nfit_test_cmd_ars_start(struct ars_state *ars_state,
} else { } else {
ars_start->status = 0; ars_start->status = 0;
ars_start->scrub_time = 1; ars_start->scrub_time = 1;
post_ars_status(ars_state, ars_start->address, post_ars_status(ars_state, &t->badrange, ars_start->address,
ars_start->length); ars_start->length);
*cmd_rc = 0; *cmd_rc = 0;
} }
@ -320,7 +344,8 @@ static int nfit_test_cmd_ars_status(struct ars_state *ars_state,
return 0; return 0;
} }
static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err, static int nfit_test_cmd_clear_error(struct nfit_test *t,
struct nd_cmd_clear_error *clear_err,
unsigned int buf_len, int *cmd_rc) unsigned int buf_len, int *cmd_rc)
{ {
const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1; const u64 mask = NFIT_TEST_CLEAR_ERR_UNIT - 1;
@ -330,18 +355,91 @@ static int nfit_test_cmd_clear_error(struct nd_cmd_clear_error *clear_err,
if ((clear_err->address & mask) || (clear_err->length & mask)) if ((clear_err->address & mask) || (clear_err->length & mask))
return -EINVAL; return -EINVAL;
/* badrange_forget(&t->badrange, clear_err->address, clear_err->length);
* Report 'all clear' success for all commands even though a new
* scrub will find errors again. This is enough to have the
* error removed from the 'badblocks' tracking in the pmem
* driver.
*/
clear_err->status = 0; clear_err->status = 0;
clear_err->cleared = clear_err->length; clear_err->cleared = clear_err->length;
*cmd_rc = 0; *cmd_rc = 0;
return 0; return 0;
} }
struct region_search_spa {
u64 addr;
struct nd_region *region;
};
static int is_region_device(struct device *dev)
{
return !strncmp(dev->kobj.name, "region", 6);
}
static int nfit_test_search_region_spa(struct device *dev, void *data)
{
struct region_search_spa *ctx = data;
struct nd_region *nd_region;
resource_size_t ndr_end;
if (!is_region_device(dev))
return 0;
nd_region = to_nd_region(dev);
ndr_end = nd_region->ndr_start + nd_region->ndr_size;
if (ctx->addr >= nd_region->ndr_start && ctx->addr < ndr_end) {
ctx->region = nd_region;
return 1;
}
return 0;
}
static int nfit_test_search_spa(struct nvdimm_bus *bus,
struct nd_cmd_translate_spa *spa)
{
int ret;
struct nd_region *nd_region = NULL;
struct nvdimm *nvdimm = NULL;
struct nd_mapping *nd_mapping = NULL;
struct region_search_spa ctx = {
.addr = spa->spa,
.region = NULL,
};
u64 dpa;
ret = device_for_each_child(&bus->dev, &ctx,
nfit_test_search_region_spa);
if (!ret)
return -ENODEV;
nd_region = ctx.region;
dpa = ctx.addr - nd_region->ndr_start;
/*
* last dimm is selected for test
*/
nd_mapping = &nd_region->mapping[nd_region->ndr_mappings - 1];
nvdimm = nd_mapping->nvdimm;
spa->devices[0].nfit_device_handle = handle[nvdimm->id];
spa->num_nvdimms = 1;
spa->devices[0].dpa = dpa;
return 0;
}
static int nfit_test_cmd_translate_spa(struct nvdimm_bus *bus,
struct nd_cmd_translate_spa *spa, unsigned int buf_len)
{
if (buf_len < spa->translate_length)
return -EINVAL;
if (nfit_test_search_spa(bus, spa) < 0 || !spa->num_nvdimms)
spa->status = 2;
return 0;
}
static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len) static int nfit_test_cmd_smart(struct nd_cmd_smart *smart, unsigned int buf_len)
{ {
static const struct nd_smart_payload smart_data = { static const struct nd_smart_payload smart_data = {
@ -378,6 +476,93 @@ static int nfit_test_cmd_smart_threshold(struct nd_cmd_smart_threshold *smart_t,
return 0; return 0;
} }
static void uc_error_notify(struct work_struct *work)
{
struct nfit_test *t = container_of(work, typeof(*t), work);
__acpi_nfit_notify(&t->pdev.dev, t, NFIT_NOTIFY_UC_MEMORY_ERROR);
}
static int nfit_test_cmd_ars_error_inject(struct nfit_test *t,
struct nd_cmd_ars_err_inj *err_inj, unsigned int buf_len)
{
int rc;
if (buf_len != sizeof(*err_inj)) {
rc = -EINVAL;
goto err;
}
if (err_inj->err_inj_spa_range_length <= 0) {
rc = -EINVAL;
goto err;
}
rc = badrange_add(&t->badrange, err_inj->err_inj_spa_range_base,
err_inj->err_inj_spa_range_length);
if (rc < 0)
goto err;
if (err_inj->err_inj_options & (1 << ND_ARS_ERR_INJ_OPT_NOTIFY))
queue_work(nfit_wq, &t->work);
err_inj->status = 0;
return 0;
err:
err_inj->status = NFIT_ARS_INJECT_INVALID;
return rc;
}
static int nfit_test_cmd_ars_inject_clear(struct nfit_test *t,
struct nd_cmd_ars_err_inj_clr *err_clr, unsigned int buf_len)
{
int rc;
if (buf_len != sizeof(*err_clr)) {
rc = -EINVAL;
goto err;
}
if (err_clr->err_inj_clr_spa_range_length <= 0) {
rc = -EINVAL;
goto err;
}
badrange_forget(&t->badrange, err_clr->err_inj_clr_spa_range_base,
err_clr->err_inj_clr_spa_range_length);
err_clr->status = 0;
return 0;
err:
err_clr->status = NFIT_ARS_INJECT_INVALID;
return rc;
}
static int nfit_test_cmd_ars_inject_status(struct nfit_test *t,
struct nd_cmd_ars_err_inj_stat *err_stat,
unsigned int buf_len)
{
struct badrange_entry *be;
int max = SZ_4K / sizeof(struct nd_error_stat_query_record);
int i = 0;
err_stat->status = 0;
spin_lock(&t->badrange.lock);
list_for_each_entry(be, &t->badrange.list, list) {
err_stat->record[i].err_inj_stat_spa_range_base = be->start;
err_stat->record[i].err_inj_stat_spa_range_length = be->length;
i++;
if (i > max)
break;
}
spin_unlock(&t->badrange.lock);
err_stat->inj_err_rec_count = i;
return 0;
}
static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc, static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
struct nvdimm *nvdimm, unsigned int cmd, void *buf, struct nvdimm *nvdimm, unsigned int cmd, void *buf,
unsigned int buf_len, int *cmd_rc) unsigned int buf_len, int *cmd_rc)
@ -449,6 +634,38 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
} }
} else { } else {
struct ars_state *ars_state = &t->ars_state; struct ars_state *ars_state = &t->ars_state;
struct nd_cmd_pkg *call_pkg = buf;
if (!nd_desc)
return -ENOTTY;
if (cmd == ND_CMD_CALL) {
func = call_pkg->nd_command;
buf_len = call_pkg->nd_size_in + call_pkg->nd_size_out;
buf = (void *) call_pkg->nd_payload;
switch (func) {
case NFIT_CMD_TRANSLATE_SPA:
rc = nfit_test_cmd_translate_spa(
acpi_desc->nvdimm_bus, buf, buf_len);
return rc;
case NFIT_CMD_ARS_INJECT_SET:
rc = nfit_test_cmd_ars_error_inject(t, buf,
buf_len);
return rc;
case NFIT_CMD_ARS_INJECT_CLEAR:
rc = nfit_test_cmd_ars_inject_clear(t, buf,
buf_len);
return rc;
case NFIT_CMD_ARS_INJECT_GET:
rc = nfit_test_cmd_ars_inject_status(t, buf,
buf_len);
return rc;
default:
return -ENOTTY;
}
}
if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask)) if (!nd_desc || !test_bit(cmd, &nd_desc->cmd_mask))
return -ENOTTY; return -ENOTTY;
@ -458,15 +675,15 @@ static int nfit_test_ctl(struct nvdimm_bus_descriptor *nd_desc,
rc = nfit_test_cmd_ars_cap(buf, buf_len); rc = nfit_test_cmd_ars_cap(buf, buf_len);
break; break;
case ND_CMD_ARS_START: case ND_CMD_ARS_START:
rc = nfit_test_cmd_ars_start(ars_state, buf, buf_len, rc = nfit_test_cmd_ars_start(t, ars_state, buf,
cmd_rc); buf_len, cmd_rc);
break; break;
case ND_CMD_ARS_STATUS: case ND_CMD_ARS_STATUS:
rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len, rc = nfit_test_cmd_ars_status(ars_state, buf, buf_len,
cmd_rc); cmd_rc);
break; break;
case ND_CMD_CLEAR_ERROR: case ND_CMD_CLEAR_ERROR:
rc = nfit_test_cmd_clear_error(buf, buf_len, cmd_rc); rc = nfit_test_cmd_clear_error(t, buf, buf_len, cmd_rc);
break; break;
default: default:
return -ENOTTY; return -ENOTTY;
@ -566,10 +783,9 @@ static struct nfit_test_resource *nfit_test_lookup(resource_size_t addr)
static int ars_state_init(struct device *dev, struct ars_state *ars_state) static int ars_state_init(struct device *dev, struct ars_state *ars_state)
{ {
/* for testing, only store up to n records that fit within 4k */
ars_state->ars_status = devm_kzalloc(dev, ars_state->ars_status = devm_kzalloc(dev,
sizeof(struct nd_cmd_ars_status) sizeof(struct nd_cmd_ars_status) + SZ_4K, GFP_KERNEL);
+ sizeof(struct nd_ars_record) * NFIT_TEST_ARS_RECORDS,
GFP_KERNEL);
if (!ars_state->ars_status) if (!ars_state->ars_status)
return -ENOMEM; return -ENOMEM;
spin_lock_init(&ars_state->lock); spin_lock_init(&ars_state->lock);
@ -1419,7 +1635,8 @@ static void nfit_test0_setup(struct nfit_test *t)
+ i * sizeof(u64); + i * sizeof(u64);
} }
post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA0_SIZE); post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
SPA0_SIZE);
acpi_desc = &t->acpi_desc; acpi_desc = &t->acpi_desc;
set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en); set_bit(ND_CMD_GET_CONFIG_SIZE, &acpi_desc->dimm_cmd_force_en);
@ -1430,7 +1647,12 @@ static void nfit_test0_setup(struct nfit_test *t)
set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_ARS_START, &acpi_desc->bus_cmd_force_en);
set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_ARS_STATUS, &acpi_desc->bus_cmd_force_en);
set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_CLEAR_ERROR, &acpi_desc->bus_cmd_force_en);
set_bit(ND_CMD_CALL, &acpi_desc->bus_cmd_force_en);
set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en); set_bit(ND_CMD_SMART_THRESHOLD, &acpi_desc->dimm_cmd_force_en);
set_bit(NFIT_CMD_TRANSLATE_SPA, &acpi_desc->bus_nfit_cmd_force_en);
set_bit(NFIT_CMD_ARS_INJECT_SET, &acpi_desc->bus_nfit_cmd_force_en);
set_bit(NFIT_CMD_ARS_INJECT_CLEAR, &acpi_desc->bus_nfit_cmd_force_en);
set_bit(NFIT_CMD_ARS_INJECT_GET, &acpi_desc->bus_nfit_cmd_force_en);
} }
static void nfit_test1_setup(struct nfit_test *t) static void nfit_test1_setup(struct nfit_test *t)
@ -1520,7 +1742,8 @@ static void nfit_test1_setup(struct nfit_test *t)
dcr->code = NFIT_FIC_BYTE; dcr->code = NFIT_FIC_BYTE;
dcr->windows = 0; dcr->windows = 0;
post_ars_status(&t->ars_state, t->spa_set_dma[0], SPA2_SIZE); post_ars_status(&t->ars_state, &t->badrange, t->spa_set_dma[0],
SPA2_SIZE);
acpi_desc = &t->acpi_desc; acpi_desc = &t->acpi_desc;
set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en); set_bit(ND_CMD_ARS_CAP, &acpi_desc->bus_cmd_force_en);
@ -1589,6 +1812,7 @@ static int nfit_ctl_test(struct device *dev)
unsigned long mask, cmd_size, offset; unsigned long mask, cmd_size, offset;
union { union {
struct nd_cmd_get_config_size cfg_size; struct nd_cmd_get_config_size cfg_size;
struct nd_cmd_clear_error clear_err;
struct nd_cmd_ars_status ars_stat; struct nd_cmd_ars_status ars_stat;
struct nd_cmd_ars_cap ars_cap; struct nd_cmd_ars_cap ars_cap;
char buf[sizeof(struct nd_cmd_ars_status) char buf[sizeof(struct nd_cmd_ars_status)
@@ -1613,10 +1837,15 @@ static int nfit_ctl_test(struct device *dev)
 			.cmd_mask = 1UL << ND_CMD_ARS_CAP
 				| 1UL << ND_CMD_ARS_START
 				| 1UL << ND_CMD_ARS_STATUS
-				| 1UL << ND_CMD_CLEAR_ERROR,
+				| 1UL << ND_CMD_CLEAR_ERROR
+				| 1UL << ND_CMD_CALL,
 			.module = THIS_MODULE,
 			.provider_name = "ACPI.NFIT",
 			.ndctl = acpi_nfit_ctl,
+			.bus_dsm_mask = 1UL << NFIT_CMD_TRANSLATE_SPA
+				| 1UL << NFIT_CMD_ARS_INJECT_SET
+				| 1UL << NFIT_CMD_ARS_INJECT_CLEAR
+				| 1UL << NFIT_CMD_ARS_INJECT_GET,
 		},
 		.dev = &adev->dev,
 	};
@@ -1767,6 +1996,23 @@ static int nfit_ctl_test(struct device *dev)
 		return -EIO;
 	}
 
+	/* test clear error */
+	cmd_size = sizeof(cmds.clear_err);
+	cmds.clear_err = (struct nd_cmd_clear_error) {
+		.length = 512,
+		.cleared = 512,
+	};
+	rc = setup_result(cmds.buf, cmd_size);
+	if (rc)
+		return rc;
+	rc = acpi_nfit_ctl(&acpi_desc->nd_desc, NULL, ND_CMD_CLEAR_ERROR,
+			cmds.buf, cmd_size, &cmd_rc);
+	if (rc < 0 || cmd_rc) {
+		dev_dbg(dev, "%s: failed at: %d rc: %d cmd_rc: %d\n",
+				__func__, __LINE__, rc, cmd_rc);
+		return -EIO;
+	}
+
 	return 0;
 }
@@ -1915,6 +2161,10 @@ static __init int nfit_test_init(void)
 	nfit_test_setup(nfit_test_lookup, nfit_test_evaluate_dsm);
 
+	nfit_wq = create_singlethread_workqueue("nfit");
+	if (!nfit_wq)
+		return -ENOMEM;
+
 	nfit_test_dimm = class_create(THIS_MODULE, "nfit_test_dimm");
 	if (IS_ERR(nfit_test_dimm)) {
 		rc = PTR_ERR(nfit_test_dimm);
@@ -1931,6 +2181,7 @@ static __init int nfit_test_init(void)
 			goto err_register;
 		}
 		INIT_LIST_HEAD(&nfit_test->resources);
+		badrange_init(&nfit_test->badrange);
 		switch (i) {
 		case 0:
 			nfit_test->num_pm = NUM_PM;
@@ -1966,6 +2217,7 @@ static __init int nfit_test_init(void)
 			goto err_register;
 		instances[i] = nfit_test;
+		INIT_WORK(&nfit_test->work, uc_error_notify);
 	}
 
 	rc = platform_driver_register(&nfit_test_driver);
@@ -1974,6 +2226,7 @@ static __init int nfit_test_init(void)
 	return 0;
 
 err_register:
+	destroy_workqueue(nfit_wq);
 	for (i = 0; i < NUM_NFITS; i++)
 		if (instances[i])
 			platform_device_unregister(&instances[i]->pdev);
@@ -1989,6 +2242,8 @@ static __exit void nfit_test_exit(void)
 {
 	int i;
 
+	flush_workqueue(nfit_wq);
+	destroy_workqueue(nfit_wq);
 	for (i = 0; i < NUM_NFITS; i++)
 		platform_device_unregister(&instances[i]->pdev);
 	platform_driver_unregister(&nfit_test_driver);

--- a/tools/testing/nvdimm/test/nfit_test.h
+++ b/tools/testing/nvdimm/test/nfit_test.h

@@ -32,6 +32,58 @@ struct nfit_test_resource {
 	void *buf;
 };
 
+#define ND_TRANSLATE_SPA_STATUS_INVALID_SPA 2
+#define NFIT_ARS_INJECT_INVALID 2
+
+enum err_inj_options {
+	ND_ARS_ERR_INJ_OPT_NOTIFY = 0,
+};
+
+/* nfit commands */
+enum nfit_cmd_num {
+	NFIT_CMD_TRANSLATE_SPA = 5,
+	NFIT_CMD_ARS_INJECT_SET = 7,
+	NFIT_CMD_ARS_INJECT_CLEAR = 8,
+	NFIT_CMD_ARS_INJECT_GET = 9,
+};
+
+struct nd_cmd_translate_spa {
+	__u64 spa;
+	__u32 status;
+	__u8 flags;
+	__u8 _reserved[3];
+	__u64 translate_length;
+	__u32 num_nvdimms;
+	struct nd_nvdimm_device {
+		__u32 nfit_device_handle;
+		__u32 _reserved;
+		__u64 dpa;
+	} __packed devices[0];
+} __packed;
+
+struct nd_cmd_ars_err_inj {
+	__u64 err_inj_spa_range_base;
+	__u64 err_inj_spa_range_length;
+	__u8 err_inj_options;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_clr {
+	__u64 err_inj_clr_spa_range_base;
+	__u64 err_inj_clr_spa_range_length;
+	__u32 status;
+} __packed;
+
+struct nd_cmd_ars_err_inj_stat {
+	__u32 status;
+	__u32 inj_err_rec_count;
+	struct nd_error_stat_query_record {
+		__u64 err_inj_stat_spa_range_base;
+		__u64 err_inj_stat_spa_range_length;
+	} __packed record[0];
+} __packed;
+
 union acpi_object;
 typedef void *acpi_handle;
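
For reference only, and not part of the patch series above: a minimal userspace
sketch of driving the new ARS error-injection payload through the ND_CMD_CALL
ioctl passthrough. The bus device path, the zero nd_family value, and the
injection address are assumptions for an nfit_test environment; the command
number and payload layout are copied from the nfit_test.h hunk above, since
they are not exported in the uapi headers.

/* Illustrative sketch only -- assumptions noted in comments. */
#include <fcntl.h>
#include <stddef.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <linux/ndctl.h>	/* struct nd_cmd_pkg, ND_IOCTL_CALL */

/* Copied from the nfit_test.h additions above (not in uapi) */
enum { NFIT_CMD_ARS_INJECT_SET = 7 };
enum { ND_ARS_ERR_INJ_OPT_NOTIFY = 0 };

struct nd_cmd_ars_err_inj {
	__u64 err_inj_spa_range_base;
	__u64 err_inj_spa_range_length;
	__u8 err_inj_options;
	__u32 status;
} __attribute__((packed));

int main(void)
{
	/* Assumed values: a SPA inside one of the test regions, one 512B sector */
	struct nd_cmd_ars_err_inj inj = {
		.err_inj_spa_range_base = 0x10000000,
		.err_inj_spa_range_length = 512,
		.err_inj_options = 1 << ND_ARS_ERR_INJ_OPT_NOTIFY,
	};
	struct nd_cmd_pkg *pkg;
	struct nd_cmd_ars_err_inj *out;
	int fd, rc;

	pkg = calloc(1, sizeof(*pkg) + sizeof(inj));
	if (!pkg)
		return 1;
	pkg->nd_family = 0;			/* assumption: bus-scoped call */
	pkg->nd_command = NFIT_CMD_ARS_INJECT_SET;
	pkg->nd_size_in = offsetof(struct nd_cmd_ars_err_inj, status);
	pkg->nd_size_out = sizeof(inj.status);
	memcpy(pkg->nd_payload, &inj, sizeof(inj));

	fd = open("/dev/ndctl0", O_RDWR);	/* assumption: first nvdimm bus */
	if (fd < 0) {
		perror("open");
		return 1;
	}
	rc = ioctl(fd, ND_IOCTL_CALL, pkg);
	out = (struct nd_cmd_ars_err_inj *)pkg->nd_payload;
	printf("inject rc: %d status: %u\n", rc, out->status);
	free(pkg);
	return rc < 0;
}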