diff --git a/Documentation/ABI/testing/debugfs-hisi-migration b/Documentation/ABI/testing/debugfs-hisi-migration
new file mode 100644
index 000000000000..2c01b2d387dd
--- /dev/null
+++ b/Documentation/ABI/testing/debugfs-hisi-migration
@@ -0,0 +1,25 @@
+What:		/sys/kernel/debug/vfio/<bdf>/migration/hisi_acc/dev_data
+Date:		Jan 2025
+KernelVersion:	6.13
+Contact:	Longfang Liu
+Description:	Read the configuration data and some status data
+		required for device live migration. These data include device
+		status data, queue configuration data, some task configuration
+		data and device attribute data. The output format of the data
+		is defined by the live migration driver.
+
+What:		/sys/kernel/debug/vfio/<bdf>/migration/hisi_acc/migf_data
+Date:		Jan 2025
+KernelVersion:	6.13
+Contact:	Longfang Liu
+Description:	Read the data from the last completed live migration.
+		This data includes the same device status data as in "dev_data".
+		The migf_data is the dev_data that is migrated.
+
+What:		/sys/kernel/debug/vfio/<bdf>/migration/hisi_acc/cmd_state
+Date:		Jan 2025
+KernelVersion:	6.13
+Contact:	Longfang Liu
+Description:	Used to obtain the device command sending and receiving
+		channel status. Returns failure or success logs based on the
+		results.
diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
index 0d632ba5d2a3..451c639299eb 100644
--- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
+++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.c
@@ -486,31 +486,11 @@ static int vf_qm_load_data(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 	return 0;
 }
 
-static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
-			    struct hisi_acc_vf_migration_file *migf)
+static int vf_qm_read_data(struct hisi_qm *vf_qm, struct acc_vf_data *vf_data)
 {
-	struct acc_vf_data *vf_data = &migf->vf_data;
-	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
 	struct device *dev = &vf_qm->pdev->dev;
 	int ret;
 
-	if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
-		/* Update state and return with match data */
-		vf_data->vf_qm_state = QM_NOT_READY;
-		hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
-		migf->total_length = QM_MATCH_SIZE;
-		return 0;
-	}
-
-	vf_data->vf_qm_state = QM_READY;
-	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
-
-	ret = vf_qm_cache_wb(vf_qm);
-	if (ret) {
-		dev_err(dev, "failed to writeback QM Cache!\n");
-		return ret;
-	}
-
 	ret = qm_get_regs(vf_qm, vf_data);
 	if (ret)
 		return -EINVAL;
@@ -536,6 +516,38 @@ static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
 		return -EINVAL;
 	}
 
+	return 0;
+}
+
+static int vf_qm_state_save(struct hisi_acc_vf_core_device *hisi_acc_vdev,
+			    struct hisi_acc_vf_migration_file *migf)
+{
+	struct acc_vf_data *vf_data = &migf->vf_data;
+	struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm;
+	struct device *dev = &vf_qm->pdev->dev;
+	int ret;
+
+	if (unlikely(qm_wait_dev_not_ready(vf_qm))) {
+		/* Update state and return with match data */
+		vf_data->vf_qm_state = QM_NOT_READY;
+		hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+		migf->total_length = QM_MATCH_SIZE;
+		return 0;
+	}
+
+	vf_data->vf_qm_state = QM_READY;
+	hisi_acc_vdev->vf_qm_state = vf_data->vf_qm_state;
+
+	ret = vf_qm_cache_wb(vf_qm);
+	if (ret) {
+		dev_err(dev, "failed to writeback QM Cache!\n");
+		return ret;
+	}
+
+	ret = vf_qm_read_data(vf_qm, vf_data);
+	if (ret)
+		return -EINVAL;
+
 	migf->total_length = sizeof(struct acc_vf_data);
 	return 0;
 }
@@ -615,21 +627,43 @@ static void hisi_acc_vf_disable_fd(struct hisi_acc_vf_migration_file *migf)
mutex_unlock(&migf->lock); } +static void +hisi_acc_debug_migf_copy(struct hisi_acc_vf_core_device *hisi_acc_vdev, + struct hisi_acc_vf_migration_file *src_migf) +{ + struct hisi_acc_vf_migration_file *dst_migf = hisi_acc_vdev->debug_migf; + + if (!dst_migf) + return; + + dst_migf->total_length = src_migf->total_length; + memcpy(&dst_migf->vf_data, &src_migf->vf_data, + sizeof(struct acc_vf_data)); +} + static void hisi_acc_vf_disable_fds(struct hisi_acc_vf_core_device *hisi_acc_vdev) { if (hisi_acc_vdev->resuming_migf) { + hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->resuming_migf); hisi_acc_vf_disable_fd(hisi_acc_vdev->resuming_migf); fput(hisi_acc_vdev->resuming_migf->filp); hisi_acc_vdev->resuming_migf = NULL; } if (hisi_acc_vdev->saving_migf) { + hisi_acc_debug_migf_copy(hisi_acc_vdev, hisi_acc_vdev->saving_migf); hisi_acc_vf_disable_fd(hisi_acc_vdev->saving_migf); fput(hisi_acc_vdev->saving_migf->filp); hisi_acc_vdev->saving_migf = NULL; } } +static struct hisi_acc_vf_core_device *hisi_acc_get_vf_dev(struct vfio_device *vdev) +{ + return container_of(vdev, struct hisi_acc_vf_core_device, + core_device.vdev); +} + static void hisi_acc_vf_reset(struct hisi_acc_vf_core_device *hisi_acc_vdev) { hisi_acc_vdev->vf_qm_state = QM_NOT_READY; @@ -1031,8 +1065,7 @@ static struct file * hisi_acc_vfio_pci_set_device_state(struct vfio_device *vdev, enum vfio_device_mig_state new_state) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); enum vfio_device_mig_state next_state; struct file *res = NULL; int ret; @@ -1073,8 +1106,7 @@ static int hisi_acc_vfio_pci_get_device_state(struct vfio_device *vdev, enum vfio_device_mig_state *curr_state) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); mutex_lock(&hisi_acc_vdev->state_mutex); *curr_state = hisi_acc_vdev->mig_state; @@ -1276,10 +1308,132 @@ static long hisi_acc_vfio_pci_ioctl(struct vfio_device *core_vdev, unsigned int return vfio_pci_core_ioctl(core_vdev, cmd, arg); } +static int hisi_acc_vf_debug_check(struct seq_file *seq, struct vfio_device *vdev) +{ + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + int ret; + + lockdep_assert_held(&hisi_acc_vdev->open_mutex); + /* + * When the device is not opened, the io_base is not mapped. + * The driver cannot perform device read and write operations. 
+ */ + if (!hisi_acc_vdev->dev_opened) { + seq_puts(seq, "device not opened!\n"); + return -EINVAL; + } + + ret = qm_wait_dev_not_ready(vf_qm); + if (ret) { + seq_puts(seq, "VF device not ready!\n"); + return -EBUSY; + } + + return 0; +} + +static int hisi_acc_vf_debug_cmd(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev); + struct vfio_device *vdev = &core_device->vdev; + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + u64 value; + int ret; + + mutex_lock(&hisi_acc_vdev->open_mutex); + ret = hisi_acc_vf_debug_check(seq, vdev); + if (ret) { + mutex_unlock(&hisi_acc_vdev->open_mutex); + return ret; + } + + value = readl(vf_qm->io_base + QM_MB_CMD_SEND_BASE); + if (value == QM_MB_CMD_NOT_READY) { + mutex_unlock(&hisi_acc_vdev->open_mutex); + seq_puts(seq, "mailbox cmd channel not ready!\n"); + return -EINVAL; + } + mutex_unlock(&hisi_acc_vdev->open_mutex); + seq_puts(seq, "mailbox cmd channel ready!\n"); + + return 0; +} + +static int hisi_acc_vf_dev_read(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev); + struct vfio_device *vdev = &core_device->vdev; + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + size_t vf_data_sz = offsetofend(struct acc_vf_data, padding); + struct acc_vf_data *vf_data; + int ret; + + mutex_lock(&hisi_acc_vdev->open_mutex); + ret = hisi_acc_vf_debug_check(seq, vdev); + if (ret) { + mutex_unlock(&hisi_acc_vdev->open_mutex); + return ret; + } + + mutex_lock(&hisi_acc_vdev->state_mutex); + vf_data = kzalloc(sizeof(*vf_data), GFP_KERNEL); + if (!vf_data) { + ret = -ENOMEM; + goto mutex_release; + } + + vf_data->vf_qm_state = hisi_acc_vdev->vf_qm_state; + ret = vf_qm_read_data(&hisi_acc_vdev->vf_qm, vf_data); + if (ret) + goto migf_err; + + seq_hex_dump(seq, "Dev Data:", DUMP_PREFIX_OFFSET, 16, 1, + (const void *)vf_data, vf_data_sz, false); + + seq_printf(seq, + "guest driver load: %u\n" + "data size: %lu\n", + hisi_acc_vdev->vf_qm_state, + sizeof(struct acc_vf_data)); + +migf_err: + kfree(vf_data); +mutex_release: + mutex_unlock(&hisi_acc_vdev->state_mutex); + mutex_unlock(&hisi_acc_vdev->open_mutex); + + return ret; +} + +static int hisi_acc_vf_migf_read(struct seq_file *seq, void *data) +{ + struct device *vf_dev = seq->private; + struct vfio_pci_core_device *core_device = dev_get_drvdata(vf_dev); + struct vfio_device *vdev = &core_device->vdev; + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(vdev); + size_t vf_data_sz = offsetofend(struct acc_vf_data, padding); + struct hisi_acc_vf_migration_file *debug_migf = hisi_acc_vdev->debug_migf; + + /* Check whether the live migration operation has been performed */ + if (debug_migf->total_length < QM_MATCH_SIZE) { + seq_puts(seq, "device not migrated!\n"); + return -EAGAIN; + } + + seq_hex_dump(seq, "Mig Data:", DUMP_PREFIX_OFFSET, 16, 1, + (const void *)&debug_migf->vf_data, vf_data_sz, false); + seq_printf(seq, "migrate data length: %lu\n", debug_migf->total_length); + + return 0; +} + static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct vfio_pci_core_device *vdev = 
&hisi_acc_vdev->core_device; int ret; @@ -1288,12 +1442,16 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) return ret; if (core_vdev->mig_ops) { + mutex_lock(&hisi_acc_vdev->open_mutex); ret = hisi_acc_vf_qm_init(hisi_acc_vdev); if (ret) { + mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_disable(vdev); return ret; } hisi_acc_vdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + hisi_acc_vdev->dev_opened = true; + mutex_unlock(&hisi_acc_vdev->open_mutex); } vfio_pci_core_finish_enable(vdev); @@ -1302,11 +1460,13 @@ static int hisi_acc_vfio_pci_open_device(struct vfio_device *core_vdev) static void hisi_acc_vfio_pci_close_device(struct vfio_device *core_vdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct hisi_qm *vf_qm = &hisi_acc_vdev->vf_qm; + mutex_lock(&hisi_acc_vdev->open_mutex); + hisi_acc_vdev->dev_opened = false; iounmap(vf_qm->io_base); + mutex_unlock(&hisi_acc_vdev->open_mutex); vfio_pci_core_close_device(core_vdev); } @@ -1318,8 +1478,7 @@ static const struct vfio_migration_ops hisi_acc_vfio_pci_migrn_state_ops = { static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) { - struct hisi_acc_vf_core_device *hisi_acc_vdev = container_of(core_vdev, - struct hisi_acc_vf_core_device, core_device.vdev); + struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_get_vf_dev(core_vdev); struct pci_dev *pdev = to_pci_dev(core_vdev->dev); struct hisi_qm *pf_qm = hisi_acc_get_pf_qm(pdev); @@ -1327,6 +1486,7 @@ static int hisi_acc_vfio_pci_migrn_init_dev(struct vfio_device *core_vdev) hisi_acc_vdev->pf_qm = pf_qm; hisi_acc_vdev->vf_dev = pdev; mutex_init(&hisi_acc_vdev->state_mutex); + mutex_init(&hisi_acc_vdev->open_mutex); core_vdev->migration_flags = VFIO_MIGRATION_STOP_COPY | VFIO_MIGRATION_PRE_COPY; core_vdev->mig_ops = &hisi_acc_vfio_pci_migrn_state_ops; @@ -1372,6 +1532,47 @@ static const struct vfio_device_ops hisi_acc_vfio_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +static void hisi_acc_vfio_debug_init(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + struct vfio_device *vdev = &hisi_acc_vdev->core_device.vdev; + struct hisi_acc_vf_migration_file *migf; + struct dentry *vfio_dev_migration; + struct dentry *vfio_hisi_acc; + struct device *dev = vdev->dev; + + if (!debugfs_initialized() || + !IS_ENABLED(CONFIG_VFIO_DEBUGFS)) + return; + + if (vdev->ops != &hisi_acc_vfio_pci_migrn_ops) + return; + + vfio_dev_migration = debugfs_lookup("migration", vdev->debug_root); + if (!vfio_dev_migration) { + dev_err(dev, "failed to lookup migration debugfs file!\n"); + return; + } + + migf = kzalloc(sizeof(*migf), GFP_KERNEL); + if (!migf) + return; + hisi_acc_vdev->debug_migf = migf; + + vfio_hisi_acc = debugfs_create_dir("hisi_acc", vfio_dev_migration); + debugfs_create_devm_seqfile(dev, "dev_data", vfio_hisi_acc, + hisi_acc_vf_dev_read); + debugfs_create_devm_seqfile(dev, "migf_data", vfio_hisi_acc, + hisi_acc_vf_migf_read); + debugfs_create_devm_seqfile(dev, "cmd_state", vfio_hisi_acc, + hisi_acc_vf_debug_cmd); +} + +static void hisi_acc_vf_debugfs_exit(struct hisi_acc_vf_core_device *hisi_acc_vdev) +{ + kfree(hisi_acc_vdev->debug_migf); + hisi_acc_vdev->debug_migf = NULL; +} + static int hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { struct hisi_acc_vf_core_device *hisi_acc_vdev; @@ -1398,6 +1599,8 @@ static int 
hisi_acc_vfio_pci_probe(struct pci_dev *pdev, const struct pci_device ret = vfio_pci_core_register_device(&hisi_acc_vdev->core_device); if (ret) goto out_put_vdev; + + hisi_acc_vfio_debug_init(hisi_acc_vdev); return 0; out_put_vdev: @@ -1410,6 +1613,7 @@ static void hisi_acc_vfio_pci_remove(struct pci_dev *pdev) struct hisi_acc_vf_core_device *hisi_acc_vdev = hisi_acc_drvdata(pdev); vfio_pci_core_unregister_device(&hisi_acc_vdev->core_device); + hisi_acc_vf_debugfs_exit(hisi_acc_vdev); vfio_put_device(&hisi_acc_vdev->core_device.vdev); } diff --git a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h index 5bab46602fad..245d7537b2bc 100644 --- a/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h +++ b/drivers/vfio/pci/hisilicon/hisi_acc_vfio_pci.h @@ -32,6 +32,7 @@ #define QM_SQC_VFT_BASE_MASK_V2 GENMASK(15, 0) #define QM_SQC_VFT_NUM_SHIFT_V2 45 #define QM_SQC_VFT_NUM_MASK_V2 GENMASK(9, 0) +#define QM_MB_CMD_NOT_READY 0xffffffff /* RW regs */ #define QM_REGS_MAX_LEN 7 @@ -99,6 +100,13 @@ struct hisi_acc_vf_migration_file { struct hisi_acc_vf_core_device { struct vfio_pci_core_device core_device; u8 match_done; + /* + * io_base is only valid when dev_opened is true, + * which is protected by open_mutex. + */ + bool dev_opened; + /* Ensure the accuracy of dev_opened operation */ + struct mutex open_mutex; /* For migration state */ struct mutex state_mutex; @@ -107,9 +115,20 @@ struct hisi_acc_vf_core_device { struct pci_dev *vf_dev; struct hisi_qm *pf_qm; struct hisi_qm vf_qm; + /* + * vf_qm_state represents the QM_VF_STATE register value. + * It is set by Guest driver for the ACC VF dev indicating + * the driver has loaded and configured the dev correctly. + */ u32 vf_qm_state; int vf_id; struct hisi_acc_vf_migration_file *resuming_migf; struct hisi_acc_vf_migration_file *saving_migf; + + /* + * It holds migration data corresponding to the last migration + * and is used by the debugfs interface to report it. 
+ */ + struct hisi_acc_vf_migration_file *debug_migf; }; #endif /* HISI_ACC_VFIO_PCI_H */ diff --git a/drivers/vfio/pci/mlx5/cmd.c b/drivers/vfio/pci/mlx5/cmd.c index 41a4b0cf4297..7527e277c898 100644 --- a/drivers/vfio/pci/mlx5/cmd.c +++ b/drivers/vfio/pci/mlx5/cmd.c @@ -423,6 +423,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, unsigned long filled; unsigned int to_fill; int ret; + int i; to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); page_list = kvzalloc(to_fill * sizeof(*page_list), GFP_KERNEL_ACCOUNT); @@ -443,7 +444,7 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, GFP_KERNEL_ACCOUNT); if (ret) - goto err; + goto err_append; buf->allocated_length += filled * PAGE_SIZE; /* clean input for another bulk allocation */ memset(page_list, 0, filled * sizeof(*page_list)); @@ -454,6 +455,9 @@ static int mlx5vf_add_migration_pages(struct mlx5_vhca_data_buffer *buf, kvfree(page_list); return 0; +err_append: + for (i = filled - 1; i >= 0; i--) + __free_page(page_list[i]); err: kvfree(page_list); return ret; diff --git a/drivers/vfio/pci/mlx5/main.c b/drivers/vfio/pci/mlx5/main.c index 242c23eef452..8833e60d42f5 100644 --- a/drivers/vfio/pci/mlx5/main.c +++ b/drivers/vfio/pci/mlx5/main.c @@ -640,14 +640,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) O_RDONLY); if (IS_ERR(migf->filp)) { ret = PTR_ERR(migf->filp); - goto end; + kfree(migf); + return ERR_PTR(ret); } migf->mvdev = mvdev; - ret = mlx5vf_cmd_alloc_pd(migf); - if (ret) - goto out_free; - stream_open(migf->filp->f_inode, migf->filp); mutex_init(&migf->lock); init_waitqueue_head(&migf->poll_wait); @@ -663,6 +660,11 @@ mlx5vf_pci_save_device_data(struct mlx5vf_pci_core_device *mvdev, bool track) INIT_LIST_HEAD(&migf->buf_list); INIT_LIST_HEAD(&migf->avail_list); spin_lock_init(&migf->list_lock); + + ret = mlx5vf_cmd_alloc_pd(migf); + if (ret) + goto out; + ret = mlx5vf_cmd_query_vhca_migration_state(mvdev, &length, &full_size, 0); if (ret) goto out_pd; @@ -692,10 +694,8 @@ out_save: mlx5vf_free_data_buffer(buf); out_pd: mlx5fv_cmd_clean_migf_resources(migf); -out_free: +out: fput(migf->filp); -end: - kfree(migf); return ERR_PTR(ret); } @@ -1016,13 +1016,19 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) O_WRONLY); if (IS_ERR(migf->filp)) { ret = PTR_ERR(migf->filp); - goto end; + kfree(migf); + return ERR_PTR(ret); } + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); migf->mvdev = mvdev; ret = mlx5vf_cmd_alloc_pd(migf); if (ret) - goto out_free; + goto out; buf = mlx5vf_alloc_data_buffer(migf, 0, DMA_TO_DEVICE); if (IS_ERR(buf)) { @@ -1041,20 +1047,13 @@ mlx5vf_pci_resume_device_data(struct mlx5vf_pci_core_device *mvdev) migf->buf_header[0] = buf; migf->load_state = MLX5_VF_LOAD_STATE_READ_HEADER; - stream_open(migf->filp->f_inode, migf->filp); - mutex_init(&migf->lock); - INIT_LIST_HEAD(&migf->buf_list); - INIT_LIST_HEAD(&migf->avail_list); - spin_lock_init(&migf->list_lock); return migf; out_buf: mlx5vf_free_data_buffer(migf->buf[0]); out_pd: mlx5vf_cmd_dealloc_pd(migf); -out_free: +out: fput(migf->filp); -end: - kfree(migf); return ERR_PTR(ret); } diff --git a/drivers/vfio/pci/nvgrace-gpu/main.c b/drivers/vfio/pci/nvgrace-gpu/main.c index a7fd018aa548..a467085038f0 100644 --- a/drivers/vfio/pci/nvgrace-gpu/main.c +++ b/drivers/vfio/pci/nvgrace-gpu/main.c @@ -866,6 
+866,8 @@ static const struct pci_device_id nvgrace_gpu_vfio_pci_table[] = { { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2342) }, /* GH200 480GB */ { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2345) }, + /* GH200 SKU */ + { PCI_DRIVER_OVERRIDE_DEVICE_VFIO(PCI_VENDOR_ID_NVIDIA, 0x2348) }, {} }; diff --git a/drivers/vfio/pci/qat/main.c b/drivers/vfio/pci/qat/main.c index be3644ced17b..c78cb6de9390 100644 --- a/drivers/vfio/pci/qat/main.c +++ b/drivers/vfio/pci/qat/main.c @@ -304,7 +304,7 @@ static ssize_t qat_vf_resume_write(struct file *filp, const char __user *buf, offs = &filp->f_pos; if (*offs < 0 || - check_add_overflow((loff_t)len, *offs, &end)) + check_add_overflow(len, *offs, &end)) return -EOVERFLOW; if (end > mig_dev->state_size) diff --git a/drivers/vfio/pci/vfio_pci_config.c b/drivers/vfio/pci/vfio_pci_config.c index 97422aafaa7b..ea2745c1ac5e 100644 --- a/drivers/vfio/pci/vfio_pci_config.c +++ b/drivers/vfio/pci/vfio_pci_config.c @@ -313,6 +313,10 @@ static int vfio_virt_config_read(struct vfio_pci_core_device *vdev, int pos, return count; } +static struct perm_bits direct_ro_perms = { + .readfn = vfio_direct_config_read, +}; + /* Default capability regions to read-only, no-virtualization */ static struct perm_bits cap_perms[PCI_CAP_ID_MAX + 1] = { [0 ... PCI_CAP_ID_MAX] = { .readfn = vfio_direct_config_read } @@ -1897,9 +1901,17 @@ static ssize_t vfio_config_do_rw(struct vfio_pci_core_device *vdev, char __user cap_start = *ppos; } else { if (*ppos >= PCI_CFG_SPACE_SIZE) { - WARN_ON(cap_id > PCI_EXT_CAP_ID_MAX); + /* + * We can get a cap_id that exceeds PCI_EXT_CAP_ID_MAX + * if we're hiding an unknown capability at the start + * of the extended capability list. Use default, ro + * access, which will virtualize the id and next values. + */ + if (cap_id > PCI_EXT_CAP_ID_MAX) + perm = &direct_ro_perms; + else + perm = &ecap_perms[cap_id]; - perm = &ecap_perms[cap_id]; cap_start = vfio_find_cap_start(vdev, *ppos); } else { WARN_ON(cap_id > PCI_CAP_ID_MAX); diff --git a/drivers/vfio/pci/virtio/Kconfig b/drivers/vfio/pci/virtio/Kconfig index bd80eca4a196..2770f7eb702c 100644 --- a/drivers/vfio/pci/virtio/Kconfig +++ b/drivers/vfio/pci/virtio/Kconfig @@ -1,15 +1,31 @@ # SPDX-License-Identifier: GPL-2.0-only config VIRTIO_VFIO_PCI - tristate "VFIO support for VIRTIO NET PCI devices" - depends on VIRTIO_PCI && VIRTIO_PCI_ADMIN_LEGACY - select VFIO_PCI_CORE - help - This provides support for exposing VIRTIO NET VF devices which support - legacy IO access, using the VFIO framework that can work with a legacy - virtio driver in the guest. - Based on PCIe spec, VFs do not support I/O Space. - As of that this driver emulates I/O BAR in software to let a VF be - seen as a transitional device by its users and let it work with - a legacy driver. + tristate "VFIO support for VIRTIO NET PCI VF devices" + depends on VIRTIO_PCI + select VFIO_PCI_CORE + help + This provides migration support for VIRTIO NET PCI VF devices + using the VFIO framework. Migration support requires the + SR-IOV PF device to support specific VIRTIO extensions, + otherwise this driver provides no additional functionality + beyond vfio-pci. - If you don't know what to do here, say N. + Migration support in this driver relies on dirty page tracking + provided by the IOMMU hardware and exposed through IOMMUFD, any + other use cases are dis-recommended. + + If you don't know what to do here, say N. 
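The behaviour described by these two Kconfig entries is decided at probe time: legacy I/O and live migration are detected independently, and the result only changes which vfio_device_ops table the VF is registered with. Below is a condensed, illustrative sketch of that selection; virtiovf_pick_ops() is a hypothetical helper name, while the capability checks and ops tables are the ones added later in this patch (see the main.c hunks further down).

static const struct vfio_device_ops *
virtiovf_pick_ops(struct pci_dev *pdev, bool *sup_legacy_io, bool *sup_lm)
{
	*sup_legacy_io = false;
	*sup_lm = false;

	/* Non-VF devices get plain vfio-pci behaviour */
	if (!pdev->is_virtfn)
		return &virtiovf_vfio_pci_ops;

#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY
	/* Legacy I/O needs PF admin-queue support and no physical BAR0 */
	*sup_legacy_io = virtiovf_support_legacy_io(pdev);
#endif
	/* Live migration needs the PF to expose device-parts admin commands */
	*sup_lm = virtio_pci_admin_has_dev_parts(pdev);

	if (*sup_legacy_io)
		return &virtiovf_vfio_pci_tran_lm_ops;	/* legacy I/O; migration added separately if supported */
	if (*sup_lm)
		return &virtiovf_vfio_pci_lm_ops;	/* migration only */
	return &virtiovf_vfio_pci_ops;			/* neither: plain vfio-pci */
}

In the actual probe function the sup_lm result is additionally used to call virtiovf_set_migratable() once the device has been allocated with vfio_alloc_device().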
+ +config VIRTIO_VFIO_PCI_ADMIN_LEGACY + bool "Legacy I/O support for VIRTIO NET PCI VF devices" + depends on VIRTIO_VFIO_PCI && VIRTIO_PCI_ADMIN_LEGACY + default y + help + This extends the virtio-vfio-pci driver to support legacy I/O + access, allowing use of legacy virtio drivers with VIRTIO NET + PCI VF devices. Legacy I/O support requires the SR-IOV PF + device to support and enable specific VIRTIO extensions, + otherwise this driver provides no additional functionality + beyond vfio-pci. + + If you don't know what to do here, say N. diff --git a/drivers/vfio/pci/virtio/Makefile b/drivers/vfio/pci/virtio/Makefile index 7171105baf33..d9b0bb40d6b3 100644 --- a/drivers/vfio/pci/virtio/Makefile +++ b/drivers/vfio/pci/virtio/Makefile @@ -1,3 +1,4 @@ # SPDX-License-Identifier: GPL-2.0-only obj-$(CONFIG_VIRTIO_VFIO_PCI) += virtio-vfio-pci.o -virtio-vfio-pci-y := main.o +virtio-vfio-pci-y := main.o migrate.o +virtio-vfio-pci-$(CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY) += legacy_io.o diff --git a/drivers/vfio/pci/virtio/common.h b/drivers/vfio/pci/virtio/common.h new file mode 100644 index 000000000000..c7d7e27af386 --- /dev/null +++ b/drivers/vfio/pci/virtio/common.h @@ -0,0 +1,127 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef VIRTIO_VFIO_COMMON_H +#define VIRTIO_VFIO_COMMON_H + +#include +#include +#include +#include + +enum virtiovf_migf_state { + VIRTIOVF_MIGF_STATE_ERROR = 1, + VIRTIOVF_MIGF_STATE_PRECOPY = 2, + VIRTIOVF_MIGF_STATE_COMPLETE = 3, +}; + +enum virtiovf_load_state { + VIRTIOVF_LOAD_STATE_READ_HEADER, + VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA, + VIRTIOVF_LOAD_STATE_READ_HEADER_DATA, + VIRTIOVF_LOAD_STATE_PREP_CHUNK, + VIRTIOVF_LOAD_STATE_READ_CHUNK, + VIRTIOVF_LOAD_STATE_LOAD_CHUNK, +}; + +struct virtiovf_data_buffer { + struct sg_append_table table; + loff_t start_pos; + u64 length; + u64 allocated_length; + struct list_head buf_elm; + u8 include_header_object:1; + struct virtiovf_migration_file *migf; + /* Optimize virtiovf_get_migration_page() for sequential access */ + struct scatterlist *last_offset_sg; + unsigned int sg_last_entry; + unsigned long last_offset; +}; + +enum virtiovf_migf_header_flags { + VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY = 0, + VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL = 1 << 0, +}; + +enum virtiovf_migf_header_tag { + VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA = 0, +}; + +struct virtiovf_migration_header { + __le64 record_size; + /* For future use in case we may need to change the kernel protocol */ + __le32 flags; /* Use virtiovf_migf_header_flags */ + __le32 tag; /* Use virtiovf_migf_header_tag */ + __u8 data[]; /* Its size is given in the record_size */ +}; + +struct virtiovf_migration_file { + struct file *filp; + /* synchronize access to the file state */ + struct mutex lock; + loff_t max_pos; + u64 pre_copy_initial_bytes; + struct ratelimit_state pre_copy_rl_state; + u64 record_size; + u32 record_tag; + u8 has_obj_id:1; + u32 obj_id; + enum virtiovf_migf_state state; + enum virtiovf_load_state load_state; + /* synchronize access to the lists */ + spinlock_t list_lock; + struct list_head buf_list; + struct list_head avail_list; + struct virtiovf_data_buffer *buf; + struct virtiovf_data_buffer *buf_header; + struct virtiovf_pci_core_device *virtvdev; +}; + +struct virtiovf_pci_core_device { + struct vfio_pci_core_device core_device; +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + u8 *bar0_virtual_buf; + /* synchronize access to the virtual buf */ + struct mutex bar_mutex; + void __iomem *notify_addr; + u64 notify_offset; + __le32 pci_base_addr_0; + 
__le16 pci_cmd; + u8 bar0_virtual_buf_size; + u8 notify_bar; +#endif + + /* LM related */ + u8 migrate_cap:1; + u8 deferred_reset:1; + /* protect migration state */ + struct mutex state_mutex; + enum vfio_device_mig_state mig_state; + /* protect the reset_done flow */ + spinlock_t reset_lock; + struct virtiovf_migration_file *resuming_migf; + struct virtiovf_migration_file *saving_migf; +}; + +void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_migration_reset_done(struct pci_dev *pdev); + +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY +int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev); +long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg); +int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg); +ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos); +ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos); +bool virtiovf_support_legacy_io(struct pci_dev *pdev); +int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev); +void virtiovf_legacy_io_reset_done(struct pci_dev *pdev); +#endif + +#endif /* VIRTIO_VFIO_COMMON_H */ diff --git a/drivers/vfio/pci/virtio/legacy_io.c b/drivers/vfio/pci/virtio/legacy_io.c new file mode 100644 index 000000000000..20382ee15fac --- /dev/null +++ b/drivers/vfio/pci/virtio/legacy_io.c @@ -0,0 +1,418 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2023, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +static int +virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, + loff_t pos, char __user *buf, + size_t count, bool read) +{ + bool msix_enabled = + (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX); + struct pci_dev *pdev = virtvdev->core_device.pdev; + u8 *bar0_buf = virtvdev->bar0_virtual_buf; + bool common; + u8 offset; + int ret; + + common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled); + /* offset within the relevant configuration area */ + offset = common ? 
pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled); + mutex_lock(&virtvdev->bar_mutex); + if (read) { + if (common) + ret = virtio_pci_admin_legacy_common_io_read(pdev, offset, + count, bar0_buf + pos); + else + ret = virtio_pci_admin_legacy_device_io_read(pdev, offset, + count, bar0_buf + pos); + if (ret) + goto out; + if (copy_to_user(buf, bar0_buf + pos, count)) + ret = -EFAULT; + } else { + if (copy_from_user(bar0_buf + pos, buf, count)) { + ret = -EFAULT; + goto out; + } + + if (common) + ret = virtio_pci_admin_legacy_common_io_write(pdev, offset, + count, bar0_buf + pos); + else + ret = virtio_pci_admin_legacy_device_io_write(pdev, offset, + count, bar0_buf + pos); + } +out: + mutex_unlock(&virtvdev->bar_mutex); + return ret; +} + +static int +virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev, + loff_t pos, char __user *buf, + size_t count, bool read) +{ + struct vfio_pci_core_device *core_device = &virtvdev->core_device; + struct pci_dev *pdev = core_device->pdev; + u16 queue_notify; + int ret; + + if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO)) + return -EIO; + + if (pos + count > virtvdev->bar0_virtual_buf_size) + return -EINVAL; + + ret = pm_runtime_resume_and_get(&pdev->dev); + if (ret) { + pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret); + return -EIO; + } + + switch (pos) { + case VIRTIO_PCI_QUEUE_NOTIFY: + if (count != sizeof(queue_notify)) { + ret = -EINVAL; + goto end; + } + if (read) { + ret = vfio_pci_core_ioread16(core_device, true, &queue_notify, + virtvdev->notify_addr); + if (ret) + goto end; + if (copy_to_user(buf, &queue_notify, + sizeof(queue_notify))) { + ret = -EFAULT; + goto end; + } + } else { + if (copy_from_user(&queue_notify, buf, count)) { + ret = -EFAULT; + goto end; + } + ret = vfio_pci_core_iowrite16(core_device, true, queue_notify, + virtvdev->notify_addr); + } + break; + default: + ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count, + read); + } + +end: + pm_runtime_put(&pdev->dev); + return ret ? 
ret : count; +} + +static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, + char __user *buf, size_t count, + loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + __le32 val32; + __le16 val16; + u8 val8; + int ret; + + ret = vfio_pci_core_read(core_vdev, buf, count, ppos); + if (ret < 0) + return ret; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { + val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) + return -EFAULT; + } + + if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && + vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { + if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, + copy_count)) + return -EFAULT; + val16 |= cpu_to_le16(PCI_COMMAND_IO); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, + sizeof(val8), ©_offset, + ©_count, ®ister_offset)) { + /* Transional needs to have revision 0 */ + val8 = 0; + if (copy_to_user(buf + copy_offset, &val8, copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(val32), ©_offset, + ©_count, ®ister_offset)) { + u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); + u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); + + val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO); + if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { + /* + * Transitional devices use the PCI subsystem device id as + * virtio device id, same as legacy driver always did. 
+ */ + val16 = cpu_to_le16(VIRTIO_ID_NET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, + sizeof(val16), ©_offset, + ©_count, ®ister_offset)) { + val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); + if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, + copy_count)) + return -EFAULT; + } + + return count; +} + +ssize_t virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, + size_t count, loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count) + return 0; + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return virtiovf_pci_read_config(core_vdev, buf, count, ppos); + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true); + + return vfio_pci_core_read(core_vdev, buf, count, ppos); +} + +static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, + const char __user *buf, size_t count, + loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + size_t register_offset; + loff_t copy_offset; + size_t copy_count; + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, + sizeof(virtvdev->pci_cmd), + ©_offset, ©_count, + ®ister_offset)) { + if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, + buf + copy_offset, + copy_count)) + return -EFAULT; + } + + if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, + sizeof(virtvdev->pci_base_addr_0), + ©_offset, ©_count, + ®ister_offset)) { + if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, + buf + copy_offset, + copy_count)) + return -EFAULT; + } + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +ssize_t virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, + size_t count, loff_t *ppos) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); + loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; + + if (!count) + return 0; + + if (index == VFIO_PCI_CONFIG_REGION_INDEX) + return virtiovf_pci_write_config(core_vdev, buf, count, ppos); + + if (index == VFIO_PCI_BAR0_REGION_INDEX) + return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false); + + return vfio_pci_core_write(core_vdev, buf, count, ppos); +} + +int virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, + unsigned int cmd, unsigned long arg) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + unsigned long minsz = offsetofend(struct vfio_region_info, offset); + void __user *uarg = (void __user *)arg; + struct vfio_region_info info = {}; + + if (copy_from_user(&info, uarg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + switch (info.index) { + case VFIO_PCI_BAR0_REGION_INDEX: + info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); + info.size = virtvdev->bar0_virtual_buf_size; + info.flags = VFIO_REGION_INFO_FLAG_READ | + VFIO_REGION_INFO_FLAG_WRITE; + return copy_to_user(uarg, 
&info, minsz) ? -EFAULT : 0; + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +long virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, + unsigned long arg) +{ + switch (cmd) { + case VFIO_DEVICE_GET_REGION_INFO: + return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); + default: + return vfio_pci_core_ioctl(core_vdev, cmd, arg); + } +} + +static int virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) +{ + struct vfio_pci_core_device *core_device = &virtvdev->core_device; + int ret; + + /* + * Setup the BAR where the 'notify' exists to be used by vfio as well + * This will let us mmap it only once and use it when needed. + */ + ret = vfio_pci_core_setup_barmap(core_device, + virtvdev->notify_bar); + if (ret) + return ret; + + virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] + + virtvdev->notify_offset; + return 0; +} + +int virtiovf_open_legacy_io(struct virtiovf_pci_core_device *virtvdev) +{ + if (!virtvdev->bar0_virtual_buf) + return 0; + + /* + * Upon close_device() the vfio_pci_core_disable() is called + * and will close all the previous mmaps, so it seems that the + * valid life cycle for the 'notify' addr is per open/close. + */ + return virtiovf_set_notify_addr(virtvdev); +} + +static int virtiovf_get_device_config_size(unsigned short device) +{ + /* Network card */ + return offsetofend(struct virtio_net_config, status); +} + +static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev) +{ + u64 offset; + int ret; + u8 bar; + + ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev, + VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM, + &bar, &offset); + if (ret) + return ret; + + virtvdev->notify_bar = bar; + virtvdev->notify_offset = offset; + return 0; +} + +static bool virtiovf_bar0_exists(struct pci_dev *pdev) +{ + struct resource *res = pdev->resource; + + return res->flags; +} + +bool virtiovf_support_legacy_io(struct pci_dev *pdev) +{ + return virtio_pci_admin_has_legacy_io(pdev) && !virtiovf_bar0_exists(pdev); +} + +int virtiovf_init_legacy_io(struct virtiovf_pci_core_device *virtvdev) +{ + struct pci_dev *pdev = virtvdev->core_device.pdev; + int ret; + + ret = virtiovf_read_notify_info(virtvdev); + if (ret) + return ret; + + virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) + + virtiovf_get_device_config_size(pdev->device); + BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size)); + virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size, + GFP_KERNEL); + if (!virtvdev->bar0_virtual_buf) + return -ENOMEM; + mutex_init(&virtvdev->bar_mutex); + return 0; +} + +void virtiovf_release_legacy_io(struct virtiovf_pci_core_device *virtvdev) +{ + kfree(virtvdev->bar0_virtual_buf); +} + +void virtiovf_legacy_io_reset_done(struct pci_dev *pdev) +{ + struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); + + virtvdev->pci_cmd = 0; +} diff --git a/drivers/vfio/pci/virtio/main.c b/drivers/vfio/pci/virtio/main.c index b5d3a8c5bbc9..d534d48c4163 100644 --- a/drivers/vfio/pci/virtio/main.c +++ b/drivers/vfio/pci/virtio/main.c @@ -16,347 +16,12 @@ #include #include -struct virtiovf_pci_core_device { - struct vfio_pci_core_device core_device; - u8 *bar0_virtual_buf; - /* synchronize access to the virtual buf */ - struct mutex bar_mutex; - void __iomem *notify_addr; - u64 notify_offset; - __le32 pci_base_addr_0; - __le16 pci_cmd; - u8 bar0_virtual_buf_size; - u8 notify_bar; -}; - -static int 
-virtiovf_issue_legacy_rw_cmd(struct virtiovf_pci_core_device *virtvdev, - loff_t pos, char __user *buf, - size_t count, bool read) -{ - bool msix_enabled = - (virtvdev->core_device.irq_type == VFIO_PCI_MSIX_IRQ_INDEX); - struct pci_dev *pdev = virtvdev->core_device.pdev; - u8 *bar0_buf = virtvdev->bar0_virtual_buf; - bool common; - u8 offset; - int ret; - - common = pos < VIRTIO_PCI_CONFIG_OFF(msix_enabled); - /* offset within the relevant configuration area */ - offset = common ? pos : pos - VIRTIO_PCI_CONFIG_OFF(msix_enabled); - mutex_lock(&virtvdev->bar_mutex); - if (read) { - if (common) - ret = virtio_pci_admin_legacy_common_io_read(pdev, offset, - count, bar0_buf + pos); - else - ret = virtio_pci_admin_legacy_device_io_read(pdev, offset, - count, bar0_buf + pos); - if (ret) - goto out; - if (copy_to_user(buf, bar0_buf + pos, count)) - ret = -EFAULT; - } else { - if (copy_from_user(bar0_buf + pos, buf, count)) { - ret = -EFAULT; - goto out; - } - - if (common) - ret = virtio_pci_admin_legacy_common_io_write(pdev, offset, - count, bar0_buf + pos); - else - ret = virtio_pci_admin_legacy_device_io_write(pdev, offset, - count, bar0_buf + pos); - } -out: - mutex_unlock(&virtvdev->bar_mutex); - return ret; -} - -static int -virtiovf_pci_bar0_rw(struct virtiovf_pci_core_device *virtvdev, - loff_t pos, char __user *buf, - size_t count, bool read) -{ - struct vfio_pci_core_device *core_device = &virtvdev->core_device; - struct pci_dev *pdev = core_device->pdev; - u16 queue_notify; - int ret; - - if (!(le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO)) - return -EIO; - - if (pos + count > virtvdev->bar0_virtual_buf_size) - return -EINVAL; - - ret = pm_runtime_resume_and_get(&pdev->dev); - if (ret) { - pci_info_ratelimited(pdev, "runtime resume failed %d\n", ret); - return -EIO; - } - - switch (pos) { - case VIRTIO_PCI_QUEUE_NOTIFY: - if (count != sizeof(queue_notify)) { - ret = -EINVAL; - goto end; - } - if (read) { - ret = vfio_pci_core_ioread16(core_device, true, &queue_notify, - virtvdev->notify_addr); - if (ret) - goto end; - if (copy_to_user(buf, &queue_notify, - sizeof(queue_notify))) { - ret = -EFAULT; - goto end; - } - } else { - if (copy_from_user(&queue_notify, buf, count)) { - ret = -EFAULT; - goto end; - } - ret = vfio_pci_core_iowrite16(core_device, true, queue_notify, - virtvdev->notify_addr); - } - break; - default: - ret = virtiovf_issue_legacy_rw_cmd(virtvdev, pos, buf, count, - read); - } - -end: - pm_runtime_put(&pdev->dev); - return ret ? 
ret : count; -} - -static ssize_t virtiovf_pci_read_config(struct vfio_device *core_vdev, - char __user *buf, size_t count, - loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - size_t register_offset; - loff_t copy_offset; - size_t copy_count; - __le32 val32; - __le16 val16; - u8 val8; - int ret; - - ret = vfio_pci_core_read(core_vdev, buf, count, ppos); - if (ret < 0) - return ret; - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_DEVICE_ID, - sizeof(val16), ©_offset, - ©_count, ®ister_offset)) { - val16 = cpu_to_le16(VIRTIO_TRANS_ID_NET); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, copy_count)) - return -EFAULT; - } - - if ((le16_to_cpu(virtvdev->pci_cmd) & PCI_COMMAND_IO) && - vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, - sizeof(val16), ©_offset, - ©_count, ®ister_offset)) { - if (copy_from_user((void *)&val16 + register_offset, buf + copy_offset, - copy_count)) - return -EFAULT; - val16 |= cpu_to_le16(PCI_COMMAND_IO); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, - copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_REVISION_ID, - sizeof(val8), ©_offset, - ©_count, ®ister_offset)) { - /* Transional needs to have revision 0 */ - val8 = 0; - if (copy_to_user(buf + copy_offset, &val8, copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, - sizeof(val32), ©_offset, - ©_count, ®ister_offset)) { - u32 bar_mask = ~(virtvdev->bar0_virtual_buf_size - 1); - u32 pci_base_addr_0 = le32_to_cpu(virtvdev->pci_base_addr_0); - - val32 = cpu_to_le32((pci_base_addr_0 & bar_mask) | PCI_BASE_ADDRESS_SPACE_IO); - if (copy_to_user(buf + copy_offset, (void *)&val32 + register_offset, copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_ID, - sizeof(val16), ©_offset, - ©_count, ®ister_offset)) { - /* - * Transitional devices use the PCI subsystem device id as - * virtio device id, same as legacy driver always did. 
- */ - val16 = cpu_to_le16(VIRTIO_ID_NET); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, - copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_SUBSYSTEM_VENDOR_ID, - sizeof(val16), ©_offset, - ©_count, ®ister_offset)) { - val16 = cpu_to_le16(PCI_VENDOR_ID_REDHAT_QUMRANET); - if (copy_to_user(buf + copy_offset, (void *)&val16 + register_offset, - copy_count)) - return -EFAULT; - } - - return count; -} - -static ssize_t -virtiovf_pci_core_read(struct vfio_device *core_vdev, char __user *buf, - size_t count, loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - - if (!count) - return 0; - - if (index == VFIO_PCI_CONFIG_REGION_INDEX) - return virtiovf_pci_read_config(core_vdev, buf, count, ppos); - - if (index == VFIO_PCI_BAR0_REGION_INDEX) - return virtiovf_pci_bar0_rw(virtvdev, pos, buf, count, true); - - return vfio_pci_core_read(core_vdev, buf, count, ppos); -} - -static ssize_t virtiovf_pci_write_config(struct vfio_device *core_vdev, - const char __user *buf, size_t count, - loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - size_t register_offset; - loff_t copy_offset; - size_t copy_count; - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_COMMAND, - sizeof(virtvdev->pci_cmd), - ©_offset, ©_count, - ®ister_offset)) { - if (copy_from_user((void *)&virtvdev->pci_cmd + register_offset, - buf + copy_offset, - copy_count)) - return -EFAULT; - } - - if (vfio_pci_core_range_intersect_range(pos, count, PCI_BASE_ADDRESS_0, - sizeof(virtvdev->pci_base_addr_0), - ©_offset, ©_count, - ®ister_offset)) { - if (copy_from_user((void *)&virtvdev->pci_base_addr_0 + register_offset, - buf + copy_offset, - copy_count)) - return -EFAULT; - } - - return vfio_pci_core_write(core_vdev, buf, count, ppos); -} - -static ssize_t -virtiovf_pci_core_write(struct vfio_device *core_vdev, const char __user *buf, - size_t count, loff_t *ppos) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned int index = VFIO_PCI_OFFSET_TO_INDEX(*ppos); - loff_t pos = *ppos & VFIO_PCI_OFFSET_MASK; - - if (!count) - return 0; - - if (index == VFIO_PCI_CONFIG_REGION_INDEX) - return virtiovf_pci_write_config(core_vdev, buf, count, ppos); - - if (index == VFIO_PCI_BAR0_REGION_INDEX) - return virtiovf_pci_bar0_rw(virtvdev, pos, (char __user *)buf, count, false); - - return vfio_pci_core_write(core_vdev, buf, count, ppos); -} - -static int -virtiovf_pci_ioctl_get_region_info(struct vfio_device *core_vdev, - unsigned int cmd, unsigned long arg) -{ - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - unsigned long minsz = offsetofend(struct vfio_region_info, offset); - void __user *uarg = (void __user *)arg; - struct vfio_region_info info = {}; - - if (copy_from_user(&info, uarg, minsz)) - return -EFAULT; - - if (info.argsz < minsz) - return -EINVAL; - - switch (info.index) { - case VFIO_PCI_BAR0_REGION_INDEX: - info.offset = VFIO_PCI_INDEX_TO_OFFSET(info.index); - info.size = virtvdev->bar0_virtual_buf_size; - info.flags = VFIO_REGION_INFO_FLAG_READ | - VFIO_REGION_INFO_FLAG_WRITE; - 
return copy_to_user(uarg, &info, minsz) ? -EFAULT : 0; - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -static long -virtiovf_vfio_pci_core_ioctl(struct vfio_device *core_vdev, unsigned int cmd, - unsigned long arg) -{ - switch (cmd) { - case VFIO_DEVICE_GET_REGION_INFO: - return virtiovf_pci_ioctl_get_region_info(core_vdev, cmd, arg); - default: - return vfio_pci_core_ioctl(core_vdev, cmd, arg); - } -} - -static int -virtiovf_set_notify_addr(struct virtiovf_pci_core_device *virtvdev) -{ - struct vfio_pci_core_device *core_device = &virtvdev->core_device; - int ret; - - /* - * Setup the BAR where the 'notify' exists to be used by vfio as well - * This will let us mmap it only once and use it when needed. - */ - ret = vfio_pci_core_setup_barmap(core_device, - virtvdev->notify_bar); - if (ret) - return ret; - - virtvdev->notify_addr = core_device->barmap[virtvdev->notify_bar] + - virtvdev->notify_offset; - return 0; -} +#include "common.h" static int virtiovf_pci_open_device(struct vfio_device *core_vdev) { - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); struct vfio_pci_core_device *vdev = &virtvdev->core_device; int ret; @@ -364,88 +29,84 @@ static int virtiovf_pci_open_device(struct vfio_device *core_vdev) if (ret) return ret; - if (virtvdev->bar0_virtual_buf) { - /* - * Upon close_device() the vfio_pci_core_disable() is called - * and will close all the previous mmaps, so it seems that the - * valid life cycle for the 'notify' addr is per open/close. - */ - ret = virtiovf_set_notify_addr(virtvdev); - if (ret) { - vfio_pci_core_disable(vdev); - return ret; - } +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + ret = virtiovf_open_legacy_io(virtvdev); + if (ret) { + vfio_pci_core_disable(vdev); + return ret; } +#endif + virtiovf_open_migration(virtvdev); vfio_pci_core_finish_enable(vdev); return 0; } -static int virtiovf_get_device_config_size(unsigned short device) +static void virtiovf_pci_close_device(struct vfio_device *core_vdev) { - /* Network card */ - return offsetofend(struct virtio_net_config, status); -} - -static int virtiovf_read_notify_info(struct virtiovf_pci_core_device *virtvdev) -{ - u64 offset; - int ret; - u8 bar; - - ret = virtio_pci_admin_legacy_io_notify_info(virtvdev->core_device.pdev, - VIRTIO_ADMIN_CMD_NOTIFY_INFO_FLAGS_OWNER_MEM, - &bar, &offset); - if (ret) - return ret; - - virtvdev->notify_bar = bar; - virtvdev->notify_offset = offset; - return 0; + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); + + virtiovf_close_migration(virtvdev); + vfio_pci_core_close_device(core_vdev); } +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY static int virtiovf_pci_init_device(struct vfio_device *core_vdev) { - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); - struct pci_dev *pdev; + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); int ret; ret = vfio_pci_core_init_dev(core_vdev); if (ret) return ret; - pdev = virtvdev->core_device.pdev; - ret = virtiovf_read_notify_info(virtvdev); - if (ret) - return ret; - - virtvdev->bar0_virtual_buf_size = VIRTIO_PCI_CONFIG_OFF(true) + - virtiovf_get_device_config_size(pdev->device); - 
BUILD_BUG_ON(!is_power_of_2(virtvdev->bar0_virtual_buf_size)); - virtvdev->bar0_virtual_buf = kzalloc(virtvdev->bar0_virtual_buf_size, - GFP_KERNEL); - if (!virtvdev->bar0_virtual_buf) - return -ENOMEM; - mutex_init(&virtvdev->bar_mutex); - return 0; + /* + * The vfio_device_ops.init() callback is set to virtiovf_pci_init_device() + * only when legacy I/O is supported. Now, let's initialize it. + */ + return virtiovf_init_legacy_io(virtvdev); } +#endif static void virtiovf_pci_core_release_dev(struct vfio_device *core_vdev) { - struct virtiovf_pci_core_device *virtvdev = container_of( - core_vdev, struct virtiovf_pci_core_device, core_device.vdev); +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + struct virtiovf_pci_core_device *virtvdev = container_of(core_vdev, + struct virtiovf_pci_core_device, core_device.vdev); - kfree(virtvdev->bar0_virtual_buf); + virtiovf_release_legacy_io(virtvdev); +#endif vfio_pci_core_release_dev(core_vdev); } -static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = { - .name = "virtio-vfio-pci-trans", +static const struct vfio_device_ops virtiovf_vfio_pci_lm_ops = { + .name = "virtio-vfio-pci-lm", + .init = vfio_pci_core_init_dev, + .release = virtiovf_pci_core_release_dev, + .open_device = virtiovf_pci_open_device, + .close_device = virtiovf_pci_close_device, + .ioctl = vfio_pci_core_ioctl, + .device_feature = vfio_pci_core_ioctl_feature, + .read = vfio_pci_core_read, + .write = vfio_pci_core_write, + .mmap = vfio_pci_core_mmap, + .request = vfio_pci_core_request, + .match = vfio_pci_core_match, + .bind_iommufd = vfio_iommufd_physical_bind, + .unbind_iommufd = vfio_iommufd_physical_unbind, + .attach_ioas = vfio_iommufd_physical_attach_ioas, + .detach_ioas = vfio_iommufd_physical_detach_ioas, +}; + +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY +static const struct vfio_device_ops virtiovf_vfio_pci_tran_lm_ops = { + .name = "virtio-vfio-pci-trans-lm", .init = virtiovf_pci_init_device, .release = virtiovf_pci_core_release_dev, .open_device = virtiovf_pci_open_device, - .close_device = vfio_pci_core_close_device, + .close_device = virtiovf_pci_close_device, .ioctl = virtiovf_vfio_pci_core_ioctl, .device_feature = vfio_pci_core_ioctl_feature, .read = virtiovf_pci_core_read, @@ -458,6 +119,7 @@ static const struct vfio_device_ops virtiovf_vfio_pci_tran_ops = { .attach_ioas = vfio_iommufd_physical_attach_ioas, .detach_ioas = vfio_iommufd_physical_detach_ioas, }; +#endif static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .name = "virtio-vfio-pci", @@ -478,29 +140,34 @@ static const struct vfio_device_ops virtiovf_vfio_pci_ops = { .detach_ioas = vfio_iommufd_physical_detach_ioas, }; -static bool virtiovf_bar0_exists(struct pci_dev *pdev) -{ - struct resource *res = pdev->resource; - - return res->flags; -} - static int virtiovf_pci_probe(struct pci_dev *pdev, const struct pci_device_id *id) { const struct vfio_device_ops *ops = &virtiovf_vfio_pci_ops; struct virtiovf_pci_core_device *virtvdev; + bool sup_legacy_io = false; + bool sup_lm = false; int ret; - if (pdev->is_virtfn && virtio_pci_admin_has_legacy_io(pdev) && - !virtiovf_bar0_exists(pdev)) - ops = &virtiovf_vfio_pci_tran_ops; + if (pdev->is_virtfn) { +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + sup_legacy_io = virtiovf_support_legacy_io(pdev); + if (sup_legacy_io) + ops = &virtiovf_vfio_pci_tran_lm_ops; +#endif + sup_lm = virtio_pci_admin_has_dev_parts(pdev); + if (sup_lm && !sup_legacy_io) + ops = &virtiovf_vfio_pci_lm_ops; + } virtvdev = vfio_alloc_device(virtiovf_pci_core_device, 
core_device.vdev, &pdev->dev, ops); if (IS_ERR(virtvdev)) return PTR_ERR(virtvdev); + if (sup_lm) + virtiovf_set_migratable(virtvdev); + dev_set_drvdata(&pdev->dev, &virtvdev->core_device); ret = vfio_pci_core_register_device(&virtvdev->core_device); if (ret) @@ -529,9 +196,10 @@ MODULE_DEVICE_TABLE(pci, virtiovf_pci_table); static void virtiovf_pci_aer_reset_done(struct pci_dev *pdev) { - struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); - - virtvdev->pci_cmd = 0; +#ifdef CONFIG_VIRTIO_VFIO_PCI_ADMIN_LEGACY + virtiovf_legacy_io_reset_done(pdev); +#endif + virtiovf_migration_reset_done(pdev); } static const struct pci_error_handlers virtiovf_err_handlers = { diff --git a/drivers/vfio/pci/virtio/migrate.c b/drivers/vfio/pci/virtio/migrate.c new file mode 100644 index 000000000000..ee54f4c17857 --- /dev/null +++ b/drivers/vfio/pci/virtio/migrate.c @@ -0,0 +1,1337 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Copyright (c) 2024, NVIDIA CORPORATION & AFFILIATES. All rights reserved + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common.h" + +/* Device specification max parts size */ +#define MAX_LOAD_SIZE (BIT_ULL(BITS_PER_TYPE \ + (((struct virtio_admin_cmd_dev_parts_metadata_result *)0)->parts_size.size)) - 1) + +/* Initial target buffer size */ +#define VIRTIOVF_TARGET_INITIAL_BUF_SIZE SZ_1M + +static int +virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf, + u32 ctx_size); + +static struct page * +virtiovf_get_migration_page(struct virtiovf_data_buffer *buf, + unsigned long offset) +{ + unsigned long cur_offset = 0; + struct scatterlist *sg; + unsigned int i; + + /* All accesses are sequential */ + if (offset < buf->last_offset || !buf->last_offset_sg) { + buf->last_offset = 0; + buf->last_offset_sg = buf->table.sgt.sgl; + buf->sg_last_entry = 0; + } + + cur_offset = buf->last_offset; + + for_each_sg(buf->last_offset_sg, sg, + buf->table.sgt.orig_nents - buf->sg_last_entry, i) { + if (offset < sg->length + cur_offset) { + buf->last_offset_sg = sg; + buf->sg_last_entry += i; + buf->last_offset = cur_offset; + return nth_page(sg_page(sg), + (offset - cur_offset) / PAGE_SIZE); + } + cur_offset += sg->length; + } + return NULL; +} + +static int virtiovf_add_migration_pages(struct virtiovf_data_buffer *buf, + unsigned int npages) +{ + unsigned int to_alloc = npages; + struct page **page_list; + unsigned long filled; + unsigned int to_fill; + int ret; + int i; + + to_fill = min_t(unsigned int, npages, PAGE_SIZE / sizeof(*page_list)); + page_list = kvcalloc(to_fill, sizeof(*page_list), GFP_KERNEL_ACCOUNT); + if (!page_list) + return -ENOMEM; + + do { + filled = alloc_pages_bulk_array(GFP_KERNEL_ACCOUNT, to_fill, + page_list); + if (!filled) { + ret = -ENOMEM; + goto err; + } + to_alloc -= filled; + ret = sg_alloc_append_table_from_pages(&buf->table, page_list, + filled, 0, filled << PAGE_SHIFT, UINT_MAX, + SG_MAX_SINGLE_ALLOC, GFP_KERNEL_ACCOUNT); + + if (ret) + goto err_append; + buf->allocated_length += filled * PAGE_SIZE; + /* clean input for another bulk allocation */ + memset(page_list, 0, filled * sizeof(*page_list)); + to_fill = min_t(unsigned int, to_alloc, + PAGE_SIZE / sizeof(*page_list)); + } while (to_alloc > 0); + + kvfree(page_list); + return 0; + +err_append: + for (i = filled - 1; i >= 0; i--) + __free_page(page_list[i]); +err: + kvfree(page_list); + return ret; +} + +static void virtiovf_free_data_buffer(struct 
virtiovf_data_buffer *buf) +{ + struct sg_page_iter sg_iter; + + /* Undo alloc_pages_bulk_array() */ + for_each_sgtable_page(&buf->table.sgt, &sg_iter, 0) + __free_page(sg_page_iter_page(&sg_iter)); + sg_free_append_table(&buf->table); + kfree(buf); +} + +static struct virtiovf_data_buffer * +virtiovf_alloc_data_buffer(struct virtiovf_migration_file *migf, size_t length) +{ + struct virtiovf_data_buffer *buf; + int ret; + + buf = kzalloc(sizeof(*buf), GFP_KERNEL_ACCOUNT); + if (!buf) + return ERR_PTR(-ENOMEM); + + ret = virtiovf_add_migration_pages(buf, + DIV_ROUND_UP_ULL(length, PAGE_SIZE)); + if (ret) + goto end; + + buf->migf = migf; + return buf; +end: + virtiovf_free_data_buffer(buf); + return ERR_PTR(ret); +} + +static void virtiovf_put_data_buffer(struct virtiovf_data_buffer *buf) +{ + spin_lock_irq(&buf->migf->list_lock); + list_add_tail(&buf->buf_elm, &buf->migf->avail_list); + spin_unlock_irq(&buf->migf->list_lock); +} + +static int +virtiovf_pci_alloc_obj_id(struct virtiovf_pci_core_device *virtvdev, u8 type, + u32 *obj_id) +{ + return virtio_pci_admin_obj_create(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, type, obj_id); +} + +static void +virtiovf_pci_free_obj_id(struct virtiovf_pci_core_device *virtvdev, u32 obj_id) +{ + virtio_pci_admin_obj_destroy(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id); +} + +static struct virtiovf_data_buffer * +virtiovf_get_data_buffer(struct virtiovf_migration_file *migf, size_t length) +{ + struct virtiovf_data_buffer *buf, *temp_buf; + struct list_head free_list; + + INIT_LIST_HEAD(&free_list); + + spin_lock_irq(&migf->list_lock); + list_for_each_entry_safe(buf, temp_buf, &migf->avail_list, buf_elm) { + list_del_init(&buf->buf_elm); + if (buf->allocated_length >= length) { + spin_unlock_irq(&migf->list_lock); + goto found; + } + /* + * Prevent holding redundant buffers. Put in a free + * list and call at the end not under the spin lock + * (&migf->list_lock) to minimize its scope usage. 
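+ * For example, a request that needs 2 MB will reuse the first cached
+ * buffer of at least that size; smaller buffers encountered on the way
+ * are queued on the local free_list and released only after the lock is
+ * dropped.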
+ */ + list_add(&buf->buf_elm, &free_list); + } + spin_unlock_irq(&migf->list_lock); + buf = virtiovf_alloc_data_buffer(migf, length); + +found: + while ((temp_buf = list_first_entry_or_null(&free_list, + struct virtiovf_data_buffer, buf_elm))) { + list_del(&temp_buf->buf_elm); + virtiovf_free_data_buffer(temp_buf); + } + + return buf; +} + +static void virtiovf_clean_migf_resources(struct virtiovf_migration_file *migf) +{ + struct virtiovf_data_buffer *entry; + + if (migf->buf) { + virtiovf_free_data_buffer(migf->buf); + migf->buf = NULL; + } + + if (migf->buf_header) { + virtiovf_free_data_buffer(migf->buf_header); + migf->buf_header = NULL; + } + + list_splice(&migf->avail_list, &migf->buf_list); + + while ((entry = list_first_entry_or_null(&migf->buf_list, + struct virtiovf_data_buffer, buf_elm))) { + list_del(&entry->buf_elm); + virtiovf_free_data_buffer(entry); + } + + if (migf->has_obj_id) + virtiovf_pci_free_obj_id(migf->virtvdev, migf->obj_id); +} + +static void virtiovf_disable_fd(struct virtiovf_migration_file *migf) +{ + mutex_lock(&migf->lock); + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + migf->filp->f_pos = 0; + mutex_unlock(&migf->lock); +} + +static void virtiovf_disable_fds(struct virtiovf_pci_core_device *virtvdev) +{ + if (virtvdev->resuming_migf) { + virtiovf_disable_fd(virtvdev->resuming_migf); + virtiovf_clean_migf_resources(virtvdev->resuming_migf); + fput(virtvdev->resuming_migf->filp); + virtvdev->resuming_migf = NULL; + } + if (virtvdev->saving_migf) { + virtiovf_disable_fd(virtvdev->saving_migf); + virtiovf_clean_migf_resources(virtvdev->saving_migf); + fput(virtvdev->saving_migf->filp); + virtvdev->saving_migf = NULL; + } +} + +/* + * This function is called in all state_mutex unlock cases to + * handle a 'deferred_reset' if exists. + */ +static void virtiovf_state_mutex_unlock(struct virtiovf_pci_core_device *virtvdev) +{ +again: + spin_lock(&virtvdev->reset_lock); + if (virtvdev->deferred_reset) { + virtvdev->deferred_reset = false; + spin_unlock(&virtvdev->reset_lock); + virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; + virtiovf_disable_fds(virtvdev); + goto again; + } + mutex_unlock(&virtvdev->state_mutex); + spin_unlock(&virtvdev->reset_lock); +} + +void virtiovf_migration_reset_done(struct pci_dev *pdev) +{ + struct virtiovf_pci_core_device *virtvdev = dev_get_drvdata(&pdev->dev); + + if (!virtvdev->migrate_cap) + return; + + /* + * As the higher VFIO layers are holding locks across reset and using + * those same locks with the mm_lock we need to prevent ABBA deadlock + * with the state_mutex and mm_lock. + * In case the state_mutex was taken already we defer the cleanup work + * to the unlock flow of the other running context. 
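+ * If the trylock below succeeds, this context performs the cleanup
+ * immediately via virtiovf_state_mutex_unlock(); otherwise the holder of
+ * state_mutex observes deferred_reset on its unlock path and performs the
+ * cleanup there.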
+ */ + spin_lock(&virtvdev->reset_lock); + virtvdev->deferred_reset = true; + if (!mutex_trylock(&virtvdev->state_mutex)) { + spin_unlock(&virtvdev->reset_lock); + return; + } + spin_unlock(&virtvdev->reset_lock); + virtiovf_state_mutex_unlock(virtvdev); +} + +static int virtiovf_release_file(struct inode *inode, struct file *filp) +{ + struct virtiovf_migration_file *migf = filp->private_data; + + virtiovf_disable_fd(migf); + mutex_destroy(&migf->lock); + kfree(migf); + return 0; +} + +static struct virtiovf_data_buffer * +virtiovf_get_data_buff_from_pos(struct virtiovf_migration_file *migf, + loff_t pos, bool *end_of_data) +{ + struct virtiovf_data_buffer *buf; + bool found = false; + + *end_of_data = false; + spin_lock_irq(&migf->list_lock); + if (list_empty(&migf->buf_list)) { + *end_of_data = true; + goto end; + } + + buf = list_first_entry(&migf->buf_list, struct virtiovf_data_buffer, + buf_elm); + if (pos >= buf->start_pos && + pos < buf->start_pos + buf->length) { + found = true; + goto end; + } + + /* + * As we use a stream based FD we may expect having the data always + * on first chunk + */ + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + +end: + spin_unlock_irq(&migf->list_lock); + return found ? buf : NULL; +} + +static ssize_t virtiovf_buf_read(struct virtiovf_data_buffer *vhca_buf, + char __user **buf, size_t *len, loff_t *pos) +{ + unsigned long offset; + ssize_t done = 0; + size_t copy_len; + + copy_len = min_t(size_t, + vhca_buf->start_pos + vhca_buf->length - *pos, *len); + while (copy_len) { + size_t page_offset; + struct page *page; + size_t page_len; + u8 *from_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + page_offset = offset % PAGE_SIZE; + offset -= page_offset; + page = virtiovf_get_migration_page(vhca_buf, offset); + if (!page) + return -EINVAL; + page_len = min_t(size_t, copy_len, PAGE_SIZE - page_offset); + from_buff = kmap_local_page(page); + ret = copy_to_user(*buf, from_buff + page_offset, page_len); + kunmap_local(from_buff); + if (ret) + return -EFAULT; + *pos += page_len; + *len -= page_len; + *buf += page_len; + done += page_len; + copy_len -= page_len; + } + + if (*pos >= vhca_buf->start_pos + vhca_buf->length) { + spin_lock_irq(&vhca_buf->migf->list_lock); + list_del_init(&vhca_buf->buf_elm); + list_add_tail(&vhca_buf->buf_elm, &vhca_buf->migf->avail_list); + spin_unlock_irq(&vhca_buf->migf->list_lock); + } + + return done; +} + +static ssize_t virtiovf_save_read(struct file *filp, char __user *buf, size_t len, + loff_t *pos) +{ + struct virtiovf_migration_file *migf = filp->private_data; + struct virtiovf_data_buffer *vhca_buf; + bool first_loop_call = true; + bool end_of_data; + ssize_t done = 0; + + if (pos) + return -ESPIPE; + pos = &filp->f_pos; + + mutex_lock(&migf->lock); + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { + done = -ENODEV; + goto out_unlock; + } + + while (len) { + ssize_t count; + + vhca_buf = virtiovf_get_data_buff_from_pos(migf, *pos, &end_of_data); + if (first_loop_call) { + first_loop_call = false; + /* Temporary end of file as part of PRE_COPY */ + if (end_of_data && migf->state == VIRTIOVF_MIGF_STATE_PRECOPY) { + done = -ENOMSG; + goto out_unlock; + } + if (end_of_data && migf->state != VIRTIOVF_MIGF_STATE_COMPLETE) { + done = -EINVAL; + goto out_unlock; + } + } + + if (end_of_data) + goto out_unlock; + + if (!vhca_buf) { + done = -EINVAL; + goto out_unlock; + } + + count = virtiovf_buf_read(vhca_buf, &buf, &len, pos); + if (count < 0) { + done = count; + goto out_unlock; + } + done += count; + } + +out_unlock: + 
mutex_unlock(&migf->lock); + return done; +} + +static long virtiovf_precopy_ioctl(struct file *filp, unsigned int cmd, + unsigned long arg) +{ + struct virtiovf_migration_file *migf = filp->private_data; + struct virtiovf_pci_core_device *virtvdev = migf->virtvdev; + struct vfio_precopy_info info = {}; + loff_t *pos = &filp->f_pos; + bool end_of_data = false; + unsigned long minsz; + u32 ctx_size = 0; + int ret; + + if (cmd != VFIO_MIG_GET_PRECOPY_INFO) + return -ENOTTY; + + minsz = offsetofend(struct vfio_precopy_info, dirty_bytes); + if (copy_from_user(&info, (void __user *)arg, minsz)) + return -EFAULT; + + if (info.argsz < minsz) + return -EINVAL; + + mutex_lock(&virtvdev->state_mutex); + if (virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY && + virtvdev->mig_state != VFIO_DEVICE_STATE_PRE_COPY_P2P) { + ret = -EINVAL; + goto err_state_unlock; + } + + /* + * The virtio specification does not include a PRE_COPY concept. + * Since we can expect the data to remain the same for a certain period, + * we use a rate limiter mechanism before making a call to the device. + */ + if (__ratelimit(&migf->pre_copy_rl_state)) { + + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &ctx_size); + if (ret) + goto err_state_unlock; + } + + mutex_lock(&migf->lock); + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { + ret = -ENODEV; + goto err_migf_unlock; + } + + if (migf->pre_copy_initial_bytes > *pos) { + info.initial_bytes = migf->pre_copy_initial_bytes - *pos; + } else { + info.dirty_bytes = migf->max_pos - *pos; + if (!info.dirty_bytes) + end_of_data = true; + info.dirty_bytes += ctx_size; + } + + if (!end_of_data || !ctx_size) { + mutex_unlock(&migf->lock); + goto done; + } + + mutex_unlock(&migf->lock); + /* + * We have finished transferring the current state, and the device has + * additional dirty state; read a new chunk. + */ + ret = virtiovf_read_device_context_chunk(migf, ctx_size); + if (ret) + /* + * The machine is running, and the context size could still grow, so there + * is no reason to mark the device state as VIRTIOVF_MIGF_STATE_ERROR.
+ */ + goto err_state_unlock; + +done: + virtiovf_state_mutex_unlock(virtvdev); + if (copy_to_user((void __user *)arg, &info, minsz)) + return -EFAULT; + return 0; + +err_migf_unlock: + mutex_unlock(&migf->lock); +err_state_unlock: + virtiovf_state_mutex_unlock(virtvdev); + return ret; +} + +static const struct file_operations virtiovf_save_fops = { + .owner = THIS_MODULE, + .read = virtiovf_save_read, + .unlocked_ioctl = virtiovf_precopy_ioctl, + .compat_ioctl = compat_ptr_ioctl, + .release = virtiovf_release_file, +}; + +static int +virtiovf_add_buf_header(struct virtiovf_data_buffer *header_buf, + u32 data_size) +{ + struct virtiovf_migration_file *migf = header_buf->migf; + struct virtiovf_migration_header header = {}; + struct page *page; + u8 *to_buff; + + header.record_size = cpu_to_le64(data_size); + header.flags = cpu_to_le32(VIRTIOVF_MIGF_HEADER_FLAGS_TAG_MANDATORY); + header.tag = cpu_to_le32(VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA); + page = virtiovf_get_migration_page(header_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &header, sizeof(header)); + kunmap_local(to_buff); + header_buf->length = sizeof(header); + header_buf->start_pos = header_buf->migf->max_pos; + migf->max_pos += header_buf->length; + spin_lock_irq(&migf->list_lock); + list_add_tail(&header_buf->buf_elm, &migf->buf_list); + spin_unlock_irq(&migf->list_lock); + return 0; +} + +static int +virtiovf_read_device_context_chunk(struct virtiovf_migration_file *migf, + u32 ctx_size) +{ + struct virtiovf_data_buffer *header_buf; + struct virtiovf_data_buffer *buf; + bool unmark_end = false; + struct scatterlist *sg; + unsigned int i; + u32 res_size; + int nent; + int ret; + + buf = virtiovf_get_data_buffer(migf, ctx_size); + if (IS_ERR(buf)) + return PTR_ERR(buf); + + /* Find the number of SG entries needed to satisfy the size */ + nent = sg_nents_for_len(buf->table.sgt.sgl, ctx_size); + if (nent <= 0) { + ret = -EINVAL; + goto out; + } + + /* + * Iterate to that SG entry and mark it as last (if it's not already) + * to let the underlying layers iterate only up to that entry.
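+ * For example, if ctx_size is covered by the first three of eight
+ * entries, the third entry is temporarily marked as the list end and
+ * restored once the admin command completes.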
+ */ + for_each_sg(buf->table.sgt.sgl, sg, nent - 1, i) + ; + + if (!sg_is_last(sg)) { + unmark_end = true; + sg_mark_end(sg); + } + + ret = virtio_pci_admin_dev_parts_get(migf->virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, + migf->obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL, + buf->table.sgt.sgl, &res_size); + /* Restore the original SG mark end */ + if (unmark_end) + sg_unmark_end(sg); + if (ret) + goto out; + + buf->length = res_size; + header_buf = virtiovf_get_data_buffer(migf, + sizeof(struct virtiovf_migration_header)); + if (IS_ERR(header_buf)) { + ret = PTR_ERR(header_buf); + goto out; + } + + ret = virtiovf_add_buf_header(header_buf, res_size); + if (ret) + goto out_header; + + buf->start_pos = buf->migf->max_pos; + migf->max_pos += buf->length; + spin_lock(&migf->list_lock); + list_add_tail(&buf->buf_elm, &migf->buf_list); + spin_unlock_irq(&migf->list_lock); + return 0; + +out_header: + virtiovf_put_data_buffer(header_buf); +out: + virtiovf_put_data_buffer(buf); + return ret; +} + +static int +virtiovf_pci_save_device_final_data(struct virtiovf_pci_core_device *virtvdev) +{ + struct virtiovf_migration_file *migf = virtvdev->saving_migf; + u32 ctx_size; + int ret; + + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) + return -ENODEV; + + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, migf->obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &ctx_size); + if (ret) + goto err; + + if (!ctx_size) { + ret = -EINVAL; + goto err; + } + + ret = virtiovf_read_device_context_chunk(migf, ctx_size); + if (ret) + goto err; + + migf->state = VIRTIOVF_MIGF_STATE_COMPLETE; + return 0; + +err: + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + return ret; +} + +static struct virtiovf_migration_file * +virtiovf_pci_save_device_data(struct virtiovf_pci_core_device *virtvdev, + bool pre_copy) +{ + struct virtiovf_migration_file *migf; + u32 ctx_size; + u32 obj_id; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_save_fops, migf, + O_RDONLY); + if (IS_ERR(migf->filp)) { + ret = PTR_ERR(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); + migf->virtvdev = virtvdev; + + lockdep_assert_held(&virtvdev->state_mutex); + ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET, + &obj_id); + if (ret) + goto out; + + migf->obj_id = obj_id; + /* Mark as having a valid obj id which can be even 0 */ + migf->has_obj_id = true; + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &ctx_size); + if (ret) + goto out_clean; + + if (!ctx_size) { + ret = -EINVAL; + goto out_clean; + } + + ret = virtiovf_read_device_context_chunk(migf, ctx_size); + if (ret) + goto out_clean; + + if (pre_copy) { + migf->pre_copy_initial_bytes = migf->max_pos; + /* Arbitrarily set the pre-copy rate limit to 1-second intervals */ + ratelimit_state_init(&migf->pre_copy_rl_state, 1 * HZ, 1); + /* Prevent any rate messages upon its usage */ + ratelimit_set_flags(&migf->pre_copy_rl_state, + RATELIMIT_MSG_ON_RELEASE); + migf->state = VIRTIOVF_MIGF_STATE_PRECOPY; + } else { + migf->state = VIRTIOVF_MIGF_STATE_COMPLETE; + } + + 
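+ /*
+  * At this point the migration file already holds an initial device
+  * context chunk. In PRE_COPY the rate limiter above throttles the extra
+  * metadata queries issued from virtiovf_precopy_ioctl(); when invoked
+  * for STOP_COPY the saved data is already complete and ready to be read
+  * by user space.
+  */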
return migf; + +out_clean: + virtiovf_clean_migf_resources(migf); +out: + fput(migf->filp); + return ERR_PTR(ret); +} + +/* + * Set the required object header at the beginning of the buffer. + * The actual device parts data will be written post of the header offset. + */ +static int virtiovf_set_obj_cmd_header(struct virtiovf_data_buffer *vhca_buf) +{ + struct virtio_admin_cmd_resource_obj_cmd_hdr obj_hdr = {}; + struct page *page; + u8 *to_buff; + + obj_hdr.type = cpu_to_le16(VIRTIO_RESOURCE_OBJ_DEV_PARTS); + obj_hdr.id = cpu_to_le32(vhca_buf->migf->obj_id); + page = virtiovf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + memcpy(to_buff, &obj_hdr, sizeof(obj_hdr)); + kunmap_local(to_buff); + + /* Mark the buffer as including the header object data */ + vhca_buf->include_header_object = 1; + return 0; +} + +static int +virtiovf_append_page_to_mig_buf(struct virtiovf_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + unsigned long offset; + size_t page_offset; + struct page *page; + size_t page_len; + u8 *to_buff; + int ret; + + offset = *pos - vhca_buf->start_pos; + + if (vhca_buf->include_header_object) + /* The buffer holds the object header, update the offset accordingly */ + offset += sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr); + + page_offset = offset % PAGE_SIZE; + + page = virtiovf_get_migration_page(vhca_buf, offset - page_offset); + if (!page) + return -EINVAL; + + page_len = min_t(size_t, *len, PAGE_SIZE - page_offset); + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + page_offset, *buf, page_len); + kunmap_local(to_buff); + if (ret) + return -EFAULT; + + *pos += page_len; + *done += page_len; + *buf += page_len; + *len -= page_len; + vhca_buf->length += page_len; + return 0; +} + +static ssize_t +virtiovf_resume_read_chunk(struct virtiovf_migration_file *migf, + struct virtiovf_data_buffer *vhca_buf, + size_t chunk_size, const char __user **buf, + size_t *len, loff_t *pos, ssize_t *done, + bool *has_work) +{ + size_t copy_len, to_copy; + int ret; + + to_copy = min_t(size_t, *len, chunk_size - vhca_buf->length); + copy_len = to_copy; + while (to_copy) { + ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, + pos, done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == chunk_size) { + migf->load_state = VIRTIOVF_LOAD_STATE_LOAD_CHUNK; + migf->max_pos += chunk_size; + *has_work = true; + } + + return 0; +} + +static int +virtiovf_resume_read_header_data(struct virtiovf_migration_file *migf, + struct virtiovf_data_buffer *vhca_buf, + const char __user **buf, size_t *len, + loff_t *pos, ssize_t *done) +{ + size_t copy_len, to_copy; + size_t required_data; + int ret; + + required_data = migf->record_size - vhca_buf->length; + to_copy = min_t(size_t, *len, required_data); + copy_len = to_copy; + while (to_copy) { + ret = virtiovf_append_page_to_mig_buf(vhca_buf, buf, &to_copy, + pos, done); + if (ret) + return ret; + } + + *len -= copy_len; + if (vhca_buf->length == migf->record_size) { + switch (migf->record_tag) { + default: + /* Optional tag */ + break; + } + + migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER; + migf->max_pos += migf->record_size; + vhca_buf->length = 0; + } + + return 0; +} + +static int +virtiovf_resume_read_header(struct virtiovf_migration_file *migf, + struct virtiovf_data_buffer *vhca_buf, + const char __user **buf, + size_t *len, loff_t *pos, + ssize_t *done, bool *has_work) +{ + struct page 
*page; + size_t copy_len; + u8 *to_buff; + int ret; + + copy_len = min_t(size_t, *len, + sizeof(struct virtiovf_migration_header) - vhca_buf->length); + page = virtiovf_get_migration_page(vhca_buf, 0); + if (!page) + return -EINVAL; + to_buff = kmap_local_page(page); + ret = copy_from_user(to_buff + vhca_buf->length, *buf, copy_len); + if (ret) { + ret = -EFAULT; + goto end; + } + + *buf += copy_len; + *pos += copy_len; + *done += copy_len; + *len -= copy_len; + vhca_buf->length += copy_len; + if (vhca_buf->length == sizeof(struct virtiovf_migration_header)) { + u64 record_size; + u32 flags; + + record_size = le64_to_cpup((__le64 *)to_buff); + if (record_size > MAX_LOAD_SIZE) { + ret = -ENOMEM; + goto end; + } + + migf->record_size = record_size; + flags = le32_to_cpup((__le32 *)(to_buff + + offsetof(struct virtiovf_migration_header, flags))); + migf->record_tag = le32_to_cpup((__le32 *)(to_buff + + offsetof(struct virtiovf_migration_header, tag))); + switch (migf->record_tag) { + case VIRTIOVF_MIGF_HEADER_TAG_DEVICE_DATA: + migf->load_state = VIRTIOVF_LOAD_STATE_PREP_CHUNK; + break; + default: + if (!(flags & VIRTIOVF_MIGF_HEADER_FLAGS_TAG_OPTIONAL)) { + ret = -EOPNOTSUPP; + goto end; + } + /* We may read and skip this optional record data */ + migf->load_state = VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA; + } + + migf->max_pos += vhca_buf->length; + vhca_buf->length = 0; + *has_work = true; + } +end: + kunmap_local(to_buff); + return ret; +} + +static ssize_t virtiovf_resume_write(struct file *filp, const char __user *buf, + size_t len, loff_t *pos) +{ + struct virtiovf_migration_file *migf = filp->private_data; + struct virtiovf_data_buffer *vhca_buf = migf->buf; + struct virtiovf_data_buffer *vhca_buf_header = migf->buf_header; + unsigned int orig_length; + bool has_work = false; + ssize_t done = 0; + int ret = 0; + + if (pos) + return -ESPIPE; + + pos = &filp->f_pos; + if (*pos < vhca_buf->start_pos) + return -EINVAL; + + mutex_lock(&migf->virtvdev->state_mutex); + mutex_lock(&migf->lock); + if (migf->state == VIRTIOVF_MIGF_STATE_ERROR) { + done = -ENODEV; + goto out_unlock; + } + + while (len || has_work) { + has_work = false; + switch (migf->load_state) { + case VIRTIOVF_LOAD_STATE_READ_HEADER: + ret = virtiovf_resume_read_header(migf, vhca_buf_header, &buf, + &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case VIRTIOVF_LOAD_STATE_PREP_HEADER_DATA: + if (vhca_buf_header->allocated_length < migf->record_size) { + virtiovf_free_data_buffer(vhca_buf_header); + + migf->buf_header = virtiovf_alloc_data_buffer(migf, + migf->record_size); + if (IS_ERR(migf->buf_header)) { + ret = PTR_ERR(migf->buf_header); + migf->buf_header = NULL; + goto out_unlock; + } + + vhca_buf_header = migf->buf_header; + } + + vhca_buf_header->start_pos = migf->max_pos; + migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER_DATA; + break; + case VIRTIOVF_LOAD_STATE_READ_HEADER_DATA: + ret = virtiovf_resume_read_header_data(migf, vhca_buf_header, + &buf, &len, pos, &done); + if (ret) + goto out_unlock; + break; + case VIRTIOVF_LOAD_STATE_PREP_CHUNK: + { + u32 cmd_size = migf->record_size + + sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr); + + /* + * The DMA map/unmap is managed in virtio layer, we just need to extend + * the SG pages to hold the extra required chunk data. 
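+ * For reference, device data records flow through READ_HEADER ->
+ * PREP_CHUNK -> READ_CHUNK -> LOAD_CHUNK and back to READ_HEADER, while
+ * optional records are consumed via PREP_HEADER_DATA/READ_HEADER_DATA.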
+ */ + if (vhca_buf->allocated_length < cmd_size) { + ret = virtiovf_add_migration_pages(vhca_buf, + DIV_ROUND_UP_ULL(cmd_size - vhca_buf->allocated_length, + PAGE_SIZE)); + if (ret) + goto out_unlock; + } + + vhca_buf->start_pos = migf->max_pos; + migf->load_state = VIRTIOVF_LOAD_STATE_READ_CHUNK; + break; + } + case VIRTIOVF_LOAD_STATE_READ_CHUNK: + ret = virtiovf_resume_read_chunk(migf, vhca_buf, migf->record_size, + &buf, &len, pos, &done, &has_work); + if (ret) + goto out_unlock; + break; + case VIRTIOVF_LOAD_STATE_LOAD_CHUNK: + /* Mark the last SG entry and set its length */ + sg_mark_end(vhca_buf->last_offset_sg); + orig_length = vhca_buf->last_offset_sg->length; + /* Length should include the resource object command header */ + vhca_buf->last_offset_sg->length = vhca_buf->length + + sizeof(struct virtio_admin_cmd_resource_obj_cmd_hdr) - + vhca_buf->last_offset; + ret = virtio_pci_admin_dev_parts_set(migf->virtvdev->core_device.pdev, + vhca_buf->table.sgt.sgl); + /* Restore the original SG data */ + vhca_buf->last_offset_sg->length = orig_length; + sg_unmark_end(vhca_buf->last_offset_sg); + if (ret) + goto out_unlock; + migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER; + /* be ready for reading the next chunk */ + vhca_buf->length = 0; + break; + default: + break; + } + } + +out_unlock: + if (ret) + migf->state = VIRTIOVF_MIGF_STATE_ERROR; + mutex_unlock(&migf->lock); + virtiovf_state_mutex_unlock(migf->virtvdev); + return ret ? ret : done; +} + +static const struct file_operations virtiovf_resume_fops = { + .owner = THIS_MODULE, + .write = virtiovf_resume_write, + .release = virtiovf_release_file, +}; + +static struct virtiovf_migration_file * +virtiovf_pci_resume_device_data(struct virtiovf_pci_core_device *virtvdev) +{ + struct virtiovf_migration_file *migf; + struct virtiovf_data_buffer *buf; + u32 obj_id; + int ret; + + migf = kzalloc(sizeof(*migf), GFP_KERNEL_ACCOUNT); + if (!migf) + return ERR_PTR(-ENOMEM); + + migf->filp = anon_inode_getfile("virtiovf_mig", &virtiovf_resume_fops, migf, + O_WRONLY); + if (IS_ERR(migf->filp)) { + ret = PTR_ERR(migf->filp); + kfree(migf); + return ERR_PTR(ret); + } + + stream_open(migf->filp->f_inode, migf->filp); + mutex_init(&migf->lock); + INIT_LIST_HEAD(&migf->buf_list); + INIT_LIST_HEAD(&migf->avail_list); + spin_lock_init(&migf->list_lock); + + buf = virtiovf_alloc_data_buffer(migf, VIRTIOVF_TARGET_INITIAL_BUF_SIZE); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out; + } + + migf->buf = buf; + + buf = virtiovf_alloc_data_buffer(migf, + sizeof(struct virtiovf_migration_header)); + if (IS_ERR(buf)) { + ret = PTR_ERR(buf); + goto out_clean; + } + + migf->buf_header = buf; + migf->load_state = VIRTIOVF_LOAD_STATE_READ_HEADER; + + migf->virtvdev = virtvdev; + ret = virtiovf_pci_alloc_obj_id(virtvdev, VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET, + &obj_id); + if (ret) + goto out_clean; + + migf->obj_id = obj_id; + /* Mark as having a valid obj id which can be even 0 */ + migf->has_obj_id = true; + ret = virtiovf_set_obj_cmd_header(migf->buf); + if (ret) + goto out_clean; + + return migf; + +out_clean: + virtiovf_clean_migf_resources(migf); +out: + fput(migf->filp); + return ERR_PTR(ret); +} + +static struct file * +virtiovf_pci_step_device_state_locked(struct virtiovf_pci_core_device *virtvdev, + u32 new) +{ + u32 cur = virtvdev->mig_state; + int ret; + + if (cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_STOP) { + /* NOP */ + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RUNNING_P2P) { + 
/* NOP */ + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_RUNNING_P2P) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, + BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED)); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING_P2P && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_PRE_COPY)) { + ret = virtio_pci_admin_mode_set(virtvdev->core_device.pdev, 0); + if (ret) + return ERR_PTR(ret); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_STOP_COPY) { + struct virtiovf_migration_file *migf; + + migf = virtiovf_pci_save_device_data(virtvdev, false); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + virtvdev->saving_migf = migf; + return migf->filp; + } + + if ((cur == VFIO_DEVICE_STATE_STOP_COPY && new == VFIO_DEVICE_STATE_STOP) || + (cur == VFIO_DEVICE_STATE_PRE_COPY && new == VFIO_DEVICE_STATE_RUNNING) || + (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_RUNNING_P2P)) { + virtiovf_disable_fds(virtvdev); + return NULL; + } + + if (cur == VFIO_DEVICE_STATE_STOP && new == VFIO_DEVICE_STATE_RESUMING) { + struct virtiovf_migration_file *migf; + + migf = virtiovf_pci_resume_device_data(virtvdev); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + virtvdev->resuming_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_RESUMING && new == VFIO_DEVICE_STATE_STOP) { + virtiovf_disable_fds(virtvdev); + return NULL; + } + + if ((cur == VFIO_DEVICE_STATE_RUNNING && new == VFIO_DEVICE_STATE_PRE_COPY) || + (cur == VFIO_DEVICE_STATE_RUNNING_P2P && + new == VFIO_DEVICE_STATE_PRE_COPY_P2P)) { + struct virtiovf_migration_file *migf; + + migf = virtiovf_pci_save_device_data(virtvdev, true); + if (IS_ERR(migf)) + return ERR_CAST(migf); + get_file(migf->filp); + virtvdev->saving_migf = migf; + return migf->filp; + } + + if (cur == VFIO_DEVICE_STATE_PRE_COPY_P2P && new == VFIO_DEVICE_STATE_STOP_COPY) { + ret = virtiovf_pci_save_device_final_data(virtvdev); + return ret ? 
ERR_PTR(ret) : NULL; + } + + /* + * vfio_mig_get_next_state() does not use arcs other than the above + */ + WARN_ON(true); + return ERR_PTR(-EINVAL); +} + +static struct file * +virtiovf_pci_set_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state new_state) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + vdev, struct virtiovf_pci_core_device, core_device.vdev); + enum vfio_device_mig_state next_state; + struct file *res = NULL; + int ret; + + mutex_lock(&virtvdev->state_mutex); + while (new_state != virtvdev->mig_state) { + ret = vfio_mig_get_next_state(vdev, virtvdev->mig_state, + new_state, &next_state); + if (ret) { + res = ERR_PTR(ret); + break; + } + res = virtiovf_pci_step_device_state_locked(virtvdev, next_state); + if (IS_ERR(res)) + break; + virtvdev->mig_state = next_state; + if (WARN_ON(res && new_state != virtvdev->mig_state)) { + fput(res); + res = ERR_PTR(-EINVAL); + break; + } + } + virtiovf_state_mutex_unlock(virtvdev); + return res; +} + +static int virtiovf_pci_get_device_state(struct vfio_device *vdev, + enum vfio_device_mig_state *curr_state) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + vdev, struct virtiovf_pci_core_device, core_device.vdev); + + mutex_lock(&virtvdev->state_mutex); + *curr_state = virtvdev->mig_state; + virtiovf_state_mutex_unlock(virtvdev); + return 0; +} + +static int virtiovf_pci_get_data_size(struct vfio_device *vdev, + unsigned long *stop_copy_length) +{ + struct virtiovf_pci_core_device *virtvdev = container_of( + vdev, struct virtiovf_pci_core_device, core_device.vdev); + bool obj_id_exists; + u32 res_size; + u32 obj_id; + int ret; + + mutex_lock(&virtvdev->state_mutex); + obj_id_exists = virtvdev->saving_migf && virtvdev->saving_migf->has_obj_id; + if (!obj_id_exists) { + ret = virtiovf_pci_alloc_obj_id(virtvdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET, + &obj_id); + if (ret) + goto end; + } else { + obj_id = virtvdev->saving_migf->obj_id; + } + + ret = virtio_pci_admin_dev_parts_metadata_get(virtvdev->core_device.pdev, + VIRTIO_RESOURCE_OBJ_DEV_PARTS, obj_id, + VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE, + &res_size); + if (!ret) + *stop_copy_length = res_size; + + /* + * We can't leave this obj_id alive if didn't exist before, otherwise, it might + * stay alive, even without an active migration flow (e.g. 
migration was cancelled) + */ + if (!obj_id_exists) + virtiovf_pci_free_obj_id(virtvdev, obj_id); +end: + virtiovf_state_mutex_unlock(virtvdev); + return ret; +} + +static const struct vfio_migration_ops virtvdev_pci_mig_ops = { + .migration_set_state = virtiovf_pci_set_device_state, + .migration_get_state = virtiovf_pci_get_device_state, + .migration_get_data_size = virtiovf_pci_get_data_size, +}; + +void virtiovf_set_migratable(struct virtiovf_pci_core_device *virtvdev) +{ + virtvdev->migrate_cap = 1; + mutex_init(&virtvdev->state_mutex); + spin_lock_init(&virtvdev->reset_lock); + virtvdev->core_device.vdev.migration_flags = + VFIO_MIGRATION_STOP_COPY | + VFIO_MIGRATION_P2P | + VFIO_MIGRATION_PRE_COPY; + virtvdev->core_device.vdev.mig_ops = &virtvdev_pci_mig_ops; +} + +void virtiovf_open_migration(struct virtiovf_pci_core_device *virtvdev) +{ + if (!virtvdev->migrate_cap) + return; + + virtvdev->mig_state = VFIO_DEVICE_STATE_RUNNING; +} + +void virtiovf_close_migration(struct virtiovf_pci_core_device *virtvdev) +{ + if (!virtvdev->migrate_cap) + return; + + virtiovf_disable_fds(virtvdev); +} diff --git a/drivers/virtio/virtio_pci_common.h b/drivers/virtio/virtio_pci_common.h index 8beecf23ec85..8cd01de27baf 100644 --- a/drivers/virtio/virtio_pci_common.h +++ b/drivers/virtio/virtio_pci_common.h @@ -48,6 +48,9 @@ struct virtio_pci_admin_vq { /* Protects virtqueue access. */ spinlock_t lock; u64 supported_cmds; + u64 supported_caps; + u8 max_dev_parts_objects; + struct ida dev_parts_ida; /* Name of the admin queue: avq.$vq_index. */ char name[10]; u16 vq_index; @@ -167,15 +170,27 @@ struct virtio_device *virtio_pci_vf_get_pf_dev(struct pci_dev *pdev); BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ) | \ BIT_ULL(VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO)) +#define VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP \ + (BIT_ULL(VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_DRIVER_CAP_SET) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_GET) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_DEV_PARTS_SET) | \ + BIT_ULL(VIRTIO_ADMIN_CMD_DEV_MODE_SET)) + /* Unlike modern drivers which support hardware virtio devices, legacy drivers * assume software-based devices: e.g. they don't use proper memory barriers * on ARM, use big endian on PPC, etc. X86 drivers are mostly ok though, more * or less by chance. For now, only support legacy IO on X86. 
*/ #ifdef CONFIG_VIRTIO_PCI_ADMIN_LEGACY -#define VIRTIO_ADMIN_CMD_BITMAP VIRTIO_LEGACY_ADMIN_CMD_BITMAP +#define VIRTIO_ADMIN_CMD_BITMAP (VIRTIO_LEGACY_ADMIN_CMD_BITMAP | \ + VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP) #else -#define VIRTIO_ADMIN_CMD_BITMAP 0 +#define VIRTIO_ADMIN_CMD_BITMAP VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP #endif bool vp_is_avq(struct virtio_device *vdev, unsigned int index); diff --git a/drivers/virtio/virtio_pci_modern.c b/drivers/virtio/virtio_pci_modern.c index 4fbcbc7a9ae1..5eaade757860 100644 --- a/drivers/virtio/virtio_pci_modern.c +++ b/drivers/virtio/virtio_pci_modern.c @@ -15,6 +15,7 @@ */ #include +#include #define VIRTIO_PCI_NO_LEGACY #define VIRTIO_RING_NO_LEGACY #include "virtio_pci_common.h" @@ -54,8 +55,10 @@ void vp_modern_avq_done(struct virtqueue *vq) spin_lock_irqsave(&admin_vq->lock, flags); do { virtqueue_disable_cb(vq); - while ((cmd = virtqueue_get_buf(vq, &len))) + while ((cmd = virtqueue_get_buf(vq, &len))) { + cmd->result_sg_size = len; complete(&cmd->completion); + } } while (!virtqueue_enable_cb(vq)); spin_unlock_irqrestore(&admin_vq->lock, flags); } @@ -218,12 +221,117 @@ end: kfree(data); } +static void +virtio_pci_admin_cmd_dev_parts_objects_enable(struct virtio_device *virtio_dev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(virtio_dev); + struct virtio_admin_cmd_cap_get_data *get_data; + struct virtio_admin_cmd_cap_set_data *set_data; + struct virtio_dev_parts_cap *result; + struct virtio_admin_cmd cmd = {}; + struct scatterlist result_sg; + struct scatterlist data_sg; + u8 resource_objects_limit; + u16 set_data_size; + int ret; + + get_data = kzalloc(sizeof(*get_data), GFP_KERNEL); + if (!get_data) + return; + + result = kzalloc(sizeof(*result), GFP_KERNEL); + if (!result) + goto end; + + get_data->id = cpu_to_le16(VIRTIO_DEV_PARTS_CAP); + sg_init_one(&data_sg, get_data, sizeof(*get_data)); + sg_init_one(&result_sg, result, sizeof(*result)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEVICE_CAP_GET); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.data_sg = &data_sg; + cmd.result_sg = &result_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (ret) + goto err_get; + + set_data_size = sizeof(*set_data) + sizeof(*result); + set_data = kzalloc(set_data_size, GFP_KERNEL); + if (!set_data) + goto err_get; + + set_data->id = cpu_to_le16(VIRTIO_DEV_PARTS_CAP); + + /* Set the limit to the minimum value between the GET and SET values + * supported by the device. Since the obj_id for VIRTIO_DEV_PARTS_CAP + * is a globally unique value per PF, there is no possibility of + * overlap between GET and SET operations. 
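+ * For example, if the device reports a GET limit of 4 objects and a SET
+ * limit of 2, the driver advertises and uses a limit of 2 for both
+ * directions.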
+ */ + resource_objects_limit = min(result->get_parts_resource_objects_limit, + result->set_parts_resource_objects_limit); + result->get_parts_resource_objects_limit = resource_objects_limit; + result->set_parts_resource_objects_limit = resource_objects_limit; + memcpy(set_data->cap_specific_data, result, sizeof(*result)); + sg_init_one(&data_sg, set_data, set_data_size); + cmd.data_sg = &data_sg; + cmd.result_sg = NULL; + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DRIVER_CAP_SET); + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (ret) + goto err_set; + + /* Allocate IDR to manage the dev caps objects */ + ida_init(&vp_dev->admin_vq.dev_parts_ida); + vp_dev->admin_vq.max_dev_parts_objects = resource_objects_limit; + +err_set: + kfree(set_data); +err_get: + kfree(result); +end: + kfree(get_data); +} + +static void virtio_pci_admin_cmd_cap_init(struct virtio_device *virtio_dev) +{ + struct virtio_pci_device *vp_dev = to_vp_device(virtio_dev); + struct virtio_admin_cmd_query_cap_id_result *data; + struct virtio_admin_cmd cmd = {}; + struct scatterlist result_sg; + int ret; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return; + + sg_init_one(&result_sg, data, sizeof(*data)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.result_sg = &result_sg; + + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (ret) + goto end; + + /* Max number of caps fits into a single u64 */ + BUILD_BUG_ON(sizeof(data->supported_caps) > sizeof(u64)); + + vp_dev->admin_vq.supported_caps = le64_to_cpu(data->supported_caps[0]); + + if (!(vp_dev->admin_vq.supported_caps & (1 << VIRTIO_DEV_PARTS_CAP))) + goto end; + + virtio_pci_admin_cmd_dev_parts_objects_enable(virtio_dev); +end: + kfree(data); +} + static void vp_modern_avq_activate(struct virtio_device *vdev) { if (!virtio_has_feature(vdev, VIRTIO_F_ADMIN_VQ)) return; virtio_pci_admin_cmd_list_init(vdev); + virtio_pci_admin_cmd_cap_init(vdev); } static void vp_modern_avq_cleanup(struct virtio_device *vdev) @@ -758,6 +866,353 @@ static bool vp_get_shm_region(struct virtio_device *vdev, return true; } +/* + * virtio_pci_admin_has_dev_parts - Checks whether the device parts + * functionality is supported + * @pdev: VF pci_dev + * + * Returns true on success. + */ +bool virtio_pci_admin_has_dev_parts(struct pci_dev *pdev) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_pci_device *vp_dev; + + if (!virtio_dev) + return false; + + if (!virtio_has_feature(virtio_dev, VIRTIO_F_ADMIN_VQ)) + return false; + + vp_dev = to_vp_device(virtio_dev); + + if (!((vp_dev->admin_vq.supported_cmds & VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP) == + VIRTIO_DEV_PARTS_ADMIN_CMD_BITMAP)) + return false; + + return vp_dev->admin_vq.max_dev_parts_objects; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_has_dev_parts); + +/* + * virtio_pci_admin_mode_set - Sets the mode of a member device + * @pdev: VF pci_dev + * @flags: device mode's flags + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. 
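+ * A typical call, as issued by the vfio-virtio live migration flow, stops
+ * the member device with
+ *	virtio_pci_admin_mode_set(pdev, BIT(VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED));
+ * and resumes it by passing 0 for @flags.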
+ */ +int virtio_pci_admin_mode_set(struct pci_dev *pdev, u8 flags) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_dev_mode_set_data *data; + struct virtio_admin_cmd cmd = {}; + struct scatterlist data_sg; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->flags = flags; + sg_init_one(&data_sg, data, sizeof(*data)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_MODE_SET); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_mode_set); + +/* + * virtio_pci_admin_obj_create - Creates an object for a given type and operation, + * following the max objects that can be created for that request. + * @pdev: VF pci_dev + * @obj_type: Object type + * @operation_type: Operation type + * @obj_id: Output unique object id + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_obj_create(struct pci_dev *pdev, u16 obj_type, u8 operation_type, + u32 *obj_id) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + u16 data_size = sizeof(struct virtio_admin_cmd_resource_obj_create_data); + struct virtio_admin_cmd_resource_obj_create_data *obj_create_data; + struct virtio_resource_obj_dev_parts obj_dev_parts = {}; + struct virtio_pci_admin_vq *avq; + struct virtio_admin_cmd cmd = {}; + struct scatterlist data_sg; + void *data; + int id = -1; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + if (obj_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS) + return -EOPNOTSUPP; + + if (operation_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET && + operation_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET) + return -EINVAL; + + avq = &to_vp_device(virtio_dev)->admin_vq; + if (!avq->max_dev_parts_objects) + return -EOPNOTSUPP; + + id = ida_alloc_range(&avq->dev_parts_ida, 0, + avq->max_dev_parts_objects - 1, GFP_KERNEL); + if (id < 0) + return id; + + *obj_id = id; + data_size += sizeof(obj_dev_parts); + data = kzalloc(data_size, GFP_KERNEL); + if (!data) { + ret = -ENOMEM; + goto end; + } + + obj_create_data = data; + obj_create_data->hdr.type = cpu_to_le16(obj_type); + obj_create_data->hdr.id = cpu_to_le32(*obj_id); + obj_dev_parts.type = operation_type; + memcpy(obj_create_data->resource_obj_specific_data, &obj_dev_parts, + sizeof(obj_dev_parts)); + sg_init_one(&data_sg, data, data_size); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + + kfree(data); +end: + if (ret) + ida_free(&avq->dev_parts_ida, id); + + return ret; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_obj_create); + +/* + * virtio_pci_admin_obj_destroy - Destroys an object of a given type and id + * @pdev: VF pci_dev + * @obj_type: Object type + * @id: Object id + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. 
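+ * On success the object id is returned to the internal IDA; the vfio-virtio
+ * migration driver destroys its device parts object when the corresponding
+ * migration file is cleaned up.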
+ */ +int virtio_pci_admin_obj_destroy(struct pci_dev *pdev, u16 obj_type, u32 id) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_resource_obj_cmd_hdr *data; + struct virtio_pci_device *vp_dev; + struct virtio_admin_cmd cmd = {}; + struct scatterlist data_sg; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + if (obj_type != VIRTIO_RESOURCE_OBJ_DEV_PARTS) + return -EINVAL; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->type = cpu_to_le16(obj_type); + data->id = cpu_to_le32(id); + sg_init_one(&data_sg, data, sizeof(*data)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (!ret) { + vp_dev = to_vp_device(virtio_dev); + ida_free(&vp_dev->admin_vq.dev_parts_ida, id); + } + + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_obj_destroy); + +/* + * virtio_pci_admin_dev_parts_metadata_get - Gets the metadata of the device parts + * identified by the below attributes. + * @pdev: VF pci_dev + * @obj_type: Object type + * @id: Object id + * @metadata_type: Metadata type + * @out: Upon success holds the output for 'metadata type size' + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_dev_parts_metadata_get(struct pci_dev *pdev, u16 obj_type, + u32 id, u8 metadata_type, u32 *out) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_dev_parts_metadata_result *result; + struct virtio_admin_cmd_dev_parts_metadata_data *data; + struct scatterlist data_sg, result_sg; + struct virtio_admin_cmd cmd = {}; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + if (metadata_type != VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE) + return -EOPNOTSUPP; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + result = kzalloc(sizeof(*result), GFP_KERNEL); + if (!result) { + ret = -ENOMEM; + goto end; + } + + data->hdr.type = cpu_to_le16(obj_type); + data->hdr.id = cpu_to_le32(id); + data->type = metadata_type; + sg_init_one(&data_sg, data, sizeof(*data)); + sg_init_one(&result_sg, result, sizeof(*result)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + cmd.result_sg = &result_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (!ret) + *out = le32_to_cpu(result->parts_size.size); + + kfree(result); +end: + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_metadata_get); + +/* + * virtio_pci_admin_dev_parts_get - Gets the device parts identified by the below attributes. + * @pdev: VF pci_dev + * @obj_type: Object type + * @id: Object id + * @get_type: Get type + * @res_sg: Upon success holds the output result data + * @res_size: Upon success holds the output result size + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. 
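+ * In the migration flow @res_sg is sized from a preceding
+ * virtio_pci_admin_dev_parts_metadata_get(..., TYPE_SIZE) query, and on
+ * return @res_size carries the number of bytes the device actually wrote,
+ * taken from the admin command's result_sg_size.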
+ */ +int virtio_pci_admin_dev_parts_get(struct pci_dev *pdev, u16 obj_type, u32 id, + u8 get_type, struct scatterlist *res_sg, + u32 *res_size) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd_dev_parts_get_data *data; + struct scatterlist data_sg; + struct virtio_admin_cmd cmd = {}; + int vf_id; + int ret; + + if (!virtio_dev) + return -ENODEV; + + if (get_type != VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL) + return -EOPNOTSUPP; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return -ENOMEM; + + data->hdr.type = cpu_to_le16(obj_type); + data->hdr.id = cpu_to_le32(id); + data->type = get_type; + sg_init_one(&data_sg, data, sizeof(*data)); + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_GET); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = &data_sg; + cmd.result_sg = res_sg; + ret = vp_modern_admin_cmd_exec(virtio_dev, &cmd); + if (!ret) + *res_size = cmd.result_sg_size; + + kfree(data); + return ret; +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_get); + +/* + * virtio_pci_admin_dev_parts_set - Sets the device parts identified by the below attributes. + * @pdev: VF pci_dev + * @data_sg: The device parts data, its layout follows struct virtio_admin_cmd_dev_parts_set_data + * + * Note: caller must serialize access for the given device. + * Returns 0 on success, or negative on failure. + */ +int virtio_pci_admin_dev_parts_set(struct pci_dev *pdev, struct scatterlist *data_sg) +{ + struct virtio_device *virtio_dev = virtio_pci_vf_get_pf_dev(pdev); + struct virtio_admin_cmd cmd = {}; + int vf_id; + + if (!virtio_dev) + return -ENODEV; + + vf_id = pci_iov_vf_id(pdev); + if (vf_id < 0) + return vf_id; + + cmd.opcode = cpu_to_le16(VIRTIO_ADMIN_CMD_DEV_PARTS_SET); + cmd.group_type = cpu_to_le16(VIRTIO_ADMIN_GROUP_TYPE_SRIOV); + cmd.group_member_id = cpu_to_le64(vf_id + 1); + cmd.data_sg = data_sg; + return vp_modern_admin_cmd_exec(virtio_dev, &cmd); +} +EXPORT_SYMBOL_GPL(virtio_pci_admin_dev_parts_set); + static const struct virtio_config_ops virtio_pci_config_nodev_ops = { .get = NULL, .set = NULL, diff --git a/include/linux/virtio.h b/include/linux/virtio.h index 338e0f5efb4b..57cc4b07fd17 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -120,6 +120,7 @@ struct virtio_admin_cmd { struct scatterlist *data_sg; struct scatterlist *result_sg; struct completion completion; + u32 result_sg_size; int ret; }; diff --git a/include/linux/virtio_pci_admin.h b/include/linux/virtio_pci_admin.h index f4a100a0fe2e..dffc92c17ad2 100644 --- a/include/linux/virtio_pci_admin.h +++ b/include/linux/virtio_pci_admin.h @@ -20,4 +20,15 @@ int virtio_pci_admin_legacy_io_notify_info(struct pci_dev *pdev, u64 *bar_offset); #endif +bool virtio_pci_admin_has_dev_parts(struct pci_dev *pdev); +int virtio_pci_admin_mode_set(struct pci_dev *pdev, u8 mode); +int virtio_pci_admin_obj_create(struct pci_dev *pdev, u16 obj_type, u8 operation_type, + u32 *obj_id); +int virtio_pci_admin_obj_destroy(struct pci_dev *pdev, u16 obj_type, u32 id); +int virtio_pci_admin_dev_parts_metadata_get(struct pci_dev *pdev, u16 obj_type, + u32 id, u8 metadata_type, u32 *out); +int virtio_pci_admin_dev_parts_get(struct pci_dev *pdev, u16 obj_type, u32 id, + u8 get_type, struct scatterlist *res_sg, u32 *res_size); +int virtio_pci_admin_dev_parts_set(struct pci_dev *pdev, struct scatterlist *data_sg); + #endif /* 
_LINUX_VIRTIO_PCI_ADMIN_H */ diff --git a/include/uapi/linux/virtio_pci.h b/include/uapi/linux/virtio_pci.h index a8208492e822..1beb317df1b9 100644 --- a/include/uapi/linux/virtio_pci.h +++ b/include/uapi/linux/virtio_pci.h @@ -40,6 +40,7 @@ #define _LINUX_VIRTIO_PCI_H #include +#include #ifndef VIRTIO_PCI_NO_LEGACY @@ -240,6 +241,17 @@ struct virtio_pci_cfg_cap { #define VIRTIO_ADMIN_CMD_LEGACY_DEV_CFG_READ 0x5 #define VIRTIO_ADMIN_CMD_LEGACY_NOTIFY_INFO 0x6 +/* Device parts access commands. */ +#define VIRTIO_ADMIN_CMD_CAP_ID_LIST_QUERY 0x7 +#define VIRTIO_ADMIN_CMD_DEVICE_CAP_GET 0x8 +#define VIRTIO_ADMIN_CMD_DRIVER_CAP_SET 0x9 +#define VIRTIO_ADMIN_CMD_RESOURCE_OBJ_CREATE 0xa +#define VIRTIO_ADMIN_CMD_RESOURCE_OBJ_DESTROY 0xd +#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_GET 0xe +#define VIRTIO_ADMIN_CMD_DEV_PARTS_GET 0xf +#define VIRTIO_ADMIN_CMD_DEV_PARTS_SET 0x10 +#define VIRTIO_ADMIN_CMD_DEV_MODE_SET 0x11 + struct virtio_admin_cmd_hdr { __le16 opcode; /* @@ -286,4 +298,123 @@ struct virtio_admin_cmd_notify_info_result { struct virtio_admin_cmd_notify_info_data entries[VIRTIO_ADMIN_CMD_MAX_NOTIFY_INFO]; }; +#define VIRTIO_DEV_PARTS_CAP 0x0000 + +struct virtio_dev_parts_cap { + __u8 get_parts_resource_objects_limit; + __u8 set_parts_resource_objects_limit; +}; + +#define MAX_CAP_ID __KERNEL_DIV_ROUND_UP(VIRTIO_DEV_PARTS_CAP + 1, 64) + +struct virtio_admin_cmd_query_cap_id_result { + __le64 supported_caps[MAX_CAP_ID]; +}; + +struct virtio_admin_cmd_cap_get_data { + __le16 id; + __u8 reserved[6]; +}; + +struct virtio_admin_cmd_cap_set_data { + __le16 id; + __u8 reserved[6]; + __u8 cap_specific_data[]; +}; + +struct virtio_admin_cmd_resource_obj_cmd_hdr { + __le16 type; + __u8 reserved[2]; + __le32 id; /* Indicates unique resource object id per resource object type */ +}; + +struct virtio_admin_cmd_resource_obj_create_data { + struct virtio_admin_cmd_resource_obj_cmd_hdr hdr; + __le64 flags; + __u8 resource_obj_specific_data[]; +}; + +#define VIRTIO_RESOURCE_OBJ_DEV_PARTS 0 + +#define VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_GET 0 +#define VIRTIO_RESOURCE_OBJ_DEV_PARTS_TYPE_SET 1 + +struct virtio_resource_obj_dev_parts { + __u8 type; + __u8 reserved[7]; +}; + +#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_SIZE 0 +#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_COUNT 1 +#define VIRTIO_ADMIN_CMD_DEV_PARTS_METADATA_TYPE_LIST 2 + +struct virtio_admin_cmd_dev_parts_metadata_data { + struct virtio_admin_cmd_resource_obj_cmd_hdr hdr; + __u8 type; + __u8 reserved[7]; +}; + +#define VIRTIO_DEV_PART_F_OPTIONAL 0 + +struct virtio_dev_part_hdr { + __le16 part_type; + __u8 flags; + __u8 reserved; + union { + struct { + __le32 offset; + __le32 reserved; + } pci_common_cfg; + struct { + __le16 index; + __u8 reserved[6]; + } vq_index; + } selector; + __le32 length; +}; + +struct virtio_dev_part { + struct virtio_dev_part_hdr hdr; + __u8 value[]; +}; + +struct virtio_admin_cmd_dev_parts_metadata_result { + union { + struct { + __le32 size; + __le32 reserved; + } parts_size; + struct { + __le32 count; + __le32 reserved; + } hdr_list_count; + struct { + __le32 count; + __le32 reserved; + struct virtio_dev_part_hdr hdrs[]; + } hdr_list; + }; +}; + +#define VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_SELECTED 0 +#define VIRTIO_ADMIN_CMD_DEV_PARTS_GET_TYPE_ALL 1 + +struct virtio_admin_cmd_dev_parts_get_data { + struct virtio_admin_cmd_resource_obj_cmd_hdr hdr; + __u8 type; + __u8 reserved[7]; + struct virtio_dev_part_hdr hdr_list[]; +}; + +struct virtio_admin_cmd_dev_parts_set_data { + struct 
virtio_admin_cmd_resource_obj_cmd_hdr hdr; + struct virtio_dev_part parts[]; +}; + +#define VIRTIO_ADMIN_CMD_DEV_MODE_F_STOPPED 0 + +struct virtio_admin_cmd_dev_mode_set_data { + __u8 flags; +}; + #endif diff --git a/virt/kvm/vfio.c b/virt/kvm/vfio.c index 72aa1fdeb699..196a102e34fb 100644 --- a/virt/kvm/vfio.c +++ b/virt/kvm/vfio.c @@ -347,7 +347,7 @@ static void kvm_vfio_release(struct kvm_device *dev) static int kvm_vfio_create(struct kvm_device *dev, u32 type); -static struct kvm_device_ops kvm_vfio_ops = { +static const struct kvm_device_ops kvm_vfio_ops = { .name = "kvm-vfio", .create = kvm_vfio_create, .release = kvm_vfio_release,