From b779260b097dec295e8798277ed08ec5ecd3b2c1 Mon Sep 17 00:00:00 2001 From: Joseph Cihula Date: Tue, 3 May 2011 00:08:37 -0700 Subject: [PATCH 01/46] intel-iommu: fix VT-d PMR disable for TXT on S3 resume This patch is a follow on to https://lkml.org/lkml/2011/3/21/239, which was merged as commit 51a63e67da6056c13b5b597dcc9e1b3bd7ceaa55. This patch adds support for S3, as pointed out by Chris Wright. Signed-off-by: Joseph Cihula Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 31 +++++++++++++++++++++++++------ 1 file changed, 25 insertions(+), 6 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index d552d2c77844..4e4e0202e59d 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -141,6 +141,12 @@ static struct intel_iommu **g_iommus; static void __init check_tylersburg_isoch(void); static int rwbf_quirk; +/* + * set to 1 to panic kernel if can't successfully enable VT-d + * (used when kernel is launched w/ TXT) + */ +static int force_on = 0; + /* * 0: Present * 1-11: Reserved @@ -2217,7 +2223,7 @@ static int __init iommu_prepare_static_identity_mapping(int hw) return 0; } -static int __init init_dmars(int force_on) +static int __init init_dmars(void) { struct dmar_drhd_unit *drhd; struct dmar_rmrr_unit *rmrr; @@ -3117,7 +3123,17 @@ static int init_iommu_hw(void) if (iommu->qi) dmar_reenable_qi(iommu); - for_each_active_iommu(iommu, drhd) { + for_each_iommu(iommu, drhd) { + if (drhd->ignored) { + /* + * we always have to disable PMRs or DMA may fail on + * this device + */ + if (force_on) + iommu_disable_protect_mem_regions(iommu); + continue; + } + iommu_flush_write_buffer(iommu); iommu_set_root_entry(iommu); @@ -3126,7 +3142,8 @@ static int init_iommu_hw(void) DMA_CCMD_GLOBAL_INVL); iommu->flush.flush_iotlb(iommu, 0, 0, 0, DMA_TLB_GLOBAL_FLUSH); - iommu_enable_translation(iommu); + if (iommu_enable_translation(iommu)) + return 1; iommu_disable_protect_mem_regions(iommu); } @@ -3193,7 +3210,10 @@ static void iommu_resume(void) unsigned long flag; if (init_iommu_hw()) { - WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); + if (force_on) + panic("tboot: IOMMU setup failed, DMAR can not resume!\n"); + else + WARN(1, "IOMMU setup failed, DMAR can not resume!\n"); return; } @@ -3270,7 +3290,6 @@ static struct notifier_block device_nb = { int __init intel_iommu_init(void) { int ret = 0; - int force_on = 0; /* VT-d is required for a TXT/tboot launch, so enforce that */ force_on = tboot_force_iommu(); @@ -3308,7 +3327,7 @@ int __init intel_iommu_init(void) init_no_remapping_devices(); - ret = init_dmars(force_on); + ret = init_dmars(); if (ret) { if (force_on) panic("tboot: Failed to initialize DMARs\n"); From b3a530e4e734cab7043a3ab7d135f539042443a7 Mon Sep 17 00:00:00 2001 From: Jan Kiszka Date: Sun, 15 May 2011 12:34:55 +0200 Subject: [PATCH 02/46] intel-iommu: Remove obsolete comment from detect_intel_iommu Since cacd4213d8, this comment no longer applies. Signed-off-by: Jan Kiszka Signed-off-by: David Woodhouse --- drivers/pci/dmar.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/drivers/pci/dmar.c b/drivers/pci/dmar.c index 12e02bf92c4a..3dc9befa5aec 100644 --- a/drivers/pci/dmar.c +++ b/drivers/pci/dmar.c @@ -698,12 +698,7 @@ int __init detect_intel_iommu(void) { #ifdef CONFIG_INTR_REMAP struct acpi_table_dmar *dmar; - /* - * for now we will disable dma-remapping when interrupt - * remapping is enabled. - * When support for queued invalidation for IOTLB invalidation - * is added, we will not need this any more. - */ + dmar = (struct acpi_table_dmar *) dmar_tbl; if (ret && cpu_has_x2apic && dmar->flags & 0x1) printk(KERN_INFO From 7b668357810ecb5fdda4418689d50f5d95aea6a8 Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 May 2011 12:02:41 +0100 Subject: [PATCH 03/46] intel-iommu: Flush unmaps at domain_exit We typically batch unmaps to be lazily flushed out at regular intervals. When we destroy a domain, we need to force a flush of these lazy unmaps to be sure none reference the domain we're about to free. Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=35062 Signed-off-by: Alex Williamson Signed-off-by: David Woodhouse Cc: stable@kernel.org --- drivers/pci/intel-iommu.c | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 4e4e0202e59d..395f253c0494 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -1422,6 +1422,10 @@ static void domain_exit(struct dmar_domain *domain) if (!domain) return; + /* Flush any lazy unmaps that may reference this domain */ + if (!intel_iommu_strict) + flush_unmaps_timeout(0); + domain_remove_dev_info(domain); /* destroy iovas */ put_iova_domain(&domain->iovad); From 89e1be50c68eb5e58b873dce87bbac627ee18d1f Mon Sep 17 00:00:00 2001 From: Steven Rostedt Date: Fri, 27 May 2011 23:11:24 -0400 Subject: [PATCH 04/46] x86: Put back -pg to tsc.o and add no GCOV to vread_tsc_64.o The commit 44259b1abfaa8bb819d25d41d71e8e33e25dd36a Author: Andy Lutomirski x86-64: Move vread_tsc into a new file with sensible options Removed the -pg from tsc.o which caused the function graph tracer to go into an infinite function call recursion as it uses the tsc internally outside its recursion protection, thus tracing the tsc breaks the function graph tracer. This commit also added the file vread_tsc_64.c that gets used by vdso but failed to prevent GCOV from monkeying with it, causing userspace to try to access kernel data when GCOV was enabled. Thanks to Thomas Gleixner for pointing out GCOV as the likely culprit that added strange kernel accesses into the vread_tsc() call. Cc: Author: Andy Lutomirski Cc: Thomas Gleixner Signed-off-by: Steven Rostedt --- arch/x86/kernel/Makefile | 2 ++ 1 file changed, 2 insertions(+) diff --git a/arch/x86/kernel/Makefile b/arch/x86/kernel/Makefile index f5abe3a245b8..90b06d4daee2 100644 --- a/arch/x86/kernel/Makefile +++ b/arch/x86/kernel/Makefile @@ -8,6 +8,7 @@ CPPFLAGS_vmlinux.lds += -U$(UTS_MACHINE) ifdef CONFIG_FUNCTION_TRACER # Do not profile debug and lowlevel utilities +CFLAGS_REMOVE_tsc.o = -pg CFLAGS_REMOVE_rtc.o = -pg CFLAGS_REMOVE_paravirt-spinlocks.o = -pg CFLAGS_REMOVE_pvclock.o = -pg @@ -28,6 +29,7 @@ CFLAGS_paravirt.o := $(nostackp) GCOV_PROFILE_vsyscall_64.o := n GCOV_PROFILE_hpet.o := n GCOV_PROFILE_tsc.o := n +GCOV_PROFILE_vread_tsc_64.o := n GCOV_PROFILE_paravirt.o := n # vread_tsc_64 is hot and should be fully optimized: From 15517f7c213442e4d8a098cf0732b237f764c576 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 May 2011 11:14:08 -0600 Subject: [PATCH 05/46] lguest: fix timer interrupt setup Without an IRQ chip set, we now get a WARN_ON and no timer interrupt. This prevents booting. Fortunately, the fix is a one-liner: set up the timer IRQ like everything else. Signed-off-by: Rusty Russell Cc: stable@kernel.org # .39.x --- arch/x86/lguest/boot.c | 1 + 1 file changed, 1 insertion(+) diff --git a/arch/x86/lguest/boot.c b/arch/x86/lguest/boot.c index e191c096ab90..db832fd65ecb 100644 --- a/arch/x86/lguest/boot.c +++ b/arch/x86/lguest/boot.c @@ -993,6 +993,7 @@ static void lguest_time_irq(unsigned int irq, struct irq_desc *desc) static void lguest_time_init(void) { /* Set up the timer interrupt (0) to go to our simple timer routine */ + lguest_setup_irq(0); irq_set_handler(0, lguest_time_irq); clocksource_register_hz(&lguest_clock, NSEC_PER_SEC); From bc805a03c26e1e25171bc627c6264553d27f746c Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 May 2011 11:14:11 -0600 Subject: [PATCH 06/46] lguest: fix up compilation after move ed16648eb5b86917f0b90bdcdbc857202da72f90 "Move kvm, uml, and lguest subdirectories" broke the lguest example launcher. Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/Makefile | 2 +- Documentation/virtual/lguest/lguest.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Documentation/virtual/lguest/Makefile b/Documentation/virtual/lguest/Makefile index bebac6b4f332..0ac34206f7a7 100644 --- a/Documentation/virtual/lguest/Makefile +++ b/Documentation/virtual/lguest/Makefile @@ -1,5 +1,5 @@ # This creates the demonstration utility "lguest" which runs a Linux guest. -# Missing headers? Add "-I../../include -I../../arch/x86/include" +# Missing headers? Add "-I../../../include -I../../../arch/x86/include" CFLAGS:=-m32 -Wall -Wmissing-declarations -Wmissing-prototypes -O3 -U_FORTIFY_SOURCE all: lguest diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index d9da7e148538..711ba9e9a007 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -49,7 +49,7 @@ #include #include #include -#include "../../include/linux/lguest_launcher.h" +#include "../../../include/linux/lguest_launcher.h" /*L:110 * We can ignore the 42 include files we need for this program, but I do want * to draw attention to the use of kernel-style types. From 990c91f0af46c57f0291060d928c7ab82f9d5667 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 May 2011 11:14:12 -0600 Subject: [PATCH 07/46] lguest: remove support for VIRTIO_F_NOTIFY_ON_EMPTY. No virtio device does this any more, so no need to clutter lguest with it. Signed-off-by: Rusty Russell --- Documentation/virtual/lguest/lguest.c | 20 +------------------- 1 file changed, 1 insertion(+), 19 deletions(-) diff --git a/Documentation/virtual/lguest/lguest.c b/Documentation/virtual/lguest/lguest.c index 711ba9e9a007..cd9d6af61d07 100644 --- a/Documentation/virtual/lguest/lguest.c +++ b/Documentation/virtual/lguest/lguest.c @@ -135,9 +135,6 @@ struct device { /* Is it operational */ bool running; - /* Does Guest want an intrrupt on empty? */ - bool irq_on_empty; - /* Device-specific data. */ void *priv; }; @@ -637,10 +634,7 @@ static void trigger_irq(struct virtqueue *vq) /* If they don't want an interrupt, don't send one... */ if (vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT) { - /* ... unless they've asked us to force one on empty. */ - if (!vq->dev->irq_on_empty - || lg_last_avail(vq) != vq->vring.avail->idx) - return; + return; } /* Send the Guest an interrupt tell them we used something up. */ @@ -1057,15 +1051,6 @@ static void create_thread(struct virtqueue *vq) close(vq->eventfd); } -static bool accepted_feature(struct device *dev, unsigned int bit) -{ - const u8 *features = get_feature_bits(dev) + dev->feature_len; - - if (dev->feature_len < bit / CHAR_BIT) - return false; - return features[bit / CHAR_BIT] & (1 << (bit % CHAR_BIT)); -} - static void start_device(struct device *dev) { unsigned int i; @@ -1079,8 +1064,6 @@ static void start_device(struct device *dev) verbose(" %02x", get_feature_bits(dev) [dev->feature_len+i]); - dev->irq_on_empty = accepted_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); - for (vq = dev->vq; vq; vq = vq->next) { if (vq->service) create_thread(vq); @@ -1564,7 +1547,6 @@ static void setup_tun_net(char *arg) /* Set up the tun device. */ configure_device(ipfd, tapif, ip); - add_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY); /* Expect Guest to handle everything except UFO */ add_feature(dev, VIRTIO_NET_F_CSUM); add_feature(dev, VIRTIO_NET_F_GUEST_CSUM); From 7a7c924cf03da2a76ea4dc0aac1a788cf95a9c29 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 1 Feb 2011 21:43:48 +0100 Subject: [PATCH 08/46] virtio_blk: allow re-reading config space at runtime Wire up the virtio_driver config_changed method to get notified about config changes raised by the host. For now we just re-read the device size to support online resizing of devices, but once we add more attributes that might be changeable they could be added as well. Note that the config_changed method is called from irq context, so we'll have to use the workqueue infrastructure to provide us a proper user context for our changes. Signed-off-by: Christoph Hellwig Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 88 +++++++++++++++++++++++++++++++++----- 1 file changed, 78 insertions(+), 10 deletions(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 6ecf89cdf006..33a48a80c7e8 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -6,10 +6,12 @@ #include #include #include +#include #define PART_BITS 4 static int major, index; +struct workqueue_struct *virtblk_wq; struct virtio_blk { @@ -26,6 +28,9 @@ struct virtio_blk mempool_t *pool; + /* Process context for config space updates */ + struct work_struct config_work; + /* What host tells us, plus 2 for header & tailer. */ unsigned int sg_elems; @@ -291,6 +296,46 @@ static ssize_t virtblk_serial_show(struct device *dev, } DEVICE_ATTR(serial, S_IRUGO, virtblk_serial_show, NULL); +static void virtblk_config_changed_work(struct work_struct *work) +{ + struct virtio_blk *vblk = + container_of(work, struct virtio_blk, config_work); + struct virtio_device *vdev = vblk->vdev; + struct request_queue *q = vblk->disk->queue; + char cap_str_2[10], cap_str_10[10]; + u64 capacity, size; + + /* Host must always specify the capacity. */ + vdev->config->get(vdev, offsetof(struct virtio_blk_config, capacity), + &capacity, sizeof(capacity)); + + /* If capacity is too big, truncate with warning. */ + if ((sector_t)capacity != capacity) { + dev_warn(&vdev->dev, "Capacity %llu too large: truncating\n", + (unsigned long long)capacity); + capacity = (sector_t)-1; + } + + size = capacity * queue_logical_block_size(q); + string_get_size(size, STRING_UNITS_2, cap_str_2, sizeof(cap_str_2)); + string_get_size(size, STRING_UNITS_10, cap_str_10, sizeof(cap_str_10)); + + dev_notice(&vdev->dev, + "new size: %llu %d-byte logical blocks (%s/%s)\n", + (unsigned long long)capacity, + queue_logical_block_size(q), + cap_str_10, cap_str_2); + + set_capacity(vblk->disk, capacity); +} + +static void virtblk_config_changed(struct virtio_device *vdev) +{ + struct virtio_blk *vblk = vdev->priv; + + queue_work(virtblk_wq, &vblk->config_work); +} + static int __devinit virtblk_probe(struct virtio_device *vdev) { struct virtio_blk *vblk; @@ -327,6 +372,7 @@ static int __devinit virtblk_probe(struct virtio_device *vdev) vblk->vdev = vdev; vblk->sg_elems = sg_elems; sg_init_table(vblk->sg, vblk->sg_elems); + INIT_WORK(&vblk->config_work, virtblk_config_changed_work); /* We expect one virtqueue, for output. */ vblk->vq = virtio_find_single_vq(vdev, blk_done, "requests"); @@ -477,6 +523,8 @@ static void __devexit virtblk_remove(struct virtio_device *vdev) { struct virtio_blk *vblk = vdev->priv; + flush_work(&vblk->config_work); + /* Nothing should be pending. */ BUG_ON(!list_empty(&vblk->reqs)); @@ -508,27 +556,47 @@ static unsigned int features[] = { * Use __refdata to avoid this warning. */ static struct virtio_driver __refdata virtio_blk = { - .feature_table = features, - .feature_table_size = ARRAY_SIZE(features), - .driver.name = KBUILD_MODNAME, - .driver.owner = THIS_MODULE, - .id_table = id_table, - .probe = virtblk_probe, - .remove = __devexit_p(virtblk_remove), + .feature_table = features, + .feature_table_size = ARRAY_SIZE(features), + .driver.name = KBUILD_MODNAME, + .driver.owner = THIS_MODULE, + .id_table = id_table, + .probe = virtblk_probe, + .remove = __devexit_p(virtblk_remove), + .config_changed = virtblk_config_changed, }; static int __init init(void) { + int error; + + virtblk_wq = alloc_workqueue("virtio-blk", 0, 0); + if (!virtblk_wq) + return -ENOMEM; + major = register_blkdev(0, "virtblk"); - if (major < 0) - return major; - return register_virtio_driver(&virtio_blk); + if (major < 0) { + error = major; + goto out_destroy_workqueue; + } + + error = register_virtio_driver(&virtio_blk); + if (error) + goto out_unregister_blkdev; + return 0; + +out_unregister_blkdev: + unregister_blkdev(major, "virtblk"); +out_destroy_workqueue: + destroy_workqueue(virtblk_wq); + return error; } static void __exit fini(void) { unregister_blkdev(major, "virtblk"); unregister_virtio_driver(&virtio_blk); + destroy_workqueue(virtblk_wq); } module_init(init); module_exit(fini); From 6917f83ffe5e6b6414ccc845263b792ed201c0f1 Mon Sep 17 00:00:00 2001 From: Liu Yuan Date: Sun, 24 Apr 2011 02:49:26 +0800 Subject: [PATCH 09/46] drivers, block: virtio_blk: Replace cryptic number with the macro It is easier to figure out the context by reading SCSI_SENSE_BUFFERSIZE instead of plain '96'. Signed-off-by: Liu Yuan Signed-off-by: Rusty Russell --- drivers/block/virtio_blk.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/drivers/block/virtio_blk.c b/drivers/block/virtio_blk.c index 33a48a80c7e8..079c08808d8a 100644 --- a/drivers/block/virtio_blk.c +++ b/drivers/block/virtio_blk.c @@ -7,6 +7,7 @@ #include #include #include +#include #define PART_BITS 4 @@ -146,7 +147,7 @@ static bool do_req(struct request_queue *q, struct virtio_blk *vblk, num = blk_rq_map_sg(q, vbr->req, vblk->sg + out); if (vbr->req->cmd_type == REQ_TYPE_BLOCK_PC) { - sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, 96); + sg_set_buf(&vblk->sg[num + out + in++], vbr->req->sense, SCSI_SENSE_BUFFERSIZE); sg_set_buf(&vblk->sg[num + out + in++], &vbr->in_hdr, sizeof(vbr->in_hdr)); } From 177dbd95637a52b9118aca757d5856ec3995d3e7 Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 May 2011 11:14:13 -0600 Subject: [PATCH 10/46] virtio console: don't manually set or finalize VIRTIO_CONSOLE_F_MULTIPORT. That's already been done by the virtio infrastructure before the probe function is called. Reported-by: alexey.kardashevskiy@au1.ibm.com Acked-by: Amit Shah Tested-by: Amit Shah Signed-off-by: Rusty Russell --- drivers/char/virtio_console.c | 5 ----- 1 file changed, 5 deletions(-) diff --git a/drivers/char/virtio_console.c b/drivers/char/virtio_console.c index 838568a7dbf5..fb68b1295373 100644 --- a/drivers/char/virtio_console.c +++ b/drivers/char/virtio_console.c @@ -1677,17 +1677,12 @@ static int __devinit virtcons_probe(struct virtio_device *vdev) portdev->config.max_nr_ports = 1; if (virtio_has_feature(vdev, VIRTIO_CONSOLE_F_MULTIPORT)) { multiport = true; - vdev->features[0] |= 1 << VIRTIO_CONSOLE_F_MULTIPORT; - vdev->config->get(vdev, offsetof(struct virtio_console_config, max_nr_ports), &portdev->config.max_nr_ports, sizeof(portdev->config.max_nr_ports)); } - /* Let the Host know we support multiple ports.*/ - vdev->config->finalize_features(vdev); - err = init_vqs(portdev); if (err < 0) { dev_err(&vdev->dev, "Error %d initializing vqs\n", err); From bf50e69f63d21091e525185c3ae761412be0ba72 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Thu, 7 Apr 2011 10:43:25 -0700 Subject: [PATCH 11/46] virtio balloon: kill tell-host-first logic The virtio balloon driver has a VIRTIO_BALLOON_F_MUST_TELL_HOST feature bit. Whenever the bit is set, the guest kernel must always tell the host before we free pages back to the allocator. Without this feature, we might free a page (and have another user touch it) while the hypervisor is unprepared for it. But, if the bit is _not_ set, we are under no obligation to reverse the order; we're under no obligation to do _anything_. As of now, qemu-kvm defines the bit, but doesn't set it. This patch makes the "tell host first" logic the only case. This should make everybody happy, and reduce the amount of untested or untestable code in the kernel. This _also_ means that we don't have to preserve a pfn list after the pages are freed, which should let us get rid of some temporary storage (vb->pfns) eventually. Signed-off-by: Dave Hansen Signed-off-by: Rusty Russell --- drivers/virtio/virtio_balloon.c | 21 ++++++++------------- 1 file changed, 8 insertions(+), 13 deletions(-) diff --git a/drivers/virtio/virtio_balloon.c b/drivers/virtio/virtio_balloon.c index 0f1da45ba47d..e058ace2a4ad 100644 --- a/drivers/virtio/virtio_balloon.c +++ b/drivers/virtio/virtio_balloon.c @@ -40,9 +40,6 @@ struct virtio_balloon /* Waiting for host to ack the pages we released. */ struct completion acked; - /* Do we have to tell Host *before* we reuse pages? */ - bool tell_host_first; - /* The pages we've told the Host we're not using. */ unsigned int num_pages; struct list_head pages; @@ -151,13 +148,14 @@ static void leak_balloon(struct virtio_balloon *vb, size_t num) vb->num_pages--; } - if (vb->tell_host_first) { - tell_host(vb, vb->deflate_vq); - release_pages_by_pfn(vb->pfns, vb->num_pfns); - } else { - release_pages_by_pfn(vb->pfns, vb->num_pfns); - tell_host(vb, vb->deflate_vq); - } + + /* + * Note that if + * virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); + * is true, we *have* to do it in this order + */ + tell_host(vb, vb->deflate_vq); + release_pages_by_pfn(vb->pfns, vb->num_pfns); } static inline void update_stat(struct virtio_balloon *vb, int idx, @@ -325,9 +323,6 @@ static int virtballoon_probe(struct virtio_device *vdev) goto out_del_vqs; } - vb->tell_host_first - = virtio_has_feature(vdev, VIRTIO_BALLOON_F_MUST_TELL_HOST); - return 0; out_del_vqs: From a1b383870a28cfbd1657d4922c0fafc634a62ebd Mon Sep 17 00:00:00 2001 From: Rusty Russell Date: Mon, 30 May 2011 11:14:13 -0600 Subject: [PATCH 12/46] virtio: add full three-clause BSD text to headers. It's unclear to me if it's important, but it's obviously causing my technical colleages some headaches and I'd hate such imprecision to slow virtio adoption. I've emailed this to all non-trivial contributors for approval, too. Signed-off-by: Rusty Russell Acked-by: Grant Likely Acked-by: Ryan Harper Acked-by: Anthony Liguori Acked-by: Eric Van Hensbergen Acked-by: john cooper Acked-by: Aneesh Kumar K.V Acked-by: Christian Borntraeger Acked-by: Fernando Luis Vazquez Cao --- include/linux/virtio_9p.h | 25 ++++++++++++++++++++++++- include/linux/virtio_balloon.h | 25 ++++++++++++++++++++++++- include/linux/virtio_blk.h | 25 ++++++++++++++++++++++++- include/linux/virtio_config.h | 25 ++++++++++++++++++++++++- include/linux/virtio_console.h | 26 +++++++++++++++++++++++++- include/linux/virtio_ids.h | 24 +++++++++++++++++++++++- include/linux/virtio_net.h | 25 ++++++++++++++++++++++++- include/linux/virtio_pci.h | 23 +++++++++++++++++++++++ include/linux/virtio_ring.h | 23 +++++++++++++++++++++++ 9 files changed, 214 insertions(+), 7 deletions(-) diff --git a/include/linux/virtio_9p.h b/include/linux/virtio_9p.h index e68b439b2860..277c4ad44e84 100644 --- a/include/linux/virtio_9p.h +++ b/include/linux/virtio_9p.h @@ -1,7 +1,30 @@ #ifndef _LINUX_VIRTIO_9P_H #define _LINUX_VIRTIO_9P_H /* This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. */ + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include #include #include diff --git a/include/linux/virtio_balloon.h b/include/linux/virtio_balloon.h index a50ecd1b81a2..652dc8bea921 100644 --- a/include/linux/virtio_balloon.h +++ b/include/linux/virtio_balloon.h @@ -1,7 +1,30 @@ #ifndef _LINUX_VIRTIO_BALLOON_H #define _LINUX_VIRTIO_BALLOON_H /* This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. */ + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include #include diff --git a/include/linux/virtio_blk.h b/include/linux/virtio_blk.h index 167720d695ed..e0edb40ca7aa 100644 --- a/include/linux/virtio_blk.h +++ b/include/linux/virtio_blk.h @@ -1,7 +1,30 @@ #ifndef _LINUX_VIRTIO_BLK_H #define _LINUX_VIRTIO_BLK_H /* This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. */ + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include #include #include diff --git a/include/linux/virtio_config.h b/include/linux/virtio_config.h index 800617b4ddd5..39c88c5ad19d 100644 --- a/include/linux/virtio_config.h +++ b/include/linux/virtio_config.h @@ -1,7 +1,30 @@ #ifndef _LINUX_VIRTIO_CONFIG_H #define _LINUX_VIRTIO_CONFIG_H /* This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers. */ + * anyone can use the definitions to implement compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ /* Virtio devices use a standardized configuration space to define their * features and pass configuration information, but each implementation can diff --git a/include/linux/virtio_console.h b/include/linux/virtio_console.h index e4d333543a33..bdf4b0034739 100644 --- a/include/linux/virtio_console.h +++ b/include/linux/virtio_console.h @@ -5,7 +5,31 @@ #include /* * This header, excluding the #ifdef __KERNEL__ part, is BSD licensed so - * anyone can use the definitions to implement compatible drivers/servers. + * anyone can use the definitions to implement compatible drivers/servers: + * + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. * * Copyright (C) Red Hat, Inc., 2009, 2010, 2011 * Copyright (C) Amit Shah , 2009, 2010, 2011 diff --git a/include/linux/virtio_ids.h b/include/linux/virtio_ids.h index 06660c0a78d7..85bb0bb66ffc 100644 --- a/include/linux/virtio_ids.h +++ b/include/linux/virtio_ids.h @@ -5,7 +5,29 @@ * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. - */ + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #define VIRTIO_ID_NET 1 /* virtio net */ #define VIRTIO_ID_BLOCK 2 /* virtio block */ diff --git a/include/linux/virtio_net.h b/include/linux/virtio_net.h index 085e42298ce5..136040bba3e3 100644 --- a/include/linux/virtio_net.h +++ b/include/linux/virtio_net.h @@ -1,7 +1,30 @@ #ifndef _LINUX_VIRTIO_NET_H #define _LINUX_VIRTIO_NET_H /* This header is BSD licensed so anyone can use the definitions to implement - * compatible drivers/servers. */ + * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #include #include #include diff --git a/include/linux/virtio_pci.h b/include/linux/virtio_pci.h index 9a3d7c48c622..ea66f3f60d63 100644 --- a/include/linux/virtio_pci.h +++ b/include/linux/virtio_pci.h @@ -11,6 +11,29 @@ * * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. */ #ifndef _LINUX_VIRTIO_PCI_H diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index e4d144b132b5..a813e5d460eb 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -7,6 +7,29 @@ * This header is BSD licensed so anyone can use the definitions to implement * compatible drivers/servers. * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * 3. Neither the name of IBM nor the names of its contributors + * may be used to endorse or promote products derived from this software + * without specific prior written permission. + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL IBM OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * * Copyright Rusty Russell IBM Corporation 2007. */ #include From 770b31a85e000b0194974922f238a30ade4246b6 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:10:17 +0300 Subject: [PATCH 13/46] virtio: event index interface Define a new feature bit for the guest and host to utilize an event index (like Xen) instead if a flag bit to enable/disable interrupts and kicks. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/linux/virtio_ring.h | 15 ++++++++++++++- 1 file changed, 14 insertions(+), 1 deletion(-) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index a813e5d460eb..c4eef73deb3f 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -52,6 +52,12 @@ /* We support indirect buffer descriptors */ #define VIRTIO_RING_F_INDIRECT_DESC 28 +/* The Guest publishes the used index for which it expects an interrupt + * at the end of the avail ring. Host should ignore the avail->flags field. */ +/* The Host publishes the avail index for which it expects a kick + * at the end of the used ring. Guest should ignore the used->flags field. */ +#define VIRTIO_RING_F_EVENT_IDX 29 + /* Virtio ring descriptors: 16 bytes. These can chain together via "next". */ struct vring_desc { /* Address (guest-physical). */ @@ -106,6 +112,7 @@ struct vring { * __u16 avail_flags; * __u16 avail_idx; * __u16 available[num]; + * __u16 used_event_idx; * * // Padding to the next align boundary. * char pad[]; @@ -114,8 +121,14 @@ struct vring { * __u16 used_flags; * __u16 used_idx; * struct vring_used_elem used[num]; + * __u16 avail_event_idx; * }; */ +/* We publish the used event index at the end of the available ring, and vice + * versa. They are at the end for backwards compatibility. */ +#define vring_used_event(vr) ((vr)->avail->ring[(vr)->num]) +#define vring_avail_event(vr) (*(__u16 *)&(vr)->used->ring[(vr)->num]) + static inline void vring_init(struct vring *vr, unsigned int num, void *p, unsigned long align) { @@ -130,7 +143,7 @@ static inline unsigned vring_size(unsigned int num, unsigned long align) { return ((sizeof(struct vring_desc) * num + sizeof(__u16) * (2 + num) + align - 1) & ~(align - 1)) - + sizeof(__u16) * 2 + sizeof(struct vring_used_elem) * num; + + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num; } #ifdef __KERNEL__ From bf7035bf20563a6cadcb9e870406e7b21daf5e30 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:10:27 +0300 Subject: [PATCH 14/46] virtio ring: inline function to check for events With the new used_event and avail_event and features, both host and guest need similar logic to check whether events are enabled, so it helps to put the common code in the header. Note that Xen has similar logic for notification hold-off in include/xen/interface/io/ring.h with req_event and req_prod corresponding to event_idx + 1 and new_idx respectively. +1 comes from the fact that req_event and req_prod in Xen start at 1, while event index in virtio starts at 0. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- include/linux/virtio_ring.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/linux/virtio_ring.h b/include/linux/virtio_ring.h index c4eef73deb3f..4a32cb6da425 100644 --- a/include/linux/virtio_ring.h +++ b/include/linux/virtio_ring.h @@ -146,6 +146,20 @@ static inline unsigned vring_size(unsigned int num, unsigned long align) + sizeof(__u16) * 3 + sizeof(struct vring_used_elem) * num; } +/* The following is used with USED_EVENT_IDX and AVAIL_EVENT_IDX */ +/* Assuming a given event_idx value from the other size, if + * we have just incremented index from old to new_idx, + * should we trigger an event? */ +static inline int vring_need_event(__u16 event_idx, __u16 new_idx, __u16 old) +{ + /* Note: Xen has similar logic for notification hold-off + * in include/xen/interface/io/ring.h with req_event and req_prod + * corresponding to event_idx + 1 and new_idx respectively. + * Note also that req_event and req_prod in Xen start at 1, + * event indexes in virtio start at 0. */ + return (__u16)(new_idx - event_idx - 1) < (__u16)(new_idx - old); +} + #ifdef __KERNEL__ #include struct virtio_device; From a5c262c5fd83ece01bd649fb08416c501d4c59d7 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:10:44 +0300 Subject: [PATCH 15/46] virtio_ring: support event idx feature Support for the new event idx feature: 1. When enabling interrupts, publish the current avail index value to the host to get interrupts on the next update. 2. Use the new avail_event feature to reduce the number of exits from the guest. Simple test with the simulator: [virtio]# time ./virtio_test spurious wakeus: 0x7 real 0m0.169s user 0m0.140s sys 0m0.019s [virtio]# time ./virtio_test --no-event-idx spurious wakeus: 0x11 real 0m0.649s user 0m0.295s sys 0m0.335s Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 26 ++++++++++++++++++++++++-- 1 file changed, 24 insertions(+), 2 deletions(-) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index b0043fb26a4d..a5fadc40c448 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -82,6 +82,9 @@ struct vring_virtqueue /* Host supports indirect buffers */ bool indirect; + /* Host publishes avail event idx */ + bool event; + /* Number of free buffers */ unsigned int num_free; /* Head of free buffer list. */ @@ -237,18 +240,22 @@ EXPORT_SYMBOL_GPL(virtqueue_add_buf_gfp); void virtqueue_kick(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); + u16 new, old; START_USE(vq); /* Descriptors and available array need to be set before we expose the * new available array entries. */ virtio_wmb(); - vq->vring.avail->idx += vq->num_added; + old = vq->vring.avail->idx; + new = vq->vring.avail->idx = old + vq->num_added; vq->num_added = 0; /* Need to update avail index before checking if we should notify */ virtio_mb(); - if (!(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY)) + if (vq->event ? + vring_need_event(vring_avail_event(&vq->vring), new, old) : + !(vq->vring.used->flags & VRING_USED_F_NO_NOTIFY)) /* Prod other side to tell it about changes. */ vq->notify(&vq->vq); @@ -324,6 +331,14 @@ void *virtqueue_get_buf(struct virtqueue *_vq, unsigned int *len) ret = vq->data[i]; detach_buf(vq, i); vq->last_used_idx++; + /* If we expect an interrupt for the next entry, tell host + * by writing event index and flush out the write before + * the read in the next get_buf call. */ + if (!(vq->vring.avail->flags & VRING_AVAIL_F_NO_INTERRUPT)) { + vring_used_event(&vq->vring) = vq->last_used_idx; + virtio_mb(); + } + END_USE(vq); return ret; } @@ -345,7 +360,11 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) /* We optimistically turn back on interrupts, then check if there was * more to do. */ + /* Depending on the VIRTIO_RING_F_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; + vring_used_event(&vq->vring) = vq->last_used_idx; virtio_mb(); if (unlikely(more_used(vq))) { END_USE(vq); @@ -438,6 +457,7 @@ struct virtqueue *vring_new_virtqueue(unsigned int num, #endif vq->indirect = virtio_has_feature(vdev, VIRTIO_RING_F_INDIRECT_DESC); + vq->event = virtio_has_feature(vdev, VIRTIO_RING_F_EVENT_IDX); /* No callback? Tell other side not to bother us. */ if (!callback) @@ -472,6 +492,8 @@ void vring_transport_features(struct virtio_device *vdev) switch (i) { case VIRTIO_RING_F_INDIRECT_DESC: break; + case VIRTIO_RING_F_EVENT_IDX: + break; default: /* We don't understand this bit. */ clear_bit(i, vdev->features); From 8ea8cf89e19aeb596b818ee5f2bec8a8b0586b60 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:10:54 +0300 Subject: [PATCH 16/46] vhost: support event index Support the new event index feature. When acked, utilize it to reduce the # of interrupts sent to the guest. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/vhost/net.c | 12 ++-- drivers/vhost/test.c | 6 +- drivers/vhost/vhost.c | 138 +++++++++++++++++++++++++++++++----------- drivers/vhost/vhost.h | 21 ++++--- 4 files changed, 127 insertions(+), 50 deletions(-) diff --git a/drivers/vhost/net.c b/drivers/vhost/net.c index 2f7c76a85e53..e224a92baa16 100644 --- a/drivers/vhost/net.c +++ b/drivers/vhost/net.c @@ -144,7 +144,7 @@ static void handle_tx(struct vhost_net *net) } mutex_lock(&vq->mutex); - vhost_disable_notify(vq); + vhost_disable_notify(&net->dev, vq); if (wmem < sock->sk->sk_sndbuf / 2) tx_poll_stop(net); @@ -166,8 +166,8 @@ static void handle_tx(struct vhost_net *net) set_bit(SOCK_ASYNC_NOSPACE, &sock->flags); break; } - if (unlikely(vhost_enable_notify(vq))) { - vhost_disable_notify(vq); + if (unlikely(vhost_enable_notify(&net->dev, vq))) { + vhost_disable_notify(&net->dev, vq); continue; } break; @@ -315,7 +315,7 @@ static void handle_rx(struct vhost_net *net) return; mutex_lock(&vq->mutex); - vhost_disable_notify(vq); + vhost_disable_notify(&net->dev, vq); vhost_hlen = vq->vhost_hlen; sock_hlen = vq->sock_hlen; @@ -334,10 +334,10 @@ static void handle_rx(struct vhost_net *net) break; /* OK, now we need to know about added descriptors. */ if (!headcount) { - if (unlikely(vhost_enable_notify(vq))) { + if (unlikely(vhost_enable_notify(&net->dev, vq))) { /* They have slipped one in as we were * doing that: check again. */ - vhost_disable_notify(vq); + vhost_disable_notify(&net->dev, vq); continue; } /* Nothing new? Wait for eventfd to tell us diff --git a/drivers/vhost/test.c b/drivers/vhost/test.c index 099f30230d06..734e1d74ad80 100644 --- a/drivers/vhost/test.c +++ b/drivers/vhost/test.c @@ -49,7 +49,7 @@ static void handle_vq(struct vhost_test *n) return; mutex_lock(&vq->mutex); - vhost_disable_notify(vq); + vhost_disable_notify(&n->dev, vq); for (;;) { head = vhost_get_vq_desc(&n->dev, vq, vq->iov, @@ -61,8 +61,8 @@ static void handle_vq(struct vhost_test *n) break; /* Nothing new? Wait for eventfd to tell us they refilled. */ if (head == vq->num) { - if (unlikely(vhost_enable_notify(vq))) { - vhost_disable_notify(vq); + if (unlikely(vhost_enable_notify(&n->dev, vq))) { + vhost_disable_notify(&n->dev, vq); continue; } break; diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c index 7aa4eea930f1..ea966b356352 100644 --- a/drivers/vhost/vhost.c +++ b/drivers/vhost/vhost.c @@ -37,6 +37,9 @@ enum { VHOST_MEMORY_F_LOG = 0x1, }; +#define vhost_used_event(vq) ((u16 __user *)&vq->avail->ring[vq->num]) +#define vhost_avail_event(vq) ((u16 __user *)&vq->used->ring[vq->num]) + static void vhost_poll_func(struct file *file, wait_queue_head_t *wqh, poll_table *pt) { @@ -161,6 +164,8 @@ static void vhost_vq_reset(struct vhost_dev *dev, vq->last_avail_idx = 0; vq->avail_idx = 0; vq->last_used_idx = 0; + vq->signalled_used = 0; + vq->signalled_used_valid = false; vq->used_flags = 0; vq->log_used = false; vq->log_addr = -1ull; @@ -489,16 +494,17 @@ static int memory_access_ok(struct vhost_dev *d, struct vhost_memory *mem, return 1; } -static int vq_access_ok(unsigned int num, +static int vq_access_ok(struct vhost_dev *d, unsigned int num, struct vring_desc __user *desc, struct vring_avail __user *avail, struct vring_used __user *used) { + size_t s = vhost_has_feature(d, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; return access_ok(VERIFY_READ, desc, num * sizeof *desc) && access_ok(VERIFY_READ, avail, - sizeof *avail + num * sizeof *avail->ring) && + sizeof *avail + num * sizeof *avail->ring + s) && access_ok(VERIFY_WRITE, used, - sizeof *used + num * sizeof *used->ring); + sizeof *used + num * sizeof *used->ring + s); } /* Can we log writes? */ @@ -514,9 +520,11 @@ int vhost_log_access_ok(struct vhost_dev *dev) /* Verify access for write logging. */ /* Caller should have vq mutex and device mutex */ -static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) +static int vq_log_access_ok(struct vhost_dev *d, struct vhost_virtqueue *vq, + void __user *log_base) { struct vhost_memory *mp; + size_t s = vhost_has_feature(d, VIRTIO_RING_F_EVENT_IDX) ? 2 : 0; mp = rcu_dereference_protected(vq->dev->memory, lockdep_is_held(&vq->mutex)); @@ -524,15 +532,15 @@ static int vq_log_access_ok(struct vhost_virtqueue *vq, void __user *log_base) vhost_has_feature(vq->dev, VHOST_F_LOG_ALL)) && (!vq->log_used || log_access_ok(log_base, vq->log_addr, sizeof *vq->used + - vq->num * sizeof *vq->used->ring)); + vq->num * sizeof *vq->used->ring + s)); } /* Can we start vq? */ /* Caller should have vq mutex and device mutex */ int vhost_vq_access_ok(struct vhost_virtqueue *vq) { - return vq_access_ok(vq->num, vq->desc, vq->avail, vq->used) && - vq_log_access_ok(vq, vq->log_base); + return vq_access_ok(vq->dev, vq->num, vq->desc, vq->avail, vq->used) && + vq_log_access_ok(vq->dev, vq, vq->log_base); } static long vhost_set_memory(struct vhost_dev *d, struct vhost_memory __user *m) @@ -577,6 +585,7 @@ static int init_used(struct vhost_virtqueue *vq, if (r) return r; + vq->signalled_used_valid = false; return get_user(vq->last_used_idx, &used->idx); } @@ -674,7 +683,7 @@ static long vhost_set_vring(struct vhost_dev *d, int ioctl, void __user *argp) * If it is not, we don't as size might not have been setup. * We will verify when backend is configured. */ if (vq->private_data) { - if (!vq_access_ok(vq->num, + if (!vq_access_ok(d, vq->num, (void __user *)(unsigned long)a.desc_user_addr, (void __user *)(unsigned long)a.avail_user_addr, (void __user *)(unsigned long)a.used_user_addr)) { @@ -818,7 +827,7 @@ long vhost_dev_ioctl(struct vhost_dev *d, unsigned int ioctl, unsigned long arg) vq = d->vqs + i; mutex_lock(&vq->mutex); /* If ring is inactive, will check when it's enabled. */ - if (vq->private_data && !vq_log_access_ok(vq, base)) + if (vq->private_data && !vq_log_access_ok(d, vq, base)) r = -EFAULT; else vq->log_base = base; @@ -1219,6 +1228,10 @@ int vhost_get_vq_desc(struct vhost_dev *dev, struct vhost_virtqueue *vq, /* On success, increment avail index. */ vq->last_avail_idx++; + + /* Assume notifications from guest are disabled at this point, + * if they aren't we would need to update avail_event index. */ + BUG_ON(!(vq->used_flags & VRING_USED_F_NO_NOTIFY)); return head; } @@ -1267,6 +1280,12 @@ int vhost_add_used(struct vhost_virtqueue *vq, unsigned int head, int len) eventfd_signal(vq->log_ctx, 1); } vq->last_used_idx++; + /* If the driver never bothers to signal in a very long while, + * used index might wrap around. If that happens, invalidate + * signalled_used index we stored. TODO: make sure driver + * signals at least once in 2^16 and remove this. */ + if (unlikely(vq->last_used_idx == vq->signalled_used)) + vq->signalled_used_valid = false; return 0; } @@ -1275,6 +1294,7 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, unsigned count) { struct vring_used_elem __user *used; + u16 old, new; int start; start = vq->last_used_idx % vq->num; @@ -1292,7 +1312,14 @@ static int __vhost_add_used_n(struct vhost_virtqueue *vq, ((void __user *)used - (void __user *)vq->used), count * sizeof *used); } - vq->last_used_idx += count; + old = vq->last_used_idx; + new = (vq->last_used_idx += count); + /* If the driver never bothers to signal in a very long while, + * used index might wrap around. If that happens, invalidate + * signalled_used index we stored. TODO: make sure driver + * signals at least once in 2^16 and remove this. */ + if (unlikely((u16)(new - vq->signalled_used) < (u16)(new - old))) + vq->signalled_used_valid = false; return 0; } @@ -1331,29 +1358,47 @@ int vhost_add_used_n(struct vhost_virtqueue *vq, struct vring_used_elem *heads, return r; } -/* This actually signals the guest, using eventfd. */ -void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) +static bool vhost_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { - __u16 flags; - + __u16 old, new, event; + bool v; /* Flush out used index updates. This is paired * with the barrier that the Guest executes when enabling * interrupts. */ smp_mb(); - if (__get_user(flags, &vq->avail->flags)) { - vq_err(vq, "Failed to get flags"); - return; + if (vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY) && + unlikely(vq->avail_idx == vq->last_avail_idx)) + return true; + + if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + __u16 flags; + if (__get_user(flags, &vq->avail->flags)) { + vq_err(vq, "Failed to get flags"); + return true; + } + return !(flags & VRING_AVAIL_F_NO_INTERRUPT); } + old = vq->signalled_used; + v = vq->signalled_used_valid; + new = vq->signalled_used = vq->last_used_idx; + vq->signalled_used_valid = true; - /* If they don't want an interrupt, don't signal, unless empty. */ - if ((flags & VRING_AVAIL_F_NO_INTERRUPT) && - (vq->avail_idx != vq->last_avail_idx || - !vhost_has_feature(dev, VIRTIO_F_NOTIFY_ON_EMPTY))) - return; + if (unlikely(!v)) + return true; + if (get_user(event, vhost_used_event(vq))) { + vq_err(vq, "Failed to get used event idx"); + return true; + } + return vring_need_event(event, new, old); +} + +/* This actually signals the guest, using eventfd. */ +void vhost_signal(struct vhost_dev *dev, struct vhost_virtqueue *vq) +{ /* Signal the Guest tell them we used something up. */ - if (vq->call_ctx) + if (vq->call_ctx && vhost_notify(dev, vq)) eventfd_signal(vq->call_ctx, 1); } @@ -1376,7 +1421,7 @@ void vhost_add_used_and_signal_n(struct vhost_dev *dev, } /* OK, now we need to know about added descriptors. */ -bool vhost_enable_notify(struct vhost_virtqueue *vq) +bool vhost_enable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { u16 avail_idx; int r; @@ -1384,11 +1429,34 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq) if (!(vq->used_flags & VRING_USED_F_NO_NOTIFY)) return false; vq->used_flags &= ~VRING_USED_F_NO_NOTIFY; - r = put_user(vq->used_flags, &vq->used->flags); - if (r) { - vq_err(vq, "Failed to enable notification at %p: %d\n", - &vq->used->flags, r); - return false; + if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + r = put_user(vq->used_flags, &vq->used->flags); + if (r) { + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + return false; + } + } else { + r = put_user(vq->avail_idx, vhost_avail_event(vq)); + if (r) { + vq_err(vq, "Failed to update avail event index at %p: %d\n", + vhost_avail_event(vq), r); + return false; + } + } + if (unlikely(vq->log_used)) { + void __user *used; + /* Make sure data is seen before log. */ + smp_wmb(); + used = vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX) ? + &vq->used->flags : vhost_avail_event(vq); + /* Log used flags or event index entry write. Both are 16 bit + * fields. */ + log_write(vq->log_base, vq->log_addr + + (used - (void __user *)vq->used), + sizeof(u16)); + if (vq->log_ctx) + eventfd_signal(vq->log_ctx, 1); } /* They could have slipped one in as we were doing that: make * sure it's written, then check again. */ @@ -1404,15 +1472,17 @@ bool vhost_enable_notify(struct vhost_virtqueue *vq) } /* We don't need to be notified again. */ -void vhost_disable_notify(struct vhost_virtqueue *vq) +void vhost_disable_notify(struct vhost_dev *dev, struct vhost_virtqueue *vq) { int r; if (vq->used_flags & VRING_USED_F_NO_NOTIFY) return; vq->used_flags |= VRING_USED_F_NO_NOTIFY; - r = put_user(vq->used_flags, &vq->used->flags); - if (r) - vq_err(vq, "Failed to enable notification at %p: %d\n", - &vq->used->flags, r); + if (!vhost_has_feature(dev, VIRTIO_RING_F_EVENT_IDX)) { + r = put_user(vq->used_flags, &vq->used->flags); + if (r) + vq_err(vq, "Failed to enable notification at %p: %d\n", + &vq->used->flags, r); + } } diff --git a/drivers/vhost/vhost.h b/drivers/vhost/vhost.h index b3363ae38518..8e03379dd30f 100644 --- a/drivers/vhost/vhost.h +++ b/drivers/vhost/vhost.h @@ -84,6 +84,12 @@ struct vhost_virtqueue { /* Used flags */ u16 used_flags; + /* Last used index value we have signalled on */ + u16 signalled_used; + + /* Last used index value we have signalled on */ + bool signalled_used_valid; + /* Log writes to used structure. */ bool log_used; u64 log_addr; @@ -149,8 +155,8 @@ void vhost_add_used_and_signal(struct vhost_dev *, struct vhost_virtqueue *, void vhost_add_used_and_signal_n(struct vhost_dev *, struct vhost_virtqueue *, struct vring_used_elem *heads, unsigned count); void vhost_signal(struct vhost_dev *, struct vhost_virtqueue *); -void vhost_disable_notify(struct vhost_virtqueue *); -bool vhost_enable_notify(struct vhost_virtqueue *); +void vhost_disable_notify(struct vhost_dev *, struct vhost_virtqueue *); +bool vhost_enable_notify(struct vhost_dev *, struct vhost_virtqueue *); int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, unsigned int log_num, u64 len); @@ -162,11 +168,12 @@ int vhost_log_write(struct vhost_virtqueue *vq, struct vhost_log *log, } while (0) enum { - VHOST_FEATURES = (1 << VIRTIO_F_NOTIFY_ON_EMPTY) | - (1 << VIRTIO_RING_F_INDIRECT_DESC) | - (1 << VHOST_F_LOG_ALL) | - (1 << VHOST_NET_F_VIRTIO_NET_HDR) | - (1 << VIRTIO_NET_F_MRG_RXBUF), + VHOST_FEATURES = (1ULL << VIRTIO_F_NOTIFY_ON_EMPTY) | + (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX) | + (1ULL << VHOST_F_LOG_ALL) | + (1ULL << VHOST_NET_F_VIRTIO_NET_HDR) | + (1ULL << VIRTIO_NET_F_MRG_RXBUF), }; static inline int vhost_has_feature(struct vhost_dev *dev, int bit) From 4423fe40b03f32b11e72ecfa03077e702e55d5a9 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:11:05 +0300 Subject: [PATCH 17/46] virtio_test: support event index Add ability to test the new event idx feature, enable by default. Signed-off-by: Rusty Russell --- tools/virtio/virtio_test.c | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/tools/virtio/virtio_test.c b/tools/virtio/virtio_test.c index df0c6d2c3860..74d3331bdaf9 100644 --- a/tools/virtio/virtio_test.c +++ b/tools/virtio/virtio_test.c @@ -197,6 +197,14 @@ const struct option longopts[] = { .name = "help", .val = 'h', }, + { + .name = "event-idx", + .val = 'E', + }, + { + .name = "no-event-idx", + .val = 'e', + }, { .name = "indirect", .val = 'I', @@ -211,13 +219,17 @@ const struct option longopts[] = { static void help() { - fprintf(stderr, "Usage: virtio_test [--help] [--no-indirect]\n"); + fprintf(stderr, "Usage: virtio_test [--help]" + " [--no-indirect]" + " [--no-event-idx]" + "\n"); } int main(int argc, char **argv) { struct vdev_info dev; - unsigned long long features = 1ULL << VIRTIO_RING_F_INDIRECT_DESC; + unsigned long long features = (1ULL << VIRTIO_RING_F_INDIRECT_DESC) | + (1ULL << VIRTIO_RING_F_EVENT_IDX); int o; for (;;) { @@ -228,6 +240,9 @@ int main(int argc, char **argv) case '?': help(); exit(2); + case 'e': + features &= ~(1ULL << VIRTIO_RING_F_EVENT_IDX); + break; case 'h': help(); goto done; From 7ab358c23cbf15cea08129cd722d1ce77433a94d Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:11:14 +0300 Subject: [PATCH 18/46] virtio: add api for delayed callbacks Add an API that tells the other side that callbacks should be delayed until a lot of work has been done. Implement using the new event_idx feature. Note: it might seem advantageous to let the drivers ask for a callback after a specific capacity has been reached. However, as a single head can free many entries in the descriptor table, we don't really have a clue about capacity until get_buf is called. The API is the simplest to implement at the moment, we'll see what kind of hints drivers can pass when there's more than one user of the feature. Signed-off-by: Michael S. Tsirkin Signed-off-by: Rusty Russell --- drivers/virtio/virtio_ring.c | 27 +++++++++++++++++++++++++++ include/linux/virtio.h | 9 +++++++++ 2 files changed, 36 insertions(+) diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c index a5fadc40c448..68b9136847af 100644 --- a/drivers/virtio/virtio_ring.c +++ b/drivers/virtio/virtio_ring.c @@ -376,6 +376,33 @@ bool virtqueue_enable_cb(struct virtqueue *_vq) } EXPORT_SYMBOL_GPL(virtqueue_enable_cb); +bool virtqueue_enable_cb_delayed(struct virtqueue *_vq) +{ + struct vring_virtqueue *vq = to_vvq(_vq); + u16 bufs; + + START_USE(vq); + + /* We optimistically turn back on interrupts, then check if there was + * more to do. */ + /* Depending on the VIRTIO_RING_F_USED_EVENT_IDX feature, we need to + * either clear the flags bit or point the event index at the next + * entry. Always do both to keep code simple. */ + vq->vring.avail->flags &= ~VRING_AVAIL_F_NO_INTERRUPT; + /* TODO: tune this threshold */ + bufs = (u16)(vq->vring.avail->idx - vq->last_used_idx) * 3 / 4; + vring_used_event(&vq->vring) = vq->last_used_idx + bufs; + virtio_mb(); + if (unlikely((u16)(vq->vring.used->idx - vq->last_used_idx) > bufs)) { + END_USE(vq); + return false; + } + + END_USE(vq); + return true; +} +EXPORT_SYMBOL_GPL(virtqueue_enable_cb_delayed); + void *virtqueue_detach_unused_buf(struct virtqueue *_vq) { struct vring_virtqueue *vq = to_vvq(_vq); diff --git a/include/linux/virtio.h b/include/linux/virtio.h index aff5b4f74041..710885749605 100644 --- a/include/linux/virtio.h +++ b/include/linux/virtio.h @@ -51,6 +51,13 @@ struct virtqueue { * This re-enables callbacks; it returns "false" if there are pending * buffers in the queue, to detect a possible race between the driver * checking for more work, and enabling callbacks. + * virtqueue_enable_cb_delayed: restart callbacks after disable_cb. + * vq: the struct virtqueue we're talking about. + * This re-enables callbacks but hints to the other side to delay + * interrupts until most of the available buffers have been processed; + * it returns "false" if there are many pending buffers in the queue, + * to detect a possible race between the driver checking for more work, + * and enabling callbacks. * virtqueue_detach_unused_buf: detach first unused buffer * vq: the struct virtqueue we're talking about. * Returns NULL or the "data" token handed to add_buf @@ -86,6 +93,8 @@ void virtqueue_disable_cb(struct virtqueue *vq); bool virtqueue_enable_cb(struct virtqueue *vq); +bool virtqueue_enable_cb_delayed(struct virtqueue *vq); + void *virtqueue_detach_unused_buf(struct virtqueue *vq); /** From 7a66f784375c5922315bbe879b789ee50b924d26 Mon Sep 17 00:00:00 2001 From: "Michael S. Tsirkin" Date: Fri, 20 May 2011 02:11:23 +0300 Subject: [PATCH 19/46] virtio_net: delay TX callbacks Ask for delayed callbacks on TX ring full, to give the other side more of a chance to make progress. Signed-off-by: Michael S. Tsirkin Acked-by: David S. Miller Signed-off-by: Rusty Russell --- drivers/net/virtio_net.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c index 0cb0b0632672..f6853247a620 100644 --- a/drivers/net/virtio_net.c +++ b/drivers/net/virtio_net.c @@ -609,7 +609,7 @@ static netdev_tx_t start_xmit(struct sk_buff *skb, struct net_device *dev) * before it gets out of hand. Naturally, this wastes entries. */ if (capacity < 2+MAX_SKB_FRAGS) { netif_stop_queue(dev); - if (unlikely(!virtqueue_enable_cb(vi->svq))) { + if (unlikely(!virtqueue_enable_cb_delayed(vi->svq))) { /* More just got used, free them then recheck. */ capacity += free_old_xmit_skbs(vi); if (capacity >= 2+MAX_SKB_FRAGS) { From 3cebde2413ba42504cf2c10ec1d47582912435cd Mon Sep 17 00:00:00 2001 From: Sage Weil Date: Sun, 29 May 2011 21:20:59 -0700 Subject: [PATCH 20/46] vfs: shrink_dcache_parent before rmdir, dir rename The dentry_unhash push-down series missed that shink_dcache_parent needs to be called prior to rmdir or dir rename to clear DCACHE_REFERENCED and allow efficient dentry reclaim. Reported-by: Dave Chinner Signed-off-by: Sage Weil Signed-off-by: Al Viro --- fs/namei.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/namei.c b/fs/namei.c index 1ab641f2e78e..e2e4e8d032ee 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -2579,6 +2579,7 @@ int vfs_rmdir(struct inode *dir, struct dentry *dentry) if (error) goto out; + shrink_dcache_parent(dentry); error = dir->i_op->rmdir(dir, dentry); if (error) goto out; @@ -2993,6 +2994,8 @@ static int vfs_rename_dir(struct inode *old_dir, struct dentry *old_dentry, if (d_mountpoint(old_dentry) || d_mountpoint(new_dentry)) goto out; + if (target) + shrink_dcache_parent(new_dentry); error = old_dir->i_op->rename(old_dir, old_dentry, new_dir, new_dentry); if (error) goto out; From c7427d23f7ed695ac226dbe3a84d7f19091d34ce Mon Sep 17 00:00:00 2001 From: Al Viro Date: Mon, 30 May 2011 01:50:53 -0400 Subject: [PATCH 21/46] autofs4: bogus dentry_unhash() added in ->unlink() Signed-off-by: Al Viro --- fs/autofs4/root.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index 87d95a8cddbc..f55ae23b137e 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -583,8 +583,6 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EACCES; - dentry_unhash(dentry); - if (atomic_dec_and_test(&ino->count)) { p_ino = autofs4_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) From 598e887d8b01655780c81cc86a9e7820ed091580 Mon Sep 17 00:00:00 2001 From: Borislav Petkov Date: Mon, 30 May 2011 11:38:06 +0200 Subject: [PATCH 22/46] x86 idle: Fix mwait deprecation warning message Fix: arch/x86/kernel/process.c:645:1: warning: unknown escape sequence '\i' due to missing escape backslash, introduced by this commit: 5d4c47e0195b: x86 idle: deprecate mwait_idle() and "idle=mwait" cmdline param Signed-off-by: Borislav Petkov Cc: Len Brown Cc: Linus Torvalds Link: http://lkml.kernel.org/r/1306748286-24701-1-git-send-email-bp@alien8.de Signed-off-by: Ingo Molnar --- arch/x86/kernel/process.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/process.c b/arch/x86/kernel/process.c index 426a5b66f7e4..3d83a71af7d5 100644 --- a/arch/x86/kernel/process.c +++ b/arch/x86/kernel/process.c @@ -642,7 +642,7 @@ static int __init idle_setup(char *str) boot_option_idle_override = IDLE_POLL; } else if (!strcmp(str, "mwait")) { boot_option_idle_override = IDLE_FORCE_MWAIT; - WARN_ONCE(1, "\idle=mwait\" will be removed in 2012\"\n"); + WARN_ONCE(1, "\"idle=mwait\" will be removed in 2012\"\n"); } else if (!strcmp(str, "halt")) { /* * When the boot option of idle=halt is added, halt is From 4f3c125c7420c85eaff627145557e392a871922d Mon Sep 17 00:00:00 2001 From: Avi Kivity Date: Mon, 30 May 2011 08:23:57 -0400 Subject: [PATCH 23/46] x86: Fix mwait_play_dead() faulting on mwait-incapable cpus MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit A logic error in mwait_play_dead() causes the kernel to use mwait even on cpus which don't support it, such as KVM virtual cpus. Introduced by: 349c004e3d31: x86: A fast way to check capabilities of the current cpu Fixes: https://bugzilla.kernel.org/show_bug.cgi?id=36222 Reported-by: Török Edwin Signed-off-by: Avi Kivity Cc: Christoph Lameter Cc: Tejun Heo Link: http://lkml.kernel.org/r/1306758237-9327-1-git-send-email-avi@redhat.com Signed-off-by: Ingo Molnar --- arch/x86/kernel/smpboot.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c index eefd96765e79..33a0c11797de 100644 --- a/arch/x86/kernel/smpboot.c +++ b/arch/x86/kernel/smpboot.c @@ -1332,7 +1332,7 @@ static inline void mwait_play_dead(void) void *mwait_ptr; struct cpuinfo_x86 *c = __this_cpu_ptr(&cpu_info); - if (!this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c)) + if (!(this_cpu_has(X86_FEATURE_MWAIT) && mwait_usable(c))) return; if (!this_cpu_has(X86_FEATURE_CLFLSH)) return; From d72bce0e67e8afc6eb959f656013cbb577426f1e Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2011 13:34:51 +0200 Subject: [PATCH 24/46] rcu: Cure load woes Commit cc3ce5176d83 (rcu: Start RCU kthreads in TASK_INTERRUPTIBLE state) fudges a sleeping task' state, resulting in the scheduler seeing a TASK_UNINTERRUPTIBLE task going to sleep, but a TASK_INTERRUPTIBLE task waking up. The result is unbalanced load calculation. The problem that patch tried to address is that the RCU threads could stay in UNINTERRUPTIBLE state for quite a while and triggering the hung task detector due to on-demand wake-ups. Cure the problem differently by always giving the tasks at least one wake-up once the CPU is fully up and running, this will kick them out of the initial UNINTERRUPTIBLE state and into the regular INTERRUPTIBLE wait state. [ The alternative would be teaching kthread_create() to start threads as INTERRUPTIBLE but that needs a tad more thought. ] Reported-by: Damien Wyart Signed-off-by: Peter Zijlstra Acked-by: Paul E. McKenney Link: http://lkml.kernel.org/r/1306755291.1200.2872.camel@twins Signed-off-by: Ingo Molnar --- kernel/rcutree.c | 54 +++++++++++++++++++++++++++++++++++------ kernel/rcutree_plugin.h | 11 ++++++++- 2 files changed, 56 insertions(+), 9 deletions(-) diff --git a/kernel/rcutree.c b/kernel/rcutree.c index 77a7671dd147..89419ff92e99 100644 --- a/kernel/rcutree.c +++ b/kernel/rcutree.c @@ -1648,7 +1648,6 @@ static int __cpuinit rcu_spawn_one_cpu_kthread(int cpu) if (IS_ERR(t)) return PTR_ERR(t); kthread_bind(t, cpu); - set_task_state(t, TASK_INTERRUPTIBLE); per_cpu(rcu_cpu_kthread_cpu, cpu) = cpu; WARN_ON_ONCE(per_cpu(rcu_cpu_kthread_task, cpu) != NULL); per_cpu(rcu_cpu_kthread_task, cpu) = t; @@ -1756,7 +1755,6 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); - set_task_state(t, TASK_INTERRUPTIBLE); rnp->node_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = 99; @@ -1765,6 +1763,8 @@ static int __cpuinit rcu_spawn_one_node_kthread(struct rcu_state *rsp, return rcu_spawn_one_boost_kthread(rsp, rnp, rnp_index); } +static void rcu_wake_one_boost_kthread(struct rcu_node *rnp); + /* * Spawn all kthreads -- called as soon as the scheduler is running. */ @@ -1772,18 +1772,30 @@ static int __init rcu_spawn_kthreads(void) { int cpu; struct rcu_node *rnp; + struct task_struct *t; rcu_kthreads_spawnable = 1; for_each_possible_cpu(cpu) { per_cpu(rcu_cpu_has_work, cpu) = 0; - if (cpu_online(cpu)) + if (cpu_online(cpu)) { (void)rcu_spawn_one_cpu_kthread(cpu); + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t) + wake_up_process(t); + } } rnp = rcu_get_root(rcu_state); (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + if (rnp->node_kthread_task) + wake_up_process(rnp->node_kthread_task); if (NUM_RCU_NODES > 1) { - rcu_for_each_leaf_node(rcu_state, rnp) + rcu_for_each_leaf_node(rcu_state, rnp) { (void)rcu_spawn_one_node_kthread(rcu_state, rnp); + t = rnp->node_kthread_task; + if (t) + wake_up_process(t); + rcu_wake_one_boost_kthread(rnp); + } } return 0; } @@ -2188,14 +2200,14 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp, int preemptible) raw_spin_unlock_irqrestore(&rsp->onofflock, flags); } -static void __cpuinit rcu_online_cpu(int cpu) +static void __cpuinit rcu_prepare_cpu(int cpu) { rcu_init_percpu_data(cpu, &rcu_sched_state, 0); rcu_init_percpu_data(cpu, &rcu_bh_state, 0); rcu_preempt_init_percpu_data(cpu); } -static void __cpuinit rcu_online_kthreads(int cpu) +static void __cpuinit rcu_prepare_kthreads(int cpu) { struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); struct rcu_node *rnp = rdp->mynode; @@ -2208,6 +2220,31 @@ static void __cpuinit rcu_online_kthreads(int cpu) } } +/* + * kthread_create() creates threads in TASK_UNINTERRUPTIBLE state, + * but the RCU threads are woken on demand, and if demand is low this + * could be a while triggering the hung task watchdog. + * + * In order to avoid this, poke all tasks once the CPU is fully + * up and running. + */ +static void __cpuinit rcu_online_kthreads(int cpu) +{ + struct rcu_data *rdp = per_cpu_ptr(rcu_state->rda, cpu); + struct rcu_node *rnp = rdp->mynode; + struct task_struct *t; + + t = per_cpu(rcu_cpu_kthread_task, cpu); + if (t) + wake_up_process(t); + + t = rnp->node_kthread_task; + if (t) + wake_up_process(t); + + rcu_wake_one_boost_kthread(rnp); +} + /* * Handle CPU online/offline notification events. */ @@ -2221,10 +2258,11 @@ static int __cpuinit rcu_cpu_notify(struct notifier_block *self, switch (action) { case CPU_UP_PREPARE: case CPU_UP_PREPARE_FROZEN: - rcu_online_cpu(cpu); - rcu_online_kthreads(cpu); + rcu_prepare_cpu(cpu); + rcu_prepare_kthreads(cpu); break; case CPU_ONLINE: + rcu_online_kthreads(cpu); case CPU_DOWN_FAILED: rcu_node_kthread_setaffinity(rnp, -1); rcu_cpu_kthread_setrt(cpu, 1); diff --git a/kernel/rcutree_plugin.h b/kernel/rcutree_plugin.h index a767b7dac365..c8bff3099a89 100644 --- a/kernel/rcutree_plugin.h +++ b/kernel/rcutree_plugin.h @@ -1295,7 +1295,6 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, if (IS_ERR(t)) return PTR_ERR(t); raw_spin_lock_irqsave(&rnp->lock, flags); - set_task_state(t, TASK_INTERRUPTIBLE); rnp->boost_kthread_task = t; raw_spin_unlock_irqrestore(&rnp->lock, flags); sp.sched_priority = RCU_KTHREAD_PRIO; @@ -1303,6 +1302,12 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } +static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp) +{ + if (rnp->boost_kthread_task) + wake_up_process(rnp->boost_kthread_task); +} + #else /* #ifdef CONFIG_RCU_BOOST */ static void rcu_initiate_boost(struct rcu_node *rnp, unsigned long flags) @@ -1326,6 +1331,10 @@ static int __cpuinit rcu_spawn_one_boost_kthread(struct rcu_state *rsp, return 0; } +static void __cpuinit rcu_wake_one_boost_kthread(struct rcu_node *rnp) +{ +} + #endif /* #else #ifdef CONFIG_RCU_BOOST */ #ifndef CONFIG_SMP From 339dedf709e21d5718d6596750166f70e8bed40a Mon Sep 17 00:00:00 2001 From: Benjamin Herrenschmidt Date: Tue, 31 May 2011 18:01:23 +1000 Subject: [PATCH 25/46] powerpc/pmac: Don't register pmac PIC syscore ops when HW not present The Apple custom PIC only exist in some earlier machine models, anything with an MPIC will crash on suspend if we register those syscore ops unconditionally. This is a regression caused by commit f5a592f7d74e ("PM / PowerPC: Use struct syscore_ops instead of sysdevs for PM") Signed-off-by: Benjamin Herrenschmidt Signed-off-by: Linus Torvalds --- arch/powerpc/platforms/powermac/pic.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/arch/powerpc/platforms/powermac/pic.c b/arch/powerpc/platforms/powermac/pic.c index 9089b0421191..7667db448aa7 100644 --- a/arch/powerpc/platforms/powermac/pic.c +++ b/arch/powerpc/platforms/powermac/pic.c @@ -715,7 +715,8 @@ static struct syscore_ops pmacpic_syscore_ops = { static int __init init_pmacpic_syscore(void) { - register_syscore_ops(&pmacpic_syscore_ops); + if (pmac_irq_hw[0]) + register_syscore_ops(&pmacpic_syscore_ops); return 0; } From 74c355fbdfedd3820046dba4f537876cea54c207 Mon Sep 17 00:00:00 2001 From: Peter Zijlstra Date: Mon, 30 May 2011 16:48:06 +0200 Subject: [PATCH 26/46] perf, cgroups: Fix up for new API Ben changed the cgroup API in commit f780bdb7c1c (cgroups: add per-thread subsystem callbacks) in an incompatible way, but forgot to convert the perf cgroup bits. Avoid compile warnings and runtime splats and convert perf too ;-) Acked-by: Ben Blum Cc: Stephane Eranian Signed-off-by: Peter Zijlstra Link: http://lkml.kernel.org/r/1306767651.1200.2990.camel@twins Signed-off-by: Ingo Molnar --- kernel/events/core.c | 22 ++++------------------ 1 file changed, 4 insertions(+), 18 deletions(-) diff --git a/kernel/events/core.c b/kernel/events/core.c index c09767f7db3e..8a15944bf9d2 100644 --- a/kernel/events/core.c +++ b/kernel/events/core.c @@ -7394,26 +7394,12 @@ static int __perf_cgroup_move(void *info) return 0; } -static void perf_cgroup_move(struct task_struct *task) +static void +perf_cgroup_attach_task(struct cgroup *cgrp, struct task_struct *task) { task_function_call(task, __perf_cgroup_move, task); } -static void perf_cgroup_attach(struct cgroup_subsys *ss, struct cgroup *cgrp, - struct cgroup *old_cgrp, struct task_struct *task, - bool threadgroup) -{ - perf_cgroup_move(task); - if (threadgroup) { - struct task_struct *c; - rcu_read_lock(); - list_for_each_entry_rcu(c, &task->thread_group, thread_group) { - perf_cgroup_move(c); - } - rcu_read_unlock(); - } -} - static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, struct cgroup *old_cgrp, struct task_struct *task) { @@ -7425,7 +7411,7 @@ static void perf_cgroup_exit(struct cgroup_subsys *ss, struct cgroup *cgrp, if (!(task->flags & PF_EXITING)) return; - perf_cgroup_move(task); + perf_cgroup_attach_task(cgrp, task); } struct cgroup_subsys perf_subsys = { @@ -7434,6 +7420,6 @@ struct cgroup_subsys perf_subsys = { .create = perf_cgroup_create, .destroy = perf_cgroup_destroy, .exit = perf_cgroup_exit, - .attach = perf_cgroup_attach, + .attach_task = perf_cgroup_attach_task, }; #endif /* CONFIG_CGROUP_PERF */ From 83caba8436b28bd9081013fd840424983127d4ca Mon Sep 17 00:00:00 2001 From: Tony Luck Date: Tue, 31 May 2011 10:09:24 -0700 Subject: [PATCH 27/46] [IA64] wire up sendmmsg() syscall for Itanium Add entries in unistd.h and entry.S to make this new syscall visible. Signed-off-by: Tony Luck --- arch/ia64/include/asm/unistd.h | 3 ++- arch/ia64/kernel/entry.S | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/arch/ia64/include/asm/unistd.h b/arch/ia64/include/asm/unistd.h index 1cf0f496f744..7c928da35b17 100644 --- a/arch/ia64/include/asm/unistd.h +++ b/arch/ia64/include/asm/unistd.h @@ -320,11 +320,12 @@ #define __NR_clock_adjtime 1328 #define __NR_syncfs 1329 #define __NR_setns 1330 +#define __NR_sendmmsg 1331 #ifdef __KERNEL__ -#define NR_syscalls 307 /* length of syscall table */ +#define NR_syscalls 308 /* length of syscall table */ /* * The following defines stop scripts/checksyscalls.sh from complaining about diff --git a/arch/ia64/kernel/entry.S b/arch/ia64/kernel/entry.S index 9ca80193cd4e..97dd2abdeb1a 100644 --- a/arch/ia64/kernel/entry.S +++ b/arch/ia64/kernel/entry.S @@ -1776,6 +1776,7 @@ sys_call_table: data8 sys_clock_adjtime data8 sys_syncfs data8 sys_setns // 1330 + data8 sys_sendmmsg .org sys_call_table + 8*NR_syscalls // guard against failures to increase NR_syscalls #endif /* __IA64_ASM_PARAVIRTUALIZED_NATIVE */ From a5b2c5b2ad5853591a6cac6134cd0f599a720865 Mon Sep 17 00:00:00 2001 From: Kees Cook Date: Tue, 31 May 2011 11:31:41 -0700 Subject: [PATCH 28/46] AppArmor: fix oops in apparmor_setprocattr When invalid parameters are passed to apparmor_setprocattr a NULL deref oops occurs when it tries to record an audit message. This is because it is passing NULL for the profile parameter for aa_audit. But aa_audit now requires that the profile passed is not NULL. Fix this by passing the current profile on the task that is trying to setprocattr. Signed-off-by: Kees Cook Signed-off-by: John Johansen Cc: stable@kernel.org Signed-off-by: James Morris --- security/apparmor/lsm.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/security/apparmor/lsm.c b/security/apparmor/lsm.c index ae3a698415e6..ec1bcecf2cda 100644 --- a/security/apparmor/lsm.c +++ b/security/apparmor/lsm.c @@ -593,7 +593,8 @@ static int apparmor_setprocattr(struct task_struct *task, char *name, sa.aad.op = OP_SETPROCATTR; sa.aad.info = name; sa.aad.error = -EINVAL; - return aa_audit(AUDIT_APPARMOR_DENIED, NULL, GFP_KERNEL, + return aa_audit(AUDIT_APPARMOR_DENIED, + __aa_current_profile(), GFP_KERNEL, &sa, NULL); } } else if (strcmp(name, "exec") == 0) { From 603d04b2010976a52f62b7633f9999d104046900 Mon Sep 17 00:00:00 2001 From: Mike Frysinger Date: Sat, 28 May 2011 10:04:25 -0400 Subject: [PATCH 29/46] kgdbts: only use new asm-generic/ptrace.h api when needed The new instruction_pointer_set helper is defined for people who have converted to asm-generic/ptrace.h, so don't use it generally unless the arch needs it (in which case it has been converted). This should fix building of kgdb tests for arches not yet converted. Signed-off-by: Mike Frysinger Acked-by: Stephen Rothwell Cc: Jason Wessel Cc: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/misc/kgdbts.c | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/drivers/misc/kgdbts.c b/drivers/misc/kgdbts.c index b0c56313dbbb..8cebec5e85ee 100644 --- a/drivers/misc/kgdbts.c +++ b/drivers/misc/kgdbts.c @@ -304,7 +304,10 @@ static int check_and_rewind_pc(char *put_str, char *arg) return 1; } /* Readjust the instruction pointer if needed */ - instruction_pointer_set(&kgdbts_regs, ip + offset); + ip += offset; +#ifdef GDB_ADJUSTS_BREAK_OFFSET + instruction_pointer_set(&kgdbts_regs, ip); +#endif return 0; } From 63da029015b5255915cd6d61f19ffc276ad4635d Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Mon, 23 May 2011 11:37:09 -0700 Subject: [PATCH 30/46] mtd: fix physmap.h warnings Fix build warnings in physmap.h: include/linux/mtd/physmap.h:25: warning: 'struct platform_device' declared inside parameter list include/linux/mtd/physmap.h:25: warning: its scope is only this definition or declaration, which is probably not what you want include/linux/mtd/physmap.h:26: warning: 'struct platform_device' declared inside parameter list include/linux/mtd/physmap.h:27: warning: 'struct platform_device' declared inside parameter list Signed-off-by: Randy Dunlap Signed-off-by: David Woodhouse --- include/linux/mtd/physmap.h | 1 + 1 file changed, 1 insertion(+) diff --git a/include/linux/mtd/physmap.h b/include/linux/mtd/physmap.h index d40bfa1d9c91..e5f21d293c70 100644 --- a/include/linux/mtd/physmap.h +++ b/include/linux/mtd/physmap.h @@ -19,6 +19,7 @@ #include struct map_info; +struct platform_device; struct physmap_flash_data { unsigned int width; From 6dd9a7c73761a8a5f5475d5cfdc15368a0f4c06d Mon Sep 17 00:00:00 2001 From: Youquan Song Date: Wed, 25 May 2011 19:13:49 +0100 Subject: [PATCH 31/46] intel-iommu: Enable super page (2MiB, 1GiB, etc.) support There are no externally-visible changes with this. In the loop in the internal __domain_mapping() function, we simply detect if we are mapping: - size >= 2MiB, and - virtual address aligned to 2MiB, and - physical address aligned to 2MiB, and - on hardware that supports superpages. (and likewise for larger superpages). We automatically use a superpage for such mappings. We never have to worry about *breaking* superpages, since we trust that we will always *unmap* the same range that was mapped. So all we need to do is ensure that dma_pte_clear_range() will also cope with superpages. Adjust pfn_to_dma_pte() to take a superpage 'level' as an argument, so it can return a PTE at the appropriate level rather than always extending the page tables all the way down to level 1. Again, this is simplified by the fact that we should never encounter existing small pages when we're creating a mapping; any old mapping that used the same virtual range will have been entirely removed and its obsolete page tables freed. Provide an 'intel_iommu=sp_off' argument on the command line as a chicken bit. Not that it should ever be required. == The original commit seen in the iommu-2.6.git was Youquan's implementation (and completion) of my own half-baked code which I'd typed into an email. Followed by half a dozen subsequent 'fixes'. I've taken the unusual step of rewriting history and collapsing the original commits in order to keep the main history simpler, and make life easier for the people who are going to have to backport this to older kernels. And also so I can give it a more coherent commit comment which (hopefully) gives a better explanation of what's going on. The original sequence of commits leading to identical code was: Youquan Song (3): intel-iommu: super page support intel-iommu: Fix superpage alignment calculation error intel-iommu: Fix superpage level calculation error in dma_pfn_level_pte() David Woodhouse (4): intel-iommu: Precalculate superpage support for dmar_domain intel-iommu: Fix hardware_largepage_caps() intel-iommu: Fix inappropriate use of superpages in __domain_mapping() intel-iommu: Fix phys_pfn in __domain_mapping for sglist pages Signed-off-by: Youquan Song Signed-off-by: David Woodhouse --- Documentation/kernel-parameters.txt | 5 +- drivers/pci/intel-iommu.c | 157 ++++++++++++++++++++++++---- include/linux/dma_remapping.h | 4 + 3 files changed, 147 insertions(+), 19 deletions(-) diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt index cc85a9278190..d005487c1a22 100644 --- a/Documentation/kernel-parameters.txt +++ b/Documentation/kernel-parameters.txt @@ -999,7 +999,10 @@ bytes respectively. Such letter suffixes can also be entirely omitted. With this option on every unmap_single operation will result in a hardware IOTLB flush operation as opposed to batching them for performance. - + sp_off [Default Off] + By default, super page will be supported if Intel IOMMU + has the capability. With this option, super page will + not be supported. intremap= [X86-64, Intel-IOMMU] Format: { on (default) | off | nosid } on enable Interrupt Remapping (default) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 395f253c0494..e6fe1994f9d3 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -115,6 +115,11 @@ static inline unsigned long align_to_level(unsigned long pfn, int level) return (pfn + level_size(level) - 1) & level_mask(level); } +static inline unsigned long lvl_to_nr_pages(unsigned int lvl) +{ + return 1 << ((lvl - 1) * LEVEL_STRIDE); +} + /* VT-d pages must always be _smaller_ than MM pages. Otherwise things are never going to work. */ static inline unsigned long dma_to_mm_pfn(unsigned long dma_pfn) @@ -343,6 +348,9 @@ struct dmar_domain { int iommu_coherency;/* indicate coherency of iommu access */ int iommu_snooping; /* indicate snooping control feature*/ int iommu_count; /* reference count of iommu */ + int iommu_superpage;/* Level of superpages supported: + 0 == 4KiB (no superpages), 1 == 2MiB, + 2 == 1GiB, 3 == 512GiB, 4 == 1TiB */ spinlock_t iommu_lock; /* protect iommu set in domain */ u64 max_addr; /* maximum mapped address */ }; @@ -392,6 +400,7 @@ int dmar_disabled = 1; static int dmar_map_gfx = 1; static int dmar_forcedac; static int intel_iommu_strict; +static int intel_iommu_superpage = 1; #define DUMMY_DEVICE_DOMAIN_INFO ((struct device_domain_info *)(-1)) static DEFINE_SPINLOCK(device_domain_lock); @@ -422,6 +431,10 @@ static int __init intel_iommu_setup(char *str) printk(KERN_INFO "Intel-IOMMU: disable batched IOTLB flush\n"); intel_iommu_strict = 1; + } else if (!strncmp(str, "sp_off", 6)) { + printk(KERN_INFO + "Intel-IOMMU: disable supported super page\n"); + intel_iommu_superpage = 0; } str += strcspn(str, ","); @@ -560,11 +573,32 @@ static void domain_update_iommu_snooping(struct dmar_domain *domain) } } +static void domain_update_iommu_superpage(struct dmar_domain *domain) +{ + int i, mask = 0xf; + + if (!intel_iommu_superpage) { + domain->iommu_superpage = 0; + return; + } + + domain->iommu_superpage = 4; /* 1TiB */ + + for_each_set_bit(i, &domain->iommu_bmp, g_num_of_iommus) { + mask |= cap_super_page_val(g_iommus[i]->cap); + if (!mask) { + break; + } + } + domain->iommu_superpage = fls(mask); +} + /* Some capabilities may be different across iommus */ static void domain_update_iommu_cap(struct dmar_domain *domain) { domain_update_iommu_coherency(domain); domain_update_iommu_snooping(domain); + domain_update_iommu_superpage(domain); } static struct intel_iommu *device_to_iommu(int segment, u8 bus, u8 devfn) @@ -694,23 +728,31 @@ out: } static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, - unsigned long pfn) + unsigned long pfn, int large_level) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; struct dma_pte *parent, *pte = NULL; int level = agaw_to_level(domain->agaw); - int offset; + int offset, target_level; BUG_ON(!domain->pgd); BUG_ON(addr_width < BITS_PER_LONG && pfn >> addr_width); parent = domain->pgd; + /* Search pte */ + if (!large_level) + target_level = 1; + else + target_level = large_level; + while (level > 0) { void *tmp_page; offset = pfn_level_offset(pfn, level); pte = &parent[offset]; - if (level == 1) + if (!large_level && (pte->val & DMA_PTE_LARGE_PAGE)) + break; + if (level == target_level) break; if (!dma_pte_present(pte)) { @@ -738,10 +780,11 @@ static struct dma_pte *pfn_to_dma_pte(struct dmar_domain *domain, return pte; } + /* return address's pte at specific level */ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, unsigned long pfn, - int level) + int level, int *large_page) { struct dma_pte *parent, *pte = NULL; int total = agaw_to_level(domain->agaw); @@ -754,8 +797,16 @@ static struct dma_pte *dma_pfn_level_pte(struct dmar_domain *domain, if (level == total) return pte; - if (!dma_pte_present(pte)) + if (!dma_pte_present(pte)) { + *large_page = total; break; + } + + if (pte->val & DMA_PTE_LARGE_PAGE) { + *large_page = total; + return pte; + } + parent = phys_to_virt(dma_pte_addr(pte)); total--; } @@ -768,6 +819,7 @@ static void dma_pte_clear_range(struct dmar_domain *domain, unsigned long last_pfn) { int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; + unsigned int large_page = 1; struct dma_pte *first_pte, *pte; BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); @@ -776,14 +828,15 @@ static void dma_pte_clear_range(struct dmar_domain *domain, /* we don't need lock here; nobody else touches the iova range */ do { - first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1); + large_page = 1; + first_pte = pte = dma_pfn_level_pte(domain, start_pfn, 1, &large_page); if (!pte) { - start_pfn = align_to_level(start_pfn + 1, 2); + start_pfn = align_to_level(start_pfn + 1, large_page + 1); continue; } - do { + do { dma_clear_pte(pte); - start_pfn++; + start_pfn += lvl_to_nr_pages(large_page); pte++; } while (start_pfn <= last_pfn && !first_pte_in_page(pte)); @@ -803,6 +856,7 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, int total = agaw_to_level(domain->agaw); int level; unsigned long tmp; + int large_page = 2; BUG_ON(addr_width < BITS_PER_LONG && start_pfn >> addr_width); BUG_ON(addr_width < BITS_PER_LONG && last_pfn >> addr_width); @@ -818,7 +872,10 @@ static void dma_pte_free_pagetable(struct dmar_domain *domain, return; do { - first_pte = pte = dma_pfn_level_pte(domain, tmp, level); + large_page = level; + first_pte = pte = dma_pfn_level_pte(domain, tmp, level, &large_page); + if (large_page > level) + level = large_page + 1; if (!pte) { tmp = align_to_level(tmp + 1, level + 1); continue; @@ -1402,6 +1459,7 @@ static int domain_init(struct dmar_domain *domain, int guest_width) else domain->iommu_snooping = 0; + domain->iommu_superpage = fls(cap_super_page_val(iommu->cap)); domain->iommu_count = 1; domain->nid = iommu->node; @@ -1657,6 +1715,34 @@ static inline unsigned long aligned_nrpages(unsigned long host_addr, return PAGE_ALIGN(host_addr + size) >> VTD_PAGE_SHIFT; } +/* Return largest possible superpage level for a given mapping */ +static inline int hardware_largepage_caps(struct dmar_domain *domain, + unsigned long iov_pfn, + unsigned long phy_pfn, + unsigned long pages) +{ + int support, level = 1; + unsigned long pfnmerge; + + support = domain->iommu_superpage; + + /* To use a large page, the virtual *and* physical addresses + must be aligned to 2MiB/1GiB/etc. Lower bits set in either + of them will mean we have to use smaller pages. So just + merge them and check both at once. */ + pfnmerge = iov_pfn | phy_pfn; + + while (support && !(pfnmerge & ~VTD_STRIDE_MASK)) { + pages >>= VTD_STRIDE_SHIFT; + if (!pages) + break; + pfnmerge >>= VTD_STRIDE_SHIFT; + level++; + support--; + } + return level; +} + static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, struct scatterlist *sg, unsigned long phys_pfn, unsigned long nr_pages, int prot) @@ -1665,6 +1751,8 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, phys_addr_t uninitialized_var(pteval); int addr_width = agaw_to_width(domain->agaw) - VTD_PAGE_SHIFT; unsigned long sg_res; + unsigned int largepage_lvl = 0; + unsigned long lvl_pages = 0; BUG_ON(addr_width < BITS_PER_LONG && (iov_pfn + nr_pages - 1) >> addr_width); @@ -1680,7 +1768,7 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, pteval = ((phys_addr_t)phys_pfn << VTD_PAGE_SHIFT) | prot; } - while (nr_pages--) { + while (nr_pages > 0) { uint64_t tmp; if (!sg_res) { @@ -1688,11 +1776,21 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, sg->dma_address = ((dma_addr_t)iov_pfn << VTD_PAGE_SHIFT) + sg->offset; sg->dma_length = sg->length; pteval = page_to_phys(sg_page(sg)) | prot; + phys_pfn = pteval >> VTD_PAGE_SHIFT; } + if (!pte) { - first_pte = pte = pfn_to_dma_pte(domain, iov_pfn); + largepage_lvl = hardware_largepage_caps(domain, iov_pfn, phys_pfn, sg_res); + + first_pte = pte = pfn_to_dma_pte(domain, iov_pfn, largepage_lvl); if (!pte) return -ENOMEM; + /* It is large page*/ + if (largepage_lvl > 1) + pteval |= DMA_PTE_LARGE_PAGE; + else + pteval &= ~(uint64_t)DMA_PTE_LARGE_PAGE; + } /* We don't need lock here, nobody else * touches the iova range @@ -1708,16 +1806,38 @@ static int __domain_mapping(struct dmar_domain *domain, unsigned long iov_pfn, } WARN_ON(1); } + + lvl_pages = lvl_to_nr_pages(largepage_lvl); + + BUG_ON(nr_pages < lvl_pages); + BUG_ON(sg_res < lvl_pages); + + nr_pages -= lvl_pages; + iov_pfn += lvl_pages; + phys_pfn += lvl_pages; + pteval += lvl_pages * VTD_PAGE_SIZE; + sg_res -= lvl_pages; + + /* If the next PTE would be the first in a new page, then we + need to flush the cache on the entries we've just written. + And then we'll need to recalculate 'pte', so clear it and + let it get set again in the if (!pte) block above. + + If we're done (!nr_pages) we need to flush the cache too. + + Also if we've been setting superpages, we may need to + recalculate 'pte' and switch back to smaller pages for the + end of the mapping, if the trailing size is not enough to + use another superpage (i.e. sg_res < lvl_pages). */ pte++; - if (!nr_pages || first_pte_in_page(pte)) { + if (!nr_pages || first_pte_in_page(pte) || + (largepage_lvl > 1 && sg_res < lvl_pages)) { domain_flush_cache(domain, first_pte, (void *)pte - (void *)first_pte); pte = NULL; } - iov_pfn++; - pteval += VTD_PAGE_SIZE; - sg_res--; - if (!sg_res) + + if (!sg_res && nr_pages) sg = sg_next(sg); } return 0; @@ -3527,6 +3647,7 @@ static int md_domain_init(struct dmar_domain *domain, int guest_width) domain->iommu_count = 0; domain->iommu_coherency = 0; domain->iommu_snooping = 0; + domain->iommu_superpage = 0; domain->max_addr = 0; domain->nid = -1; @@ -3742,7 +3863,7 @@ static phys_addr_t intel_iommu_iova_to_phys(struct iommu_domain *domain, struct dma_pte *pte; u64 phys = 0; - pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT); + pte = pfn_to_dma_pte(dmar_domain, iova >> VTD_PAGE_SHIFT, 0); if (pte) phys = dma_pte_addr(pte); diff --git a/include/linux/dma_remapping.h b/include/linux/dma_remapping.h index 5619f8522738..bbd8661b3473 100644 --- a/include/linux/dma_remapping.h +++ b/include/linux/dma_remapping.h @@ -9,8 +9,12 @@ #define VTD_PAGE_MASK (((u64)-1) << VTD_PAGE_SHIFT) #define VTD_PAGE_ALIGN(addr) (((addr) + VTD_PAGE_SIZE - 1) & VTD_PAGE_MASK) +#define VTD_STRIDE_SHIFT (9) +#define VTD_STRIDE_MASK (((u64)-1) << VTD_STRIDE_SHIFT) + #define DMA_PTE_READ (1) #define DMA_PTE_WRITE (2) +#define DMA_PTE_LARGE_PAGE (1 << 7) #define DMA_PTE_SNP (1 << 11) #define CONTEXT_TT_MULTI_LEVEL 0 From 9b4554b21ed07e8556405510638171f0c787742a Mon Sep 17 00:00:00 2001 From: Alex Williamson Date: Tue, 24 May 2011 12:19:04 -0400 Subject: [PATCH 32/46] intel-iommu: Only unlink device domains from iommu Commit a97590e5 added unlinking domains from iommus to reciprocate the iommu from domains unlinking that was already done. We actually want to only do this for device domains and never for the static identity map domain or VM domains. The SI domain is special and never freed, while VM domain->id lives in their own special address space, separate from iommu->domain_ids. In the current code, a VM can get domain->id zero, then mark that domain unused when unbound from pci-stub. This leads to DMAR write faults when the device is re-bound to the host driver. Signed-off-by: Alex Williamson Cc: stable@kernel.org Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index e6fe1994f9d3..4eaec2fa1369 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3561,10 +3561,13 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain, domain_update_iommu_cap(domain); spin_unlock_irqrestore(&domain->iommu_lock, tmp_flags); - spin_lock_irqsave(&iommu->lock, tmp_flags); - clear_bit(domain->id, iommu->domain_ids); - iommu->domains[domain->id] = NULL; - spin_unlock_irqrestore(&iommu->lock, tmp_flags); + if (!(domain->flags & DOMAIN_FLAG_VIRTUAL_MACHINE) && + !(domain->flags & DOMAIN_FLAG_STATIC_IDENTITY)) { + spin_lock_irqsave(&iommu->lock, tmp_flags); + clear_bit(domain->id, iommu->domain_ids); + iommu->domains[domain->id] = NULL; + spin_unlock_irqrestore(&iommu->lock, tmp_flags); + } } spin_unlock_irqrestore(&device_domain_lock, flags); From 8fcc5372fbac085199d84a880503ed67aba3fe49 Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sat, 28 May 2011 13:15:02 -0500 Subject: [PATCH 33/46] intel-iommu: Check for identity mapping candidate using system dma mask The identity mapping code appears to make the assumption that if the devices dma_mask is greater than 32bits the device can use identity mapping. But that is not true: take the case where we have a 40bit device in a 44bit architecture. The device can potentially receive a physical address that it will truncate and cause incorrect addresses to be used. Instead check to see if the device's dma_mask is large enough to address the system's dma_mask. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 4eaec2fa1369..dcf051d53b74 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2316,8 +2316,19 @@ static int iommu_should_identity_map(struct pci_dev *pdev, int startup) * Assume that they will -- if they turn out not to be, then we can * take them out of the 1:1 domain later. */ - if (!startup) - return pdev->dma_mask > DMA_BIT_MASK(32); + if (!startup) { + /* + * If the device's dma_mask is less than the system's memory + * size then this is not a candidate for identity mapping. + */ + u64 dma_mask = pdev->dma_mask; + + if (pdev->dev.coherent_dma_mask && + pdev->dev.coherent_dma_mask < dma_mask) + dma_mask = pdev->dev.coherent_dma_mask; + + return dma_mask >= dma_get_required_mask(&pdev->dev); + } return 1; } From cb452a4040bb051d92e85d6e7eb60c11734c1781 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 28 May 2011 13:15:03 -0500 Subject: [PATCH 34/46] intel-iommu: Speed up processing of the identity_mapping function When there are a large count of PCI devices, and the pass through option for iommu is set, much time is spent in the identity_mapping function hunting though the iommu domains to check if a specific device is "identity mapped". Speed up the function by checking the cached info to see if it's mapped to the static identity domain. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index dcf051d53b74..98be0b55ac0b 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2235,10 +2235,10 @@ static int identity_mapping(struct pci_dev *pdev) if (likely(!iommu_identity_mapping)) return 0; + info = pdev->dev.archdata.iommu; + if (info && info != DUMMY_DEVICE_DOMAIN_INFO) + return (info->domain == si_domain); - list_for_each_entry(info, &si_domain->devices, link) - if (info->dev == pdev) - return 1; return 0; } From 1c9fc3d11b84fbd0c4f4aa7855702c2a1f098ebb Mon Sep 17 00:00:00 2001 From: Chris Wright Date: Sat, 28 May 2011 13:15:04 -0500 Subject: [PATCH 35/46] intel-iommu: Dont cache iova above 32bit Mike Travis and Mike Habeck reported an issue where iova allocation would return a range that was larger than a device's dma mask. https://lkml.org/lkml/2011/3/29/423 The dmar initialization code will reserve all PCI MMIO regions and copy those reservations into a domain specific iova tree. It is possible for one of those regions to be above the dma mask of a device. It is typical to allocate iovas with a 32bit mask (despite device's dma mask possibly being larger) and cache the result until it exhausts the lower 32bit address space. Freeing the iova range that is >= the last iova in the lower 32bit range when there is still an iova above the 32bit range will corrupt the cached iova by pointing it to a region that is above 32bit. If that region is also larger than the device's dma mask, a subsequent allocation will return an unusable iova and cause dma failure. Simply don't cache an iova that is above the 32bit caching boundary. Reported-by: Mike Travis Reported-by: Mike Habeck Cc: stable@kernel.org Acked-by: Mike Travis Tested-by: Mike Habeck Signed-off-by: Chris Wright Signed-off-by: David Woodhouse --- drivers/pci/iova.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/pci/iova.c b/drivers/pci/iova.c index 9606e599a475..c5c274ab5c5a 100644 --- a/drivers/pci/iova.c +++ b/drivers/pci/iova.c @@ -63,8 +63,16 @@ __cached_rbnode_delete_update(struct iova_domain *iovad, struct iova *free) curr = iovad->cached32_node; cached_iova = container_of(curr, struct iova, node); - if (free->pfn_lo >= cached_iova->pfn_lo) - iovad->cached32_node = rb_next(&free->node); + if (free->pfn_lo >= cached_iova->pfn_lo) { + struct rb_node *node = rb_next(&free->node); + struct iova *iova = container_of(node, struct iova, node); + + /* only cache if it's below 32bit pfn */ + if (node && iova->pfn_lo < iovad->dma_32bit_pfn) + iovad->cached32_node = node; + else + iovad->cached32_node = NULL; + } } /* Computes the padding size required, to make the From c681d0ba1252954208220ad32248a3e8e2fc98e4 Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 28 May 2011 13:15:05 -0500 Subject: [PATCH 36/46] intel-iommu: Use coherent DMA mask when requested The __intel_map_single function is not honoring the passed in DMA mask. This results in not using the coherent DMA mask when called from intel_alloc_coherent(). Signed-off-by: Mike Travis Acked-by: Chris Wright Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 98be0b55ac0b..5cbab7f19ae0 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2732,8 +2732,7 @@ static dma_addr_t __intel_map_single(struct device *hwdev, phys_addr_t paddr, iommu = domain_get_iommu(domain); size = aligned_nrpages(paddr, size); - iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), - pdev->dma_mask); + iova = intel_alloc_iova(hwdev, domain, dma_to_mm_pfn(size), dma_mask); if (!iova) goto error; From 825507d6d059f1cbe2503e0e5a3926225b983aec Mon Sep 17 00:00:00 2001 From: Mike Travis Date: Sat, 28 May 2011 13:15:06 -0500 Subject: [PATCH 37/46] intel-iommu: Remove Host Bridge devices from identity mapping When using the 1:1 (identity) PCI DMA remapping, PCI Host Bridge devices that do not use the IOMMU causes a kernel panic. Fix that by not inserting those devices into the si_domain. Signed-off-by: Mike Travis Reviewed-by: Mike Habeck Cc: stable@kernel.org Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 5cbab7f19ae0..7f757ce60c5a 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -46,6 +46,8 @@ #define ROOT_SIZE VTD_PAGE_SIZE #define CONTEXT_SIZE VTD_PAGE_SIZE +#define IS_BRIDGE_HOST_DEVICE(pdev) \ + ((pdev->class >> 8) == PCI_CLASS_BRIDGE_HOST) #define IS_GFX_DEVICE(pdev) ((pdev->class >> 16) == PCI_BASE_CLASS_DISPLAY) #define IS_ISA_DEVICE(pdev) ((pdev->class >> 8) == PCI_CLASS_BRIDGE_ISA) #define IS_AZALIA(pdev) ((pdev)->vendor == 0x8086 && (pdev)->device == 0x3a3e) @@ -2343,6 +2345,9 @@ static int __init iommu_prepare_static_identity_mapping(int hw) return -EFAULT; for_each_pci_dev(pdev) { + /* Skip Host/PCI Bridge devices */ + if (IS_BRIDGE_HOST_DEVICE(pdev)) + continue; if (iommu_should_identity_map(pdev, 1)) { printk(KERN_INFO "IOMMU: %s identity mapping for device %s\n", hw ? "hardware" : "software", pci_name(pdev)); From 8519dc4401ddf8a5399f979870bbeeadbc111186 Mon Sep 17 00:00:00 2001 From: Mike Habeck Date: Sat, 28 May 2011 13:15:07 -0500 Subject: [PATCH 38/46] intel-iommu: Add domain check in domain_remove_one_dev_info The comment in domain_remove_one_dev_info() states "No need to compare PCI domain; it has to be the same". But for the si_domain that isn't going to be true, as it consists of all the PCI devices that are identity mapped thus multiple PCI domains can be in si_domain. The code needs to validate the PCI domain too. Signed-off-by: Mike Habeck Signed-off-by: Mike Travis Cc: stable@kernel.org Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index 7f757ce60c5a..b0c96d390802 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -3537,8 +3537,8 @@ static void domain_remove_one_dev_info(struct dmar_domain *domain, spin_lock_irqsave(&device_domain_lock, flags); list_for_each_safe(entry, tmp, &domain->devices) { info = list_entry(entry, struct device_domain_info, link); - /* No need to compare PCI domain; it has to be the same */ - if (info->bus == pdev->bus->number && + if (info->segment == pci_domain_nr(pdev->bus) && + info->bus == pdev->bus->number && info->devfn == pdev->devfn) { list_del(&info->link); list_del(&info->global); From 70e535d1e5d1e4317e894d6228b762cf9c3fbc6a Mon Sep 17 00:00:00 2001 From: David Woodhouse Date: Tue, 31 May 2011 00:22:52 +0100 Subject: [PATCH 39/46] intel-iommu: Fix off-by-one in RMRR setup We were mapping an extra byte (and hence usually an extra page): iommu_prepare_identity_map() expects to be given an 'end' argument which is the last byte to be mapped; not the first byte *not* to be mapped. Signed-off-by: David Woodhouse --- drivers/pci/intel-iommu.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/pci/intel-iommu.c b/drivers/pci/intel-iommu.c index b0c96d390802..a8867bd745e2 100644 --- a/drivers/pci/intel-iommu.c +++ b/drivers/pci/intel-iommu.c @@ -2147,7 +2147,7 @@ static inline int iommu_prepare_rmrr_dev(struct dmar_rmrr_unit *rmrr, if (pdev->dev.archdata.iommu == DUMMY_DEVICE_DOMAIN_INFO) return 0; return iommu_prepare_identity_map(pdev, rmrr->base_address, - rmrr->end_address + 1); + rmrr->end_address); } #ifdef CONFIG_DMAR_FLOPPY_WA @@ -2161,7 +2161,7 @@ static inline void iommu_prepare_isa(void) return; printk(KERN_INFO "IOMMU: Prepare 0-16MiB unity mapping for LPC\n"); - ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024); + ret = iommu_prepare_identity_map(pdev, 0, 16*1024*1024 - 1); if (ret) printk(KERN_ERR "IOMMU: Failed to create 0-16MiB identity map; " From 0f48f2600911d5de6393829e4a9986d4075558b3 Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 2 Jun 2011 05:29:19 +0900 Subject: [PATCH 40/46] block: fix mismerge of the DISK_EVENT_MEDIA_CHANGE removal Jens' back-merge commit 698567f3fa79 ("Merge commit 'v2.6.39' into for-2.6.40/core") was incorrectly done, and re-introduced the DISK_EVENT_MEDIA_CHANGE lines that had been removed earlier in commits - 9fd097b14918 ("block: unexport DISK_EVENT_MEDIA_CHANGE for legacy/fringe drivers") - 7eec77a1816a ("ide: unexport DISK_EVENT_MEDIA_CHANGE for ide-gd and ide-cd") because of conflicts with the "g->flags" updates near-by by commit d4dc210f69bc ("block: don't block events on excl write for non-optical devices") As a result, we re-introduced the hanging behavior due to infinite disk media change reports. Tssk, tssk, people! Don't do back-merges at all, and *definitely* don't do them to hide merge conflicts from me - especially as I'm likely better at merging them than you are, since I do so many merges. Reported-by: Steven Rostedt Cc: Jens Axboe Signed-off-by: Linus Torvalds --- drivers/block/paride/pcd.c | 1 - drivers/cdrom/viocd.c | 1 - drivers/ide/ide-cd.c | 1 - 3 files changed, 3 deletions(-) diff --git a/drivers/block/paride/pcd.c b/drivers/block/paride/pcd.c index a0aabd904a51..46b8136c31bb 100644 --- a/drivers/block/paride/pcd.c +++ b/drivers/block/paride/pcd.c @@ -321,7 +321,6 @@ static void pcd_init_units(void) strcpy(disk->disk_name, cd->name); /* umm... */ disk->fops = &pcd_bdops; disk->flags = GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; - disk->events = DISK_EVENT_MEDIA_CHANGE; } } diff --git a/drivers/cdrom/viocd.c b/drivers/cdrom/viocd.c index ae15a4ddaa9b..7878da89d29e 100644 --- a/drivers/cdrom/viocd.c +++ b/drivers/cdrom/viocd.c @@ -627,7 +627,6 @@ static int viocd_probe(struct vio_dev *vdev, const struct vio_device_id *id) gendisk->fops = &viocd_fops; gendisk->flags = GENHD_FL_CD | GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; - gendisk->events = DISK_EVENT_MEDIA_CHANGE; set_capacity(gendisk, 0); gendisk->private_data = d; d->viocd_disk = gendisk; diff --git a/drivers/ide/ide-cd.c b/drivers/ide/ide-cd.c index 6e5123b1d341..144d27261e43 100644 --- a/drivers/ide/ide-cd.c +++ b/drivers/ide/ide-cd.c @@ -1782,7 +1782,6 @@ static int ide_cd_probe(ide_drive_t *drive) ide_cd_read_toc(drive, &sense); g->fops = &idecd_ops; g->flags |= GENHD_FL_REMOVABLE | GENHD_FL_BLOCK_EVENTS_ON_EXCL_WRITE; - g->events = DISK_EVENT_MEDIA_CHANGE; add_disk(g); return 0; From 1fa7b6a29c61358cc2ca6f64cef4aa0e1a7ca74c Mon Sep 17 00:00:00 2001 From: Linus Torvalds Date: Thu, 2 Jun 2011 06:11:24 +0900 Subject: [PATCH 41/46] Revert "mm: fail GFP_DMA allocations when ZONE_DMA is not configured" This reverts commit a197b59ae6e8bee56fcef37ea2482dc08414e2ac. As rmk says: "Commit a197b59ae6e8 (mm: fail GFP_DMA allocations when ZONE_DMA is not configured) is causing regressions on ARM with various drivers which use GFP_DMA. The behaviour up until now has been to silently ignore that flag when CONFIG_ZONE_DMA is not enabled, and to allocate from the normal zone. However, as a result of the above commit, such allocations now fail which causes drivers to fail. These are regressions compared to the previous kernel version." so just revert it. Requested-by: Russell King Acked-by: Andrew Morton Cc: David Rientjes Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index a4e1db3f1981..4e8985acdab8 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -2247,10 +2247,6 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, if (should_fail_alloc_page(gfp_mask, order)) return NULL; -#ifndef CONFIG_ZONE_DMA - if (WARN_ON_ONCE(gfp_mask & __GFP_DMA)) - return NULL; -#endif /* * Check the zones suitable for the gfp_mask contain at least one From d21cc9f67d689effbfd24bac878bc2c057de8c46 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jun 2011 10:39:43 -0300 Subject: [PATCH 42/46] perf evlist: Remove dependency on debug routines So far we avoided having to link debug.o in the python binding, keep it that way by not using ui__warning() in evlist.c. Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-4wtew8hd3g7ejnlehtspys2t@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/evlist.c | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 50aa34879c33..04c8a60075b3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -12,7 +12,6 @@ #include "evlist.h" #include "evsel.h" #include "util.h" -#include "debug.h" #include @@ -257,19 +256,15 @@ int perf_evlist__alloc_mmap(struct perf_evlist *evlist) return evlist->mmap != NULL ? 0 : -ENOMEM; } -static int __perf_evlist__mmap(struct perf_evlist *evlist, struct perf_evsel *evsel, +static int __perf_evlist__mmap(struct perf_evlist *evlist, int idx, int prot, int mask, int fd) { evlist->mmap[idx].prev = 0; evlist->mmap[idx].mask = mask; evlist->mmap[idx].base = mmap(NULL, evlist->mmap_len, prot, MAP_SHARED, fd, 0); - if (evlist->mmap[idx].base == MAP_FAILED) { - if (evlist->cpus->map[idx] == -1 && evsel->attr.inherit) - ui__warning("Inherit is not allowed on per-task " - "events using mmap.\n"); + if (evlist->mmap[idx].base == MAP_FAILED) return -1; - } perf_evlist__add_pollfd(evlist, fd); return 0; @@ -289,7 +284,7 @@ static int perf_evlist__mmap_per_cpu(struct perf_evlist *evlist, int prot, int m if (output == -1) { output = fd; - if (__perf_evlist__mmap(evlist, evsel, cpu, + if (__perf_evlist__mmap(evlist, cpu, prot, mask, output) < 0) goto out_unmap; } else { @@ -329,7 +324,7 @@ static int perf_evlist__mmap_per_thread(struct perf_evlist *evlist, int prot, in if (output == -1) { output = fd; - if (__perf_evlist__mmap(evlist, evsel, thread, + if (__perf_evlist__mmap(evlist, thread, prot, mask, output) < 0) goto out_unmap; } else { From 9c850d6c4b95bb07fb066eb7f43dd4e3b4842b85 Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jun 2011 10:55:10 -0300 Subject: [PATCH 43/46] perf python: Use exception to propagate errors We were using pr_debug to tell the user about not being able to parse a sample where we should really use the python way of reporting errors: exceptions. Fixes this problem: [root@emilia ~]# python >>> import perf Traceback (most recent call last): File "", line 1, in ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: eprintf >>> [root@emilia ~] As we want to keep the objects linked in the python binding (and in the future in a shared library) minimal. Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-m9dba9kaluas0kq8r58z191c@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 69436b3200a4..2dd1698e0932 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -694,14 +694,12 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, err = perf_event__parse_sample(event, first->attr.sample_type, perf_sample_size(first->attr.sample_type), sample_id_all, &pevent->sample); - if (err) { - pr_err("Can't parse sample, err = %d\n", err); - goto end; - } - + if (err) + return PyErr_Format(PyExc_OSError, + "perf: can't parse sample, err=%d", err); return pyevent; } -end: + Py_INCREF(Py_None); return Py_None; } From 56722381b8506733852c44dacf6d7bc5f90aedaf Mon Sep 17 00:00:00 2001 From: Arnaldo Carvalho de Melo Date: Thu, 2 Jun 2011 11:04:54 -0300 Subject: [PATCH 44/46] perf evlist: Don't die if sample_{id_all|type} is invalid Fixes two more cases where the python binding would not load: . Not finding die(), which it shouldn't anyway, not good to just stop the world because some particular perf.data file is invalid, just propagate the error to the caller. . Not finding perf_sample_size: fix it by moving it from event.c to evsel, where it belongs, as most cases are moving to operate on an evsel object.o One of the fixed problems: [root@emilia ~]# python >>> import perf Traceback (most recent call last): File "", line 1, in ImportError: /home/acme/git/build/perf/python/perf.so: undefined symbol: perf_sample_size >>> [root@emilia ~]# Cc: Frederic Weisbecker Cc: Mike Galbraith Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Link: http://lkml.kernel.org/n/tip-1hkj7b2cvgbfnoizsekjb6c9@git.kernel.org Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/builtin-test.c | 2 +- tools/perf/util/event.c | 16 ------------ tools/perf/util/event.h | 2 -- tools/perf/util/evlist.c | 53 ++++++++++++++++++++++++--------------- tools/perf/util/evlist.h | 6 +++-- tools/perf/util/evsel.c | 16 ++++++++++++ tools/perf/util/evsel.h | 7 ++++++ tools/perf/util/python.c | 2 +- tools/perf/util/session.c | 12 ++++++++- 9 files changed, 73 insertions(+), 43 deletions(-) diff --git a/tools/perf/builtin-test.c b/tools/perf/builtin-test.c index b67186228c89..2da9162262b0 100644 --- a/tools/perf/builtin-test.c +++ b/tools/perf/builtin-test.c @@ -474,7 +474,7 @@ static int test__basic_mmap(void) unsigned int nr_events[nsyscalls], expected_nr_events[nsyscalls], i, j; struct perf_evsel *evsels[nsyscalls], *evsel; - int sample_size = perf_sample_size(attr.sample_type); + int sample_size = __perf_evsel__sample_size(attr.sample_type); for (i = 0; i < nsyscalls; ++i) { char name[64]; diff --git a/tools/perf/util/event.c b/tools/perf/util/event.c index 0fe9adf76379..3c1b8a632101 100644 --- a/tools/perf/util/event.c +++ b/tools/perf/util/event.c @@ -35,22 +35,6 @@ const char *perf_event__name(unsigned int id) return perf_event__names[id]; } -int perf_sample_size(u64 sample_type) -{ - u64 mask = sample_type & PERF_SAMPLE_MASK; - int size = 0; - int i; - - for (i = 0; i < 64; i++) { - if (mask & (1ULL << i)) - size++; - } - - size *= sizeof(u64); - - return size; -} - static struct perf_sample synth_sample = { .pid = -1, .tid = -1, diff --git a/tools/perf/util/event.h b/tools/perf/util/event.h index c08332871408..1d7f66488a88 100644 --- a/tools/perf/util/event.h +++ b/tools/perf/util/event.h @@ -82,8 +82,6 @@ struct perf_sample { struct ip_callchain *callchain; }; -int perf_sample_size(u64 sample_type); - #define BUILD_ID_SIZE 20 struct build_id_event { diff --git a/tools/perf/util/evlist.c b/tools/perf/util/evlist.c index 04c8a60075b3..b021ea9265c3 100644 --- a/tools/perf/util/evlist.c +++ b/tools/perf/util/evlist.c @@ -455,33 +455,46 @@ int perf_evlist__set_filters(struct perf_evlist *evlist) return 0; } -u64 perf_evlist__sample_type(struct perf_evlist *evlist) +bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist) { - struct perf_evsel *pos; - u64 type = 0; + struct perf_evsel *pos, *first; - list_for_each_entry(pos, &evlist->entries, node) { - if (!type) - type = pos->attr.sample_type; - else if (type != pos->attr.sample_type) - die("non matching sample_type"); + pos = first = list_entry(evlist->entries.next, struct perf_evsel, node); + + list_for_each_entry_continue(pos, &evlist->entries, node) { + if (first->attr.sample_type != pos->attr.sample_type) + return false; } - return type; + return true; +} + +u64 perf_evlist__sample_type(const struct perf_evlist *evlist) +{ + struct perf_evsel *first; + + first = list_entry(evlist->entries.next, struct perf_evsel, node); + return first->attr.sample_type; +} + +bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist) +{ + struct perf_evsel *pos, *first; + + pos = first = list_entry(evlist->entries.next, struct perf_evsel, node); + + list_for_each_entry_continue(pos, &evlist->entries, node) { + if (first->attr.sample_id_all != pos->attr.sample_id_all) + return false; + } + + return true; } bool perf_evlist__sample_id_all(const struct perf_evlist *evlist) { - bool value = false, first = true; - struct perf_evsel *pos; + struct perf_evsel *first; - list_for_each_entry(pos, &evlist->entries, node) { - if (first) { - value = pos->attr.sample_id_all; - first = false; - } else if (value != pos->attr.sample_id_all) - die("non matching sample_id_all"); - } - - return value; + first = list_entry(evlist->entries.next, struct perf_evsel, node); + return first->attr.sample_id_all; } diff --git a/tools/perf/util/evlist.h b/tools/perf/util/evlist.h index 0a1ef1f051f0..b2b862374f37 100644 --- a/tools/perf/util/evlist.h +++ b/tools/perf/util/evlist.h @@ -66,7 +66,9 @@ int perf_evlist__create_maps(struct perf_evlist *evlist, pid_t target_pid, void perf_evlist__delete_maps(struct perf_evlist *evlist); int perf_evlist__set_filters(struct perf_evlist *evlist); -u64 perf_evlist__sample_type(struct perf_evlist *evlist); -bool perf_evlist__sample_id_all(const struct perf_evlist *evlist); +u64 perf_evlist__sample_type(const struct perf_evlist *evlist); +bool perf_evlist__sample_id_all(const const struct perf_evlist *evlist); +bool perf_evlist__valid_sample_type(const struct perf_evlist *evlist); +bool perf_evlist__valid_sample_id_all(const struct perf_evlist *evlist); #endif /* __PERF_EVLIST_H */ diff --git a/tools/perf/util/evsel.c b/tools/perf/util/evsel.c index cca29ededb5b..0239eb87b232 100644 --- a/tools/perf/util/evsel.c +++ b/tools/perf/util/evsel.c @@ -15,6 +15,22 @@ #define FD(e, x, y) (*(int *)xyarray__entry(e->fd, x, y)) +int __perf_evsel__sample_size(u64 sample_type) +{ + u64 mask = sample_type & PERF_SAMPLE_MASK; + int size = 0; + int i; + + for (i = 0; i < 64; i++) { + if (mask & (1ULL << i)) + size++; + } + + size *= sizeof(u64); + + return size; +} + void perf_evsel__init(struct perf_evsel *evsel, struct perf_event_attr *attr, int idx) { diff --git a/tools/perf/util/evsel.h b/tools/perf/util/evsel.h index f79bb2c09a6c..7e9366e4490b 100644 --- a/tools/perf/util/evsel.h +++ b/tools/perf/util/evsel.h @@ -149,4 +149,11 @@ static inline int perf_evsel__read_scaled(struct perf_evsel *evsel, return __perf_evsel__read(evsel, ncpus, nthreads, true); } +int __perf_evsel__sample_size(u64 sample_type); + +static inline int perf_evsel__sample_size(struct perf_evsel *evsel) +{ + return __perf_evsel__sample_size(evsel->attr.sample_type); +} + #endif /* __PERF_EVSEL_H */ diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 2dd1698e0932..24063b4d41e0 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -692,7 +692,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, first = list_entry(evlist->entries.next, struct perf_evsel, node); err = perf_event__parse_sample(event, first->attr.sample_type, - perf_sample_size(first->attr.sample_type), + perf_evsel__sample_size(first), sample_id_all, &pevent->sample); if (err) return PyErr_Format(PyExc_OSError, diff --git a/tools/perf/util/session.c b/tools/perf/util/session.c index 64500fc78799..f5a8fbdd3f76 100644 --- a/tools/perf/util/session.c +++ b/tools/perf/util/session.c @@ -58,6 +58,16 @@ static int perf_session__open(struct perf_session *self, bool force) goto out_close; } + if (!perf_evlist__valid_sample_type(self->evlist)) { + pr_err("non matching sample_type"); + goto out_close; + } + + if (!perf_evlist__valid_sample_id_all(self->evlist)) { + pr_err("non matching sample_id_all"); + goto out_close; + } + self->size = input_stat.st_size; return 0; @@ -97,7 +107,7 @@ out: void perf_session__update_sample_type(struct perf_session *self) { self->sample_type = perf_evlist__sample_type(self->evlist); - self->sample_size = perf_sample_size(self->sample_type); + self->sample_size = __perf_evsel__sample_size(self->sample_type); self->sample_id_all = perf_evlist__sample_id_all(self->evlist); perf_session__id_header_size(self); } From b273fa9716aa1564bee88ceee62f9042981cdc81 Mon Sep 17 00:00:00 2001 From: Frederic Weisbecker Date: Thu, 31 Mar 2011 18:27:42 +0200 Subject: [PATCH 45/46] perf python: Fix argument name list of read_on_cpu() Mandatory arguments need to be present in the argument name list, as well as optional arguments, otherwise python barfs: # ./python/twatch.py Traceback (most recent call last): File "./python/twatch.py", line 41, in main() File "./python/twatch.py", line 32, in main event = evlist.read_on_cpu(cpu) RuntimeError: more argument specifiers than keyword list entries Hence, add cpu to the name list. Cc: David Ahern Cc: Ingo Molnar Cc: Paul Mackerras Cc: Peter Zijlstra Cc: Stephane Eranian Cc: Tom Zanussi Link: http://lkml.kernel.org/r/1301588863-20210-1-git-send-email-fweisbec@gmail.com Signed-off-by: Frederic Weisbecker Signed-off-by: Arnaldo Carvalho de Melo --- tools/perf/util/python.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/perf/util/python.c b/tools/perf/util/python.c index 24063b4d41e0..a9ac0504aabd 100644 --- a/tools/perf/util/python.c +++ b/tools/perf/util/python.c @@ -674,7 +674,7 @@ static PyObject *pyrf_evlist__read_on_cpu(struct pyrf_evlist *pevlist, struct perf_evlist *evlist = &pevlist->evlist; union perf_event *event; int sample_id_all = 1, cpu; - static char *kwlist[] = {"sample_id_all", NULL, NULL}; + static char *kwlist[] = {"cpu", "sample_id_all", NULL, NULL}; int err; if (!PyArg_ParseTupleAndKeywords(args, kwargs, "i|i", kwlist, From aa4a221875873d2a1f9656cb7fd7e545e952b4fa Mon Sep 17 00:00:00 2001 From: Vince Weaver Date: Fri, 3 Jun 2011 17:54:40 -0400 Subject: [PATCH 46/46] perf: Comment /proc/sys/kernel/perf_event_paranoid to be part of user ABI Turns out that distro packages use this file as an indicator of the perf event subsystem - this is easier to check for from scripts than the existence of the system call. This is easy enough to keep around for the kernel, so add a comment to make sure it stays so. Signed-off-by: Vince Weaver Cc: David Ahern Cc: Peter Zijlstra Cc: paulus@samba.org Cc: acme@redhat.com Cc: Linus Torvalds Cc: Andrew Morton Link: http://lkml.kernel.org/r/alpine.DEB.2.00.1106031751170.29381@cl320.eecs.utk.edu Signed-off-by: Ingo Molnar --- kernel/sysctl.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/kernel/sysctl.c b/kernel/sysctl.c index 4fc92445a29c..f175d98bd355 100644 --- a/kernel/sysctl.c +++ b/kernel/sysctl.c @@ -938,6 +938,12 @@ static struct ctl_table kern_table[] = { }, #endif #ifdef CONFIG_PERF_EVENTS + /* + * User-space scripts rely on the existence of this file + * as a feature check for perf_events being enabled. + * + * So it's an ABI, do not remove! + */ { .procname = "perf_event_paranoid", .data = &sysctl_perf_event_paranoid,