linux/arch/arm/mach-omap2/omap4-common.c

205 lines
4.6 KiB
C
Raw Normal View History

/*
* OMAP4 specific common source file.
*
* Copyright (C) 2010 Texas Instruments, Inc.
* Author:
* Santosh Shilimkar <santosh.shilimkar@ti.com>
*
*
* This program is free software,you can redistribute it and/or modify
* it under the terms of the GNU General Public License version 2 as
* published by the Free Software Foundation.
*/
#include <linux/kernel.h>
#include <linux/init.h>
#include <linux/io.h>
#include <linux/platform_device.h>
ARM: OMAP4: Fix errata i688 with MPU interconnect barriers. On OMAP4 SOC, intecronnects has many write buffers in the async bridges and they need to be drained before CPU enters into standby state. Patch 'OMAP4: PM: Add CPUX OFF mode support' added CPU PM support but OMAP errata i688 (Async Bridge Corruption) needs to be taken care to avoid issues like system freeze, CPU deadlocks, random crashes with register accesses, synchronisation loss on initiators operating on both interconnect port simultaneously. As per the errata, if a data is stalled inside asynchronous bridge because of back pressure, it may be accepted multiple times, creating pointer misalignment that will corrupt next transfers on that data path until next reset of the system (No recovery procedure once the issue is hit, the path remains consistently broken). Async bridge can be found on path between MPU to EMIF and MPU to L3 interconnect. This situation can happen only when the idle is initiated by a Master Request Disconnection (which is trigged by software when executing WFI on CPU). The work-around for this errata needs all the initiators connected through async bridge must ensure that data path is properly drained before issuing WFI. This condition will be met if one Strongly ordered access is performed to the target right before executing the WFI. In MPU case, L3 T2ASYNC FIFO and DDR T2ASYNC FIFO needs to be drained. IO barrier ensure that there is no synchronisation loss on initiators operating on both interconnect port simultaneously. Thanks to Russell for a tip to conver assembly function to C fuction there by reducing 40 odd lines of code from the patch. Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Signed-off-by: Richard Woodruff <r-woodruff2@ti.com> Acked-by: Jean Pihet <j-pihet@ti.com> Reviewed-by: Kevin Hilman <khilman@ti.com> Tested-by: Vishwanath BS <vishwanath.bs@ti.com> Signed-off-by: Kevin Hilman <khilman@ti.com>
2011-06-26 01:04:31 +00:00
#include <linux/memblock.h>
#include <asm/hardware/gic.h>
#include <asm/hardware/cache-l2x0.h>
ARM: OMAP4: Fix errata i688 with MPU interconnect barriers. On OMAP4 SOC, intecronnects has many write buffers in the async bridges and they need to be drained before CPU enters into standby state. Patch 'OMAP4: PM: Add CPUX OFF mode support' added CPU PM support but OMAP errata i688 (Async Bridge Corruption) needs to be taken care to avoid issues like system freeze, CPU deadlocks, random crashes with register accesses, synchronisation loss on initiators operating on both interconnect port simultaneously. As per the errata, if a data is stalled inside asynchronous bridge because of back pressure, it may be accepted multiple times, creating pointer misalignment that will corrupt next transfers on that data path until next reset of the system (No recovery procedure once the issue is hit, the path remains consistently broken). Async bridge can be found on path between MPU to EMIF and MPU to L3 interconnect. This situation can happen only when the idle is initiated by a Master Request Disconnection (which is trigged by software when executing WFI on CPU). The work-around for this errata needs all the initiators connected through async bridge must ensure that data path is properly drained before issuing WFI. This condition will be met if one Strongly ordered access is performed to the target right before executing the WFI. In MPU case, L3 T2ASYNC FIFO and DDR T2ASYNC FIFO needs to be drained. IO barrier ensure that there is no synchronisation loss on initiators operating on both interconnect port simultaneously. Thanks to Russell for a tip to conver assembly function to C fuction there by reducing 40 odd lines of code from the patch. Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Signed-off-by: Richard Woodruff <r-woodruff2@ti.com> Acked-by: Jean Pihet <j-pihet@ti.com> Reviewed-by: Kevin Hilman <khilman@ti.com> Tested-by: Vishwanath BS <vishwanath.bs@ti.com> Signed-off-by: Kevin Hilman <khilman@ti.com>
2011-06-26 01:04:31 +00:00
#include <asm/mach/map.h>
#include <plat/irqs.h>
ARM: OMAP4: Fix errata i688 with MPU interconnect barriers. On OMAP4 SOC, intecronnects has many write buffers in the async bridges and they need to be drained before CPU enters into standby state. Patch 'OMAP4: PM: Add CPUX OFF mode support' added CPU PM support but OMAP errata i688 (Async Bridge Corruption) needs to be taken care to avoid issues like system freeze, CPU deadlocks, random crashes with register accesses, synchronisation loss on initiators operating on both interconnect port simultaneously. As per the errata, if a data is stalled inside asynchronous bridge because of back pressure, it may be accepted multiple times, creating pointer misalignment that will corrupt next transfers on that data path until next reset of the system (No recovery procedure once the issue is hit, the path remains consistently broken). Async bridge can be found on path between MPU to EMIF and MPU to L3 interconnect. This situation can happen only when the idle is initiated by a Master Request Disconnection (which is trigged by software when executing WFI on CPU). The work-around for this errata needs all the initiators connected through async bridge must ensure that data path is properly drained before issuing WFI. This condition will be met if one Strongly ordered access is performed to the target right before executing the WFI. In MPU case, L3 T2ASYNC FIFO and DDR T2ASYNC FIFO needs to be drained. IO barrier ensure that there is no synchronisation loss on initiators operating on both interconnect port simultaneously. Thanks to Russell for a tip to conver assembly function to C fuction there by reducing 40 odd lines of code from the patch. Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Signed-off-by: Richard Woodruff <r-woodruff2@ti.com> Acked-by: Jean Pihet <j-pihet@ti.com> Reviewed-by: Kevin Hilman <khilman@ti.com> Tested-by: Vishwanath BS <vishwanath.bs@ti.com> Signed-off-by: Kevin Hilman <khilman@ti.com>
2011-06-26 01:04:31 +00:00
#include <plat/sram.h>
#include <mach/hardware.h>
#include <mach/omap-wakeupgen.h>
#include "common.h"
#include "omap4-sar-layout.h"
#ifdef CONFIG_CACHE_L2X0
static void __iomem *l2cache_base;
#endif
static void __iomem *sar_ram_base;
ARM: OMAP4: Fix errata i688 with MPU interconnect barriers. On OMAP4 SOC, intecronnects has many write buffers in the async bridges and they need to be drained before CPU enters into standby state. Patch 'OMAP4: PM: Add CPUX OFF mode support' added CPU PM support but OMAP errata i688 (Async Bridge Corruption) needs to be taken care to avoid issues like system freeze, CPU deadlocks, random crashes with register accesses, synchronisation loss on initiators operating on both interconnect port simultaneously. As per the errata, if a data is stalled inside asynchronous bridge because of back pressure, it may be accepted multiple times, creating pointer misalignment that will corrupt next transfers on that data path until next reset of the system (No recovery procedure once the issue is hit, the path remains consistently broken). Async bridge can be found on path between MPU to EMIF and MPU to L3 interconnect. This situation can happen only when the idle is initiated by a Master Request Disconnection (which is trigged by software when executing WFI on CPU). The work-around for this errata needs all the initiators connected through async bridge must ensure that data path is properly drained before issuing WFI. This condition will be met if one Strongly ordered access is performed to the target right before executing the WFI. In MPU case, L3 T2ASYNC FIFO and DDR T2ASYNC FIFO needs to be drained. IO barrier ensure that there is no synchronisation loss on initiators operating on both interconnect port simultaneously. Thanks to Russell for a tip to conver assembly function to C fuction there by reducing 40 odd lines of code from the patch. Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Signed-off-by: Richard Woodruff <r-woodruff2@ti.com> Acked-by: Jean Pihet <j-pihet@ti.com> Reviewed-by: Kevin Hilman <khilman@ti.com> Tested-by: Vishwanath BS <vishwanath.bs@ti.com> Signed-off-by: Kevin Hilman <khilman@ti.com>
2011-06-26 01:04:31 +00:00
#ifdef CONFIG_OMAP4_ERRATA_I688
/* Used to implement memory barrier on DRAM path */
#define OMAP4_DRAM_BARRIER_VA 0xfe600000
void __iomem *dram_sync, *sram_sync;
void omap_bus_sync(void)
{
if (dram_sync && sram_sync) {
writel_relaxed(readl_relaxed(dram_sync), dram_sync);
writel_relaxed(readl_relaxed(sram_sync), sram_sync);
isb();
}
}
static int __init omap_barriers_init(void)
{
struct map_desc dram_io_desc[1];
phys_addr_t paddr;
u32 size;
if (!cpu_is_omap44xx())
return -ENODEV;
size = ALIGN(PAGE_SIZE, SZ_1M);
paddr = memblock_alloc(size, SZ_1M);
if (!paddr) {
pr_err("%s: failed to reserve 4 Kbytes\n", __func__);
return -ENOMEM;
}
memblock_free(paddr, size);
memblock_remove(paddr, size);
dram_io_desc[0].virtual = OMAP4_DRAM_BARRIER_VA;
dram_io_desc[0].pfn = __phys_to_pfn(paddr);
dram_io_desc[0].length = size;
dram_io_desc[0].type = MT_MEMORY_SO;
iotable_init(dram_io_desc, ARRAY_SIZE(dram_io_desc));
dram_sync = (void __iomem *) dram_io_desc[0].virtual;
sram_sync = (void __iomem *) OMAP4_SRAM_VA;
pr_info("OMAP4: Map 0x%08llx to 0x%08lx for dram barrier\n",
(long long) paddr, dram_io_desc[0].virtual);
return 0;
}
core_initcall(omap_barriers_init);
#endif
void __init gic_init_irq(void)
{
void __iomem *omap_irq_base;
void __iomem *gic_dist_base_addr;
/* Static mapping, never released */
gic_dist_base_addr = ioremap(OMAP44XX_GIC_DIST_BASE, SZ_4K);
BUG_ON(!gic_dist_base_addr);
/* Static mapping, never released */
omap_irq_base = ioremap(OMAP44XX_GIC_CPU_BASE, SZ_512);
BUG_ON(!omap_irq_base);
omap_wakeupgen_init();
gic_init(0, 29, gic_dist_base_addr, omap_irq_base);
}
#ifdef CONFIG_CACHE_L2X0
void __iomem *omap4_get_l2cache_base(void)
{
return l2cache_base;
}
static void omap4_l2x0_disable(void)
{
/* Disable PL310 L2 Cache controller */
omap_smc1(0x102, 0x0);
}
static void omap4_l2x0_set_debug(unsigned long val)
{
/* Program PL310 L2 Cache controller debug register */
omap_smc1(0x100, val);
}
static int __init omap_l2_cache_init(void)
{
u32 aux_ctrl = 0;
/*
* To avoid code running on other OMAPs in
* multi-omap builds
*/
if (!cpu_is_omap44xx())
return -ENODEV;
/* Static mapping, never released */
l2cache_base = ioremap(OMAP44XX_L2CACHE_BASE, SZ_4K);
if (WARN_ON(!l2cache_base))
return -ENOMEM;
/*
* 16-way associativity, parity disabled
* Way size - 32KB (es1.0)
* Way size - 64KB (es2.0 +)
*/
aux_ctrl = ((1 << L2X0_AUX_CTRL_ASSOCIATIVITY_SHIFT) |
(0x1 << 25) |
(0x1 << L2X0_AUX_CTRL_NS_LOCKDOWN_SHIFT) |
(0x1 << L2X0_AUX_CTRL_NS_INT_CTRL_SHIFT));
omap4: l2x0: enable instruction and data prefetching Enabling L2 prefetching improves performance as shown on Panda ES2.1 board with mem test, and it has measurable impact on performances. I think we should consider it, even though it damages "writes" a bit. (rebased to k.org) Usually the prefetch is used at both levels together L1 + L2, however, to enable the CP15 prefetch engines, these are under security, and on GP devices, we cannot enable it(e.g. on PandaBoard). However, just enabling PL310 prefetch seems to provide performance improvement, as shown in the data below (from Ubuntu) and would be a great thing to pull in. What prefetch does is enable automatic next line prefetching. With this enabled, whenever the PL310 receives a cachable read request, it automatically prefetches the following cache line as well. Measurement Data: == STOCK 10.10 WITHOUT PATCH ======================== ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2aaad000 0x2b2ad000 copy libc 133 MB/s copy Android v5 273 MB/s copy Android NEON 235 MB/s copy INT32 116 MB/s copy ASM ARM 187 MB/s copy ASM VLDM 64 204 MB/s copy ASM VLDM 128 173 MB/s copy ASM VLD1 216 MB/s read ASM ARM 286 MB/s read ASM VLDM 242 MB/s read ASM VLD1 286 MB/s write libc 1947 MB/s write ASM ARM 1943 MB/s write ASM VSTM 1942 MB/s write ASM VST1 1935 MB/s 10.10 + PATCH ============= ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2ab17000 0x2b317000 copy libc 129 MB/s copy Android v5 256 MB/s copy Android NEON 356 MB/s copy INT32 127 MB/s copy ASM ARM 321 MB/s copy ASM VLDM 64 337 MB/s copy ASM VLDM 128 321 MB/s copy ASM VLD1 350 MB/s read ASM ARM 496 MB/s read ASM VLDM 470 MB/s read ASM VLD1 488 MB/s write libc 1701 MB/s write ASM ARM 1682 MB/s write ASM VSTM 1693 MB/s write ASM VST1 1681 MB/s Signed-off-by: Mans Rullgard <mans@mansr.com> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Tested-by: Nishanth Menon <nm@ti.com> Signed-off-by: Tony Lindgren <tony@atomide.com>
2010-11-19 17:31:04 +00:00
if (omap_rev() == OMAP4430_REV_ES1_0) {
aux_ctrl |= 0x2 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT;
omap4: l2x0: enable instruction and data prefetching Enabling L2 prefetching improves performance as shown on Panda ES2.1 board with mem test, and it has measurable impact on performances. I think we should consider it, even though it damages "writes" a bit. (rebased to k.org) Usually the prefetch is used at both levels together L1 + L2, however, to enable the CP15 prefetch engines, these are under security, and on GP devices, we cannot enable it(e.g. on PandaBoard). However, just enabling PL310 prefetch seems to provide performance improvement, as shown in the data below (from Ubuntu) and would be a great thing to pull in. What prefetch does is enable automatic next line prefetching. With this enabled, whenever the PL310 receives a cachable read request, it automatically prefetches the following cache line as well. Measurement Data: == STOCK 10.10 WITHOUT PATCH ======================== ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2aaad000 0x2b2ad000 copy libc 133 MB/s copy Android v5 273 MB/s copy Android NEON 235 MB/s copy INT32 116 MB/s copy ASM ARM 187 MB/s copy ASM VLDM 64 204 MB/s copy ASM VLDM 128 173 MB/s copy ASM VLD1 216 MB/s read ASM ARM 286 MB/s read ASM VLDM 242 MB/s read ASM VLD1 286 MB/s write libc 1947 MB/s write ASM ARM 1943 MB/s write ASM VSTM 1942 MB/s write ASM VST1 1935 MB/s 10.10 + PATCH ============= ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2ab17000 0x2b317000 copy libc 129 MB/s copy Android v5 256 MB/s copy Android NEON 356 MB/s copy INT32 127 MB/s copy ASM ARM 321 MB/s copy ASM VLDM 64 337 MB/s copy ASM VLDM 128 321 MB/s copy ASM VLD1 350 MB/s read ASM ARM 496 MB/s read ASM VLDM 470 MB/s read ASM VLD1 488 MB/s write libc 1701 MB/s write ASM ARM 1682 MB/s write ASM VSTM 1693 MB/s write ASM VST1 1681 MB/s Signed-off-by: Mans Rullgard <mans@mansr.com> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Tested-by: Nishanth Menon <nm@ti.com> Signed-off-by: Tony Lindgren <tony@atomide.com>
2010-11-19 17:31:04 +00:00
} else {
aux_ctrl |= ((0x3 << L2X0_AUX_CTRL_WAY_SIZE_SHIFT) |
(1 << L2X0_AUX_CTRL_SHARE_OVERRIDE_SHIFT) |
omap4: l2x0: enable instruction and data prefetching Enabling L2 prefetching improves performance as shown on Panda ES2.1 board with mem test, and it has measurable impact on performances. I think we should consider it, even though it damages "writes" a bit. (rebased to k.org) Usually the prefetch is used at both levels together L1 + L2, however, to enable the CP15 prefetch engines, these are under security, and on GP devices, we cannot enable it(e.g. on PandaBoard). However, just enabling PL310 prefetch seems to provide performance improvement, as shown in the data below (from Ubuntu) and would be a great thing to pull in. What prefetch does is enable automatic next line prefetching. With this enabled, whenever the PL310 receives a cachable read request, it automatically prefetches the following cache line as well. Measurement Data: == STOCK 10.10 WITHOUT PATCH ======================== ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2aaad000 0x2b2ad000 copy libc 133 MB/s copy Android v5 273 MB/s copy Android NEON 235 MB/s copy INT32 116 MB/s copy ASM ARM 187 MB/s copy ASM VLDM 64 204 MB/s copy ASM VLDM 128 173 MB/s copy ASM VLD1 216 MB/s read ASM ARM 286 MB/s read ASM VLDM 242 MB/s read ASM VLD1 286 MB/s write libc 1947 MB/s write ASM ARM 1943 MB/s write ASM VSTM 1942 MB/s write ASM VST1 1935 MB/s 10.10 + PATCH ============= ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2ab17000 0x2b317000 copy libc 129 MB/s copy Android v5 256 MB/s copy Android NEON 356 MB/s copy INT32 127 MB/s copy ASM ARM 321 MB/s copy ASM VLDM 64 337 MB/s copy ASM VLDM 128 321 MB/s copy ASM VLD1 350 MB/s read ASM ARM 496 MB/s read ASM VLDM 470 MB/s read ASM VLD1 488 MB/s write libc 1701 MB/s write ASM ARM 1682 MB/s write ASM VSTM 1693 MB/s write ASM VST1 1681 MB/s Signed-off-by: Mans Rullgard <mans@mansr.com> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Tested-by: Nishanth Menon <nm@ti.com> Signed-off-by: Tony Lindgren <tony@atomide.com>
2010-11-19 17:31:04 +00:00
(1 << L2X0_AUX_CTRL_DATA_PREFETCH_SHIFT) |
(1 << L2X0_AUX_CTRL_INSTR_PREFETCH_SHIFT) |
(1 << L2X0_AUX_CTRL_EARLY_BRESP_SHIFT));
omap4: l2x0: enable instruction and data prefetching Enabling L2 prefetching improves performance as shown on Panda ES2.1 board with mem test, and it has measurable impact on performances. I think we should consider it, even though it damages "writes" a bit. (rebased to k.org) Usually the prefetch is used at both levels together L1 + L2, however, to enable the CP15 prefetch engines, these are under security, and on GP devices, we cannot enable it(e.g. on PandaBoard). However, just enabling PL310 prefetch seems to provide performance improvement, as shown in the data below (from Ubuntu) and would be a great thing to pull in. What prefetch does is enable automatic next line prefetching. With this enabled, whenever the PL310 receives a cachable read request, it automatically prefetches the following cache line as well. Measurement Data: == STOCK 10.10 WITHOUT PATCH ======================== ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2aaad000 0x2b2ad000 copy libc 133 MB/s copy Android v5 273 MB/s copy Android NEON 235 MB/s copy INT32 116 MB/s copy ASM ARM 187 MB/s copy ASM VLDM 64 204 MB/s copy ASM VLDM 128 173 MB/s copy ASM VLD1 216 MB/s read ASM ARM 286 MB/s read ASM VLDM 242 MB/s read ASM VLD1 286 MB/s write libc 1947 MB/s write ASM ARM 1943 MB/s write ASM VSTM 1942 MB/s write ASM VST1 1935 MB/s 10.10 + PATCH ============= ~# ./memspeed size 8388608 8192k 8M offset 8388608, 0 buffers 0x2ab17000 0x2b317000 copy libc 129 MB/s copy Android v5 256 MB/s copy Android NEON 356 MB/s copy INT32 127 MB/s copy ASM ARM 321 MB/s copy ASM VLDM 64 337 MB/s copy ASM VLDM 128 321 MB/s copy ASM VLD1 350 MB/s read ASM ARM 496 MB/s read ASM VLDM 470 MB/s read ASM VLD1 488 MB/s write libc 1701 MB/s write ASM ARM 1682 MB/s write ASM VSTM 1693 MB/s write ASM VST1 1681 MB/s Signed-off-by: Mans Rullgard <mans@mansr.com> Signed-off-by: Santosh Shilimkar <santosh.shilimkar@ti.com> Tested-by: Nishanth Menon <nm@ti.com> Signed-off-by: Tony Lindgren <tony@atomide.com>
2010-11-19 17:31:04 +00:00
}
if (omap_rev() != OMAP4430_REV_ES1_0)
omap_smc1(0x109, aux_ctrl);
/* Enable PL310 L2 Cache controller */
omap_smc1(0x102, 0x1);
l2x0_init(l2cache_base, aux_ctrl, L2X0_AUX_CTRL_MASK);
/*
* Override default outer_cache.disable with a OMAP4
* specific one
*/
outer_cache.disable = omap4_l2x0_disable;
outer_cache.set_debug = omap4_l2x0_set_debug;
return 0;
}
early_initcall(omap_l2_cache_init);
#endif
void __iomem *omap4_get_sar_ram_base(void)
{
return sar_ram_base;
}
/*
* SAR RAM used to save and restore the HW
* context in low power modes
*/
static int __init omap4_sar_ram_init(void)
{
/*
* To avoid code running on other OMAPs in
* multi-omap builds
*/
if (!cpu_is_omap44xx())
return -ENOMEM;
/* Static mapping, never released */
sar_ram_base = ioremap(OMAP44XX_SAR_RAM_BASE, SZ_16K);
if (WARN_ON(!sar_ram_base))
return -ENOMEM;
return 0;
}
early_initcall(omap4_sar_ram_init);