diff --git a/drivers/spi/spi-bcm2835.c b/drivers/spi/spi-bcm2835.c
index c02a9982ac93..232002db2696 100644
--- a/drivers/spi/spi-bcm2835.c
+++ b/drivers/spi/spi-bcm2835.c
@@ -85,20 +85,30 @@
  * @regs: base address of register map
  * @clk: core clock, divided to calculate serial clock
  * @irq: interrupt, signals TX FIFO empty or RX FIFO ¾ full
+ * @tfr: SPI transfer currently processed
  * @tx_buf: pointer whence next transmitted byte is read
  * @rx_buf: pointer where next received byte is written
  * @tx_len: remaining bytes to transmit
  * @rx_len: remaining bytes to receive
+ * @tx_prologue: bytes transmitted without DMA if first TX sglist entry's
+ *	length is not a multiple of 4 (to overcome hardware limitation)
+ * @rx_prologue: bytes received without DMA if first RX sglist entry's
+ *	length is not a multiple of 4 (to overcome hardware limitation)
+ * @tx_spillover: whether @tx_prologue spills over to second TX sglist entry
  * @dma_pending: whether a DMA transfer is in progress
  */
 struct bcm2835_spi {
 	void __iomem *regs;
 	struct clk *clk;
 	int irq;
+	struct spi_transfer *tfr;
 	const u8 *tx_buf;
 	u8 *rx_buf;
 	int tx_len;
 	int rx_len;
+	int tx_prologue;
+	int rx_prologue;
+	bool tx_spillover;
 	bool dma_pending;
 };
 
@@ -137,6 +147,72 @@ static inline void bcm2835_wr_fifo(struct bcm2835_spi *bs)
 	}
 }
 
+/**
+ * bcm2835_rd_fifo_count() - blindly read exactly @count bytes from RX FIFO
+ * @bs: BCM2835 SPI controller
+ * @count: bytes to read from RX FIFO
+ *
+ * The caller must ensure that @bs->rx_len is greater than or equal to @count,
+ * that the RX FIFO contains at least @count bytes and that the DMA Enable flag
+ * in the CS register is set (such that a read from the FIFO register receives
+ * 32-bit instead of just 8-bit).
+ */
+static inline void bcm2835_rd_fifo_count(struct bcm2835_spi *bs, int count)
+{
+	u32 val;
+
+	bs->rx_len -= count;
+
+	while (count > 0) {
+		val = bcm2835_rd(bs, BCM2835_SPI_FIFO);
+		if (bs->rx_buf) {
+			int len = min(count, 4);
+			memcpy(bs->rx_buf, &val, len);
+			bs->rx_buf += len;
+		}
+		count -= 4;
+	}
+}
+
+/**
+ * bcm2835_wr_fifo_count() - blindly write exactly @count bytes to TX FIFO
+ * @bs: BCM2835 SPI controller
+ * @count: bytes to write to TX FIFO
+ *
+ * The caller must ensure that @bs->tx_len is greater than or equal to @count,
+ * that the TX FIFO can accommodate @count bytes and that the DMA Enable flag
+ * in the CS register is set (such that a write to the FIFO register transmits
+ * 32-bit instead of just 8-bit).
+ */
+static inline void bcm2835_wr_fifo_count(struct bcm2835_spi *bs, int count)
+{
+	u32 val;
+
+	bs->tx_len -= count;
+
+	while (count > 0) {
+		if (bs->tx_buf) {
+			int len = min(count, 4);
+			memcpy(&val, bs->tx_buf, len);
+			bs->tx_buf += len;
+		} else {
+			val = 0;
+		}
+		bcm2835_wr(bs, BCM2835_SPI_FIFO, val);
+		count -= 4;
+	}
+}
+
+/**
+ * bcm2835_wait_tx_fifo_empty() - busy-wait for TX FIFO to empty
+ * @bs: BCM2835 SPI controller
+ */
+static inline void bcm2835_wait_tx_fifo_empty(struct bcm2835_spi *bs)
+{
+	while (!(bcm2835_rd(bs, BCM2835_SPI_CS) & BCM2835_SPI_CS_DONE))
+		cpu_relax();
+}
+
 static void bcm2835_spi_reset_hw(struct spi_master *master)
 {
 	struct bcm2835_spi *bs = spi_master_get_devdata(master);
@@ -209,15 +285,161 @@ static int bcm2835_spi_transfer_one_irq(struct spi_master *master,
  * the main one being that DMA transfers are limited to 16 bit
  * (so 0 to 65535 bytes) by the SPI HW due to BCM2835_SPI_DLEN
  *
- * also we currently assume that the scatter-gather fragments are
- * all multiple of 4 (except the last) - otherwise we would need
- * to reset the FIFO before subsequent transfers...
- * this also means that tx/rx transfers sg's need to be of equal size!
- *
  * there may be a few more border-cases we may need to address as well
  * but unfortunately this would mean splitting up the scatter-gather
  * list making it slightly unpractical...
  */
+
+/**
+ * bcm2835_spi_transfer_prologue() - transfer first few bytes without DMA
+ * @master: SPI master
+ * @tfr: SPI transfer
+ * @bs: BCM2835 SPI controller
+ * @cs: CS register
+ *
+ * A limitation in DMA mode is that the FIFO must be accessed in 4 byte chunks.
+ * Only the final write access is permitted to transmit less than 4 bytes, the
+ * SPI controller deduces its intended size from the DLEN register.
+ *
+ * If a TX or RX sglist contains multiple entries, one per page, and the first
+ * entry starts in the middle of a page, that first entry's length may not be
+ * a multiple of 4.  Subsequent entries are fine because they span an entire
+ * page, hence do have a length that's a multiple of 4.
+ *
+ * This cannot happen with kmalloc'ed buffers (which is what most clients use)
+ * because they are contiguous in physical memory and therefore not split on
+ * page boundaries by spi_map_buf().  But it *can* happen with vmalloc'ed
+ * buffers.
+ *
+ * The DMA engine is incapable of combining sglist entries into a continuous
+ * stream of 4 byte chunks, it treats every entry separately:  A TX entry is
+ * rounded up a to a multiple of 4 bytes by transmitting surplus bytes, an RX
+ * entry is rounded up by throwing away received bytes.
+ *
+ * Overcome this limitation by transferring the first few bytes without DMA:
+ * E.g. if the first TX sglist entry's length is 23 and the first RX's is 42,
+ * write 3 bytes to the TX FIFO but read only 2 bytes from the RX FIFO.
+ * The residue of 1 byte in the RX FIFO is picked up by DMA.  Together with
+ * the rest of the first RX sglist entry it makes up a multiple of 4 bytes.
+ *
+ * Should the RX prologue be larger, say, 3 vis-à-vis a TX prologue of 1,
+ * write 1 + 4 = 5 bytes to the TX FIFO and read 3 bytes from the RX FIFO.
+ * Caution, the additional 4 bytes spill over to the second TX sglist entry
+ * if the length of the first is *exactly* 1.
+ *
+ * At most 6 bytes are written and at most 3 bytes read.  Do we know the
+ * transfer has this many bytes?  Yes, see BCM2835_SPI_DMA_MIN_LENGTH.
+ *
+ * The FIFO is normally accessed with 8-bit width by the CPU and 32-bit width
+ * by the DMA engine.  Toggling the DMA Enable flag in the CS register switches
+ * the width but also garbles the FIFO's contents.  The prologue must therefore
+ * be transmitted in 32-bit width to ensure that the following DMA transfer can
+ * pick up the residue in the RX FIFO in ungarbled form.
+ */
+static void bcm2835_spi_transfer_prologue(struct spi_master *master,
+					  struct spi_transfer *tfr,
+					  struct bcm2835_spi *bs,
+					  u32 cs)
+{
+	int tx_remaining;
+
+	bs->tfr		 = tfr;
+	bs->tx_prologue  = 0;
+	bs->rx_prologue  = 0;
+	bs->tx_spillover = false;
+
+	if (!sg_is_last(&tfr->tx_sg.sgl[0]))
+		bs->tx_prologue = sg_dma_len(&tfr->tx_sg.sgl[0]) & 3;
+
+	if (!sg_is_last(&tfr->rx_sg.sgl[0])) {
+		bs->rx_prologue = sg_dma_len(&tfr->rx_sg.sgl[0]) & 3;
+
+		if (bs->rx_prologue > bs->tx_prologue) {
+			if (sg_is_last(&tfr->tx_sg.sgl[0])) {
+				bs->tx_prologue  = bs->rx_prologue;
+			} else {
+				bs->tx_prologue += 4;
+				bs->tx_spillover =
+					!(sg_dma_len(&tfr->tx_sg.sgl[0]) & ~3);
+			}
+		}
+	}
+
+	/* rx_prologue > 0 implies tx_prologue > 0, so check only the latter */
+	if (!bs->tx_prologue)
+		return;
+
+	/* Write and read RX prologue.  Adjust first entry in RX sglist. */
+	if (bs->rx_prologue) {
+		bcm2835_wr(bs, BCM2835_SPI_DLEN, bs->rx_prologue);
+		bcm2835_wr(bs, BCM2835_SPI_CS, cs | BCM2835_SPI_CS_TA
+						  | BCM2835_SPI_CS_DMAEN);
+		bcm2835_wr_fifo_count(bs, bs->rx_prologue);
+		bcm2835_wait_tx_fifo_empty(bs);
+		bcm2835_rd_fifo_count(bs, bs->rx_prologue);
+		bcm2835_spi_reset_hw(master);
+
+		dma_sync_sg_for_device(master->dma_rx->device->dev,
+				       tfr->rx_sg.sgl, 1, DMA_FROM_DEVICE);
+
+		tfr->rx_sg.sgl[0].dma_address += bs->rx_prologue;
+		tfr->rx_sg.sgl[0].length      -= bs->rx_prologue;
+	}
+
+	/*
+	 * Write remaining TX prologue.  Adjust first entry in TX sglist.
+	 * Also adjust second entry if prologue spills over to it.
+	 */
+	tx_remaining = bs->tx_prologue - bs->rx_prologue;
+	if (tx_remaining) {
+		bcm2835_wr(bs, BCM2835_SPI_DLEN, tx_remaining);
+		bcm2835_wr(bs, BCM2835_SPI_CS, cs | BCM2835_SPI_CS_TA
+						  | BCM2835_SPI_CS_DMAEN);
+		bcm2835_wr_fifo_count(bs, tx_remaining);
+		bcm2835_wait_tx_fifo_empty(bs);
+		bcm2835_wr(bs, BCM2835_SPI_CS, cs | BCM2835_SPI_CS_CLEAR_TX);
+	}
+
+	if (likely(!bs->tx_spillover)) {
+		tfr->tx_sg.sgl[0].dma_address += bs->tx_prologue;
+		tfr->tx_sg.sgl[0].length      -= bs->tx_prologue;
+	} else {
+		tfr->tx_sg.sgl[0].length       = 0;
+		tfr->tx_sg.sgl[1].dma_address += 4;
+		tfr->tx_sg.sgl[1].length      -= 4;
+	}
+}
+
+/**
+ * bcm2835_spi_undo_prologue() - reconstruct original sglist state
+ * @bs: BCM2835 SPI controller
+ *
+ * Undo changes which were made to an SPI transfer's sglist when transmitting
+ * the prologue.  This is necessary to ensure the same memory ranges are
+ * unmapped that were originally mapped.
+ */
+static void bcm2835_spi_undo_prologue(struct bcm2835_spi *bs)
+{
+	struct spi_transfer *tfr = bs->tfr;
+
+	if (!bs->tx_prologue)
+		return;
+
+	if (bs->rx_prologue) {
+		tfr->rx_sg.sgl[0].dma_address -= bs->rx_prologue;
+		tfr->rx_sg.sgl[0].length      += bs->rx_prologue;
+	}
+
+	if (likely(!bs->tx_spillover)) {
+		tfr->tx_sg.sgl[0].dma_address -= bs->tx_prologue;
+		tfr->tx_sg.sgl[0].length      += bs->tx_prologue;
+	} else {
+		tfr->tx_sg.sgl[0].length       = bs->tx_prologue - 4;
+		tfr->tx_sg.sgl[1].dma_address -= 4;
+		tfr->tx_sg.sgl[1].length      += 4;
+	}
+}
+
 static void bcm2835_spi_dma_done(void *data)
 {
 	struct spi_master *master = data;
@@ -233,6 +455,7 @@ static void bcm2835_spi_dma_done(void *data)
 	 */
 	if (cmpxchg(&bs->dma_pending, true, false)) {
 		dmaengine_terminate_all(master->dma_tx);
+		bcm2835_spi_undo_prologue(bs);
 	}
 
 	/* and mark as completed */;
@@ -283,20 +506,6 @@ static int bcm2835_spi_prepare_sg(struct spi_master *master,
 	return dma_submit_error(cookie);
 }
 
-static inline int bcm2835_check_sg_length(struct sg_table *sgt)
-{
-	int i;
-	struct scatterlist *sgl;
-
-	/* check that the sg entries are word-sized (except for last) */
-	for_each_sg(sgt->sgl, sgl, (int)sgt->nents - 1, i) {
-		if (sg_dma_len(sgl) % 4)
-			return -EFAULT;
-	}
-
-	return 0;
-}
-
 static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 					struct spi_device *spi,
 					struct spi_transfer *tfr,
@@ -305,18 +514,16 @@ static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 	struct bcm2835_spi *bs = spi_master_get_devdata(master);
 	int ret;
 
-	/* check that the scatter gather segments are all a multiple of 4 */
-	if (bcm2835_check_sg_length(&tfr->tx_sg) ||
-	    bcm2835_check_sg_length(&tfr->rx_sg)) {
-		dev_warn_once(&spi->dev,
-			      "scatter gather segment length is not a multiple of 4 - falling back to interrupt mode\n");
-		return bcm2835_spi_transfer_one_irq(master, spi, tfr, cs);
-	}
+	/*
+	 * Transfer first few bytes without DMA if length of first TX or RX
+	 * sglist entry is not a multiple of 4 bytes (hardware limitation).
+	 */
+	bcm2835_spi_transfer_prologue(master, tfr, bs, cs);
 
 	/* setup tx-DMA */
 	ret = bcm2835_spi_prepare_sg(master, tfr, true);
 	if (ret)
-		return ret;
+		goto err_reset_hw;
 
 	/* start TX early */
 	dma_async_issue_pending(master->dma_tx);
@@ -325,7 +532,7 @@ static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 	bs->dma_pending = 1;
 
 	/* set the DMA length */
-	bcm2835_wr(bs, BCM2835_SPI_DLEN, tfr->len);
+	bcm2835_wr(bs, BCM2835_SPI_DLEN, bs->tx_len);
 
 	/* start the HW */
 	bcm2835_wr(bs, BCM2835_SPI_CS,
@@ -340,8 +547,7 @@ static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 		/* need to reset on errors */
 		dmaengine_terminate_all(master->dma_tx);
 		bs->dma_pending = false;
-		bcm2835_spi_reset_hw(master);
-		return ret;
+		goto err_reset_hw;
 	}
 
 	/* start rx dma late */
@@ -349,6 +555,11 @@ static int bcm2835_spi_transfer_one_dma(struct spi_master *master,
 
 	/* wait for wakeup in framework */
 	return 1;
+
+err_reset_hw:
+	bcm2835_spi_reset_hw(master);
+	bcm2835_spi_undo_prologue(bs);
+	return ret;
 }
 
 static bool bcm2835_spi_can_dma(struct spi_master *master,
@@ -372,25 +583,6 @@ static bool bcm2835_spi_can_dma(struct spi_master *master,
 		return false;
 	}
 
-	/* if we run rx/tx_buf with word aligned addresses then we are OK */
-	if ((((size_t)tfr->rx_buf & 3) == 0) &&
-	    (((size_t)tfr->tx_buf & 3) == 0))
-		return true;
-
-	/* otherwise we only allow transfers within the same page
-	 * to avoid wasting time on dma_mapping when it is not practical
-	 */
-	if (((size_t)tfr->tx_buf & (PAGE_SIZE - 1)) + tfr->len > PAGE_SIZE) {
-		dev_warn_once(&spi->dev,
-			      "Unaligned spi tx-transfer bridging page\n");
-		return false;
-	}
-	if (((size_t)tfr->rx_buf & (PAGE_SIZE - 1)) + tfr->len > PAGE_SIZE) {
-		dev_warn_once(&spi->dev,
-			      "Unaligned spi rx-transfer bridging page\n");
-		return false;
-	}
-
 	/* return OK */
 	return true;
 }
@@ -614,6 +806,7 @@ static void bcm2835_spi_handle_err(struct spi_master *master,
 	if (cmpxchg(&bs->dma_pending, true, false)) {
 		dmaengine_terminate_all(master->dma_tx);
 		dmaengine_terminate_all(master->dma_rx);
+		bcm2835_spi_undo_prologue(bs);
 	}
 	/* and reset */
 	bcm2835_spi_reset_hw(master);