/*
 * SPI port driver for ETRAX FS et al. using a synchronous serial
 * port, but simplified by using the spi_bitbang framework.
 *
 * Copyright (c) 2007 Axis Communications AB
 *
 * Author: Hans-Peter Nilsson, though copying parts of
 * spi_s3c24xx_gpio.c, hence also:
 * Copyright (c) 2006 Ben Dooks
 * Copyright (c) 2006 Simtec Electronics
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License version 2 as
 * published by the Free Software Foundation.
 *
 * This driver restricts frequency, polarity, "word" length and endian
 * much more than the hardware does.  I'm happy to unrestrict it, but
 * only with what I can test myself (at time of writing, just SD/MMC
 * SPI) and what people actually test and report.
 */

#include <linux/types.h>
#include <linux/device.h>
#include <linux/spi/spi.h>
#include <linux/spi/spi_bitbang.h>
#include <linux/delay.h>
#include <linux/platform_device.h>
#include <linux/interrupt.h>
#include <asm/io.h>
#include <asm/arch/board.h>
#include <asm/arch/hwregs/reg_map.h>
#include <asm/arch/hwregs/reg_rdwr.h>
#include <asm/arch/hwregs/sser_defs.h>
#include <asm/arch/dma.h>
#include <asm/arch/hwregs/dma.h>

/* A size "not much larger" than the max typical transfer size.  */
#define DMA_CHUNKSIZ 512

/*
 * For a transfer expected to take this long, we busy-wait instead of enabling
 * interrupts.
 */
#define IRQ_USAGE_THRESHOLD_NS 14000

/* A few register access macros to avoid verbiage and reduce typos.  */
#define REG_RD_DI(reg) REG_RD(dma, regi_dmain, reg)
#define REG_RD_DO(reg) REG_RD(dma, regi_dmaout, reg)
#define REG_RD_SSER(reg) REG_RD(sser, regi_sser, reg)
#define REG_WR_DI(reg, val) REG_WR(dma, regi_dmain, reg, val)
#define REG_WR_DO(reg, val) REG_WR(dma, regi_dmaout, reg, val)
#define REG_WR_SSER(reg, val) REG_WR(sser, regi_sser, reg, val)
#define REG_WRINT_DI(reg, val) REG_WR_INT(dma, regi_dmain, reg, val)
#define REG_WRINT_DO(reg, val) REG_WR_INT(dma, regi_dmaout, reg, val)
#define REG_WRINT_SSER(reg, val) REG_WR_INT(sser, regi_sser, reg, val)
#define REG_RDINT_DI(reg) REG_RD_INT(dma, regi_dmain, reg)
#define REG_RDINT_DO(reg) REG_RD_INT(dma, regi_dmaout, reg)
#define REG_RDINT_SSER(reg) REG_RD_INT(sser, regi_sser, reg)

#define DMA_WAIT_UNTIL_RESET(inst)			\
  do {							\
	reg_dma_rw_stat r;				\
	do {						\
	  	r = REG_RD(dma, (inst), rw_stat);	\
	} while (r.mode != regk_dma_rst);		\
  } while (0)

#define DMA_BUSY(inst) (REG_RD(dma, inst, rw_stream_cmd)).busy

/* Our main driver state.  */
struct crisv32_spi_hw_info {
	struct crisv32_regi_n_int sser;
	struct crisv32_regi_n_int dmain;
	struct crisv32_regi_n_int dmaout;

	reg_sser_rw_cfg cfg;
	reg_sser_rw_frm_cfg frm_cfg;
	reg_sser_rw_tr_cfg tr_cfg;
	reg_sser_rw_rec_cfg rec_cfg;
	reg_sser_rw_extra extra;

	/* We store the speed in kHz, so we can have expressions
	 * multiplying 100MHz by * 4 before dividing by it, and still
	 * keep it in an u32. */
	u32 effective_speed_kHz;

	/*
	 * The time in 10s of nanoseconds for half a cycles.
	 * For convenience and performance; derived from the above.
	 */
	u32 half_cycle_delay_ns;

	/* This should be overridable by a module parameter.  */
	u32 max_speed_Hz;

	/* Pre-computed timout for the max transfer chunk-size.  */
	u32 dma_timeout;

	struct completion dma_done;

	/*
	 * If we get a timeout from wait_for_completion_timeout on the
	 * above, first look at this before panicking.
	 */
	u32 dma_actually_done;

	/*
	 * Resources don't seem available at the remove call, so we
	 * have to save information we get through them.
	 */
	struct crisv32_spi_sser_controller_data *gc;
};

/*
 * The driver state hides behind the spi_bitbang state; we're
 * responsible for allocating that, so we can get a little something
 * for ourselves.
 */
struct crisv32_spi_sser_devdata {
	struct spi_bitbang bitbang;
	struct crisv32_spi_hw_info hw;
};

/* Our DMA descriptors that need alignment.  */
struct crisv32_spi_dma_descrs {
	dma_descr_context in_ctxt __attribute__ ((__aligned__(32)));
	dma_descr_context out_ctxt __attribute__ ((__aligned__(32)));

	/*
	 * The code takes advantage of the fact that in_descr and
	 * out_descr are on the same cache-line when working around
	 * the cache-bug in TR 106.
	 */
	dma_descr_data in_descr __attribute__ ((__aligned__(16)));
	dma_descr_data out_descr __attribute__ ((__aligned__(16)));
};

/*
 * Whatever needs DMA access is here, besides whatever DMA-able memory
 * comes in transfers.
 */
struct crisv32_spi_dma_cs {
	struct crisv32_spi_dma_descrs *descrp;

	/* Scratch-buffers when the original was non-DMA.  */
	u8 rx_buf[DMA_CHUNKSIZ];
	u8 tx_buf[DMA_CHUNKSIZ];
};

/*
 * Max speed.  If set, we won't go faster, promise.  May be useful
 * when dealing with weak hardware; misrouted signal paths or various
 * debug-situations.
 */
static ulong crisv32_spi_speed_limit_Hz = 0;

/* Helper function getting the driver state from a spi_device.  */

static inline struct crisv32_spi_hw_info *spidev_to_hw(struct spi_device *spi)
{
	struct crisv32_spi_sser_devdata *dd = spi_master_get_devdata(spi->master);
	return &dd->hw;
}

/* SPI-bitbang word transmit-function for non-DMA.  */

static u32 crisv32_spi_sser_txrx_mode3(struct spi_device *spi,
				       unsigned nsecs, u32 word, u8 bits)
{
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	u32 regi_sser = hw->sser.regi;
	reg_sser_rw_ack_intr ack_intr = { .trdy = 1, .rdav = 1 };
	reg_sser_r_intr intr = {0};
	reg_sser_rw_tr_data w_data = { .data = (u8) word };
	reg_sser_r_rec_data r_data;
	u32 i;

	/*
	 * The timeout reflects one iteration per 10ns (impossible at
	 * 200MHz clock even without the ndelay) and a wait for a full
	 * byte.
	 */
	u32 timeout = 1000000/10*8/hw->effective_speed_kHz;

	BUG_ON(bits != 8);

	intr = REG_RD_SSER(r_intr);

	/*
	 * We should never get xruns when we control the transmitter
	 * and receiver in register mode.  And if we don't have
	 * transmitter-ready and data-ready on entry, something's
	 * seriously fishy.
	 */
	if (!intr.trdy || !intr.rdav || intr.orun || intr.urun)
		panic("sser hardware or SPI driver broken (1) 0x%x\n",
		      REG_TYPE_CONV(u32, reg_sser_r_intr, intr));

	REG_WR_SSER(rw_ack_intr, ack_intr);
	REG_WR_SSER(rw_tr_data, w_data);

	for (i = 0; i < timeout; i++) {
		intr = REG_RD_SSER(r_intr);
		/* Wait for received data.  */
		if (intr.rdav)
			break;
		ndelay(10);
	}

	if (!(intr.trdy && intr.rdav) || intr.orun || intr.urun)
		panic("sser hardware or SPI driver broken (2) 0x%x\n",
		      REG_TYPE_CONV(u32, reg_sser_r_intr, intr));

	r_data = REG_RD_SSER(r_rec_data);
	return r_data.data & 0xff;
}

/*
 * Wait for 1/2 bit-time if the transmitter or receiver is enabled.
 * We need to do this as the data-available indications may arrive
 * right at the edge, with half the last cycle remaining.
 */
static void inline crisv32_spi_sser_wait_halfabit(struct crisv32_spi_hw_info
						  *hw)
{
	if (hw->cfg.en)
		ndelay(hw->half_cycle_delay_ns);
}

/*
 * Assert or de-assert chip-select.
 * We have two functions, with the active one assigned to the bitbang
 * slot at setup, to avoid a performance penalty (1% on reads).
 */
static void crisv32_spi_sser_chip_select_active_high(struct spi_device *spi,
						     int value)
{
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	u32 regi_sser = hw->sser.regi;

	/*
	 * We may have received data at the "last producing clock
	 * edge".  Thus we delay for another half a clock cycle.
	 */
	crisv32_spi_sser_wait_halfabit(hw);

	hw->frm_cfg.frame_pin_use
		= value == BITBANG_CS_ACTIVE ? regk_sser_gio1 : regk_sser_gio0;
	REG_WR_SSER(rw_frm_cfg, hw->frm_cfg);
}

static void crisv32_spi_sser_chip_select_active_low(struct spi_device *spi,
						    int value)
{
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	u32 regi_sser = hw->sser.regi;

	crisv32_spi_sser_wait_halfabit(hw);
	hw->frm_cfg.frame_pin_use
		= value == BITBANG_CS_ACTIVE ? regk_sser_gio0 : regk_sser_gio1;
	REG_WR_SSER(rw_frm_cfg, hw->frm_cfg);
}

/* Set the transmission speed in Hz.  */

static int crisv32_spi_sser_set_speed_Hz(struct crisv32_spi_hw_info *hw,
					 u32 Hz)
{
	u32 kHz;
	u32 ns_delay;
	u32 regi_sser = hw->sser.regi;

	if (Hz > hw->max_speed_Hz)
		/*
		 * Should we complain?  Return error?  Current caller
		 * sequences want just the max speed.
		 */
		Hz = hw->max_speed_Hz;

	kHz = Hz/1000;

	/*
	 * If absolutely needed, we *could* change the base frequency
	 * and go lower.  Usually, a frequency set higher than wanted
	 * is a problem but lower isn't.
	 */
	if (Hz < 100000000 / 65536 + 1) {
		printk(KERN_ERR "attempt to set invalid sser speed: %u Hz\n",
		       Hz);
		Hz = 100000000 / 65536 + 1;
	}

	pr_debug("setting sser speed to %u Hz\n", Hz);

	/*
	 * Avoid going above the requested speed if there's a
	 * remainder for the 100 MHz clock-divider calculation, but
	 * don't unnecessarily go below if it's even.
	 */
	hw->cfg.clk_div = 100000000/Hz - ((100000000 % Hz) == 0);

	/* Make sure there's no ongoing transmission. */
	crisv32_spi_sser_wait_halfabit(hw);

	/*
	 * Wait for 3 times max of the old and the new clock before and after
	 * changing the frequency.  Not because of documentation or empirical
	 * need, but because it seems sane to do so.  The three-bit-times
	 * value is because that's the documented time it takes for a reset to
	 * take effect.
	 */
	ns_delay = 1000000*3/(kHz > hw->effective_speed_kHz
			      ? kHz : hw->effective_speed_kHz);
	ndelay(ns_delay);
	REG_WR_SSER(rw_cfg, hw->cfg);
	ndelay(ns_delay);

	hw->effective_speed_kHz = kHz;

	/*
	 * A timeout of twice the time for the largest chunk (not
	 * counting DMA overhead) plus one jiffy, should be more than
	 * enough for the transmission.
	 */
	hw->dma_timeout = 1 + usecs_to_jiffies(1000*2*DMA_CHUNKSIZ*8/kHz);

	hw->half_cycle_delay_ns
		= 1000000/2/hw->effective_speed_kHz;

	pr_debug(".clk_div %d, half %d, eff %d\n",
		 hw->cfg.clk_div, hw->half_cycle_delay_ns,
		 hw->effective_speed_kHz);
	return 0;
}

/*
 * Set up transmitter and receiver for non-DMA access.
 * Unfortunately, it doesn't seem like hispeed works for this mode
 * (mea culpa), so we're stuck with lospeed-mode.  A little slower,
 * but that's what you get for not allocating DMA.
 */
static int crisv32_setup_spi_sser_for_reg_access(struct crisv32_spi_hw_info *hw)
{
	u32 regi_sser = hw->sser.regi;

	reg_sser_rw_cfg cfg = {0};
	reg_sser_rw_frm_cfg frm_cfg = {0};
	reg_sser_rw_tr_cfg tr_cfg = {0};
	reg_sser_rw_rec_cfg rec_cfg = {0};
	reg_sser_rw_intr_mask mask = {0};
	reg_sser_rw_extra extra = {0};
	reg_sser_rw_tr_data tr_data = {0};
	reg_sser_r_intr intr;

	cfg.en = 0;
	tr_cfg.tr_en = 1;
	rec_cfg.rec_en = 1;
	REG_WR_SSER(rw_cfg, cfg);
	REG_WR_SSER(rw_tr_cfg, tr_cfg);
	REG_WR_SSER(rw_rec_cfg, rec_cfg);
	REG_WR_SSER(rw_intr_mask, mask);

	/*
	 * See 23.7.2 SPI in the hardware documentation.
	 * Except our configuration uses bulk mode; MMC/SD-SPI
	 * isn't isochronous in nature.
	 * Step 1.
	 */
	cfg.gate_clk = regk_sser_yes;
	cfg.clkgate_in = regk_sser_no;
	cfg.clkgate_ctrl = regk_sser_tr;

	/* Step 2.  */
	cfg.out_clk_pol = regk_sser_pos;
	cfg.out_clk_src = regk_sser_intern_clk;

	/* Step 3.  */
	tr_cfg.clk_src = regk_sser_intern;
	rec_cfg.clk_src = regk_sser_intern;
	frm_cfg.clk_src = regk_sser_intern;

	/* Step 4.  */
	tr_cfg.clk_pol = regk_sser_neg;
	rec_cfg.clk_pol = regk_sser_pos;
	frm_cfg.clk_pol = regk_sser_neg;

	/*
	 * Step 5: frame pin (PC03 or PD03) is frame; the status pin
	 * (PC02, PD02) is configured as input.
	 */
	frm_cfg.frame_pin_dir = regk_sser_out;

	/*
	 * Contrary to the doc example, we don't generate the frame
	 * signal "automatically".  This setting of the frame pin as
	 * constant 1, reflects an inactive /CS setting, for just idle
	 * clocking.  When we need to transmit or receive data, we
	 * change it.
	 */
	frm_cfg.frame_pin_use = regk_sser_gio1;
	frm_cfg.status_pin_dir = regk_sser_in;

	/*
	 * Step 6.  This is probably not necessary, as we don't
	 * generate the frame signal automatically.  Nevertheless,
	 * modified for bulk transmission.
	 */
	frm_cfg.out_on = regk_sser_tr;
	frm_cfg.out_off = regk_sser_tr;

	/* Step 7.  Similarly, maybe not necessary.  */
	frm_cfg.type = regk_sser_level;
	frm_cfg.level = regk_sser_neg_lo;

	/* Step 8.  These we have to set according to the bulk mode,
	 * which for tr_delay is the same as for iso; a value of 1
	 * means in sync with the frame signal.  For rec_delay, we
	 * start it at the same time as the transmitter.  See figure
	 * 23.7 in the hw documentation.  */
	frm_cfg.tr_delay = 1;
	frm_cfg.rec_delay = 0;

	/* Step 9.  */
	tr_cfg.sample_size = 7;
	rec_cfg.sample_size = 7;

	/* Step 10.  */
	frm_cfg.wordrate = 7;

	/* Step 11 (but for bulk).  */
	tr_cfg.rate_ctrl = regk_sser_bulk;

	/*
	 * Step 12.  Similarly, maybe not necessary; still, modified
	 * for bulk.
	 */
	tr_cfg.frm_src = regk_sser_intern;
	rec_cfg.frm_src = regk_sser_tx_bulk;

	/* Step 13.  */
	tr_cfg.mode = regk_sser_lospeed;
	rec_cfg.mode = regk_sser_lospeed;

	/* Step 14.  */
	tr_cfg.sh_dir = regk_sser_msbfirst;
	rec_cfg.sh_dir = regk_sser_msbfirst;

	/*
	 * Extra step for bulk-specific settings and other general
	 * settings not specified in the SPI config example.
	 * It's uncertain whether all of these are needed.
	 */
	tr_cfg.bulk_wspace = 1;
	tr_cfg.use_dma = 0;

	tr_cfg.urun_stop = 1;
	rec_cfg.orun_stop = 1;
	rec_cfg.use_dma = 0;

	rec_cfg.fifo_thr = regk_sser_inf;
	frm_cfg.early_wend = regk_sser_yes;

	cfg.clk_dir = regk_sser_out;
	tr_cfg.data_pin_use = regk_sser_dout;
	cfg.base_freq = regk_sser_f100;

	/* Setup for the initial frequency given to us.  */
	hw->cfg = cfg;
	crisv32_spi_sser_set_speed_Hz(hw, hw->max_speed_Hz);
	cfg = hw->cfg;

	/*
	 * Write it all, except cfg which is already written by
	 * crisv32_spi_sser_set_speed_Hz.
	 */
	REG_WR_SSER(rw_frm_cfg, frm_cfg);
	REG_WR_SSER(rw_tr_cfg, tr_cfg);
	REG_WR_SSER(rw_rec_cfg, rec_cfg);
	REG_WR_SSER(rw_extra, extra);

	/*
	 * The transmit-register needs to be written before the
	 * transmitter is enabled, and to get a valid trdy signal
	 * waiting for us when we want to transmit a byte.  Because
	 * the "frame event" is that the transmitter is written, this
	 * will cause a dummy 0xff-byte to be transmitted, but that's
	 * ok, because /CS is inactive.
	 */
	tr_data.data = 0xffff;
	REG_WR_SSER(rw_tr_data, tr_data);

	/*
	 * We ack everything interrupt-wise; left-over indicators don't have
	 * to come from *this* code.
	 */
	REG_WRINT_SSER(rw_ack_intr, -1);

	/*
	 * Wait 3 cycles before enabling, after the transmit register
	 * has been written.  (This'll be just a few microseconds for
	 * e.g. 400 KHz.)
	 */
	ndelay(3 * 2 * hw->half_cycle_delay_ns);
	cfg.en = 1;

	REG_WR_SSER(rw_cfg, cfg);

	/*
	 * Now wait for 8 + 3 cycles.  The 0xff byte should now have
	 * been transmitted and dummy data received.
	 */
	ndelay((8 + 3) * 2 * hw->half_cycle_delay_ns);

	/*
	 * Sanity-check that we have data-available and the
	 * transmitter is ready to send new data.
	 */
	intr = REG_RD_SSER(r_intr);
	if (!intr.rdav || !intr.trdy)
		panic("sser hw or SPI driver broken (3) 0x%x",
		      REG_TYPE_CONV(u32, reg_sser_r_intr, intr));

	hw->frm_cfg = frm_cfg;
	hw->tr_cfg = tr_cfg;
	hw->rec_cfg = rec_cfg;
	hw->extra = extra;
	hw->cfg = cfg;
	return 0;
}

/* Initialization, maybe fault recovery.  */

static void crisv32_reset_dma_hw(u32 regi)
{
	REG_WR_INT(dma, regi, rw_intr_mask, 0);

	DMA_RESET(regi);
	DMA_WAIT_UNTIL_RESET(regi);
	DMA_ENABLE(regi);
	REG_WR_INT(dma, regi, rw_ack_intr, -1);

	DMA_WR_CMD(regi, regk_dma_set_w_size1);
}

/* Interrupt from SSER, for use with DMA when only the transmitter is used.  */

static irqreturn_t sser_interrupt(int irqno, void *arg)
{
	struct crisv32_spi_hw_info *hw = arg;
	u32 regi_sser = hw->sser.regi;
	reg_sser_r_intr intr = REG_RD_SSER(r_intr);

	if (intr.tidle == 0 && intr.urun == 0) {
		printk(KERN_ERR
		       "sser @0x%x: spurious sser intr, flags: 0x%x\n",
		       regi_sser, REG_TYPE_CONV(u32, reg_sser_r_intr, intr));
	} else if (intr.urun == 0) {
		hw->dma_actually_done = 1;
		complete(&hw->dma_done);
	} else {
		/*
		 * Make any reception time out and notice the error,
		 * which it might not otherwise do data was *received*
		 * successfully.
		 */
		u32 regi_dmain = hw->dmain.regi;

		/*
		 * Recommended practice before acking urun is to turn
		 * off sser.  That might not be enough to stop DMA-in
		 * from signalling success if the underrun was late in
		 * the transmission, so we disable the DMA-in
		 * interrupts too.
		 */
		REG_WRINT_SSER(rw_cfg, 0);
		REG_WRINT_DI(rw_intr_mask, 0);
		REG_WRINT_DI(rw_ack_intr, -1);
	}

	REG_WRINT_SSER(rw_intr_mask, 0);

	/*
	 * We must at least ack urun together with tidle, but keep it
	 * simple and ack them all.
	 */
	REG_WRINT_SSER(rw_ack_intr, -1);

	return IRQ_HANDLED;
}

/*
 * Interrupt from receiver DMA connected to SSER, for use when the
 * receiver is used, with or without the transmitter.
 */
static irqreturn_t rec_dma_interrupt(int irqno, void *arg)
{
	struct crisv32_spi_hw_info *hw = arg;
	u32 regi_dmain = hw->dmain.regi;
	u32 regi_sser = hw->sser.regi;
	reg_dma_r_intr intr = REG_RD_DI(r_intr);

	if (intr.data == 0) {
		printk(KERN_ERR
		       "sser @0x%x: spurious rec dma intr, flags: 0x%x\n",
		       regi_dmain, REG_TYPE_CONV(u32, reg_dma_r_intr, intr));
	} else {
		hw->dma_actually_done = 1;
		complete(&hw->dma_done);
	}

	REG_WRINT_DI(rw_intr_mask, 0);

	/* Avoid false underrun indications; stop all sser interrupts.   */
	REG_WRINT_SSER(rw_intr_mask, 0);
	REG_WRINT_SSER(rw_ack_intr, -1);

	REG_WRINT_DI(rw_ack_intr, -1);
	return IRQ_HANDLED;
}

/*
 * Set up transmitter and receiver for DMA access.  We use settings
 * from the "Atmel fast flash" example.
 */
static int crisv32_setup_spi_sser_for_dma_access(struct crisv32_spi_hw_info
						 *hw)
{
	int ret;
	u32 regi_sser = hw->sser.regi;

	reg_sser_rw_cfg cfg = {0};
	reg_sser_rw_frm_cfg frm_cfg = {0};
	reg_sser_rw_tr_cfg tr_cfg = {0};
	reg_sser_rw_rec_cfg rec_cfg = {0};
	reg_sser_rw_intr_mask mask = {0};
	reg_sser_rw_extra extra = {0};

	cfg.en = 0;
	tr_cfg.tr_en = 1;
	rec_cfg.rec_en = 1;
	REG_WR_SSER(rw_cfg, cfg);
	REG_WR_SSER(rw_tr_cfg, tr_cfg);
	REG_WR_SSER(rw_rec_cfg, rec_cfg);
	REG_WR_SSER(rw_intr_mask, mask);

	/*
	 * See 23.7.5.2 (Atmel fast flash) in the hardware documentation.
	 * Step 1.
	 */
	cfg.gate_clk = regk_sser_no;

	/* Step 2.  */
	cfg.out_clk_pol = regk_sser_pos;

	/* Step 3.  */
	cfg.out_clk_src = regk_sser_intern_clk;

	/* Step 4.  */
	tr_cfg.sample_size = 1;
	rec_cfg.sample_size = 1;

	/* Step 5.  */
	frm_cfg.wordrate = 7;

	/* Step 6.  */
	tr_cfg.clk_src = regk_sser_intern;
	rec_cfg.clk_src = regk_sser_intern;
	frm_cfg.clk_src = regk_sser_intern;
	tr_cfg.clk_pol = regk_sser_neg;
	frm_cfg.clk_pol = regk_sser_neg;

	/* Step 7.  */
	rec_cfg.clk_pol = regk_sser_pos;

	/* Step 8.  */
	frm_cfg.tr_delay = 1;

	/* Step 9.  */
	frm_cfg.rec_delay = 1;

	/* Step 10.  */
	tr_cfg.sh_dir = regk_sser_msbfirst;
	rec_cfg.sh_dir = regk_sser_msbfirst;

	/* Step 11.  */
	tr_cfg.frm_src = regk_sser_intern;
	rec_cfg.frm_src = regk_sser_intern;

	/* Step 12.  */
	tr_cfg.rate_ctrl = regk_sser_iso;

	/*
	 * Step 13.  Note that 0 != tx_null, so we're good regarding
	 * the descriptor .md field.
	 */
	tr_cfg.eop_stop = 1;

	/* Step 14.  */
	frm_cfg.frame_pin_use = regk_sser_gio1;
	frm_cfg.frame_pin_dir = regk_sser_out;

	/* Step 15.  */
	extra.clkon_en = 1;
	extra.clkoff_en = 1;

	/* Step 16.  We'll modify this value for each "burst".  */
	extra.clkoff_cycles = 7;

	/* Step 17.  */
	cfg.prepare = 1;

	/*
	 * Things left out from the documented startup procedure.
	 * It's uncertain whether all of these are needed.
	 */
	frm_cfg.status_pin_dir = regk_sser_in;
	tr_cfg.mode = regk_sser_hispeed;
	rec_cfg.mode = regk_sser_hispeed;
	frm_cfg.out_on = regk_sser_intern_tb;
	frm_cfg.out_off = regk_sser_rec;
	frm_cfg.type = regk_sser_level;
	tr_cfg.use_dma = 1;
	tr_cfg.urun_stop = 1;
	rec_cfg.orun_stop = 1;
	rec_cfg.use_dma = 1;
	rec_cfg.fifo_thr = regk_sser_inf;
	frm_cfg.early_wend = regk_sser_yes;
	cfg.clk_dir = regk_sser_out;

	tr_cfg.data_pin_use = regk_sser_dout;
	cfg.base_freq = regk_sser_f100;

	REG_WR_SSER(rw_frm_cfg, frm_cfg);
	REG_WR_SSER(rw_tr_cfg, tr_cfg);
	REG_WR_SSER(rw_rec_cfg, rec_cfg);
	REG_WR_SSER(rw_extra, extra);
	REG_WR_SSER(rw_cfg, cfg);
	hw->frm_cfg = frm_cfg;
	hw->tr_cfg = tr_cfg;
	hw->rec_cfg = rec_cfg;
	hw->extra = extra;
	hw->cfg = cfg;

	crisv32_spi_sser_set_speed_Hz(hw, hw->max_speed_Hz);

	ret = request_irq(hw->sser.irq, sser_interrupt, 0, "sser", hw);
	if (ret != 0)
		goto noirq;

	ret = request_irq(hw->dmain.irq, rec_dma_interrupt, 0, "sser rec", hw);
	if (ret != 0)
		goto free_outirq;

	crisv32_reset_dma_hw(hw->dmain.regi);
	crisv32_reset_dma_hw(hw->dmaout.regi);
	return 0;

  free_outirq:
	free_irq(hw->sser.irq, hw);
  noirq:
	return ret;
}

/* SPI-master setup function for non-DMA.  */

static int crisv32_spi_sser_regs_master_setup(struct spi_device *spi)
{
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	struct spi_bitbang *bitbang = spi_master_get_devdata(spi->master);
	int ret = 0;

	/* Just do a little initial constraining checks.  */
	if (spi->bits_per_word == 0)
		spi->bits_per_word = 8;

	if (spi->bits_per_word != 8)
		return -EINVAL;

	bitbang->chipselect = (spi->mode & SPI_CS_HIGH) != 0
		? crisv32_spi_sser_chip_select_active_high
		: crisv32_spi_sser_chip_select_active_low;

	if (hw->max_speed_Hz == 0) {
		u32 max_speed_Hz;

		/*
		 * At this time; at the first call to the SPI master
		 * setup function, spi->max_speed_hz reflects the
		 * board-init value.  It will be changed later on by
		 * the protocol master, but at the master setup call
		 * is the only time we actually get to see the hw max
		 * and thus a reasonable time to init the hw field.
		 */

		/* The module parameter overrides everything.  */
		if (crisv32_spi_speed_limit_Hz != 0)
			max_speed_Hz = crisv32_spi_speed_limit_Hz;
		/*
		 * I never could get hispeed mode to work for non-DMA.
		 * We adjust the max speed here (where we could
		 * presumably fix it), not in the board info file.
		 */
		else if (spi->max_speed_hz > 16667000)
			max_speed_Hz = 16667000;
		else
			max_speed_Hz = spi->max_speed_hz;

		hw->max_speed_Hz = max_speed_Hz;
		spi->max_speed_hz = max_speed_Hz;

		/*
		 * We also do one-time initialization of the hardware at this
		 * point.  We could defer to the return to the probe-function
		 * from spi_bitbang_start, but other hardware setup (like
		 * subsequent calls to this function before that) would have
		 * to be deferred until then too.
		 */
		ret = crisv32_setup_spi_sser_for_reg_access(hw);
		if (ret != 0)
			return ret;

		ret = spi_bitbang_setup(spi);
		if (ret != 0)
			return ret;

		dev_info(&spi->dev,
			 "CRIS v32 SPI driver for sser%d\n",
			 spi->master->bus_num);
	}

	return 0;
}

/*
 * SPI-master setup_transfer-function used for both DMA and non-DMA
 * (single function for DMA, together with spi_bitbang_setup_transfer
 * for non-DMA).
 */

static int crisv32_spi_sser_common_setup_transfer(struct spi_device *spi,
						  struct spi_transfer *t)
{
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	u8 bits_per_word;
	u32 hz;
	int ret = 0;

	if (t) {
		bits_per_word = t->bits_per_word;
		hz = t->speed_hz;
	} else {
		bits_per_word = 0;
		hz = 0;
	}

	if (bits_per_word == 0)
		bits_per_word = spi->bits_per_word;

	if (bits_per_word != 8)
		return -EINVAL;

	if (hz == 0)
		hz = spi->max_speed_hz;

	if (hz != hw->effective_speed_kHz*1000 && hz != 0)
		ret = crisv32_spi_sser_set_speed_Hz(hw, hz);

	return ret;
}

/* Helper for a SPI-master setup_transfer function for non-DMA.  */

static int crisv32_spi_sser_regs_setup_transfer(struct spi_device *spi,
						struct spi_transfer *t)
{
	int ret = crisv32_spi_sser_common_setup_transfer(spi, t);

	if (ret != 0)
		return ret;

	/* Set up the loop-over-buffer parts.  */
	return spi_bitbang_setup_transfer (spi, t);
}

/* SPI-master setup function for DMA.  */

static int crisv32_spi_sser_dma_master_setup(struct spi_device *spi)
{
	/*
	 * As we don't dispatch to the spi_bitbang default function,
	 * we need to do whatever tests it does; keep it in sync.  On
	 * the bright side, we can use the spi->controller_state slot;
	 * we use it for DMA:able memory for the descriptors and
	 * temporary buffers to copy non-DMA:able transfers.
	 */
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	struct spi_bitbang *bitbang = spi_master_get_devdata(spi->master);
	struct crisv32_spi_dma_cs *cs;
	u32 dmasize;
	int ret = 0;

	if (hw->max_speed_Hz == 0) {
		struct crisv32_spi_dma_descrs *descrp;
		u32 descrp_dma;
		u32 max_speed_Hz;

		/* The module parameter overrides everything.  */
		if (crisv32_spi_speed_limit_Hz != 0)
			max_speed_Hz = crisv32_spi_speed_limit_Hz;
		/*
		 * See comment at corresponding statement in
		 * crisv32_spi_sser_regs_master_setup.
		 */
		else
			max_speed_Hz = spi->max_speed_hz;

		hw->max_speed_Hz = max_speed_Hz;
		spi->max_speed_hz = max_speed_Hz;

		ret = crisv32_setup_spi_sser_for_dma_access(hw);
		if (ret != 0)
			return ret;

		/* Allocate some extra for necessary alignment.  */
		dmasize = sizeof *cs + 31
			+ sizeof(struct crisv32_spi_dma_descrs);

		cs = kzalloc(dmasize, GFP_KERNEL | GFP_DMA);
		if (cs == NULL)
			return -ENOMEM;

		/*
		 * Make descriptors aligned within the allocated area,
		 * some-place after cs.
		 */
		descrp = (struct crisv32_spi_dma_descrs *)
			(((u32) (cs + 1) + 31) & ~31);
		descrp_dma = virt_to_phys(descrp);

		/* Set up the "constant" parts of the descriptors.  */
		descrp->out_descr.eol = 1;
		descrp->out_descr.intr = 1;
		descrp->out_descr.out_eop = 1;
		descrp->out_ctxt.saved_data = (dma_descr_data *)
		  (descrp_dma
		   + offsetof(struct crisv32_spi_dma_descrs, out_descr));
		descrp->out_ctxt.next = 0;

		descrp->in_descr.eol = 1;
		descrp->in_descr.intr = 1;
		descrp->in_ctxt.saved_data = (dma_descr_data *)
			(descrp_dma
			 + offsetof(struct crisv32_spi_dma_descrs, in_descr));
		descrp->in_ctxt.next = 0;

		cs->descrp = descrp;
		spi->controller_state = cs;

		init_completion(&hw->dma_done);

		dev_info(&spi->dev,
			 "CRIS v32 SPI driver for sser%d/DMA\n",
			 spi->master->bus_num);
	}

	/* Do our extra constraining checks.  */
	if (spi->bits_per_word == 0)
		spi->bits_per_word = 8;

	if (spi->bits_per_word != 8)
		return -EINVAL;

	/* SPI_LSB_FIRST deliberately left out, and we only support mode 3.  */
	if ((spi->mode & ~(SPI_TX_1|SPI_CS_HIGH)) != SPI_MODE_3)
		return -EINVAL;

	bitbang->chipselect = (spi->mode & SPI_CS_HIGH) != 0
		? crisv32_spi_sser_chip_select_active_high
		: crisv32_spi_sser_chip_select_active_low;

	ret = bitbang->setup_transfer(spi, NULL);
	if (ret != 0)
		return ret;

	/* Remember to de-assert chip-select before the first transfer.  */
	spin_lock(&bitbang->lock);
	if (!bitbang->busy) {
		bitbang->chipselect(spi, BITBANG_CS_INACTIVE);
		ndelay(hw->half_cycle_delay_ns);
	}
	spin_unlock(&bitbang->lock);

	return 0;
}

/* SPI-master cleanup function for DMA.  */

static void crisv32_spi_sser_dma_cleanup(struct spi_device *spi)
{
	kfree(spi->controller_state);
	spi->controller_state = NULL;
}

/*
 * Set up DMA transmitter descriptors for a chunk of data.
 * The caller is responsible for working around TR 106.
 */
static void crisv32_spi_sser_setup_dma_descr_out(u32 regi,
						 struct crisv32_spi_dma_cs *cs,
						 u32 out_phys, u32 chunk_len)
{
	BUG_ON(chunk_len > DMA_CHUNKSIZ);
	struct crisv32_spi_dma_descrs *descrp = cs->descrp;
	u32 descrp_dma = virt_to_phys(descrp);

	descrp->out_descr.buf = (u8 *) out_phys;
	descrp->out_descr.after = (u8 *) out_phys + chunk_len;
	descrp->out_ctxt.saved_data_buf = (u8 *) out_phys;

	DMA_START_CONTEXT(regi,
			  descrp_dma
			  + offsetof(struct crisv32_spi_dma_descrs, out_ctxt));
}

/*
 * Set up DMA receiver descriptors for a chunk of data.
 * Also, work around TR 106.
 */
static void crisv32_spi_sser_setup_dma_descr_in(u32 regi_dmain,
						struct crisv32_spi_dma_cs *cs,
						u32 in_phys, u32 chunk_len)
{
	BUG_ON(chunk_len > DMA_CHUNKSIZ);
	struct crisv32_spi_dma_descrs *descrp = cs->descrp;
	u32 descrp_dma = virt_to_phys(descrp);

	descrp->in_descr.buf = (u8 *) in_phys;
	descrp->in_descr.after = (u8 *) in_phys + chunk_len;
	descrp->in_ctxt.saved_data_buf = (u8 *) in_phys;

	flush_dma_descr(&descrp->in_descr, 1);

	DMA_START_CONTEXT(regi_dmain,
			  descrp_dma
			  + offsetof(struct crisv32_spi_dma_descrs, in_ctxt));
}

/*
 * SPI-bitbang txrx_bufs function for DMA.
 * FIXME: We have SG DMA descriptors; use them.
 * (Requires abandoning the spi_bitbang framework if done reasonably.)
 */
static int crisv32_spi_sser_dma_txrx_bufs(struct spi_device *spi,
					  struct spi_transfer *t)
{
	struct crisv32_spi_dma_cs *cs = spi->controller_state;
	struct crisv32_spi_hw_info *hw = spidev_to_hw(spi);
	u32 len = t->len;
	reg_sser_rw_cfg cfg = hw->cfg;
	reg_sser_rw_tr_cfg tr_cfg = hw->tr_cfg;
	reg_sser_rw_rec_cfg rec_cfg = hw->rec_cfg;
	reg_sser_rw_extra extra = hw->extra;
	u32 regi_sser = hw->sser.regi;
	u32 dmain = 0;
	u32 dmaout = 0;
	u32 regi_dmain = hw->dmain.regi;
	u8 *rx_buf = t->rx_buf;

	/*
	 * Using IRQ+completion is measured to give an overhead of 14
	 * us, so let's instead busy-wait for the time that would be
	 * wasted anyway, and get back sooner.  We're not counting in
	 * other overhead such as the DMA descriptor in the
	 * time-expression, which causes us to use busy-wait for
	 * data-lengths that actually take a bit longer than
	 * IRQ_USAGE_THRESHOLD_NS.  Still, with IRQ_USAGE_THRESHOLD_NS
	 * = 14000, the threshold is for 20 MHz => 35 bytes, 25 => 44
	 * and 50 => 88 and the typical SPI transfer lengths for
	 * SDcard are { 1, 2, 7, 512 } bytes so a more complicated
	 * would likely give nothing but worse performance due to
	 * complexity.
	 */
	int use_irq = len * hw->half_cycle_delay_ns
		> IRQ_USAGE_THRESHOLD_NS / 8 / 2;

	if (len > DMA_CHUNKSIZ) {
		/*
		 * It should be quite easy to adjust the code if the need
		 * arises for something much larger than the preallocated
		 * buffers (which could themselves easily just be increased)
		 * but still what fits in extra.clkoff_cycles: kmalloc a
		 * temporary dmaable buffer in this function and free it at
		 * the end.  No need to optimize rare requests.  Until then,
		 * we'll keep the code as simple as performance allows.
		 * Alternatively or if we need to send even larger data,
		 * consider calling self with the required number of "faked"
		 * shorter transfers here.
		 */
		dev_err(&spi->dev,
			"Trying to transfer %d > max %d bytes:"
			" need to adjust the SPI driver\n",
			len, DMA_CHUNKSIZ);
		return -EMSGSIZE;
	}

	/*
	 * Need to separately tell the hispeed machinery the number of
	 * bits in this transmission.
	 */
	extra.clkoff_cycles = len * 8 - 1;

	if (t->tx_buf != NULL) {
		if (t->tx_dma == 0) {
			memcpy(cs->tx_buf, t->tx_buf, len);
			dmaout = virt_to_phys(cs->tx_buf);
		} else
			dmaout = t->tx_dma;

		crisv32_spi_sser_setup_dma_descr_out(hw->dmaout.regi,
						     cs, dmaout,
						     len);

		/* No need to do anything for TR 106; this DMA only reads.  */
		tr_cfg.tr_en = 1;
		tr_cfg.data_pin_use = regk_sser_dout;
	} else {
		tr_cfg.data_pin_use = (spi->mode & SPI_TX_1)
			? regk_sser_gio1 : regk_sser_gio0;
		tr_cfg.tr_en = 0;
	}

	if (rx_buf != 0) {
		if (t->rx_dma == 0)
			dmain = virt_to_phys(cs->rx_buf);
		else
			dmain = t->rx_dma;

		crisv32_spi_sser_setup_dma_descr_in(regi_dmain, cs,
						    dmain, len);
		rec_cfg.rec_en = 1;

		REG_WRINT_SSER(rw_ack_intr, -1);
		REG_WRINT_DI(rw_ack_intr, -1);

		/*
		 * If we're receiving, use the rec data interrupt from DMA as
		 * a signal that the HW is done.
		 */
		if (use_irq) {
			reg_sser_rw_intr_mask mask = { .urun = 1 };
			reg_dma_rw_intr_mask dmask = { .data = 1 };

			REG_WR_DI(rw_intr_mask, dmask);

			/*
			 * Catch transmitter underruns too.  We don't
			 * have to conditionalize that on the
			 * transmitter being enabled; it's off when
			 * the transmitter is off.  Any overruns will
			 * be indicated by a timeout, so we don't have
			 * to check for that specifically.
			 */
			REG_WR_SSER(rw_intr_mask, mask);
		}
	} else {
		rec_cfg.rec_en = 0;

		/*
		 * Ack previous overrun, underrun and tidle interrupts.  Or
		 * why not all.  We'll get orun and urun "normally" due to the
		 * way hispeed is (documented to) work and need to clear them,
		 * and we'll have a tidle from a previous transmit if we used
		 * to both receive and transmit, but now only transmit.
		 */
		REG_WRINT_SSER(rw_ack_intr, -1);

		if (use_irq) {
			reg_sser_rw_intr_mask mask = { .urun = 1, .tidle = 1 };
			REG_WR_SSER(rw_intr_mask, mask);
		}
	}

	REG_WR_SSER(rw_rec_cfg, rec_cfg);
	REG_WR_SSER(rw_tr_cfg, tr_cfg);
	REG_WR_SSER(rw_extra, extra);

	/*
	 * Barriers are needed to make sure that the completion inits don't
	 * migrate past the register writes due to gcc scheduling.
	 */
	mb();
	hw->dma_actually_done = 0;
	INIT_COMPLETION(hw->dma_done);
	mb();

	/*
	 * Wait until DMA tx FIFO has more than one byte (it reads one
	 * directly then one "very quickly") before starting sser tx.
	 */
	if (tr_cfg.tr_en) {
		u32 regi_dmaout = hw->dmaout.regi;
		u32 minlen = len > 2 ? 2 : len;
		while ((REG_RD_DO(rw_stat)).buf < minlen)
			;
	}

	/* Wait until DMA-in is finished reading the descriptors.  */
	if (rec_cfg.rec_en)
		while (DMA_BUSY(regi_dmain))
			;
	/*
	 * Wait 3 cycles before enabling (with .prepare = 1).
	 * FIXME: Can we cut this by some time already passed?
	 */
	ndelay(3 * 2 * hw->half_cycle_delay_ns);
	cfg.en = 1;
	REG_WR_SSER(rw_cfg, cfg);

	/*
	 * Wait 3 more cycles plus 30 ns before letting go.
	 * FIXME: Can we do something else before but after the
	 * previous cfg write and cut this by the time already passed?
	 */
	cfg.prepare = 0;
	hw->cfg = cfg;
	ndelay(3 * 2 * hw->half_cycle_delay_ns + 30);

	REG_WR_SSER(rw_cfg, cfg);

	/*, We'll disable sser next the time we change the configuration.  */
	cfg.en = 0;
	cfg.prepare = 1;
	hw->cfg = cfg;

	if (!use_irq) {
		/*
		 * We use a timeout corresponding to one iteration per ns,
		 * which of course is at least five * insns / loop times as
		 * much as reality, but we'll avoid a need for reading hw
		 * timers directly.
		 */
		u32 countdown = IRQ_USAGE_THRESHOLD_NS;

		do
			if (rec_cfg.rec_en == 0) {
				/* Using the transmitter only.  */
				reg_sser_r_intr intr = REG_RD_SSER(r_intr);

				if (intr.tidle != 0) {
					/*
					 * Almost done...  Just check if we
					 * had a transmitter underrun too.
					 */
					if (!intr.urun)
						goto transmission_done;

					/*
					 * Fall over to the "time is up" case;
					 * no need to provide a special path
					 * for the error case.
					 */
					countdown = 1;
				}
			} else {
				/* Using at least the receiver.  */
				if ((REG_RD_DI(r_intr)).data != 0) {
					if ((REG_RD_SSER(r_intr)).urun == 0)
						goto transmission_done;
					countdown = 1;
				}
			}
		while (--countdown != 0);

		/*
		 * The time is up.  Something might be wrong, or perhaps we've
		 * started using data lengths where the threshold was about a
		 * magnitude wrong.  Fall over to IRQ.  Remember not to ack
		 * interrupts here (but always above, before starting), else
		 * we'll have a race condition with the interrupt.
		 */
		if (!rec_cfg.rec_en) {
			reg_sser_rw_intr_mask mask = { .urun = 1, .tidle = 1 };
			REG_WR_SSER(rw_intr_mask, mask);
		} else {
			reg_dma_rw_intr_mask dmask = { .data = 1 };
			reg_sser_rw_intr_mask mask = { .urun = 1 };

			/*
			 * Never mind checking for tr being disabled; urun
			 * won't happen then.
			 */
			REG_WR_SSER(rw_intr_mask, mask);
			REG_WR_DI(rw_intr_mask, dmask);
		}
	}

	if (!wait_for_completion_timeout(&hw->dma_done, hw->dma_timeout)
	    /*
	     * Have to keep track manually too, else we'll get a timeout
	     * indication for being scheduled out too long, while the
	     * completion will still have trigged.
	     */
	    && !hw->dma_actually_done) {
		u32 regi_dmaout = hw->dmaout.regi;

		/*
		 * Transfer timed out.  Should not happen for a
		 * working controller, except perhaps if the system is
		 * badly conditioned, causing DMA memory bandwidth
		 * starvation.  Not much to do afterwards, but perhaps
		 * reset DMA and sser and hope it works the next time.
		 */
		REG_WRINT_SSER(rw_cfg, 0);
		REG_WR_SSER(rw_cfg, cfg);
		REG_WRINT_SSER(rw_intr_mask, 0);
		REG_WRINT_DI(rw_intr_mask, 0);
		REG_WRINT_SSER(rw_ack_intr, -1);
		crisv32_reset_dma_hw(hw->dmain.regi);
		crisv32_reset_dma_hw(hw->dmaout.regi);

		dev_err(&spi->dev, "timeout %u bytes %u kHz\n",
			len, hw->effective_speed_kHz);
		dev_err(&spi->dev, "sser=(%x,%x,%x,%x,%x)\n",
			REG_RDINT_SSER(rw_cfg), REG_RDINT_SSER(rw_tr_cfg),
			REG_RDINT_SSER(rw_rec_cfg), REG_RDINT_SSER(rw_extra),
			REG_RDINT_SSER(r_intr));
		dev_err(&spi->dev, "tx=(%x,%x,%x,%x)\n",
			dmaout, REG_RDINT_DO(rw_stat), REG_RDINT_DO(rw_data),
			REG_RDINT_DO(r_intr));
		dev_err(&spi->dev, "rx=(%x,%x,%x,%x)\n",
			dmain, REG_RDINT_DI(rw_stat), REG_RDINT_DI(rw_data),
			REG_RDINT_DI(r_intr));
		return -EIO;
	}

 transmission_done:
	/* Wait for the last half-cycle of the last cycle.  */
	crisv32_spi_sser_wait_halfabit(hw);

	/* Reset for another call.  */
	REG_WR_SSER(rw_cfg, cfg);

	/*
	 * If we had to use the temp DMAable rec buffer, copy it to the right
	 * position.
	 */
	if (t->rx_buf != 0 && t->rx_dma == 0)
		memcpy (t->rx_buf, cs->rx_buf, len);

	/*
	 * All clear.  The interrupt function disabled the interrupt, we don't
	 * have to do more.
	 */
	return len;
}

/* Platform-device probe function.  */

static int __devinit crisv32_spi_sser_probe(struct platform_device *dev)
{
	struct spi_master *master;
	struct crisv32_spi_sser_devdata *dd;
	struct crisv32_spi_hw_info *hw;
	struct resource *res;
	struct crisv32_spi_sser_controller_data *gc;
	int ret;

	/*
	 * We need to get the controller data as a hardware resource,
	 * or else it wouldn't be available until *after* the
	 * spi_bitbang_start call!
	 */
	res = platform_get_resource_byname(dev, 0, "controller_data_ptr");
	if (res == NULL) {
		dev_err(&dev->dev,
			"can't get controller_data resource at probe\n");
		return -EIO;
	}

	gc = (struct crisv32_spi_sser_controller_data *) res->start;

	master = spi_alloc_master(&dev->dev, sizeof *dd);
	if (master == NULL) {
		dev_err(&dev->dev, "failed to allocate spi master\n");
		ret = -ENOMEM;
		goto err;
	}

	dd = spi_master_get_devdata(master);
	platform_set_drvdata(dev, dd);

	/*
	 * The device data asks for this driver, and holds the id
	 * number, which must be unique among the same-type devices.
	 * We use this as the number of this SPI bus.
	 */
	master->bus_num = dev->id;

	/* Setup SPI bitbang adapter hooks.  */
	dd->bitbang.master = spi_master_get(master);
	dd->bitbang.chipselect = crisv32_spi_sser_chip_select_active_low;

	hw = &dd->hw;
	hw->gc = gc;

	/* Pre-spi_bitbang_start setup. */
	if (gc->using_dma) {
		/* Setup DMA and interrupts.  */
		ret = gc->iface_allocate(&hw->sser, &hw->dmain, &hw->dmaout);
		if (ret != 0)
			goto err_no_regs;

		dd->bitbang.master->setup = crisv32_spi_sser_dma_master_setup;
		dd->bitbang.setup_transfer
			= crisv32_spi_sser_common_setup_transfer;
		dd->bitbang.txrx_bufs = crisv32_spi_sser_dma_txrx_bufs;
		dd->bitbang.master->cleanup = crisv32_spi_sser_dma_cleanup;
	} else {
		/* Just registers, then.  */
		ret = gc->iface_allocate(&hw->sser, NULL, NULL);
		if (ret != 0)
			goto err_no_regs;

		dd->bitbang.master->setup
			= crisv32_spi_sser_regs_master_setup;
		dd->bitbang.setup_transfer
			= crisv32_spi_sser_regs_setup_transfer;
		dd->bitbang.master->cleanup = spi_bitbang_cleanup;

		/*
		 * We can do all modes pretty simply, but I have no
		 * simple enough way to test them, so I won't.
		 */
		dd->bitbang.txrx_word[SPI_MODE_3]
			= crisv32_spi_sser_txrx_mode3;
	}

	ret = spi_bitbang_start(&dd->bitbang);
	if (ret)
		goto err_no_bitbang;

	/*
	 * We don't have a dev_info here, as initialization that may fail is
	 * postponed to the first master->setup call.  It's called from
	 * spi_bitbang_start (above), where the call-chain doesn't look too
	 * close at error return values; we'll get here successfully anyway,
	 * so emitting a separate message here is at most confusing.
	 */
	dev_dbg(&dev->dev,
		"CRIS v32 SPI driver for sser%d%s present\n",
		master->bus_num,
		gc->using_dma ? "/DMA" : "");

	return 0;

 err_no_bitbang:
	gc->iface_free();

 err_no_regs:
	platform_set_drvdata(dev, NULL);
	spi_master_put(dd->bitbang.master);

 err:
	return ret;
}

/* Platform-device remove-function.  */

static int __devexit crisv32_spi_sser_remove(struct platform_device *dev)
{
	struct crisv32_spi_sser_devdata *dd = platform_get_drvdata(dev);
	struct crisv32_spi_hw_info *hw = &dd->hw;
	struct crisv32_spi_sser_controller_data *gc = hw->gc;
	int ret;

	/* We need to stop all bitbanging activity separately.  */
	ret = spi_bitbang_stop(&dd->bitbang);
	if (ret != 0)
		return ret;

	spi_master_put(dd->bitbang.master);

	/*
	 * If we get here, the queue is empty and there's no activity;
	 * it's safe to flip the switch on the interfaces.
	 */
	if (gc->using_dma) {
		u32 regi_dmain = hw->dmain.regi;
		u32 regi_dmaout = hw->dmaout.regi;
		u32 regi_sser = hw->sser.regi;

		REG_WRINT_SSER(rw_intr_mask, 0);
		REG_WRINT_DI(rw_intr_mask, 0);
		REG_WRINT_DO(rw_intr_mask, 0);
		hw->cfg.en = 0;
		REG_WR_SSER(rw_cfg, hw->cfg);
		DMA_RESET(regi_dmain);
		DMA_RESET(regi_dmaout);
		free_irq(hw->sser.irq, hw);
		free_irq(hw->dmain.irq, hw);
	}

	gc->iface_free();

	platform_set_drvdata(dev, NULL);
	return 0;
}

/*
 * For the time being, there's no suspend/resume support to care
 * about, so those handlers default to NULL.
 */
static struct platform_driver crisv32_spi_sser_drv = {
	.probe		= crisv32_spi_sser_probe,
	.remove		= __devexit_p(crisv32_spi_sser_remove),
	.driver		= {
		.name	= "spi_crisv32_sser",
		.owner	= THIS_MODULE,
	},
};

/* Module init function.  */

static int __devinit crisv32_spi_sser_init(void)
{
	return platform_driver_register(&crisv32_spi_sser_drv);
}

/* Module exit function.  */

static void __devexit crisv32_spi_sser_exit(void)
{
	platform_driver_unregister(&crisv32_spi_sser_drv);
}

/* Setter function for speed limit.  */

static int crisv32_spi_speed_limit_Hz_setter(const char *val,
					     struct kernel_param *kp)
{
	char *endp;
	ulong num = simple_strtoul(val, &endp, 0);
	if (endp == val
	    || *endp != 0
	    || num <= 0
	    /*
	     * We can't go above 100 MHz speed.  Actually we can't go
	     * above 50 MHz using the sser support but it might make
	     * sense trying.
	     */
	    || num > 100000000)
		return -EINVAL;
	*(ulong *) kp->arg = num;
	return 0;
}

module_param_call(crisv32_spi_max_speed_hz,
		  crisv32_spi_speed_limit_Hz_setter, param_get_ulong,
		  &crisv32_spi_speed_limit_Hz, 0644);

module_init(crisv32_spi_sser_init);
module_exit(crisv32_spi_sser_exit);

MODULE_DESCRIPTION("CRIS v32 SPI-SSER Driver");
MODULE_AUTHOR("Hans-Peter Nilsson, <hp@axis.com>");
MODULE_LICENSE("GPL");