rdma/siw: receive path
author Bernard Metzler <bmt@zurich.ibm.com>
Thu, 20 Jun 2019 16:21:31 +0000 (18:21 +0200)
committer Jason Gunthorpe <jgg@mellanox.com>
Tue, 2 Jul 2019 20:03:41 +0000 (17:03 -0300)
Broken up commit to add the Soft iWarp RDMA driver.

Signed-off-by: Bernard Metzler <bmt@zurich.ibm.com>
Signed-off-by: Jason Gunthorpe <jgg@mellanox.com>
drivers/infiniband/sw/siw/siw_qp_rx.c [new file with mode: 0644]

diff --git a/drivers/infiniband/sw/siw/siw_qp_rx.c b/drivers/infiniband/sw/siw/siw_qp_rx.c
new file mode 100644 (file)
index 0000000..682a290
--- /dev/null
@@ -0,0 +1,1456 @@
+// SPDX-License-Identifier: GPL-2.0 or BSD-3-Clause
+
+/* Authors: Bernard Metzler <bmt@zurich.ibm.com> */
+/* Copyright (c) 2008-2019, IBM Corporation */
+
+#include <linux/errno.h>
+#include <linux/types.h>
+#include <linux/net.h>
+#include <linux/scatterlist.h>
+#include <linux/highmem.h>
+
+#include <rdma/iw_cm.h>
+#include <rdma/ib_verbs.h>
+
+#include "siw.h"
+#include "siw_verbs.h"
+#include "siw_mem.h"
+
+/*
+ * siw_rx_umem()
+ *
+ * Receive data of @len into target referenced by @dest_addr.
+ *
+ * The target is filled page by page: each page is resolved via
+ * siw_get_upage(), temporarily mapped with kmap_atomic() and filled
+ * from the current skb read position. If srx->mpa_crc_hd is set, the
+ * CRC is updated for every chunk placed.
+ *
+ * @srx:       Receive Context
+ * @umem:      siw representation of target memory
+ * @dest_addr: user virtual address
+ * @len:       number of bytes to place
+ *
+ * Returns the number of bytes placed (equal to @len) on success, or
+ * -EFAULT if a target page could not be resolved or copying from the
+ * skb failed. In all cases srx->skb_copied and srx->skb_new are
+ * adjusted by the number of bytes placed before returning.
+ */
+static int siw_rx_umem(struct siw_rx_stream *srx, struct siw_umem *umem,
+                      u64 dest_addr, int len)
+{
+       int copied = 0;
+
+       while (len) {
+               struct page *p;
+               int pg_off, bytes, rv;
+               void *dest;
+
+               p = siw_get_upage(umem, dest_addr);
+               if (unlikely(!p)) {
+                       /*
+                        * Go through uintptr_t: converting a 64bit
+                        * integer directly into a pointer is a size
+                        * mismatched cast on 32bit architectures.
+                        */
+                       pr_warn("siw: %s: [QP %u]: bogus addr: %p, %p\n",
+                               __func__, qp_id(rx_qp(srx)),
+                               (void *)(uintptr_t)dest_addr,
+                               (void *)(uintptr_t)umem->fp_addr);
+                       /* siw internal error */
+                       srx->skb_copied += copied;
+                       srx->skb_new -= copied;
+
+                       return -EFAULT;
+               }
+               /* never cross a page boundary within one copy */
+               pg_off = dest_addr & ~PAGE_MASK;
+               bytes = min(len, (int)PAGE_SIZE - pg_off);
+
+               siw_dbg_qp(rx_qp(srx), "page %p, bytes=%u\n", p, bytes);
+
+               dest = kmap_atomic(p);
+               rv = skb_copy_bits(srx->skb, srx->skb_offset, dest + pg_off,
+                                  bytes);
+
+               if (unlikely(rv)) {
+                       kunmap_atomic(dest);
+                       srx->skb_copied += copied;
+                       srx->skb_new -= copied;
+
+                       pr_warn("siw: [QP %u]: %s, len %d, page %p, rv %d\n",
+                               qp_id(rx_qp(srx)), __func__, len, p, rv);
+
+                       return -EFAULT;
+               }
+               if (srx->mpa_crc_hd) {
+                       if (rx_qp(srx)->kernel_verbs) {
+                               /* CRC over the still mapped target page */
+                               crypto_shash_update(srx->mpa_crc_hd,
+                                       (u8 *)(dest + pg_off), bytes);
+                               kunmap_atomic(dest);
+                       } else {
+                               kunmap_atomic(dest);
+                               /*
+                                * Do CRC on original, not target buffer.
+                                * Some user land applications may
+                                * concurrently write the target buffer,
+                                * which would yield a broken CRC.
+                                * Walking the skb twice is very inefficient.
+                                * Folding the CRC into skb_copy_bits()
+                                * would be much better, but is currently
+                                * not supported.
+                                */
+                               siw_crc_skb(srx, bytes);
+                       }
+               } else {
+                       kunmap_atomic(dest);
+               }
+               srx->skb_offset += bytes;
+               copied += bytes;
+               len -= bytes;
+               dest_addr += bytes;
+       }
+       srx->skb_copied += copied;
+       srx->skb_new -= copied;
+
+       return copied;
+}
+
+/*
+ * siw_rx_kva()
+ *
+ * Copy @len bytes from the current skb read position to the kernel
+ * virtual address @kva. If srx->mpa_crc_hd is set, the placed bytes
+ * are fed into it via crypto_shash_update().
+ *
+ * @srx: Receive Context
+ * @kva: kernel virtual address to place data at
+ * @len: number of bytes to place
+ *
+ * Returns @len on success. If skb_copy_bits() fails, its return code
+ * is handed back and the receive stream counters are left untouched.
+ */
+static int siw_rx_kva(struct siw_rx_stream *srx, void *kva, int len)
+{
+       int err;
+
+       siw_dbg_qp(rx_qp(srx), "kva: 0x%p, len: %u\n", kva, len);
+
+       err = skb_copy_bits(srx->skb, srx->skb_offset, kva, len);
+       if (likely(!err)) {
+               if (srx->mpa_crc_hd)
+                       crypto_shash_update(srx->mpa_crc_hd, (u8 *)kva, len);
+
+               /* account for the bytes just consumed from the skb */
+               srx->skb_new -= len;
+               srx->skb_copied += len;
+               srx->skb_offset += len;
+
+               return len;
+       }
+       pr_warn("siw: [QP %u]: %s, len %d, kva 0x%p, rv %d\n",
+               qp_id(rx_qp(srx)), __func__, len, kva, err);
+
+       return err;
+}
+
+/*
+ * siw_rx_pbl()
+ *
+ * Receive data of @len into memory described by the physical buffer
+ * list of @mem, starting at address @addr. Each buffer is resolved
+ * via siw_pbl_get_buffer() and filled via siw_rx_kva().
+ *
+ * @srx:     Receive Context
+ * @pbl_idx: in/out PBL index handed to siw_pbl_get_buffer()
+ * @mem:     siw representation of target memory
+ * @addr:    target address; the offset into the PBL is @addr - @mem->va
+ * @len:     number of bytes to place
+ *
+ * Returns the number of bytes placed. This is less than @len if a
+ * buffer could not be resolved or if siw_rx_kva() did not place the
+ * complete chunk; callers compare the result against @len.
+ */
+static int siw_rx_pbl(struct siw_rx_stream *srx, int *pbl_idx,
+                     struct siw_mem *mem, u64 addr, int len)
+{
+       struct siw_pbl *pbl = mem->pbl;
+       u64 offset = addr - mem->va;
+       int copied = 0;
+
+       while (len) {
+               int bytes;
+               u64 buf_addr =
+                       siw_pbl_get_buffer(pbl, offset, &bytes, pbl_idx);
+               if (!buf_addr)
+                       break;
+
+               bytes = min(bytes, len);
+               /*
+                * Go through uintptr_t: converting a 64bit integer
+                * directly into a pointer is a size mismatched cast
+                * on 32bit architectures.
+                */
+               if (siw_rx_kva(srx, (void *)(uintptr_t)buf_addr, bytes) !=
+                   bytes)
+                       break;
+
+               copied += bytes;
+               offset += bytes;
+               len -= bytes;
+       }
+       return copied;
+}
+
+/*
+ * siw_rresp_check_ntoh()
+ *
+ * Validate the header of an inbound RDMA READ RESPONSE DDP segment
+ * against the receive state of the READ currently being processed.
+ *
+ * On the first DDP segment, the expected STag and tagged offset are
+ * seeded from SGE 0 of the active WQE and the PBL index is reset.
+ *
+ * Returns 0 if the header matches. Otherwise a TERMINATE message is
+ * set up via siw_init_terminate() and -EINVAL is returned.
+ *
+ * NOTE: This function must be called only if a RRESP DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_rresp_check_ntoh(struct siw_rx_stream *srx,
+                               struct siw_rx_fpdu *frx)
+{
+       struct iwarp_rdma_rresp *hdr = &srx->hdr.rresp;
+       struct siw_wqe *wqe = &frx->wqe_active;
+       enum ddp_ecode ecode;
+
+       /* header fields in host byte order */
+       u32 rx_stag = be32_to_cpu(hdr->sink_stag);
+       u64 rx_to = be64_to_cpu(hdr->sink_to);
+
+       if (frx->first_ddp_seg) {
+               srx->ddp_stag = wqe->sqe.sge[0].lkey;
+               srx->ddp_to = wqe->sqe.sge[0].laddr;
+               frx->pbl_idx = 0;
+       }
+       /*
+        * The checks below go beyond DDP semantics and reach into
+        * RDMAP: the read response must match exactly the read
+        * request which was sent to the remote peer to trigger it.
+        * RFC5040/5041 do not always provide a proper error code
+        * for the conditions detected here. 'Base or bounds error'
+        * is used where the inbound STag is valid, but offset or
+        * length do not match our response receive state.
+        */
+       if (unlikely(srx->ddp_stag != rx_stag)) {
+               pr_warn("siw: [QP %u]: rresp stag: %08x != %08x\n",
+                       qp_id(rx_qp(srx)), rx_stag, srx->ddp_stag);
+               ecode = DDP_ECODE_T_INVALID_STAG;
+       } else if (unlikely(srx->ddp_to != rx_to)) {
+               pr_warn("siw: [QP %u]: rresp off: %016llx != %016llx\n",
+                       qp_id(rx_qp(srx)), (unsigned long long)rx_to,
+                       (unsigned long long)srx->ddp_to);
+               ecode = DDP_ECODE_T_BASE_BOUNDS;
+       } else if (unlikely(!frx->more_ddp_segs &&
+                           (wqe->processed + srx->fpdu_part_rem !=
+                            wqe->bytes))) {
+               /* last segment must complete the requested length */
+               pr_warn("siw: [QP %u]: rresp len: %d != %d\n",
+                       qp_id(rx_qp(srx)),
+                       wqe->processed + srx->fpdu_part_rem, wqe->bytes);
+               ecode = DDP_ECODE_T_BASE_BOUNDS;
+       } else {
+               return 0;
+       }
+       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                          DDP_ETYPE_TAGGED_BUF, ecode, 0);
+       return -EINVAL;
+}
+
+/*
+ * siw_write_check_ntoh()
+ *
+ * Validate the header of an inbound RDMA WRITE DDP segment.
+ *
+ * The first DDP segment of a message defines the expected STag and
+ * tagged offset and resets the PBL index. Any further segment must
+ * carry exactly the STag and offset kept in the receive context.
+ *
+ * Returns 0 if the header is acceptable. Otherwise a TERMINATE
+ * message is set up via siw_init_terminate() and -EINVAL is returned.
+ *
+ * NOTE: This function must be called only if a WRITE DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_write_check_ntoh(struct siw_rx_stream *srx,
+                               struct siw_rx_fpdu *frx)
+{
+       struct iwarp_rdma_write *hdr = &srx->hdr.rwrite;
+       enum ddp_ecode ecode;
+
+       /* header fields in host byte order */
+       u32 rx_stag = be32_to_cpu(hdr->sink_stag);
+       u64 rx_to = be64_to_cpu(hdr->sink_to);
+
+       if (frx->first_ddp_seg) {
+               /* nothing to compare against yet: take over header values */
+               srx->ddp_stag = rx_stag;
+               srx->ddp_to = rx_to;
+               frx->pbl_idx = 0;
+
+               return 0;
+       }
+       if (unlikely(srx->ddp_stag != rx_stag)) {
+               pr_warn("siw: [QP %u]: write stag: %08x != %08x\n",
+                       qp_id(rx_qp(srx)), rx_stag,
+                       srx->ddp_stag);
+               ecode = DDP_ECODE_T_INVALID_STAG;
+       } else if (unlikely(srx->ddp_to != rx_to)) {
+               pr_warn("siw: [QP %u]: write off: %016llx != %016llx\n",
+                       qp_id(rx_qp(srx)),
+                       (unsigned long long)rx_to,
+                       (unsigned long long)srx->ddp_to);
+               ecode = DDP_ECODE_T_BASE_BOUNDS;
+       } else {
+               return 0;
+       }
+       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                          DDP_ETYPE_TAGGED_BUF, ecode, 0);
+       return -EINVAL;
+}
+
+/*
+ * siw_send_check_ntoh()
+ *
+ * Validate the header of an inbound SEND type DDP segment: queue
+ * number, message sequence number and message offset must match the
+ * current receive state, and the active receive WQE must provide
+ * enough space for the payload of this segment.
+ *
+ * On the first DDP segment, the placement position (SGE index, SGE
+ * offset, PBL index) is reset and the STag to invalidate is taken
+ * over from the header.
+ *
+ * Returns 0 if the header is acceptable. Otherwise a TERMINATE
+ * message is set up via siw_init_terminate() and -EINVAL is returned.
+ *
+ * NOTE: This function must be called only if a SEND DDP segment
+ *       starts but not for fragmented consecutive pieces of an
+ *       already started DDP segment.
+ */
+static int siw_send_check_ntoh(struct siw_rx_stream *srx,
+                              struct siw_rx_fpdu *frx)
+{
+       struct iwarp_send_inv *hdr = &srx->hdr.send_inv;
+       struct siw_wqe *wqe = &frx->wqe_active;
+       enum ddp_ecode ecode;
+
+       /* header fields in host byte order */
+       u32 msn = be32_to_cpu(hdr->ddp_msn);
+       u32 mo = be32_to_cpu(hdr->ddp_mo);
+       u32 qn = be32_to_cpu(hdr->ddp_qn);
+
+       if (unlikely(qn != RDMAP_UNTAGGED_QN_SEND)) {
+               pr_warn("siw: [QP %u]: invalid ddp qn %d for send\n",
+                       qp_id(rx_qp(srx)), qn);
+               ecode = DDP_ECODE_UT_INVALID_QN;
+               goto terminate;
+       }
+       if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND])) {
+               pr_warn("siw: [QP %u]: send msn: %u != %u\n",
+                       qp_id(rx_qp(srx)), msn,
+                       srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]);
+               ecode = DDP_ECODE_UT_INVALID_MSN_RANGE;
+               goto terminate;
+       }
+       /* segment must continue exactly where the last one ended */
+       if (unlikely(mo != wqe->processed)) {
+               pr_warn("siw: [QP %u], send mo: %u != %u\n",
+                       qp_id(rx_qp(srx)), mo, wqe->processed);
+               ecode = DDP_ECODE_UT_INVALID_MO;
+               goto terminate;
+       }
+       if (frx->first_ddp_seg) {
+               /* start placing data at the very beginning of the WQE */
+               frx->pbl_idx = 0;
+               frx->sge_off = 0;
+               frx->sge_idx = 0;
+
+               /* only valid for SEND_INV and SEND_SE_INV operations */
+               srx->inval_stag = be32_to_cpu(hdr->inval_stag);
+       }
+       if (unlikely(wqe->bytes < wqe->processed + srx->fpdu_part_rem)) {
+               siw_dbg_qp(rx_qp(srx), "receive space short: %d - %d < %d\n",
+                          wqe->bytes, wqe->processed, srx->fpdu_part_rem);
+               wqe->wc_status = SIW_WC_LOC_LEN_ERR;
+               ecode = DDP_ECODE_UT_INVALID_MSN_NOBUF;
+               goto terminate;
+       }
+       return 0;
+
+terminate:
+       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                          DDP_ETYPE_UNTAGGED_BUF, ecode, 0);
+       return -EINVAL;
+}
+
+/*
+ * siw_rqe_get()
+ *
+ * Fetch the next receive queue element for an inbound SEND and copy
+ * it into the untagged receive WQE of @qp.
+ *
+ * The element is taken from the SRQ if the QP is attached to one
+ * (under srq->lock), from the QP's own receive queue otherwise.
+ * After the element got copied, its flags are cleared and the
+ * queue's get index is advanced. For an armed SRQ, the element
+ * 'limit' entries ahead is checked: if it is not valid, the SRQ
+ * gets disarmed and IB_EVENT_SRQ_LIMIT_REACHED is signalled after
+ * the lock got released.
+ *
+ * Returns the initialized receive WQE, or NULL if no queue is
+ * available, the current element is not valid, or it carries more
+ * than SIW_MAX_SGE SGEs.
+ */
+static struct siw_wqe *siw_rqe_get(struct siw_qp *qp)
+{
+       struct siw_rqe *rqe;
+       struct siw_srq *srq;
+       struct siw_wqe *wqe = NULL;
+       bool srq_event = false;
+       unsigned long flags;
+
+       srq = qp->srq;
+       if (srq) {
+               spin_lock_irqsave(&srq->lock, flags);
+               /* avoid modulo by zero on an SRQ without elements */
+               if (unlikely(!srq->num_rqe))
+                       goto out;
+
+               rqe = &srq->recvq[srq->rq_get % srq->num_rqe];
+       } else {
+               /* QP has no receive queue at all */
+               if (unlikely(!qp->recvq))
+                       goto out;
+
+               rqe = &qp->recvq[qp->rq_get % qp->attrs.rq_size];
+       }
+       if (likely(rqe->flags == SIW_WQE_VALID)) {
+               int num_sge = rqe->num_sge;
+
+               if (likely(num_sge <= SIW_MAX_SGE)) {
+                       int i = 0;
+
+                       wqe = rx_wqe(&qp->rx_untagged);
+                       rx_type(wqe) = SIW_OP_RECEIVE;
+                       wqe->wr_status = SIW_WR_INPROGRESS;
+                       wqe->bytes = 0;
+                       wqe->processed = 0;
+
+                       wqe->rqe.id = rqe->id;
+                       wqe->rqe.num_sge = num_sge;
+
+                       /*
+                        * Copy all SGEs, sum up the total receive
+                        * space and reset the per SGE memory references.
+                        */
+                       while (i < num_sge) {
+                               wqe->rqe.sge[i].laddr = rqe->sge[i].laddr;
+                               wqe->rqe.sge[i].lkey = rqe->sge[i].lkey;
+                               wqe->rqe.sge[i].length = rqe->sge[i].length;
+                               wqe->bytes += wqe->rqe.sge[i].length;
+                               wqe->mem[i] = NULL;
+                               i++;
+                       }
+                       /* can be re-used by appl */
+                       smp_store_mb(rqe->flags, 0);
+               } else {
+                       siw_dbg_qp(qp, "too many sge's: %d\n", rqe->num_sge);
+                       /* queue element stays in place: index not advanced */
+                       if (srq)
+                               spin_unlock_irqrestore(&srq->lock, flags);
+                       return NULL;
+               }
+               if (!srq) {
+                       qp->rq_get++;
+               } else {
+                       if (srq->armed) {
+                               /* Test SRQ limit */
+                               u32 off = (srq->rq_get + srq->limit) %
+                                         srq->num_rqe;
+                               struct siw_rqe *rqe2 = &srq->recvq[off];
+
+                               if (!(rqe2->flags & SIW_WQE_VALID)) {
+                                       srq->armed = 0;
+                                       srq_event = true;
+                               }
+                       }
+                       srq->rq_get++;
+               }
+       }
+out:
+       if (srq) {
+               spin_unlock_irqrestore(&srq->lock, flags);
+               /* signal the limit event only after dropping the lock */
+               if (srq_event)
+                       siw_srq_event(srq, IB_EVENT_SRQ_LIMIT_REACHED);
+       }
+       return wqe;
+}
+
+/*
+ * siw_proc_send:
+ *
+ * Process one incoming SEND and place data into memory referenced by
+ * receive wqe.
+ *
+ * Function supports partially received sends (suspending/resuming
+ * current receive wqe processing)
+ *
+ * return value:
+ *     0:       reached the end of a DDP segment
+ *     -EAGAIN: to be called again to finish the DDP segment
+ *     -ENOENT: no receive WQE available for a starting message
+ *     other negative values: header check, memory access check or
+ *              data placement failed. A TERMINATE message got
+ *              initialized in all failure cases.
+ */
+int siw_proc_send(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_rx_fpdu *frx = &qp->rx_untagged;
+       struct siw_wqe *wqe;
+       u32 data_bytes; /* all data bytes available */
+       u32 rcvd_bytes; /* sum of data bytes rcvd */
+       int rv = 0;
+
+       if (frx->first_ddp_seg) {
+               /* a new message starts: it needs a fresh receive WQE */
+               wqe = siw_rqe_get(qp);
+               if (unlikely(!wqe)) {
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_UNTAGGED_BUF,
+                                          DDP_ECODE_UT_INVALID_MSN_NOBUF, 0);
+                       return -ENOENT;
+               }
+       } else {
+               wqe = rx_wqe(frx);
+       }
+       if (srx->state == SIW_GET_DATA_START) {
+               rv = siw_send_check_ntoh(srx, frx);
+               if (unlikely(rv)) {
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+                       return rv;
+               }
+               if (!srx->fpdu_part_rem) /* zero length SEND */
+                       return 0;
+       }
+       /* place what is both outstanding for this FPDU and in the skb */
+       data_bytes = min(srx->fpdu_part_rem, srx->skb_new);
+       rcvd_bytes = 0;
+
+       /* A zero length SEND will skip below loop */
+       while (data_bytes) {
+               struct ib_pd *pd;
+               struct siw_mem **mem, *mem_p;
+               struct siw_sge *sge;
+               u32 sge_bytes; /* data bytes avail for SGE */
+               u64 dest; /* current placement address */
+
+               sge = &wqe->rqe.sge[frx->sge_idx];
+
+               if (!sge->length) {
+                       /* just skip empty sge's */
+                       frx->sge_idx++;
+                       frx->sge_off = 0;
+                       frx->pbl_idx = 0;
+                       continue;
+               }
+               sge_bytes = min(data_bytes, sge->length - frx->sge_off);
+               mem = &wqe->mem[frx->sge_idx];
+
+               /*
+                * check with QP's PD if no SRQ present, SRQ's PD otherwise
+                */
+               pd = qp->srq == NULL ? qp->pd : qp->srq->base_srq.pd;
+
+               rv = siw_check_sge(pd, sge, mem, IB_ACCESS_LOCAL_WRITE,
+                                  frx->sge_off, sge_bytes);
+               if (unlikely(rv)) {
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_CATASTROPHIC,
+                                          DDP_ECODE_CATASTROPHIC, 0);
+
+                       siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+                       break;
+               }
+               mem_p = *mem;
+               dest = sge->laddr + frx->sge_off;
+
+               if (mem_p->mem_obj == NULL)
+                       /*
+                        * Go through uintptr_t: converting a 64bit
+                        * integer directly into a pointer is a size
+                        * mismatched cast on 32bit architectures.
+                        */
+                       rv = siw_rx_kva(srx, (void *)(uintptr_t)dest,
+                                       sge_bytes);
+               else if (!mem_p->is_pbl)
+                       rv = siw_rx_umem(srx, mem_p->umem, dest, sge_bytes);
+               else
+                       rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, dest,
+                                       sge_bytes);
+
+               if (unlikely(rv != sge_bytes)) {
+                       /* keep account of what got placed before failing */
+                       wqe->processed += rcvd_bytes;
+
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_CATASTROPHIC,
+                                          DDP_ECODE_CATASTROPHIC, 0);
+                       return -EINVAL;
+               }
+               frx->sge_off += rv;
+
+               if (frx->sge_off == sge->length) {
+                       /* current SGE is full: continue with the next one */
+                       frx->sge_idx++;
+                       frx->sge_off = 0;
+                       frx->pbl_idx = 0;
+               }
+               data_bytes -= rv;
+               rcvd_bytes += rv;
+
+               srx->fpdu_part_rem -= rv;
+               srx->fpdu_part_rcvd += rv;
+       }
+       wqe->processed += rcvd_bytes;
+
+       if (!srx->fpdu_part_rem)
+               return 0;
+
+       /* rv is negative only if the SGE check failed above */
+       return (rv < 0) ? rv : -EAGAIN;
+}
+
+/*
+ * siw_proc_write:
+ *
+ * Place incoming WRITE after referencing and checking target buffer
+ *
+ * Function supports partially received WRITEs (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ *     0:       reached the end of a DDP segment
+ *     -EAGAIN: to be called again to finish the DDP segment
+ *     -EINVAL: header check, STag lookup, memory access check or
+ *              data placement failed. A TERMINATE message got
+ *              initialized in these cases.
+ */
+int siw_proc_write(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_rx_fpdu *frx = &qp->rx_tagged;
+       struct siw_mem *mem;
+       u64 dest; /* current placement address */
+       int bytes, rv;
+
+       if (srx->state == SIW_GET_DATA_START) {
+               if (!srx->fpdu_part_rem) /* zero length WRITE */
+                       return 0;
+
+               rv = siw_write_check_ntoh(srx, frx);
+               if (unlikely(rv)) {
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+                       return rv;
+               }
+       }
+       /* place what is both outstanding for this FPDU and in the skb */
+       bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+       if (frx->first_ddp_seg) {
+               struct siw_wqe *wqe = rx_wqe(frx);
+
+               /* memory object lookup uses the STag without its low 8 bits */
+               rx_mem(frx) = siw_mem_id2obj(qp->sdev, srx->ddp_stag >> 8);
+               if (unlikely(!rx_mem(frx))) {
+                       siw_dbg_qp(qp,
+                                  "sink stag not found/invalid, stag 0x%08x\n",
+                                  srx->ddp_stag);
+
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_TAGGED_BUF,
+                                          DDP_ECODE_T_INVALID_STAG, 0);
+                       return -EINVAL;
+               }
+               wqe->rqe.num_sge = 1;
+               rx_type(wqe) = SIW_OP_WRITE;
+               wqe->wr_status = SIW_WR_INPROGRESS;
+       }
+       mem = rx_mem(frx);
+
+       /*
+        * Check if application re-registered memory with different
+        * key field of STag.
+        */
+       if (unlikely(mem->stag != srx->ddp_stag)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_TAGGED_BUF,
+                                  DDP_ECODE_T_INVALID_STAG, 0);
+               return -EINVAL;
+       }
+       /* continue behind the bytes already placed for this segment */
+       dest = srx->ddp_to + srx->fpdu_part_rcvd;
+
+       rv = siw_check_mem(qp->pd, mem, dest, IB_ACCESS_REMOTE_WRITE, bytes);
+       if (unlikely(rv)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_TAGGED_BUF, siw_tagged_error(-rv),
+                                  0);
+
+               siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+               return -EINVAL;
+       }
+
+       if (mem->mem_obj == NULL)
+               /*
+                * Go through uintptr_t: converting a 64bit integer
+                * directly into a pointer is a size mismatched cast
+                * on 32bit architectures.
+                */
+               rv = siw_rx_kva(srx, (void *)(uintptr_t)dest, bytes);
+       else if (!mem->is_pbl)
+               rv = siw_rx_umem(srx, mem->umem, dest, bytes);
+       else
+               rv = siw_rx_pbl(srx, &frx->pbl_idx, mem, dest, bytes);
+
+       if (unlikely(rv != bytes)) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_CATASTROPHIC,
+                                  DDP_ECODE_CATASTROPHIC, 0);
+               return -EINVAL;
+       }
+       srx->fpdu_part_rem -= rv;
+       srx->fpdu_part_rcvd += rv;
+
+       if (!srx->fpdu_part_rem) {
+               /* segment complete: expected offset of a next segment */
+               srx->ddp_to += srx->fpdu_part_rcvd;
+               return 0;
+       }
+       return -EAGAIN;
+}
+
+/*
+ * siw_proc_rreq:
+ *
+ * An inbound RREQ must not carry any user data. Any payload left
+ * to receive for the current FPDU is reported and rejected.
+ *
+ * return value:
+ *     0:       no payload outstanding
+ *     -EPROTO: RREQ came with payload
+ */
+int siw_proc_rreq(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+
+       if (srx->fpdu_part_rem) {
+               pr_warn("siw: [QP %u]: rreq with mpa len %d\n", qp_id(qp),
+                       be16_to_cpu(srx->hdr.ctrl.mpa_len));
+
+               return -EPROTO;
+       }
+       return 0;
+}
+
+/*
+ * siw_init_rresp:
+ *
+ * Process inbound RDMA READ REQ. Produce a pseudo READ RESPONSE WQE.
+ * Put it at the tail of the IRQ, if there is another WQE currently in
+ * transmit processing. If not, make it the current WQE to be processed
+ * and schedule transmit processing.
+ *
+ * A READ REQ arriving at a QP with a zero sized IRQ is rejected the
+ * same way as an IRQ overflow.
+ *
+ * Can be called from softirq context and from process
+ * context (RREAD socket loopback case!)
+ *
+ * return value:
+ *     0:      success,
+ *             failure code otherwise
+ */
+
+static int siw_init_rresp(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+       struct siw_wqe *tx_work = tx_wqe(qp);
+       struct siw_sqe *resp;
+
+       uint64_t raddr = be64_to_cpu(srx->hdr.rreq.sink_to),
+                laddr = be64_to_cpu(srx->hdr.rreq.source_to);
+       uint32_t length = be32_to_cpu(srx->hdr.rreq.read_size),
+                lkey = be32_to_cpu(srx->hdr.rreq.source_stag),
+                rkey = be32_to_cpu(srx->hdr.rreq.sink_stag),
+                msn = be32_to_cpu(srx->hdr.rreq.ddp_msn);
+
+       int run_sq = 1, rv = 0;
+       unsigned long flags;
+
+       if (unlikely(msn != srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ])) {
+               siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                  DDP_ETYPE_UNTAGGED_BUF,
+                                  DDP_ECODE_UT_INVALID_MSN_RANGE, 0);
+               return -EPROTO;
+       }
+       spin_lock_irqsave(&qp->sq_lock, flags);
+
+       if (unlikely(!qp->attrs.irq_size)) {
+               /*
+                * No room for inbound READ requests at all. Fail
+                * before the IRQ gets touched and before its size
+                * is used as a divisor.
+                */
+               resp = NULL;
+               run_sq = 0;
+       } else if (tx_work->wr_status == SIW_WR_IDLE) {
+               /*
+                * immediately schedule READ response w/o
+                * consuming IRQ entry: IRQ must be empty.
+                */
+               tx_work->processed = 0;
+               tx_work->mem[0] = NULL;
+               tx_work->wr_status = SIW_WR_QUEUED;
+               resp = &tx_work->sqe;
+       } else {
+               resp = irq_alloc_free(qp);
+               run_sq = 0;
+       }
+       if (likely(resp)) {
+               resp->opcode = SIW_OP_READ_RESPONSE;
+
+               resp->sge[0].length = length;
+               resp->sge[0].laddr = laddr;
+               resp->sge[0].lkey = lkey;
+
+               /* Keep aside message sequence number for potential
+                * error reporting during Read Response generation.
+                */
+               resp->sge[1].length = msn;
+
+               resp->raddr = raddr;
+               resp->rkey = rkey;
+               resp->num_sge = length ? 1 : 0;
+
+               /* RRESP now valid as current TX wqe or placed into IRQ */
+               smp_store_mb(resp->flags, SIW_WQE_VALID);
+       } else {
+               /* do not compute an index here: irq_size may be zero */
+               pr_warn("siw: [QP %u]: IRQ exceeded or null, size %d\n",
+                       qp_id(qp), qp->attrs.irq_size);
+
+               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                  RDMAP_ETYPE_REMOTE_OPERATION,
+                                  RDMAP_ECODE_CATASTROPHIC_STREAM, 0);
+               rv = -EPROTO;
+       }
+
+       spin_unlock_irqrestore(&qp->sq_lock, flags);
+
+       /* only kick the SQ if the response became the current TX wqe */
+       if (run_sq)
+               rv = siw_sq_start(qp);
+
+       return rv;
+}
+
+/*
+ * Only called at start of Read.Response processing.
+ * Transfer pending Read from tip of ORQ into current rx wqe,
+ * but keep ORQ entry valid until Read.Response processing done.
+ * No Queue locking needed.
+ *
+ * return value:
+ *     0:       tagged rx wqe set up from the current ORQ entry
+ *     -EPROTO: ORQ has size zero or its current entry is not valid
+ */
+static int siw_orqe_start_rx(struct siw_qp *qp)
+{
+       struct siw_sqe *orqe;
+       struct siw_wqe *wqe = NULL;
+
+       /*
+        * A zero sized ORQ can never hold a pending READ. Bail out
+        * before fetching an entry from it.
+        * NOTE(review): orq_get_current() is assumed to index the ORQ
+        * modulo qp->attrs.orq_size - confirm against siw.h.
+        */
+       if (unlikely(!qp->attrs.orq_size))
+               return -EPROTO;
+
+       /* make sure ORQ indices are current */
+       smp_mb();
+
+       orqe = orq_get_current(qp);
+       if (READ_ONCE(orqe->flags) & SIW_WQE_VALID) {
+               /* RRESP is a TAGGED RDMAP operation */
+               wqe = rx_wqe(&qp->rx_tagged);
+               wqe->sqe.id = orqe->id;
+               wqe->sqe.opcode = orqe->opcode;
+               wqe->sqe.sge[0].laddr = orqe->sge[0].laddr;
+               wqe->sqe.sge[0].lkey = orqe->sge[0].lkey;
+               wqe->sqe.sge[0].length = orqe->sge[0].length;
+               wqe->sqe.flags = orqe->flags;
+               wqe->sqe.num_sge = 1;
+               wqe->bytes = orqe->sge[0].length;
+               wqe->processed = 0;
+               wqe->mem[0] = NULL;
+               /* make sure WQE is completely written before valid */
+               smp_wmb();
+               wqe->wr_status = SIW_WR_INPROGRESS;
+
+               return 0;
+       }
+       return -EPROTO;
+}
+
+/*
+ * siw_proc_rresp:
+ *
+ * Place incoming RRESP data into memory referenced by RREQ WQE
+ * which is at the tip of the ORQ
+ *
+ * Function supports partially received RRESP's (suspending/resuming
+ * current receive processing)
+ *
+ * return value:
+ *     0:       reached the end of a DDP segment
+ *     -EAGAIN: to be called again to finish the DDP segment
+ *     -EPROTO: unexpected rx WQE state or no pending READ in the ORQ
+ *     -EINVAL: header check, memory access check or data placement
+ *              failed
+ *     A TERMINATE message got initialized in all failure cases.
+ */
+int siw_proc_rresp(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_rx_fpdu *frx = &qp->rx_tagged;
+       struct siw_wqe *wqe = rx_wqe(frx);
+       struct siw_mem **mem, *mem_p;
+       struct siw_sge *sge;
+       u64 dest; /* current placement address */
+       int bytes, rv;
+
+       if (frx->first_ddp_seg) {
+               if (unlikely(wqe->wr_status != SIW_WR_IDLE)) {
+                       pr_warn("siw: [QP %u]: proc RRESP: status %d, op %d\n",
+                               qp_id(qp), wqe->wr_status, wqe->sqe.opcode);
+                       rv = -EPROTO;
+                       goto error_term;
+               }
+               /*
+                * fetch pending RREQ from orq
+                */
+               rv = siw_orqe_start_rx(qp);
+               if (rv) {
+                       /*
+                        * Report the ORQ size, not an index derived
+                        * from it: the size may be zero, and it would
+                        * be used as a divisor.
+                        */
+                       pr_warn("siw: [QP %u]: ORQ empty, size %d\n",
+                               qp_id(qp), qp->attrs.orq_size);
+                       goto error_term;
+               }
+               rv = siw_rresp_check_ntoh(srx, frx);
+               if (unlikely(rv)) {
+                       siw_qp_event(qp, IB_EVENT_QP_FATAL);
+                       return rv;
+               }
+       } else {
+               if (unlikely(wqe->wr_status != SIW_WR_INPROGRESS)) {
+                       pr_warn("siw: [QP %u]: resume RRESP: status %d\n",
+                               qp_id(qp), wqe->wr_status);
+                       rv = -EPROTO;
+                       goto error_term;
+               }
+       }
+       if (!srx->fpdu_part_rem) /* zero length RRESPONSE */
+               return 0;
+
+       sge = wqe->sqe.sge; /* there is only one */
+       mem = &wqe->mem[0];
+
+       if (!(*mem)) {
+               /*
+                * check target memory which resolves memory on first fragment
+                */
+               rv = siw_check_sge(qp->pd, sge, mem, IB_ACCESS_LOCAL_WRITE, 0,
+                                  wqe->bytes);
+               if (unlikely(rv)) {
+                       siw_dbg_qp(qp, "target mem check: %d\n", rv);
+                       wqe->wc_status = SIW_WC_LOC_PROT_ERR;
+
+                       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP,
+                                          DDP_ETYPE_TAGGED_BUF,
+                                          siw_tagged_error(-rv), 0);
+
+                       siw_qp_event(qp, IB_EVENT_QP_ACCESS_ERR);
+
+                       return -EINVAL;
+               }
+       }
+       mem_p = *mem;
+
+       /* place what is both outstanding for this FPDU and in the skb */
+       bytes = min(srx->fpdu_part_rem, srx->skb_new);
+
+       /* continue behind the bytes already placed for this READ */
+       dest = sge->laddr + wqe->processed;
+
+       if (mem_p->mem_obj == NULL)
+               /*
+                * Go through uintptr_t: converting a 64bit integer
+                * directly into a pointer is a size mismatched cast
+                * on 32bit architectures.
+                */
+               rv = siw_rx_kva(srx, (void *)(uintptr_t)dest, bytes);
+       else if (!mem_p->is_pbl)
+               rv = siw_rx_umem(srx, mem_p->umem, dest, bytes);
+       else
+               rv = siw_rx_pbl(srx, &frx->pbl_idx, mem_p, dest, bytes);
+       if (rv != bytes) {
+               wqe->wc_status = SIW_WC_GENERAL_ERR;
+               rv = -EINVAL;
+               goto error_term;
+       }
+       srx->fpdu_part_rem -= rv;
+       srx->fpdu_part_rcvd += rv;
+       wqe->processed += rv;
+
+       if (!srx->fpdu_part_rem) {
+               /* segment complete: expected offset of a next segment */
+               srx->ddp_to += srx->fpdu_part_rcvd;
+               return 0;
+       }
+       return -EAGAIN;
+
+error_term:
+       siw_init_terminate(qp, TERM_ERROR_LAYER_DDP, DDP_ETYPE_CATASTROPHIC,
+                          DDP_ECODE_CATASTROPHIC, 0);
+       return rv;
+}
+
+/*
+ * siw_term_skip()
+ *
+ * Mark @len bytes of the current skb as consumed while processing
+ * an inbound TERMINATE: advance the skb read position and the FPDU
+ * receive counters.
+ */
+static void siw_term_skip(struct siw_rx_stream *srx, u16 len)
+{
+       srx->skb_offset += len;
+       srx->skb_new -= len;
+       srx->skb_copied += len;
+       srx->fpdu_part_rcvd += len;
+       srx->fpdu_part_rem -= len;
+}
+
+/*
+ * siw_proc_terminate()
+ *
+ * Handle an inbound TERMINATE message. The error layer, type and code
+ * found in the TERMINATE header get logged. If the header is well
+ * formed and flag_m is set, the header information following the
+ * TERMINATE header in the current skb is copied into a local buffer
+ * for debug output only. No attempt is made to reassemble information
+ * that is not completely contained in the current skb.
+ *
+ * @qp:        QP which received the TERMINATE
+ *
+ * Always returns -ECONNRESET.
+ */
+int siw_proc_terminate(struct siw_qp *qp)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct iwarp_terminate *term = &srx->hdr.terminate;
+       struct sk_buff *skb = srx->skb;
+       union iwarp_hdr reported;
+       u8 *dst = (u8 *)&reported;
+       u16 chunk = sizeof(struct iwarp_ctrl);
+       enum rdma_opcode op;
+       int hdr_ok;
+
+       pr_warn("siw: got TERMINATE. layer %d, type %d, code %d\n",
+               __rdmap_term_layer(term), __rdmap_term_etype(term),
+               __rdmap_term_ecode(term));
+
+       /* QN, MSN and MO must all match what is expected for a TERM */
+       hdr_ok = be32_to_cpu(term->ddp_qn) == RDMAP_UNTAGGED_QN_TERMINATE &&
+                be32_to_cpu(term->ddp_msn) ==
+                        srx->ddp_msn[RDMAP_UNTAGGED_QN_TERMINATE] &&
+                be32_to_cpu(term->ddp_mo) == 0;
+       if (!hdr_ok) {
+               pr_warn("siw: rx bogus TERM [QN x%08x, MSN x%08x, MO x%08x]\n",
+                       be32_to_cpu(term->ddp_qn), be32_to_cpu(term->ddp_msn),
+                       be32_to_cpu(term->ddp_mo));
+               return -ECONNRESET;
+       }
+       /*
+        * Further pieces of the TERM are looked at only if indicated
+        * by flag_m, and only if the current skb holds at least a
+        * minimum sized header: a network fragmented TERM message
+        * does not get reassembled.
+        */
+       if (!term->flag_m || srx->skb_new < sizeof(struct iwarp_ctrl_tagged))
+               return -ECONNRESET;
+
+       memset(dst, 0, sizeof(reported));
+
+       /* First step: the control part, which carries the opcode */
+       skb_copy_bits(skb, srx->skb_offset, dst, chunk);
+
+       op = __rdmap_get_opcode(&reported.ctrl);
+       if (op < RDMAP_TERMINATE) {
+               siw_term_skip(srx, chunk);
+               dst += chunk;
+
+               /* Second step: rest of the header of that opcode */
+               chunk = iwarp_pktinfo[op].hdr_len - chunk;
+
+               /* Again, no network fragmented TERM's */
+               if (chunk + MPA_CRC_SIZE > srx->skb_new)
+                       return -ECONNRESET;
+
+               skb_copy_bits(skb, srx->skb_offset, dst, chunk);
+
+               if (term->flag_r) {
+                       siw_dbg_qp(qp, "TERM reports RDMAP hdr type %u, len %u (%s)\n",
+                                  op, be16_to_cpu(reported.ctrl.mpa_len),
+                                  term->flag_m ? "valid" : "invalid");
+               } else if (term->flag_d) {
+                       siw_dbg_qp(qp, "TERM reports DDP hdr type %u, len %u (%s)\n",
+                                  op, be16_to_cpu(reported.ctrl.mpa_len),
+                                  term->flag_m ? "valid" : "invalid");
+               }
+       }
+       /* Account for the piece copied last */
+       siw_term_skip(srx, chunk);
+
+       return -ECONNRESET;
+}
+
+/*
+ * siw_get_trailer()
+ *
+ * Receive the FPDU trailer - padding, if any, followed by the MPA CRC -
+ * and verify the CRC if a CRC context (srx->mpa_crc_hd) is in use.
+ *
+ * Whatever part of the trailer the current skb holds gets consumed;
+ * srx->fpdu_part_rcvd and srx->fpdu_part_rem track progress across
+ * calls. Bytes left unconsumed would be offered again by the socket
+ * read path (presumably tcp_read_sock() - caller not shown here)
+ * without ever adding the data needed to finish the trailer, so a
+ * trailer split over two skbs could not be completed.
+ *
+ * Expects srx->fpdu_part_rcvd to be zero and srx->fpdu_part_rem to hold
+ * the trailer length when the trailer state is entered, as set up by
+ * siw_tcp_rx_data().
+ *
+ * @qp:        QP the FPDU is received on
+ * @srx:       Receive Context
+ *
+ * Returns 0 if the trailer is complete (and the CRC, if enabled,
+ * matches), -EAGAIN if more data is needed, -EINVAL on CRC mismatch
+ * (a TERMINATE is initialized in that case).
+ */
+static int siw_get_trailer(struct siw_qp *qp, struct siw_rx_stream *srx)
+{
+       struct sk_buff *skb = srx->skb;
+       int avail = min(srx->skb_new, srx->fpdu_part_rem);
+       /* Pad bytes are placed directly in front of the CRC */
+       u8 *tbuf = (u8 *)&srx->trailer.crc - srx->pad;
+       __wsum crc_in, crc_own = 0;
+
+       siw_dbg_qp(qp, "expected %d, available %d, pad %u\n",
+                  srx->fpdu_part_rem, srx->skb_new, srx->pad);
+
+       /* Append to the part of the trailer already received, if any */
+       skb_copy_bits(skb, srx->skb_offset, tbuf + srx->fpdu_part_rcvd, avail);
+
+       srx->skb_new -= avail;
+       srx->skb_offset += avail;
+       srx->skb_copied += avail;
+
+       srx->fpdu_part_rcvd += avail;
+       srx->fpdu_part_rem -= avail;
+
+       if (srx->fpdu_part_rem)
+               return -EAGAIN;
+
+       if (!srx->mpa_crc_hd)
+               return 0;
+
+       /* Padding is covered by the CRC; add it once it is complete */
+       if (srx->pad)
+               crypto_shash_update(srx->mpa_crc_hd, tbuf, srx->pad);
+
+       /*
+        * CRC32 is computed, transmitted and received directly in NBO,
+        * so there's never a reason to convert byte order.
+        */
+       crypto_shash_final(srx->mpa_crc_hd, (u8 *)&crc_own);
+       crc_in = (__force __wsum)srx->trailer.crc;
+
+       if (unlikely(crc_in != crc_own)) {
+               pr_warn("siw: crc error. in: %08x, own %08x, op %u\n",
+                       crc_in, crc_own, qp->rx_stream.rdmap_op);
+
+               siw_init_terminate(qp, TERM_ERROR_LAYER_LLP,
+                                  LLP_ETYPE_MPA,
+                                  LLP_ECODE_RECEIVED_CRC, 0);
+               return -EINVAL;
+       }
+       return 0;
+}
+
+#define MIN_DDP_HDR sizeof(struct iwarp_ctrl_tagged)
+
+/*
+ * siw_get_hdr()
+ *
+ * Receive the DDP/RDMAP header of the next FPDU into srx->hdr.
+ *
+ * The header is assembled in two steps: first the minimum sized
+ * (tagged) control part, which carries the opcode, then - for opcodes
+ * with a longer header - the remainder. In both steps, whatever part
+ * of the header the current skb holds gets consumed, with
+ * srx->fpdu_part_rcvd tracking progress across calls. Bytes left
+ * unconsumed would be offered again by the socket read path
+ * (presumably tcp_read_sock() - caller not shown here) without ever
+ * adding the data needed to finish the header, so a header split
+ * over two skbs could not be completed.
+ *
+ * @srx:       Receive Context
+ *
+ * Returns 0 if the header is complete, -EAGAIN if more data is needed,
+ * -EINVAL on unsupported DDP/RDMAP version or unknown opcode (a
+ * TERMINATE is initialized in these cases), -EPROTO if an unfinished
+ * message gets intersected by a segment with a different opcode.
+ */
+static int siw_get_hdr(struct siw_rx_stream *srx)
+{
+       struct sk_buff *skb = srx->skb;
+       struct siw_qp *qp = rx_qp(srx);
+       struct iwarp_ctrl *c_hdr = &srx->hdr.ctrl;
+       struct siw_rx_fpdu *frx;
+       u8 opcode;
+       int bytes;
+
+       if (srx->fpdu_part_rcvd < MIN_DDP_HDR) {
+               /*
+                * copy a minimum sized (tagged) DDP frame control part
+                */
+               bytes = min_t(int, srx->skb_new,
+                             MIN_DDP_HDR - srx->fpdu_part_rcvd);
+
+               skb_copy_bits(skb, srx->skb_offset,
+                             (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+               srx->fpdu_part_rcvd += bytes;
+
+               srx->skb_new -= bytes;
+               srx->skb_offset += bytes;
+               srx->skb_copied += bytes;
+
+               if (srx->fpdu_part_rcvd < MIN_DDP_HDR)
+                       return -EAGAIN;
+
+               if (unlikely(__ddp_get_version(c_hdr) != DDP_VERSION)) {
+                       enum ddp_etype etype;
+                       enum ddp_ecode ecode;
+
+                       pr_warn("siw: received ddp version unsupported %d\n",
+                               __ddp_get_version(c_hdr));
+
+                       if (c_hdr->ddp_rdmap_ctrl & DDP_FLAG_TAGGED) {
+                               etype = DDP_ETYPE_TAGGED_BUF;
+                               ecode = DDP_ECODE_T_VERSION;
+                       } else {
+                               etype = DDP_ETYPE_UNTAGGED_BUF;
+                               ecode = DDP_ECODE_UT_VERSION;
+                       }
+                       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_DDP,
+                                          etype, ecode, 0);
+                       return -EINVAL;
+               }
+               if (unlikely(__rdmap_get_version(c_hdr) != RDMAP_VERSION)) {
+                       pr_warn("siw: received rdmap version unsupported %d\n",
+                               __rdmap_get_version(c_hdr));
+
+                       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+                                          RDMAP_ETYPE_REMOTE_OPERATION,
+                                          RDMAP_ECODE_VERSION, 0);
+                       return -EINVAL;
+               }
+               opcode = __rdmap_get_opcode(c_hdr);
+
+               if (opcode > RDMAP_TERMINATE) {
+                       pr_warn("siw: received unknown packet type %u\n",
+                               opcode);
+
+                       siw_init_terminate(rx_qp(srx), TERM_ERROR_LAYER_RDMAP,
+                                          RDMAP_ETYPE_REMOTE_OPERATION,
+                                          RDMAP_ECODE_OPCODE, 0);
+                       return -EINVAL;
+               }
+               siw_dbg_qp(rx_qp(srx), "new header, opcode %u\n", opcode);
+       } else {
+               /*
+                * Control part was received and checked by an earlier
+                * call; we are resuming reception of a longer header.
+                */
+               opcode = __rdmap_get_opcode(c_hdr);
+       }
+       set_rx_fpdu_context(qp, opcode);
+       frx = qp->rx_fpdu;
+
+       /*
+        * Figure out len of current hdr: variable length of
+        * iwarp hdr may force us to copy hdr information in
+        * two steps. Only tagged DDP messages are already
+        * completely received.
+        */
+       if (iwarp_pktinfo[opcode].hdr_len > sizeof(struct iwarp_ctrl_tagged)) {
+               int hdrlen = iwarp_pktinfo[opcode].hdr_len;
+
+               /*
+                * Consume the part of the remaining header which is
+                * available now. srx->fpdu_part_rcvd is at least
+                * MIN_DDP_HDR here and tells where to continue.
+                */
+               bytes = min_t(int, srx->skb_new,
+                             hdrlen - srx->fpdu_part_rcvd);
+
+               skb_copy_bits(skb, srx->skb_offset,
+                             (char *)c_hdr + srx->fpdu_part_rcvd, bytes);
+
+               srx->fpdu_part_rcvd += bytes;
+
+               srx->skb_new -= bytes;
+               srx->skb_offset += bytes;
+               srx->skb_copied += bytes;
+
+               if (srx->fpdu_part_rcvd < hdrlen)
+                       return -EAGAIN;
+       }
+
+       /*
+        * DDP/RDMAP header receive completed. Check if the current
+        * DDP segment starts a new RDMAP message or continues a previously
+        * started RDMAP message.
+        *
+        * Alternating reception of DDP segments (or FPDUs) from incomplete
+        * tagged and untagged RDMAP messages is supported, as long as
+        * the current tagged or untagged message gets eventually completed
+        * w/o intersection from another message of the same type
+        * (tagged/untagged). E.g., a WRITE can get intersected by a SEND,
+        * but not by a READ RESPONSE etc.
+        */
+       if (srx->mpa_crc_hd) {
+               /*
+                * Restart CRC computation
+                */
+               crypto_shash_init(srx->mpa_crc_hd);
+               crypto_shash_update(srx->mpa_crc_hd, (u8 *)c_hdr,
+                                   srx->fpdu_part_rcvd);
+       }
+       if (frx->more_ddp_segs) {
+               frx->first_ddp_seg = 0;
+               if (frx->prev_rdmap_op != opcode) {
+                       pr_warn("siw: packet intersection: %u : %u\n",
+                               frx->prev_rdmap_op, opcode);
+                       /*
+                        * The last inbound RDMA operation of same type
+                        * (tagged or untagged) is left unfinished.
+                        * To complete it in error, make it the current
+                        * operation again, even with the header already
+                        * overwritten. For error handling, only the opcode
+                        * and current rx context are relevant.
+                        */
+                       set_rx_fpdu_context(qp, frx->prev_rdmap_op);
+                       __rdmap_set_opcode(c_hdr, frx->prev_rdmap_op);
+                       return -EPROTO;
+               }
+       } else {
+               frx->prev_rdmap_op = opcode;
+               frx->first_ddp_seg = 1;
+       }
+       frx->more_ddp_segs = c_hdr->ddp_rdmap_ctrl & DDP_FLAG_LAST ? 0 : 1;
+
+       return 0;
+}
+
+/*
+ * siw_check_tx_fence()
+ *
+ * Release the current ORQ entry and, if send queue processing is
+ * stalled on the ORQ fence (qp->tx_ctx.orq_fence), try to resume it.
+ * Called by siw_rdmap_complete() after a READ RESPONSE got completed
+ * without error.
+ *
+ * @qp:        QP to check
+ *
+ * Returns 0 if SQ processing was not restarted and no inconsistency
+ * was found, -EPROTO on an inconsistent fence/ORQ state, or the result
+ * of siw_sq_start() if SQ processing was restarted.
+ */
+static int siw_check_tx_fence(struct siw_qp *qp)
+{
+       struct siw_wqe *tx_waiting = tx_wqe(qp);
+       struct siw_sqe *rreq;
+       int resume_tx = 0, rv = 0;
+       unsigned long flags;
+
+       spin_lock_irqsave(&qp->orq_lock, flags);
+
+       rreq = orq_get_current(qp);
+
+       /* free current orq entry */
+       WRITE_ONCE(rreq->flags, 0);
+
+       if (qp->tx_ctx.orq_fence) {
+               /* A fenced SQ must have its current work request queued */
+               if (unlikely(tx_waiting->wr_status != SIW_WR_QUEUED)) {
+                       pr_warn("siw: [QP %u]: fence resume: bad status %d\n",
+                               qp_id(qp), tx_waiting->wr_status);
+                       rv = -EPROTO;
+                       /* NOTE: this exit leaves qp->orq_get unchanged */
+                       goto out;
+               }
+               /* resume SQ processing */
+               if (tx_waiting->sqe.opcode == SIW_OP_READ ||
+                   tx_waiting->sqe.opcode == SIW_OP_READ_LOCAL_INV) {
+                       /* Waiting READ: move it to the ORQ now */
+                       rreq = orq_get_tail(qp);
+                       if (unlikely(!rreq)) {
+                               pr_warn("siw: [QP %u]: no ORQE\n", qp_id(qp));
+                               rv = -EPROTO;
+                               goto out;
+                       }
+                       siw_read_to_orq(rreq, &tx_waiting->sqe);
+
+                       qp->orq_put++;
+                       qp->tx_ctx.orq_fence = 0;
+                       resume_tx = 1;
+
+               /*
+                * NOTE(review): siw_orq_empty() is evaluated before
+                * qp->orq_get is advanced past the entry freed above.
+                * Confirm against its definition that this test does not
+                * merely look at that just freed entry.
+                */
+               } else if (siw_orq_empty(qp)) {
+                       qp->tx_ctx.orq_fence = 0;
+                       resume_tx = 1;
+               } else {
+                       pr_warn("siw: [QP %u]: fence resume: orq idx: %d:%d\n",
+                               qp_id(qp), qp->orq_get, qp->orq_put);
+                       rv = -EPROTO;
+               }
+       }
+       /* Step over the ORQ entry released above */
+       qp->orq_get++;
+out:
+       spin_unlock_irqrestore(&qp->orq_lock, flags);
+
+       /* SQ gets restarted after orq_lock has been dropped */
+       if (resume_tx)
+               rv = siw_sq_start(qp);
+
+       return rv;
+}
+
+/*
+ * siw_rdmap_complete()
+ *
+ * Complete processing of an RDMA message after receiving all
+ * DDP segments, or abort processing after encountering an error.
+ *
+ *   o SENDs + RRESPs need a work completion
+ *     (siw_rqe_complete() / siw_sqe_complete()),
+ *   o RREQs need READ RESPONSE initialization,
+ *   o WRITEs need memory dereferencing.
+ *
+ * The opcode is taken from the current receive header, the work queue
+ * element from the current receive context (qp->rx_fpdu). In all
+ * cases the work queue element is set to SIW_WR_IDLE on return.
+ *
+ * @qp:        QP the message was received on
+ * @error:     0 for regular completion, otherwise the (negative) error
+ *             which terminated receive processing
+ *
+ * Returns 0 or the result of the last completion/initialization
+ * helper called.
+ *
+ * TODO: Failed WRITEs need local error to be surfaced.
+ */
+static int siw_rdmap_complete(struct siw_qp *qp, int error)
+{
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       struct siw_wqe *wqe = rx_wqe(qp->rx_fpdu);
+       enum siw_wc_status wc_status = wqe->wc_status;
+       u8 opcode = __rdmap_get_opcode(&srx->hdr.ctrl);
+       int rv = 0;
+
+       switch (opcode) {
+       case RDMAP_SEND_SE:
+       case RDMAP_SEND_SE_INVAL:
+               wqe->rqe.flags |= SIW_WQE_SOLICITED;
+               /* Fall through */
+       case RDMAP_SEND:
+       case RDMAP_SEND_INVAL:
+               /* Nothing to complete if no receive WQE is in use */
+               if (wqe->wr_status == SIW_WR_IDLE)
+                       break;
+
+               srx->ddp_msn[RDMAP_UNTAGGED_QN_SEND]++;
+
+               /* An error without specific status gets a generic one */
+               if (error != 0 && wc_status == SIW_WC_SUCCESS)
+                       wc_status = SIW_WC_GENERAL_ERR;
+               /*
+                * Handle STag invalidation request
+                */
+               if (wc_status == SIW_WC_SUCCESS &&
+                   (opcode == RDMAP_SEND_INVAL ||
+                    opcode == RDMAP_SEND_SE_INVAL)) {
+                       rv = siw_invalidate_stag(qp->pd, srx->inval_stag);
+                       if (rv) {
+                               siw_init_terminate(
+                                       qp, TERM_ERROR_LAYER_RDMAP,
+                                       rv == -EACCES ?
+                                               RDMAP_ETYPE_REMOTE_PROTECTION :
+                                               RDMAP_ETYPE_REMOTE_OPERATION,
+                                       RDMAP_ECODE_CANNOT_INVALIDATE, 0);
+
+                               wc_status = SIW_WC_REM_INV_REQ_ERR;
+                       }
+                       /* Report the STag only if it got invalidated */
+                       rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+                                             rv ? 0 : srx->inval_stag,
+                                             wc_status);
+               } else {
+                       rv = siw_rqe_complete(qp, &wqe->rqe, wqe->processed,
+                                             0, wc_status);
+               }
+               siw_wqe_put_mem(wqe, SIW_OP_RECEIVE);
+               break;
+
+       case RDMAP_RDMA_READ_RESP:
+               if (wqe->wr_status == SIW_WR_IDLE)
+                       break;
+
+               if (error != 0) {
+                       if ((srx->state == SIW_GET_HDR &&
+                            qp->rx_fpdu->first_ddp_seg) || error == -ENODATA)
+                               /* possible RREQ in ORQ left untouched */
+                               break;
+
+                       if (wc_status == SIW_WC_SUCCESS)
+                               wc_status = SIW_WC_GENERAL_ERR;
+               } else if (qp->kernel_verbs &&
+                          rx_type(wqe) == SIW_OP_READ_LOCAL_INV) {
+                       /*
+                        * Handle any STag invalidation request
+                        */
+                       rv = siw_invalidate_stag(qp->pd, wqe->sqe.sge[0].lkey);
+                       if (rv) {
+                               siw_init_terminate(qp, TERM_ERROR_LAYER_RDMAP,
+                                                  RDMAP_ETYPE_CATASTROPHIC,
+                                                  RDMAP_ECODE_UNSPECIFIED, 0);
+
+                               if (wc_status == SIW_WC_SUCCESS) {
+                                       wc_status = SIW_WC_GENERAL_ERR;
+                                       error = rv;
+                               }
+                       }
+               }
+               /*
+                * All errors turn the wqe into signalled.
+                */
+               if ((wqe->sqe.flags & SIW_WQE_SIGNALLED) || error != 0)
+                       rv = siw_sqe_complete(qp, &wqe->sqe, wqe->processed,
+                                             wc_status);
+               siw_wqe_put_mem(wqe, SIW_OP_READ);
+
+               /*
+                * On success, siw_check_tx_fence() releases the ORQ entry
+                * and its result replaces rv.
+                */
+               if (!error)
+                       rv = siw_check_tx_fence(qp);
+               else
+                       /* Disable current ORQ element */
+                       WRITE_ONCE(orq_get_current(qp)->flags, 0);
+               break;
+
+       case RDMAP_RDMA_READ_REQ:
+               /* A failed READ REQUEST does not get any response set up */
+               if (!error) {
+                       rv = siw_init_rresp(qp, srx);
+                       srx->ddp_msn[RDMAP_UNTAGGED_QN_RDMA_READ]++;
+               }
+               break;
+
+       case RDMAP_RDMA_WRITE:
+               if (wqe->wr_status == SIW_WR_IDLE)
+                       break;
+
+               /*
+                * Free References from memory object if
+                * attached to receive context (inbound WRITE).
+                * While a zero-length WRITE is allowed,
+                * no memory reference got created.
+                */
+               if (rx_mem(&qp->rx_tagged)) {
+                       siw_mem_put(rx_mem(&qp->rx_tagged));
+                       rx_mem(&qp->rx_tagged) = NULL;
+               }
+               break;
+
+       default:
+               break;
+       }
+       wqe->wr_status = SIW_WR_IDLE;
+
+       return rv;
+}
+
+/*
+ * siw_tcp_rx_data()
+ *
+ * Main routine to consume inbound TCP payload
+ *
+ * Processes the payload of @skb FPDU by FPDU, driven by the receive
+ * state kept in the QP's receive context (srx->state):
+ * SIW_GET_HDR -> SIW_GET_DATA_START [-> SIW_GET_DATA_MORE] ->
+ * SIW_GET_TRAILER -> SIW_GET_HDR. Processing stops if the skb is
+ * exhausted, if the current step needs more data (-EAGAIN), or on
+ * error, in which case siw_qp_cm_drop() gets called for the QP.
+ *
+ * @rd_desc:   read descriptor
+ * @skb:       socket buffer
+ * @off:       offset in skb
+ * @len:       skb->len - offset : payload in skb (not referenced;
+ *             the amount of payload is computed from @skb and @off)
+ *
+ * Returns the number of bytes of @skb counted as consumed
+ * (srx->skb_copied).
+ */
+int siw_tcp_rx_data(read_descriptor_t *rd_desc, struct sk_buff *skb,
+                   unsigned int off, size_t len)
+{
+       struct siw_qp *qp = rd_desc->arg.data;
+       struct siw_rx_stream *srx = &qp->rx_stream;
+       int rv;
+
+       /* Set up per-skb read position and accounting */
+       srx->skb = skb;
+       srx->skb_new = skb->len - off;
+       srx->skb_offset = off;
+       srx->skb_copied = 0;
+
+       siw_dbg_qp(qp, "new data, len %d\n", srx->skb_new);
+
+       while (srx->skb_new) {
+               /* Cleared where siw_rdmap_complete() must not run on error */
+               int run_completion = 1;
+
+               if (unlikely(srx->rx_suspend)) {
+                       /* Do not process any more data */
+                       srx->skb_copied += srx->skb_new;
+                       break;
+               }
+               switch (srx->state) {
+               case SIW_GET_HDR:
+                       rv = siw_get_hdr(srx);
+                       if (!rv) {
+                               /*
+                                * Header complete: what remains of the
+                                * length announced by the MPA header is
+                                * the DDP payload of this FPDU.
+                                */
+                               srx->fpdu_part_rem =
+                                       be16_to_cpu(srx->hdr.ctrl.mpa_len) -
+                                       srx->fpdu_part_rcvd + MPA_HDR_SIZE;
+
+                               /* Bytes needed to fill up to a multiple of 4 */
+                               if (srx->fpdu_part_rem)
+                                       srx->pad = -srx->fpdu_part_rem & 0x3;
+                               else
+                                       srx->pad = 0;
+
+                               srx->state = SIW_GET_DATA_START;
+                               srx->fpdu_part_rcvd = 0;
+                       }
+                       break;
+
+               case SIW_GET_DATA_MORE:
+                       /*
+                        * Another data fragment of the same DDP segment.
+                        * Setting first_ddp_seg = 0 avoids repeating
+                        * initializations that shall occur only once per
+                        * DDP segment.
+                        */
+                       qp->rx_fpdu->first_ddp_seg = 0;
+                       /* Fall through */
+
+               case SIW_GET_DATA_START:
+                       /*
+                        * Headers will be checked by the opcode-specific
+                        * data receive function below.
+                        */
+                       rv = iwarp_pktinfo[qp->rx_stream.rdmap_op].rx_data(qp);
+                       if (!rv) {
+                               int mpa_len =
+                                       be16_to_cpu(srx->hdr.ctrl.mpa_len)
+                                       + MPA_HDR_SIZE;
+
+                               /* Trailer to receive next: padding + CRC */
+                               srx->fpdu_part_rem = (-mpa_len & 0x3)
+                                                     + MPA_CRC_SIZE;
+                               srx->fpdu_part_rcvd = 0;
+                               srx->state = SIW_GET_TRAILER;
+                       } else {
+                               /*
+                                * -ECONNRESET (as returned by
+                                * siw_proc_terminate()) skips completion
+                                * handling. Any other result, including
+                                * -EAGAIN, continues in SIW_GET_DATA_MORE.
+                                */
+                               if (unlikely(rv == -ECONNRESET))
+                                       run_completion = 0;
+                               else
+                                       srx->state = SIW_GET_DATA_MORE;
+                       }
+                       break;
+
+               case SIW_GET_TRAILER:
+                       /*
+                        * read CRC + any padding
+                        */
+                       rv = siw_get_trailer(qp, srx);
+                       if (likely(!rv)) {
+                               /*
+                                * FPDU completed.
+                                * complete RDMAP message if last fragment
+                                */
+                               srx->state = SIW_GET_HDR;
+                               srx->fpdu_part_rcvd = 0;
+
+                               if (!(srx->hdr.ctrl.ddp_rdmap_ctrl &
+                                     DDP_FLAG_LAST))
+                                       /* more frags */
+                                       break;
+
+                               rv = siw_rdmap_complete(qp, 0);
+                               /* Do not complete again if this failed */
+                               run_completion = 0;
+                       }
+                       break;
+
+               default:
+                       pr_warn("QP[%u]: RX out of state\n", qp_id(qp));
+                       rv = -EPROTO;
+                       run_completion = 0;
+               }
+               if (unlikely(rv != 0 && rv != -EAGAIN)) {
+                       /*
+                        * Hard error: complete a message in progress
+                        * with error status, then drop the connection.
+                        */
+                       if ((srx->state > SIW_GET_HDR ||
+                            qp->rx_fpdu->more_ddp_segs) && run_completion)
+                               siw_rdmap_complete(qp, rv);
+
+                       siw_dbg_qp(qp, "rx error %d, rx state %d\n", rv,
+                                  srx->state);
+
+                       siw_qp_cm_drop(qp, 1);
+
+                       break;
+               }
+               /* Only -EAGAIN left here: wait for more data */
+               if (rv) {
+                       siw_dbg_qp(qp, "fpdu fragment, state %d, missing %d\n",
+                                  srx->state, srx->fpdu_part_rem);
+                       break;
+               }
+       }
+       return srx->skb_copied;
+}