Skip to content

Commit e114e66

Browse files
Paolo Abeni authored and gregkh committed
IB/ipoib: move back IB LL address into the hard header
commit fc791b6335152c5278dc4a4991bcb2d329f806f9 upstream. After the commit 9207f9d45b0a ("net: preserve IP control block during GSO segmentation"), the GSO CB and the IPoIB CB conflict. That destroys the IPoIB address information cached there, causing a severe performance regression, as better described here: http://marc.info/?l=linux-kernel&m=146787279825501&w=2 This change moves the data cached by the IPoIB driver from the skb control block into the IPoIB hard header, as done before the commit 936d7de ("IPoIB: Stop lying about hard_header_len and use skb->cb to stash LL addresses"). In order to avoid GRO issues, on packet reception, the IPoIB driver stashes into the skb a dummy pseudo header, so that the received packets actually have a hard header matching the declared length. To avoid changing the connected mode maximum mtu, the allocated head buffer size is increased by the pseudo header length. After this commit, IPoIB performance is back to the pre-regression value. v2 -> v3: rebased v1 -> v2: avoid changing the max mtu, increasing the head buf size Fixes: 9207f9d45b0a ("net: preserve IP control block during GSO segmentation") Signed-off-by: Paolo Abeni <pabeni@redhat.com> Signed-off-by: David S. Miller <davem@davemloft.net> Cc: Vasiliy Tolstov <v.tolstov@selfip.ru> Cc: Nikolay Borisov <n.borisov.lkml@gmail.com> Cc: Doug Ledford <dledford@redhat.com> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
1 parent 800a78f commit e114e66

5 files changed

Lines changed: 64 additions & 43 deletions

File tree

drivers/infiniband/ulp/ipoib/ipoib.h

Lines changed: 14 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ enum ipoib_flush_level {
6363

6464
enum {
6565
IPOIB_ENCAP_LEN = 4,
66+
IPOIB_PSEUDO_LEN = 20,
67+
IPOIB_HARD_LEN = IPOIB_ENCAP_LEN + IPOIB_PSEUDO_LEN,
6668

6769
IPOIB_UD_HEAD_SIZE = IB_GRH_BYTES + IPOIB_ENCAP_LEN,
6870
IPOIB_UD_RX_SG = 2, /* max buffer needed for 4K mtu */
@@ -131,15 +133,21 @@ struct ipoib_header {
131133
u16 reserved;
132134
};
133135

134-
struct ipoib_cb {
135-
struct qdisc_skb_cb qdisc_cb;
136-
u8 hwaddr[INFINIBAND_ALEN];
136+
struct ipoib_pseudo_header {
137+
u8 hwaddr[INFINIBAND_ALEN];
137138
};
138139

139-
static inline struct ipoib_cb *ipoib_skb_cb(const struct sk_buff *skb)
140+
static inline void skb_add_pseudo_hdr(struct sk_buff *skb)
140141
{
141-
BUILD_BUG_ON(sizeof(skb->cb) < sizeof(struct ipoib_cb));
142-
return (struct ipoib_cb *)skb->cb;
142+
char *data = skb_push(skb, IPOIB_PSEUDO_LEN);
143+
144+
/*
145+
* only the ipoib header is present now, make room for a dummy
146+
* pseudo header and set skb field accordingly
147+
*/
148+
memset(data, 0, IPOIB_PSEUDO_LEN);
149+
skb_reset_mac_header(skb);
150+
skb_pull(skb, IPOIB_HARD_LEN);
143151
}
144152

145153
/* Used for all multicast joins (broadcast, IPv4 mcast and IPv6 mcast) */

drivers/infiniband/ulp/ipoib/ipoib_cm.c

Lines changed: 8 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -63,6 +63,8 @@ MODULE_PARM_DESC(cm_data_debug_level,
6363
#define IPOIB_CM_RX_DELAY (3 * 256 * HZ)
6464
#define IPOIB_CM_RX_UPDATE_MASK (0x3)
6565

66+
#define IPOIB_CM_RX_RESERVE (ALIGN(IPOIB_HARD_LEN, 16) - IPOIB_ENCAP_LEN)
67+
6668
static struct ib_qp_attr ipoib_cm_err_attr = {
6769
.qp_state = IB_QPS_ERR
6870
};
@@ -147,15 +149,15 @@ static struct sk_buff *ipoib_cm_alloc_rx_skb(struct net_device *dev,
147149
struct sk_buff *skb;
148150
int i;
149151

150-
skb = dev_alloc_skb(IPOIB_CM_HEAD_SIZE + 12);
152+
skb = dev_alloc_skb(ALIGN(IPOIB_CM_HEAD_SIZE + IPOIB_PSEUDO_LEN, 16));
151153
if (unlikely(!skb))
152154
return NULL;
153155

154156
/*
155-
* IPoIB adds a 4 byte header. So we need 12 more bytes to align the
157+
* IPoIB adds a IPOIB_ENCAP_LEN byte header, this will align the
156158
* IP header to a multiple of 16.
157159
*/
158-
skb_reserve(skb, 12);
160+
skb_reserve(skb, IPOIB_CM_RX_RESERVE);
159161

160162
mapping[0] = ib_dma_map_single(priv->ca, skb->data, IPOIB_CM_HEAD_SIZE,
161163
DMA_FROM_DEVICE);
@@ -624,9 +626,9 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
624626
if (wc->byte_len < IPOIB_CM_COPYBREAK) {
625627
int dlen = wc->byte_len;
626628

627-
small_skb = dev_alloc_skb(dlen + 12);
629+
small_skb = dev_alloc_skb(dlen + IPOIB_CM_RX_RESERVE);
628630
if (small_skb) {
629-
skb_reserve(small_skb, 12);
631+
skb_reserve(small_skb, IPOIB_CM_RX_RESERVE);
630632
ib_dma_sync_single_for_cpu(priv->ca, rx_ring[wr_id].mapping[0],
631633
dlen, DMA_FROM_DEVICE);
632634
skb_copy_from_linear_data(skb, small_skb->data, dlen);
@@ -663,8 +665,7 @@ void ipoib_cm_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
663665

664666
copied:
665667
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
666-
skb_reset_mac_header(skb);
667-
skb_pull(skb, IPOIB_ENCAP_LEN);
668+
skb_add_pseudo_hdr(skb);
668669

669670
++dev->stats.rx_packets;
670671
dev->stats.rx_bytes += skb->len;

drivers/infiniband/ulp/ipoib/ipoib_ib.c

Lines changed: 5 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -130,16 +130,15 @@ static struct sk_buff *ipoib_alloc_rx_skb(struct net_device *dev, int id)
130130

131131
buf_size = IPOIB_UD_BUF_SIZE(priv->max_ib_mtu);
132132

133-
skb = dev_alloc_skb(buf_size + IPOIB_ENCAP_LEN);
133+
skb = dev_alloc_skb(buf_size + IPOIB_HARD_LEN);
134134
if (unlikely(!skb))
135135
return NULL;
136136

137137
/*
138-
* IB will leave a 40 byte gap for a GRH and IPoIB adds a 4 byte
139-
* header. So we need 4 more bytes to get to 48 and align the
140-
* IP header to a multiple of 16.
138+
* the IP header will be at IPOIP_HARD_LEN + IB_GRH_BYTES, that is
139+
* 64 bytes aligned
141140
*/
142-
skb_reserve(skb, 4);
141+
skb_reserve(skb, sizeof(struct ipoib_pseudo_header));
143142

144143
mapping = priv->rx_ring[id].mapping;
145144
mapping[0] = ib_dma_map_single(priv->ca, skb->data, buf_size,
@@ -242,8 +241,7 @@ static void ipoib_ib_handle_rx_wc(struct net_device *dev, struct ib_wc *wc)
242241
skb_pull(skb, IB_GRH_BYTES);
243242

244243
skb->protocol = ((struct ipoib_header *) skb->data)->proto;
245-
skb_reset_mac_header(skb);
246-
skb_pull(skb, IPOIB_ENCAP_LEN);
244+
skb_add_pseudo_hdr(skb);
247245

248246
++dev->stats.rx_packets;
249247
dev->stats.rx_bytes += skb->len;

drivers/infiniband/ulp/ipoib/ipoib_main.c

Lines changed: 33 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -850,9 +850,12 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
850850
ipoib_neigh_free(neigh);
851851
goto err_drop;
852852
}
853-
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE)
853+
if (skb_queue_len(&neigh->queue) <
854+
IPOIB_MAX_PATH_REC_QUEUE) {
855+
/* put pseudoheader back on for next time */
856+
skb_push(skb, IPOIB_PSEUDO_LEN);
854857
__skb_queue_tail(&neigh->queue, skb);
855-
else {
858+
} else {
856859
ipoib_warn(priv, "queue length limit %d. Packet drop.\n",
857860
skb_queue_len(&neigh->queue));
858861
goto err_drop;
@@ -889,24 +892,26 @@ static void neigh_add_path(struct sk_buff *skb, u8 *daddr,
889892
}
890893

891894
static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
892-
struct ipoib_cb *cb)
895+
struct ipoib_pseudo_header *phdr)
893896
{
894897
struct ipoib_dev_priv *priv = netdev_priv(dev);
895898
struct ipoib_path *path;
896899
unsigned long flags;
897900

898901
spin_lock_irqsave(&priv->lock, flags);
899902

900-
path = __path_find(dev, cb->hwaddr + 4);
903+
path = __path_find(dev, phdr->hwaddr + 4);
901904
if (!path || !path->valid) {
902905
int new_path = 0;
903906

904907
if (!path) {
905-
path = path_rec_create(dev, cb->hwaddr + 4);
908+
path = path_rec_create(dev, phdr->hwaddr + 4);
906909
new_path = 1;
907910
}
908911
if (path) {
909912
if (skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
913+
/* put pseudoheader back on for next time */
914+
skb_push(skb, IPOIB_PSEUDO_LEN);
910915
__skb_queue_tail(&path->queue, skb);
911916
} else {
912917
++dev->stats.tx_dropped;
@@ -934,10 +939,12 @@ static void unicast_arp_send(struct sk_buff *skb, struct net_device *dev,
934939
be16_to_cpu(path->pathrec.dlid));
935940

936941
spin_unlock_irqrestore(&priv->lock, flags);
937-
ipoib_send(dev, skb, path->ah, IPOIB_QPN(cb->hwaddr));
942+
ipoib_send(dev, skb, path->ah, IPOIB_QPN(phdr->hwaddr));
938943
return;
939944
} else if ((path->query || !path_rec_start(dev, path)) &&
940945
skb_queue_len(&path->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
946+
/* put pseudoheader back on for next time */
947+
skb_push(skb, IPOIB_PSEUDO_LEN);
941948
__skb_queue_tail(&path->queue, skb);
942949
} else {
943950
++dev->stats.tx_dropped;
@@ -951,13 +958,15 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
951958
{
952959
struct ipoib_dev_priv *priv = netdev_priv(dev);
953960
struct ipoib_neigh *neigh;
954-
struct ipoib_cb *cb = ipoib_skb_cb(skb);
961+
struct ipoib_pseudo_header *phdr;
955962
struct ipoib_header *header;
956963
unsigned long flags;
957964

965+
phdr = (struct ipoib_pseudo_header *) skb->data;
966+
skb_pull(skb, sizeof(*phdr));
958967
header = (struct ipoib_header *) skb->data;
959968

960-
if (unlikely(cb->hwaddr[4] == 0xff)) {
969+
if (unlikely(phdr->hwaddr[4] == 0xff)) {
961970
/* multicast, arrange "if" according to probability */
962971
if ((header->proto != htons(ETH_P_IP)) &&
963972
(header->proto != htons(ETH_P_IPV6)) &&
@@ -970,13 +979,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
970979
return NETDEV_TX_OK;
971980
}
972981
/* Add in the P_Key for multicast*/
973-
cb->hwaddr[8] = (priv->pkey >> 8) & 0xff;
974-
cb->hwaddr[9] = priv->pkey & 0xff;
982+
phdr->hwaddr[8] = (priv->pkey >> 8) & 0xff;
983+
phdr->hwaddr[9] = priv->pkey & 0xff;
975984

976-
neigh = ipoib_neigh_get(dev, cb->hwaddr);
985+
neigh = ipoib_neigh_get(dev, phdr->hwaddr);
977986
if (likely(neigh))
978987
goto send_using_neigh;
979-
ipoib_mcast_send(dev, cb->hwaddr, skb);
988+
ipoib_mcast_send(dev, phdr->hwaddr, skb);
980989
return NETDEV_TX_OK;
981990
}
982991

@@ -985,16 +994,16 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
985994
case htons(ETH_P_IP):
986995
case htons(ETH_P_IPV6):
987996
case htons(ETH_P_TIPC):
988-
neigh = ipoib_neigh_get(dev, cb->hwaddr);
997+
neigh = ipoib_neigh_get(dev, phdr->hwaddr);
989998
if (unlikely(!neigh)) {
990-
neigh_add_path(skb, cb->hwaddr, dev);
999+
neigh_add_path(skb, phdr->hwaddr, dev);
9911000
return NETDEV_TX_OK;
9921001
}
9931002
break;
9941003
case htons(ETH_P_ARP):
9951004
case htons(ETH_P_RARP):
9961005
/* for unicast ARP and RARP should always perform path find */
997-
unicast_arp_send(skb, dev, cb);
1006+
unicast_arp_send(skb, dev, phdr);
9981007
return NETDEV_TX_OK;
9991008
default:
10001009
/* ethertype not supported by IPoIB */
@@ -1011,11 +1020,13 @@ static int ipoib_start_xmit(struct sk_buff *skb, struct net_device *dev)
10111020
goto unref;
10121021
}
10131022
} else if (neigh->ah) {
1014-
ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(cb->hwaddr));
1023+
ipoib_send(dev, skb, neigh->ah, IPOIB_QPN(phdr->hwaddr));
10151024
goto unref;
10161025
}
10171026

10181027
if (skb_queue_len(&neigh->queue) < IPOIB_MAX_PATH_REC_QUEUE) {
1028+
/* put pseudoheader back on for next time */
1029+
skb_push(skb, sizeof(*phdr));
10191030
spin_lock_irqsave(&priv->lock, flags);
10201031
__skb_queue_tail(&neigh->queue, skb);
10211032
spin_unlock_irqrestore(&priv->lock, flags);
@@ -1047,8 +1058,8 @@ static int ipoib_hard_header(struct sk_buff *skb,
10471058
unsigned short type,
10481059
const void *daddr, const void *saddr, unsigned len)
10491060
{
1061+
struct ipoib_pseudo_header *phdr;
10501062
struct ipoib_header *header;
1051-
struct ipoib_cb *cb = ipoib_skb_cb(skb);
10521063

10531064
header = (struct ipoib_header *) skb_push(skb, sizeof *header);
10541065

@@ -1057,12 +1068,13 @@ static int ipoib_hard_header(struct sk_buff *skb,
10571068

10581069
/*
10591070
* we don't rely on dst_entry structure, always stuff the
1060-
* destination address into skb->cb so we can figure out where
1071+
* destination address into skb hard header so we can figure out where
10611072
* to send the packet later.
10621073
*/
1063-
memcpy(cb->hwaddr, daddr, INFINIBAND_ALEN);
1074+
phdr = (struct ipoib_pseudo_header *) skb_push(skb, sizeof(*phdr));
1075+
memcpy(phdr->hwaddr, daddr, INFINIBAND_ALEN);
10641076

1065-
return sizeof *header;
1077+
return IPOIB_HARD_LEN;
10661078
}
10671079

10681080
static void ipoib_set_mcast_list(struct net_device *dev)
@@ -1638,7 +1650,7 @@ void ipoib_setup(struct net_device *dev)
16381650

16391651
dev->flags |= IFF_BROADCAST | IFF_MULTICAST;
16401652

1641-
dev->hard_header_len = IPOIB_ENCAP_LEN;
1653+
dev->hard_header_len = IPOIB_HARD_LEN;
16421654
dev->addr_len = INFINIBAND_ALEN;
16431655
dev->type = ARPHRD_INFINIBAND;
16441656
dev->tx_queue_len = ipoib_sendq_size * 2;

drivers/infiniband/ulp/ipoib/ipoib_multicast.c

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -756,9 +756,11 @@ void ipoib_mcast_send(struct net_device *dev, u8 *daddr, struct sk_buff *skb)
756756
__ipoib_mcast_add(dev, mcast);
757757
list_add_tail(&mcast->list, &priv->multicast_list);
758758
}
759-
if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE)
759+
if (skb_queue_len(&mcast->pkt_queue) < IPOIB_MAX_MCAST_QUEUE) {
760+
/* put pseudoheader back on for next time */
761+
skb_push(skb, sizeof(struct ipoib_pseudo_header));
760762
skb_queue_tail(&mcast->pkt_queue, skb);
761-
else {
763+
} else {
762764
++dev->stats.tx_dropped;
763765
dev_kfree_skb_any(skb);
764766
}

0 commit comments

Comments
 (0)