/*
 * Pseudo-driver for the intermediate queue device.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License
 * as published by the Free Software Foundation; either version
 * 2 of the License, or (at your option) any later version.
 *
 * Authors: Patrick McHardy, <kaber@trash.net>
 *
 * The first version was written by Martin Devera, <devik@cdi.cz>
 *
 * See Credits.txt
 */
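
/*
 * Typical usage (per the linuximq wiki referenced at the bottom of this
 * file): packets are marked for an IMQ device with the iptables IMQ
 * target, e.g.
 *
 *	iptables -t mangle -A PREROUTING -i eth0 -j IMQ --todev 0
 *
 * and are then shaped by whatever qdisc is attached to that device, e.g.
 *
 *	tc qdisc add dev imq0 root handle 1: htb
 *
 * (Illustrative commands from the project documentation, not part of
 * this file's original comments.)
 */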

#include <linux/module.h>
#include <linux/kernel.h>
#include <linux/moduleparam.h>
#include <linux/list.h>
#include <linux/skbuff.h>
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/rtnetlink.h>
#include <linux/if_arp.h>
#include <linux/netfilter.h>
#include <linux/netfilter_ipv4.h>
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
#include <linux/netfilter_ipv6.h>
#endif
#include <linux/imq.h>
#include <linux/jhash.h>	/* jhash_3words() */
#include <net/pkt_sched.h>
#include <net/netfilter/nf_queue.h>
#include <net/sock.h>
#include <linux/ip.h>
#include <linux/ipv6.h>
#include <linux/if_vlan.h>
#include <linux/if_pppox.h>
#include <net/ip.h>
#include <net/ipv6.h>

static int imq_nf_queue(struct nf_queue_entry *entry, unsigned int queue_num);

static nf_hookfn imq_nf_hook;
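
/*
 * Hook placement relative to NAT is chosen at build time by the
 * CONFIG_IMQ_BEHAVIOR_{AA,AB,BA,BB} options: the first letter selects
 * before (B) or after (A) NAT on PREROUTING, the second letter does the
 * same for POSTROUTING; compare the pr_info() lines in imq_init_module().
 */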

static struct nf_hook_ops imq_ops[] = {
	{
	/* imq_ingress_ipv4 */
		.hook		= imq_nf_hook,
		.pf		= PF_INET,
		.hooknum	= NF_INET_PRE_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
		.priority	= NF_IP_PRI_MANGLE + 1,
#else
		.priority	= NF_IP_PRI_NAT_DST + 1,
#endif
	},
	{
	/* imq_egress_ipv4 */
		.hook		= imq_nf_hook,
		.pf		= PF_INET,
		.hooknum	= NF_INET_POST_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
		.priority	= NF_IP_PRI_LAST,
#else
		.priority	= NF_IP_PRI_NAT_SRC - 1,
#endif
	},
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	{
	/* imq_ingress_ipv6 */
		.hook		= imq_nf_hook,
		.pf		= PF_INET6,
		.hooknum	= NF_INET_PRE_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
		.priority	= NF_IP6_PRI_MANGLE + 1,
#else
		.priority	= NF_IP6_PRI_NAT_DST + 1,
#endif
	},
	{
	/* imq_egress_ipv6 */
		.hook		= imq_nf_hook,
		.pf		= PF_INET6,
		.hooknum	= NF_INET_POST_ROUTING,
#if defined(CONFIG_IMQ_BEHAVIOR_AA) || defined(CONFIG_IMQ_BEHAVIOR_BA)
		.priority	= NF_IP6_PRI_LAST,
#else
		.priority	= NF_IP6_PRI_NAT_SRC - 1,
#endif
	},
#endif
};

#if defined(CONFIG_IMQ_NUM_DEVS)
static int numdevs = CONFIG_IMQ_NUM_DEVS;
#else
static int numdevs = IMQ_MAX_DEVS;
#endif

static struct net_device *imq_devs_cache[IMQ_MAX_DEVS];

#define IMQ_MAX_QUEUES 32
static int numqueues = 1;
static u32 imq_hashrnd;
static int imq_dev_accurate_stats = 1;

static inline __be16 pppoe_proto(const struct sk_buff *skb)
{
	return *((__be16 *)(skb_mac_header(skb) + ETH_HLEN +
			    sizeof(struct pppoe_hdr)));
}
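
/*
 * Map a packet to one of the device's TX queues: strip any VLAN/PPPoE
 * encapsulation, then jhash the addresses and ports of the inner
 * IPv4/IPv6 header. Falls back to a protocol-based modulo for packets
 * it cannot parse. Any headers pulled here are pushed back before
 * returning.
 */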
static u16 imq_hash(struct net_device *dev, struct sk_buff *skb)
{
	unsigned int pull_len;
	u16 protocol = skb->protocol;
	u32 addr1, addr2;
	u32 hash;
	int ihl = 0;	/* signed: ipv6_skip_exthdr() returns -1 on error */
	union {
		u16 in16[2];
		u32 in32;
	} ports;
	u8 ip_proto;

	pull_len = 0;

recheck:
	switch (protocol) {
	case htons(ETH_P_8021Q): {
		if (unlikely(skb_pull(skb, VLAN_HLEN) == NULL))
			goto other;

		pull_len += VLAN_HLEN;
		skb->network_header += VLAN_HLEN;

		protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto;
		goto recheck;
	}

	case htons(ETH_P_PPP_SES): {
		if (unlikely(skb_pull(skb, PPPOE_SES_HLEN) == NULL))
			goto other;

		pull_len += PPPOE_SES_HLEN;
		skb->network_header += PPPOE_SES_HLEN;

		protocol = pppoe_proto(skb);
		goto recheck;
	}

	case htons(ETH_P_IP): {
		const struct iphdr *iph;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct iphdr))))
			goto other;

		/* read the header only after pskb_may_pull(), which may
		 * reallocate the skb data area
		 */
		iph = ip_hdr(skb);
		addr1 = iph->daddr;
		addr2 = iph->saddr;

		ip_proto = !(iph->frag_off & htons(IP_MF | IP_OFFSET)) ?
				 iph->protocol : 0;
		ihl = ip_hdrlen(skb);

		break;
	}
#if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE)
	case htons(ETH_P_IPV6): {
		const struct ipv6hdr *iph;
		__be16 fo = 0;

		if (unlikely(!pskb_may_pull(skb, sizeof(struct ipv6hdr))))
			goto other;

		iph = ipv6_hdr(skb);
		addr1 = iph->daddr.s6_addr32[3];
		addr2 = iph->saddr.s6_addr32[3];
		ihl = ipv6_skip_exthdr(skb, sizeof(struct ipv6hdr), &ip_proto,
				       &fo);
		if (unlikely(ihl < 0))
			goto other;

		break;
	}
#endif
	default:
other:
		if (pull_len != 0) {
			skb_push(skb, pull_len);
			skb->network_header -= pull_len;
		}

		return (u16)(ntohs(protocol) % dev->real_num_tx_queues);
	}

	if (addr1 > addr2)
		swap(addr1, addr2);

	switch (ip_proto) {
	case IPPROTO_TCP:
	case IPPROTO_UDP:
	case IPPROTO_DCCP:
	case IPPROTO_ESP:
	case IPPROTO_AH:
	case IPPROTO_SCTP:
	case IPPROTO_UDPLITE: {
		if (likely(skb_copy_bits(skb, ihl, &ports.in32, 4) >= 0)) {
			if (ports.in16[0] > ports.in16[1])
				swap(ports.in16[0], ports.in16[1]);
			break;
		}
		/* fall-through */
	}
	default:
		ports.in32 = 0;
		break;
	}

	if (pull_len != 0) {
		skb_push(skb, pull_len);
		skb->network_header -= pull_len;
	}

	hash = jhash_3words(addr1, addr2, ports.in32, imq_hashrnd ^ ip_proto);

	return (u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
}

static inline bool sk_tx_queue_recorded(struct sock *sk)
{
	return (sk_tx_queue_get(sk) >= 0);
}
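
/*
 * Pick a TX queue for the skb, in order of preference: the recorded
 * rx_queue, the socket's recorded tx_queue, the socket hash, and
 * finally a hash of the packet itself via imq_hash().
 */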
static struct netdev_queue *imq_select_queue(struct net_device *dev,
					     struct sk_buff *skb)
{
	u16 queue_index = 0;
	u32 hash;

	if (likely(dev->real_num_tx_queues == 1))
		goto out;

	/* IMQ can be receiving ingress or egress packets. */

	/* Check first if rx_queue is set */
	if (skb_rx_queue_recorded(skb)) {
		queue_index = skb_get_rx_queue(skb);
		goto out;
	}

	/* Check if socket has tx_queue set */
	if (sk_tx_queue_recorded(skb->sk)) {
		queue_index = sk_tx_queue_get(skb->sk);
		goto out;
	}

	/* Try to use the socket hash */
	if (skb->sk && skb->sk->sk_hash) {
		hash = skb->sk->sk_hash;
		queue_index =
			(u16)(((u64)hash * dev->real_num_tx_queues) >> 32);
		goto out;
	}

	/* Generate hash from packet data */
	queue_index = imq_hash(dev, skb);

out:
	if (unlikely(queue_index >= dev->real_num_tx_queues))
		queue_index = (u16)((u32)queue_index % dev->real_num_tx_queues);

	skb_set_queue_mapping(skb, queue_index);
	return netdev_get_tx_queue(dev, queue_index);
}

static struct net_device_stats *imq_get_stats(struct net_device *dev)
{
	return &dev->stats;
}

/* called for packets kfree'd in qdiscs at places other than enqueue */
static void imq_skb_destructor(struct sk_buff *skb)
{
	struct nf_queue_entry *entry = skb->nf_queue_entry;

	skb->nf_queue_entry = NULL;

	if (entry) {
		nf_queue_entry_release_refs(entry);
		kfree(entry);
	}

	skb_restore_cb(skb); /* kfree backup */
}

static void imq_done_check_queue_mapping(struct sk_buff *skb,
					 struct net_device *dev)
{
	unsigned int queue_index;

	/* Don't let queue_mapping be left too large after exiting IMQ */
	if (likely(skb->dev != dev && skb->dev != NULL)) {
		queue_index = skb_get_queue_mapping(skb);
		if (unlikely(queue_index >= skb->dev->real_num_tx_queues)) {
			queue_index = (u16)((u32)queue_index %
						skb->dev->real_num_tx_queues);
			skb_set_queue_mapping(skb, queue_index);
		}
	} else {
		/* skb->dev was IMQ device itself or NULL, be on safe side and
		 * just clear queue mapping.
		 */
		skb_set_queue_mapping(skb, 0);
	}
}
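
/*
 * ndo_start_xmit for the IMQ device: packets land here once the
 * attached qdisc has scheduled them. Restore the saved control block
 * and re-inject the packet into netfilter with nf_reinject() so that
 * it resumes traversal where it was queued.
 */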
static netdev_tx_t imq_dev_xmit(struct sk_buff *skb, struct net_device *dev)
{
	struct nf_queue_entry *entry = skb->nf_queue_entry;

	rcu_read_lock();

	skb->nf_queue_entry = NULL;
	netif_trans_update(dev);

	dev->stats.tx_bytes += skb->len;
	dev->stats.tx_packets++;

	if (unlikely(entry == NULL)) {
		/* We don't know what is going on here.. packet is queued for
		 * imq device, but (probably) not by us.
		 *
		 * If this packet was not sent here by imq_nf_queue(), then
		 * skb_save_cb() was not used and freeing the skb should not
		 * show:
		 *   WARNING: IMQ: kfree_skb: skb->cb_next:..
		 * and/or
		 *   WARNING: IMQ: kfree_skb: skb->nf_queue_entry...
		 *
		 * However, if this message is shown, then IMQ is somehow
		 * broken and you should report this to linuximq.net.
		 */

		/* imq_dev_xmit is a black hole that eats all packets: report
		 * that we ate this packet happily and increase the dropped
		 * counter.
		 */

		dev->stats.tx_dropped++;
		dev_kfree_skb(skb);

		rcu_read_unlock();
		return NETDEV_TX_OK;
	}

	skb_restore_cb(skb); /* restore skb->cb */

	skb->imq_flags = 0;
	skb->destructor = NULL;

	imq_done_check_queue_mapping(skb, dev);

	nf_reinject(entry, NF_ACCEPT);

	rcu_read_unlock();
	return NETDEV_TX_OK;
}

static struct net_device *get_imq_device_by_index(int index)
{
	struct net_device *dev = NULL;
	struct net *net;
	char buf[8];

	/* get device by name and cache result */
	snprintf(buf, sizeof(buf), "imq%d", index);

	/* Search device from all namespaces. */
	for_each_net(net) {
		dev = dev_get_by_name(net, buf);
		if (dev)
			break;
	}

	if (WARN_ON_ONCE(dev == NULL)) {
		/* IMQ device not found. Exotic config? */
		return ERR_PTR(-ENODEV);
	}

	imq_devs_cache[index] = dev;
	dev_put(dev);

	return dev;
}

static struct nf_queue_entry *nf_queue_entry_dup(struct nf_queue_entry *e)
{
	struct nf_queue_entry *entry = kmemdup(e, e->size, GFP_ATOMIC);

	if (entry) {
		nf_queue_entry_get_refs(entry);
		return entry;
	}
	return NULL;
}

#ifdef CONFIG_BRIDGE_NETFILTER
/* When called from bridge netfilter, skb->data must point to MAC header
 * before calling skb_gso_segment(). Else, original MAC header is lost
 * and segmented skbs will be sent to wrong destination.
 */
static void nf_bridge_adjust_skb_data(struct sk_buff *skb)
{
	if (skb->nf_bridge)
		__skb_push(skb, skb->network_header - skb->mac_header);
}

static void nf_bridge_adjust_segmented_data(struct sk_buff *skb)
{
	if (skb->nf_bridge)
		__skb_pull(skb, skb->network_header - skb->mac_header);
}
#else
#define nf_bridge_adjust_skb_data(s) do {} while (0)
#define nf_bridge_adjust_segmented_data(s) do {} while (0)
#endif

static void free_entry(struct nf_queue_entry *entry)
{
	nf_queue_entry_release_refs(entry);
	kfree(entry);
}

static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev);

static int __imq_nf_queue_gso(struct nf_queue_entry *entry,
			      struct net_device *dev, struct sk_buff *skb)
{
	int ret = -ENOMEM;
	struct nf_queue_entry *entry_seg;

	nf_bridge_adjust_segmented_data(skb);

	if (skb->next == NULL) { /* last packet, no need to copy entry */
		struct sk_buff *gso_skb = entry->skb;

		entry->skb = skb;
		ret = __imq_nf_queue(entry, dev);
		if (ret)
			entry->skb = gso_skb;
		return ret;
	}

	skb->next = NULL;

	entry_seg = nf_queue_entry_dup(entry);
	if (entry_seg) {
		entry_seg->skb = skb;
		ret = __imq_nf_queue(entry_seg, dev);
		if (ret)
			free_entry(entry_seg);
	}
	return ret;
}
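
/*
 * nf_queue handler: entry point for packets that imq_nf_hook() diverted
 * with NF_IMQ_QUEUE. Resolves the target imqN device from the skb's IMQ
 * flags and segments GSO packets, handing each resulting skb to
 * __imq_nf_queue().
 */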
static int imq_nf_queue(struct nf_queue_entry *entry, unsigned int queue_num)
{
	struct sk_buff *skb, *segs;
	struct net_device *dev;
	unsigned int queued;
	int index, retval, err;

	index = entry->skb->imq_flags & IMQ_F_IFMASK;
	if (unlikely(index > numdevs - 1)) {
		if (net_ratelimit())
			pr_warn("IMQ: invalid device specified, highest is %d\n",
				numdevs - 1);
		retval = -EINVAL;
		goto out_no_dev;
	}

	/* check for imq device by index from cache */
	dev = imq_devs_cache[index];
	if (unlikely(!dev)) {
		dev = get_imq_device_by_index(index);
		if (IS_ERR(dev)) {
			retval = PTR_ERR(dev);
			goto out_no_dev;
		}
	}

	if (unlikely(!(dev->flags & IFF_UP))) {
		entry->skb->imq_flags = 0;
		retval = -ECANCELED;
		goto out_no_dev;
	}

	/* Since 3.10.x, GSO handling moved here as a result of upstream commit
	 * a5fedd43d5f6c94c71053a66e4c3d2e35f1731a2 (netfilter: move
	 * skb_gso_segment into nfnetlink_queue module).
	 *
	 * The following code replicates the GSO handling from
	 * 'net/netfilter/nfnetlink_queue_core.c':nfqnl_enqueue_packet().
	 */
	skb = entry->skb;

	switch (entry->state.pf) {
	case NFPROTO_IPV4:
		skb->protocol = htons(ETH_P_IP);
		break;
	case NFPROTO_IPV6:
		skb->protocol = htons(ETH_P_IPV6);
		break;
	}

	if (!skb_is_gso(entry->skb))
		return __imq_nf_queue(entry, dev);

	nf_bridge_adjust_skb_data(skb);
	segs = skb_gso_segment(skb, 0);
	/* Does not use PTR_ERR to limit the number of error codes that can be
	 * returned by nf_queue. For instance, callers rely on -ECANCELED to
	 * mean 'ignore this hook'.
	 */
	err = -ENOBUFS;
	if (IS_ERR(segs))
		goto out_err;
	queued = 0;
	err = 0;
	do {
		struct sk_buff *nskb = segs->next;

		if (nskb && nskb->next)
			nskb->cb_next = NULL;
		if (err == 0)
			err = __imq_nf_queue_gso(entry, dev, segs);
		if (err == 0)
			queued++;
		else
			kfree_skb(segs);
		segs = nskb;
	} while (segs);

	if (queued) {
		if (err) /* some segments are already queued */
			free_entry(entry);
		kfree_skb(skb);
		return 0;
	}

out_err:
	nf_bridge_adjust_segmented_data(skb);
	retval = err;
out_no_dev:
	return retval;
}
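
/*
 * Core of the pseudo-driver: account the skb as received by the IMQ
 * device, enqueue it to the device's qdisc (backing up skb->cb, which
 * the qdisc layer overwrites), then immediately try to dequeue and
 * transmit whatever the qdisc releases.
 */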
static int __imq_nf_queue(struct nf_queue_entry *entry, struct net_device *dev)
{
	struct sk_buff *skb_orig, *skb, *skb_shared, *skb_popd;
	struct Qdisc *q;
	struct sk_buff *to_free = NULL;
	struct netdev_queue *txq;
	spinlock_t *root_lock;
	int users;
	int retval = -EINVAL;
	unsigned int orig_queue_index;

	dev->last_rx = jiffies;

	skb = entry->skb;
	skb_orig = NULL;

	/* skb has owner? => make clone */
	if (unlikely(skb->destructor)) {
		skb_orig = skb;
		skb = skb_clone(skb, GFP_ATOMIC);
		if (unlikely(!skb)) {
			retval = -ENOMEM;
			goto out;
		}
		skb->cb_next = NULL;
		entry->skb = skb;
	}

	dev->stats.rx_bytes += skb->len;
	dev->stats.rx_packets++;

	if (!skb->dev) {
		/* skb->dev == NULL causes problems; try to find the cause. */
		if (net_ratelimit()) {
			dev_warn(&dev->dev,
				 "received packet with skb->dev == NULL\n");
			dump_stack();
		}

		skb->dev = dev;
	}

	/* Disables softirqs for lock below */
	rcu_read_lock_bh();

	/* Multi-queue selection */
	orig_queue_index = skb_get_queue_mapping(skb);
	txq = imq_select_queue(dev, skb);

	q = rcu_dereference(txq->qdisc);
	if (unlikely(!q->enqueue))
		goto packet_not_eaten_by_imq_dev;

	skb->nf_queue_entry = entry;
	root_lock = qdisc_lock(q);
	spin_lock(root_lock);

	users = refcount_read(&skb->users);

	skb_shared = skb_get(skb); /* increase reference count by one */

	/* backup skb->cb, as qdisc layer will overwrite it */
	skb_save_cb(skb_shared);
	qdisc_enqueue_root(skb_shared, q, &to_free); /* might kfree_skb */
	if (likely(refcount_read(&skb_shared->users) == users + 1)) {
		bool validate;

		kfree_skb(skb_shared); /* decrease reference count by one */

		skb->destructor = &imq_skb_destructor;

		skb_popd = qdisc_dequeue_skb(q, &validate);

		/* cloned? */
		if (unlikely(skb_orig))
			kfree_skb(skb_orig); /* free original */

		spin_unlock(root_lock);

#if 0
		/* schedule qdisc dequeue */
		__netif_schedule(q);
#else
		if (likely(skb_popd)) {
			/* Note that we validate skb (GSO, checksum, ...) outside of locks */
			if (validate)
				skb_popd = validate_xmit_skb_list(skb_popd, dev);

			if (skb_popd) {
				int dummy_ret;
				int cpu = smp_processor_id(); /* ok because BHs are off */

				txq = skb_get_tx_queue(dev, skb_popd);
				/* The IMQ device will not be frozen or stopped,
				 * and transmission always succeeds, so we do not
				 * need to check its state or the xmit return
				 * value; this speeds things up.
				 */
				if (imq_dev_accurate_stats && txq->xmit_lock_owner != cpu) {
					HARD_TX_LOCK(dev, txq, cpu);
					if (!netif_xmit_frozen_or_stopped(txq))
						dev_hard_start_xmit(skb_popd, dev, txq, &dummy_ret);
					HARD_TX_UNLOCK(dev, txq);
				} else {
					if (!netif_xmit_frozen_or_stopped(txq))
						dev_hard_start_xmit(skb_popd, dev, txq, &dummy_ret);
				}
			}
		} else {
			/* No skb ready; schedule the qdisc to run later. */
			__netif_schedule(q);
		}
#endif
		rcu_read_unlock_bh();
		retval = 0;
		goto out;
	} else {
		skb_restore_cb(skb_shared); /* restore skb->cb */
		skb->nf_queue_entry = NULL;
		/* The qdisc dropped the packet and already decreased the
		 * skb's reference count, so we must not free it again here;
		 * that would actually destroy the skb.
		 */
		spin_unlock(root_lock);
		goto packet_not_eaten_by_imq_dev;
	}

packet_not_eaten_by_imq_dev:
	skb_set_queue_mapping(skb, orig_queue_index);
	rcu_read_unlock_bh();

	/* cloned? restore original */
	if (unlikely(skb_orig)) {
		kfree_skb(skb);
		entry->skb = skb_orig;
	}
	retval = -1;
out:
	if (unlikely(to_free))
		kfree_skb_list(to_free);
	return retval;
}
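
/*
 * Netfilter hook: divert packets that the IMQ iptables target flagged
 * with IMQ_F_ENQUEUE to the queue handler; everything else is accepted
 * untouched.
 */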
static unsigned int imq_nf_hook(void *priv,
				struct sk_buff *skb,
				const struct nf_hook_state *state)
{
	return (skb->imq_flags & IMQ_F_ENQUEUE) ? NF_IMQ_QUEUE : NF_ACCEPT;
}

static int imq_close(struct net_device *dev)
{
	netif_stop_queue(dev);
	return 0;
}

static int imq_open(struct net_device *dev)
{
	netif_start_queue(dev);
	return 0;
}

static struct device_type imq_device_type = {
	.name = "imq",
};

static const struct net_device_ops imq_netdev_ops = {
	.ndo_open	= imq_open,
	.ndo_stop	= imq_close,
	.ndo_start_xmit	= imq_dev_xmit,
	.ndo_get_stats	= imq_get_stats,
};

static void imq_setup(struct net_device *dev)
{
	dev->netdev_ops		= &imq_netdev_ops;
	dev->type		= ARPHRD_VOID;
	dev->mtu		= 16000; /* too small? */
	dev->tx_queue_len	= 11000; /* too big? */
	dev->flags		= IFF_NOARP;
	dev->features		= NETIF_F_SG | NETIF_F_FRAGLIST |
				  NETIF_F_GSO | NETIF_F_HW_CSUM |
				  NETIF_F_HIGHDMA;
	dev->priv_flags		&= ~(IFF_XMIT_DST_RELEASE |
				     IFF_TX_SKB_SHARING);
}

static int imq_validate(struct nlattr *tb[], struct nlattr *data[],
			struct netlink_ext_ack *extack)
{
	int ret = 0;

	if (tb[IFLA_ADDRESS]) {
		if (nla_len(tb[IFLA_ADDRESS]) != ETH_ALEN) {
			ret = -EINVAL;
			goto end;
		}
		if (!is_valid_ether_addr(nla_data(tb[IFLA_ADDRESS]))) {
			ret = -EADDRNOTAVAIL;
			goto end;
		}
	}
	return 0;
end:
	pr_warn("IMQ: imq_validate failed (%d)\n", ret);
	return ret;
}

static struct rtnl_link_ops imq_link_ops __read_mostly = {
	.kind		= "imq",
	.priv_size	= 0,
	.setup		= imq_setup,
	.validate	= imq_validate,
};
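
/*
 * Note: registering rtnl_link_ops with kind "imq" should also make it
 * possible to create extra devices at runtime, e.g.
 * "ip link add imq2 type imq" (an assumption based on the generic
 * rtnetlink API; the module normally creates its devices at load time).
 */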

static const struct nf_queue_handler imq_nfqh = {
	.outfn = imq_nf_queue,
};

static int __net_init imq_nf_register(struct net *net)
{
	return nf_register_net_hooks(net, imq_ops,
				     ARRAY_SIZE(imq_ops));
}

static void __net_exit imq_nf_unregister(struct net *net)
{
	nf_unregister_net_hooks(net, imq_ops,
				ARRAY_SIZE(imq_ops));
}

static struct pernet_operations imq_net_ops = {
	.init	= imq_nf_register,
	.exit	= imq_nf_unregister,
};

static int __net_init imq_init_hooks(void)
{
	int ret;

	nf_register_queue_imq_handler(&imq_nfqh);

	ret = register_pernet_subsys(&imq_net_ops);
	if (ret < 0)
		nf_unregister_queue_imq_handler();

	return ret;
}

#ifdef CONFIG_LOCKDEP
static struct lock_class_key imq_netdev_addr_lock_key;

static void __init imq_dev_set_lockdep_one(struct net_device *dev,
					   struct netdev_queue *txq, void *arg)
{
	/*
	 * The IMQ transmit locks can be taken recursively, for example with
	 * one IMQ rule for input and one for output network devices in
	 * iptables. Until we find a better solution, ignore them.
	 */
	lockdep_set_novalidate_class(&txq->_xmit_lock);
}

static void imq_dev_set_lockdep_class(struct net_device *dev)
{
	lockdep_set_class_and_name(&dev->addr_list_lock,
				   &imq_netdev_addr_lock_key, "_xmit_addr_IMQ");
	netdev_for_each_tx_queue(dev, imq_dev_set_lockdep_one, NULL);
}
#else
static inline void imq_dev_set_lockdep_class(struct net_device *dev)
{
}
#endif

static int __init imq_init_one(int index)
{
	struct net_device *dev;
	int ret;

	dev = alloc_netdev_mq(0, "imq%d", NET_NAME_UNKNOWN, imq_setup, numqueues);
	if (!dev)
		return -ENOMEM;

	ret = dev_alloc_name(dev, dev->name);
	if (ret < 0)
		goto fail;

	dev->rtnl_link_ops = &imq_link_ops;
	SET_NETDEV_DEVTYPE(dev, &imq_device_type);
	ret = register_netdevice(dev);
	if (ret < 0)
		goto fail;

	imq_dev_set_lockdep_class(dev);

	return 0;
fail:
	free_netdev(dev);
	return ret;
}

static int __init imq_init_devs(void)
{
	int err, i;

	if (numdevs < 1 || numdevs > IMQ_MAX_DEVS) {
		pr_err("IMQ: numdevs has to be between 1 and %u\n",
		       IMQ_MAX_DEVS);
		return -EINVAL;
	}

	if (numqueues < 1 || numqueues > IMQ_MAX_QUEUES) {
		pr_err("IMQ: numqueues has to be between 1 and %u\n",
		       IMQ_MAX_QUEUES);
		return -EINVAL;
	}

	get_random_bytes(&imq_hashrnd, sizeof(imq_hashrnd));

	rtnl_lock();
	err = __rtnl_link_register(&imq_link_ops);

	for (i = 0; i < numdevs && !err; i++)
		err = imq_init_one(i);

	if (err) {
		__rtnl_link_unregister(&imq_link_ops);
		memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
	}
	rtnl_unlock();

	return err;
}

static int __init imq_init_module(void)
{
	int err;

#if defined(CONFIG_IMQ_NUM_DEVS)
	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS > 16);
	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS < 2);
	BUILD_BUG_ON(CONFIG_IMQ_NUM_DEVS - 1 > IMQ_F_IFMASK);
#endif

	err = imq_init_devs();
	if (err) {
		pr_err("IMQ: Error trying imq_init_devs()\n");
		return err;
	}

	err = imq_init_hooks();
	if (err) {
		pr_err("IMQ: Error trying imq_init_hooks()\n");
		rtnl_link_unregister(&imq_link_ops);
		memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
		return err;
	}

	pr_info("IMQ driver loaded successfully. (numdevs = %d, numqueues = %d, imq_dev_accurate_stats = %d)\n",
		numdevs, numqueues, imq_dev_accurate_stats);

#if defined(CONFIG_IMQ_BEHAVIOR_BA) || defined(CONFIG_IMQ_BEHAVIOR_BB)
	pr_info("\tHooking IMQ before NAT on PREROUTING.\n");
#else
	pr_info("\tHooking IMQ after NAT on PREROUTING.\n");
#endif
#if defined(CONFIG_IMQ_BEHAVIOR_AB) || defined(CONFIG_IMQ_BEHAVIOR_BB)
	pr_info("\tHooking IMQ before NAT on POSTROUTING.\n");
#else
	pr_info("\tHooking IMQ after NAT on POSTROUTING.\n");
#endif

	return 0;
}

static void __exit imq_unhook(void)
{
	unregister_pernet_subsys(&imq_net_ops);
	nf_unregister_queue_imq_handler();
}

static void __exit imq_cleanup_devs(void)
{
	rtnl_link_unregister(&imq_link_ops);
	memset(imq_devs_cache, 0, sizeof(imq_devs_cache));
}

static void __exit imq_exit_module(void)
{
	imq_unhook();
	imq_cleanup_devs();
	pr_info("IMQ driver unloaded successfully.\n");
}

module_init(imq_init_module);
module_exit(imq_exit_module);

module_param(numdevs, int, 0);
module_param(numqueues, int, 0);
module_param(imq_dev_accurate_stats, int, 0);
MODULE_PARM_DESC(numdevs, "number of IMQ devices (how many imq* devices will be created)");
MODULE_PARM_DESC(numqueues, "number of queues per IMQ device");
MODULE_PARM_DESC(imq_dev_accurate_stats, "Set to 0 to disable accurate IMQ device stats");
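
/*
 * Example module load (illustrative parameter values): two devices with
 * four queues each, accurate device stats disabled:
 *
 *	modprobe imq numdevs=2 numqueues=4 imq_dev_accurate_stats=0
 */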

MODULE_AUTHOR("https://github.com/imq/linuximq");
MODULE_DESCRIPTION("Pseudo-driver for the intermediate queue device. See https://github.com/imq/linuximq/wiki for more information.");
MODULE_LICENSE("GPL");
MODULE_ALIAS_RTNL_LINK("imq");