Resource exhaustion in IP fragment reassembly.

Corresponding upstream changeset from
https://www.freebsd.org/security/advisories/FreeBSD-SA-18:10.ip.asc.
commit c476ff78a9 (parent 30c2a48ca1)
Author: fengbojiang (姜凤波)
Date: 2019-11-22 21:06:30 +08:00

5 changed files with 425 additions and 147 deletions

File 1 of 5: IPv4 fragment reassembly (ip_reass)

@@ -42,6 +42,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/hash.h>
 #include <sys/mbuf.h>
 #include <sys/malloc.h>
+#include <sys/limits.h>
 #include <sys/lock.h>
 #include <sys/mutex.h>
 #include <sys/sysctl.h>
@@ -63,13 +64,14 @@ SYSCTL_DECL(_net_inet_ip);
 /*
  * Reassembly headers are stored in hash buckets.
  */
-#define IPREASS_NHASH_LOG2  6
+#define IPREASS_NHASH_LOG2  10
 #define IPREASS_NHASH       (1 << IPREASS_NHASH_LOG2)
 #define IPREASS_HMASK       (IPREASS_NHASH - 1)
 
 struct ipqbucket {
     TAILQ_HEAD(ipqhead, ipq) head;
     struct mtx       lock;
+    int              count;
 };
 
 static VNET_DEFINE(struct ipqbucket, ipq[IPREASS_NHASH]);
@@ -82,6 +84,9 @@ static VNET_DEFINE(uint32_t, ipq_hashseed);
 #define IPQ_UNLOCK(i)       mtx_unlock(&V_ipq[i].lock)
 #define IPQ_LOCK_ASSERT(i)  mtx_assert(&V_ipq[i].lock, MA_OWNED)
 
+static VNET_DEFINE(int, ipreass_maxbucketsize);
+#define V_ipreass_maxbucketsize VNET(ipreass_maxbucketsize)
+
 void ipreass_init(void);
 void ipreass_drain(void);
 void ipreass_slowtimo(void);
@@ -89,27 +94,53 @@ void ipreass_slowtimo(void);
 void ipreass_destroy(void);
 #endif
 static int  sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS);
+static int  sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS);
 static void ipreass_zone_change(void *);
 static void ipreass_drain_tomax(void);
-static void ipq_free(struct ipqhead *, struct ipq *);
+static void ipq_free(struct ipqbucket *, struct ipq *);
 static struct ipq * ipq_reuse(int);
 
 static inline void
-ipq_timeout(struct ipqhead *head, struct ipq *fp)
+ipq_timeout(struct ipqbucket *bucket, struct ipq *fp)
 {
 
     IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
-    ipq_free(head, fp);
+    ipq_free(bucket, fp);
 }
 
 static inline void
-ipq_drop(struct ipqhead *head, struct ipq *fp)
+ipq_drop(struct ipqbucket *bucket, struct ipq *fp)
 {
 
     IPSTAT_ADD(ips_fragdropped, fp->ipq_nfrags);
-    ipq_free(head, fp);
+    ipq_free(bucket, fp);
 }
 
+/*
+ * By default, limit the number of IP fragments across all reassembly
+ * queues to 1/32 of the total number of mbuf clusters.
+ *
+ * Limit the total number of reassembly queues per VNET to the
+ * IP fragment limit, but ensure the limit will not allow any bucket
+ * to grow above 100 items. (The bucket limit is
+ * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
+ * multiplier to reach a 100-item limit.)
+ * The 100-item limit was chosen as brief testing seems to show that
+ * this produces "reasonable" performance on some subset of systems
+ * under DoS attack.
+ */
+#define IP_MAXFRAGS         (nmbclusters / 32)
+#define IP_MAXFRAGPACKETS   (imin(IP_MAXFRAGS, IPREASS_NHASH * 50))
+
+static int              maxfrags;
+static volatile u_int   nfrags;
+SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfrags, CTLFLAG_RW,
+    &maxfrags, 0,
+    "Maximum number of IPv4 fragments allowed across all reassembly queues");
+SYSCTL_UINT(_net_inet_ip, OID_AUTO, curfrags, CTLFLAG_RD,
+    __DEVOLATILE(u_int *, &nfrags), 0,
+    "Current number of IPv4 fragments across all reassembly queues");
+
 static VNET_DEFINE(uma_zone_t, ipq_zone);
 #define V_ipq_zone      VNET(ipq_zone)
 SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragpackets, CTLFLAG_VNET |
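A quick check of the arithmetic in the new comment: with IPREASS_NHASH_LOG2 raised to 10 there are 1024 buckets, and capping IP_MAXFRAGPACKETS at IPREASS_NHASH * 50 makes the per-bucket limit derived later in ipreass_init() come out to at most 100. A minimal user-space sketch of the same math (the nmbclusters value is an assumed example, and imin()/imax() stand in for the kernel's libkern versions):

#include <stdio.h>

#define IPREASS_NHASH_LOG2  10
#define IPREASS_NHASH       (1 << IPREASS_NHASH_LOG2)   /* 1024 buckets */

static int imin(int a, int b) { return (a < b ? a : b); }
static int imax(int a, int b) { return (a > b ? a : b); }

int
main(void)
{
    int nmbclusters = 4194304;              /* assumed example value */
    int maxfrags = nmbclusters / 32;        /* IP_MAXFRAGS */
    int maxfragpackets = imin(maxfrags, IPREASS_NHASH * 50); /* IP_MAXFRAGPACKETS */
    /* Same derivation as ipreass_init(): at most 100 entries per bucket. */
    int maxbucketsize = imax(maxfragpackets / (IPREASS_NHASH / 2), 1);

    printf("maxfrags=%d maxfragpackets=%d maxbucketsize=%d\n",
        maxfrags, maxfragpackets, maxbucketsize);   /* 131072 51200 100 */
    return (0);
}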
@@ -127,6 +158,10 @@ static VNET_DEFINE(int, maxfragsperpacket);
 SYSCTL_INT(_net_inet_ip, OID_AUTO, maxfragsperpacket, CTLFLAG_VNET | CTLFLAG_RW,
     &VNET_NAME(maxfragsperpacket), 0,
     "Maximum number of IPv4 fragments allowed per packet");
+SYSCTL_PROC(_net_inet_ip, OID_AUTO, maxfragbucketsize,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_MPSAFE | CTLFLAG_RW, NULL, 0,
+    sysctl_maxfragbucketsize, "I",
+    "Maximum number of IPv4 fragment reassembly queue entries per bucket");
 
 /*
  * Take incoming datagram fragment and try to reassemble it into
@@ -146,9 +181,9 @@ ip_reass(struct mbuf *m)
     struct mbuf *p, *q, *nq, *t;
     struct ipq *fp;
     struct ipqhead *head;
-    int i, hlen, next;
+    int i, hlen, next, tmpmax;
     u_int8_t ecn, ecn0;
-    uint32_t hash;
+    uint32_t hash, hashkey[3];
 #ifdef RSS
     uint32_t rss_hash, rss_type;
 #endif
@@ -156,8 +191,12 @@ ip_reass(struct mbuf *m)
     /*
      * If no reassembling or maxfragsperpacket are 0,
      * never accept fragments.
+     * Also, drop packet if it would exceed the maximum
+     * number of fragments.
      */
-    if (V_noreass == 1 || V_maxfragsperpacket == 0) {
+    tmpmax = maxfrags;
+    if (V_noreass == 1 || V_maxfragsperpacket == 0 ||
+        (tmpmax >= 0 && nfrags >= (u_int)tmpmax)) {
         IPSTAT_INC(ips_fragments);
         IPSTAT_INC(ips_fragdropped);
         m_freem(m);
@@ -202,8 +241,12 @@ ip_reass(struct mbuf *m)
     m->m_data += hlen;
     m->m_len -= hlen;
 
-    hash = ip->ip_src.s_addr ^ ip->ip_id;
-    hash = jenkins_hash32(&hash, 1, V_ipq_hashseed) & IPREASS_HMASK;
+    hashkey[0] = ip->ip_src.s_addr;
+    hashkey[1] = ip->ip_dst.s_addr;
+    hashkey[2] = (uint32_t)ip->ip_p << 16;
+    hashkey[2] += ip->ip_id;
+    hash = jenkins_hash32(hashkey, nitems(hashkey), V_ipq_hashseed);
+    hash &= IPREASS_HMASK;
     head = &V_ipq[hash].head;
     IPQ_LOCK(hash);
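The replaced code hashed only ip_src ^ ip_id, so an off-path attacker who controls the IP ID could aim every forged fragment at a single bucket; the new three-word key covers source address, destination address, protocol, and ID, mixed under a per-boot random seed. A user-space sketch of the same key layout (mix32() is a simple stand-in mixer, not the kernel's jenkins_hash32(), and all header fields are assumed example values):

#include <stdint.h>
#include <stdio.h>

#define IPREASS_NHASH_LOG2  10
#define IPREASS_HMASK       ((1 << IPREASS_NHASH_LOG2) - 1)

/* Stand-in word mixer; the kernel uses jenkins_hash32() from sys/hash.h. */
static uint32_t
mix32(const uint32_t *key, int nwords, uint32_t seed)
{
    uint32_t h = seed;

    while (nwords-- > 0) {
        h ^= *key++;
        h *= 0x9e3779b1;    /* multiplicative mixing constant */
        h ^= h >> 15;
    }
    return (h);
}

int
main(void)
{
    uint32_t ip_src = 0xc0a80101, ip_dst = 0xc0a80102; /* 192.168.1.1 -> .2 */
    uint16_t ip_id = 0x1234;
    uint8_t ip_p = 17;              /* IPPROTO_UDP */
    uint32_t seed = 0xdeadbeef;     /* kernel: arc4random() at init */
    uint32_t hashkey[3];

    hashkey[0] = ip_src;
    hashkey[1] = ip_dst;
    hashkey[2] = ((uint32_t)ip_p << 16) + ip_id;
    printf("bucket=%u\n", mix32(hashkey, 3, seed) & IPREASS_HMASK);
    return (0);
}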
@@ -224,9 +267,12 @@ ip_reass(struct mbuf *m)
      * If first fragment to arrive, create a reassembly queue.
      */
     if (fp == NULL) {
-        fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
+        if (V_ipq[hash].count < V_ipreass_maxbucketsize)
+            fp = uma_zalloc(V_ipq_zone, M_NOWAIT);
         if (fp == NULL)
             fp = ipq_reuse(hash);
+        if (fp == NULL)
+            goto dropfrag;
 #ifdef MAC
         if (mac_ipq_init(fp, M_NOWAIT) != 0) {
             uma_zfree(V_ipq_zone, fp);
@@ -236,7 +282,9 @@ ip_reass(struct mbuf *m)
         mac_ipq_create(m, fp);
 #endif
         TAILQ_INSERT_HEAD(head, fp, ipq_list);
+        V_ipq[hash].count++;
         fp->ipq_nfrags = 1;
+        atomic_add_int(&nfrags, 1);
         fp->ipq_ttl = IPFRAGTTL;
         fp->ipq_p = ip->ip_p;
         fp->ipq_id = ip->ip_id;
@@ -247,6 +295,7 @@ ip_reass(struct mbuf *m)
         goto done;
     } else {
         fp->ipq_nfrags++;
+        atomic_add_int(&nfrags, 1);
 #ifdef MAC
         mac_ipq_update(m, fp);
 #endif
@@ -323,6 +372,7 @@ ip_reass(struct mbuf *m)
         m->m_nextpkt = nq;
         IPSTAT_INC(ips_fragdropped);
         fp->ipq_nfrags--;
+        atomic_subtract_int(&nfrags, 1);
         m_freem(q);
     }
@@ -340,7 +390,7 @@ ip_reass(struct mbuf *m)
     for (p = NULL, q = fp->ipq_frags; q; p = q, q = q->m_nextpkt) {
         if (ntohs(GETIP(q)->ip_off) != next) {
             if (fp->ipq_nfrags > V_maxfragsperpacket)
-                ipq_drop(head, fp);
+                ipq_drop(&V_ipq[hash], fp);
             goto done;
         }
         next += ntohs(GETIP(q)->ip_len);
@@ -348,7 +398,7 @@ ip_reass(struct mbuf *m)
     /* Make sure the last packet didn't have the IP_MF flag */
     if (p->m_flags & M_IP_FRAG) {
         if (fp->ipq_nfrags > V_maxfragsperpacket)
-            ipq_drop(head, fp);
+            ipq_drop(&V_ipq[hash], fp);
         goto done;
     }
@@ -359,7 +409,7 @@ ip_reass(struct mbuf *m)
     ip = GETIP(q);
     if (next + (ip->ip_hl << 2) > IP_MAXPACKET) {
         IPSTAT_INC(ips_toolong);
-        ipq_drop(head, fp);
+        ipq_drop(&V_ipq[hash], fp);
         goto done;
     }
@@ -387,6 +437,7 @@ ip_reass(struct mbuf *m)
     while (m->m_pkthdr.csum_data & 0xffff0000)
         m->m_pkthdr.csum_data = (m->m_pkthdr.csum_data & 0xffff) +
             (m->m_pkthdr.csum_data >> 16);
+    atomic_subtract_int(&nfrags, fp->ipq_nfrags);
 #ifdef MAC
     mac_ipq_reassemble(fp, m);
     mac_ipq_destroy(fp);
@@ -401,6 +452,7 @@ ip_reass(struct mbuf *m)
     ip->ip_src = fp->ipq_src;
     ip->ip_dst = fp->ipq_dst;
     TAILQ_REMOVE(head, fp, ipq_list);
+    V_ipq[hash].count--;
     uma_zfree(V_ipq_zone, fp);
     m->m_len += (ip->ip_hl << 2);
     m->m_data -= (ip->ip_hl << 2);
@@ -446,8 +498,10 @@ ip_reass(struct mbuf *m)
 
 dropfrag:
     IPSTAT_INC(ips_fragdropped);
-    if (fp != NULL)
+    if (fp != NULL) {
         fp->ipq_nfrags--;
+        atomic_subtract_int(&nfrags, 1);
+    }
     m_freem(m);
 done:
     IPQ_UNLOCK(hash);
@@ -462,21 +516,27 @@ done:
 void
 ipreass_init(void)
 {
+    int max;
 
     for (int i = 0; i < IPREASS_NHASH; i++) {
         TAILQ_INIT(&V_ipq[i].head);
         mtx_init(&V_ipq[i].lock, "IP reassembly", NULL,
             MTX_DEF | MTX_DUPOK);
+        V_ipq[i].count = 0;
     }
     V_ipq_hashseed = arc4random();
     V_maxfragsperpacket = 16;
     V_ipq_zone = uma_zcreate("ipq", sizeof(struct ipq), NULL, NULL, NULL,
         NULL, UMA_ALIGN_PTR, 0);
-    uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
+    max = IP_MAXFRAGPACKETS;
+    max = uma_zone_set_max(V_ipq_zone, max);
+    V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
 
-    if (IS_DEFAULT_VNET(curvnet))
+    if (IS_DEFAULT_VNET(curvnet)) {
+        maxfrags = IP_MAXFRAGS;
         EVENTHANDLER_REGISTER(nmbclusters_change, ipreass_zone_change,
             NULL, EVENTHANDLER_PRI_ANY);
+    }
 }
 
 /*
@@ -491,7 +551,7 @@ ipreass_slowtimo(void)
         IPQ_LOCK(i);
         TAILQ_FOREACH_SAFE(fp, &V_ipq[i].head, ipq_list, tmp)
             if (--fp->ipq_ttl == 0)
-                ipq_timeout(&V_ipq[i].head, fp);
+                ipq_timeout(&V_ipq[i], fp);
         IPQ_UNLOCK(i);
     }
 }
@@ -506,7 +566,10 @@ ipreass_drain(void)
     for (int i = 0; i < IPREASS_NHASH; i++) {
         IPQ_LOCK(i);
         while(!TAILQ_EMPTY(&V_ipq[i].head))
-            ipq_drop(&V_ipq[i].head, TAILQ_FIRST(&V_ipq[i].head));
+            ipq_drop(&V_ipq[i], TAILQ_FIRST(&V_ipq[i].head));
+        KASSERT(V_ipq[i].count == 0,
+            ("%s: V_ipq[%d] count %d (V_ipq=%p)", __func__, i,
+             V_ipq[i].count, V_ipq));
         IPQ_UNLOCK(i);
     }
 }
@@ -534,8 +597,22 @@ ipreass_destroy(void)
 static void
 ipreass_drain_tomax(void)
 {
+    struct ipq *fp;
     int target;
 
+    /*
+     * Make sure each bucket is under the new limit. If
+     * necessary, drop enough of the oldest elements from
+     * each bucket to get under the new limit.
+     */
+    for (int i = 0; i < IPREASS_NHASH; i++) {
+        IPQ_LOCK(i);
+        while (V_ipq[i].count > V_ipreass_maxbucketsize &&
+            (fp = TAILQ_LAST(&V_ipq[i].head, ipqhead)) != NULL)
+            ipq_timeout(&V_ipq[i], fp);
+        IPQ_UNLOCK(i);
+    }
+
     /*
      * If we are over the maximum number of fragments,
      * drain off enough to get down to the new limit,
@@ -544,13 +621,11 @@ ipreass_drain_tomax(void)
      */
     target = uma_zone_get_max(V_ipq_zone);
     while (uma_zone_get_cur(V_ipq_zone) > target) {
-        struct ipq *fp;
-
         for (int i = 0; i < IPREASS_NHASH; i++) {
             IPQ_LOCK(i);
             fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
             if (fp != NULL)
-                ipq_timeout(&V_ipq[i].head, fp);
+                ipq_timeout(&V_ipq[i], fp);
             IPQ_UNLOCK(i);
         }
     }
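The first loop added to ipreass_drain_tomax() trims each overfull bucket from the tail of its TAILQ, where the oldest queues sit, because new queues are inserted at the head. A self-contained sketch of that eviction pattern (toy entry type, not the kernel's struct ipq):

#include <sys/queue.h>
#include <stdio.h>
#include <stdlib.h>

struct entry {
    int id;
    TAILQ_ENTRY(entry) link;
};
TAILQ_HEAD(entryhead, entry);

int
main(void)
{
    struct entryhead head = TAILQ_HEAD_INITIALIZER(head);
    struct entry *e;
    int count = 0, maxbucketsize = 3;   /* assumed new, lower limit */

    for (int i = 0; i < 5; i++) {       /* newest entries go to the head */
        e = malloc(sizeof(*e));
        e->id = i;
        TAILQ_INSERT_HEAD(&head, e, link);
        count++;
    }
    /* Mirrors the per-bucket loop: evict from the tail (oldest first). */
    while (count > maxbucketsize &&
        (e = TAILQ_LAST(&head, entryhead)) != NULL) {
        TAILQ_REMOVE(&head, e, link);
        count--;
        printf("evicted oldest entry %d\n", e->id);
        free(e);
    }
    return (0);
}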
@@ -559,9 +634,20 @@ ipreass_drain_tomax(void)
 static void
 ipreass_zone_change(void *tag)
 {
+    VNET_ITERATOR_DECL(vnet_iter);
+    int max;
 
-    uma_zone_set_max(V_ipq_zone, nmbclusters / 32);
-    ipreass_drain_tomax();
+    maxfrags = IP_MAXFRAGS;
+    max = IP_MAXFRAGPACKETS;
+    VNET_LIST_RLOCK_NOSLEEP();
+    VNET_FOREACH(vnet_iter) {
+        CURVNET_SET(vnet_iter);
+        max = uma_zone_set_max(V_ipq_zone, max);
+        V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
+        ipreass_drain_tomax();
+        CURVNET_RESTORE();
+    }
+    VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 /*
@@ -589,6 +675,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
          * and place an extreme upper bound.
          */
         max = uma_zone_set_max(V_ipq_zone, max);
+        V_ipreass_maxbucketsize = imax(max / (IPREASS_NHASH / 2), 1);
         ipreass_drain_tomax();
         V_noreass = 0;
     } else if (max == 0) {
@@ -597,6 +684,7 @@ sysctl_maxfragpackets(SYSCTL_HANDLER_ARGS)
     } else if (max == -1) {
         V_noreass = 0;
         uma_zone_set_max(V_ipq_zone, 0);
+        V_ipreass_maxbucketsize = INT_MAX;
     } else
         return (EINVAL);
     return (0);
@@ -610,49 +698,72 @@ static struct ipq *
 ipq_reuse(int start)
 {
     struct ipq *fp;
-    int i;
+    int bucket, i;
 
     IPQ_LOCK_ASSERT(start);
 
-    for (i = start;; i++) {
-        if (i == IPREASS_NHASH)
-            i = 0;
-        if (i != start && IPQ_TRYLOCK(i) == 0)
+    for (i = 0; i < IPREASS_NHASH; i++) {
+        bucket = (start + i) % IPREASS_NHASH;
+        if (bucket != start && IPQ_TRYLOCK(bucket) == 0)
             continue;
-        fp = TAILQ_LAST(&V_ipq[i].head, ipqhead);
+        fp = TAILQ_LAST(&V_ipq[bucket].head, ipqhead);
         if (fp) {
             struct mbuf *m;
 
             IPSTAT_ADD(ips_fragtimeout, fp->ipq_nfrags);
+            atomic_subtract_int(&nfrags, fp->ipq_nfrags);
             while (fp->ipq_frags) {
                 m = fp->ipq_frags;
                 fp->ipq_frags = m->m_nextpkt;
                 m_freem(m);
             }
-            TAILQ_REMOVE(&V_ipq[i].head, fp, ipq_list);
-            if (i != start)
-                IPQ_UNLOCK(i);
-            IPQ_LOCK_ASSERT(start);
-            return (fp);
+            TAILQ_REMOVE(&V_ipq[bucket].head, fp, ipq_list);
+            V_ipq[bucket].count--;
+            if (bucket != start)
+                IPQ_UNLOCK(bucket);
+            break;
         }
-        if (i != start)
-            IPQ_UNLOCK(i);
+        if (bucket != start)
+            IPQ_UNLOCK(bucket);
     }
+    IPQ_LOCK_ASSERT(start);
+    return (fp);
 }
 
 /*
  * Free a fragment reassembly header and all associated datagrams.
  */
 static void
-ipq_free(struct ipqhead *fhp, struct ipq *fp)
+ipq_free(struct ipqbucket *bucket, struct ipq *fp)
 {
     struct mbuf *q;
 
+    atomic_subtract_int(&nfrags, fp->ipq_nfrags);
     while (fp->ipq_frags) {
         q = fp->ipq_frags;
         fp->ipq_frags = q->m_nextpkt;
         m_freem(q);
     }
-    TAILQ_REMOVE(fhp, fp, ipq_list);
+    TAILQ_REMOVE(&bucket->head, fp, ipq_list);
+    bucket->count--;
     uma_zfree(V_ipq_zone, fp);
 }
+
+/*
+ * Get or set the maximum number of reassembly queues per bucket.
+ */
+static int
+sysctl_maxfragbucketsize(SYSCTL_HANDLER_ARGS)
+{
+    int error, max;
+
+    max = V_ipreass_maxbucketsize;
+    error = sysctl_handle_int(oidp, &max, 0, req);
+    if (error || !req->newptr)
+        return (error);
+    if (max <= 0)
+        return (EINVAL);
+    V_ipreass_maxbucketsize = max;
+    ipreass_drain_tomax();
+    return (0);
+}
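On a FreeBSD system running a kernel with this change, the new IPv4 limits can be inspected from user space via the OIDs defined above. A short sketch using sysctlbyname(3) (read-only; error checking omitted for brevity):

#include <sys/types.h>
#include <sys/sysctl.h>
#include <stdio.h>

int
main(void)
{
    int maxfrags, bucketsize;
    u_int curfrags;
    size_t len;

    len = sizeof(maxfrags);
    sysctlbyname("net.inet.ip.maxfrags", &maxfrags, &len, NULL, 0);
    len = sizeof(curfrags);
    sysctlbyname("net.inet.ip.curfrags", &curfrags, &len, NULL, 0);
    len = sizeof(bucketsize);
    sysctlbyname("net.inet.ip.maxfragbucketsize", &bucketsize, &len,
        NULL, 0);
    printf("maxfrags=%d curfrags=%u maxfragbucketsize=%d\n",
        maxfrags, curfrags, bucketsize);
    return (0);
}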

File 2 of 5: IPv6 fragment reassembly (frag6)

@@ -36,6 +36,7 @@ __FBSDID("$FreeBSD$");
 #include <sys/param.h>
 #include <sys/systm.h>
+#include <sys/hash.h>
 #include <sys/malloc.h>
 #include <sys/mbuf.h>
 #include <sys/domain.h>
@@ -47,6 +48,8 @@ __FBSDID("$FreeBSD$");
 #include <sys/kernel.h>
 #include <sys/syslog.h>
 
+#include <machine/atomic.h>
+
 #include <net/if.h>
 #include <net/if_var.h>
 #include <net/netisr.h>
@@ -63,58 +66,112 @@ __FBSDID("$FreeBSD$");
 #include <security/mac/mac_framework.h>
 
-static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *);
-static void frag6_deq(struct ip6asfrag *);
-static void frag6_insque(struct ip6q *, struct ip6q *);
-static void frag6_remque(struct ip6q *);
-static void frag6_freef(struct ip6q *);
-
-static struct mtx ip6qlock;
 /*
- * These fields all protected by ip6qlock.
+ * Reassembly headers are stored in hash buckets.
  */
-static VNET_DEFINE(u_int, frag6_nfragpackets);
-static VNET_DEFINE(u_int, frag6_nfrags);
-static VNET_DEFINE(struct ip6q, ip6q);  /* ip6 reassemble queue */
+#define IP6REASS_NHASH_LOG2 10
+#define IP6REASS_NHASH      (1 << IP6REASS_NHASH_LOG2)
+#define IP6REASS_HMASK      (IP6REASS_NHASH - 1)
+
+static void frag6_enq(struct ip6asfrag *, struct ip6asfrag *,
+    uint32_t bucket __unused);
+static void frag6_deq(struct ip6asfrag *, uint32_t bucket __unused);
+static void frag6_insque_head(struct ip6q *, struct ip6q *,
+    uint32_t bucket);
+static void frag6_remque(struct ip6q *, uint32_t bucket);
+static void frag6_freef(struct ip6q *, uint32_t bucket);
+
+struct ip6qbucket {
+    struct ip6q ip6q;
+    struct mtx  lock;
+    int         count;
+};
+
+static VNET_DEFINE(volatile u_int, frag6_nfragpackets);
+volatile u_int frag6_nfrags = 0;
+static VNET_DEFINE(struct ip6qbucket, ip6q[IP6REASS_NHASH]);
+static VNET_DEFINE(uint32_t, ip6q_hashseed);
 
 #define V_frag6_nfragpackets    VNET(frag6_nfragpackets)
-#define V_frag6_nfrags          VNET(frag6_nfrags)
 #define V_ip6q                  VNET(ip6q)
+#define V_ip6q_hashseed         VNET(ip6q_hashseed)
 
-#define IP6Q_LOCK_INIT()    mtx_init(&ip6qlock, "ip6qlock", NULL, MTX_DEF);
-#define IP6Q_LOCK()         mtx_lock(&ip6qlock)
-#define IP6Q_TRYLOCK()      mtx_trylock(&ip6qlock)
-#define IP6Q_LOCK_ASSERT()  mtx_assert(&ip6qlock, MA_OWNED)
-#define IP6Q_UNLOCK()       mtx_unlock(&ip6qlock)
+#define IP6Q_LOCK(i)        mtx_lock(&V_ip6q[(i)].lock)
+#define IP6Q_TRYLOCK(i)     mtx_trylock(&V_ip6q[(i)].lock)
+#define IP6Q_LOCK_ASSERT(i) mtx_assert(&V_ip6q[(i)].lock, MA_OWNED)
+#define IP6Q_UNLOCK(i)      mtx_unlock(&V_ip6q[(i)].lock)
+#define IP6Q_HEAD(i)        (&V_ip6q[(i)].ip6q)
 
 static MALLOC_DEFINE(M_FTABLE, "fragment", "fragment reassembly header");
 
+/*
+ * By default, limit the number of IP6 fragments across all reassembly
+ * queues to 1/32 of the total number of mbuf clusters.
+ *
+ * Limit the total number of reassembly queues per VNET to the
+ * IP6 fragment limit, but ensure the limit will not allow any bucket
+ * to grow above 100 items. (The bucket limit is
+ * IP_MAXFRAGPACKETS / (IPREASS_NHASH / 2), so the 50 is the correct
+ * multiplier to reach a 100-item limit.)
+ * The 100-item limit was chosen as brief testing seems to show that
+ * this produces "reasonable" performance on some subset of systems
+ * under DoS attack.
+ */
+#define IP6_MAXFRAGS        (nmbclusters / 32)
+#define IP6_MAXFRAGPACKETS  (imin(IP6_MAXFRAGS, IP6REASS_NHASH * 50))
+
 /*
  * Initialise reassembly queue and fragment identifier.
  */
+void
+frag6_set_bucketsize()
+{
+    int i;
+
+    if ((i = V_ip6_maxfragpackets) > 0)
+        V_ip6_maxfragbucketsize = imax(i / (IP6REASS_NHASH / 2), 1);
+}
+
 static void
 frag6_change(void *tag)
 {
+    VNET_ITERATOR_DECL(vnet_iter);
 
-    V_ip6_maxfragpackets = nmbclusters / 4;
-    V_ip6_maxfrags = nmbclusters / 4;
+    ip6_maxfrags = IP6_MAXFRAGS;
+    VNET_LIST_RLOCK_NOSLEEP();
+    VNET_FOREACH(vnet_iter) {
+        CURVNET_SET(vnet_iter);
+        V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
+        frag6_set_bucketsize();
+        CURVNET_RESTORE();
+    }
+    VNET_LIST_RUNLOCK_NOSLEEP();
 }
 
 void
 frag6_init(void)
 {
+    struct ip6q *q6;
+    int i;
 
-    V_ip6_maxfragpackets = nmbclusters / 4;
-    V_ip6_maxfrags = nmbclusters / 4;
-    V_ip6q.ip6q_next = V_ip6q.ip6q_prev = &V_ip6q;
+    V_ip6_maxfragpackets = IP6_MAXFRAGPACKETS;
+    frag6_set_bucketsize();
+    for (i = 0; i < IP6REASS_NHASH; i++) {
+        q6 = IP6Q_HEAD(i);
+        q6->ip6q_next = q6->ip6q_prev = q6;
+        mtx_init(&V_ip6q[i].lock, "ip6qlock", NULL, MTX_DEF);
+        V_ip6q[i].count = 0;
+    }
+    V_ip6q_hashseed = arc4random();
+    V_ip6_maxfragsperpacket = 64;
     if (!IS_DEFAULT_VNET(curvnet))
         return;
 
+    ip6_maxfrags = IP6_MAXFRAGS;
     EVENTHANDLER_REGISTER(nmbclusters_change,
         frag6_change, NULL, EVENTHANDLER_PRI_ANY);
-
-    IP6Q_LOCK_INIT();
 }
 
 /*
@@ -155,12 +212,13 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
     struct mbuf *m = *mp, *t;
     struct ip6_hdr *ip6;
     struct ip6_frag *ip6f;
-    struct ip6q *q6;
+    struct ip6q *head, *q6;
     struct ip6asfrag *af6, *ip6af, *af6dwn;
     struct in6_ifaddr *ia;
     int offset = *offp, nxt, i, next;
     int first_frag = 0;
     int fragoff, frgpartlen;    /* must be larger than u_int16_t */
+    uint32_t hash, hashkey[sizeof(struct in6_addr) * 2 + 1], *hashkeyp;
     struct ifnet *dstifp;
     u_int8_t ecn, ecn0;
 #ifdef RSS
@@ -228,19 +286,38 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
         return (ip6f->ip6f_nxt);
     }
 
-    IP6Q_LOCK();
+    /* Get fragment length and discard 0-byte fragments. */
+    frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
+    if (frgpartlen == 0) {
+        icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
+            offsetof(struct ip6_hdr, ip6_plen));
+        in6_ifstat_inc(dstifp, ifs6_reass_fail);
+        IP6STAT_INC(ip6s_fragdropped);
+        return IPPROTO_DONE;
+    }
+
+    hashkeyp = hashkey;
+    memcpy(hashkeyp, &ip6->ip6_src, sizeof(struct in6_addr));
+    hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+    memcpy(hashkeyp, &ip6->ip6_dst, sizeof(struct in6_addr));
+    hashkeyp += sizeof(struct in6_addr) / sizeof(*hashkeyp);
+    *hashkeyp = ip6f->ip6f_ident;
+    hash = jenkins_hash32(hashkey, nitems(hashkey), V_ip6q_hashseed);
+    hash &= IP6REASS_HMASK;
+    head = IP6Q_HEAD(hash);
+    IP6Q_LOCK(hash);
 
     /*
      * Enforce upper bound on number of fragments.
      * If maxfrag is 0, never accept fragments.
      * If maxfrag is -1, accept all fragments without limitation.
      */
-    if (V_ip6_maxfrags < 0)
+    if (ip6_maxfrags < 0)
         ;
-    else if (V_frag6_nfrags >= (u_int)V_ip6_maxfrags)
+    else if (frag6_nfrags >= (u_int)ip6_maxfrags)
         goto dropfrag;
 
-    for (q6 = V_ip6q.ip6q_next; q6 != &V_ip6q; q6 = q6->ip6q_next)
+    for (q6 = head->ip6q_next; q6 != head; q6 = q6->ip6q_next)
         if (ip6f->ip6f_ident == q6->ip6q_ident &&
             IN6_ARE_ADDR_EQUAL(&ip6->ip6_src, &q6->ip6q_src) &&
             IN6_ARE_ADDR_EQUAL(&ip6->ip6_dst, &q6->ip6q_dst)
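The IPv6 key built above follows the same pattern as the IPv4 one but with 128-bit addresses: four 32-bit words of source, four of destination, then the fragment identifier, nine words in all. A user-space sketch of that layout (stand-in address type and assumed example values):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct in6_addr_stub { uint8_t s6_addr[16]; }; /* stand-in for struct in6_addr */

int
main(void)
{
    struct in6_addr_stub src = { { 0x20, 0x01, 0x0d, 0xb8 } }; /* 2001:db8:: */
    struct in6_addr_stub dst = { { 0x20, 0x01, 0x0d, 0xb8, 0, 0, 0, 1 } };
    uint32_t ident = 0x89abcdef;    /* ip6f_ident, example value */
    uint32_t hashkey[9], *hashkeyp = hashkey;

    /* Four words of src, four of dst, one of ident, as in frag6_input(). */
    memcpy(hashkeyp, &src, sizeof(src));
    hashkeyp += sizeof(src) / sizeof(*hashkeyp);
    memcpy(hashkeyp, &dst, sizeof(dst));
    hashkeyp += sizeof(dst) / sizeof(*hashkeyp);
    *hashkeyp = ident;

    for (int i = 0; i < 9; i++)     /* word values are endian-dependent */
        printf("hashkey[%d] = 0x%08x\n", i, hashkey[i]);
    return (0);
}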
@@ -250,7 +327,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
         )
             break;
 
-    if (q6 == &V_ip6q) {
+    if (q6 == head) {
         /*
          * the first fragment to arrive, create a reassembly queue.
          */
@@ -265,9 +342,10 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
          */
         if (V_ip6_maxfragpackets < 0)
             ;
-        else if (V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
+        else if (V_ip6q[hash].count >= V_ip6_maxfragbucketsize ||
+            V_frag6_nfragpackets >= (u_int)V_ip6_maxfragpackets)
             goto dropfrag;
-        V_frag6_nfragpackets++;
+        atomic_add_int(&V_frag6_nfragpackets, 1);
         q6 = (struct ip6q *)malloc(sizeof(struct ip6q), M_FTABLE,
             M_NOWAIT);
         if (q6 == NULL)
@@ -280,7 +358,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
         }
         mac_ip6q_create(m, q6);
 #endif
-        frag6_insque(q6, &V_ip6q);
+        frag6_insque_head(q6, head, hash);
 
         /* ip6q_nxt will be filled afterwards, from 1st fragment */
         q6->ip6q_down = q6->ip6q_up = (struct ip6asfrag *)q6;
@@ -314,21 +392,20 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
      * in size.
      * If it would exceed, discard the fragment and return an ICMP error.
      */
-    frgpartlen = sizeof(struct ip6_hdr) + ntohs(ip6->ip6_plen) - offset;
     if (q6->ip6q_unfrglen >= 0) {
         /* The 1st fragment has already arrived. */
         if (q6->ip6q_unfrglen + fragoff + frgpartlen > IPV6_MAXPACKET) {
             icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
                 offset - sizeof(struct ip6_frag) +
                 offsetof(struct ip6_frag, ip6f_offlg));
-            IP6Q_UNLOCK();
+            IP6Q_UNLOCK(hash);
             return (IPPROTO_DONE);
         }
     } else if (fragoff + frgpartlen > IPV6_MAXPACKET) {
         icmp6_error(m, ICMP6_PARAM_PROB, ICMP6_PARAMPROB_HEADER,
             offset - sizeof(struct ip6_frag) +
             offsetof(struct ip6_frag, ip6f_offlg));
-        IP6Q_UNLOCK();
+        IP6Q_UNLOCK(hash);
         return (IPPROTO_DONE);
     }
/* /*
@ -347,7 +424,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
int erroff = af6->ip6af_offset; int erroff = af6->ip6af_offset;
/* dequeue the fragment. */ /* dequeue the fragment. */
frag6_deq(af6); frag6_deq(af6, hash);
free(af6, M_FTABLE); free(af6, M_FTABLE);
/* adjust pointer. */ /* adjust pointer. */
@@ -445,7 +522,7 @@ frag6_input(struct mbuf **mp, int *offp, int proto)
     }
     af6 = af6->ip6af_down;
     m_freem(IP6_REASS_MBUF(af6->ip6af_up));
-    frag6_deq(af6->ip6af_up);
+    frag6_deq(af6->ip6af_up, hash);
 }
 #else
 /*
@@ -494,29 +571,38 @@ insert:
     /*
      * Stick new segment in its place;
      * check for complete reassembly.
+     * If not complete, check fragment limit.
      * Move to front of packet queue, as we are
      * the most recently active fragmented packet.
      */
-    frag6_enq(ip6af, af6->ip6af_up);
-    V_frag6_nfrags++;
+    frag6_enq(ip6af, af6->ip6af_up, hash);
+    atomic_add_int(&frag6_nfrags, 1);
     q6->ip6q_nfrag++;
 #if 0 /* xxx */
-    if (q6 != V_ip6q.ip6q_next) {
-        frag6_remque(q6);
-        frag6_insque(q6, &V_ip6q);
+    if (q6 != head->ip6q_next) {
+        frag6_remque(q6, hash);
+        frag6_insque_head(q6, head, hash);
     }
 #endif
     next = 0;
     for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
          af6 = af6->ip6af_down) {
         if (af6->ip6af_off != next) {
-            IP6Q_UNLOCK();
+            if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
+                IP6STAT_INC(ip6s_fragdropped);
+                frag6_freef(q6, hash);
+            }
+            IP6Q_UNLOCK(hash);
             return IPPROTO_DONE;
         }
         next += af6->ip6af_frglen;
     }
     if (af6->ip6af_up->ip6af_mff) {
-        IP6Q_UNLOCK();
+        if (q6->ip6q_nfrag > V_ip6_maxfragsperpacket) {
+            IP6STAT_INC(ip6s_fragdropped);
+            frag6_freef(q6, hash);
+        }
+        IP6Q_UNLOCK(hash);
         return IPPROTO_DONE;
     }
@@ -526,10 +612,10 @@ insert:
     ip6af = q6->ip6q_down;
     t = m = IP6_REASS_MBUF(ip6af);
     af6 = ip6af->ip6af_down;
-    frag6_deq(ip6af);
+    frag6_deq(ip6af, hash);
     while (af6 != (struct ip6asfrag *)q6) {
         af6dwn = af6->ip6af_down;
-        frag6_deq(af6);
+        frag6_deq(af6, hash);
         while (t->m_next)
             t = t->m_next;
         m_adj(IP6_REASS_MBUF(af6), af6->ip6af_offset);
@@ -551,13 +637,13 @@ insert:
 #endif
 
     if (ip6_deletefraghdr(m, offset, M_NOWAIT) != 0) {
-        frag6_remque(q6);
-        V_frag6_nfrags -= q6->ip6q_nfrag;
+        frag6_remque(q6, hash);
+        atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
         mac_ip6q_destroy(q6);
 #endif
         free(q6, M_FTABLE);
-        V_frag6_nfragpackets--;
+        atomic_subtract_int(&V_frag6_nfragpackets, 1);
         goto dropfrag;
     }
@@ -570,14 +656,14 @@ insert:
         *prvnxtp = nxt;
     }
 
-    frag6_remque(q6);
-    V_frag6_nfrags -= q6->ip6q_nfrag;
+    frag6_remque(q6, hash);
+    atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
     mac_ip6q_reassemble(q6, m);
     mac_ip6q_destroy(q6);
 #endif
     free(q6, M_FTABLE);
-    V_frag6_nfragpackets--;
+    atomic_subtract_int(&V_frag6_nfragpackets, 1);
 
     if (m->m_flags & M_PKTHDR) { /* Isn't it always true? */
         int plen = 0;
@@ -599,7 +685,7 @@ insert:
     m_tag_prepend(m, mtag);
 #endif
 
-    IP6Q_UNLOCK();
+    IP6Q_UNLOCK(hash);
     IP6STAT_INC(ip6s_reassembled);
     in6_ifstat_inc(dstifp, ifs6_reass_ok);
@@ -621,7 +707,7 @@ insert:
     return nxt;
 
  dropfrag:
-    IP6Q_UNLOCK();
+    IP6Q_UNLOCK(hash);
     in6_ifstat_inc(dstifp, ifs6_reass_fail);
     IP6STAT_INC(ip6s_fragdropped);
     m_freem(m);
@@ -632,19 +718,19 @@ insert:
  * Free a fragment reassembly header and all
  * associated datagrams.
  */
-void
-frag6_freef(struct ip6q *q6)
+static void
+frag6_freef(struct ip6q *q6, uint32_t bucket)
 {
     struct ip6asfrag *af6, *down6;
 
-    IP6Q_LOCK_ASSERT();
+    IP6Q_LOCK_ASSERT(bucket);
 
     for (af6 = q6->ip6q_down; af6 != (struct ip6asfrag *)q6;
          af6 = down6) {
         struct mbuf *m = IP6_REASS_MBUF(af6);
 
         down6 = af6->ip6af_down;
-        frag6_deq(af6);
+        frag6_deq(af6, bucket);
 
         /*
          * Return ICMP time exceeded error for the 1st fragment.
@@ -666,24 +752,25 @@ frag6_freef(struct ip6q *q6)
         m_freem(m);
         free(af6, M_FTABLE);
     }
-    frag6_remque(q6);
-    V_frag6_nfrags -= q6->ip6q_nfrag;
+    frag6_remque(q6, bucket);
+    atomic_subtract_int(&frag6_nfrags, q6->ip6q_nfrag);
 #ifdef MAC
     mac_ip6q_destroy(q6);
 #endif
     free(q6, M_FTABLE);
-    V_frag6_nfragpackets--;
+    atomic_subtract_int(&V_frag6_nfragpackets, 1);
 }
 
 /*
  * Put an ip fragment on a reassembly chain.
  * Like insque, but pointers in middle of structure.
  */
-void
-frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
+static void
+frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6,
+    uint32_t bucket __unused)
 {
 
-    IP6Q_LOCK_ASSERT();
+    IP6Q_LOCK_ASSERT(bucket);
 
     af6->ip6af_up = up6;
     af6->ip6af_down = up6->ip6af_down;
@@ -694,36 +781,41 @@ frag6_enq(struct ip6asfrag *af6, struct ip6asfrag *up6)
 /*
  * To frag6_enq as remque is to insque.
  */
-void
-frag6_deq(struct ip6asfrag *af6)
+static void
+frag6_deq(struct ip6asfrag *af6, uint32_t bucket __unused)
 {
 
-    IP6Q_LOCK_ASSERT();
+    IP6Q_LOCK_ASSERT(bucket);
 
     af6->ip6af_up->ip6af_down = af6->ip6af_down;
     af6->ip6af_down->ip6af_up = af6->ip6af_up;
 }
 
-void
-frag6_insque(struct ip6q *new, struct ip6q *old)
+static void
+frag6_insque_head(struct ip6q *new, struct ip6q *old, uint32_t bucket)
 {
 
-    IP6Q_LOCK_ASSERT();
+    IP6Q_LOCK_ASSERT(bucket);
+    KASSERT(IP6Q_HEAD(bucket) == old,
+        ("%s: attempt to insert at head of wrong bucket"
+        " (bucket=%u, old=%p)", __func__, bucket, old));
 
     new->ip6q_prev = old;
     new->ip6q_next = old->ip6q_next;
     old->ip6q_next->ip6q_prev= new;
     old->ip6q_next = new;
+    V_ip6q[bucket].count++;
 }
 
-void
-frag6_remque(struct ip6q *p6)
+static void
+frag6_remque(struct ip6q *p6, uint32_t bucket)
 {
 
-    IP6Q_LOCK_ASSERT();
+    IP6Q_LOCK_ASSERT(bucket);
 
     p6->ip6q_prev->ip6q_next = p6->ip6q_next;
     p6->ip6q_next->ip6q_prev = p6->ip6q_prev;
+    V_ip6q[bucket].count--;
 }
 
 /*
@@ -735,37 +827,72 @@ void
 frag6_slowtimo(void)
 {
     VNET_ITERATOR_DECL(vnet_iter);
-    struct ip6q *q6;
+    struct ip6q *head, *q6;
+    int i;
 
     VNET_LIST_RLOCK_NOSLEEP();
-    IP6Q_LOCK();
     VNET_FOREACH(vnet_iter) {
         CURVNET_SET(vnet_iter);
-        q6 = V_ip6q.ip6q_next;
-        if (q6)
-            while (q6 != &V_ip6q) {
+        for (i = 0; i < IP6REASS_NHASH; i++) {
+            IP6Q_LOCK(i);
+            head = IP6Q_HEAD(i);
+            q6 = head->ip6q_next;
+            if (q6 == NULL) {
+                /*
+                 * XXXJTL: This should never happen. This
+                 * should turn into an assertion.
+                 */
+                IP6Q_UNLOCK(i);
+                continue;
+            }
+            while (q6 != head) {
                 --q6->ip6q_ttl;
                 q6 = q6->ip6q_next;
                 if (q6->ip6q_prev->ip6q_ttl == 0) {
                     IP6STAT_INC(ip6s_fragtimeout);
                     /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-                    frag6_freef(q6->ip6q_prev);
+                    frag6_freef(q6->ip6q_prev, i);
                 }
             }
+            /*
+             * If we are over the maximum number of fragments
+             * (due to the limit being lowered), drain off
+             * enough to get down to the new limit.
+             * Note that we drain all reassembly queues if
+             * maxfragpackets is 0 (fragmentation is disabled),
+             * and don't enforce a limit when maxfragpackets
+             * is negative.
+             */
+            while ((V_ip6_maxfragpackets == 0 ||
+                (V_ip6_maxfragpackets > 0 &&
+                V_ip6q[i].count > V_ip6_maxfragbucketsize)) &&
+                head->ip6q_prev != head) {
+                IP6STAT_INC(ip6s_fragoverflow);
+                /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+                frag6_freef(head->ip6q_prev, i);
+            }
+            IP6Q_UNLOCK(i);
+        }
         /*
-         * If we are over the maximum number of fragments
-         * (due to the limit being lowered), drain off
-         * enough to get down to the new limit.
+         * If we are still over the maximum number of fragmented
+         * packets, drain off enough to get down to the new limit.
          */
-        while (V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets &&
-            V_ip6q.ip6q_prev) {
-            IP6STAT_INC(ip6s_fragoverflow);
-            /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-            frag6_freef(V_ip6q.ip6q_prev);
+        i = 0;
+        while (V_ip6_maxfragpackets >= 0 &&
+            V_frag6_nfragpackets > (u_int)V_ip6_maxfragpackets) {
+            IP6Q_LOCK(i);
+            head = IP6Q_HEAD(i);
+            if (head->ip6q_prev != head) {
+                IP6STAT_INC(ip6s_fragoverflow);
+                /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+                frag6_freef(head->ip6q_prev, i);
+            }
+            IP6Q_UNLOCK(i);
+            i = (i + 1) % IP6REASS_NHASH;
         }
         CURVNET_RESTORE();
     }
-    IP6Q_UNLOCK();
     VNET_LIST_RUNLOCK_NOSLEEP();
 }
@@ -776,22 +903,25 @@ void
 frag6_drain(void)
 {
     VNET_ITERATOR_DECL(vnet_iter);
+    struct ip6q *head;
+    int i;
 
     VNET_LIST_RLOCK_NOSLEEP();
-    if (IP6Q_TRYLOCK() == 0) {
-        VNET_LIST_RUNLOCK_NOSLEEP();
-        return;
-    }
     VNET_FOREACH(vnet_iter) {
         CURVNET_SET(vnet_iter);
-        while (V_ip6q.ip6q_next != &V_ip6q) {
-            IP6STAT_INC(ip6s_fragdropped);
-            /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
-            frag6_freef(V_ip6q.ip6q_next);
+        for (i = 0; i < IP6REASS_NHASH; i++) {
+            if (IP6Q_TRYLOCK(i) == 0)
+                continue;
+            head = IP6Q_HEAD(i);
+            while (head->ip6q_next != head) {
+                IP6STAT_INC(ip6s_fragdropped);
+                /* XXX in6_ifstat_inc(ifp, ifs6_reass_fail) */
+                frag6_freef(head->ip6q_next, i);
+            }
+            IP6Q_UNLOCK(i);
         }
         CURVNET_RESTORE();
     }
-    IP6Q_UNLOCK();
     VNET_LIST_RUNLOCK_NOSLEEP();
 }

File 3 of 5: IPv6 sysctl identifiers (IPV6CTL_*)

@@ -637,7 +637,13 @@ struct ip6_mtuinfo {
                          * receiving IF. */
 #define IPV6CTL_RFC6204W3   50  /* Accept defroute even when forwarding
                        enabled */
-#define IPV6CTL_MAXID       51
+#define IPV6CTL_INTRQMAXLEN 51  /* max length of IPv6 netisr queue */
+#define IPV6CTL_INTRDQMAXLEN    52  /* max length of direct IPv6 netisr
+                     * queue */
+#define IPV6CTL_MAXFRAGSPERPACKET   53 /* Max fragments per packet */
+#define IPV6CTL_MAXFRAGBUCKETSIZE   54 /* Max reassembly queues per bucket */
+#define IPV6CTL_MAXID       55
 #endif /* __BSD_VISIBLE */
 
 /*
/* /*

File 4 of 5: IPv6 sysctl and global definitions

@@ -418,7 +418,9 @@ VNET_DEFINE(int, ip6_no_radr) = 0;
 VNET_DEFINE(int, ip6_norbit_raif) = 0;
 VNET_DEFINE(int, ip6_rfc6204w3) = 0;
 VNET_DEFINE(int, ip6_maxfragpackets);   /* initialized in frag6.c:frag6_init() */
-VNET_DEFINE(int, ip6_maxfrags); /* initialized in frag6.c:frag6_init() */
+int ip6_maxfrags;       /* initialized in frag6.c:frag6_init() */
+VNET_DEFINE(int, ip6_maxfragbucketsize);/* initialized in frag6.c:frag6_init() */
+VNET_DEFINE(int, ip6_maxfragsperpacket); /* initialized in frag6.c:frag6_init() */
 VNET_DEFINE(int, ip6_log_interval) = 5;
 VNET_DEFINE(int, ip6_hdrnestlimit) = 15;/* How many header options will we
                                          * process? */
@@ -505,6 +507,20 @@ sysctl_ip6_tempvltime(SYSCTL_HANDLER_ARGS)
     return (0);
 }
 
+static int
+sysctl_ip6_maxfragpackets(SYSCTL_HANDLER_ARGS)
+{
+    int error, val;
+
+    val = V_ip6_maxfragpackets;
+    error = sysctl_handle_int(oidp, &val, 0, req);
+    if (error != 0 || !req->newptr)
+        return (error);
+    V_ip6_maxfragpackets = val;
+    frag6_set_bucketsize();
+    return (0);
+}
+
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_FORWARDING, forwarding,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_forwarding), 0,
     "Enable IPv6 forwarding between interfaces");
@@ -517,9 +533,12 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_DEFHLIM, hlim,
 SYSCTL_VNET_PCPUSTAT(_net_inet6_ip6, IPV6CTL_STATS, stats, struct ip6stat,
     ip6stat,
     "IP6 statistics (struct ip6stat, netinet6/ip6_var.h)");
-SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
-    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragpackets), 0,
-    "Maximum allowed number of outstanding fragmented IPv6 packets");
+SYSCTL_PROC(_net_inet6_ip6, IPV6CTL_MAXFRAGPACKETS, maxfragpackets,
+    CTLFLAG_VNET | CTLTYPE_INT | CTLFLAG_RW, NULL, 0,
+    sysctl_ip6_maxfragpackets, "I",
+    "Default maximum number of outstanding fragmented IPv6 packets. "
+    "A value of 0 means no fragmented packets will be accepted, while "
+    "a value of -1 means no limit");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_ACCEPT_RTADV, accept_rtadv,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_accept_rtadv), 0,
     "Default value of per-interface flag for accepting ICMPv6 RA messages");
@@ -588,8 +607,16 @@ SYSCTL_INT(_net_inet6_ip6, IPV6CTL_USE_DEFAULTZONE, use_defaultzone,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_use_defzone), 0,
     "Use the default scope zone when none is specified");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGS, maxfrags,
-    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfrags), 0,
-    "Maximum allowed number of outstanding IPv6 packet fragments");
+    CTLFLAG_RW, &ip6_maxfrags, 0,
+    "Maximum allowed number of outstanding IPv6 packet fragments. "
+    "A value of 0 means no fragmented packets will be accepted, while "
+    "a value of -1 means no limit");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGBUCKETSIZE, maxfragbucketsize,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragbucketsize), 0,
+    "Maximum number of reassembly queues per hash bucket");
+SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MAXFRAGSPERPACKET, maxfragsperpacket,
+    CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_maxfragsperpacket), 0,
+    "Maximum allowed number of fragments per packet");
 SYSCTL_INT(_net_inet6_ip6, IPV6CTL_MCAST_PMTU, mcast_pmtu,
     CTLFLAG_VNET | CTLFLAG_RW, &VNET_NAME(ip6_mcast_pmtu), 0,
     "Enable path MTU discovery for multicast packets");

File 5 of 5: IPv6 declarations and VNET macros

@@ -296,8 +296,10 @@ VNET_DECLARE(struct socket *, ip6_mrouter); /* multicast routing daemon */
 VNET_DECLARE(int, ip6_sendredirects);   /* send IP redirects when forwarding? */
 VNET_DECLARE(int, ip6_maxfragpackets);  /* Maximum packets in reassembly
                                          * queue */
-VNET_DECLARE(int, ip6_maxfrags);    /* Maximum fragments in reassembly
-                                     * queue */
+extern int ip6_maxfrags;            /* Maximum fragments in reassembly
+                                     * queue */
+VNET_DECLARE(int, ip6_maxfragbucketsize); /* Maximum reassembly queues per bucket */
+VNET_DECLARE(int, ip6_maxfragsperpacket); /* Maximum fragments per packet */
 VNET_DECLARE(int, ip6_accept_rtadv);    /* Acts as a host not a router */
 VNET_DECLARE(int, ip6_no_radr);     /* No defroute from RA */
 VNET_DECLARE(int, ip6_norbit_raif); /* Disable R-bit in NA on RA
@@ -312,7 +314,8 @@ VNET_DECLARE(int, ip6_dad_count);   /* DupAddrDetectionTransmits */
 #define V_ip6_mrouter           VNET(ip6_mrouter)
 #define V_ip6_sendredirects     VNET(ip6_sendredirects)
 #define V_ip6_maxfragpackets    VNET(ip6_maxfragpackets)
-#define V_ip6_maxfrags          VNET(ip6_maxfrags)
+#define V_ip6_maxfragbucketsize VNET(ip6_maxfragbucketsize)
+#define V_ip6_maxfragsperpacket VNET(ip6_maxfragsperpacket)
 #define V_ip6_accept_rtadv      VNET(ip6_accept_rtadv)
 #define V_ip6_no_radr           VNET(ip6_no_radr)
 #define V_ip6_norbit_raif       VNET(ip6_norbit_raif)
@@ -399,6 +402,7 @@ int ip6_fragment(struct ifnet *, struct mbuf *, int, u_char, int,
 
 int route6_input(struct mbuf **, int *, int);
 
+void frag6_set_bucketsize(void);
 void frag6_init(void);
 int  frag6_input(struct mbuf **, int *, int);
 void frag6_slowtimo(void);