Skip to content

Commit 33dccbb

Browse files
herbertxdavem330
authored and committed
tun: Limit amount of queued packets per device
Unlike a normal socket path, the tuntap device send path does not have any accounting. This means that the user-space sender may be able to pin down arbitrary amounts of kernel memory by continuing to send data to an end-point that is congested. Even when this isn't an issue because of limited queueing at most end points, this can also be a problem because the only response to congestion is packet loss. That is, when those local queues at the end-point fill up, the tuntap device will start wasting system time because it will continue to send data there which simply gets dropped straight away. Of course one could argue that everybody should do congestion control end-to-end; unfortunately there are people in this world still hooked on UDP, and they don't appear to be going away anywhere fast. In fact, we've always helped them by performing accounting in our UDP code, the sole purpose of which is to provide congestion feedback other than through packet loss. This patch attempts to apply the same bandaid to the tuntap device. It creates a pseudo-socket object which is used to account our packets just as a normal socket does for UDP. Of course things are a little complex because we're actually reinjecting traffic back into the stack rather than out of the stack. The stack complexities however should have been resolved by preceding patches. So this one can simply start using skb_set_owner_w. For now the accounting is essentially disabled by default for backwards compatibility. In particular, we set the cap to INT_MAX. This is so that existing applications don't get confused by the sudden arrival of EAGAIN errors. In future we may wish (or be forced to) do this by default. Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au> Signed-off-by: David S. Miller <davem@davemloft.net>
1 parent 4cc7f68 commit 33dccbb

File tree

3 files changed

+118
-53
lines changed

3 files changed

+118
-53
lines changed

drivers/net/tun.c

+114-53
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,7 @@
6464
#include <net/net_namespace.h>
6565
#include <net/netns/generic.h>
6666
#include <net/rtnetlink.h>
67+
#include <net/sock.h>
6768

6869
#include <asm/system.h>
6970
#include <asm/uaccess.h>
@@ -95,6 +96,8 @@ struct tun_file {
9596
wait_queue_head_t read_wait;
9697
};
9798

99+
struct tun_sock;
100+
98101
struct tun_struct {
99102
struct tun_file *tfile;
100103
unsigned int flags;
@@ -107,12 +110,24 @@ struct tun_struct {
107110
struct fasync_struct *fasync;
108111

109112
struct tap_filter txflt;
113+
struct sock *sk;
114+
struct socket socket;
110115

111116
#ifdef TUN_DEBUG
112117
int debug;
113118
#endif
114119
};
115120

121+
struct tun_sock {
122+
struct sock sk;
123+
struct tun_struct *tun;
124+
};
125+
126+
static inline struct tun_sock *tun_sk(struct sock *sk)
127+
{
128+
return container_of(sk, struct tun_sock, sk);
129+
}
130+
116131
static int tun_attach(struct tun_struct *tun, struct file *file)
117132
{
118133
struct tun_file *tfile = file->private_data;
@@ -461,7 +476,8 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
461476
{
462477
struct tun_file *tfile = file->private_data;
463478
struct tun_struct *tun = __tun_get(tfile);
464-
unsigned int mask = POLLOUT | POLLWRNORM;
479+
struct sock *sk = tun->sk;
480+
unsigned int mask = 0;
465481

466482
if (!tun)
467483
return POLLERR;
@@ -473,6 +489,11 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
473489
if (!skb_queue_empty(&tun->readq))
474490
mask |= POLLIN | POLLRDNORM;
475491

492+
if (sock_writeable(sk) ||
493+
(!test_and_set_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags) &&
494+
sock_writeable(sk)))
495+
mask |= POLLOUT | POLLWRNORM;
496+
476497
if (tun->dev->reg_state != NETREG_REGISTERED)
477498
mask = POLLERR;
478499

@@ -482,66 +503,35 @@ static unsigned int tun_chr_poll(struct file *file, poll_table * wait)
482503

483504
/* prepad is the amount to reserve at front. len is length after that.
484505
* linear is a hint as to how much to copy (usually headers). */
485-
static struct sk_buff *tun_alloc_skb(size_t prepad, size_t len, size_t linear,
486-
gfp_t gfp)
506+
static inline struct sk_buff *tun_alloc_skb(struct tun_struct *tun,
507+
size_t prepad, size_t len,
508+
size_t linear, int noblock)
487509
{
510+
struct sock *sk = tun->sk;
488511
struct sk_buff *skb;
489-
unsigned int i;
490-
491-
skb = alloc_skb(prepad + len, gfp|__GFP_NOWARN);
492-
if (skb) {
493-
skb_reserve(skb, prepad);
494-
skb_put(skb, len);
495-
return skb;
496-
}
512+
int err;
497513

498514
/* Under a page? Don't bother with paged skb. */
499515
if (prepad + len < PAGE_SIZE)
500-
return NULL;
516+
linear = len;
501517

502-
/* Start with a normal skb, and add pages. */
503-
skb = alloc_skb(prepad + linear, gfp);
518+
skb = sock_alloc_send_pskb(sk, prepad + linear, len - linear, noblock,
519+
&err);
504520
if (!skb)
505-
return NULL;
521+
return ERR_PTR(err);
506522

507523
skb_reserve(skb, prepad);
508524
skb_put(skb, linear);
509-
510-
len -= linear;
511-
512-
for (i = 0; i < MAX_SKB_FRAGS; i++) {
513-
skb_frag_t *f = &skb_shinfo(skb)->frags[i];
514-
515-
f->page = alloc_page(gfp|__GFP_ZERO);
516-
if (!f->page)
517-
break;
518-
519-
f->page_offset = 0;
520-
f->size = PAGE_SIZE;
521-
522-
skb->data_len += PAGE_SIZE;
523-
skb->len += PAGE_SIZE;
524-
skb->truesize += PAGE_SIZE;
525-
skb_shinfo(skb)->nr_frags++;
526-
527-
if (len < PAGE_SIZE) {
528-
len = 0;
529-
break;
530-
}
531-
len -= PAGE_SIZE;
532-
}
533-
534-
/* Too large, or alloc fail? */
535-
if (unlikely(len)) {
536-
kfree_skb(skb);
537-
skb = NULL;
538-
}
525+
skb->data_len = len - linear;
526+
skb->len += len - linear;
539527

540528
return skb;
541529
}
542530

543531
/* Get packet from user space buffer */
544-
static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv, size_t count)
532+
static __inline__ ssize_t tun_get_user(struct tun_struct *tun,
533+
struct iovec *iv, size_t count,
534+
int noblock)
545535
{
546536
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
547537
struct sk_buff *skb;
@@ -573,9 +563,11 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
573563
return -EINVAL;
574564
}
575565

576-
if (!(skb = tun_alloc_skb(align, len, gso.hdr_len, GFP_KERNEL))) {
577-
tun->dev->stats.rx_dropped++;
578-
return -ENOMEM;
566+
skb = tun_alloc_skb(tun, align, len, gso.hdr_len, noblock);
567+
if (IS_ERR(skb)) {
568+
if (PTR_ERR(skb) != -EAGAIN)
569+
tun->dev->stats.rx_dropped++;
570+
return PTR_ERR(skb);
579571
}
580572

581573
if (skb_copy_datagram_from_iovec(skb, 0, iv, len)) {
@@ -661,15 +653,17 @@ static __inline__ ssize_t tun_get_user(struct tun_struct *tun, struct iovec *iv,
661653
static ssize_t tun_chr_aio_write(struct kiocb *iocb, const struct iovec *iv,
662654
unsigned long count, loff_t pos)
663655
{
664-
struct tun_struct *tun = tun_get(iocb->ki_filp);
656+
struct file *file = iocb->ki_filp;
657+
struct tun_struct *tun = file->private_data;
665658
ssize_t result;
666659

667660
if (!tun)
668661
return -EBADFD;
669662

670663
DBG(KERN_INFO "%s: tun_chr_write %ld\n", tun->dev->name, count);
671664

672-
result = tun_get_user(tun, (struct iovec *) iv, iov_length(iv, count));
665+
result = tun_get_user(tun, (struct iovec *)iv, iov_length(iv, count),
666+
file->f_flags & O_NONBLOCK);
673667

674668
tun_put(tun);
675669
return result;
@@ -828,11 +822,40 @@ static struct rtnl_link_ops tun_link_ops __read_mostly = {
828822
.validate = tun_validate,
829823
};
830824

825+
static void tun_sock_write_space(struct sock *sk)
826+
{
827+
struct tun_struct *tun;
828+
829+
if (!sock_writeable(sk))
830+
return;
831+
832+
if (sk->sk_sleep && waitqueue_active(sk->sk_sleep))
833+
wake_up_interruptible_sync(sk->sk_sleep);
834+
835+
if (!test_and_clear_bit(SOCK_ASYNC_NOSPACE, &sk->sk_socket->flags))
836+
return;
837+
838+
tun = container_of(sk, struct tun_sock, sk)->tun;
839+
kill_fasync(&tun->fasync, SIGIO, POLL_OUT);
840+
}
841+
842+
static void tun_sock_destruct(struct sock *sk)
843+
{
844+
dev_put(container_of(sk, struct tun_sock, sk)->tun->dev);
845+
}
846+
847+
static struct proto tun_proto = {
848+
.name = "tun",
849+
.owner = THIS_MODULE,
850+
.obj_size = sizeof(struct tun_sock),
851+
};
831852

832853
static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
833854
{
855+
struct sock *sk;
834856
struct tun_struct *tun;
835857
struct net_device *dev;
858+
struct tun_file *tfile = file->private_data;
836859
int err;
837860

838861
dev = __dev_get_by_name(net, ifr->ifr_name);
@@ -885,14 +908,31 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
885908
tun->flags = flags;
886909
tun->txflt.count = 0;
887910

911+
err = -ENOMEM;
912+
sk = sk_alloc(net, AF_UNSPEC, GFP_KERNEL, &tun_proto);
913+
if (!sk)
914+
goto err_free_dev;
915+
916+
/* This ref count is for tun->sk. */
917+
dev_hold(dev);
918+
sock_init_data(&tun->socket, sk);
919+
sk->sk_write_space = tun_sock_write_space;
920+
sk->sk_destruct = tun_sock_destruct;
921+
sk->sk_sndbuf = INT_MAX;
922+
sk->sk_sleep = &tfile->read_wait;
923+
924+
tun->sk = sk;
925+
container_of(sk, struct tun_sock, sk)->tun = tun;
926+
888927
tun_net_init(dev);
889928

890929
if (strchr(dev->name, '%')) {
891930
err = dev_alloc_name(dev, dev->name);
892931
if (err < 0)
893-
goto err_free_dev;
932+
goto err_free_sk;
894933
}
895934

935+
err = -EINVAL;
896936
err = register_netdevice(tun->dev);
897937
if (err < 0)
898938
goto err_free_dev;
@@ -928,6 +968,8 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
928968
strcpy(ifr->ifr_name, tun->dev->name);
929969
return 0;
930970

971+
err_free_sk:
972+
sock_put(sk);
931973
err_free_dev:
932974
free_netdev(dev);
933975
failed:
@@ -1012,6 +1054,7 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
10121054
struct tun_struct *tun;
10131055
void __user* argp = (void __user*)arg;
10141056
struct ifreq ifr;
1057+
int sndbuf;
10151058
int ret;
10161059

10171060
if (cmd == TUNSETIFF || _IOC_TYPE(cmd) == 0x89)
@@ -1151,6 +1194,22 @@ static int tun_chr_ioctl(struct inode *inode, struct file *file,
11511194
ret = dev_set_mac_address(tun->dev, &ifr.ifr_hwaddr);
11521195
rtnl_unlock();
11531196
break;
1197+
1198+
case TUNGETSNDBUF:
1199+
sndbuf = tun->sk->sk_sndbuf;
1200+
if (copy_to_user(argp, &sndbuf, sizeof(sndbuf)))
1201+
ret = -EFAULT;
1202+
break;
1203+
1204+
case TUNSETSNDBUF:
1205+
if (copy_from_user(&sndbuf, argp, sizeof(sndbuf))) {
1206+
ret = -EFAULT;
1207+
break;
1208+
}
1209+
1210+
tun->sk->sk_sndbuf = sndbuf;
1211+
break;
1212+
11541213
default:
11551214
ret = -EINVAL;
11561215
break;
@@ -1218,8 +1277,10 @@ static int tun_chr_close(struct inode *inode, struct file *file)
12181277
__tun_detach(tun);
12191278

12201279
/* If desireable, unregister the netdevice. */
1221-
if (!(tun->flags & TUN_PERSIST))
1280+
if (!(tun->flags & TUN_PERSIST)) {
1281+
sock_put(tun->sk);
12221282
unregister_netdevice(tun->dev);
1283+
}
12231284

12241285
rtnl_unlock();
12251286
}

fs/compat_ioctl.c

+2
Original file line numberDiff line numberDiff line change
@@ -1988,6 +1988,8 @@ COMPATIBLE_IOCTL(TUNSETGROUP)
19881988
COMPATIBLE_IOCTL(TUNGETFEATURES)
19891989
COMPATIBLE_IOCTL(TUNSETOFFLOAD)
19901990
COMPATIBLE_IOCTL(TUNSETTXFILTER)
1991+
COMPATIBLE_IOCTL(TUNGETSNDBUF)
1992+
COMPATIBLE_IOCTL(TUNSETSNDBUF)
19911993
/* Big V */
19921994
COMPATIBLE_IOCTL(VT_SETMODE)
19931995
COMPATIBLE_IOCTL(VT_GETMODE)

include/linux/if_tun.h

+2
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,8 @@
4646
#define TUNSETOFFLOAD _IOW('T', 208, unsigned int)
4747
#define TUNSETTXFILTER _IOW('T', 209, unsigned int)
4848
#define TUNGETIFF _IOR('T', 210, unsigned int)
49+
#define TUNGETSNDBUF _IOR('T', 211, int)
50+
#define TUNSETSNDBUF _IOW('T', 212, int)
4951

5052
/* TUNSETIFF ifr flags */
5153
#define IFF_TUN 0x0001

0 commit comments

Comments
 (0)