Received: (from majordomo@localhost) by oss.sgi.com (8.11.2/8.11.3) id fBBLqHw21084 for netdev-outgoing; Tue, 11 Dec 2001 13:52:17 -0800 Received: from localhost.localdomain (adsl-64-109-170-29.dsl.chcgil.ameritech.net [64.109.170.29]) by oss.sgi.com (8.11.2/8.11.3) with SMTP id fBBLppo21071 for ; Tue, 11 Dec 2001 13:51:51 -0800 Received: (from rochberg@localhost) by localhost.localdomain (8.11.6/8.11.6) id fBBKpXG01812; Tue, 11 Dec 2001 15:51:33 -0500 Date: Tue, 11 Dec 2001 15:51:33 -0500 Message-Id: <200112112051.fBBKpXG01812@localhost.localdomain> To: netdev@oss.sgi.com From: rochberg+l@61Cnetworks.com Subject: [Patch] fwmark on locally-originated packets Sender: owner-netdev@oss.sgi.com Precedence: bulk Content-Length: 8827 Lines: 263 This patch lets you set the fwmark for locally-originated packets on a per-socket basis. This means that with a little application tweaking (add an ioctl call) you can control packet routing on a per-socket basis. Select QoS on each connection, load-balance by hand, slice, dice! I've written a patch to do this. It does: 1. Add ioctls to set and get the fwmark for a socket 2. Make sure that the fwmark is passed to the routing functions 2a. Add new route function ip_route_output_sk which fetches necessary data out of sk (currently sk->bound_dev_if and sk->nfmark) and stuffs it into a route key 2b. Convert relevant calls to ip_route_output to use ip_route_output_sk 2c. Convert ip_route_connect to use ip_route_output_sk 3. Change ip_queue_xmit to copy sk->nfmark into skb->nfmark on outgoing packets 4. (unrelated bonus patch) Initialize key correctly in fib_frontend; the old "key.foo = bar...." lines didn't initialize fwmark. The new initializer zeros all unused fields Question 1: Do I want to hook in at ip_queue_xmit, or is there a better place? Question 2: Do I want to send this to some other mailing list (like linux-net@vger)? 
-david patches follow diff -X ~/dontdiff -Naur linux-2.4.16/include/linux/sockios.h linux-2.4.16-fwmark/include/linux/sockios.h --- linux-2.4.16/include/linux/sockios.h Tue Dec 11 14:47:10 2001 +++ linux-2.4.16-fwmark/include/linux/sockios.h Sat Dec 8 14:48:17 2001 @@ -105,6 +105,13 @@ #define SIOCGIFVLAN 0x8982 /* 802.1Q VLAN support */ #define SIOCSIFVLAN 0x8983 /* Set 802.1Q VLAN options */ +/* Set netfilter fwmark on packets for this connection */ +#define SIOCSFWMARK 0x8984 /* Set netfilter fwmark on packets from this cxn */ +#define SIOCGFWMARK 0x8985 + + + + /* bonding calls */ #define SIOCBONDENSLAVE 0x8990 /* enslave a device to the bond */ diff -X ~/dontdiff -Naur linux-2.4.16/include/net/route.h linux-2.4.16-fwmark/include/net/route.h --- linux-2.4.16/include/net/route.h Tue Dec 11 14:47:27 2001 +++ linux-2.4.16-fwmark/include/net/route.h Tue Dec 11 12:17:18 2001 @@ -27,6 +27,7 @@ #include #include #include +#include #include #include #include @@ -140,6 +141,17 @@ return ip_route_output_key(rp, &key); } +static inline int ip_route_output_sk(struct rtable **rp, + u32 daddr, u32 saddr, u32 tos, const struct sock *sk) +{ + struct rt_key key = { dst:daddr, src:saddr, oif:sk->bound_dev_if, tos:tos, +#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE) + fwmark:sk->nfmark, +#endif + }; + return ip_route_output_key(rp, &key); +} + static inline void ip_rt_put(struct rtable * rt) { @@ -156,17 +168,17 @@ return ip_tos2prio[IPTOS_TOS(tos)>>1]; } -static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif) +static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, const struct sock *sk) { int err; - err = ip_route_output(rp, dst, src, tos, oif); + err = ip_route_output_sk(rp, dst, src, tos, sk); if (err || (dst && src)) return err; dst = (*rp)->rt_dst; src = (*rp)->rt_src; ip_rt_put(*rp); *rp = NULL; - return ip_route_output(rp, dst, src, tos, oif); + return ip_route_output_sk(rp, dst, src, tos, 
sk); } extern void rt_bind_peer(struct rtable *rt, int create); diff -X ~/dontdiff -Naur linux-2.4.16/include/net/sock.h linux-2.4.16-fwmark/include/net/sock.h --- linux-2.4.16/include/net/sock.h Tue Dec 11 14:47:28 2001 +++ linux-2.4.16-fwmark/include/net/sock.h Tue Dec 11 12:17:05 2001 @@ -602,6 +602,10 @@ long rcvtimeo; long sndtimeo; +#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE) + int nfmark; /* Set nfmark on outgoing packets if non-zero */ +#endif + #ifdef CONFIG_FILTER /* Socket Filtering Instructions */ struct sk_filter *filter; diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/af_inet.c linux-2.4.16-fwmark/net/ipv4/af_inet.c --- linux-2.4.16/net/ipv4/af_inet.c Tue Dec 11 14:48:09 2001 +++ linux-2.4.16-fwmark/net/ipv4/af_inet.c Mon Dec 10 16:50:07 2001 @@ -931,6 +931,23 @@ #endif return -ENOPKG; +#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE) + case SIOCSFWMARK: + err = get_user(sk->nfmark,(int *) arg); + if (err) { + return err; + } + sk_dst_reset(sk); + break; + case SIOCGFWMARK: + err = put_user(sk->nfmark,(int *) arg); + if (err) { + return err; + } + break; +#endif + + default: if ((cmd >= SIOCDEVPRIVATE) && (cmd <= (SIOCDEVPRIVATE + 15))) diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/fib_frontend.c linux-2.4.16-fwmark/net/ipv4/fib_frontend.c --- linux-2.4.16/net/ipv4/fib_frontend.c Tue Dec 11 14:48:10 2001 +++ linux-2.4.16-fwmark/net/ipv4/fib_frontend.c Sat Dec 8 14:47:25 2001 @@ -207,17 +207,10 @@ struct net_device *dev, u32 *spec_dst, u32 *itag) { struct in_device *in_dev; - struct rt_key key; + struct rt_key key = { dst:src, src:dst, tos:tos, oif:0,iif:oif,scope:RT_SCOPE_UNIVERSE}; struct fib_result res; int no_addr, rpf; int ret; - - key.dst = src; - key.src = dst; - key.tos = tos; - key.oif = 0; - key.iif = oif; - key.scope = RT_SCOPE_UNIVERSE; no_addr = rpf = 0; read_lock(&inetdev_lock); diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/ip_output.c linux-2.4.16-fwmark/net/ipv4/ip_output.c --- 
linux-2.4.16/net/ipv4/ip_output.c Tue Dec 11 14:48:14 2001 +++ linux-2.4.16-fwmark/net/ipv4/ip_output.c Sat Dec 8 14:47:25 2001 @@ -345,6 +345,12 @@ struct rtable *rt; struct iphdr *iph; +#if defined(CONFIG_NETFILTER) || defined(CONFIG_NETFILTER_MODULE) + if (sk->nfmark) { + skb->nfmark=sk->nfmark; + } +#endif + /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. */ @@ -366,9 +372,9 @@ * keep trying until route appears or the connection times itself * out. */ - if (ip_route_output(&rt, daddr, sk->saddr, + if (ip_route_output_sk(&rt, daddr, sk->saddr, RT_CONN_FLAGS(sk), - sk->bound_dev_if)) + sk)) goto no_route; __sk_dst_set(sk, &rt->u.dst); sk->route_caps = rt->u.dst.dev->features; @@ -964,6 +970,7 @@ daddr = replyopts.opt.faddr; } + /* XXX should this use sk->oif ? */ if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) return; diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/tcp_ipv4.c linux-2.4.16-fwmark/net/ipv4/tcp_ipv4.c --- linux-2.4.16/net/ipv4/tcp_ipv4.c Tue Dec 11 14:48:27 2001 +++ linux-2.4.16-fwmark/net/ipv4/tcp_ipv4.c Sat Dec 8 14:47:25 2001 @@ -667,7 +667,7 @@ } tmp = ip_route_connect(&rt, nexthop, sk->saddr, - RT_CONN_FLAGS(sk), sk->bound_dev_if); + RT_CONN_FLAGS(sk), sk); if (tmp < 0) return tmp; @@ -1150,11 +1150,11 @@ struct ip_options *opt; opt = req->af.v4_req.opt; - if(ip_route_output(&rt, ((opt && opt->srr) ? + if(ip_route_output_sk(&rt, ((opt && opt->srr) ? opt->faddr : req->af.v4_req.rmt_addr), req->af.v4_req.loc_addr, - RT_CONN_FLAGS(sk), sk->bound_dev_if)) { + RT_CONN_FLAGS(sk), sk)) { IP_INC_STATS_BH(IpOutNoRoutes); return NULL; } @@ -1733,7 +1733,7 @@ /* Query new route. 
*/ err = ip_route_connect(&rt, daddr, 0, RT_TOS(sk->protinfo.af_inet.tos)|sk->localroute, - sk->bound_dev_if); + sk); if (err) return err; @@ -1781,8 +1781,8 @@ if(sk->protinfo.af_inet.opt && sk->protinfo.af_inet.opt->srr) daddr = sk->protinfo.af_inet.opt->faddr; - err = ip_route_output(&rt, daddr, sk->saddr, - RT_CONN_FLAGS(sk), sk->bound_dev_if); + err = ip_route_output_sk(&rt, daddr, sk->saddr, + RT_CONN_FLAGS(sk), sk); if (!err) { __sk_dst_set(sk, &rt->u.dst); sk->route_caps = rt->u.dst.dev->features; diff -X ~/dontdiff -Naur linux-2.4.16/net/ipv4/udp.c linux-2.4.16-fwmark/net/ipv4/udp.c --- linux-2.4.16/net/ipv4/udp.c Tue Dec 11 14:48:29 2001 +++ linux-2.4.16-fwmark/net/ipv4/udp.c Sat Dec 8 14:47:25 2001 @@ -724,7 +724,7 @@ sk_dst_reset(sk); err = ip_route_connect(&rt, usin->sin_addr.s_addr, sk->saddr, - RT_CONN_FLAGS(sk), sk->bound_dev_if); + RT_CONN_FLAGS(sk), sk); if (err) return err; if ((rt->rt_flags&RTCF_BROADCAST) && !sk->broadcast) {