netfilter/xfrm: add XFRM_IN/XFRM_OUT hooks, make ipt_CLUSTERIP IPsec-aware

Patch against linux-2.6.37.

What it does:

  * include/linux/netfilter*.h: adds the hook points NF_INET_XFRM_IN and
    NF_INET_XFRM_OUT (with NF_IP_* / NF_IP6_* aliases) and raises the
    *_NUMHOOKS values from 5 to 7.

  * net/xfrm/xfrm_input.c, net/xfrm/xfrm_output.c: the calls to
    x->type->input() and x->type->output() are wrapped in NF_HOOK() on the
    new hook points.  A result of -EPERM (packet dropped by a hook) is
    handled on the same path as -EINPROGRESS.

  * net/ipv4/netfilter/ipt_CLUSTERIP.c:
      - clusterip_tg() lets ESP, AH, IPIP and UDP-encapsulated ESP
        (destination port 4500, non-zero SPI) continue without hashing;
        responsibility for those is decided in the XFRM_IN hook instead.
      - a PRE_ROUTING hook marks frames addressed to a configured cluster
        multicast MAC as PACKET_HOST.
      - XFRM_IN / XFRM_OUT hooks hash (daddr, SPI) of the xfrm state and
        accept the packet only on the responsible node.  On the other
        nodes every SEQ_UPDATE_INTERVAL-th inbound packet is still run
        through x->type->input() to advance the replay window, then
        dropped.
      - arp_mangle() now drops ARP replies leaving on an interface other
        than the one the cluster IP is configured on.
      - the node-number computation is factored out into
        clusterip_hash_to_node().

Fix in this revision:

  * The UDP branch of clusterip_tg() used to read the UDP header and the
    four bytes behind it directly via ip_hdr(skb) + ihl*4 without checking
    that this data exists or is linear.  A UDP datagram with fewer than
    four payload bytes made it read past the payload.  It now uses
    skb_header_pointer(); packets too short to carry an SPI fall through
    to the normal hashing path.

NOTE(review): the copy of this patch this revision was rebuilt from had
lost all text in angle brackets and all indentation.  The #include targets
(<linux/etherdevice.h>, <net/xfrm.h>, <linux/netfilter.h> and the context
includes), the MODULE_AUTHOR address and the whitespace of context lines
were restored from the symbols used and the hunk positions - confirm
against a pristine 2.6.37 tree (or apply with "patch -l").

NOTE(review): cip_xfrm_in_hook() returns NF_DROP on the non-responsible
path whatever x->type->input() returned.  xfrm_input() itself treats
-EINPROGRESS from that call as "will be resumed later" - confirm that the
skb cannot be freed by the hook while such a request is still in flight.

NOTE(review): NF_IP_NUMHOOKS / NF_IP6_NUMHOOKS sit in the !__KERNEL__ part
of the headers and NF_INET_NUMHOOKS is shared with userspace; presumably
userspace tools must be rebuilt against the patched headers - confirm.

--- linux-2.6.37/include/linux/netfilter.h	2011-01-05 01:50:19.000000000 +0100
+++ linux-dumm/include/linux/netfilter.h	2011-03-17 09:20:15.609856481 +0100
@@ -47,6 +47,8 @@
 	NF_INET_FORWARD,
 	NF_INET_LOCAL_OUT,
 	NF_INET_POST_ROUTING,
+	NF_INET_XFRM_IN,
+	NF_INET_XFRM_OUT,
 	NF_INET_NUMHOOKS
 };
 
--- linux-2.6.37/include/linux/netfilter_ipv4.h	2011-01-05 01:50:19.000000000 +0100
+++ linux-dumm/include/linux/netfilter_ipv4.h	2011-03-17 09:20:15.609856481 +0100
@@ -48,7 +48,11 @@
 #define NF_IP_LOCAL_OUT		3
 /* Packets about to hit the wire. */
 #define NF_IP_POST_ROUTING	4
-#define NF_IP_NUMHOOKS		5
+/* Packets going into XFRM input transformation. */
+#define NF_IP_XFRM_IN		5
+/* Packets going into XFRM output transformation. */
+#define NF_IP_XFRM_OUT		6
+#define NF_IP_NUMHOOKS		7
 #endif /* ! __KERNEL__ */
 
 enum nf_ip_hook_priorities {
--- linux-2.6.37/include/linux/netfilter_ipv6.h	2011-01-05 01:50:19.000000000 +0100
+++ linux-dumm/include/linux/netfilter_ipv6.h	2011-03-17 09:20:15.609856481 +0100
@@ -52,7 +52,11 @@
 #define NF_IP6_LOCAL_OUT		3
 /* Packets about to hit the wire. */
 #define NF_IP6_POST_ROUTING	4
-#define NF_IP6_NUMHOOKS		5
+/* Packets going into XFRM input transformation. */
+#define NF_IP6_XFRM_IN		5
+/* Packets going into XFRM output transformation. */
+#define NF_IP6_XFRM_OUT		6
+#define NF_IP6_NUMHOOKS		7
 #endif /* ! __KERNEL__ */
 
 
--- linux-2.6.37/net/ipv4/netfilter/ipt_CLUSTERIP.c	2011-01-05 01:50:19.000000000 +0100
+++ linux-dumm/net/ipv4/netfilter/ipt_CLUSTERIP.c	2011-03-17 09:20:15.913856481 +0100
@@ -21,6 +21,7 @@
 #include <linux/udp.h>
 #include <linux/icmp.h>
 #include <linux/if_arp.h>
+#include <linux/etherdevice.h>
 #include <linux/seq_file.h>
 #include <linux/netfilter_arp.h>
 #include <linux/netfilter/x_tables.h>
@@ -30,8 +31,9 @@
 #include <net/net_namespace.h>
 #include <net/checksum.h>
 #include <net/ip.h>
+#include <net/xfrm.h>
 
-#define CLUSTERIP_VERSION "0.8"
+#define CLUSTERIP_VERSION "0.9"
 
 MODULE_LICENSE("GPL");
 MODULE_AUTHOR("Harald Welte <laforge@netfilter.org>");
@@ -113,7 +115,7 @@
 }
 
 static struct clusterip_config *
-__clusterip_config_find(__be32 clusterip)
+clusterip_config_find(__be32 clusterip)
 {
 	struct clusterip_config *c;
 
@@ -125,13 +127,26 @@
 	return NULL;
 }
 
+static struct clusterip_config *
+clusterip_config_find_mac(u_int8_t clustermac[])
+{
+	struct clusterip_config *c;
+
+	list_for_each_entry_rcu(c, &clusterip_configs, list) {
+		if (memcmp(c->clustermac, clustermac, ETH_ALEN) == 0)
+			return c;
+	}
+
+	return NULL;
+}
+
 static inline struct clusterip_config *
 clusterip_config_find_get(__be32 clusterip, int entry)
 {
 	struct clusterip_config *c;
 
 	rcu_read_lock_bh();
-	c = __clusterip_config_find(clusterip);
+	c = clusterip_config_find(clusterip);
 	if (c) {
 		if (unlikely(!atomic_inc_not_zero(&c->refcount)))
 			c = NULL;
@@ -143,6 +158,22 @@
 	return c;
 }
 
+static inline struct clusterip_config *
+clusterip_config_find_get_mac(u_int8_t clustermac[])
+{
+	struct clusterip_config *c;
+
+	rcu_read_lock_bh();
+	c = clusterip_config_find_mac(clustermac);
+	if (c) {
+		if (unlikely(!atomic_inc_not_zero(&c->refcount)))
+			c = NULL;
+	}
+	rcu_read_unlock_bh();
+
+	return c;
+}
+
 static void
 clusterip_config_init_nodelist(struct clusterip_config *c,
 			       const struct ipt_clusterip_tgt_info *i)
@@ -227,6 +258,13 @@
 #endif
 
 static inline u_int32_t
+clusterip_hash_to_node(const struct clusterip_config *c, u64 hash)
+{
+	/* node numbers are 1..n, not 0..n */
+	return ((hash * c->num_total_nodes) >> 32) + 1;
+}
+
+static inline u_int32_t
 clusterip_hashfn(const struct sk_buff *skb,
 		 const struct clusterip_config *config)
 {
@@ -273,8 +311,7 @@
 		break;
 	}
 
-	/* node numbers are 1..n, not 0..n */
-	return (((u64)hashval * config->num_total_nodes) >> 32) + 1;
+	return clusterip_hash_to_node(config, hashval);
 }
 
 static inline int
@@ -308,12 +345,36 @@
 		return NF_DROP;
 	}
 
-	/* special case: ICMP error handling. conntrack distinguishes between
-	 * error messages (RELATED) and information requests (see below) */
-	if (ip_hdr(skb)->protocol == IPPROTO_ICMP &&
-	    (ctinfo == IP_CT_RELATED ||
-	     ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY))
+	switch (ip_hdr(skb)->protocol) {
+	case IPPROTO_ICMP:
+		/* ICMP error handling: conntrack distinguishes between error
+		 * messages (RELATED) and information requests (see below) */
+		if (ctinfo == IP_CT_RELATED ||
+		    ctinfo == IP_CT_RELATED + IP_CT_IS_REPLY)
+			return XT_CONTINUE;
+		break;
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+	case IPPROTO_IPIP:
+		/* responsibility for IPsec is handled in xfrm input hook */
 		return XT_CONTINUE;
+	case IPPROTO_UDP: {
+		/* UDP 4500 with a non-zero SPI is encapsulated ESP */
+		struct {
+			struct udphdr	udp;
+			__be32		spi;
+		} _hdr, *hdr;
+
+		/* the datagram may be too short or not linear */
+		hdr = skb_header_pointer(skb, ip_hdrlen(skb),
+					 sizeof(_hdr), &_hdr);
+		if (hdr && hdr->udp.dest == htons(4500) && hdr->spi)
+			return XT_CONTINUE;
+		break;
+	}
+	default:
+		break;
+	}
 
 	/* ip_conntrack_icmp guarantees us that we only have ICMP_ECHO,
 	 * TIMESTAMP, INFO_REQUEST or ADDRESS type icmp packets from here
@@ -521,14 +582,11 @@
 
 	/* normally the linux kernel always replies to arp queries of
 	 * addresses on different interfacs.  However, in the CLUSTERIP case
-	 * this wouldn't work, since we didn't subscribe the mcast group on
-	 * other interfaces */
+	 * this wouldn't work. We need the multicast MAC to identify packets
+	 * to pass to forwarding, so drop that ARP. */
 	if (c->dev != out) {
-		pr_debug("not mangling arp reply on different "
-			 "interface: cip'%s'-skb'%s'\n",
-			 c->dev->name, out->name);
 		clusterip_config_put(c);
-		return NF_ACCEPT;
+		return NF_DROP;
 	}
 
 	/* mangle reply hardware address */
@@ -552,6 +610,154 @@
 };
 
 /***********************************************************************
+ * IPSEC FORWARDING HOOKS
+ ***********************************************************************/
+
+static unsigned int
+cip_pre_routing_hook(unsigned int hook,
+		     struct sk_buff *skb,
+		     const struct net_device *in,
+		     const struct net_device *out,
+		     int (*okfn)(struct sk_buff *))
+{
+	if (skb_mac_header(skb) < skb->head ||
+	    skb_mac_header(skb) + ETH_HLEN > skb->data ||
+	    !is_multicast_ether_addr(eth_hdr(skb)->h_dest))
+		return NF_ACCEPT;
+
+	/* if we receive a packet for a CLUSTERIP multicast address,
+	 * we let it pass through ip_forward. */
+	if (clusterip_config_find_mac(eth_hdr(skb)->h_dest))
+		skb->pkt_type = PACKET_HOST;
+
+	return NF_ACCEPT;
+}
+
+static struct nf_hook_ops cip_pre_routing_ops __read_mostly = {
+	.hook = cip_pre_routing_hook,
+	.owner = THIS_MODULE,
+	.pf = PF_INET,
+	.hooknum = NF_INET_PRE_ROUTING,
+	.priority = -1,
+};
+
+static inline u_int32_t
+clusterip_hashfn_xfrm(const struct xfrm_state *x,
+		      const struct clusterip_config *config)
+{
+	unsigned long hashval;
+
+	hashval = jhash_2words(ntohl(x->id.daddr.a4), ntohl(x->id.spi),
+			       config->hash_initval);
+	return clusterip_hash_to_node(config, hashval);
+}
+
+/* interval to process packet not responsible */
+#define SEQ_UPDATE_INTERVAL 16
+
+static unsigned int
+cip_xfrm_in_hook(unsigned int hook,
+		 struct sk_buff *skb,
+		 const struct net_device *in,
+		 const struct net_device *out,
+		 int (*okfn)(struct sk_buff *))
+{
+	struct clusterip_config *c;
+	struct xfrm_state *x;
+	u_int32_t hash;
+	__be32 seq;
+	unsigned int res = NF_DROP;
+
+	x = skb->sp->xvec[skb->sp->len - 1];
+
+	switch (x->id.proto) {
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+		break;
+	case IPPROTO_IPIP:
+	case IPPROTO_COMP:
+		/* FIXME: Accept IPCOMP if packet was encrypted only */
+	default:
+		return NF_ACCEPT;
+	}
+
+	c = clusterip_config_find_get(x->id.daddr.a4, 0);
+	if (!c)
+		return NF_ACCEPT;
+
+	/* process every n-th packet to update sequence counter, but drop it */
+	hash = clusterip_hashfn_xfrm(x, c);
+	seq = XFRM_SKB_CB(skb)->seq.input;
+	if (clusterip_responsible(c, hash))
+		res = NF_ACCEPT;
+	else if (ntohl(seq) % SEQ_UPDATE_INTERVAL == 0) {
+		if (x->type->input(x, skb) > 0) {
+			spin_lock(&x->lock);
+
+			if (x->props.replay_window)
+				xfrm_replay_advance(x, seq);
+
+			spin_unlock(&x->lock);
+		}
+	}
+	clusterip_config_put(c);
+	return res;
+}
+
+static struct nf_hook_ops cip_xfrm_in_ops __read_mostly = {
+	.hook = cip_xfrm_in_hook,
+	.owner = THIS_MODULE,
+	.pf = PF_INET,
+	.hooknum = NF_INET_XFRM_IN,
+	.priority = -1,
+};
+
+static unsigned int
+cip_xfrm_out_hook(unsigned int hook,
+		  struct sk_buff *skb,
+		  const struct net_device *in,
+		  const struct net_device *out,
+		  int (*okfn)(struct sk_buff *))
+{
+	struct clusterip_config *c;
+	struct xfrm_state *x;
+	u_int32_t hash;
+	unsigned int res = NF_DROP;
+
+	x = skb_dst(skb)->xfrm;
+
+	switch (x->id.proto) {
+	case IPPROTO_ESP:
+	case IPPROTO_AH:
+		break;
+	case IPPROTO_IPIP:
+	case IPPROTO_COMP:
+		/* FIXME: Skip IPCOMP processing if we are not responsible */
+	default:
+		return NF_ACCEPT;
+	}
+
+	c = clusterip_config_find_get(x->props.saddr.a4, 0);
+	if (!c)
+		return NF_ACCEPT;
+
+	hash = clusterip_hashfn_xfrm(x, c);
+	if (clusterip_responsible(c, hash))
+		res = NF_ACCEPT;
+
+	clusterip_config_put(c);
+	return res;
+}
+
+static struct nf_hook_ops cip_xfrm_out_ops __read_mostly = {
+	.hook = cip_xfrm_out_hook,
+	.owner = THIS_MODULE,
+	.pf = PF_INET,
+	.hooknum = NF_INET_XFRM_OUT,
+	.priority = -1,
+};
+
+/***********************************************************************
  * PROC DIR HANDLING
  ***********************************************************************/
 
@@ -709,6 +915,18 @@
 	if (ret < 0)
 		goto cleanup_target;
 
+	ret = nf_register_hook(&cip_pre_routing_ops);
+	if (ret < 0)
+		goto cleanup_arp;
+
+	ret = nf_register_hook(&cip_xfrm_in_ops);
+	if (ret < 0)
+		goto cleanup_pre;
+
+	ret = nf_register_hook(&cip_xfrm_out_ops);
+	if (ret < 0)
+		goto cleanup_xfrm_in;
+
 #ifdef CONFIG_PROC_FS
 	clusterip_procdir = proc_mkdir("ipt_CLUSTERIP", init_net.proc_net);
 	if (!clusterip_procdir) {
@@ -724,8 +942,14 @@
 
 #ifdef CONFIG_PROC_FS
 cleanup_hook:
-	nf_unregister_hook(&cip_arp_ops);
+	nf_unregister_hook(&cip_xfrm_out_ops);
 #endif /* CONFIG_PROC_FS */
+cleanup_xfrm_in:
+	nf_unregister_hook(&cip_xfrm_in_ops);
+cleanup_pre:
+	nf_unregister_hook(&cip_pre_routing_ops);
+cleanup_arp:
+	nf_unregister_hook(&cip_arp_ops);
 cleanup_target:
 	xt_unregister_target(&clusterip_tg_reg);
 	return ret;
@@ -738,6 +962,9 @@
 	remove_proc_entry(clusterip_procdir->name, clusterip_procdir->parent);
 #endif
 	nf_unregister_hook(&cip_arp_ops);
+	nf_unregister_hook(&cip_pre_routing_ops);
+	nf_unregister_hook(&cip_xfrm_in_ops);
+	nf_unregister_hook(&cip_xfrm_out_ops);
 	xt_unregister_target(&clusterip_tg_reg);
 
 	/* Wait for completion of call_rcu_bh()'s (clusterip_config_rcu_free) */
--- linux-2.6.37/net/xfrm/xfrm_input.c	2011-01-05 01:50:19.000000000 +0100
+++ linux-dumm/net/xfrm/xfrm_input.c	2011-03-17 09:20:16.041856481 +0100
@@ -9,6 +9,7 @@
 
 #include <linux/slab.h>
 #include <linux/module.h>
+#include <linux/netfilter.h>
 #include <linux/netdevice.h>
 #include <net/dst.h>
 #include <net/ip.h>
@@ -102,6 +103,14 @@
 }
 EXPORT_SYMBOL(xfrm_prepare_input);
 
+static int xfrm_type_input(struct sk_buff *skb)
+{
+	struct xfrm_state *x;
+
+	x = skb->sp->xvec[skb->sp->len - 1];
+	return x->type->input(x, skb);
+}
+
 int xfrm_input(struct sk_buff *skb, int nexthdr, __be32 spi, int encap_type)
 {
 	struct net *net = dev_net(skb->dev);
@@ -186,9 +195,10 @@
 
 		XFRM_SKB_CB(skb)->seq.input = seq;
 
-		nexthdr = x->type->input(x, skb);
+		nexthdr = NF_HOOK(family, NF_INET_XFRM_IN, skb,
+				  skb->dev, NULL, xfrm_type_input);
 
-		if (nexthdr == -EINPROGRESS)
+		if (nexthdr == -EINPROGRESS || nexthdr == -EPERM)
 			return 0;
 
 resume:
--- linux-2.6.37/net/xfrm/xfrm_output.c	2011-01-05 01:50:19.000000000 +0100
+++ linux-dumm/net/xfrm/xfrm_output.c	2011-03-17 09:20:16.041856481 +0100
@@ -38,6 +38,13 @@
 	return pskb_expand_head(skb, nhead, ntail, GFP_ATOMIC);
 }
 
+static int xfrm_type_output(struct sk_buff *skb)
+{
+	struct xfrm_state *x = skb_dst(skb)->xfrm;
+
+	return x->type->output(x, skb);
+}
+
 static int xfrm_output_one(struct sk_buff *skb, int err)
 {
 	struct dst_entry *dst = skb_dst(skb);
@@ -85,8 +92,9 @@
 
 		spin_unlock_bh(&x->lock);
 
-		err = x->type->output(x, skb);
-		if (err == -EINPROGRESS)
+		err = NF_HOOK(dst->ops->family, NF_INET_XFRM_OUT, skb,
+			      NULL, dst->dev, xfrm_type_output);
+		if (err == -EINPROGRESS || err == -EPERM)
 			goto out_exit;
 
 resume: