--- v2.2.20/linux/include/net/ip_fib.h Sat Oct 21 12:10:47 2000 +++ linux/include/net/ip_fib.h Fri Dec 14 02:23:44 2001 @@ -160,7 +160,8 @@ extern __inline__ void fib_select_default(const struct rt_key *key, struct fib_result *res) { - if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + if ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) || + FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST) main_table->tb_select_default(main_table, key, res); } @@ -171,6 +172,7 @@ extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; extern int fib_lookup(const struct rt_key *key, struct fib_result *res); extern struct fib_table *__fib_new_table(int id); +extern __inline__ int fib_result_table(struct fib_result *res); extern __inline__ struct fib_table *fib_get_table(int id) { @@ -214,6 +216,7 @@ extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, struct fib_info *fi); +extern int fib_num_down_nh_devs(struct fib_info *fi); extern int fib_sync_down(u32 local, struct device *dev, int force); extern int fib_sync_up(struct device *dev); extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, --- v2.2.20/linux/include/net/ip_masq.h Sat Aug 4 12:52:32 2001 +++ linux/include/net/ip_masq.h Fri Dec 14 02:24:21 2001 @@ -76,6 +76,7 @@ #define IP_MASQ_F_NO_SPORT 0x0008 /* no sport set yet */ #define IP_MASQ_F_DLOOSE 0x0010 /* loose dest binding */ +#define IP_MASQ_F_NOREROUTE 0x0020 /* rerouting is not needed */ #define IP_MASQ_F_NO_REPLY 0x0080 /* no reply yet from outside */ #define IP_MASQ_F_HASHED 0x0100 /* hashed entry */ @@ -179,7 +180,7 @@ extern struct list_head ip_masq_d_table[IP_MASQ_TAB_SIZE]; extern const char * ip_masq_state_name(int state); extern struct ip_masq_hook *ip_masq_user_hook; -extern u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope); +extern int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr, struct ip_masq *ms); 
/* * * IP_MASQ_APP: IP application masquerading definitions --- v2.2.20/linux/include/net/route.h Sat Oct 21 12:10:57 2000 +++ linux/include/net/route.h Fri Dec 14 02:24:50 2001 @@ -57,6 +57,7 @@ __u32 src; int iif; int oif; + __u32 gw; #ifdef CONFIG_IP_ROUTE_FWMARK __u32 fwmark; #endif @@ -111,7 +112,7 @@ u32 src, u8 tos, struct device *dev); extern void ip_rt_advice(struct rtable **rp, int advice); extern void rt_cache_flush(int how); -extern int ip_route_output(struct rtable **, u32 dst, u32 src, u32 tos, int oif); +extern int ip_route_output(struct rtable **, u32 dst, u32 src, u32 tos, int oif, u32 gw); extern int ip_route_input(struct sk_buff*, u32 dst, u32 src, u8 tos, struct device *devin); extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); extern void ip_rt_update_pmtu(struct dst_entry *dst, unsigned mtu); @@ -140,14 +141,14 @@ extern __inline__ int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif) { int err; - err = ip_route_output(rp, dst, src, tos, oif); + err = ip_route_output(rp, dst, src, tos, oif, 0); if (err || (dst && src)) return err; dst = (*rp)->rt_dst; src = (*rp)->rt_src; ip_rt_put(*rp); *rp = NULL; - return ip_route_output(rp, dst, src, tos, oif); + return ip_route_output(rp, dst, src, tos, oif, 0); } #endif /* _ROUTE_H */ --- v2.2.20/linux/include/linux/rtnetlink.h Sat Oct 21 12:11:58 2000 +++ linux/include/linux/rtnetlink.h Fri Dec 14 02:23:44 2001 @@ -230,6 +230,8 @@ #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_SUSPECT 8 /* We don't know the real state */ +#define RTNH_F_BADSTATE (RTNH_F_DEAD | RTNH_F_SUSPECT) /* Macros to handle hexthops */ --- v2.2.20/linux/net/ipv4/arp.c Sun Nov 4 10:16:16 2001 +++ linux/net/ipv4/arp.c Fri Dec 14 02:24:50 2001 @@ -311,18 +311,16 @@ u32 saddr; u8 *dst_ha = NULL; struct device *dev = neigh->dev; - 
struct device *dev2; - struct in_device *in_dev2; u32 target = *(u32*)neigh->primary_key; int probes = neigh->probes; + struct rtable *rt; - if (skb && - (dev2 = ip_dev_find(skb->nh.iph->saddr)) != NULL && - (in_dev2 = dev2->ip_ptr) != NULL && - !IN_DEV_HIDDEN(in_dev2)) - saddr = skb->nh.iph->saddr; - else - saddr = inet_select_addr(dev, target, RT_SCOPE_LINK); + if (ip_route_output(&rt, target, 0, 0, dev->ifindex, 0) < 0) + return; + saddr = rt->rt_src; + ip_rt_put(rt); + if (!saddr) + return; if ((probes -= neigh->parms->ucast_probes) < 0) { if (!(neigh->nud_state&NUD_VALID)) @@ -345,7 +343,7 @@ int flag = 0; //unsigned long now; - if (ip_route_output(&rt, sip, tip, 0, 0) < 0) + if (ip_route_output(&rt, sip, tip, 0, 0, 0) < 0) return 1; if (rt->u.dst.dev != dev) { net_statistics.ArpFilter++; @@ -834,7 +832,7 @@ r->arp_flags |= ATF_COM; if (dev == NULL) { struct rtable * rt; - if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) + if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0, 0)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); @@ -919,7 +917,7 @@ if (dev == NULL) { struct rtable * rt; - if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0)) != 0) + if ((err = ip_route_output(&rt, ip, 0, RTO_ONLINK, 0, 0)) != 0) return err; dev = rt->u.dst.dev; ip_rt_put(rt); --- v2.2.20/linux/net/ipv4/fib_hash.c Sat Oct 21 12:10:50 2000 +++ linux/net/ipv4/fib_hash.c Fri Dec 14 02:24:50 2001 @@ -69,6 +69,7 @@ struct fib_info *fn_info; #define FIB_INFO(f) ((f)->fn_info) fn_key_t fn_key; + int fn_last_dflt; u8 fn_tos; u8 fn_type; u8 fn_scope; @@ -302,68 +303,109 @@ return 1; } -static int fn_hash_last_dflt=-1; - -static int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx) +static int fib_detect_death(struct fib_info *fi, int order, int last_dflt, + struct fib_info **last_resort, int *last_idx, + int *last_nhsel, const struct rt_key *key) { struct neighbour *n; - int state = NUD_NONE; + int nhsel; + int state; + 
struct fib_nh * nh; + u32 dst; + int dead = 1; + + /* change_nexthops(fi) { */ + for (nhsel = 0, nh = fi->fib_nh; nhsel < fi->fib_nhs; nh++, nhsel++) { + if (key->oif && key->oif != nh->nh_oif) + continue; + if (key->gw && key->gw != nh->nh_gw && nh->nh_gw) + continue; + if (nh->nh_flags & RTNH_F_DEAD) + continue; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); - if (n) { - state = n->nud_state; - neigh_release(n); - } - if (state==NUD_REACHABLE) - return 0; - if ((state&NUD_VALID) && order != fn_hash_last_dflt) - return 0; - if ((state&NUD_VALID) || - (*last_idx<0 && order > fn_hash_last_dflt)) { - *last_resort = fi; - *last_idx = order; + nh->nh_flags &= ~RTNH_F_SUSPECT; + if (nh->nh_dev->flags & IFF_NOARP) { + dead = 0; + continue; + } + + dst = nh->nh_gw; + if (!nh->nh_gw || nh->nh_scope != RT_SCOPE_LINK) + dst = key->dst; + + state = NUD_NONE; + n = neigh_lookup(&arp_tbl, &dst, nh->nh_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state==NUD_REACHABLE || + ((state&NUD_VALID) && order != last_dflt)) { + dead = 0; + continue; + } + if (!(state&NUD_VALID)) { + nh->nh_flags |= RTNH_F_SUSPECT; + } + if (!dead) continue; + if ((state&NUD_VALID) || + (*last_idx<0 && order >= last_dflt)) { + *last_resort = fi; + *last_idx = order; + *last_nhsel = nhsel; + } } - return 1; + /* } endfor_nexthops(fi) */ + + return dead; } static void fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) { - int order, last_idx; - struct fib_node *f; + int order, last_idx, last_dflt, last_nhsel; + struct fib_node *f, *first_node; struct fib_info *fi = NULL; struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash*)tb->tb_data; - struct fn_zone *fz = t->fn_zones[0]; + struct fn_zone *fz = t->fn_zones[res->prefixlen]; + fn_key_t k; if (fz == NULL) return; + k = fz_key(key->dst, fz); + last_dflt = -2; + first_node = NULL; last_idx = -1; last_resort = NULL; + last_nhsel = 0; order = -1; - for (f = 
fz->fz_hash[0]; f; f = f->fn_next) { + for (f = fz_chain(k, fz); f; f = f->fn_next) { struct fib_info *next_fi = FIB_INFO(f); - if ((f->fn_state&FN_S_ZOMBIE) || + if (!fn_key_eq(k, f->fn_key) || + (f->fn_state&FN_S_ZOMBIE) || f->fn_scope != res->scope || +#ifdef CONFIG_IP_ROUTE_TOS + (f->fn_tos && f->fn_tos != key->tos) || +#endif f->fn_type != RTN_UNICAST) continue; if (next_fi->fib_priority > res->fi->fib_priority) break; - if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; f->fn_state |= FN_S_ACCESSED; - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { + if (!first_node) { + last_dflt = f->fn_last_dflt; + first_node = f; + } + if (fi && !fib_detect_death(fi, order, last_dflt, + &last_resort, &last_idx, &last_nhsel, key)) { res->fi = fi; - fn_hash_last_dflt = order; + first_node->fn_last_dflt = order; return; } fi = next_fi; @@ -371,19 +413,28 @@ } if (order<=0 || fi==NULL) { - fn_hash_last_dflt = -1; + if (fi && fi->fib_nhs > 1 && + fib_detect_death(fi, order, last_dflt, + &last_resort, &last_idx, &last_nhsel, key) && + last_resort == fi) { + fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; + } + if (first_node) first_node->fn_last_dflt = -1; return; } - if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { + if (!fib_detect_death(fi, order, last_dflt, &last_resort, &last_idx, + &last_nhsel, key)) { res->fi = fi; - fn_hash_last_dflt = order; + first_node->fn_last_dflt = order; return; } - if (last_idx >= 0) + if (last_idx >= 0) { res->fi = last_resort; - fn_hash_last_dflt = last_idx; + last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; + first_node->fn_last_dflt = last_idx; + } } #define FIB_SCAN(f, fp) \ @@ -547,6 +598,7 @@ memset(new_f, 0, sizeof(struct fib_node)); + new_f->fn_last_dflt = -1; new_f->fn_key = key; #ifdef CONFIG_IP_ROUTE_TOS new_f->fn_tos = tos; @@ -686,7 +738,10 @@ while ((f = *fp) != NULL) { struct fib_info *fi 
= FIB_INFO(f); - if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + if (fi && ((f->fn_state&FN_S_ZOMBIE) || + (fi->fib_flags&RTNH_F_DEAD && + (fi->fib_protocol != RTPROT_STATIC || + !fib_num_down_nh_devs(fi))))) { *fp = f->fn_next; synchronize_bh(); --- v2.2.20/linux/net/ipv4/fib_frontend.c Sat Oct 21 12:10:47 2000 +++ linux/net/ipv4/fib_frontend.c Fri Dec 14 02:24:50 2001 @@ -54,6 +54,8 @@ struct fib_table *local_table; struct fib_table *main_table; +#define FIB_RES_TABLE(r) (RT_TABLE_MAIN) + #else #define RT_TABLE_MIN 1 @@ -71,6 +73,7 @@ return tb; } +#define FIB_RES_TABLE(r) (fib_result_table(r)) #endif /* CONFIG_IP_MULTIPLE_TABLES */ @@ -194,11 +197,15 @@ struct in_device *in_dev = dev->ip_ptr; struct rt_key key; struct fib_result res; + int table; + unsigned char prefixlen; + unsigned char scope; key.dst = src; key.src = dst; key.tos = tos; key.oif = 0; + key.gw = 0; key.iif = oif; key.scope = RT_SCOPE_UNIVERSE; @@ -209,24 +216,25 @@ if (res.type != RTN_UNICAST) return -EINVAL; *spec_dst = FIB_RES_PREFSRC(res); - if (itag) - fib_combine_itag(itag, &res); -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) -#else + fib_combine_itag(itag, &res); if (FIB_RES_DEV(res) == dev) -#endif return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; if (in_dev->ifa_list == NULL) goto last_resort; - if (IN_DEV_RPFILTER(in_dev)) - return -EINVAL; + table = FIB_RES_TABLE(&res); + prefixlen = res.prefixlen; + scope = res.scope; key.oif = dev->ifindex; - if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST) { + if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST && + ((table == FIB_RES_TABLE(&res) && res.prefixlen >= prefixlen && + res.scope >= scope) || + !IN_DEV_RPFILTER(in_dev))) { *spec_dst = FIB_RES_PREFSRC(res); return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } + if (IN_DEV_RPFILTER(in_dev)) + return -EINVAL; return 0; last_resort: @@ -543,6 +551,8 @@ switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); + 
if (ifa->ifa_dev && ifa->ifa_dev->dev) + fib_sync_up(ifa->ifa_dev->dev); rt_cache_flush(-1); break; case NETDEV_DOWN: @@ -573,9 +583,7 @@ for_ifa(in_dev) { fib_add_ifaddr(ifa); } endfor_ifa(in_dev); -#ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); -#endif rt_cache_flush(-1); break; case NETDEV_DOWN: --- v2.2.20/linux/net/ipv4/fib_rules.c Wed Dec 13 11:19:12 2000 +++ linux/net/ipv4/fib_rules.c Fri Dec 14 02:23:45 2001 @@ -265,6 +265,11 @@ } } +int fib_result_table(struct fib_result *res) +{ + return res->r->r_table; +} + int fib_lookup(const struct rt_key *key, struct fib_result *res) { int err; @@ -320,7 +325,8 @@ void fib_select_default(const struct rt_key *key, struct fib_result *res) { if (res->r && res->r->r_action == RTN_UNICAST && - FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { + ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) || + FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST)) { struct fib_table *tb; if ((tb = fib_get_table(res->r->r_table)) != NULL) tb->tb_select_default(tb, key, res); --- v2.2.20/linux/net/ipv4/fib_semantics.c Sat Oct 21 12:10:47 2000 +++ linux/net/ipv4/fib_semantics.c Fri Dec 14 02:24:50 2001 @@ -127,7 +127,7 @@ #ifdef CONFIG_NET_CLS_ROUTE nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_BADSTATE)) return -1; onh++; } endfor_nexthops(fi); @@ -145,7 +145,7 @@ nfi->fib_mtu == fi->fib_mtu && nfi->fib_rtt == fi->fib_rtt && nfi->fib_window == fi->fib_window && - ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_BADSTATE) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } endfor_fib_info(); @@ -170,6 +170,30 @@ return -1; } +/* + * Return 0 only when we are sure that the preferred source is deleted + * or when all nexthop devices are removed + */ + +int fib_num_down_nh_devs(struct fib_info *fi) +{ +struct in_device *in_dev; +struct device *dev; +int dead = 0; + + 
change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) + return 0; + dev = dev_get_by_index(nh->nh_oif); + if (dev && !(dev->flags&IFF_UP) && + ((in_dev = dev->ip_ptr) != NULL) && + in_dev->ifa_list) + dead ++; + } endfor_nexthops(fi) + /* dead>0: All are marked DEAD but there is one in DOWN state */ + return dead; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) @@ -354,11 +378,25 @@ if (key.scope < RT_SCOPE_LINK) key.scope = RT_SCOPE_LINK; - if ((err = fib_lookup(&key, &res)) != 0) - return err; - nh->nh_scope = res.scope; - nh->nh_oif = FIB_RES_OIF(res); - nh->nh_dev = FIB_RES_DEV(res); + err = fib_lookup(&key, &res); + if (err) { + if (err == -ENETUNREACH && + fi->fib_protocol == RTPROT_STATIC) { + struct device *dev; + + dev = dev_get_by_index(nh->nh_oif); + if (dev == NULL || dev->flags & IFF_UP || + inet_addr_type(nh->nh_gw) == RTN_LOCAL) + return err; + nh->nh_flags |= RTNH_F_DEAD; + nh->nh_scope = RT_SCOPE_LINK; + nh->nh_dev = dev; + } else return err; + } else { + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + nh->nh_dev = FIB_RES_DEV(res); + } } else { struct in_device *in_dev; @@ -368,8 +406,11 @@ in_dev = inetdev_by_index(nh->nh_oif); if (in_dev == NULL) return -ENODEV; - if (!(in_dev->dev->flags&IFF_UP)) - return -ENETDOWN; + if (!(in_dev->dev->flags&IFF_UP)) { + if (fi->fib_protocol != RTPROT_STATIC) + return -ENETDOWN; + nh->nh_flags |= RTNH_F_DEAD; + } nh->nh_dev = in_dev->dev; nh->nh_scope = RT_SCOPE_HOST; } @@ -490,10 +531,16 @@ if (nh->nh_dev == NULL) goto failure; } else { + int dead = 0; change_nexthops(fi) { if ((err = fib_check_nh(r, fi, nh)) != 0) goto failure; + if (nh->nh_flags & RTNH_F_DEAD) + dead ++; } endfor_nexthops(fi) + if (dead >= fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + } } if (fi->fib_prefsrc) { @@ -553,8 +600,12 @@ for_nexthops(fi) { if (nh->nh_flags&RTNH_F_DEAD) continue; - if (!key->oif || key->oif == nh->nh_oif) - break; + if (key->oif && 
key->oif != nh->nh_oif) + continue; + if (key->gw && key->gw != nh->nh_gw && + nh->nh_gw) + continue; + break; } #ifdef CONFIG_IP_ROUTE_MULTIPATH if (nhsel < fi->fib_nhs) { @@ -858,20 +909,22 @@ return ret; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* Dead device goes up. We wake up dead nexthops. - It takes sense only on multipath routes. */ int fib_sync_up(struct device *dev) { - int ret = 0; + struct rt_key key; + struct fib_result res; + int ret, rep; +repeat: if (!(dev->flags&IFF_UP)) return 0; + ret = 0; + rep = 0; for_fib_info() { int alive = 0; @@ -880,23 +933,40 @@ alive++; continue; } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + if (nh->nh_oif != dev->ifindex || dev->ip_ptr == NULL) continue; - if (nh->nh_dev != dev || dev->ip_ptr == NULL) + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) continue; + if (nh->nh_gw && fi->fib_protocol == RTPROT_STATIC) { + memset(&key, 0, sizeof(key)); + key.dst = nh->nh_gw; + key.oif = nh->nh_oif; + key.scope = nh->nh_scope; + if (fib_lookup(&key, &res) != 0) + continue; + if (res.type != RTN_UNICAST) + continue; + rep = 1; + } alive++; +#ifdef CONFIG_IP_ROUTE_MULTIPATH nh->nh_power = 0; +#endif nh->nh_flags &= ~RTNH_F_DEAD; } endfor_nexthops(fi) - if (alive == fi->fib_nhs) { + if (alive > 0) { fi->fib_flags &= ~RTNH_F_DEAD; ret++; } } endfor_fib_info(); + if (rep) + goto repeat; return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* The algorithm is suboptimal, but it provides really fair weighted route distribution. 
@@ -905,12 +975,37 @@ void fib_select_multipath(const struct rt_key *key, struct fib_result *res) { struct fib_info *fi = res->fi; - int w; + int w, alive; + + if (key->oif) { + int sel = -1; + w = -1; + change_nexthops(fi) { + if (key->oif != nh->nh_oif) + continue; + if (key->gw && key->gw != nh->nh_gw && + nh->nh_gw) + continue; + if (!(nh->nh_flags&RTNH_F_BADSTATE)) { + if (nh->nh_power > w) { + w = nh->nh_power; + sel = nhsel; + } + } + } endfor_nexthops(fi); + if (sel >= 0) { + res->nh_sel = sel; + return; + } + goto last_resort; + } + +repeat: if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (!(nh->nh_flags&RTNH_F_BADSTATE)) { power += nh->nh_weight; nh->nh_power = nh->nh_weight; } @@ -918,8 +1013,9 @@ fi->fib_power = power; #if 1 if (power <= 0) { - printk(KERN_CRIT "impossible 777\n"); - return; + goto last_resort; + /* printk(KERN_CRIT "impossible 777\n"); */ + /* return; */ } #endif } @@ -931,14 +1027,34 @@ w = jiffies % fi->fib_power; + alive = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if (!(nh->nh_flags&RTNH_F_BADSTATE) && nh->nh_power) { if ((w -= nh->nh_power) <= 0) { nh->nh_power--; fi->fib_power--; res->nh_sel = nhsel; return; } + alive = 1; + } + } endfor_nexthops(fi); + if (alive) { + fi->fib_power = 0; + goto repeat; + } + +last_resort: + + for_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (key->oif && key->oif != nh->nh_oif) + continue; + if (key->gw && key->gw != nh->nh_gw && + nh->nh_gw) + continue; + res->nh_sel = nhsel; + return; } } endfor_nexthops(fi); --- v2.2.20/linux/net/ipv4/icmp.c Sat Aug 4 12:52:33 2001 +++ linux/net/ipv4/icmp.c Fri Dec 14 02:24:50 2001 @@ -492,7 +492,7 @@ ipc.opt = &icmp_param->replyopts; if (ipc.opt->srr) daddr = icmp_param->replyopts.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0, 0)) return; 
if (icmpv4_xrlim_allow(rt, icmp_param->icmph.type, icmp_param->icmph.code)) { @@ -614,7 +614,7 @@ * fast routing cache at first. Otherwise an attacker can * grow the routing table. */ - if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0)) + if (ip_route_output(&rt, iph->saddr, saddr, RT_TOS(tos), 0, 0)) return; if (ip_options_echo(&icmp_param.replyopts, skb_in)) @@ -637,7 +637,7 @@ ipc.opt = &icmp_param.replyopts; if (icmp_param.replyopts.srr) { ip_rt_put(rt); - if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0)) + if (ip_route_output(&rt, icmp_param.replyopts.faddr, saddr, RT_TOS(tos), 0, 0)) return; } --- v2.2.20/linux/net/ipv4/igmp.c Sun Nov 4 10:16:16 2001 +++ linux/net/ipv4/igmp.c Fri Dec 14 02:24:50 2001 @@ -164,7 +164,7 @@ if (type == IGMP_HOST_LEAVE_MESSAGE) dst = IGMP_ALL_ROUTER; - if (ip_route_output(&rt, dst, 0, 0, dev->ifindex)) + if (ip_route_output(&rt, dst, 0, 0, dev->ifindex, 0)) return -1; if (rt->rt_src == 0) { ip_rt_put(rt); @@ -527,7 +527,7 @@ return NULL; } - if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0)) { + if (!dev && !ip_route_output(&rt, imr->imr_multiaddr.s_addr, 0, 0, 0, 0)) { dev = rt->u.dst.dev; ip_rt_put(rt); } --- v2.2.20/linux/net/ipv4/ipip.c Sat Oct 21 12:10:50 2000 +++ linux/net/ipv4/ipip.c Fri Dec 14 02:24:50 2001 @@ -400,7 +400,7 @@ skb2->nh.raw = skb2->data; /* Try to guess incoming interface */ - if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0, 0)) { kfree_skb(skb2); return; } @@ -410,7 +410,7 @@ if (rt->rt_flags&RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; - if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0, 0) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); @@ -516,7 +516,7 @@ goto tx_error_icmp; } - if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + if 
(ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link, 0)) { tunnel->stat.tx_carrier_errors++; goto tx_error_icmp; } @@ -775,7 +775,7 @@ if (iph->daddr) { struct rtable *rt; - if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link, 0)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } --- v2.2.20/linux/net/ipv4/ipmr.c Sun Nov 4 10:16:16 2001 +++ linux/net/ipv4/ipmr.c Fri Dec 14 02:24:50 2001 @@ -1049,11 +1049,11 @@ #endif if (vif->flags&VIFF_TUNNEL) { - if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link)) + if (ip_route_output(&rt, vif->remote, vif->local, RT_TOS(iph->tos), vif->link, 0)) return; encap = sizeof(struct iphdr); } else { - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link)) + if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos), vif->link, 0)) return; } --- v2.2.20/linux/net/ipv4/ip_gre.c Sat Oct 21 12:10:50 2000 +++ linux/net/ipv4/ip_gre.c Fri Dec 14 02:24:50 2001 @@ -471,7 +471,7 @@ skb2->nh.raw = skb2->data; /* Try to guess incoming interface */ - if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0)) { + if (ip_route_output(&rt, eiph->saddr, 0, RT_TOS(eiph->tos), 0, 0)) { kfree_skb(skb2); return; } @@ -481,7 +481,7 @@ if (rt->rt_flags&RTCF_LOCAL) { ip_rt_put(rt); rt = NULL; - if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0) || + if (ip_route_output(&rt, eiph->daddr, eiph->saddr, eiph->tos, 0, 0) || rt->u.dst.dev->type != ARPHRD_IPGRE) { ip_rt_put(rt); kfree_skb(skb2); @@ -672,7 +672,7 @@ tos &= ~1; } - if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link)) { + if (ip_route_output(&rt, dst, tiph->saddr, RT_TOS(tos), tunnel->parms.link, 0)) { tunnel->stat.tx_carrier_errors++; goto tx_error; } @@ -1026,7 +1026,7 @@ struct rtable *rt; if (ip_route_output(&rt, t->parms.iph.daddr, t->parms.iph.saddr, RT_TOS(t->parms.iph.tos), - 
t->parms.link)) { + t->parms.link, 0)) { MOD_DEC_USE_COUNT; return -EADDRNOTAVAIL; } @@ -1096,7 +1096,7 @@ if (iph->daddr) { struct rtable *rt; - if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link)) { + if (!ip_route_output(&rt, iph->daddr, iph->saddr, RT_TOS(iph->tos), tunnel->parms.link, 0)) { tdev = rt->u.dst.dev; ip_rt_put(rt); } --- v2.2.20/linux/net/ipv4/ip_forward.c Sat Oct 21 12:11:59 2000 +++ linux/net/ipv4/ip_forward.c Fri Dec 14 02:24:21 2001 @@ -182,9 +182,20 @@ return -1; } - if (fw_res) + if (fw_res) { + rt = (struct rtable*)skb->dst; + dev2 = rt->u.dst.dev; + mtu = rt->u.dst.pmtu; + if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL) + return -1; + iph = skb->nh.iph; + opt = &(IPCB(skb)->opt); + if (opt->is_strictroute && + rt->rt_dst != rt->rt_gateway) + goto sr_failed; /* ICMP matched - skip firewall */ goto skip_call_fw_firewall; + } #ifdef CONFIG_IP_MASQUERADE_ICMP } #endif @@ -230,8 +241,16 @@ /* * Masquerader may have changed skb */ + rt = (struct rtable*)skb->dst; + dev2 = rt->u.dst.dev; + mtu = rt->u.dst.pmtu; + if ((skb = skb_cow(skb, dev2->hard_header_len)) == NULL) + return -1; iph = skb->nh.iph; opt = &(IPCB(skb)->opt); + if (opt->is_strictroute && + rt->rt_dst != rt->rt_gateway) + goto sr_failed; } } #endif --- v2.2.20/linux/net/ipv4/ip_masq.c Sat Aug 4 12:52:33 2001 +++ linux/net/ipv4/ip_masq.c Fri Dec 14 02:24:50 2001 @@ -50,6 +50,8 @@ * Kai Bankett : do not toss other IP protos in proto_doff() * Dan Kegel : pointed correct NAT behavior for UDP streams * Julian Anastasov : use daddr and dport as hash keys + * Julian Anastasov : connection rerouting + * Julian Anastasov : incremental checksum updates * */ @@ -1119,6 +1121,108 @@ return ret; } +static int ip_masq_check_tcpudp(struct sk_buff *skb, + struct iphdr *iph, + union ip_masq_tphdr *h, + int size, int doff) +{ + int csum; + + if (h->uh->check == 0 && iph->protocol == IPPROTO_UDP) + return 0; + +#ifdef CONFIG_IP_MASQ_DEBUG + if 
(ip_masq_get_debug_level() > 3) { + skb->ip_summed = CHECKSUM_NONE; + } +#endif + /* Check that the checksum is OK */ + switch (skb->ip_summed) + { + case CHECKSUM_NONE: + csum = csum_partial(h->raw + doff, size - doff, 0); + skb->csum = csum_partial(h->raw, doff, csum); + case CHECKSUM_HW: + if (csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, skb->csum)) + { + IP_MASQ_DEBUG(0, "Wrong %s checksum in %u.%u.%u.%u->%u.%u.%u.%u (size=%d)!\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr), + size); + return -1; + } + skb->ip_summed = CHECKSUM_UNNECESSARY; + break; + default: + /* CHECKSUM_UNNECESSARY */ + } + return 0; +} + +static inline u16 ip_masq_check_diff(u32 old, u32 new, u16 oldsum) +{ + u32 diff[2] = { old, new }; + + return csum_fold(csum_partial((char *) diff, sizeof(diff), + oldsum ^ 0xFFFF)); +} + +/* Incremental checksum update */ + +static inline void ip_masq_check_inc_update(union ip_masq_tphdr *h, + u32 oldip, u32 newip, u16 oldport, u16 newport, u8 protocol) +{ + u16 *checkp; + + if (protocol == IPPROTO_TCP) + checkp = &h->th->check; + else + checkp = &h->uh->check; + *checkp = ip_masq_check_diff(~oldip, newip, + ip_masq_check_diff(oldport ^ 0xFFFF, newport, *checkp)); + if (!*checkp && protocol == IPPROTO_UDP) + *checkp = 0xFFFF; +} + +/* Full checksum update */ +static inline void ip_masq_check_full_update(struct iphdr *iph, + union ip_masq_tphdr *h, + int size, int doff, int csum) +{ + switch (iph->protocol) { + case IPPROTO_TCP: + h->th->check = 0; + h->th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + csum_partial(h->raw , doff, csum)); + IP_MASQ_DEBUG(3, "%s %u.%u.%u.%u->%u.%u.%u.%u csum=%d (+%d)\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr), + h->th->check, + (char*) & (h->th->check) - (char*) h->raw); + + break; + case IPPROTO_UDP: + h->uh->check = 0; + h->uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, + size, iph->protocol, + 
csum_partial(h->raw , doff, csum)); + if (h->uh->check == 0) + h->uh->check = 0xFFFF; + IP_MASQ_DEBUG(3, "%s %u.%u.%u.%u->%u.%u.%u.%u csum=%d (+%d)\n", + masq_proto_name(iph->protocol), + NIPQUAD(iph->saddr), + NIPQUAD(iph->daddr), + h->uh->check, + (char*) &(h->uh->check)- (char*) h->raw); + break; + } +} + + int ip_fw_masquerade(struct sk_buff **skb_p, __u32 maddr) { struct sk_buff *skb = *skb_p; @@ -1130,11 +1234,9 @@ /* * doff holds transport protocol data offset * csum holds its checksum - * csum_ok says if csum is valid */ int doff = 0; int csum = 0; - int csum_ok = 0; /* * We can only masquerade protocols with ports... and hack some ICMPs @@ -1143,7 +1245,6 @@ h.raw = (char*) iph + iph->ihl * 4; size = ntohs(iph->tot_len) - (iph->ihl * 4); - doff = proto_doff(iph->protocol, h.raw, size); if (doff <= 0) { /* @@ -1153,70 +1254,8 @@ return -1; } - /* Lets determine our maddr now, shall we? */ - if (maddr == 0) { - struct rtable *rt; - struct rtable *skb_rt = (struct rtable*)skb->dst; - struct device *skb_dev = skb_rt->u.dst.dev; - - if (ip_route_output(&rt, iph->daddr, 0, RT_TOS(iph->tos)|RTO_CONN, skb_dev?skb_dev->ifindex:0)) { - /* Fallback on old method */ - /* This really shouldn't happen... 
*/ - maddr = inet_select_addr(skb_dev, skb_rt->rt_gateway, RT_SCOPE_UNIVERSE); - } else { - /* Route lookup succeeded */ - maddr = rt->rt_src; - ip_rt_put(rt); - } - } - - switch (iph->protocol) { - case IPPROTO_ICMP: + if (iph->protocol == IPPROTO_ICMP) return(ip_fw_masq_icmp(skb_p, maddr)); - case IPPROTO_UDP: - if (h.uh->check == 0) - /* No UDP checksum */ - break; - case IPPROTO_TCP: - /* Make sure packet is in the masq range */ - IP_MASQ_DEBUG(3, "O-pkt: %s size=%d\n", - masq_proto_name(iph->protocol), - size); - -#ifdef CONFIG_IP_MASQ_DEBUG - if (ip_masq_get_debug_level() > 3) { - skb->ip_summed = CHECKSUM_NONE; - } -#endif - /* Check that the checksum is OK */ - switch (skb->ip_summed) - { - case CHECKSUM_NONE: - { - csum = csum_partial(h.raw + doff, size - doff, 0); - IP_MASQ_DEBUG(3, "O-pkt: %s I-datacsum=%d\n", - masq_proto_name(iph->protocol), - csum); - - skb->csum = csum_partial(h.raw , doff, csum); - } - case CHECKSUM_HW: - if (csum_tcpudp_magic(iph->saddr, iph->daddr, - size, iph->protocol, skb->csum)) - { - IP_MASQ_DEBUG(0, "Outgoing failed %s checksum from %d.%d.%d.%d (size=%d)!\n", - masq_proto_name(iph->protocol), - NIPQUAD(iph->saddr), - size); - return -1; - } - default: - /* CHECKSUM_UNNECESSARY */ - } - break; - default: - return -1; - } /* * Now hunt the list to see if we have an old entry */ @@ -1231,6 +1270,16 @@ ms = ip_masq_out_get_iph(iph); if (ms!=NULL) { + if (ms->app && ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } + + if (!maddr && (ip_masq_select_addr(skb,&maddr,ms) < 0)) { + ip_masq_put(ms); + return -1; + } + /* * If sysctl !=0 and no pkt has been received yet * in this tunnel and routing iface address has changed... 
@@ -1238,6 +1287,11 @@ */ if ( sysctl_ip_dynaddr && ms->flags & IP_MASQ_F_NO_REPLY && maddr != ms->maddr) { + if (!ms->app && ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } + if (sysctl_ip_dynaddr > 1) { IP_MASQ_INFO( "ip_fw_masquerade(): change masq.addr from %d.%d.%d.%d to %d.%d.%d.%d\n", NIPQUAD(ms->maddr),NIPQUAD(maddr)); @@ -1260,6 +1314,11 @@ if ( ms->flags & IP_MASQ_F_NO_SPORT && ms->protocol == IPPROTO_TCP ) { + if (!ms->app && ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } + write_lock(&__ip_masq_lock); ip_masq_unhash(ms); @@ -1284,6 +1343,9 @@ * Nope, not found, create a new entry for it */ + if (!maddr && (ip_masq_select_addr(skb,&maddr,0) < 0)) + return -1; + #ifdef CONFIG_IP_MASQUERADE_MOD if (!(ms = ip_masq_mod_out_create(skb, iph, maddr))) #endif @@ -1296,6 +1358,10 @@ return -1; if (!ms->app && skb->fwmark) ip_masq_bind_app_fwmark(ms, skb->fwmark); + if (ms->app && ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } } /* @@ -1312,19 +1378,20 @@ size = skb->len - (h.raw - skb->nh.raw); + if (!ms->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) { + /* Only port and addr are changed, make fast csum update */ + ip_masq_check_inc_update(&h, iph->saddr, ms->maddr, + h.portp[0], ms->mport, iph->protocol); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } + /* * Set iph addr and port from ip_masq obj. */ iph->saddr = ms->maddr; h.portp[0] = ms->mport; - /* - * Invalidate csum saving if tunnel has masq helper - */ - - if (ms->app) - csum_ok = 0; - /* * Attempt ip_masq_app call. 
* will fix ip_masq and iph seq stuff @@ -1349,44 +1416,16 @@ * Transport's payload partial csum */ - if (!csum_ok) { + if (ms->app) { csum = csum_partial(h.raw + doff, size - doff, 0); + skb->csum = csum; + IP_MASQ_DEBUG(3, "O-pkt: %s size=%d O-datacsum=%d\n", + masq_proto_name(iph->protocol), + size, + csum); + ip_masq_check_full_update(iph, &h, size, doff, csum); } - skb->csum = csum; - - IP_MASQ_DEBUG(3, "O-pkt: %s size=%d O-datacsum=%d\n", - masq_proto_name(iph->protocol), - size, - csum); - /* - * Protocol csum - */ - switch (iph->protocol) { - case IPPROTO_TCP: - h.th->check = 0; - h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, - size, iph->protocol, - csum_partial(h.raw , doff, csum)); - IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n", - masq_proto_name(iph->protocol), - h.th->check, - (char*) & (h.th->check) - (char*) h.raw); - - break; - case IPPROTO_UDP: - h.uh->check = 0; - h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, - size, iph->protocol, - csum_partial(h.raw , doff, csum)); - if (h.uh->check == 0) - h.uh->check = 0xFFFF; - IP_MASQ_DEBUG(3, "O-pkt: %s O-csum=%d (+%d)\n", - masq_proto_name(iph->protocol), - h.uh->check, - (char*) &(h.uh->check)- (char*) h.raw); - break; - } ip_send_check(iph); IP_MASQ_DEBUG(2, "O-routed from %08X:%04X with masq.addr %08X\n", @@ -1511,6 +1550,8 @@ iph->daddr, icmp_hv_req(icmph)); if (ms == NULL) { + if (!maddr && (ip_masq_select_addr(skb,&maddr,0) < 0)) + return -1; ms = ip_masq_new(iph->protocol, maddr, 0, iph->saddr, icmp_id(icmph), @@ -1520,6 +1561,10 @@ return (-1); IP_MASQ_DEBUG(1, "Created new icmp entry\n"); } + if (!maddr && (ip_masq_select_addr(skb,&maddr,ms) < 0)) { + ip_masq_put(ms); + return -1; + } /* Rewrite source address */ /* @@ -1611,6 +1656,11 @@ if (ms == NULL) return 0; + if (!maddr && (ip_masq_select_addr(skb,&maddr,ms) < 0)) { + __ip_masq_put(ms); + return -1; + } + /* Now we do real damage to this packet...! 
*/ /* First change the source IP address, and recalc checksum */ iph->saddr = ms->maddr; @@ -1689,6 +1739,11 @@ if (ms == NULL) return 0; + if (!maddr && (ip_masq_select_addr(skb,&maddr,ms) < 0)) { + __ip_masq_put(ms); + return -1; + } + /* Now we do real damage to this packet...! */ /* First change the source IP address, and recalc checksum */ iph->saddr = ms->maddr; @@ -1996,7 +2051,6 @@ unsigned short size; int doff = 0; int csum = 0; - int csum_ok = 0; __u32 maddr; /* @@ -2051,37 +2105,6 @@ #endif && atomic_read(&mport_count) == 0 ) return 0; - - /* Check that the checksum is OK */ - if ((iph->protocol == IPPROTO_UDP) && (h.uh->check == 0)) - /* No UDP checksum */ - break; -#ifdef CONFIG_IP_MASQ_DEBUG - if (ip_masq_get_debug_level() > 3) { - skb->ip_summed = CHECKSUM_NONE; - } -#endif - - switch (skb->ip_summed) - { - case CHECKSUM_NONE: - csum = csum_partial(h.raw + doff, size - doff, 0); - csum_ok++; - skb->csum = csum_partial(h.raw , doff, csum); - - case CHECKSUM_HW: - if (csum_tcpudp_magic(iph->saddr, iph->daddr, - size, iph->protocol, skb->csum)) - { - IP_MASQ_DEBUG(0, "Incoming failed %s checksum from %d.%d.%d.%d (size=%d)!\n", - masq_proto_name(iph->protocol), - NIPQUAD(iph->saddr), - size); - return -1; - } - default: - /* CHECKSUM_UNNECESSARY */ - } break; default: return 0; @@ -2120,6 +2143,12 @@ if (ms != NULL) { + if (ms->app && + ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } + /* * got reply, so clear flag */ @@ -2139,6 +2168,12 @@ } else { if ( ms->flags & IP_MASQ_F_NO_DPORT ) { /* && ms->protocol == IPPROTO_TCP ) { */ + if (!ms->app && + ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } + write_lock(&__ip_masq_lock); ip_masq_unhash(ms); @@ -2154,6 +2189,12 @@ } if (ms->flags & IP_MASQ_F_NO_DADDR ) { /* && ms->protocol == IPPROTO_TCP) { */ + if (!ms->app && + ip_masq_check_tcpudp(skb, iph, &h, size, doff) < 0) { + ip_masq_put(ms); + return -1; + } + 
write_lock(&__ip_masq_lock); ip_masq_unhash(ms); @@ -2172,15 +2213,17 @@ ip_masq_put(ms); return -1; } - iph->daddr = ms->saddr; - h.portp[1] = ms->sport; - /* - * Invalidate csum saving if tunnel has masq helper - */ + if (!ms->app && (iph->protocol != IPPROTO_UDP || h.uh->check != 0)) { + /* Only port and addr are changed, make fast csum update */ + ip_masq_check_inc_update(&h, iph->daddr, ms->saddr, + h.portp[1], ms->sport, iph->protocol); + if (skb->ip_summed == CHECKSUM_HW) + skb->ip_summed = CHECKSUM_NONE; + } - if (ms->app) - csum_ok = 0; + iph->daddr = ms->saddr; + h.portp[1] = ms->sport; /* * Attempt ip_masq_app call. @@ -2199,37 +2242,10 @@ size = ntohs(iph->tot_len) - (iph->ihl * 4); } - /* - * Yug! adjust UDP/TCP checksums - */ - - /* - * Transport's payload partial csum - */ - - if (!csum_ok) { + if (ms->app) { csum = csum_partial(h.raw + doff, size - doff, 0); - } - skb->csum = csum; - - /* - * Protocol csum - */ - switch (iph->protocol) { - case IPPROTO_TCP: - h.th->check = 0; - h.th->check=csum_tcpudp_magic(iph->saddr, iph->daddr, - size, iph->protocol, - csum_partial(h.raw , doff, csum)); - break; - case IPPROTO_UDP: - h.uh->check = 0; - h.uh->check=csum_tcpudp_magic(iph->saddr, iph->daddr, - size, iph->protocol, - csum_partial(h.raw , doff, csum)); - if (h.uh->check == 0) - h.uh->check = 0xFFFF; - break; + skb->csum = csum; + ip_masq_check_full_update(iph, &h, size, doff, csum); } ip_send_check(iph); @@ -2530,11 +2546,62 @@ } #endif /* CONFIG_PROC_FS */ /* - * Wrapper over inet_select_addr() + * Determine maddr and optionally reroute the packet. + * Returns 0 on success, -1 when the route lookup fails or the + * resulting route is not RTN_UNICAST. */ -u32 ip_masq_select_addr(struct device *dev, u32 dst, int scope) +int ip_masq_select_addr(struct sk_buff *skb, __u32 *maddr, struct ip_masq *ms) { - return inet_select_addr(dev, dst, scope); + struct rtable *rt; + struct rtable *skb_rt = (struct rtable*)skb->dst; + struct device *skb_dev = skb_rt->u.dst.dev; + struct iphdr *iph = skb->nh.iph; + + if (ms && !(ms->flags & IP_MASQ_F_NO_REPLY)) + *maddr = 
ms->maddr; + + /* + * For now we call ip_route_output almost each time, i.e. we + * are not sure when the route cache entries expire + * probably after a route change (bad for multipath). + * The route lookup is avoided if: + * - we already know maddr and + * - the route to the destination is not gatewayed (still link + * can fail, so may be this is disabled) + * i.e. usually only for local networks which is not so good. + * We don't have a way to determine whether the skb_rt uses + * multipath route. In any case, these optimizations are + * still questionable when route changes take place. + */ + if ((ms && !(ms->flags & IP_MASQ_F_NOREROUTE)) || !*maddr) { + if (ip_route_output(&rt, iph->daddr, *maddr, + RT_TOS(iph->tos)|RTO_CONN, + (!*maddr && skb_dev)?skb_dev->ifindex:0, + (!*maddr && skb_dev)?skb_rt->rt_gateway:0)) + return -1; + if (RTN_UNICAST != rt->rt_type) { + /* Lookup succeeded: drop the route reference before failing */ + ip_rt_put(rt); + return -1; + } + *maddr = rt->rt_src; + if (rt->rt_gateway != skb_rt->rt_gateway || + skb_dev != rt->u.dst.dev) { + dst_release(skb->dst); + skb->dst = &rt->u.dst; + if (ms) + ms->flags &= ~IP_MASQ_F_NOREROUTE; + } else { + /* Sorry, for now we always use ip_route_output */ + /* + if (ms && !(ms->flags & IP_MASQ_F_NO_REPLY) && + skb_rt->rt_gateway == skb_rt->rt_dst) + ms->flags |= IP_MASQ_F_NOREROUTE; + */ + ip_rt_put(rt); + } + } + return 0; } /* --- v2.2.20/linux/net/ipv4/ip_masq_user.c Sat Aug 4 12:52:33 2001 +++ linux/net/ipv4/ip_masq_user.c Fri Dec 14 02:24:50 2001 @@ -93,7 +93,7 @@ rt_saddr = 0; tos = RT_TOS(ums->ip_tos) | RTO_CONN; - if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0 /* dev */))) { + if ((ret=ip_route_output(&rt, rt_daddr, rt_saddr, tos, 0, 0))) { IP_MASQ_DEBUG(0-debug, "could not setup maddr for routing daddr=%lX, saddr=%lX\n", ntohl(rt_daddr), ntohl(rt_saddr)); return ret; --- v2.2.20/linux/net/ipv4/ip_nat_dumb.c Sat Oct 21 12:10:47 2000 +++ linux/net/ipv4/ip_nat_dumb.c Fri Dec 14 02:24:50 2001 @@ -120,6 +120,7 @@ key.dst = ciph->saddr; key.iif = skb->dev->ifindex; key.oif = 0; + key.gw = 
0; #ifdef CONFIG_IP_ROUTE_TOS key.tos = RT_TOS(ciph->tos); #endif --- v2.2.20/linux/net/ipv4/ip_output.c Sat Oct 21 12:11:45 2000 +++ linux/net/ipv4/ip_output.c Fri Dec 14 02:24:50 2001 @@ -259,7 +259,7 @@ */ if(ip_route_output(&rt, daddr, sk->saddr, RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, - sk->bound_dev_if)) + sk->bound_dev_if, 0)) goto drop; sk->dst_cache = &rt->u.dst; } @@ -939,7 +939,7 @@ if (ipc.opt->srr) daddr = replyopts.opt.faddr; - if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0)) + if (ip_route_output(&rt, daddr, rt->rt_spec_dst, RT_TOS(skb->nh.iph->tos), 0, 0)) return; /* And let IP do all the hard work. */ --- v2.2.20/linux/net/ipv4/rarp.c Mon Jul 27 06:35:57 1998 +++ linux/net/ipv4/rarp.c Fri Dec 14 02:24:50 2001 @@ -347,7 +347,7 @@ * Is it reachable directly ? */ - err = ip_route_output(&rt, ip, 0, 1, 0); + err = ip_route_output(&rt, ip, 0, 1, 0, 0); if (err) return err; if (rt->rt_flags&(RTCF_LOCAL|RTCF_BROADCAST|RTCF_MULTICAST|RTCF_DNAT)) { --- v2.2.20/linux/net/ipv4/raw.c Sun Nov 4 10:16:16 2001 +++ linux/net/ipv4/raw.c Fri Dec 14 02:24:50 2001 @@ -344,7 +344,7 @@ rfh.saddr = sk->ip_mc_addr; } - err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif); + err = ip_route_output(&rt, daddr, rfh.saddr, tos, ipc.oif, 0); if (err) goto done; --- v2.2.20/linux/net/ipv4/route.c Sun Nov 4 10:16:16 2001 +++ linux/net/ipv4/route.c Fri Dec 14 02:24:50 2001 @@ -691,6 +691,7 @@ /* Gateway is different ... 
*/ rt->rt_gateway = new_gw; + if (rt->key.gw) rt->key.gw = new_gw; /* Redirect received -> path was valid */ dst_confirm(&rth->u.dst); @@ -1066,6 +1067,7 @@ rth->key.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; rth->key.oif = 0; + rth->key.gw = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->rt_type = RTN_MULTICAST; @@ -1123,6 +1125,7 @@ #endif key.iif = dev->ifindex; key.oif = 0; + key.gw = 0; key.scope = RT_SCOPE_UNIVERSE; hash = rt_hash_code(daddr, saddr^(key.iif<<5), tos); @@ -1195,8 +1198,9 @@ if (res.type != RTN_UNICAST) goto martian_destination; + fib_select_default(&key, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && key.oif == 0) + if (res.fi->fib_nhs > 1) fib_select_multipath(&key, &res); #endif out_dev = FIB_RES_DEV(res)->ip_ptr; @@ -1250,6 +1254,7 @@ rth->key.iif = dev->ifindex; rth->u.dst.dev = out_dev->dev; rth->key.oif = 0; + rth->key.gw = 0; rth->rt_spec_dst= spec_dst; rth->u.dst.input = ip_forward; @@ -1315,6 +1320,7 @@ rth->key.iif = dev->ifindex; rth->u.dst.dev = &loopback_dev; rth->key.oif = 0; + rth->key.gw = 0; rth->rt_gateway = daddr; rth->rt_spec_dst= spec_dst; rth->u.dst.input= ip_local_deliver; @@ -1418,7 +1424,7 @@ * Major route resolver routine. */ -int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) +int ip_route_output_slow(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif, u32 gw) { struct rt_key key; struct fib_result res; @@ -1436,6 +1442,7 @@ key.tos = tos&IPTOS_TOS_MASK; key.iif = loopback_dev.ifindex; key.oif = oif; + key.gw = gw; key.scope = (tos&RTO_ONLINK) ? 
RT_SCOPE_LINK : RT_SCOPE_UNIVERSE; res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -1521,6 +1528,7 @@ key.dst = key.src = htonl(INADDR_LOOPBACK); dev_out = &loopback_dev; key.oif = loopback_dev.ifindex; + key.gw = 0; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; @@ -1528,7 +1536,7 @@ if (fib_lookup(&key, &res)) { res.fi = NULL; - if (oif) { + if (oif && dev_out->flags&IFF_UP) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -1563,18 +1571,18 @@ key.src = key.dst; dev_out = &loopback_dev; key.oif = dev_out->ifindex; + key.gw = 0; res.fi = NULL; flags |= RTCF_LOCAL; goto make_route; } + if (res.type == RTN_UNICAST) + fib_select_default(&key, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && key.oif == 0) + if (res.fi->fib_nhs > 1) fib_select_multipath(&key, &res); - else #endif - if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0) - fib_select_default(&key, &res); if (!key.src) key.src = FIB_RES_PREFSRC(res); @@ -1621,6 +1629,7 @@ rth->key.src = saddr; rth->key.iif = 0; rth->key.oif = oif; + rth->key.gw = gw; rth->rt_dst = key.dst; rth->rt_src = key.src; #ifdef CONFIG_IP_ROUTE_NAT @@ -1661,19 +1670,21 @@ return rt_intern_hash(hash, rth, rp); } -int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif) +int ip_route_output(struct rtable **rp, u32 daddr, u32 saddr, u32 tos, int oif, u32 gw) { unsigned hash; struct rtable *rth; hash = rt_hash_code(daddr, saddr^(oif<<5), tos); + if (!oif) gw = 0; start_bh_atomic(); for (rth=rt_hash_table[hash]; rth; rth=rth->u.rt_next) { if (rth->key.dst == daddr && rth->key.src == saddr && rth->key.iif == 0 && rth->key.oif == oif && + rth->key.gw == gw && #ifndef CONFIG_IP_TRANSPARENT_PROXY rth->key.tos == tos #else @@ -1691,7 +1702,7 @@ } end_bh_atomic(); - return ip_route_output_slow(rp, daddr, saddr, tos, oif); + return ip_route_output_slow(rp, daddr, saddr, tos, oif, gw); } #ifdef CONFIG_RTNETLINK @@ -1842,7 +1853,7 @@ int 
oif = 0; if (rta[RTA_OIF-1]) memcpy(&oif, RTA_DATA(rta[RTA_OIF-1]), sizeof(int)); - err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif); + err = ip_route_output(&rt, dst, src, rtm->rtm_tos, oif, 0); } if (err) { kfree_skb(skb); --- v2.2.20/linux/net/ipv4/syncookies.c Sun Nov 4 10:16:16 2001 +++ linux/net/ipv4/syncookies.c Fri Dec 14 02:24:50 2001 @@ -190,7 +190,7 @@ opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, req->af.v4_req.loc_addr, sk->ip_tos | RTO_CONN, - 0)) { + 0, 0)) { if (req->af.v4_req.opt) kfree(req->af.v4_req.opt); tcp_openreq_free(req); --- v2.2.20/linux/net/ipv4/tcp_ipv4.c Sat Aug 4 12:52:33 2001 +++ linux/net/ipv4/tcp_ipv4.c Fri Dec 14 02:24:51 2001 @@ -1154,7 +1154,7 @@ req->af.v4_req.rmt_addr), req->af.v4_req.loc_addr, RT_TOS(sk->ip_tos) | RTO_CONN | sk->localroute, - sk->bound_dev_if)) { + sk->bound_dev_if, 0)) { ip_statistics.IpOutNoRoutes++; return; } @@ -1518,7 +1518,7 @@ if (ip_route_output(&rt, opt && opt->srr ? opt->faddr : req->af.v4_req.rmt_addr, - req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0)) + req->af.v4_req.loc_addr, sk->ip_tos|RTO_CONN, 0, 0)) return NULL; dst = &rt->u.dst; } @@ -1865,7 +1865,7 @@ } if (rt->u.dst.obsolete) { int err; - err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif); + err = ip_route_output(&rt, rt->rt_dst, rt->rt_src, rt->key.tos|RTO_CONN, rt->key.oif, 0); if (err) { sk->err_soft=-err; sk->error_report(sk); --- v2.2.20/linux/net/ipv4/udp.c Sat Aug 4 12:52:33 2001 +++ linux/net/ipv4/udp.c Fri Dec 14 02:24:51 2001 @@ -704,7 +704,7 @@ #ifdef CONFIG_IP_TRANSPARENT_PROXY (msg->msg_flags&MSG_PROXY ? RTO_TPROXY : 0) | #endif - tos, ipc.oif); + tos, ipc.oif, 0); if (err) goto out;