--- v2.2.19/linux/include/net/ip_fib.h Sat Oct 21 12:10:47 2000 +++ linux/include/net/ip_fib.h Fri Sep 28 21:57:13 2001 @@ -160,7 +160,8 @@ extern __inline__ void fib_select_default(const struct rt_key *key, struct fib_result *res) { - if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + if ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) || + FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST) main_table->tb_select_default(main_table, key, res); } @@ -171,6 +172,7 @@ extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; extern int fib_lookup(const struct rt_key *key, struct fib_result *res); extern struct fib_table *__fib_new_table(int id); +extern __inline__ int fib_result_table(struct fib_result *res); extern __inline__ struct fib_table *fib_get_table(int id) { @@ -214,6 +216,7 @@ extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, u8 tb_id, u8 type, u8 scope, void *dst, int dst_len, u8 tos, struct fib_info *fi); +extern int fib_num_down_nh_devs(struct fib_info *fi); extern int fib_sync_down(u32 local, struct device *dev, int force); extern int fib_sync_up(struct device *dev); extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, --- v2.2.19/linux/include/linux/rtnetlink.h Sat Oct 21 12:11:58 2000 +++ linux/include/linux/rtnetlink.h Fri Sep 28 21:53:50 2001 @@ -230,6 +230,8 @@ #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */ +#define RTNH_F_SUSPECT 8 /* We don't know the real state */ +#define RTNH_F_BADSTATE (RTNH_F_DEAD | RTNH_F_SUSPECT) /* Macros to handle hexthops */ --- v2.2.19/linux/net/ipv4/fib_hash.c Sat Oct 21 12:10:50 2000 +++ linux/net/ipv4/fib_hash.c Sun Sep 30 13:56:34 2001 @@ -69,6 +69,7 @@ struct fib_info *fn_info; #define FIB_INFO(f) ((f)->fn_info) fn_key_t fn_key; + int fn_last_dflt; u8 fn_tos; u8 fn_type; u8 fn_scope; @@ -302,68 +303,104 @@ return 1; } -static int fn_hash_last_dflt=-1; - -static int fib_detect_death(struct fib_info *fi, int order, - struct fib_info **last_resort, int *last_idx) +static int fib_detect_death(struct fib_info *fi, int order, int last_dflt, + struct fib_info **last_resort, int *last_idx, + int *last_nhsel, const struct rt_key *key) { struct neighbour *n; - int state = NUD_NONE; + int nhsel; + int state; + struct fib_nh * nh; + u32 dst; + int dead = 1; + + /* change_nexthops(fi) { */ + for (nhsel = 0, nh = fi->fib_nh; nhsel < fi->fib_nhs; nh++, nhsel++) { + if (key->oif && key->oif != nh->nh_oif) + continue; + if (nh->nh_flags & RTNH_F_DEAD) + continue; - n = neigh_lookup(&arp_tbl, &fi->fib_nh[0].nh_gw, fi->fib_dev); - if (n) { - state = n->nud_state; - neigh_release(n); - } - if (state==NUD_REACHABLE) - return 0; - if ((state&NUD_VALID) && order != fn_hash_last_dflt) - return 0; - if ((state&NUD_VALID) || - (*last_idx<0 && order > fn_hash_last_dflt)) { - *last_resort = fi; - *last_idx = order; + nh->nh_flags &= ~RTNH_F_SUSPECT; + if (nh->nh_dev->flags & IFF_NOARP) { + dead = 0; + continue; + } + + dst = nh->nh_gw; + if (!nh->nh_gw || nh->nh_scope != RT_SCOPE_LINK) + dst = key->dst; + + state = NUD_NONE; + n = neigh_lookup(&arp_tbl, &dst, nh->nh_dev); + if (n) { + state = n->nud_state; + neigh_release(n); + } + if (state==NUD_REACHABLE || + ((state&NUD_VALID) && order != last_dflt)) { + dead = 0; + continue; + } + if (!(state&NUD_VALID)) { + nh->nh_flags |= RTNH_F_SUSPECT; + } + if (!dead) continue; + if ((state&NUD_VALID) || + (*last_idx<0 && order >= last_dflt)) { + *last_resort = fi; + *last_idx = order; + *last_nhsel = nhsel; + } } - return 1; + /* } endfor_nexthops(fi) */ + + return dead; } static void fn_hash_select_default(struct fib_table *tb, const struct rt_key *key, struct fib_result *res) { - int order, last_idx; - struct fib_node *f; + int order, last_idx, last_dflt, last_nhsel; + struct fib_node *f, *first_node; struct fib_info *fi = NULL; struct fib_info *last_resort; struct fn_hash *t = (struct fn_hash*)tb->tb_data; - struct fn_zone *fz = t->fn_zones[0]; + struct fn_zone *fz = t->fn_zones[res->prefixlen]; + fn_key_t k; if (fz == NULL) return; + k = fz_key(key->dst, fz); + last_dflt = -2; + first_node = NULL; last_idx = -1; last_resort = NULL; + last_nhsel = 0; order = -1; - for (f = fz->fz_hash[0]; f; f = f->fn_next) { + for (f = fz_chain(k, fz); f; f = f->fn_next) { struct fib_info *next_fi = FIB_INFO(f); - if ((f->fn_state&FN_S_ZOMBIE) || + if (!fn_key_eq(k, f->fn_key) || + (f->fn_state&FN_S_ZOMBIE) || f->fn_scope != res->scope || f->fn_type != RTN_UNICAST) continue; if (next_fi->fib_priority > res->fi->fib_priority) break; - if (!next_fi->fib_nh[0].nh_gw || next_fi->fib_nh[0].nh_scope != RT_SCOPE_LINK) - continue; f->fn_state |= FN_S_ACCESSED; - if (fi == NULL) { - if (next_fi != res->fi) - break; - } else if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { + if (!first_node) { + last_dflt = f->fn_last_dflt; + first_node = f; + } + if (fi && !fib_detect_death(fi, order, last_dflt, + &last_resort, &last_idx, &last_nhsel, key)) { res->fi = fi; - fn_hash_last_dflt = order; + first_node->fn_last_dflt = order; return; } fi = next_fi; @@ -371,19 +408,28 @@ } if (order<=0 || fi==NULL) { - fn_hash_last_dflt = -1; + if (fi && fi->fib_nhs > 1 && + fib_detect_death(fi, order, last_dflt, + &last_resort, &last_idx, &last_nhsel, key) && + last_resort == fi) { + fi->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; + } + if (first_node) first_node->fn_last_dflt = -1; return; } - if (!fib_detect_death(fi, order, &last_resort, &last_idx)) { + if (!fib_detect_death(fi, order, last_dflt, &last_resort, &last_idx, + &last_nhsel, key)) { res->fi = fi; - fn_hash_last_dflt = order; + first_node->fn_last_dflt = order; return; } - if (last_idx >= 0) + if (last_idx >= 0) { res->fi = last_resort; - fn_hash_last_dflt = last_idx; + last_resort->fib_nh[last_nhsel].nh_flags &= ~RTNH_F_SUSPECT; + first_node->fn_last_dflt = last_idx; + } } #define FIB_SCAN(f, fp) \ @@ -547,6 +593,7 @@ memset(new_f, 0, sizeof(struct fib_node)); + new_f->fn_last_dflt = -1; new_f->fn_key = key; #ifdef CONFIG_IP_ROUTE_TOS new_f->fn_tos = tos; @@ -686,7 +733,10 @@ while ((f = *fp) != NULL) { struct fib_info *fi = FIB_INFO(f); - if (fi && ((f->fn_state&FN_S_ZOMBIE) || (fi->fib_flags&RTNH_F_DEAD))) { + if (fi && ((f->fn_state&FN_S_ZOMBIE) || + (fi->fib_flags&RTNH_F_DEAD && + (fi->fib_protocol != RTPROT_STATIC || + !fib_num_down_nh_devs(fi))))) { *fp = f->fn_next; synchronize_bh(); --- v2.2.19/linux/net/ipv4/fib_rules.c Wed Dec 13 11:19:12 2000 +++ linux/net/ipv4/fib_rules.c Fri Sep 28 22:31:49 2001 @@ -265,6 +265,11 @@ } } +int fib_result_table(struct fib_result *res) +{ + return res->r->r_table; +} + int fib_lookup(const struct rt_key *key, struct fib_result *res) { int err; @@ -320,7 +325,8 @@ void fib_select_default(const struct rt_key *key, struct fib_result *res) { if (res->r && res->r->r_action == RTN_UNICAST && - FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) { + ((FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) || + FIB_RES_NH(*res).nh_scope == RT_SCOPE_HOST)) { struct fib_table *tb; if ((tb = fib_get_table(res->r->r_table)) != NULL) tb->tb_select_default(tb, key, res); --- v2.2.19/linux/net/ipv4/fib_semantics.c Sat Oct 21 12:10:47 2000 +++ linux/net/ipv4/fib_semantics.c Sun Sep 30 14:34:44 2001 @@ -127,7 +127,7 @@ #ifdef CONFIG_NET_CLS_ROUTE nh->nh_tclassid != onh->nh_tclassid || #endif - ((nh->nh_flags^onh->nh_flags)&~RTNH_F_DEAD)) + ((nh->nh_flags^onh->nh_flags)&~RTNH_F_BADSTATE)) return -1; onh++; } endfor_nexthops(fi); @@ -145,7 +145,7 @@ nfi->fib_mtu == fi->fib_mtu && nfi->fib_rtt == fi->fib_rtt && nfi->fib_window == fi->fib_window && - ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_DEAD) == 0 && + ((nfi->fib_flags^fi->fib_flags)&~RTNH_F_BADSTATE) == 0 && (nfi->fib_nhs == 0 || nh_comp(fi, nfi) == 0)) return fi; } endfor_fib_info(); @@ -170,6 +170,30 @@ return -1; } +/* + * Return 0 only when we are sure that the preferred source is deleted + * or when all nexthop devices are removed + */ + +int fib_num_down_nh_devs(struct fib_info *fi) +{ +struct in_device *in_dev; +struct device *dev; +int dead = 0; + + change_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) + return 0; + dev = dev_get_by_index(nh->nh_oif); + if (dev && !(dev->flags&IFF_UP) && + ((in_dev = dev->ip_ptr) != NULL) && + in_dev->ifa_list) + dead ++; + } endfor_nexthops(fi) + /* dead>0: All are marked DEAD but there is one in DOWN state */ + return dead; +} + #ifdef CONFIG_IP_ROUTE_MULTIPATH static u32 fib_get_attr32(struct rtattr *attr, int attrlen, int type) @@ -354,11 +378,25 @@ if (key.scope < RT_SCOPE_LINK) key.scope = RT_SCOPE_LINK; - if ((err = fib_lookup(&key, &res)) != 0) - return err; - nh->nh_scope = res.scope; - nh->nh_oif = FIB_RES_OIF(res); - nh->nh_dev = FIB_RES_DEV(res); + err = fib_lookup(&key, &res); + if (err) { + if (err == -ENETUNREACH && + fi->fib_protocol == RTPROT_STATIC) { + struct device *dev; + + dev = dev_get_by_index(nh->nh_oif); + if (dev == NULL || dev->flags & IFF_UP || + inet_addr_type(nh->nh_gw) == RTN_LOCAL) + return err; + nh->nh_flags |= RTNH_F_DEAD; + nh->nh_scope = RT_SCOPE_LINK; + nh->nh_dev = dev; + } else return err; + } else { + nh->nh_scope = res.scope; + nh->nh_oif = FIB_RES_OIF(res); + nh->nh_dev = FIB_RES_DEV(res); + } } else { struct in_device *in_dev; @@ -368,8 +406,11 @@ in_dev = inetdev_by_index(nh->nh_oif); if (in_dev == NULL) return -ENODEV; - if (!(in_dev->dev->flags&IFF_UP)) - return -ENETDOWN; + if (!(in_dev->dev->flags&IFF_UP)) { + if (fi->fib_protocol != RTPROT_STATIC) + return -ENETDOWN; + nh->nh_flags |= RTNH_F_DEAD; + } nh->nh_dev = in_dev->dev; nh->nh_scope = RT_SCOPE_HOST; } @@ -490,10 +531,16 @@ if (nh->nh_dev == NULL) goto failure; } else { + int dead = 0; change_nexthops(fi) { if ((err = fib_check_nh(r, fi, nh)) != 0) goto failure; + if (nh->nh_flags & RTNH_F_DEAD) + dead ++; } endfor_nexthops(fi) + if (dead >= fi->fib_nhs) { + fi->fib_flags |= RTNH_F_DEAD; + } } if (fi->fib_prefsrc) { @@ -858,8 +905,6 @@ return ret; } -#ifdef CONFIG_IP_ROUTE_MULTIPATH - /* Dead device goes up. We wake up dead nexthops. It takes sense only on multipath routes. @@ -867,6 +912,8 @@ int fib_sync_up(struct device *dev) { + struct rt_key key; + struct fib_result res; int ret = 0; if (!(dev->flags&IFF_UP)) @@ -880,16 +927,24 @@ alive++; continue; } - if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) + if (nh->nh_oif != dev->ifindex || dev->ip_ptr == NULL) continue; - if (nh->nh_dev != dev || dev->ip_ptr == NULL) + if (nh->nh_dev == NULL || !(nh->nh_dev->flags&IFF_UP)) continue; + if (nh->nh_gw && fi->fib_protocol == RTPROT_STATIC) { + memset(&key, 0, sizeof(key)); + key.dst = nh->nh_gw; + key.oif = nh->nh_oif; + key.scope = nh->nh_scope; + if (fib_lookup(&key, &res) != 0) + continue; + } alive++; nh->nh_power = 0; nh->nh_flags &= ~RTNH_F_DEAD; } endfor_nexthops(fi) - if (alive == fi->fib_nhs) { + if (alive > 0) { fi->fib_flags &= ~RTNH_F_DEAD; ret++; } @@ -897,6 +952,8 @@ return ret; } +#ifdef CONFIG_IP_ROUTE_MULTIPATH + /* The algorithm is suboptimal, but it provides really fair weighted route distribution. @@ -905,12 +962,34 @@ void fib_select_multipath(const struct rt_key *key, struct fib_result *res) { struct fib_info *fi = res->fi; - int w; + int w, alive; + + if (key->oif) { + int sel = -1; + w = -1; + change_nexthops(fi) { + if (key->oif != nh->nh_oif) + continue; + if (!(nh->nh_flags&RTNH_F_BADSTATE)) { + if (nh->nh_power > w) { + w = nh->nh_power; + sel = nhsel; + } + } + } endfor_nexthops(fi); + if (sel >= 0) { + res->nh_sel = sel; + return; + } + goto last_resort; + } + +repeat: if (fi->fib_power <= 0) { int power = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (!(nh->nh_flags&RTNH_F_BADSTATE)) { power += nh->nh_weight; nh->nh_power = nh->nh_weight; } @@ -918,8 +997,9 @@ fi->fib_power = power; #if 1 if (power <= 0) { - printk(KERN_CRIT "impossible 777\n"); - return; + goto last_resort; + /* printk(KERN_CRIT "impossible 777\n"); */ + /* return; */ } #endif } @@ -931,11 +1011,28 @@ w = jiffies % fi->fib_power; + alive = 0; change_nexthops(fi) { - if (!(nh->nh_flags&RTNH_F_DEAD) && nh->nh_power) { + if (!(nh->nh_flags&RTNH_F_BADSTATE) && nh->nh_power) { if ((w -= nh->nh_power) <= 0) { nh->nh_power--; fi->fib_power--; + res->nh_sel = nhsel; + return; + } + alive = 1; + } + } endfor_nexthops(fi); + if (alive) { + fi->fib_power = 0; + goto repeat; + } + +last_resort: + + for_nexthops(fi) { + if (!(nh->nh_flags&RTNH_F_DEAD)) { + if (!key->oif || key->oif == nh->nh_oif) { res->nh_sel = nhsel; return; } --- v2.2.19/linux/net/ipv4/fib_frontend.c Sat Oct 21 12:10:47 2000 +++ linux/net/ipv4/fib_frontend.c Tue Sep 25 22:21:22 2001 @@ -54,6 +54,8 @@ struct fib_table *local_table; struct fib_table *main_table; +#define FIB_RES_TABLE(r) (RT_TABLE_MAIN) + #else #define RT_TABLE_MIN 1 @@ -71,6 +73,7 @@ return tb; } +#define FIB_RES_TABLE(r) (fib_result_table(r)) #endif /* CONFIG_IP_MULTIPLE_TABLES */ @@ -194,6 +197,9 @@ struct in_device *in_dev = dev->ip_ptr; struct rt_key key; struct fib_result res; + int table; + unsigned char prefixlen; + unsigned char scope; key.dst = src; key.src = dst; @@ -209,24 +215,25 @@ if (res.type != RTN_UNICAST) return -EINVAL; *spec_dst = FIB_RES_PREFSRC(res); - if (itag) - fib_combine_itag(itag, &res); -#ifdef CONFIG_IP_ROUTE_MULTIPATH - if (FIB_RES_DEV(res) == dev || res.fi->fib_nhs > 1) -#else + fib_combine_itag(itag, &res); if (FIB_RES_DEV(res) == dev) -#endif return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; if (in_dev->ifa_list == NULL) goto last_resort; - if (IN_DEV_RPFILTER(in_dev)) - return -EINVAL; + table = FIB_RES_TABLE(&res); + prefixlen = res.prefixlen; + scope = res.scope; key.oif = dev->ifindex; - if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST) { + if (fib_lookup(&key, &res) == 0 && res.type == RTN_UNICAST && + ((table == FIB_RES_TABLE(&res) && res.prefixlen >= prefixlen && + res.scope >= scope) || + !IN_DEV_RPFILTER(in_dev))) { *spec_dst = FIB_RES_PREFSRC(res); return FIB_RES_NH(res).nh_scope >= RT_SCOPE_HOST; } + if (IN_DEV_RPFILTER(in_dev)) + return -EINVAL; return 0; last_resort: @@ -543,6 +550,8 @@ switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); + if (ifa->ifa_dev && ifa->ifa_dev->dev) + fib_sync_up(ifa->ifa_dev->dev); rt_cache_flush(-1); break; case NETDEV_DOWN: @@ -573,9 +582,7 @@ for_ifa(in_dev) { fib_add_ifaddr(ifa); } endfor_ifa(in_dev); -#ifdef CONFIG_IP_ROUTE_MULTIPATH fib_sync_up(dev); -#endif rt_cache_flush(-1); break; case NETDEV_DOWN: --- v2.2.19/linux/net/ipv4/route.c Sat Aug 4 12:52:33 2001 +++ linux/net/ipv4/route.c Mon Oct 1 22:11:36 2001 @@ -1195,8 +1195,9 @@ if (res.type != RTN_UNICAST) goto martian_destination; + fib_select_default(&key, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && key.oif == 0) + if (res.fi->fib_nhs > 1) fib_select_multipath(&key, &res); #endif out_dev = FIB_RES_DEV(res)->ip_ptr; @@ -1528,7 +1529,7 @@ if (fib_lookup(&key, &res)) { res.fi = NULL; - if (oif) { + if (oif && dev_out->flags&IFF_UP) { /* Apparently, routing tables are wrong. Assume, that the destination is on link. @@ -1568,13 +1569,12 @@ goto make_route; } + if (res.type == RTN_UNICAST) + fib_select_default(&key, &res); #ifdef CONFIG_IP_ROUTE_MULTIPATH - if (res.fi->fib_nhs > 1 && key.oif == 0) + if (res.fi->fib_nhs > 1) fib_select_multipath(&key, &res); - else #endif - if (res.prefixlen==0 && res.type == RTN_UNICAST && key.oif == 0) - fib_select_default(&key, &res); if (!key.src) key.src = FIB_RES_PREFSRC(res);