diff -NurpP --minimal a/drivers/net/8390.c b/drivers/net/8390.c --- a/drivers/net/8390.c 2005-02-25 17:04:41.000000000 +0100 +++ b/drivers/net/8390.c 2005-02-25 17:18:50.000000000 +0100 @@ -276,6 +276,11 @@ static int ei_start_xmit(struct sk_buff int send_length = skb->len, output_page; unsigned long flags; +#if 0 + printk("ei_start_xmit(%p[%d,#%d],%p{%s})\n", + skb, skb->nfvnet, skb->nfxid, dev, dev->name); +#endif + if (skb->len < ETH_ZLEN) { skb = skb_padto(skb, ETH_ZLEN); if (skb == NULL) diff -NurpP --minimal a/drivers/net/Kconfig b/drivers/net/Kconfig --- a/drivers/net/Kconfig 2005-02-25 17:04:41.000000000 +0100 +++ b/drivers/net/Kconfig 2005-02-25 17:18:50.000000000 +0100 @@ -105,6 +105,12 @@ config TUN If you don't know what to use this for, you don't need it. +config VNET + tristate "Virtual Network device driver support" + depends on NETDEVICES && VSERVER_NGNET + ---help--- + VNET provides packet reception and transmission + config ETHERTAP tristate "Ethertap network tap" depends on NETDEVICES && EXPERIMENTAL && NETLINK_DEV diff -NurpP --minimal a/drivers/net/Makefile b/drivers/net/Makefile --- a/drivers/net/Makefile 2005-02-25 17:04:41.000000000 +0100 +++ b/drivers/net/Makefile 2005-02-25 17:18:50.000000000 +0100 @@ -175,6 +175,7 @@ obj-$(CONFIG_MACSONIC) += macsonic.o obj-$(CONFIG_MACMACE) += macmace.o obj-$(CONFIG_MAC89x0) += mac89x0.o obj-$(CONFIG_TUN) += tun.o +obj-$(CONFIG_VNET) += vnet.o obj-$(CONFIG_DL2K) += dl2k.o obj-$(CONFIG_R8169) += r8169.o obj-$(CONFIG_AMD8111_ETH) += amd8111e.o diff -NurpP --minimal a/drivers/net/vnet.c b/drivers/net/vnet.c --- a/drivers/net/vnet.c 1970-01-01 01:00:00.000000000 +0100 +++ b/drivers/net/vnet.c 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,622 @@ + +/* + * linux/drivers/net/vnet.c + * + * Written by Herbert Pötzl, 22/08/2004 + * + * based on the shaper.c code by Alan Cox. + * + * Copyright 2004 by Herbert Pötzl. + * Redistribution of this file is permitted under the + * GNU General Public License. 
+ *
+ */
+/* NOTE(review): the header names of the bare #include lines below were lost in extraction; restore them from the original patch */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+// #include /* remove asap!! */
+
+#include
+#include
+
+
+LIST_HEAD(vnets);	/* every registered vnet; guarded by vnets_rwsem */
+
+DECLARE_RWSEM(vnets_rwsem);
+
+static struct hlist_head vnet_head[1 << VNET_HASHBITS];	/* configured vnets hashed by id; guarded by vnet_lock */
+
+static rwlock_t vnet_lock = RW_LOCK_UNLOCKED;
+
+
+
+/*
+ *	Transmit from a vnet through the attached real device (vn->dev).
+ *	A clone is queued on the real device; the original skb is always
+ *	consumed here, also when the clone cannot be allocated.
+ */
+
+static void vnet_real_xmit(struct vnet *vn, struct sk_buff *skb)
+{
+	struct sk_buff *newskb = skb_clone(skb, GFP_ATOMIC);
+	struct net_device *dev = vn->dev;
+
+	vxdprintk(VXD_CBIT(ngnet, 1),
+		"vnet_real_xmit(%p) [%d,#%u]",
+		skb, skb->nfvnet, skb->nfxid);
+
+	/* FIXME is clone really necessary here? */
+	if (newskb) {
+		newskb->dev = dev;
+		// newskb->priority = 2;
+
+		/* put it into host context: tag the clone that is actually sent */
+		newskb->nfvnet = 0;
+		newskb->nfxid = 0;
+
+		dev_queue_xmit(newskb);
+		vn->stats.tx_bytes += skb->len;
+		vn->stats.tx_packets++;
+	} else {
+		vn->stats.tx_dropped++;	/* clone failed (GFP_ATOMIC) */
+	}
+	dev_kfree_skb(skb);	/* was leaked when skb_clone() failed */
+}
+
+static void vnet_vnet_xmit(struct vnet *vn, struct sk_buff *skb, int vnet)
+{
+	struct net_device *vndev;
+	struct vnet *dvn = vnet_get(vnet);	/* takes a reference, dropped by vnet_put() below */
+
+	if (!dvn) {
+		dev_kfree_skb(skb);	/* no vnet hashed under this id: drop */
+		return;
+	}
+
+	skb_orphan(skb);
+	vndev = dvn->vndev;
+
+	skb->protocol = eth_type_trans(skb, vndev);
+	skb->dev = vndev;
+
+	vxdprintk(VXD_CBIT(ngnet, 1),
+		"vnet_vnet_xmit(%p) [%d,#%u]->[%d]",
+		skb, skb->nfvnet, skb->nfxid, vnet);
+
+	/* auto tag vnet input */
+#if 0
+	skb->nfvnet = vnet;
+	skb->nfxid = dvn->nfxid;
+#endif
+	// skb->nfvnet = 0;
+	// skb->nfxid = 0;
+
+#ifndef LOOPBACK_MUST_CHECKSUM
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+#endif
+	vndev->last_rx = jiffies;
+
+	vn->stats.tx_bytes += skb->len;
+	vn->stats.tx_packets++;
+
+	dvn->stats.rx_bytes += skb->len;
+	dvn->stats.rx_packets++;
+
+	vnet_put(dvn);
+	netif_rx(skb);
+}
+
+static void vnet_loopback_xmit(struct vnet *vn, struct sk_buff *skb)
+{
+	struct net_device *vndev = vn->vndev;
+
+	skb_orphan(skb);
+
+	skb->protocol =
eth_type_trans(skb, vndev);
+	skb->dev = vndev;
+	/* an xid mismatch is only reported; the packet is still delivered */
+	if (skb->nfxid != vndev->nfxid)
+		printk("!!! vnet_loopback_xmit(%p[#%u],%p[{%s},#%u])\n",
+			skb, skb->nfxid, vndev, vndev->name, vndev->nfxid);
+#if 0
+	/* auto tag loopback input */
+	skb->nfvnet = vn->vnet;
+#endif
+
+	vxdprintk(VXD_CBIT(ngnet, 1),
+		"vnet_loopback_xmit(%p) [%d,#%u]",
+		skb, skb->nfvnet, skb->nfxid);
+
+#ifndef LOOPBACK_MUST_CHECKSUM
+	skb->ip_summed = CHECKSUM_UNNECESSARY;
+#endif
+	vndev->last_rx = jiffies;
+	/* looped back on the same vnet: counted once as rx and once as tx */
+	vn->stats.rx_bytes += skb->len;
+	vn->stats.rx_packets++;
+	vn->stats.tx_bytes += skb->len;
+	vn->stats.tx_packets++;
+
+	netif_rx(skb);	/* feed the packet back into the receive path */
+}
+
+
+
+/*
+ *	Bring the interface up. This is refused with -ENODEV until a
+ *	real device has been attached (vn->dev is set).
+ */
+
+static int vnet_open(struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+	struct net_device *dev = vn->dev;
+
+	vxdprintk(VXD_CBIT(ngnet, 2),
+		"vnet_open(%p[%s,#%u]) %p[%s]",
+		vndev, vndev->name, vndev->nfxid,
+		dev, dev?dev->name:NULL);
+
+	if (!dev)
+		return -ENODEV;
+	return 0;
+}
+
+/*
+ *	Closing a vnet only logs the event; no queue is flushed here.
+ */
+
+static int vnet_close(struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+	struct net_device *dev = vn->dev;
+
+	vxdprintk(VXD_CBIT(ngnet, 2),
+		"vnet_close(%p[%s,#%u]) %p[%s]",
+		vndev, vndev->name, vndev->nfxid,
+		dev, dev?dev->name:NULL);
+	return 0;
+}
+
+static int vnet_start_xmit(struct sk_buff *skb, struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+
+	vxdprintk(VXD_CBIT(net, 5),
+		"vnet_start_xmit(%p[%d,#%u],%p[%s,#%u])",
+		skb, skb->nfvnet, skb->nfxid,
+		vndev, vndev->name, vndev->nfxid);
+	/* only packets tagged with this vnet's context id may leave through it */
+	if (skb->nfxid != vn->nfxid) {
+		vxwprintk(1, "packet from #%d on vnet %d:#%d",
+			skb->nfxid, vn->vnet, vn->nfxid);
+		goto drop;
+	}
+
+	/* send packet via loopback */
+	if (vndev->flags & IFF_LOOPBACK) {
+		vnet_loopback_xmit(vn, skb);
+		return 0;
+	}
+
+#if 0
+	/* drop packet on vnet mismatch */
+	if (skb->nfvnet && skb->nfvnet != vn->vnet)
+		goto drop;
+#endif
+
+	/*
+	if (skb->nh.iph->daddr == 0x0300a8c0) {
+		vnet_vnet_xmit(vn, skb, 21);
+		return 0;
+	}
+	*/
+
+	/* send packet via real device */
+	vnet_real_xmit(vn, skb);
+	return 0;
+
+drop:
+	dev_kfree_skb(skb);
+	vn->stats.tx_carrier_errors++;	/* rejected packets are accounted as carrier errors */
+	return 0;	/* the skb was consumed, so report success to the stack */
+}
+
+static struct net_device_stats *vnet_get_stats(struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+
+	return &vn->stats;
+}
+
+/* Header handling is delegated to the real device; skb->dev points at it
+ * only for the duration of the call and is restored afterwards. */
+static int vnet_header(struct sk_buff *skb, struct net_device *vndev,
+	unsigned short type, void *daddr, void *saddr, unsigned len)
+{
+	struct vnet *vn = vndev->priv;
+	struct net_device *dev = vn->dev;	/* NOTE(review): not checked for NULL - confirm this cannot run before vnet_attach_dev() */
+	int v;
+
+	skb->dev = dev;
+	v = dev->hard_header(skb, dev, type, daddr, saddr, len);
+	skb->dev = vndev;
+	return v;
+}
+
+static int vnet_rebuild_header(struct sk_buff *skb)
+{
+	struct vnet *vn = skb->dev->priv;
+	struct net_device *dev = vn->dev;
+	struct net_device *vndev = skb->dev;
+	int v;
+
+	skb->dev = dev;
+	v = dev->rebuild_header(skb);
+	skb->dev = vndev;
+	return v;
+}
+
+#ifdef CONFIG_INET
+
+static int vnet_neigh_setup(struct neighbour *n)
+{
+	if (n->nud_state ==
NUD_NONE) {
+		n->ops = &arp_broken_ops;	/* neighbours still in NUD_NONE are switched to arp_broken_ops */
+		n->output = n->ops->output;
+	}
+	return 0;
+}
+
+static int vnet_neigh_setup_dev(struct net_device *dev, struct neigh_parms *p)
+{
+	if (p->tbl->family == AF_INET) {	/* only the AF_INET neighbour table is adjusted */
+		p->neigh_setup = vnet_neigh_setup;
+		p->ucast_probes = 0;
+		p->mcast_probes = 0;
+	}
+	return 0;
+}
+
+#else /* !(CONFIG_INET) */
+
+static int vnet_neigh_setup_dev(struct net_device *vndev, struct neigh_parms *p)
+{
+	return 0;
+}
+
+#endif
+
+static int vnet_set_address(struct net_device *vndev, void *p)
+{
+	struct sockaddr *sa = p;
+
+	if (!is_valid_ether_addr(sa->sa_data))
+		return -EADDRNOTAVAIL;
+
+	memcpy(vndev->dev_addr, sa->sa_data, ETH_ALEN);	/* always ETH_ALEN bytes, independent of vndev->addr_len */
+	return 0;
+}
+
+
+#if 0
+static int vnet_ioctl(struct net_device *dev, struct ifreq *ifr, int cmd)
+{
+	struct vnetconf *vc= (struct vnetconf *)&ifr->ifr_ifru;
+	struct vnet *vn = dev->priv;
+
+	if(vc->vc_cmd == VNET_SET_DEV)
+	{
+		if(!capable(CAP_NET_ADMIN))
+			return -EPERM;
+	}
+
+	switch(vc->vc_cmd)
+	{
+		case VNET_SET_DEV:
+		{
+			struct net_device *them = __dev_get_by_name(vc->vc_name);
+			if (!them)
+				return -ENODEV;
+			if (vn->dev)
+				return -EBUSY;
+			return vnet_attach(dev, dev->priv, them);
+		}
+		case VNET_GET_DEV:
+			if (!vn->dev)
+				return -ENODEV;
+			strcpy(vc->vc_name, vn->dev->name);
+			return 0;
+		default:
+			return -EINVAL;
+	}
+}
+#endif
+
+static void vnet_init_priv(struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+
+	atomic_set(&vn->refcnt, 0);
+	INIT_LIST_HEAD(&vn->vnet_list);
+	INIT_HLIST_NODE(&vn->vnet_hlist);	/* unhashed until vnet_config() */
+
+	vn->vndev = vndev;
+	vn->vnet = ~0;	/* stored in a uint16_t, i.e. VNET_UNTAGGED */
+	vn->nfxid = 0;
+}
+
+/*
+ *	Setup callback handed to alloc_netdev(): initialise the private
+ *	area and install the vnet device methods and defaults.
+ */
+
+static void vnet_setup(struct net_device *vndev)
+{
+	vnet_init_priv(vndev);
+
+	vndev->open = vnet_open;
+	vndev->stop = vnet_close;
+	vndev->hard_start_xmit = vnet_start_xmit;
+	vndev->get_stats = vnet_get_stats;
+	vndev->set_mac_address = vnet_set_address;
+	vndev->set_multicast_list = NULL;
+
+	/*
+	 *	Handlers for when we attach to a device.
+	 */
+
+	vndev->hard_header = vnet_header;
+	vndev->rebuild_header = vnet_rebuild_header;
+
+	// vndev->neigh_setup = vnet_neigh_setup_dev;
+	// dev->do_ioctl = vnet_ioctl;
+	vndev->hard_header_len = 0;
+	vndev->type = ARPHRD_ETHER; /* initially */
+	vndev->mtu = 1500;
+	vndev->addr_len = 0;
+	vndev->tx_queue_len = 10;
+	vndev->flags = 0;
+	vndev->priv_flags = IFF_VNET;	/* marks the device as a vnet virtual device */
+
+}
+
+int vnet_attach_dev(struct net_device *vndev, struct net_device *dev)
+{
+	struct vnet *vn = vndev->priv;
+
+	vn->dev = dev;	/* NOTE(review): no dev_hold() here although vnet_destroy_dev() does dev_put() - confirm the caller passes in a held reference */
+
+	if (dev->hard_header)	/* header ops are offered only if the real device has them */
+		vndev->hard_header = vnet_header;
+	else
+		vndev->hard_header = NULL;
+
+	if (dev->rebuild_header)
+		vndev->rebuild_header = vnet_rebuild_header;
+	else
+		vndev->rebuild_header = NULL;
+
+	vndev->header_cache_update = NULL;
+	vndev->hard_header_cache = NULL;
+
+	vndev->neigh_setup = vnet_neigh_setup_dev;
+	/* mirror the real device's link-layer parameters */
+	vndev->hard_header_len = dev->hard_header_len;
+	vndev->type = dev->type;
+	vndev->addr_len = dev->addr_len;
+	memcpy(vndev->dev_addr, dev->dev_addr, dev->addr_len);
+	vndev->mtu = dev->mtu;
+	vndev->flags |= (dev->flags &
+		(IFF_LOOPBACK|IFF_POINTOPOINT|IFF_BROADCAST));
+	return 0;
+}
+
+int vnet_config(struct net_device *vndev, int vnet, nfxid_t nfxid)
+{
+	struct vnet *vn = vndev->priv;
+	struct hlist_head *hash = &vnet_head[vnet &
+		((1 << VNET_HASHBITS)-1)];
+
+	/* (re)hash under the new id: unlink first if already hashed */
+	write_lock(&vnet_lock);
+	if (!hlist_unhashed(&vn->vnet_hlist))
+		hlist_del_init(&vn->vnet_hlist);
+	vn->vnet = vnet;	/* int stored into a uint16_t field */
+	vn->nfxid = nfxid;
+
+	hlist_add_head(&vn->vnet_hlist, hash);
+	write_unlock(&vnet_lock);
+	return 0;
+}
+
+struct net_device *vnet_create_dev(const char *name, nfxid_t nfxid)
+{
+	struct net_device *vndev;
+
+	vndev = alloc_netdev(sizeof(struct vnet), name, vnet_setup);
+	if (!vndev)
+		goto out;
+
+	vndev->nfxid = nfxid;
+	if (register_netdev(vndev)) {
+		free_netdev(vndev);	/* registration failed: NULL is returned */
+		vndev = NULL;
+	} else {
+		struct vnet *vn = vndev->priv;
+
+		down_write(&vnets_rwsem);
+		list_add_tail(&vn->vnet_list, &vnets);	/* now visible to the "vnet" proc listing */
+		up_write(&vnets_rwsem);
+	}
+
+out:
+	return vndev;
+}
+
+void vnet_destroy_dev(struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+
+	vxdprintk(VXD_CBIT(ngnet, 3),
+		"vnet_destroy_dev(%p»%s«) [%p,%d,#%u]",
+		vndev, vndev->name, vn, vn->vnet, vn->nfxid);
+
+	down_write(&vnets_rwsem);
+	list_del(&vn->vnet_list);	/* remove from the global list first */
+	up_write(&vnets_rwsem);
+
+	write_lock(&vnet_lock);
+	if (!hlist_unhashed(&vn->vnet_hlist))	/* hashed only after vnet_config() */
+		hlist_del(&vn->vnet_hlist);
+	write_unlock(&vnet_lock);
+
+	rtnl_lock();
+	/* put real dev if configured */
+	if (vn->dev)
+		dev_put(vn->dev);
+
+	/* unregister vndev */
+	unregister_netdevice(vndev);
+	rtnl_unlock();
+
+	free_netdev(vndev);	/* NOTE(review): vn->refcnt is not consulted before freeing - confirm no vnet_get() user can still hold vn */
+}
+
+
+
+/* locate vnet devices */
+
+/* plain hash-chain walk, no locking or refcounting; vnet_get() wraps it in vnet_lock */
+struct vnet *__vnet_get(struct hlist_head *hash, int vnet)
+{
+	struct hlist_node *p;
+
+	hlist_for_each(p, hash) {
+		struct vnet *vn
+			= hlist_entry(p, struct vnet, vnet_hlist);
+		if (vn->vnet == vnet)
+			return vn;
+	}
+	return NULL;
+}
+
+struct vnet *vnet_get(int vnet)
+{
+	struct vnet *vn;
+	struct hlist_head *hash = &vnet_head[vnet &
+		((1 << VNET_HASHBITS)-1)];
+
+	read_lock(&vnet_lock);
+	vn = __vnet_get(hash, vnet);
+	if (vn)
+		atomic_inc(&vn->refcnt);	/* released with vnet_put() */
+	read_unlock(&vnet_lock);
+	vxdprintk(VXD_CBIT(ngnet, 4),
+		"vnet_get(%d) = %p[%d,#%u]",
+		vnet, vn, vn?vn->vnet:0, vn?vn->nfxid:0);
+	return vn;	/* NULL if no vnet is hashed under this id */
+}
+
+xid_t vnet_get_xid(struct net_device *vndev)
+{
+	struct vnet *vn = vndev->priv;
+
+	return vn->nfxid;	/* nfxid_t (16 bit) widened to xid_t */
+}
+
+
+#ifdef CONFIG_PROC_FS
+
+static void *vnet_seq_start(struct seq_file *s, loff_t *pos)
+{
+	struct list_head *p;
+	loff_t l = *pos;
+
+	down_read(&vnets_rwsem);	/* released in vnet_seq_stop() */
+	list_for_each(p, &vnets)
+		if (!l--)
+			return list_entry(p, struct vnet, vnet_list);
+	return NULL;
+}
+
+static void *vnet_seq_next(struct seq_file *s, void *v, loff_t *pos)
+{
+	struct list_head *p = ((struct vnet *)v)->vnet_list.next;
+
+	++*pos;
+	return p==&vnets ?
NULL : + list_entry(p, struct vnet, vnet_list); +} + +static void vnet_seq_stop(struct seq_file *s, void *v) +{ + up_read(&vnets_rwsem); +} + +static int vnet_seq_show(struct seq_file *s, void *v) +{ + struct vnet *vn = v; + struct net_device *dev = vn->dev; + struct net_device *vndev = vn->vndev; + + if (&vn->vnet_list == vnets.next) + seq_puts(s, "#vnet\t#xid\treal\tvnet dev\n"); + + seq_printf(s, "%5d\t%5d\t%s\t%s\t%04x %d\n", + vn->vnet, vn->nfxid, dev->name, + vndev->name, vndev->flags, + atomic_read(&vndev->refcnt)); + return 0; +} + +static struct seq_operations vnet_seq_op = { + .start = vnet_seq_start, + .next = vnet_seq_next, + .stop = vnet_seq_stop, + .show = vnet_seq_show, +}; + +static int vnet_proc_open(struct inode *inode, struct file *file) +{ + return seq_open(file, &vnet_seq_op); +} +static struct file_operations proc_vnet_operations = { + .open = vnet_proc_open, + .read = seq_read, + .llseek = seq_lseek, + .release = seq_release, +}; + + +#endif + +static int __init vnet_init_module(void) +{ + struct proc_dir_entry *entry; + + entry = create_proc_entry("vnet", 0, NULL); + if (entry) + entry->proc_fops = &proc_vnet_operations; + return 0; +} + +module_init(vnet_init_module); +// module_exit(dummy_cleanup_module); + + +MODULE_LICENSE("GPL"); + diff -NurpP --minimal a/include/linux/if.h b/include/linux/if.h --- a/include/linux/if.h 2004-12-24 22:34:33.000000000 +0100 +++ b/include/linux/if.h 2005-02-25 17:18:50.000000000 +0100 @@ -53,6 +53,8 @@ #define IFF_802_1Q_VLAN 0x1 /* 802.1Q VLAN device. */ #define IFF_EBRIDGE 0x2 /* Ethernet bridging device. 
*/ +#define IFF_VNET 0x4 /* vnet virtual device */ + #define IF_GET_IFACE 0x0001 /* for querying only */ #define IF_GET_PROTO 0x0002 diff -NurpP --minimal a/include/linux/if_vnet.h b/include/linux/if_vnet.h --- a/include/linux/if_vnet.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/if_vnet.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,48 @@ +#ifndef __LINUX_VNET_H +#define __LINUX_VNET_H + +#ifdef __KERNEL__ + + +extern struct list_head vnets; + + +#define VNET_HASHBITS 8 + +struct vnet +{ + struct hlist_node vnet_hlist; + struct list_head vnet_list; + atomic_t refcnt; + + struct net_device *dev; + struct net_device *vndev; + struct net_device_stats stats; + + uint16_t vnet; + nfxid_t nfxid; +}; + +#define VNET_UNTAGGED ((uint16_t)~0) + +struct vnet *vnet_get(int vnet); + +static inline void vnet_put(struct vnet *vn) +{ + if (vn) + atomic_dec(&vn->refcnt); +} + +int vnet_attach_dev(struct net_device *vndev, struct net_device *dev); + +int vnet_config(struct net_device *vndev, int vnet, nfxid_t nfxid); + +struct net_device *vnet_create_dev(const char *name, nfxid_t nfxid); + +void vnet_destroy_dev(struct net_device *dev); + +xid_t vnet_get_xid(struct net_device *dev); + +#endif + +#endif /* __LINUX_VNET_H */ diff -NurpP --minimal a/include/linux/inetdevice.h b/include/linux/inetdevice.h --- a/include/linux/inetdevice.h 2004-12-24 22:35:01.000000000 +0100 +++ b/include/linux/inetdevice.h 2005-02-25 17:18:50.000000000 +0100 @@ -101,7 +101,7 @@ struct in_ifaddr extern int register_inetaddr_notifier(struct notifier_block *nb); extern int unregister_inetaddr_notifier(struct notifier_block *nb); -extern struct net_device *ip_dev_find(u32 addr); +extern struct net_device *ip_dev_find(u32 addr, nfxid_t nfxid); extern int inet_addr_onlink(struct in_device *in_dev, u32 a, u32 b); extern int devinet_ioctl(unsigned int cmd, void __user *); extern void devinet_init(void); diff -NurpP --minimal a/include/linux/netdevice.h b/include/linux/netdevice.h --- 
a/include/linux/netdevice.h 2005-02-25 17:04:58.000000000 +0100 +++ b/include/linux/netdevice.h 2005-02-25 17:18:50.000000000 +0100 @@ -302,7 +302,9 @@ struct net_device /* Interface index. Unique device identifier */ int ifindex; int iflink; - +#ifdef CONFIG_VSERVER_NGNET + nfxid_t nfxid; +#endif struct net_device_stats* (*get_stats)(struct net_device *dev); struct iw_statistics* (*get_wireless_stats)(struct net_device *dev); @@ -525,7 +527,7 @@ extern rwlock_t dev_base_lock; /* De extern int netdev_boot_setup_check(struct net_device *dev); extern unsigned long netdev_boot_base(const char *prefix, int unit); -extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr); +extern struct net_device *dev_getbyhwaddr(unsigned short type, char *hwaddr, nfxid_t nfxid); extern struct net_device *dev_getfirstbyhwtype(unsigned short type); extern void dev_add_pack(struct packet_type *pt); extern void dev_remove_pack(struct packet_type *pt); diff -NurpP --minimal a/include/linux/netfilter_ipv4/ipt_vnet.h b/include/linux/netfilter_ipv4/ipt_vnet.h --- a/include/linux/netfilter_ipv4/ipt_vnet.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/netfilter_ipv4/ipt_vnet.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,9 @@ +#ifndef _IPT_VNET_H +#define _IPT_VNET_H + +struct ipt_vnet_info { + uint16_t vnet; + uint16_t flags; +}; + +#endif /*_IPT_VNET_H*/ diff -NurpP --minimal a/include/linux/netfilter_ipv4/ipt_VNET.h b/include/linux/netfilter_ipv4/ipt_VNET.h --- a/include/linux/netfilter_ipv4/ipt_VNET.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/netfilter_ipv4/ipt_VNET.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,8 @@ +#ifndef _IPT_VNET_H_target +#define _IPT_VNET_H_target + +struct ipt_vnet_target_info { + uint16_t vnet; +}; + +#endif /*_IPT_VNET_H_target*/ diff -NurpP --minimal a/include/linux/skbuff.h b/include/linux/skbuff.h --- a/include/linux/skbuff.h 2005-02-25 17:04:59.000000000 +0100 +++ b/include/linux/skbuff.h 2005-02-25 
17:18:50.000000000 +0100 @@ -23,7 +23,6 @@ #include #include #include -#include #include #include #include @@ -248,6 +247,8 @@ struct sk_buff { void (*destructor)(struct sk_buff *skb); #ifdef CONFIG_NETFILTER unsigned long nfmark; + __u16 nfxid; + __u16 nfvnet; __u32 nfcache; __u32 nfctinfo; struct nf_conntrack *nfct; diff -NurpP --minimal a/include/linux/types.h b/include/linux/types.h --- a/include/linux/types.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/linux/types.h 2005-02-25 17:18:50.000000000 +0100 @@ -37,6 +37,7 @@ typedef __kernel_gid32_t gid_t; typedef __kernel_uid16_t uid16_t; typedef __kernel_gid16_t gid16_t; typedef unsigned int xid_t; +typedef __u16 nfxid_t; typedef unsigned int nid_t; #ifdef CONFIG_UID16 diff -NurpP --minimal a/include/linux/vserver/context.h b/include/linux/vserver/context.h --- a/include/linux/vserver/context.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/linux/vserver/context.h 2005-02-25 17:18:50.000000000 +0100 @@ -73,6 +73,7 @@ #include "limit_def.h" #include "sched_def.h" #include "cvirt_def.h" +#include "ngnet_def.h" struct vx_info { struct hlist_node vx_hlist; /* linked list of contexts */ @@ -98,6 +99,7 @@ struct vx_info { struct _vx_sched sched; /* vserver scheduler */ struct _vx_cvirt cvirt; /* virtual/bias stuff */ struct _vx_cacct cacct; /* context accounting */ + struct _vx_ngnet ngnet; /* next gen networking */ char vx_name[65]; /* vserver name */ }; @@ -105,7 +107,8 @@ struct vx_info { /* status flags */ -#define VXS_HASHED 0x0001 +#define VXS_USED 0x0001 +#define VXS_HASHED 0x0002 #define VXS_PAUSED 0x0010 #define VXS_ONHOLD 0x0020 #define VXS_SHUTDOWN 0x0100 @@ -143,7 +146,16 @@ extern int xid_is_hashed(xid_t); extern int vx_migrate_task(struct task_struct *, struct vx_info *); -extern long vs_context_state(unsigned int); +extern long vs_context_state(struct vx_info *, unsigned int); + +struct vx_info_save { + struct vx_info *vxi; + xid_t xid; +}; + +extern int enter_vx_info(struct vx_info *, struct 
vx_info_save *); +extern int leave_vx_info(struct vx_info_save *); + extern void free_vx_info(struct vx_info *); diff -NurpP --minimal a/include/linux/vserver/debug.h b/include/linux/vserver/debug.h --- a/include/linux/vserver/debug.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/linux/vserver/debug.h 2005-02-25 17:18:50.000000000 +0100 @@ -14,6 +14,7 @@ #ifdef CONFIG_VSERVER_DEBUG +#define VX_DEBUG_INFO extern unsigned int vx_debug_switch; extern unsigned int vx_debug_xid; @@ -22,25 +23,52 @@ extern unsigned int vx_debug_net; extern unsigned int vx_debug_limit; extern unsigned int vx_debug_dlim; extern unsigned int vx_debug_cvirt; +extern unsigned int vx_debug_ngnet; + + +#include +#define __LOCATION__ __FILE__ ":" __stringify(__LINE__) + +#ifdef VX_DEBUG_INFO +#define __vxd_info(t,c,f,x) \ + asm( ".section .vxdebug \n" \ + ".ascii \"" __LOCATION__ "\" \n" \ + ".byte 0x01 \n" \ + ".ascii " #t " \n" \ + ".byte 0x01 \n" \ + ".ascii " #c " \n" \ + ".byte 0x01 \n" \ + ".ascii " #f " \n" \ + ".byte 0x01 \n" \ + ".ascii " #x " \n" \ + ".byte 0x02 \n" \ + ".previous \n" \ + ) +#else +#define __vxd_info(t,c,f,x) +#endif #define VX_LOGLEVEL "vxD: " #define VX_WARNLEVEL KERN_WARNING "vxW: " #define vxdprintk(c,f,x...) \ do { \ + __vxd_info("d",#c,f, #x); \ if (c) \ printk(VX_LOGLEVEL f "\n" , ##x); \ } while (0) #define vxlprintk(c,f,x...) \ do { \ + __vxd_info("l",#c,f, #x); \ if (c) \ printk(VX_LOGLEVEL f " @%s:%d\n", x); \ } while (0) #define vxfprintk(c,f,x...) \ do { \ + __vxd_info("f",#c,f, #x); \ if (c) \ printk(VX_LOGLEVEL f " %s@%s:%d\n", x); \ } while (0) @@ -48,6 +76,7 @@ extern unsigned int vx_debug_cvirt; #define vxwprintk(c,f,x...) 
\ do { \ + __vxd_info("w",#c,f, #x); \ if (c) \ printk(VX_WARNLEVEL f "\n" , ##x); \ } while (0) diff -NurpP --minimal a/include/linux/vserver/ngnet_cmd.h b/include/linux/vserver/ngnet_cmd.h --- a/include/linux/vserver/ngnet_cmd.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/vserver/ngnet_cmd.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,37 @@ +#ifndef _VX_NGNET_CMD_H +#define _VX_NGNET_CMD_H + +#ifdef __KERNEL__ +#include +#else +#include +#endif + +/* ngnet vserver commands */ + +#define VCMD_add_vndev VC_CMD(VNET, 2, 0) +#define VCMD_zap_vnet VC_CMD(VNET, 3, 0) + +struct vcmd_add_vndev_v0 { + uint16_t vnet; + uint16_t flags; + + char name[IFNAMSIZ]; + char real[IFNAMSIZ]; +}; + +struct vcmd_zap_vnet_v0 { + uint16_t vnet; + uint16_t flags; +}; + + +#ifdef __KERNEL__ + +#include + +extern int vc_add_vndev(uint32_t, void __user *); +extern int vc_zap_vnet(uint32_t, void __user *); + +#endif /* __KERNEL__ */ +#endif /* _VX_NGNET_CMD_H */ diff -NurpP --minimal a/include/linux/vserver/ngnet_def.h b/include/linux/vserver/ngnet_def.h --- a/include/linux/vserver/ngnet_def.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/vserver/ngnet_def.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,31 @@ +#ifndef _VX_NGNET_DEF_H +#define _VX_NGNET_DEF_H + +#include +#include + +#include "ngnet.h" + +/* context sub struct */ + +struct _vx_ngnet { + struct net_device *dev_base; + struct net_device **dev_tail; + struct net_device *loopback; +/* + int ifindex; + + struct _vx_dev_hash { + struct hlist_head name; +// struct hlist_head index; + } dev_hash[1 << NG_HASHBITS]; + + +// struct hlist_head *fib_info_laddrhash; + +*/ + struct fib_table *fib_main; + struct fib_table *fib_local; +}; + +#endif /* _VX_NGNET_DEF_H */ diff -NurpP --minimal a/include/linux/vserver/ngnet.h b/include/linux/vserver/ngnet.h --- a/include/linux/vserver/ngnet.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/vserver/ngnet.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,22 @@ 
+#ifndef _VX_NGNET_H +#define _VX_NGNET_H + +#ifdef __KERNEL__ + +struct hlist_head; +struct net_device; +struct vx_info; + +extern struct net_device *vx_dev_base(struct vx_info *); + +extern struct hlist_head *vx_dev_name_hash(struct vx_info *, unsigned); +extern struct hlist_head *vx_dev_index_hash(struct vx_info *, int); + +extern int vx_zap_vnet(struct vx_info *, int); + +#define NG_HASHBITS 4 + +#endif /* __KERNEL__ */ +#endif /* _VX_NGNET_H */ + + diff -NurpP --minimal a/include/linux/vserver/switch.h b/include/linux/vserver/switch.h --- a/include/linux/vserver/switch.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/linux/vserver/switch.h 2005-02-25 17:18:50.000000000 +0100 @@ -61,6 +61,7 @@ #define VC_CAT_NETALT 26 #define VC_CAT_NETMIG 27 #define VC_CAT_NETCTRL 28 +#define VC_CAT_NGTEST 29 #define VC_CAT_DLIMIT 36 #define VC_CAT_INODE 38 diff -NurpP --minimal a/include/linux/vs_ngnet.h b/include/linux/vs_ngnet.h --- a/include/linux/vs_ngnet.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/linux/vs_ngnet.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,249 @@ +#ifndef _VX_VS_NGNET_H +#define _VX_VS_NGNET_H + + +#include +#include +#include "vserver/debug.h" + + +#if 0 +#define __vx_loopback_dev_ptr(v) ((v) ? \ + (v)->vx_info->ngnet.loopback : &loopback_dev) + +#define __vx_loopback_dev(v) (*__vx_loopback_dev_ptr(v)) +#endif + +extern struct net_device loopback_dev; + +static inline +struct net_device *__vx_loopback_dev(nfxid_t nfxid) +{ + struct net_device *loopback; + struct vx_info *vxi; + + /* FIXME obsolete when optimization in locate? */ + if (nfxid == vx_current_xid()) + vxi = current->vx_info; + else + vxi = locate_vx_info(nfxid); + + if (vxi) { + loopback = vxi->ngnet.loopback; + /* FIXME check for general? 
*/ + if (vxi != current->vx_info) + put_vx_info(vxi); + } else { + loopback = &loopback_dev; + } + + vxdprintk(VXD_CBIT(ngnet, 1), + "__vx_loopback_dev(#%d) = %p", nfxid, loopback); + return loopback; +} + +extern struct fib_table *ip_fib_local_table; +extern struct fib_table *ip_fib_main_table; + +static inline +struct fib_table *__vx_fib_get_table(int id, nfxid_t nfxid) +{ + struct fib_table *rt; + struct vx_info *vxi; + + /* FIXME obsolete when optimization in locate? */ + if (nfxid == vx_current_xid()) + vxi = current->vx_info; + else + vxi = locate_vx_info(nfxid); + +#ifdef CONFIG_IP_MULTIPLE_TABLES + +#else + if (nfxid && !vxi) + printk("!!! context #%u not found ...\n", nfxid); + if (vxi) { + if (id == /* RT_TABLE_LOCAL */255) + rt = vxi->ngnet.fib_local; + else + rt = vxi->ngnet.fib_main; + + /* FIXME check for general? */ + if (vxi != current->vx_info) + put_vx_info(vxi); + } else { + if (id == /* RT_TABLE_LOCAL */255) + rt = ip_fib_local_table; + else + rt = ip_fib_main_table; + } +#endif + vxdprintk(VXD_CBIT(ngnet, 1), + "__vx_fib_get_table(%d,#%d) = %p", id, nfxid, rt); + // dump_stack(); + return rt; +} + +#define __vx_fib_main_table(x) __vx_fib_get_table(RT_TABLE_MAIN, x) + +#define __vx_fib_local_table(x) __vx_fib_get_table(RT_TABLE_LOCAL, x) + + +static inline +void vx_tag_output_skb(struct sock *sk, struct sk_buff *skb) +{ + nfxid_t skxid = sk?sk->sk_xid:0; + + skb->nfvnet = VNET_UNTAGGED; + if (skxid) + skb->nfxid = skxid; + else + skb->nfxid = vx_current_xid(); +} + +#define vx_tag_netlink_skb(sk,skb) \ + vx_tag_output_skb(sk,skb) + + +#ifdef CONFIG_VSERVER_NGNET +#define VX_SOCK_MODE (VX_IDENT) +#else +#define VX_SOCK_MODE (VX_IDENT|VX_ADMIN) +#endif + +#define vx_sk_xid(s) ((s)?(s)->sk_xid:0) + +#define vx_sk_check(s,m) \ + __vx_check(vx_sk_xid(s), (m), VX_SOCK_MODE) + + +#if 0 +/* device hashes */ + +#ifdef CONFIG_VSERVER_NGNET + +#define __vx_dev_base_ptr (current->vx_info ? 
\
+	&current->vx_info->ngnet.dev_base : &dev_base)
+
+#define __vx_dev_base		(*__vx_dev_base_ptr)
+
+#define __vx_dev_tail_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.dev_tail : &dev_tail)
+
+#define __vx_dev_tail		(*__vx_dev_tail_ptr)
+
+#define __vx_ifindex_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.ifindex : &ifindex)
+
+#define __vx_loopback_ptr	(current->vx_info ? \
+	current->vx_info->ngnet.loopback : &loopback_dev)
+
+static inline
+struct hlist_head *__vx_dev_name_hash(struct hlist_head *head,
+	unsigned mask, unsigned hash)
+{
+	struct vx_info *vxi = current->vx_info;
+
+	if (vxi)	/* inside a context: use its own hash */
+		return vx_dev_name_hash(vxi, hash);
+	return &head[hash & mask];
+}
+
+/*
+static inline
+struct hlist_head *__vx_dev_index_hash(struct hlist_head *head,
+	unsigned mask, int ifindex)
+{
+	struct vx_info *vxi = current->vx_info;
+
+	if (vxi)
+		return vx_dev_index_hash(vxi, ifindex);
+	return &head[ifindex & mask];
+}
+*/
+
+
+#define __vx_fib_info_hash_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.fib_info_hash : &fib_info_hash)
+
+#define __vx_fib_info_hash	(*__vx_fib_info_hash_ptr)
+
+#define __vx_fib_info_laddrhash_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.fib_info_laddrhash : &fib_info_laddrhash)
+
+#define __vx_fib_info_laddrhash	(*__vx_fib_info_laddrhash_ptr)
+
+#define __vx_fib_hash_size_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.fib_hash_size : &fib_hash_size)
+
+#define __vx_fib_hash_size	(*__vx_fib_hash_size_ptr)
+
+#define __vx_fib_info_cnt_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.fib_info_cnt : &fib_info_cnt)
+
+#define __vx_fib_info_cnt	(*__vx_fib_info_cnt_ptr)
+
+#define __vx_fib_rules_ptr	(current->vx_info ? \
+	&current->vx_info->ngnet.fib_rules : &fib_rules)
+
+#define __vx_fib_rules		(*__vx_fib_rules_ptr)
+
+#define __vx_local_rule_ptr	(current->vx_info ? 
\ + current->vx_info->ngnet.local_rule : &local_rule) + +#define _NGN_(x) do { x } while (0) + + +#else /* VSERVER_NGNET */ + +#define __vx_dev_base dev_base + +#define __vx_dev_tail_ptr &dev_tail + +#define __vx_ifindex_ptr &ifindex + +#define __vx_loopback_ptr &loopback_dev + +static inline +struct hlist_head *__vx_dev_name_hash(struct hlist_head *head, + unsigned mask, unsigned hash) +{ + return &head[hash & mask]; +} + +static inline +struct hlist_head *__vx_dev_index_hash(struct hlist_head *head, + unsigned mask, int ifindex) +{ + return &head[ifindex & mask]; +} + + +#define __vx_fib_main_table(x) ip_fib_main_table + +#define __vx_fib_local_table(x) ip_fib_local_table + +#define __vx_fib_info_hash fib_info_hash + +#define __vx_fib_info_laddrhash fib_info_laddrhash + +#define __vx_fib_hash_size fib_hash_size + +#define __vx_fib_info_cnt fib_info_cnt + +#define __vx_fib_rules fib_rules + +#define __vx_fib_rules_ptr &fib_rules + +#define __vx_local_rule_ptr &local_rule + +#define _NGN_(x) + + +#endif /* VSERVER_NGNET */ + +#endif + +#else +#warning duplicate inclusion +#endif diff -NurpP --minimal a/include/net/flow.h b/include/net/flow.h --- a/include/net/flow.h 2004-12-24 22:35:27.000000000 +0100 +++ b/include/net/flow.h 2005-02-25 17:18:50.000000000 +0100 @@ -77,6 +77,8 @@ struct flowi { #define fl_icmp_type uli_u.icmpt.type #define fl_icmp_code uli_u.icmpt.code #define fl_ipsec_spi uli_u.spi + + nfxid_t nfxid; } __attribute__((__aligned__(BITS_PER_LONG/8))); #define FLOW_DIR_IN 0 diff -NurpP --minimal a/include/net/ip_fib.h b/include/net/ip_fib.h --- a/include/net/ip_fib.h 2005-02-25 17:04:59.000000000 +0100 +++ b/include/net/ip_fib.h 2005-02-25 17:18:50.000000000 +0100 @@ -19,6 +19,9 @@ #include #include #include +#include +#include + /* WARNING: The ordering of these elements must match ordering * of RTA_* rtnetlink attribute numbers. 
@@ -81,6 +84,7 @@ struct fib_info { #ifdef CONFIG_IP_ROUTE_MULTIPATH int fib_power; #endif + nfxid_t fib_nfxid; struct fib_nh fib_nh[0]; #define fib_dev fib_nh[0].nh_dev }; @@ -135,6 +139,7 @@ struct fib_table { void (*tb_select_default)(struct fib_table *table, const struct flowi *flp, struct fib_result *res); + nfxid_t tb_nfxid; unsigned char tb_data[0]; }; @@ -143,30 +148,51 @@ struct fib_table { extern struct fib_table *ip_fib_local_table; extern struct fib_table *ip_fib_main_table; -static inline struct fib_table *fib_get_table(int id) +static inline struct fib_table *fib_get_table(int id, nfxid_t nfxid) { - if (id != RT_TABLE_LOCAL) - return ip_fib_main_table; - return ip_fib_local_table; + struct fib_table *tb = __vx_fib_get_table(id, nfxid); + + vxdprintk(VXD_CBIT(ngnet, 1), + "fib_get_table(%u,#%u): %p[#%u]", + id, nfxid, tb, tb->tb_nfxid); + return tb; } -static inline struct fib_table *fib_new_table(int id) +static inline struct fib_table *fib_new_table(int id, nfxid_t nfxid) { - return fib_get_table(id); + struct fib_table *tb = __vx_fib_get_table(id, nfxid); + + vxdprintk(VXD_CBIT(ngnet, 1), + "fib_new_table(%u,#%u): %p[#%u]", + id, nfxid, tb, tb->tb_nfxid); + return tb; } static inline int fib_lookup(const struct flowi *flp, struct fib_result *res) { - if (ip_fib_local_table->tb_lookup(ip_fib_local_table, flp, res) && - ip_fib_main_table->tb_lookup(ip_fib_main_table, flp, res)) + struct fib_table *fib_local = __vx_fib_local_table(flp->nfxid); + struct fib_table *fib_main = __vx_fib_main_table(flp->nfxid); + + vxdprintk(VXD_CBIT(ngnet, 2), + "fib_lookup(%p[#%d]) %u.%u.%u.%u <- %u.%u.%u.%u", + flp, flp->nfxid, + NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src)); + + if (fib_local->tb_lookup(fib_local, flp, res) && + fib_main->tb_lookup(fib_main, flp, res)) { + vxdprintk(VXD_CBIT(ngnet, 2), + "fib_lookup() unreachable"); return -ENETUNREACH; + } return 0; } static inline void fib_select_default(const struct flowi *flp, struct fib_result *res) { + struct 
fib_table *fib_main = __vx_fib_main_table(flp->nfxid); + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) - ip_fib_main_table->tb_select_default(ip_fib_main_table, flp, res); + fib_main->tb_select_default(fib_main, flp, res); } #else /* CONFIG_IP_MULTIPLE_TABLES */ @@ -210,14 +236,16 @@ extern void fib_select_multipath(const s /* Exported by fib_semantics.c */ extern int ip_fib_check_default(u32 gw, struct net_device *dev); -extern int fib_sync_down(u32 local, struct net_device *dev, int force); +extern int fib_sync_down(u32 local, struct net_device *dev, int force, + nfxid_t nfxid); extern int fib_sync_up(struct net_device *dev); extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, struct kern_rta *rta, struct rtentry *r); extern u32 __fib_res_prefsrc(struct fib_result *res); /* Exported by fib_hash.c */ -extern struct fib_table *fib_hash_init(int id); +extern struct fib_table *fib_hash_init(int id, nfxid_t nfxid); +extern void fib_table_free(struct fib_table *tb); #ifdef CONFIG_IP_MULTIPLE_TABLES /* Exported by fib_rules.c */ diff -NurpP --minimal a/include/net/ip_fib_ngnet.h b/include/net/ip_fib_ngnet.h --- a/include/net/ip_fib_ngnet.h 1970-01-01 01:00:00.000000000 +0100 +++ b/include/net/ip_fib_ngnet.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,271 @@ +#error bad bad +/* + * INET An implementation of the TCP/IP protocol suite for the LINUX + * operating system. INET is implemented using the BSD Socket + * interface as the means of communication with the user level. + * + * Definitions for the Forwarding Information Base. + * + * Authors: A.N.Kuznetsov, + * + * This program is free software; you can redistribute it and/or + * modify it under the terms of the GNU General Public License + * as published by the Free Software Foundation; either version + * 2 of the License, or (at your option) any later version. 
+ */ + +#ifndef _NET_IP_FIB_H +#define _NET_IP_FIB_H + +#include +#include +#include + +/* WARNING: The ordering of these elements must match ordering + * of RTA_* rtnetlink attribute numbers. + */ +struct kern_rta { + void *rta_dst; + void *rta_src; + int *rta_iif; + int *rta_oif; + void *rta_gw; + u32 *rta_priority; + void *rta_prefsrc; + struct rtattr *rta_mx; + struct rtattr *rta_mp; + unsigned char *rta_protoinfo; + u32 *rta_flow; + struct rta_cacheinfo *rta_ci; + struct rta_session *rta_sess; +}; + +struct fib_info; + +struct fib_nh { + struct net_device *nh_dev; + struct hlist_node nh_hash; + struct fib_info *nh_parent; + unsigned nh_flags; + unsigned char nh_scope; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int nh_weight; + int nh_power; +#endif +#ifdef CONFIG_NET_CLS_ROUTE + __u32 nh_tclassid; +#endif + int nh_oif; + u32 nh_gw; +}; + +/* + * This structure contains data shared by many of routes. + */ + +struct fib_info { + struct hlist_node fib_hash; + struct hlist_node fib_lhash; + int fib_treeref; + atomic_t fib_clntref; + int fib_dead; + unsigned fib_flags; + int fib_protocol; + u32 fib_prefsrc; + u32 fib_priority; + u32 fib_metrics[RTAX_MAX]; +#define fib_mtu fib_metrics[RTAX_MTU-1] +#define fib_window fib_metrics[RTAX_WINDOW-1] +#define fib_rtt fib_metrics[RTAX_RTT-1] +#define fib_advmss fib_metrics[RTAX_ADVMSS-1] + int fib_nhs; +#ifdef CONFIG_IP_ROUTE_MULTIPATH + int fib_power; +#endif + struct fib_nh fib_nh[0]; +#define fib_dev fib_nh[0].nh_dev +}; + + +#ifdef CONFIG_IP_MULTIPLE_TABLES +struct fib_rule; +#endif + +struct fib_result { + unsigned char prefixlen; + unsigned char nh_sel; + unsigned char type; + unsigned char scope; + struct fib_info *fi; +#ifdef CONFIG_IP_MULTIPLE_TABLES + struct fib_rule *r; +#endif +}; + + +#ifdef CONFIG_IP_ROUTE_MULTIPATH + +#define FIB_RES_NH(res) ((res).fi->fib_nh[(res).nh_sel]) +#define FIB_RES_RESET(res) ((res).nh_sel = 0) + +#else /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define FIB_RES_NH(res) ((res).fi->fib_nh[0]) +#define 
FIB_RES_RESET(res) + +#endif /* CONFIG_IP_ROUTE_MULTIPATH */ + +#define FIB_RES_PREFSRC(res) ((res).fi->fib_prefsrc ? : __fib_res_prefsrc(&res)) +#define FIB_RES_GW(res) (FIB_RES_NH(res).nh_gw) +#define FIB_RES_DEV(res) (FIB_RES_NH(res).nh_dev) +#define FIB_RES_OIF(res) (FIB_RES_NH(res).nh_oif) + +struct fib_table { + unsigned char tb_id; + unsigned tb_stamp; + int (*tb_lookup)(struct fib_table *tb, const struct flowi *flp, struct fib_result *res); + int (*tb_insert)(struct fib_table *table, struct rtmsg *r, + struct kern_rta *rta, struct nlmsghdr *n, + struct netlink_skb_parms *req); + int (*tb_delete)(struct fib_table *table, struct rtmsg *r, + struct kern_rta *rta, struct nlmsghdr *n, + struct netlink_skb_parms *req); + int (*tb_dump)(struct fib_table *table, struct sk_buff *skb, + struct netlink_callback *cb); + int (*tb_flush)(struct fib_table *table); + void (*tb_select_default)(struct fib_table *table, + const struct flowi *flp, struct fib_result *res); + + unsigned char tb_data[0]; +}; + +#ifndef CONFIG_IP_MULTIPLE_TABLES + +extern struct fib_table *ip_fib_local_table; +extern struct fib_table *ip_fib_main_table; + +static inline struct fib_table *fib_get_table(int id) +{ + if (id != RT_TABLE_LOCAL) + return ip_fib_main_table; + return ip_fib_local_table; +} + +static inline struct fib_table *fib_new_table(int id) +{ + return fib_get_table(id); +} + +static inline int fib_lookup(const struct flowi *flp, struct fib_result *res) +{ + if (ip_fib_local_table->tb_lookup(ip_fib_local_table, flp, res) && + ip_fib_main_table->tb_lookup(ip_fib_main_table, flp, res)) + return -ENETUNREACH; + return 0; +} + +static inline void fib_select_default(const struct flowi *flp, struct fib_result *res) +{ + if (FIB_RES_GW(*res) && FIB_RES_NH(*res).nh_scope == RT_SCOPE_LINK) + ip_fib_main_table->tb_select_default(ip_fib_main_table, flp, res); +} + +#else /* CONFIG_IP_MULTIPLE_TABLES */ +#define ip_fib_local_table (fib_tables[RT_TABLE_LOCAL]) +#define ip_fib_main_table 
(fib_tables[RT_TABLE_MAIN]) + +extern struct fib_table * fib_tables[RT_TABLE_MAX+1]; +extern int fib_lookup(const struct flowi *flp, struct fib_result *res); +extern struct fib_table *__fib_new_table(int id); +extern void fib_rule_put(struct fib_rule *r); + +static inline struct fib_table *fib_get_table(int id) +{ + if (id == 0) + id = RT_TABLE_MAIN; + + return fib_tables[id]; +} + +static inline struct fib_table *fib_new_table(int id) +{ + if (id == 0) + id = RT_TABLE_MAIN; + + return fib_tables[id] ? : __fib_new_table(id); +} + +extern void fib_select_default(const struct flowi *flp, struct fib_result *res); + +#endif /* CONFIG_IP_MULTIPLE_TABLES */ + +/* Exported by fib_frontend.c */ +extern void ip_fib_init(void); +extern void fib_flush(void); +extern int inet_rtm_delroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_rtm_newroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_rtm_getroute(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_dump_fib(struct sk_buff *skb, struct netlink_callback *cb); +extern int fib_validate_source(u32 src, u32 dst, u8 tos, int oif, + struct net_device *dev, u32 *spec_dst, u32 *itag); +extern void fib_select_multipath(const struct flowi *flp, struct fib_result *res); + +/* Exported by fib_semantics.c */ +extern int ip_fib_check_default(u32 gw, struct net_device *dev); +extern int fib_sync_down(u32 local, struct net_device *dev, int force); +extern int fib_sync_up(struct net_device *dev); +extern int fib_convert_rtentry(int cmd, struct nlmsghdr *nl, struct rtmsg *rtm, + struct kern_rta *rta, struct rtentry *r); +extern u32 __fib_res_prefsrc(struct fib_result *res); + +/* Exported by fib_hash.c */ +extern struct fib_table *fib_hash_init(int id); + +#ifdef CONFIG_IP_MULTIPLE_TABLES +/* Exported by fib_rules.c */ + +extern int inet_rtm_delrule(struct sk_buff *skb, struct nlmsghdr* nlh, void *arg); +extern int inet_rtm_newrule(struct sk_buff *skb, struct nlmsghdr* 
nlh, void *arg); +extern int inet_dump_rules(struct sk_buff *skb, struct netlink_callback *cb); +extern u32 fib_rules_map_destination(u32 daddr, struct fib_result *res); +#ifdef CONFIG_NET_CLS_ROUTE +extern u32 fib_rules_tclass(struct fib_result *res); +#endif +extern void fib_rules_init(void); +#endif + +static inline void fib_combine_itag(u32 *itag, struct fib_result *res) +{ +#ifdef CONFIG_NET_CLS_ROUTE +#ifdef CONFIG_IP_MULTIPLE_TABLES + u32 rtag; +#endif + *itag = FIB_RES_NH(*res).nh_tclassid<<16; +#ifdef CONFIG_IP_MULTIPLE_TABLES + rtag = fib_rules_tclass(res); + if (*itag == 0) + *itag = (rtag<<16); + *itag |= (rtag>>16); +#endif +#endif +} + +extern void free_fib_info(struct fib_info *fi); + +static inline void fib_info_put(struct fib_info *fi) +{ + if (atomic_dec_and_test(&fi->fib_clntref)) + free_fib_info(fi); +} + +static inline void fib_res_put(struct fib_result *res) +{ + if (res->fi) + fib_info_put(res->fi); +#ifdef CONFIG_IP_MULTIPLE_TABLES + if (res->r) + fib_rule_put(res->r); +#endif +} + +#endif /* _NET_FIB_H */ diff -NurpP --minimal a/include/net/neighbour.h b/include/net/neighbour.h --- a/include/net/neighbour.h 2004-12-24 22:35:39.000000000 +0100 +++ b/include/net/neighbour.h 2005-02-25 17:18:50.000000000 +0100 @@ -56,6 +56,7 @@ #include #include +#include #define NUD_IN_TIMER (NUD_INCOMPLETE|NUD_REACHABLE|NUD_DELAY|NUD_PROBE) #define NUD_VALID (NUD_PERMANENT|NUD_NOARP|NUD_REACHABLE|NUD_PROBE|NUD_STALE|NUD_DELAY) @@ -342,6 +343,9 @@ __neigh_lookup(struct neigh_table *tbl, { struct neighbour *n = neigh_lookup(tbl, pkey, dev); + vxdprintk(VXD_CBIT(ngnet, 2), + "__neigh_lookup(%p,%p[%s,#%u],%d) : %p", + tbl, dev, dev->name, dev->nfxid, creat, n); if (n || !creat) return n; diff -NurpP --minimal a/include/net/raw.h b/include/net/raw.h --- a/include/net/raw.h 2004-12-24 22:35:28.000000000 +0100 +++ b/include/net/raw.h 2005-02-25 17:18:50.000000000 +0100 @@ -35,7 +35,7 @@ extern rwlock_t raw_v4_lock; extern struct sock *__raw_v4_lookup(struct sock 
*sk, unsigned short num, unsigned long raddr, unsigned long laddr, - int dif); + int dif, nfxid_t nfxid); extern void raw_v4_input(struct sk_buff *skb, struct iphdr *iph, int hash); diff -NurpP --minimal a/include/net/route.h b/include/net/route.h --- a/include/net/route.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/net/route.h 2005-02-25 17:18:50.000000000 +0100 @@ -33,8 +33,7 @@ #include #include #include -#include -#include +#include #ifndef __KERNEL__ #warning This file is not supposed to be used outside of kernel. @@ -124,7 +123,7 @@ extern int ip_route_input(struct sk_buf extern unsigned short ip_rt_frag_needed(struct iphdr *iph, unsigned short new_mtu); extern void ip_rt_send_redirect(struct sk_buff *skb); -extern unsigned inet_addr_type(u32 addr); +extern unsigned inet_addr_type(u32 addr, nfxid_t nfxid); extern void ip_rt_multicast_event(struct in_device *); extern int ip_rt_ioctl(unsigned int cmd, void __user *arg); extern void ip_rt_get_source(u8 *src, struct rtable *rt); @@ -145,59 +144,6 @@ static inline char rt_tos2priority(u8 to return ip_tos2prio[IPTOS_TOS(tos)>>1]; } -#define IPI_LOOPBACK 0x0100007f - -static inline int ip_find_src(struct nx_info *nxi, struct rtable **rp, struct flowi *fl) -{ - int err; - int i, n = nxi->nbipv4; - u32 ipv4root = nxi->ipv4[0]; - - if (ipv4root == 0) - return 0; - - if (fl->fl4_src == 0) { - if (n > 1) { - u32 foundsrc; - - err = __ip_route_output_key(rp, fl); - if (err) { - fl->fl4_src = ipv4root; - err = __ip_route_output_key(rp, fl); - } - if (err) - return err; - - foundsrc = (*rp)->rt_src; - ip_rt_put(*rp); - - for (i=0; imask[i]; - u32 ipv4 = nxi->ipv4[i]; - u32 net4 = ipv4 & mask; - - if (foundsrc == ipv4) { - fl->fl4_src = ipv4; - break; - } - if (!fl->fl4_src && (foundsrc & mask) == net4) - fl->fl4_src = ipv4; - } - } - if (fl->fl4_src == 0) - fl->fl4_src = (fl->fl4_dst == IPI_LOOPBACK) - ? 
IPI_LOOPBACK : ipv4root; - } else { - for (i=0; iipv4[i] == fl->fl4_src) - break; - } - if (i == n) - return -EPERM; - } - return 0; -} - static inline int ip_route_connect(struct rtable **rp, u32 dst, u32 src, u32 tos, int oif, u8 protocol, u16 sport, u16 dport, struct sock *sk) @@ -212,23 +158,15 @@ static inline int ip_route_connect(struc .dport = dport } } }; int err; - struct nx_info *nx_info = current->nx_info; - if (sk) - nx_info = sk->sk_nx_info; - vxdprintk(VXD_CBIT(net, 4), - "ip_route_connect(%p) %p,%p;%lx", - sk, nx_info, sk->sk_socket, - (sk->sk_socket?sk->sk_socket->flags:0)); + fl.nfxid = vx_current_xid(); /* FIXME maybe socket? */ + fl.nfxid = sk->sk_xid; - if (nx_info) { - err = ip_find_src(nx_info, rp, &fl); - if (err) - return err; - if (fl.fl4_dst == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) - fl.fl4_dst = nx_info->ipv4[0]; - } - if (!fl.fl4_dst || !fl.fl4_src) { + vxdprintk(VXD_CBIT(ngnet, 1), + "ip_route_connect(%p[#%u]): #%u", + sk, sk->sk_xid, fl.nfxid); + + if (!dst || !src) { err = __ip_route_output_key(rp, &fl); if (err) return err; @@ -248,6 +186,11 @@ static inline int ip_route_newports(stru struct flowi fl; memcpy(&fl, &(*rp)->fl, sizeof(fl)); + + vxdprintk(VXD_CBIT(ngnet, 1), + "ip_route_newports(%p[#%u]): #%u", + sk, sk->sk_xid, fl.nfxid); + fl.fl_ip_sport = sport; fl.fl_ip_dport = dport; ip_rt_put(*rp); diff -NurpP --minimal a/include/net/sock.h b/include/net/sock.h --- a/include/net/sock.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/net/sock.h 2005-02-25 17:18:50.000000000 +0100 @@ -112,8 +112,6 @@ struct sock_common { atomic_t skc_refcnt; xid_t skc_xid; struct vx_info *skc_vx_info; - nid_t skc_nid; - struct nx_info *skc_nx_info; }; /** @@ -197,8 +195,6 @@ struct sock { #define sk_refcnt __sk_common.skc_refcnt #define sk_xid __sk_common.skc_xid #define sk_vx_info __sk_common.skc_vx_info -#define sk_nid __sk_common.skc_nid -#define sk_nx_info __sk_common.skc_nx_info volatile unsigned char sk_zapped; unsigned char sk_shutdown; 
unsigned char sk_use_write_queue; diff -NurpP --minimal a/include/net/tcp.h b/include/net/tcp.h --- a/include/net/tcp.h 2005-02-25 17:05:41.000000000 +0100 +++ b/include/net/tcp.h 2005-02-25 17:18:50.000000000 +0100 @@ -194,8 +194,6 @@ struct tcp_tw_bucket { #define tw_refcnt __tw_common.skc_refcnt #define tw_xid __tw_common.skc_xid #define tw_vx_info __tw_common.skc_vx_info -#define tw_nid __tw_common.skc_nid -#define tw_nx_info __tw_common.skc_nx_info volatile unsigned char tw_substate; unsigned char tw_rcv_wscale; __u16 tw_sport; diff -NurpP --minimal a/kernel/vserver/context.c b/kernel/vserver/context.c --- a/kernel/vserver/context.c 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/context.c 2005-02-25 17:18:50.000000000 +0100 @@ -23,6 +23,7 @@ #include #include #include +#include #include #include @@ -38,6 +39,7 @@ #include "cvirt_init.h" #include "limit_init.h" #include "sched_init.h" +#include "ngnet_init.h" /* __alloc_vx_info() @@ -72,7 +74,7 @@ static struct vx_info *__alloc_vx_info(x vx_info_init_sched(&new->sched); vx_info_init_cvirt(&new->cvirt); vx_info_init_cacct(&new->cacct); - + vx_info_init_ngnet(&new->ngnet, xid); new->vx_flags = VXF_STATE_SETUP|VXF_STATE_INIT; new->vx_bcaps = CAP_INIT_EFF_SET; @@ -101,6 +103,7 @@ static void __dealloc_vx_info(struct vx_ vx_info_exit_sched(&vxi->sched); vx_info_exit_cvirt(&vxi->cvirt); vx_info_exit_cacct(&vxi->cacct); + vx_info_exit_ngnet(&vxi->ngnet, 0); vxi->vx_state |= VXS_RELEASED; kfree(vxi); @@ -113,6 +116,10 @@ void __shutdown_vx_info(struct vx_info * might_sleep(); + vs_context_state(vxi, VS_CONTEXT_DESTROY); + + vx_zap_vnet(vxi, 0); + namespace = xchg(&vxi->vx_namespace, NULL); if (namespace) put_namespace(namespace); @@ -304,6 +311,7 @@ static struct vx_info * __loc_vx_info(in "loc_vx_info(%d) = %p (new)", id, new); __hash_vx_info(get_vx_info(new)); vxi = new, new = NULL; + vxi->vx_state |= VXS_USED; *err = 1; out_unlock: @@ -337,6 +345,8 @@ struct vx_info *locate_vx_info(int id) { struct vx_info 
*vxi = NULL; + /* FIXME optimization? */ + // if ((id < 0) || (id == current->xid)) { if (id < 0) { vxi = get_vx_info(current->vx_info); } else if (id > 1) { @@ -530,6 +540,24 @@ int vx_set_init(struct vx_info *vxi, str } +/* posing as xid ...: swap current's vx_info/xid for vxi's, saving the old pair in *vxis */ + +int enter_vx_info(struct vx_info *vxi, struct vx_info_save *vxis) +{ + vxis->vxi = xchg(&current->vx_info, vxi); + vxis->xid = current->xid; + current->xid = vxi->vx_id; + return 0; +} + +int leave_vx_info(struct vx_info_save *vxis) +{ + xchg(&current->vx_info, vxis->vxi); /* put back the vx_info saved by enter_vx_info() */ + current->xid = vxis->xid; + return 0; +} + + /* vserver syscall commands below here */ /* taks xid and vx_info functions */ @@ -605,6 +633,7 @@ int vc_ctx_create(uint32_t xid, void __u ret = -EEXIST; goto out_put; } + vs_context_state(new_vxi, VS_CONTEXT_CREATED); ret = new_vxi->vx_id; vx_migrate_task(current, new_vxi); diff -NurpP --minimal a/kernel/vserver/helper.c b/kernel/vserver/helper.c --- a/kernel/vserver/helper.c 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/helper.c 2005-02-25 17:18:50.000000000 +0100 @@ -89,15 +89,15 @@ long vs_reboot(unsigned int cmd, void * return 0; } -long vs_context_state(unsigned int cmd) +long vs_context_state(struct vx_info *vxi, unsigned int cmd) { char id_buf[8], cmd_buf[32]; - char *argv[] = {vshelper_path, NULL, id_buf, NULL, 0}; char *envp[] = {"HOME=/", "TERM=linux", "PATH=/sbin:/usr/sbin:/bin:/usr/bin", cmd_buf, 0}; + int ret; - snprintf(id_buf, sizeof(id_buf)-1, "%d", vx_current_xid()); + snprintf(id_buf, sizeof(id_buf)-1, "%d", vxi->vx_id); snprintf(cmd_buf, sizeof(cmd_buf)-1, "VS_CMD=%08x", cmd); switch (cmd) { @@ -111,10 +111,10 @@ long vs_context_state(unsigned int cmd) return 0; } - if (call_usermodehelper(*argv, argv, envp, 1)) { + if ((ret = call_usermodehelper(vshelper_path, argv, envp, 1))) { printk( KERN_WARNING - "vs_context_state(): failed to exec (%s %s %s %s)\n", - vshelper_path, argv[1], argv[2], argv[3]); + "vs_context_state(): (%s %s %s %s) returned with %d\n", + vshelper_path, argv[1], 
argv[2], argv[3], ret); return 0; } return 0; diff -NurpP --minimal a/kernel/vserver/Kconfig b/kernel/vserver/Kconfig --- a/kernel/vserver/Kconfig 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/Kconfig 2005-02-25 17:18:50.000000000 +0100 @@ -86,6 +86,15 @@ config INOXID_RUNTIME endchoice +config VSERVER_NGNET + bool "Enable Next Gen Networking" + depends on EXPERIMENTAL + select VNET + default n + help + This enables iptable marking based next generation + networking. Be prepared to change your vserver setup! + config VSERVER_DEBUG bool "Compile Debugging Code" default n diff -NurpP --minimal a/kernel/vserver/Makefile b/kernel/vserver/Makefile --- a/kernel/vserver/Makefile 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/Makefile 2005-02-25 17:18:50.000000000 +0100 @@ -12,3 +12,5 @@ vserver-$(CONFIG_VSERVER_DEBUG) += sysct vserver-$(CONFIG_VSERVER_LEGACY) += legacy.o vserver-$(CONFIG_VSERVER_HISTORY) += history.o +vserver-$(CONFIG_VSERVER_NGNET) += ngnet.o + diff -NurpP --minimal a/kernel/vserver/network.c b/kernel/vserver/network.c --- a/kernel/vserver/network.c 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/network.c 2005-02-25 17:18:50.000000000 +0100 @@ -14,7 +14,10 @@ #include #include +#include #include +#include +#include #include #include @@ -422,6 +425,7 @@ int dev_in_nx_info(struct net_device *de return 0; } +#if 0 /* * check if address is covered by socket * @@ -474,6 +478,7 @@ int nx_addr_conflict(struct nx_info *nxi } } +#endif /* vserver syscall commands below here */ diff -NurpP --minimal a/kernel/vserver/ngnet.c b/kernel/vserver/ngnet.c --- a/kernel/vserver/ngnet.c 1970-01-01 01:00:00.000000000 +0100 +++ b/kernel/vserver/ngnet.c 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,289 @@ +/* + * linux/kernel/vserver/limit.c + * + * Virtual Server: Context Limits + * + * Copyright (C) 2004 Herbert Pötzl + * + * V0.01 broken out from vcontext V0.05 + * + */ + +#include +#include +#include +#include +#include +#include + +#include 
+#include +#include +#include +#include +#include +#include + +#include +#include + + + +#if 0 + +struct net_device *vx_dev_base(struct vx_info *vxi) +{ + return vxi->ngnet.dev_base; +} + + +struct hlist_head *vx_dev_name_hash(struct vx_info *vxi, unsigned hash) +{ + return &vxi->ngnet.dev_hash[hash & + ((1 << NG_HASHBITS) - 1)].name; +} + +/* +struct hlist_head *vx_dev_index_hash(struct vx_info *vxi, int ifindex) +{ + return &vxi->ngnet.dev_hash[ifindex & + ((1 << NG_HASHBITS) - 1)].index; +} + +*/ + +#endif + + + +struct net_device *vx_vnet_create(struct vx_info *vxi, const char *name) +{ + struct vx_info_save vxis; + struct net_device *vndev; + + enter_vx_info(vxi, &vxis); + vndev = vnet_create_dev(name, vxi->vx_id); + leave_vx_info(&vxis); + return vndev; +} + +void vx_vnet_destroy(struct vx_info *vxi, struct net_device *vndev) +{ + struct vx_info_save vxis; + + enter_vx_info(vxi, &vxis); + vnet_destroy_dev(vndev); + leave_vx_info(&vxis); +} + + +int vx_zap_vnet(struct vx_info *vxi, int vnet) +{ + struct vnet *vn, *n; + int cnt = 0; + + list_for_each_entry_safe(vn, n, &vnets, vnet_list) { + struct net_device *vndev = vn->vndev; + + if (vn->nfxid != vxi->vx_id) + continue; + if ((vnet > 0) && (vn->vnet != vnet)) + continue; + + /* private loopback device ? */ + if (vndev == vxi->ngnet.loopback) + vxi->ngnet.loopback = &loopback_dev; + + vx_vnet_destroy(vxi, vn->vndev); + cnt++; + } + return cnt; +} + + + /* must not get a reference to vxi */ + +void __vx_cleanup_vnet(struct vx_info *vxi) +{ + struct vx_info_save vxis; + struct vnet *vn, *n; + + enter_vx_info(vxi, &vxis); + list_for_each_entry_safe(vn, n, &vnets, vnet_list) { + struct net_device *vndev = vn->vndev; + + if (vn->nfxid != vxi->vx_id) + continue; + /* private loopback device ? 
*/ + if (vndev == vxi->ngnet.loopback) + vxi->ngnet.loopback = &loopback_dev; + + vnet_destroy_dev(vndev); + } + leave_vx_info(&vxis); +} + + +int vc_add_vndev(uint32_t id, void __user *data) +{ + struct net_device *dev, *vndev; + struct vcmd_add_vndev_v0 vc_data; + struct vx_info *vxi; + int err = -ENODEV; + + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = locate_vx_info(id); + if (!vxi) + return -ESRCH; + + dev = dev_get_by_name(vc_data.real); + if (!dev) + goto out_put; + + err = -EINVAL; + vndev = vx_vnet_create(vxi, vc_data.name); + if (!vndev) + goto out_dev_put; + + /* attach vnet device to real device */ + err = vnet_attach_dev(vndev, dev); + if (err) + goto cleanup; + + /* configure vnet device properties */ + err = vnet_config(vndev, vc_data.vnet, vxi->vx_id); + if (err) + goto cleanup; + + /* private loopback device ? */ + if (vndev->flags & IFF_LOOPBACK) + vxi->ngnet.loopback = vndev; + + /* everything is fine, keep ref to dev */ + goto out_put; + +cleanup: + vx_vnet_destroy(vxi, vndev); +out_dev_put: + dev_put(dev); +out_put: + put_vx_info(vxi); + return err; +} + +int vc_zap_vnet(uint32_t id, void __user *data) +{ + struct vcmd_zap_vnet_v0 vc_data; + struct vx_info *vxi; + int err; + + + if (copy_from_user (&vc_data, data, sizeof(vc_data))) + return -EFAULT; + + vxi = locate_vx_info(id); + if (!vxi) + return -ESRCH; + + err = -ENODEV; + if (vx_zap_vnet(vxi, 0)) + err = 0; + + put_vx_info(vxi); + return err; +} + + + /* net/core/dev code */ + +#include + +unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct sk_buff *), + int hook_thresh); + +/* FIXME should come from headers */ +#define PF_VNET PF_LOCAL + +int netif_receive_skb_ngnet(struct sk_buff *skb); + +atomic_t netstamp_needed; + +/* FIXME should come from headers, stamp always */ +static inline void 
net_timestamp(struct timeval *stamp) +{ + do_gettimeofday(stamp); +} + +/* FIXME should come from headers */ +static __inline__ void skb_bond(struct sk_buff *skb) +{ + struct net_device *dev = skb->dev; + + if (dev->master) { + skb->real_dev = skb->dev; + skb->dev = dev->master; + } +} + + +int netif_receive_skb(struct sk_buff *skb) +{ + struct list_head *elem; + unsigned int verdict; + unsigned int hook = NF_IP_PRE_ROUTING; + int ret = NET_RX_DROP; + + vxdprintk(VXD_CBIT(ngnet, 0), + "netif_receive_skb(%p[#%u])", skb, skb->nfxid); + +#ifdef CONFIG_NETPOLL + if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) { + kfree_skb(skb); + return NET_RX_DROP; + } +#endif + + if (!skb->stamp.tv_sec) + net_timestamp(&skb->stamp); + + skb_bond(skb); + + __get_cpu_var(netdev_rx_stat).total++; + + skb->h.raw = skb->nh.raw = skb->data; + skb->mac_len = skb->nh.raw - skb->mac.raw; + + rcu_read_lock(); + + elem = &nf_hooks[PF_VNET][hook]; + verdict = nf_iterate(&nf_hooks[PF_VNET][hook], + &skb, hook, skb->dev, NULL, &elem, + netif_receive_skb_ngnet, INT_MIN); + + vxdprintk(VXD_CBIT(ngnet, 0), + "netif_receive_skb(%p[#%u]) = %u", skb, skb->nfxid, verdict); + switch (verdict) { + case NF_ACCEPT: + ret = netif_receive_skb_ngnet(skb); + break; + + case NF_DROP: + kfree_skb(skb); + ret = -EPERM; + break; + } + + rcu_read_unlock(); + + return ret; +} + diff -NurpP --minimal a/kernel/vserver/ngnet_init.h b/kernel/vserver/ngnet_init.h --- a/kernel/vserver/ngnet_init.h 1970-01-01 01:00:00.000000000 +0100 +++ b/kernel/vserver/ngnet_init.h 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,51 @@ + +#include + +struct fib_table *fib_hash_init(int id, nfxid_t nfxid); + +static inline void vx_info_init_ngnet(struct _vx_ngnet *ngnet, xid_t xid) +{ +/* + int index; + + ngnet->dev_base = NULL; + ngnet->dev_tail = &ngnet->dev_base; + ngnet->ifindex = 0; + + for (index=0; index<(1 << NG_HASHBITS); index++) { + INIT_HLIST_HEAD(&ngnet->dev_hash[index].name); + // 
INIT_HLIST_HEAD(&ngnet->dev_hash[index].index); + } + +*/ + ngnet->loopback = &loopback_dev; + ngnet->fib_main = fib_hash_init(RT_TABLE_MAIN, xid); + ngnet->fib_local = fib_hash_init(RT_TABLE_LOCAL, xid); + +} + +extern void vx_vnet_destroy(struct vx_info *, struct net_device *); +extern xid_t vnet_get_xid(struct net_device *); + +static inline void vx_info_exit_ngnet(struct _vx_ngnet *ngnet, xid_t xid) +{ +/* + int index; + + for (index=0; index<(1 << NG_HASHBITS); index++) { + struct hlist_head *head = &ngnet->dev_hash[index].name; + struct hlist_node *node, *n; + struct net_device *vndev; + + hlist_for_each_entry_safe(vndev, node, n, head, name_hlist) { + printk("unreaped device %p:%s [#%d]\n", + vndev, vndev->name, (vxi)?vxi->vx_id:0); + vx_vnet_destroy(vxi, vndev); + } + } + + fib_table_free(ngnet->fib_main); + fib_table_free(ngnet->fib_local); +*/ +} + diff -NurpP --minimal a/kernel/vserver/switch.c b/kernel/vserver/switch.c --- a/kernel/vserver/switch.c 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/switch.c 2005-02-25 17:18:50.000000000 +0100 @@ -35,6 +35,7 @@ vc_get_version(uint32_t id) #include #include #include +#include #include #include @@ -76,6 +77,7 @@ sys_vserver(uint32_t cmd, uint32_t id, v case VCMD_new_s_context: return vc_new_s_context(id, data); case VCMD_set_ipv4root: + return 0; return vc_set_ipv4root(id, data); #endif @@ -200,6 +202,13 @@ sys_vserver(uint32_t cmd, uint32_t id, v case VCMD_net_migrate: return vc_net_migrate(id, data); +#ifdef CONFIG_VSERVER_NGNET + case VCMD_add_vndev: + return vc_add_vndev(id, data); + case VCMD_zap_vnet: + return vc_zap_vnet(id, data); +#endif + } return -ENOSYS; } diff -NurpP --minimal a/kernel/vserver/sysctl.c b/kernel/vserver/sysctl.c --- a/kernel/vserver/sysctl.c 2005-02-25 17:05:42.000000000 +0100 +++ b/kernel/vserver/sysctl.c 2005-02-25 17:18:50.000000000 +0100 @@ -15,6 +15,7 @@ #include #include #include +#include #include #include @@ -31,6 +32,7 @@ enum { CTL_DEBUG_LIMIT, CTL_DEBUG_DLIM, 
CTL_DEBUG_CVIRT, + CTL_DEBUG_NGNET, }; @@ -41,6 +43,7 @@ unsigned int vx_debug_net = 0; unsigned int vx_debug_limit = 0; unsigned int vx_debug_dlim = 0; unsigned int vx_debug_cvirt = 0; +unsigned int vx_debug_ngnet = 0; static struct ctl_table_header *vserver_table_header; @@ -180,6 +183,14 @@ static ctl_table debug_table[] = { .mode = 0644, .proc_handler = &proc_dodebug }, + { + .ctl_name = CTL_DEBUG_NGNET, + .procname = "debug_ngnet", + .data = &vx_debug_ngnet, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = &proc_dodebug + }, { .ctl_name = 0 } }; @@ -193,6 +204,65 @@ static ctl_table vserver_table[] = { { .ctl_name = 0 } }; +enum { + Opt_err = 0, + Opt_switch, Opt_xid, Opt_nid, Opt_net, Opt_limit, + Opt_dlim, Opt_cvirt, Opt_ngnet +}; + +static match_table_t tokens = { + { Opt_switch, "switch=%x" }, + { Opt_xid, "xid=%x" }, + { Opt_nid, "nid=%x" }, + { Opt_net, "net=%x" }, + { Opt_limit, "limit=%x" }, + { Opt_dlim, "dlim=%x" }, + { Opt_cvirt, "cvirt=%x" }, + { Opt_ngnet, "ngnet=%x" }, + { Opt_err, NULL } +}; + +#define HANDLE_CASE(x,v) \ + case Opt_ ## x: \ + vx_debug_ ## x = v; \ + printk("vs_debug_" #x "=%x\n", v); \ + break + +static int __init vs_debug_setup(char *str) +{ + char *p; + int token; + + printk("vs_debug_setup(%s)\n", str); + while ((p = strsep(&str, ",")) != NULL) { + substring_t args[MAX_OPT_ARGS]; + unsigned int value; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + value = (token>0)?simple_strtoul(args[0].from, NULL, 16):0; + + switch (token) { + HANDLE_CASE(switch, value); + HANDLE_CASE(xid, value); + HANDLE_CASE(nid, value); + HANDLE_CASE(net, value); + HANDLE_CASE(limit, value); + HANDLE_CASE(dlim, value); + HANDLE_CASE(cvirt, value); + HANDLE_CASE(ngnet, value); + default: + return -EINVAL; + break; + } + } + return 1; +} + +__setup("vsdebug=", vs_debug_setup); + EXPORT_SYMBOL_GPL(vx_debug_switch); EXPORT_SYMBOL_GPL(vx_debug_xid); @@ -201,4 +271,5 @@ EXPORT_SYMBOL_GPL(vx_debug_net); 
EXPORT_SYMBOL_GPL(vx_debug_limit); EXPORT_SYMBOL_GPL(vx_debug_dlim); EXPORT_SYMBOL_GPL(vx_debug_cvirt); +EXPORT_SYMBOL_GPL(vx_debug_ngnet); diff -NurpP --minimal a/Makefile b/Makefile --- a/Makefile 2005-02-25 17:04:30.000000000 +0100 +++ b/Makefile 2005-02-25 17:18:50.000000000 +0100 @@ -1,7 +1,7 @@ VERSION = 2 PATCHLEVEL = 6 SUBLEVEL = 11 -EXTRAVERSION =-rc5 +EXTRAVERSION =-rc5-vs1.9.4.8-ng9.0 NAME=Woozy Numbat # *DOCUMENTATION* diff -NurpP --minimal a/net/atm/clip.c b/net/atm/clip.c --- a/net/atm/clip.c 2004-12-24 22:35:39.000000000 +0100 +++ b/net/atm/clip.c 2005-02-25 17:18:50.000000000 +0100 @@ -303,7 +303,7 @@ static int clip_constructor(struct neigh struct neigh_parms *parms; DPRINTK("clip_constructor (neigh %p, entry %p)\n",neigh,entry); - neigh->type = inet_addr_type(entry->ip); + neigh->type = inet_addr_type(entry->ip, dev->nfxid); if (neigh->type != RTN_UNICAST) return -EINVAL; rcu_read_lock(); diff -NurpP --minimal a/net/core/dev.c b/net/core/dev.c --- a/net/core/dev.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/core/dev.c 2005-02-25 17:18:50.000000000 +0100 @@ -113,8 +113,8 @@ #include /* Note : will define WIRELESS_EXT */ #include #endif /* CONFIG_NET_RADIO */ -#include #include +#include /* This define, if set, will randomly drop a packet when congestion * is more than moderate. 
It helps fairness in the multi-interface @@ -489,10 +489,13 @@ __setup("netdev=", netdev_boot_setup); struct net_device *__dev_get_by_name(const char *name) { struct hlist_node *p; + nfxid_t nfxid = vx_current_xid(); hlist_for_each(p, dev_name_hash(name)) { struct net_device *dev = hlist_entry(p, struct net_device, name_hlist); + if (dev->nfxid != nfxid) + continue; if (!strncmp(dev->name, name, IFNAMSIZ)) return dev; } @@ -540,6 +543,7 @@ struct net_device *__dev_get_by_index(in hlist_for_each(p, dev_index_hash(ifindex)) { struct net_device *dev = hlist_entry(p, struct net_device, index_hlist); + /* ifindex is global, so no xid check */ if (dev->ifindex == ifindex) return dev; } @@ -583,25 +587,33 @@ struct net_device *dev_get_by_index(int * If the API was consistent this would be __dev_get_by_hwaddr */ -struct net_device *dev_getbyhwaddr(unsigned short type, char *ha) +struct net_device *dev_getbyhwaddr(unsigned short type, char *ha, + nfxid_t nfxid) { struct net_device *dev; ASSERT_RTNL(); - for (dev = dev_base; dev; dev = dev->next) + for (dev = dev_base; dev; dev = dev->next) { + if (dev->nfxid != nfxid) + continue; if (dev->type == type && !memcmp(dev->dev_addr, ha, dev->addr_len)) break; + } return dev; } struct net_device *dev_getfirstbyhwtype(unsigned short type) { struct net_device *dev; + /* should be an argument */ + nfxid_t nfxid = vx_current_xid(); rtnl_lock(); for (dev = dev_base; dev; dev = dev->next) { + if (dev->nfxid != nfxid) + continue; if (dev->type == type) { dev_hold(dev); break; @@ -627,9 +639,13 @@ EXPORT_SYMBOL(dev_getfirstbyhwtype); struct net_device * dev_get_by_flags(unsigned short if_flags, unsigned short mask) { struct net_device *dev; + /* should be an argument */ + nfxid_t nfxid = vx_current_xid(); read_lock(&dev_base_lock); for (dev = dev_base; dev != NULL; dev = dev->next) { + if (dev->nfxid != nfxid) + continue; if (((dev->flags ^ if_flags) & mask) == 0) { dev_hold(dev); break; @@ -674,6 +690,8 @@ int dev_alloc_name(struct 
net_device *de const int max_netdevices = 8*PAGE_SIZE; long *inuse; struct net_device *d; + /* called only from userspace so should be fine */ + nfxid_t nfxid = vx_current_xid(); p = strnchr(name, IFNAMSIZ-1, '%'); if (p) { @@ -691,6 +709,8 @@ int dev_alloc_name(struct net_device *de return -ENOMEM; for (d = dev_base; d; d = d->next) { + if (d->nfxid != nfxid) /* filter on the scanned device, not the one being named */ + continue; if (!sscanf(d->name, name, &i)) continue; if (i < 0 || i >= max_netdevices) @@ -964,6 +984,7 @@ int register_netdevice_notifier(struct n err = notifier_chain_register(&netdev_chain, nb); if (!err) { for (dev = dev_base; dev; dev = dev->next) { + /* no xid check, we want _all_ calls */ nb->notifier_call(nb, NETDEV_REGISTER, dev); if (dev->flags & IFF_UP) @@ -1037,6 +1058,10 @@ void dev_queue_xmit_nit(struct sk_buff * net_timestamp(&skb->stamp); rcu_read_lock(); + if (skb->nfxid != dev->nfxid) + printk("!!! dev_queue_xmit_nit(%p[#%u],%p[{%s},#%u])\n", + skb, skb->nfxid, dev, dev->name, dev->nfxid); + list_for_each_entry_rcu(ptype, &ptype_all, list) { /* Never send packets back to the socket * they originated from - MvS (miquels@drinkel.ow.org) @@ -1624,12 +1649,23 @@ static int ing_filter(struct sk_buff *sk } #endif +#ifdef CONFIG_VSERVER_NGNET +int netif_receive_skb_ngnet(struct sk_buff *skb) +#else int netif_receive_skb(struct sk_buff *skb) +#endif { struct packet_type *ptype, *pt_prev; int ret = NET_RX_DROP; unsigned short type; +#ifdef CONFIG_VSERVER_NGNET + vxdprintk(VXD_CBIT(ngnet, 0), + "netif_receive_skb_ngnet(%p[#%u])", skb, skb->nfxid); +#else + vxdprintk(VXD_CBIT(ngnet, 0), + "netif_receive_skb(%p[#%u])", skb, skb->nfxid); + #ifdef CONFIG_NETPOLL if (skb->dev->netpoll_rx && skb->dev->poll && netpoll_rx(skb)) { kfree_skb(skb); @@ -1647,9 +1683,9 @@ int netif_receive_skb(struct sk_buff *sk skb->h.raw = skb->nh.raw = skb->data; skb->mac_len = skb->nh.raw - skb->mac.raw; - pt_prev = NULL; - rcu_read_lock(); +#endif /* CONFIG_VSERVER_NGNET */ + pt_prev = NULL; #ifdef CONFIG_NET_CLS_ACT if
(skb->tc_verd & TC_NCLS) { @@ -1711,7 +1747,9 @@ ncls: } out: +#ifndef CONFIG_VSERVER_NGNET rcu_read_unlock(); +#endif return ret; } @@ -1878,6 +1916,7 @@ static int dev_ifconf(char __user *arg) int len; int total; int i; + nfxid_t nfxid = vx_current_xid(); /* * Fetch the caller's info block. @@ -1895,8 +1934,7 @@ static int dev_ifconf(char __user *arg) total = 0; for (dev = dev_base; dev; dev = dev->next) { - if (vx_flags(VXF_HIDE_NETIF, 0) && - !dev_in_nx_info(dev, current->nx_info)) + if (dev->nfxid != nfxid) continue; for (i = 0; i < NPROTO; i++) { if (gifconf_list[i]) { @@ -1958,9 +1996,7 @@ void dev_seq_stop(struct seq_file *seq, static void dev_seq_printf_stats(struct seq_file *seq, struct net_device *dev) { - struct nx_info *nxi = current->nx_info; - - if (vx_flags(VXF_HIDE_NETIF, 0) && !dev_in_nx_info(dev, nxi)) + if (dev->nfxid != vx_current_xid()) return; if (dev->get_stats) { struct net_device_stats *stats = dev->get_stats(dev); @@ -2715,6 +2751,7 @@ int register_netdevice(struct net_device struct hlist_head *head; struct hlist_node *p; int ret; + nfxid_t nfxid = dev->nfxid; BUG_ON(dev_boot_phase); ASSERT_RTNL(); @@ -2759,6 +2796,8 @@ int register_netdevice(struct net_device hlist_for_each(p, head) { struct net_device *d = hlist_entry(p, struct net_device, name_hlist); + if (d->nfxid != nfxid) + continue; if (!strncmp(d->name, dev->name, IFNAMSIZ)) { ret = -EEXIST; goto out_err; @@ -3098,6 +3137,7 @@ void synchronize_net(void) int unregister_netdevice(struct net_device *dev) { struct net_device *d, **dp; + // nfxid_t nfxid = dev->nfxid; BUG_ON(dev_boot_phase); ASSERT_RTNL(); diff -NurpP --minimal a/net/core/dst.c b/net/core/dst.c --- a/net/core/dst.c 2005-02-25 17:05:01.000000000 +0100 +++ b/net/core/dst.c 2005-02-25 17:18:50.000000000 +0100 @@ -233,6 +233,7 @@ static inline void dst_ifdown(struct dst dst->input = dst_discard_in; dst->output = dst_discard_out; } else { + /* FIXME hopefully loopback_dev is fine here */ dst->dev = &loopback_dev; 
dev_hold(&loopback_dev); dev_put(dev); diff -NurpP --minimal a/net/core/neighbour.c b/net/core/neighbour.c --- a/net/core/neighbour.c 2005-02-25 17:05:01.000000000 +0100 +++ b/net/core/neighbour.c 2005-02-25 17:18:50.000000000 +0100 @@ -363,6 +363,9 @@ struct neighbour *neigh_lookup(struct ne u32 hash_val = tbl->hash(pkey, dev) & tbl->hash_mask; NEIGH_CACHE_STAT_INC(tbl, lookups); + vxdprintk(VXD_CBIT(ngnet, 2), + "neigh_lookup(%p,%p[%s,#%u]) : %08x", + tbl, dev, dev->name, dev->nfxid, *(unsigned *)pkey); read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { @@ -383,6 +386,8 @@ struct neighbour *neigh_lookup_nodev(str u32 hash_val = tbl->hash(pkey, NULL) & tbl->hash_mask; NEIGH_CACHE_STAT_INC(tbl, lookups); + vxdprintk(VXD_CBIT(ngnet, 2), + "neigh_lookup_nodev(%p) : %08x", tbl, *(unsigned *)pkey); read_lock_bh(&tbl->lock); for (n = tbl->hash_buckets[hash_val]; n; n = n->next) { @@ -404,6 +409,9 @@ struct neighbour *neigh_create(struct ne int error; struct neighbour *n1, *rc, *n = neigh_alloc(tbl); + vxdprintk(VXD_CBIT(ngnet, 2), + "neigh_create(%p,%p[%s,#%u]) : %08x", + tbl, dev, dev->name, dev->nfxid, *(unsigned *)pkey); if (!n) { rc = ERR_PTR(-ENOBUFS); goto out; diff -NurpP --minimal a/net/core/netfilter.c b/net/core/netfilter.c --- a/net/core/netfilter.c 2005-02-25 17:05:01.000000000 +0100 +++ b/net/core/netfilter.c 2005-02-25 17:18:50.000000000 +0100 @@ -340,14 +340,17 @@ int nf_getsockopt(struct sock *sk, int p return nf_sockopt(sk, pf, val, opt, len, 1); } -static unsigned int nf_iterate(struct list_head *head, - struct sk_buff **skb, - int hook, - const struct net_device *indev, - const struct net_device *outdev, - struct list_head **i, - int (*okfn)(struct sk_buff *), - int hook_thresh) +#ifndef CONFIG_VSERVER_NGNET +static +#endif +unsigned int nf_iterate(struct list_head *head, + struct sk_buff **skb, + int hook, + const struct net_device *indev, + const struct net_device *outdev, + struct list_head **i, + int (*okfn)(struct 
sk_buff *), + int hook_thresh) { /* * The caller must not block between calls to this @@ -622,7 +625,7 @@ int ip_route_me_harder(struct sk_buff ** /* some non-standard hacks like ipt_REJECT.c:send_reset() can cause * packets with foreign saddr to appear on the NF_IP_LOCAL_OUT hook. */ - if (inet_addr_type(iph->saddr) == RTN_LOCAL) { + if (inet_addr_type(iph->saddr, (*pskb)->nfxid) == RTN_LOCAL) { fl.nl_u.ip4_u.daddr = iph->daddr; fl.nl_u.ip4_u.saddr = iph->saddr; fl.nl_u.ip4_u.tos = RT_TOS(iph->tos); @@ -630,6 +633,7 @@ int ip_route_me_harder(struct sk_buff ** #ifdef CONFIG_IP_ROUTE_FWMARK fl.nl_u.ip4_u.fwmark = (*pskb)->nfmark; #endif + fl.nfxid = (*pskb)->nfxid; fl.proto = iph->protocol; if (ip_route_output_key(&rt, &fl) != 0) return -1; diff -NurpP --minimal a/net/core/net-sysfs.c b/net/core/net-sysfs.c --- a/net/core/net-sysfs.c 2004-12-24 22:35:28.000000000 +0100 +++ b/net/core/net-sysfs.c 2005-02-25 17:18:50.000000000 +0100 @@ -420,7 +420,11 @@ int netdev_register_sysfs(struct net_dev class_dev->class = &net_class; class_dev->class_data = net; - strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); + if (net->nfxid) + snprintf(class_dev->class_id, BUS_ID_SIZE, + "%04x_%s", net->nfxid, net->name); + else + strlcpy(class_dev->class_id, net->name, BUS_ID_SIZE); if ((ret = class_device_register(class_dev))) goto out; diff -NurpP --minimal a/net/core/rtnetlink.c b/net/core/rtnetlink.c --- a/net/core/rtnetlink.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/core/rtnetlink.c 2005-02-25 17:18:50.000000000 +0100 @@ -266,13 +266,13 @@ static int rtnetlink_dump_ifinfo(struct int idx; int s_idx = cb->args[0]; struct net_device *dev; + nfxid_t nfxid = skb->sk->sk_xid; read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) continue; - if (vx_info_flags(skb->sk->sk_vx_info, VXF_HIDE_NETIF, 0) && - !dev_in_nx_info(dev, skb->sk->sk_nx_info)) + if (dev->nfxid != nfxid) continue; if (rtnetlink_fill_ifinfo(skb, dev, RTM_NEWLINK, 
NETLINK_CB(cb->skb).pid, cb->nlh->nlmsg_seq, 0) <= 0) break; @@ -444,8 +444,7 @@ void rtmsg_ifinfo(int type, struct net_d sizeof(struct rtnl_link_ifmap) + sizeof(struct rtnl_link_stats) + 128); - if (vx_flags(VXF_HIDE_NETIF, 0) && - !dev_in_nx_info(dev, current->nx_info)) + if (dev->nfxid != vx_current_xid()) return; skb = alloc_skb(size, GFP_KERNEL); if (!skb) diff -NurpP --minimal a/net/core/skbuff.c b/net/core/skbuff.c --- a/net/core/skbuff.c 2005-02-25 17:05:01.000000000 +0100 +++ b/net/core/skbuff.c 2005-02-25 17:18:50.000000000 +0100 @@ -149,6 +149,7 @@ struct sk_buff *alloc_skb(unsigned int s skb->data = data; skb->tail = data; skb->end = data + size; + skb->nfxid = vx_current_xid(); atomic_set(&(skb_shinfo(skb)->dataref), 1); skb_shinfo(skb)->nr_frags = 0; @@ -373,6 +374,8 @@ struct sk_buff *skb_clone(struct sk_buff nf_bridge_get(skb->nf_bridge); #endif #endif /*CONFIG_NETFILTER*/ + C(nfvnet); + C(nfxid); #if defined(CONFIG_HIPPI) C(private); #endif diff -NurpP --minimal a/net/core/sock.c b/net/core/sock.c --- a/net/core/sock.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/core/sock.c 2005-02-25 17:18:50.000000000 +0100 @@ -122,6 +122,7 @@ #include #include +#include #include #include @@ -638,7 +639,6 @@ struct sock *sk_alloc(int family, int pr } sk->sk_slab = slab; sock_vx_init(sk); - sock_nx_init(sk); if (security_sk_alloc(sk, family, priority)) { kmem_cache_free(slab, sk); @@ -673,9 +673,6 @@ void sk_free(struct sock *sk) // BUG_ON(sk->sk_vx_info); clr_vx_info(&sk->sk_vx_info); sk->sk_xid = -1; - // BUG_ON(sk->sk_nx_info); - clr_nx_info(&sk->sk_nx_info); - sk->sk_nid = -1; kmem_cache_free(sk->sk_slab, sk); module_put(owner); } @@ -1227,8 +1224,6 @@ void sock_init_data(struct socket *sock, set_vx_info(&sk->sk_vx_info, current->vx_info); sk->sk_xid = vx_current_xid(); vx_sock_inc(sk); - set_nx_info(&sk->sk_nx_info, current->nx_info); - sk->sk_nid = nx_current_nid(); atomic_set(&sk->sk_refcnt, 1); } diff -NurpP --minimal a/net/ipv4/af_inet.c 
b/net/ipv4/af_inet.c --- a/net/ipv4/af_inet.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/af_inet.c 2005-02-25 17:18:50.000000000 +0100 @@ -87,6 +87,8 @@ #include #include #include +#include +#include #include #include @@ -112,7 +114,6 @@ #ifdef CONFIG_IP_MROUTE #include #endif -#include DEFINE_SNMP_STAT(struct linux_mib, net_statistics); @@ -397,10 +398,6 @@ int inet_bind(struct socket *sock, struc unsigned short snum; int chk_addr_ret; int err; - __u32 s_addr; /* Address used for validation */ - __u32 s_addr1; /* Address used for socket */ - __u32 s_addr2; /* Broadcast address for the socket */ - struct nx_info *nxi = sk->sk_nx_info; /* If the socket has its own bind function then use it. (RAW) */ if (sk->sk_prot->bind) { @@ -411,40 +408,7 @@ int inet_bind(struct socket *sock, struc if (addr_len < sizeof(struct sockaddr_in)) goto out; - s_addr = addr->sin_addr.s_addr; - s_addr1 = s_addr; - s_addr2 = 0xffffffffl; - - vxdprintk(VXD_CBIT(net, 3), - "inet_bind(%p)* %p,%p;%lx %d.%d.%d.%d", - sk, sk->sk_nx_info, sk->sk_socket, - (sk->sk_socket?sk->sk_socket->flags:0), - VXD_QUAD(s_addr)); - if (nxi) { - __u32 v4_bcast = nxi->v4_bcast; - __u32 ipv4root = nxi->ipv4[0]; - int nbipv4 = nxi->nbipv4; - - if (s_addr == 0) { - /* bind to any for 1-n */ - s_addr = ipv4root; - s_addr1 = (nbipv4 > 1) ? 0 : s_addr; - s_addr2 = v4_bcast; - } else if (s_addr == 0x0100007f) { - /* rewrite localhost to ipv4root */ - s_addr = ipv4root; - s_addr1 = ipv4root; - } else if (s_addr != v4_bcast) { - /* normal address bind */ - if (!addr_in_nx_info(nxi, s_addr)) - return -EADDRNOTAVAIL; - } - } - chk_addr_ret = inet_addr_type(s_addr); - - vxdprintk(VXD_CBIT(net, 3), - "inet_bind(%p) %d.%d.%d.%d, %d.%d.%d.%d, %d.%d.%d.%d", - sk, VXD_QUAD(s_addr), VXD_QUAD(s_addr1), VXD_QUAD(s_addr2)); + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr, sk->sk_xid); /* Not specified by any standard per-se, however it breaks too * many applications when removed. 
It is unfortunate since @@ -456,7 +420,7 @@ int inet_bind(struct socket *sock, struc err = -EADDRNOTAVAIL; if (!sysctl_ip_nonlocal_bind && !inet->freebind && - s_addr != INADDR_ANY && + addr->sin_addr.s_addr != INADDR_ANY && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) @@ -481,8 +445,7 @@ int inet_bind(struct socket *sock, struc if (sk->sk_state != TCP_CLOSE || inet->num) goto out_release_sock; - inet->rcv_saddr = inet->saddr = s_addr1; - inet->rcv_saddr2 = s_addr2; + inet->rcv_saddr = inet->saddr = addr->sin_addr.s_addr; if (chk_addr_ret == RTN_MULTICAST || chk_addr_ret == RTN_BROADCAST) inet->saddr = 0; /* Use device */ diff -NurpP --minimal a/net/ipv4/arp.c b/net/ipv4/arp.c --- a/net/ipv4/arp.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/arp.c 2005-02-25 17:18:50.000000000 +0100 @@ -238,7 +238,7 @@ static int arp_constructor(struct neighb struct in_device *in_dev; struct neigh_parms *parms; - neigh->type = inet_addr_type(addr); + neigh->type = inet_addr_type(addr, dev->nfxid); rcu_read_lock(); in_dev = rcu_dereference(__in_dev_get(dev)); @@ -336,20 +336,24 @@ static void arp_solicit(struct neighbour int probes = atomic_read(&neigh->probes); struct in_device *in_dev = in_dev_get(dev); + vxdprintk(VXD_CBIT(ngnet, 0), + "arp_solicit(%p[#%u]): %p[%s,#%u]", + skb, skb ? skb->nfxid : 0, dev, dev->name, dev->nfxid); /* skb may be NULL here */ + if (!in_dev) return; switch (IN_DEV_ARP_ANNOUNCE(in_dev)) { default: case 0: /* By default announce any local IP */ - if (skb && inet_addr_type(skb->nh.iph->saddr) == RTN_LOCAL) + if (skb && inet_addr_type(skb->nh.iph->saddr, dev->nfxid) == RTN_LOCAL) saddr = skb->nh.iph->saddr; break; case 1: /* Restrict announcements of saddr in same subnet */ if (!skb) break; saddr = skb->nh.iph->saddr; - if (inet_addr_type(saddr) == RTN_LOCAL) { + if (inet_addr_type(saddr, dev->nfxid) == RTN_LOCAL) { /* saddr should be known to target */ if (inet_addr_onlink(in_dev, target, saddr)) break; @@ -480,7 +484,7 @@ int
arp_find(unsigned char *haddr, struc paddr = ((struct rtable*)skb->dst)->rt_gateway; - if (arp_set_predefined(inet_addr_type(paddr), haddr, paddr, dev)) + if (arp_set_predefined(inet_addr_type(paddr, dev->nfxid), haddr, paddr, dev)) return 0; n = __neigh_lookup(&arp_tbl, &paddr, dev, 1); @@ -830,7 +834,7 @@ static int arp_process(struct sk_buff *s /* Special case: IPv4 duplicate address detection packet (RFC2131) */ if (sip == 0) { if (arp->ar_op == htons(ARPOP_REQUEST) && - inet_addr_type(tip) == RTN_LOCAL && + inet_addr_type(tip, dev->nfxid) == RTN_LOCAL && !arp_ignore(in_dev,dev,sip,tip)) arp_send(ARPOP_REPLY,ETH_P_ARP,tip,dev,tip,sha,dev->dev_addr,dev->dev_addr); goto out; @@ -890,7 +894,7 @@ static int arp_process(struct sk_buff *s */ if (n == NULL && arp->ar_op == htons(ARPOP_REPLY) && - inet_addr_type(sip) == RTN_UNICAST) + inet_addr_type(sip, dev->nfxid) == RTN_UNICAST) n = __neigh_lookup(&arp_tbl, &sip, dev, -1); #endif @@ -969,13 +973,15 @@ static int arp_req_set(struct arpreq *r, u32 ip = ((struct sockaddr_in *) &r->arp_pa)->sin_addr.s_addr; struct neighbour *neigh; int err; + nfxid_t nfxid = vx_current_xid(); if (r->arp_flags&ATF_PUBL) { u32 mask = ((struct sockaddr_in *) &r->arp_netmask)->sin_addr.s_addr; if (mask && mask != 0xFFFFFFFF) return -EINVAL; if (!dev && (r->arp_flags & ATF_COM)) { - dev = dev_getbyhwaddr(r->arp_ha.sa_family, r->arp_ha.sa_data); + dev = dev_getbyhwaddr(r->arp_ha.sa_family, + r->arp_ha.sa_data, nfxid); if (!dev) return -ENODEV; } diff -NurpP --minimal a/net/ipv4/devinet.c b/net/ipv4/devinet.c --- a/net/ipv4/devinet.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/devinet.c 2005-02-25 17:18:50.000000000 +0100 @@ -57,6 +57,7 @@ #include #endif #include +#include #include #include @@ -489,33 +490,6 @@ static __inline__ int inet_abc_len(u32 a return rc; } -/* - Check that a device is not member of the ipv4root assigned to the process - Return true if this is the case - - If the process is not bound to specific IP, then it 
returns 0 (all - interface are fine). -*/ -static inline int devinet_notiproot (struct in_ifaddr *ifa) -{ - int ret = 0; - struct nx_info *nxi; - - if ((nxi = current->nx_info)) { - int i; - int nbip = nxi->nbipv4; - __u32 addr = ifa->ifa_local; - ret = 1; - for (i=0; i<nbip; i++) { - if(nxi->ipv4[i] == addr) { - ret = 0; - break; - } - } - } - return ret; -} - int devinet_ioctl(unsigned int cmd, void __user *arg) { @@ -623,9 +597,6 @@ int devinet_ioctl(unsigned int cmd, void ret = -EADDRNOTAVAIL; if (!ifa && cmd != SIOCSIFADDR && cmd != SIOCSIFFLAGS) goto done; - if (vx_flags(VXF_HIDE_NETIF, 0) && - !ifa_in_nx_info(ifa, current->nx_info)) - goto done; switch(cmd) { case SIOCGIFADDR: /* Get interface address */ @@ -769,9 +740,6 @@ static int inet_gifconf(struct net_devic goto out; for (; ifa; ifa = ifa->ifa_next) { - if (vx_flags(VXF_HIDE_NETIF, 0) && - !ifa_in_nx_info(ifa, current->nx_info)) - continue; if (!buf) { done += sizeof(ifr); continue; @@ -804,6 +772,7 @@ u32 inet_select_addr(const struct net_de { u32 addr = 0; struct in_device *in_dev; + nfxid_t nfxid = dev->nfxid; rcu_read_lock(); in_dev = __in_dev_get(dev); @@ -833,6 +802,8 @@ no_in_dev: read_lock(&dev_base_lock); rcu_read_lock(); for (dev = dev_base; dev; dev = dev->next) { + if (dev->nfxid != nfxid) + continue; if ((in_dev = __in_dev_get(dev)) == NULL) continue; @@ -899,6 +870,7 @@ u32 inet_confirm_addr(const struct net_d { u32 addr = 0; struct in_device *in_dev; + nfxid_t nfxid = dev ? dev->nfxid : vx_current_xid(); /* dev may be NULL, it is tested right below */ if (dev) { rcu_read_lock(); @@ -912,6 +884,8 @@ u32 inet_confirm_addr(const struct net_d read_lock(&dev_base_lock); rcu_read_lock(); for (dev = dev_base; dev; dev = dev->next) { + if (dev->nfxid != nfxid) + continue; if ((in_dev = __in_dev_get(dev))) { addr = confirm_addr_indev(in_dev, dst, local, scope); if (addr) @@ -976,17 +950,22 @@ static int inetdev_event(struct notifier ASSERT_RTNL(); + vxdprintk(VXD_CBIT(ngnet, 5), + "inetdev_event(%lu,%p[%s,#%u])", + event, dev, dev->name, dev->nfxid); + if (!in_dev) goto out; switch (event) {
case NETDEV_REGISTER: - printk(KERN_DEBUG "inetdev_event: bug\n"); + printk(KERN_DEBUG "inetdev_even bug\n"); dev->ip_ptr = NULL; break; case NETDEV_UP: if (dev->mtu < 68) break; + /* FIXME special case for lo up ? */ if (dev == &loopback_dev) { struct in_ifaddr *ifa; if ((ifa = inet_alloc_ifa()) != NULL) { @@ -1078,27 +1057,30 @@ static int inet_dump_ifaddr(struct sk_bu struct net_device *dev; struct in_device *in_dev; struct in_ifaddr *ifa; - struct sock *sk = skb->sk; int s_ip_idx, s_idx = cb->args[0]; + nfxid_t nfxid = vx_current_xid(); /* FIXME skb ? */ + + vxdprintk(VXD_CBIT(ngnet, 6), + "inet_dump_ifaddr(%p[#%u]): #%u", + skb, skb->nfxid, nfxid); s_ip_idx = ip_idx = cb->args[1]; read_lock(&dev_base_lock); - for (dev = dev_base, idx = 0; dev; dev = dev->next, idx++) { - if (idx < s_idx) + for (dev = dev_base, idx = 0; dev; dev = dev->next) { + if (dev->nfxid != nfxid) continue; + if (idx < s_idx) + goto next; if (idx > s_idx) s_ip_idx = 0; rcu_read_lock(); if ((in_dev = __in_dev_get(dev)) == NULL) { rcu_read_unlock(); - continue; + goto next; } for (ifa = in_dev->ifa_list, ip_idx = 0; ifa; ifa = ifa->ifa_next, ip_idx++) { - if (sk && vx_info_flags(sk->sk_vx_info, VXF_HIDE_NETIF, 0) && - !ifa_in_nx_info(ifa, sk->sk_nx_info)) - continue; if (ip_idx < s_ip_idx) continue; if (inet_fill_ifaddr(skb, ifa, NETLINK_CB(cb->skb).pid, @@ -1109,6 +1091,8 @@ static int inet_dump_ifaddr(struct sk_bu } } rcu_read_unlock(); + next: + idx++; } done: @@ -1155,6 +1139,7 @@ void inet_forward_change(void) { struct net_device *dev; int on = ipv4_devconf.forwarding; + nfxid_t nfxid = vx_current_xid(); /* FIXME argument!
*/ ipv4_devconf.accept_redirects = !on; ipv4_devconf_dflt.forwarding = on; @@ -1162,6 +1147,9 @@ void inet_forward_change(void) read_lock(&dev_base_lock); for (dev = dev_base; dev; dev = dev->next) { struct in_device *in_dev; + + if (dev->nfxid != nfxid) + continue; rcu_read_lock(); in_dev = __in_dev_get(dev); if (in_dev) diff -NurpP --minimal a/net/ipv4/fib_frontend.c b/net/ipv4/fib_frontend.c --- a/net/ipv4/fib_frontend.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/fib_frontend.c 2005-02-25 17:18:50.000000000 +0100 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -60,11 +61,14 @@ struct fib_table *ip_fib_main_table; struct fib_table *fib_tables[RT_TABLE_MAX+1]; -struct fib_table *__fib_new_table(int id) +struct fib_table *__fib_new_table(int id, nfxid_t nfxid) { struct fib_table *tb; - tb = fib_hash_init(id); + vxdprintk(VXD_CBIT(ngnet, 1), + "__fib_new_table(%u,#%u)", id, nfxid); + + tb = fib_hash_init(id, nfxid); if (!tb) return NULL; fib_tables[id] = tb; @@ -75,7 +79,7 @@ struct fib_table *__fib_new_table(int id #endif /* CONFIG_IP_MULTIPLE_TABLES */ -static void fib_flush(void) +static void fib_flush(nfxid_t nfxid) { int flushed = 0; #ifdef CONFIG_IP_MULTIPLE_TABLES @@ -83,13 +87,16 @@ static void fib_flush(void) int id; for (id = RT_TABLE_MAX; id>0; id--) { - if ((tb = fib_get_table(id))==NULL) + if ((tb = fib_get_table(id, nfxid))==NULL) continue; flushed += tb->tb_flush(tb); } #else /* CONFIG_IP_MULTIPLE_TABLES */ - flushed += ip_fib_main_table->tb_flush(ip_fib_main_table); - flushed += ip_fib_local_table->tb_flush(ip_fib_local_table); + struct fib_table *fib_main = __vx_fib_main_table(nfxid); + struct fib_table *fib_local = __vx_fib_local_table(nfxid); + + flushed += fib_main->tb_flush(fib_main); + flushed += fib_local->tb_flush(fib_local); #endif /* CONFIG_IP_MULTIPLE_TABLES */ if (flushed) @@ -100,18 +107,23 @@ static void fib_flush(void) * Find the first device with a given source address. 
*/ -struct net_device * ip_dev_find(u32 addr) +struct net_device * ip_dev_find(u32 addr, nfxid_t nfxid) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } }, + .nfxid = nfxid }; struct fib_result res; struct net_device *dev = NULL; + struct fib_table *fib_local = __vx_fib_local_table(fl.nfxid); + vxdprintk(VXD_CBIT(ngnet, 4), + "ip_dev_find(%u.%u.%u.%u, #%u)", + NIPQUAD(addr), nfxid); #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; #endif - if (!ip_fib_local_table || - ip_fib_local_table->tb_lookup(ip_fib_local_table, &fl, &res)) + if (!fib_local || + fib_local->tb_lookup(fib_local, &fl, &res)) return NULL; if (res.type != RTN_LOCAL) goto out; @@ -124,11 +136,17 @@ out: return dev; } -unsigned inet_addr_type(u32 addr) +unsigned inet_addr_type(u32 addr, nfxid_t nfxid) { - struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } } }; + struct flowi fl = { .nl_u = { .ip4_u = { .daddr = addr } }, + .nfxid = nfxid }; struct fib_result res; unsigned ret = RTN_BROADCAST; + struct fib_table *fib_local = __vx_fib_local_table(fl.nfxid); + + vxdprintk(VXD_CBIT(ngnet, 4), + "inet_addr_type(%u.%u.%u.%u, #%u)", + NIPQUAD(addr), nfxid); if (ZERONET(addr) || BADCLASS(addr)) return RTN_BROADCAST; @@ -139,10 +157,9 @@ unsigned inet_addr_type(u32 addr) res.r = NULL; #endif - if (ip_fib_local_table) { + if (fib_local) { ret = RTN_UNICAST; - if (!ip_fib_local_table->tb_lookup(ip_fib_local_table, - &fl, &res)) { + if (!fib_local->tb_lookup(fib_local, &fl, &res)) { ret = res.type; fib_res_put(&res); } @@ -166,11 +183,16 @@ int fib_validate_source(u32 src, u32 dst { .daddr = src, .saddr = dst, .tos = tos } }, - .iif = oif }; + .iif = oif, + .nfxid = dev->nfxid }; struct fib_result res; int no_addr, rpf; int ret; + vxdprintk(VXD_CBIT(ngnet, 4), + "fib_validate_source(%p[%s,#%u]): %u.%u.%u.%u <- %u.%u.%u.%u", + dev, dev->name, dev->nfxid, NIPQUAD(dst), NIPQUAD(src)); + no_addr = rpf = 0; rcu_read_lock(); in_dev = 
__in_dev_get(dev); @@ -244,6 +266,10 @@ int ip_rt_ioctl(unsigned int cmd, void _ struct nlmsghdr nlh; struct rtmsg rtm; } req; + nfxid_t nfxid = vx_current_xid(); + + vxdprintk(VXD_CBIT(ngnet, 5), + "ip_rt_ioctl(%x) #%u", cmd, nfxid); switch (cmd) { case SIOCADDRT: /* Add a route */ @@ -256,12 +282,13 @@ int ip_rt_ioctl(unsigned int cmd, void _ err = fib_convert_rtentry(cmd, &req.nlh, &req.rtm, &rta, &r); if (err == 0) { if (cmd == SIOCDELRT) { - struct fib_table *tb = fib_get_table(req.rtm.rtm_table); + struct fib_table *tb = fib_get_table( + req.rtm.rtm_table, nfxid); err = -ESRCH; if (tb) err = tb->tb_delete(tb, &req.rtm, &rta, &req.nlh, NULL); } else { - struct fib_table *tb = fib_new_table(req.rtm.rtm_table); + struct fib_table *tb = fib_new_table(req.rtm.rtm_table, nfxid); err = -ENOBUFS; if (tb) err = tb->tb_insert(tb, &req.rtm, &rta, &req.nlh, NULL); @@ -305,11 +332,15 @@ int inet_rtm_delroute(struct sk_buff *sk struct fib_table * tb; struct rtattr **rta = arg; struct rtmsg *r = NLMSG_DATA(nlh); + nfxid_t nfxid = skb->nfxid; + + vxdprintk(VXD_CBIT(ngnet, 4), + "inet_rtm_delroute(%p[#%u])", skb, nfxid); if (inet_check_attr(r, rta)) return -EINVAL; - tb = fib_get_table(r->rtm_table); + tb = fib_get_table(r->rtm_table, nfxid); if (tb) return tb->tb_delete(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); return -ESRCH; @@ -320,11 +351,15 @@ int inet_rtm_newroute(struct sk_buff *sk struct fib_table * tb; struct rtattr **rta = arg; struct rtmsg *r = NLMSG_DATA(nlh); + nfxid_t nfxid = skb->nfxid; + + vxdprintk(VXD_CBIT(ngnet, 4), + "inet_rtm_newroute(%p[#%u])", skb, nfxid); if (inet_check_attr(r, rta)) return -EINVAL; - tb = fib_new_table(r->rtm_table); + tb = fib_new_table(r->rtm_table, nfxid); if (tb) return tb->tb_insert(tb, r, (struct kern_rta*)rta, nlh, &NETLINK_CB(skb)); return -ENOBUFS; @@ -335,6 +370,10 @@ int inet_dump_fib(struct sk_buff *skb, s int t; int s_t; struct fib_table *tb; + nfxid_t nfxid = skb->nfxid; + + vxdprintk(VXD_CBIT(ngnet, 4), + 
"inet_dump_fib(%p[#%u])", skb, nfxid); if (NLMSG_PAYLOAD(cb->nlh, 0) >= sizeof(struct rtmsg) && ((struct rtmsg*)NLMSG_DATA(cb->nlh))->rtm_flags&RTM_F_CLONED) @@ -348,7 +387,7 @@ int inet_dump_fib(struct sk_buff *skb, s if (t < s_t) continue; if (t > s_t) memset(&cb->args[1], 0, sizeof(cb->args)-sizeof(cb->args[0])); - if ((tb = fib_get_table(t))==NULL) + if ((tb = fib_get_table(t, nfxid))==NULL) continue; if (tb->tb_dump(tb, skb, cb) < 0) break; @@ -366,7 +405,8 @@ int inet_dump_fib(struct sk_buff *skb, s only when netlink is already locked. */ -static void fib_magic(int cmd, int type, u32 dst, int dst_len, struct in_ifaddr *ifa) +static void fib_magic(int cmd, int type, u32 dst, int dst_len, + struct in_ifaddr *ifa, nfxid_t nfxid) { struct fib_table * tb; struct { @@ -375,13 +415,17 @@ static void fib_magic(int cmd, int type, } req; struct kern_rta rta; + vxdprintk(VXD_CBIT(ngnet, 4), + "fib_magic(%u,#%u): %u.%u.%u.%u", + cmd, nfxid, NIPQUAD(dst)); + memset(&req.rtm, 0, sizeof(req.rtm)); memset(&rta, 0, sizeof(rta)); if (type == RTN_UNICAST) - tb = fib_new_table(RT_TABLE_MAIN); + tb = fib_new_table(RT_TABLE_MAIN, nfxid); else - tb = fib_new_table(RT_TABLE_LOCAL); + tb = fib_new_table(RT_TABLE_LOCAL, nfxid); if (tb == NULL) return; @@ -416,6 +460,11 @@ static void fib_add_ifaddr(struct in_ifa u32 mask = ifa->ifa_mask; u32 addr = ifa->ifa_local; u32 prefix = ifa->ifa_address&mask; + nfxid_t nfxid = dev->nfxid; + + vxdprintk(VXD_CBIT(ngnet, 4), + "fib_add_ifaddr(%p): %p[%s,#%u] %u.%u.%u.%u/%u.%u.%u.%u", + ifa, dev, dev->name, nfxid, NIPQUAD(addr), NIPQUAD(mask)); if (ifa->ifa_flags&IFA_F_SECONDARY) { prim = inet_ifa_byprefix(in_dev, prefix, mask); @@ -425,24 +474,27 @@ static void fib_add_ifaddr(struct in_ifa } } - fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_LOCAL, addr, 32, prim, nfxid); if (!(dev->flags&IFF_UP)) return; /* Add broadcast address, if it is explicitly assigned. 
*/ if (ifa->ifa_broadcast && ifa->ifa_broadcast != 0xFFFFFFFF) - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, ifa->ifa_broadcast, + 32, prim, nfxid); if (!ZERONET(prefix) && !(ifa->ifa_flags&IFA_F_SECONDARY) && (prefix != addr || ifa->ifa_prefixlen < 32)) { fib_magic(RTM_NEWROUTE, dev->flags&IFF_LOOPBACK ? RTN_LOCAL : - RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim); + RTN_UNICAST, prefix, ifa->ifa_prefixlen, prim, nfxid); /* Add network specific broadcasts, when it takes a sense */ if (ifa->ifa_prefixlen < 31) { - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, 32, prim); - fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, 32, prim); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix, + 32, prim, nfxid); + fib_magic(RTM_NEWROUTE, RTN_BROADCAST, prefix|~mask, + 32, prim, nfxid); } } } @@ -460,10 +512,16 @@ static void fib_del_ifaddr(struct in_ifa #define BRD0_OK 4 #define BRD1_OK 8 unsigned ok = 0; + nfxid_t nfxid = dev->nfxid; + + vxdprintk(VXD_CBIT(ngnet, 4), + "fib_del_ifaddr(%p): %p[%s,#%u] %u.%u.%u.%u/%u.%u.%u.%u", + ifa, dev, dev->name, nfxid, + NIPQUAD(ifa->ifa_address), NIPQUAD(ifa->ifa_mask)); if (!(ifa->ifa_flags&IFA_F_SECONDARY)) fib_magic(RTM_DELROUTE, dev->flags&IFF_LOOPBACK ? 
RTN_LOCAL : - RTN_UNICAST, any, ifa->ifa_prefixlen, prim); + RTN_UNICAST, any, ifa->ifa_prefixlen, prim, nfxid); else { prim = inet_ifa_byprefix(in_dev, any, ifa->ifa_mask); if (prim == NULL) { @@ -490,24 +548,26 @@ static void fib_del_ifaddr(struct in_ifa } if (!(ok&BRD_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, 32, prim); + fib_magic(RTM_DELROUTE, RTN_BROADCAST, ifa->ifa_broadcast, + 32, prim, nfxid); if (!(ok&BRD1_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim); + fib_magic(RTM_DELROUTE, RTN_BROADCAST, brd, 32, prim, nfxid); if (!(ok&BRD0_OK)) - fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim); + fib_magic(RTM_DELROUTE, RTN_BROADCAST, any, 32, prim, nfxid); if (!(ok&LOCAL_OK)) { - fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, 32, prim); + fib_magic(RTM_DELROUTE, RTN_LOCAL, ifa->ifa_local, + 32, prim, nfxid); /* Check, that this local address finally disappeared. */ - if (inet_addr_type(ifa->ifa_local) != RTN_LOCAL) { + if (inet_addr_type(ifa->ifa_local, nfxid) != RTN_LOCAL) { /* And the last, but not the least thing. We must flush stray FIB entries. First of all, we scan fib_info list searching for stray nexthop entries, then ignite fib_flush. 
*/ - if (fib_sync_down(ifa->ifa_local, NULL, 0)) - fib_flush(); + if (fib_sync_down(ifa->ifa_local, NULL, 0, nfxid)) + fib_flush(nfxid); } } #undef LOCAL_OK @@ -518,8 +578,8 @@ static void fib_del_ifaddr(struct in_ifa static void fib_disable_ip(struct net_device *dev, int force) { - if (fib_sync_down(0, dev, force)) - fib_flush(); + if (fib_sync_down(0, dev, force, dev->nfxid)) + fib_flush(dev->nfxid); rt_cache_flush(0); arp_ifdown(dev); } @@ -528,6 +588,9 @@ static int fib_inetaddr_event(struct not { struct in_ifaddr *ifa = (struct in_ifaddr*)ptr; + vxdprintk(VXD_CBIT(ngnet, 6), + "fib_inetaddr_event(%lu,%p)", event, ifa); + switch (event) { case NETDEV_UP: fib_add_ifaddr(ifa); @@ -556,6 +619,10 @@ static int fib_netdev_event(struct notif struct net_device *dev = ptr; struct in_device *in_dev = __in_dev_get(dev); + vxdprintk(VXD_CBIT(ngnet, 6), + "fib_netdev_event(%lu,%p[%s,#%u])", + event, dev, dev->name, dev->nfxid); + if (event == NETDEV_UNREGISTER) { fib_disable_ip(dev, 2); return NOTIFY_DONE; @@ -596,8 +663,8 @@ static struct notifier_block fib_netdev_ void __init ip_fib_init(void) { #ifndef CONFIG_IP_MULTIPLE_TABLES - ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL); - ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN); + ip_fib_local_table = fib_hash_init(RT_TABLE_LOCAL, 0); + ip_fib_main_table = fib_hash_init(RT_TABLE_MAIN, 0); #else fib_rules_init(); #endif diff -NurpP --minimal a/net/ipv4/fib_hash.c b/net/ipv4/fib_hash.c --- a/net/ipv4/fib_hash.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/fib_hash.c 2005-02-25 17:18:50.000000000 +0100 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -391,6 +392,9 @@ fn_hash_insert(struct fib_table *tb, str u32 key; int err; + vxdprintk(VXD_CBIT(ngnet, 6), + "fn_hash_insert(%p[#%d])", tb, tb->tb_nfxid); + if (z > 32) return -EINVAL; fz = table->fn_zones[z]; @@ -406,7 +410,7 @@ fn_hash_insert(struct fib_table *tb, str key = fz_key(dst, fz); } - if ((fi = fib_create_info(r, rta, n, &err)) 
== NULL) + if ((fi = fib_create_info(r, rta, n, &err, tb->tb_nfxid)) == NULL) return err; if (fz->fz_nent > (fz->fz_divisor<<1) && @@ -544,6 +548,9 @@ fn_hash_delete(struct fib_table *tb, str u32 key; u8 tos = r->rtm_tos; + vxdprintk(VXD_CBIT(ngnet, 6), + "fn_hash_delete(%p[#%d])", tb, tb->tb_nfxid); + if (z > 32) return -EINVAL; if ((fz = table->fn_zones[z]) == NULL) @@ -756,11 +763,7 @@ static int fn_hash_dump(struct fib_table return skb->len; } -#ifdef CONFIG_IP_MULTIPLE_TABLES -struct fib_table * fib_hash_init(int id) -#else -struct fib_table * __init fib_hash_init(int id) -#endif +struct fib_table * fib_hash_init(int id, nfxid_t nfxid) { struct fib_table *tb; @@ -781,7 +784,11 @@ struct fib_table * __init fib_hash_init( if (tb == NULL) return NULL; + vxdprintk(VXD_CBIT(ngnet, 1), + "fib_hash_init(%u,#%u): %p", id, nfxid, tb); + tb->tb_id = id; + tb->tb_nfxid = nfxid; tb->tb_lookup = fn_hash_lookup; tb->tb_insert = fn_hash_insert; tb->tb_delete = fn_hash_delete; @@ -801,12 +808,14 @@ struct fib_iter_state { struct hlist_head *hash_head; struct fib_node *fn; struct fib_alias *fa; + xid_t xid; }; static struct fib_alias *fib_get_first(struct seq_file *seq) { struct fib_iter_state *iter = seq->private; - struct fn_hash *table = (struct fn_hash *) ip_fib_main_table->tb_data; + struct fib_table *fib_main = __vx_fib_main_table(iter->xid); + struct fn_hash *table = (struct fn_hash *) fib_main->tb_data; iter->bucket = 0; iter->hash_head = NULL; @@ -922,9 +931,11 @@ out: static void *fib_seq_start(struct seq_file *seq, loff_t *pos) { void *v = NULL; + struct fib_iter_state *iter = seq->private; + struct fib_table *fib_main = __vx_fib_main_table(iter->xid); read_lock(&fib_hash_lock); - if (ip_fib_main_table) + if (fib_main) v = *pos ? fib_get_next(seq) : SEQ_START_TOKEN; return v; } @@ -955,8 +966,6 @@ static unsigned fib_flag_trans(int type, return flags; } -extern int dev_in_nx_info(struct net_device *, struct nx_info *); - /* * This outputs /proc/net/route. 
* @@ -987,8 +996,7 @@ static int fib_seq_show(struct seq_file prefix = f->fn_key; mask = FZ_MASK(iter->zone); flags = fib_flag_trans(fa->fa_type, mask, fi); - if (fi && (!vx_flags(VXF_HIDE_NETIF, 0) || - dev_in_nx_info(fi->fib_dev, current->nx_info))) + if (fi) snprintf(bf, sizeof(bf), "%s\t%08X\t%08X\t%04X\t%d\t%u\t%d\t%08X\t%d\t%u\t%u", fi->fib_dev ? fi->fib_dev->name : "*", prefix, @@ -1028,6 +1036,7 @@ static int fib_seq_open(struct inode *in seq = file->private_data; seq->private = s; memset(s, 0, sizeof(*s)); + s->xid = vx_current_xid(); out: return rc; out_kfree: diff -NurpP --minimal a/net/ipv4/fib_lookup.h b/net/ipv4/fib_lookup.h --- a/net/ipv4/fib_lookup.h 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/fib_lookup.h 2005-02-25 17:18:50.000000000 +0100 @@ -24,7 +24,7 @@ extern void fib_release_info(struct fib_ extern struct fib_info *fib_create_info(const struct rtmsg *r, struct kern_rta *rta, const struct nlmsghdr *, - int *err); + int *err, nfxid_t nfxid); extern int fib_nh_match(struct rtmsg *r, struct nlmsghdr *, struct kern_rta *rta, struct fib_info *fi); extern int fib_dump_info(struct sk_buff *skb, u32 pid, u32 seq, int event, diff -NurpP --minimal a/net/ipv4/fib_rules.c b/net/ipv4/fib_rules.c --- a/net/ipv4/fib_rules.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/fib_rules.c 2005-02-25 17:18:50.000000000 +0100 @@ -289,10 +289,20 @@ int fib_lookup(const struct flowi *flp, u32 daddr = flp->fl4_dst; u32 saddr = flp->fl4_src; + vxdprintk(VXD_CBIT(ngnet, 2), + "fib_lookup(%p[#%d]) %u.%u.%u.%u <- %u.%u.%u.%u", + flp, flp->nfxid, + NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src)); FRprintk("Lookup: %u.%u.%u.%u <- %u.%u.%u.%u ", NIPQUAD(flp->fl4_dst), NIPQUAD(flp->fl4_src)); read_lock(&fib_rules_lock); for (r = fib_rules; r; r=r->r_next) { +/* + printk("rule: %u.%u.%u.%u/%u.%u.%u.%u, %u.%u.%u.%u/%u.%u.%u.%u\n", + NIPQUAD(r->r_src), NIPQUAD(r->r_srcmask), + NIPQUAD(r->r_dst), NIPQUAD(r->r_dstmask)); +*/ + if (((saddr^r->r_src) & r->r_srcmask) || 
((daddr^r->r_dst) & r->r_dstmask) || (r->r_tos && r->r_tos != flp->fl4_tos) || diff -NurpP --minimal a/net/ipv4/fib_semantics.c b/net/ipv4/fib_semantics.c --- a/net/ipv4/fib_semantics.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/fib_semantics.c 2005-02-25 17:18:50.000000000 +0100 @@ -35,6 +35,7 @@ #include #include #include +#include #include #include @@ -217,6 +218,8 @@ static struct fib_info *fib_find_info(co head = &fib_info_hash[hash]; hlist_for_each_entry(fi, node, head, fib_hash) { + if (fi->fib_nfxid != nfi->fib_nfxid) + continue; if (fi->fib_nhs != nfi->fib_nhs) continue; if (nfi->fib_protocol == fi->fib_protocol && @@ -489,6 +492,7 @@ int fib_nh_match(struct rtmsg *r, struct static int fib_check_nh(const struct rtmsg *r, struct fib_info *fi, struct fib_nh *nh) { int err; + nfxid_t nfxid = fi->fib_nfxid; if (nh->nh_gw) { struct fib_result res; @@ -502,7 +506,7 @@ static int fib_check_nh(const struct rtm if (r->rtm_scope >= RT_SCOPE_LINK) return -EINVAL; - if (inet_addr_type(nh->nh_gw) != RTN_UNICAST) + if (inet_addr_type(nh->nh_gw, nfxid) != RTN_UNICAST) return -EINVAL; if ((dev = __dev_get_by_index(nh->nh_oif)) == NULL) return -ENODEV; @@ -517,7 +521,8 @@ static int fib_check_nh(const struct rtm struct flowi fl = { .nl_u = { .ip4_u = { .daddr = nh->nh_gw, .scope = r->rtm_scope + 1 } }, - .oif = nh->nh_oif }; + .oif = nh->nh_oif, + .nfxid = nfxid }; /* It is not necessary, but requires a bit of thinking */ if (fl.fl4_scope < RT_SCOPE_LINK) @@ -639,7 +644,7 @@ static void fib_hash_move(struct hlist_h struct fib_info * fib_create_info(const struct rtmsg *r, struct kern_rta *rta, - const struct nlmsghdr *nlh, int *errp) + const struct nlmsghdr *nlh, int *errp, nfxid_t nfxid) { int err; struct fib_info *fi = NULL; @@ -695,6 +700,7 @@ fib_create_info(const struct rtmsg *r, s memset(fi, 0, sizeof(*fi)+nhs*sizeof(struct fib_nh)); fi->fib_protocol = r->rtm_protocol; + fi->fib_nfxid = nfxid; fi->fib_nhs = nhs; change_nexthops(fi) { @@ -782,7 +788,7 @@ 
fib_create_info(const struct rtmsg *r, s if (fi->fib_prefsrc) { if (r->rtm_type != RTN_LOCAL || rta->rta_dst == NULL || memcmp(&fi->fib_prefsrc, rta->rta_dst, 4)) - if (inet_addr_type(fi->fib_prefsrc) != RTN_LOCAL) + if (inet_addr_type(fi->fib_prefsrc, fi->fib_nfxid) != RTN_LOCAL) goto err_inval; } @@ -1072,7 +1078,7 @@ fib_convert_rtentry(int cmd, struct nlms ptr = &((struct sockaddr_in*)&r->rt_gateway)->sin_addr.s_addr; if (r->rt_gateway.sa_family == AF_INET && *ptr) { rta->rta_gw = ptr; - if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr) == RTN_UNICAST) + if (r->rt_flags&RTF_GATEWAY && inet_addr_type(*ptr, vx_current_xid()) == RTN_UNICAST) rtm->rtm_scope = RT_SCOPE_UNIVERSE; } @@ -1127,11 +1133,15 @@ fib_convert_rtentry(int cmd, struct nlms - device went down -> we must shutdown all nexthops going via it. */ -int fib_sync_down(u32 local, struct net_device *dev, int force) +int fib_sync_down(u32 local, struct net_device *dev, int force, nfxid_t nfxid) { int ret = 0; int scope = RT_SCOPE_NOWHERE; + vxdprintk(VXD_CBIT(ngnet, 5), + "fib_sync_down(%p[%s,#%u],%d): %u.%u.%u.%u", + dev, dev?dev->name:NULL, nfxid, force, NIPQUAD(local)); + if (force) scope = -1; @@ -1142,7 +1152,11 @@ int fib_sync_down(u32 local, struct net_ struct fib_info *fi; hlist_for_each_entry(fi, node, head, fib_lhash) { + if (fi->fib_nfxid != nfxid) + continue; if (fi->fib_prefsrc == local) { + vxdprintk(VXD_CBIT(ngnet, 1), + "tear down %p[#%u]", fi, nfxid); fi->fib_flags |= RTNH_F_DEAD; ret++; } diff -NurpP --minimal a/net/ipv4/icmp.c b/net/ipv4/icmp.c --- a/net/ipv4/icmp.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/icmp.c 2005-02-25 17:18:50.000000000 +0100 @@ -404,7 +404,8 @@ static void icmp_reply(struct icmp_bxm * { .daddr = daddr, .saddr = rt->rt_spec_dst, .tos = RT_TOS(skb->nh.iph->tos) } }, - .proto = IPPROTO_ICMP }; + .proto = IPPROTO_ICMP, + .nfxid = skb->nfxid, }; if (ip_route_output_key(&rt, &fl)) goto out_unlock; } @@ -662,7 +663,7 @@ static void icmp_unreach(struct sk_buff 
*/ if (!sysctl_icmp_ignore_bogus_error_responses && - inet_addr_type(iph->daddr) == RTN_BROADCAST) { + inet_addr_type(iph->daddr, skb->nfxid) == RTN_BROADCAST) { if (net_ratelimit()) printk(KERN_WARNING "%u.%u.%u.%u sent an invalid ICMP " "type %u, code %u " @@ -692,8 +693,8 @@ static void icmp_unreach(struct sk_buff read_lock(&raw_v4_lock); if ((raw_sk = sk_head(&raw_v4_htable[hash])) != NULL) { while ((raw_sk = __raw_v4_lookup(raw_sk, protocol, iph->daddr, - iph->saddr, - skb->dev->ifindex)) != NULL) { + iph->saddr, skb->dev->ifindex, + skb->nfxid)) != NULL) { raw_err(raw_sk, skb, info); raw_sk = sk_next(raw_sk); iph = (struct iphdr *)skb->data; diff -NurpP --minimal a/net/ipv4/igmp.c b/net/ipv4/igmp.c --- a/net/ipv4/igmp.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/igmp.c 2005-02-25 17:18:50.000000000 +0100 @@ -1296,10 +1296,11 @@ void ip_mc_destroy_dev(struct in_device write_unlock_bh(&in_dev->mc_list_lock); } -static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr) +static struct in_device * ip_mc_find_dev(struct ip_mreqn *imr, nfxid_t nfxid) { struct flowi fl = { .nl_u = { .ip4_u = - { .daddr = imr->imr_multiaddr.s_addr } } }; + { .daddr = imr->imr_multiaddr.s_addr } }, + .nfxid = nfxid }; struct rtable *rt; struct net_device *dev = NULL; struct in_device *idev = NULL; @@ -1311,7 +1312,7 @@ static struct in_device * ip_mc_find_dev return idev; } if (imr->imr_address.s_addr) { - dev = ip_dev_find(imr->imr_address.s_addr); + dev = ip_dev_find(imr->imr_address.s_addr, nfxid); if (!dev) return NULL; __dev_put(dev); @@ -1625,7 +1626,7 @@ int ip_mc_join_group(struct sock *sk , s rtnl_shlock(); - in_dev = ip_mc_find_dev(imr); + in_dev = ip_mc_find_dev(imr, sk->sk_xid); if (!in_dev) { iml = NULL; @@ -1746,7 +1747,7 @@ int ip_mc_source(int add, int omode, str imr.imr_multiaddr.s_addr = mreqs->imr_multiaddr; imr.imr_address.s_addr = mreqs->imr_interface; imr.imr_ifindex = ifindex; - in_dev = ip_mc_find_dev(&imr); + in_dev = ip_mc_find_dev(&imr, 
sk->sk_xid); if (!in_dev) { err = -ENODEV; @@ -1866,7 +1867,7 @@ int ip_mc_msfilter(struct sock *sk, stru imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = ifindex; - in_dev = ip_mc_find_dev(&imr); + in_dev = ip_mc_find_dev(&imr, sk->sk_xid); if (!in_dev) { err = -ENODEV; @@ -1933,7 +1934,7 @@ int ip_mc_msfget(struct sock *sk, struct imr.imr_multiaddr.s_addr = msf->imsf_multiaddr; imr.imr_address.s_addr = msf->imsf_interface; imr.imr_ifindex = 0; - in_dev = ip_mc_find_dev(&imr); + in_dev = ip_mc_find_dev(&imr, sk->sk_xid); if (!in_dev) { err = -ENODEV; diff -NurpP --minimal a/net/ipv4/ip_options.c b/net/ipv4/ip_options.c --- a/net/ipv4/ip_options.c 2004-12-24 22:35:23.000000000 +0100 +++ b/net/ipv4/ip_options.c 2005-02-25 17:18:50.000000000 +0100 @@ -148,7 +148,7 @@ int ip_options_echo(struct ip_options * __u32 addr; memcpy(&addr, sptr+soffset-1, 4); - if (inet_addr_type(addr) != RTN_LOCAL) { + if (inet_addr_type(addr, skb->nfxid) != RTN_LOCAL) { dopt->ts_needtime = 1; soffset += 8; } @@ -389,7 +389,7 @@ int ip_options_compile(struct ip_options { u32 addr; memcpy(&addr, &optptr[optptr[2]-1], 4); - if (inet_addr_type(addr) == RTN_UNICAST) + if (inet_addr_type(addr, skb->nfxid) == RTN_UNICAST) break; if (skb) timeptr = (__u32*)&optptr[optptr[2]+3]; diff -NurpP --minimal a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c --- a/net/ipv4/ip_output.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/ip_output.c 2005-02-25 17:18:50.000000000 +0100 @@ -84,6 +84,9 @@ #include #include #include +#include +#include +#include /* * Shall we try to damage output packets if routing dev changes? @@ -163,6 +166,7 @@ int ip_build_and_send_pkt(struct sk_buff ip_send_check(iph); skb->priority = sk->sk_priority; + vx_tag_output_skb(sk, skb); /* Send it out. 
*/ return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, @@ -301,6 +305,10 @@ int ip_queue_xmit(struct sk_buff *skb, i struct ip_options *opt = inet->opt; struct rtable *rt; struct iphdr *iph; + nfxid_t nfxid = skb->nfxid; + + vxdprintk(VXD_CBIT(ngnet, 7), + "ip_queue_xmit(%p[#%u])", skb, skb->nfxid); /* Skip all of this if the packet is already routed, * f.e. by something like SCTP. @@ -328,7 +336,8 @@ int ip_queue_xmit(struct sk_buff *skb, i .proto = sk->sk_protocol, .uli_u = { .ports = { .sport = inet->sport, - .dport = inet->dport } } }; + .dport = inet->dport } }, + .nfxid = nfxid }; /* If this fails, retransmit mechanism of transport layer will * keep trying until route appears or the connection times @@ -372,6 +381,7 @@ packet_routed: ip_send_check(iph); skb->priority = sk->sk_priority; + vx_tag_output_skb(sk, skb); return NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); @@ -1186,6 +1196,7 @@ int ip_push_pending_frames(struct sock * skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); + vx_tag_output_skb(sk, skb); /* Netfilter gets whole the not fragmented skb. 
*/ err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, diff -NurpP --minimal a/net/ipv4/ip_sockglue.c b/net/ipv4/ip_sockglue.c --- a/net/ipv4/ip_sockglue.c 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/ip_sockglue.c 2005-02-25 17:18:50.000000000 +0100 @@ -565,7 +565,7 @@ int ip_setsockopt(struct sock *sk, int l err = 0; break; } - dev = ip_dev_find(mreq.imr_address.s_addr); + dev = ip_dev_find(mreq.imr_address.s_addr, sk->sk_xid); if (dev) { mreq.imr_ifindex = dev->ifindex; dev_put(dev); diff -NurpP --minimal a/net/ipv4/netfilter/iptable_vnet.c b/net/ipv4/netfilter/iptable_vnet.c --- a/net/ipv4/netfilter/iptable_vnet.c 1970-01-01 01:00:00.000000000 +0100 +++ b/net/ipv4/netfilter/iptable_vnet.c 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,211 @@ +/* + * Copyright (C) 2004 Herbert Poetzl + * + * heavily based on the raw table by + * Jozsef Kadlecsik + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. + * + */ + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herbert Poetzl "); +MODULE_DESCRIPTION("iptables vnet table"); + +#define VNET_VALID_HOOKS ((1 << NF_IP_PRE_ROUTING) | (1 << NF_IP_LOCAL_OUT)) + +/* FIXME best match for now, move to header? 
*/ +#define PF_VNET PF_LOCAL + + +static struct +{ + struct ipt_replace repl; + struct ipt_standard entries[2]; + struct ipt_error term; +} initial_table __initdata += { + .repl = { + .name = "vnet", + .valid_hooks = VNET_VALID_HOOKS, + .num_entries = 3, + .size = sizeof(struct ipt_standard) * 2 + sizeof(struct ipt_error), + .hook_entry = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + .underflow = { + [NF_IP_PRE_ROUTING] = 0, + [NF_IP_LOCAL_OUT] = sizeof(struct ipt_standard) }, + }, + .entries = { + /* PRE_ROUTING */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + + /* LOCAL_OUT */ + { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_standard), + }, + .target = { + .target = { + .u = { + .target_size = IPT_ALIGN(sizeof(struct ipt_standard_target)), + }, + }, + .verdict = -NF_ACCEPT - 1, + }, + }, + }, + /* ERROR */ + .term = { + .entry = { + .target_offset = sizeof(struct ipt_entry), + .next_offset = sizeof(struct ipt_error), + }, + .target = { + .target = { + .u = { + .user = { + .target_size = IPT_ALIGN(sizeof(struct ipt_error_target)), + .name = IPT_ERROR_TARGET, + }, + }, + }, + .errorname = "ERROR", + }, + } +}; + +static struct ipt_table packet_vnet = { + .name = "vnet", + .valid_hooks = VNET_VALID_HOOKS, + .lock = RW_LOCK_UNLOCKED, + .me = THIS_MODULE +}; + +/* The work comes in here from netfilter.c. 
*/ +static unsigned int +ipt_route_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_vnet, NULL); +} + +static unsigned int +ipt_out_hook(unsigned int hook, + struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + int (*okfn)(struct sk_buff *)) +{ + return ipt_do_table(pskb, hook, in, out, &packet_vnet, NULL); +} + +static struct nf_hook_ops ipt_ops[] = { + { + .hook = ipt_route_hook, + .pf = PF_VNET, + .hooknum = NF_IP_PRE_ROUTING, + .priority = NF_IP_PRI_FIRST + }, + { + .hook = ipt_out_hook, + .pf = PF_INET, + .hooknum = NF_IP_LOCAL_OUT, + .priority = NF_IP_PRI_FIRST + }, +#if 0 + { + .hook = ipt_arp_hook, + .pf = NF_ARP, + .hooknum = NF_ARP_IN, + .priority = NF_IP_PRI_FIRST + }, + { + .hook = ipt_arp_hook, + .pf = NF_ARP, + .hooknum = NF_ARP_OUT, + .priority = NF_IP_PRI_FIRST + }, +#endif +}; + +static int __init init(void) +{ + int ret; + + /* Register table */ + ret = ipt_register_table(&packet_vnet, &initial_table.repl); + if (ret < 0) + return ret; + + /* Register hooks */ + ret = nf_register_hook(&ipt_ops[0]); + if (ret < 0) + goto cleanup_table; + + ret = nf_register_hook(&ipt_ops[1]); + if (ret < 0) + goto cleanup_hook0; + +#if 0 + ret = nf_register_hook(&ipt_ops[2]); + if (ret < 0) + goto cleanup_hook1; + + ret = nf_register_hook(&ipt_ops[3]); + if (ret < 0) + goto cleanup_hook2; +#endif + + return ret; + +#if 0 + cleanup_hook2: + nf_unregister_hook(&ipt_ops[2]); + cleanup_hook1: + nf_unregister_hook(&ipt_ops[1]); +#endif + cleanup_hook0: + nf_unregister_hook(&ipt_ops[0]); + cleanup_table: + ipt_unregister_table(&packet_vnet); + + return ret; +} + +static void __exit fini(void) +{ + unsigned int i; + + for (i = 0; i < sizeof(ipt_ops)/sizeof(struct nf_hook_ops); i++) + nf_unregister_hook(&ipt_ops[i]); + + ipt_unregister_table(&packet_vnet); +} + +module_init(init); 
+module_exit(fini); +MODULE_LICENSE("GPL"); diff -NurpP --minimal a/net/ipv4/netfilter/ipt_addrtype.c b/net/ipv4/netfilter/ipt_addrtype.c --- a/net/ipv4/netfilter/ipt_addrtype.c 2004-12-24 22:33:49.000000000 +0100 +++ b/net/ipv4/netfilter/ipt_addrtype.c 2005-02-25 17:18:50.000000000 +0100 @@ -22,9 +22,9 @@ MODULE_LICENSE("GPL"); MODULE_AUTHOR("Patrick McHardy "); MODULE_DESCRIPTION("iptables addrtype match"); -static inline int match_type(u_int32_t addr, u_int16_t mask) +static inline int match_type(u_int32_t addr, u_int16_t mask, nfxid_t nfxid) { - return !!(mask & (1 << inet_addr_type(addr))); + return !!(mask & (1 << inet_addr_type(addr, nfxid))); } static int match(const struct sk_buff *skb, const struct net_device *in, @@ -36,9 +36,11 @@ static int match(const struct sk_buff *s int ret = 1; if (info->source) - ret &= match_type(iph->saddr, info->source)^info->invert_source; + ret &= match_type(iph->saddr, info->source, + skb->nfxid) ^ info->invert_source; if (info->dest) - ret &= match_type(iph->daddr, info->dest)^info->invert_dest; + ret &= match_type(iph->daddr, info->dest, + skb->nfxid) ^ info->invert_dest; return ret; } diff -NurpP --minimal a/net/ipv4/netfilter/ipt_vnet.c b/net/ipv4/netfilter/ipt_vnet.c --- a/net/ipv4/netfilter/ipt_vnet.c 1970-01-01 01:00:00.000000000 +0100 +++ b/net/ipv4/netfilter/ipt_vnet.c 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,65 @@ +/* Kernel module to match NFVNET values. */ + +/* Copyright (C) 2004 Herbert Poetzl + * heavily based on the MARK stuff by Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include + +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herbert Poetzl "); +MODULE_DESCRIPTION("iptables vnet matching module"); + +static int +match(const struct sk_buff *skb, + const struct net_device *in, + const struct net_device *out, + const void *matchinfo, + int offset, + int *hotdrop) +{ + const struct ipt_vnet_info *info = matchinfo; + + return (skb->nfvnet == info->vnet); +} + +static int +checkentry(const char *tablename, + const struct ipt_ip *ip, + void *matchinfo, + unsigned int matchsize, + unsigned int hook_mask) +{ + if (matchsize != IPT_ALIGN(sizeof(struct ipt_vnet_info))) + return 0; + + return 1; +} + +static struct ipt_match vnet_match = { + .name = "vnet", + .match = &match, + .checkentry = &checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_match(&vnet_match); +} + +static void __exit fini(void) +{ + ipt_unregister_match(&vnet_match); +} + +module_init(init); +module_exit(fini); diff -NurpP --minimal a/net/ipv4/netfilter/ipt_VNET.c b/net/ipv4/netfilter/ipt_VNET.c --- a/net/ipv4/netfilter/ipt_VNET.c 1970-01-01 01:00:00.000000000 +0100 +++ b/net/ipv4/netfilter/ipt_VNET.c 2005-02-25 17:18:50.000000000 +0100 @@ -0,0 +1,133 @@ +/* This is a module which is used for setting the NFVNET field of an skb. */ + +/* Copyright (C) 2004 Herbert Poetzl + * heavily based on the MARK stuff by Marc Boucher + * + * This program is free software; you can redistribute it and/or modify + * it under the terms of the GNU General Public License version 2 as + * published by the Free Software Foundation. 
+ */ + +#include +#include +#include +#include + +#include +#include +#include +#include + +MODULE_LICENSE("GPL"); +MODULE_AUTHOR("Herbert Poetzl "); +MODULE_DESCRIPTION("iptables VNET modification module"); + +int netif_receive_skb_ngnet(struct sk_buff *skb); + +static unsigned int +target(struct sk_buff **pskb, + const struct net_device *in, + const struct net_device *out, + unsigned int hooknum, + const void *targinfo, + void *userinfo) +{ + const struct ipt_vnet_target_info *vnetinfo = targinfo; + struct vnet *vn; + + vxdprintk(VXD_CBIT(net, 1), + "vnet_target(%d,#%u) [%d,%d]", + (*pskb)->nfvnet, (*pskb)->nfxid, + vnetinfo->vnet, hooknum); + + switch (hooknum) { + case NF_IP_PRE_ROUTING: + + /* process pristine packets only */ + if ((*pskb)->nfvnet) + return IPT_CONTINUE; + + /* rewrite packet and reinject from vnet */ + if ((vn = vnet_get(vnetinfo->vnet))) { + (*pskb)->nfcache |= NFC_ALTERED; + (*pskb)->dev = vn->vndev; + (*pskb)->real_dev = vn->dev; + (*pskb)->nfxid = vn->nfxid; + (*pskb)->nfvnet = vn->vnet; + + vn->stats.rx_bytes += (*pskb)->len; + vn->stats.rx_packets++; + + vnet_put(vn); + + netif_receive_skb_ngnet(*pskb); + return NF_STOLEN; + } + + /* unknown vnet */ + return NF_DROP; + + case NF_IP_LOCAL_OUT: + + /* process pristine packets only */ + if ((*pskb)->nfvnet != VNET_UNTAGGED) + return IPT_CONTINUE; + + /* rewrite packet output */ + if ((vn = vnet_get(vnetinfo->vnet))) { + (*pskb)->nfcache |= NFC_ALTERED; + (*pskb)->nfxid = vn->nfxid; + (*pskb)->nfvnet = vn->vnet; + + vnet_put(vn); + + return NF_ACCEPT; + } + + /* unknown vnet */ + return NF_DROP; + } + return IPT_CONTINUE; +} + +static int +checkentry(const char *tablename, + const struct ipt_entry *e, + void *targinfo, + unsigned int targinfosize, + unsigned int hook_mask) +{ + if (targinfosize != IPT_ALIGN(sizeof(struct ipt_vnet_target_info))) { + printk(KERN_WARNING "VNET: targinfosize %u != %Zu\n", + targinfosize, + IPT_ALIGN(sizeof(struct ipt_vnet_target_info))); + return 0; + } + + if
(strcmp(tablename, "vnet") != 0) { + printk(KERN_WARNING "VNET: can only be called from \"vnet\" table, not \"%s\"\n", tablename); + return 0; + } + + return 1; +} + +static struct ipt_target ipt_vnet_reg = { + .name = "VNET", + .target = target, + .checkentry = checkentry, + .me = THIS_MODULE, +}; + +static int __init init(void) +{ + return ipt_register_target(&ipt_vnet_reg); +} + +static void __exit fini(void) +{ + ipt_unregister_target(&ipt_vnet_reg); +} + +module_init(init); +module_exit(fini); diff -NurpP --minimal a/net/ipv4/netfilter/Kconfig b/net/ipv4/netfilter/Kconfig --- a/net/ipv4/netfilter/Kconfig 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/netfilter/Kconfig 2005-02-25 17:18:50.000000000 +0100 @@ -168,6 +168,16 @@ config IP_NF_MATCH_MARK To compile it as a module, choose M here. If unsure, say N. +config IP_NF_MATCH_VNET + tristate "netfilter VNET match support" + depends on IP_NF_IPTABLES && VSERVER_NGNET + help + Netfilter vnet matching allows you to match packets based on the + `nfvnet' value in the packet. This can be set by the VNET target + (see below). + + To compile it as a module, choose M here. If unsure, say N. + config IP_NF_MATCH_MULTIPORT tristate "Multiple port match support" depends on IP_NF_IPTABLES @@ -545,6 +555,26 @@ config IP_NF_NAT_AMANDA default IP_NF_NAT if IP_NF_AMANDA=y default m if IP_NF_AMANDA=m +# vnet + specific targets +config IP_NF_VNET + tristate "Packet virtualization" + depends on IP_NF_IPTABLES && VSERVER_NGNET + help + This option adds a `vnet' table to iptables: see the man page for + iptables(8). This table is used for packet virtualization. + + To compile it as a module, choose M here. If unsure, say N. + +config IP_NF_TARGET_VNET + tristate "VNET target support" + depends on IP_NF_VNET + help + This option adds a `VNET' target, which allows you to create rules + in the `vnet' table which alter the netfilter vnet (nfvnet) field + associated with the packet prior to routing.
+ + To compile it as a module, choose M here. If unsure, say N. + # mangle + specific targets config IP_NF_MANGLE tristate "Packet mangling" diff -NurpP --minimal a/net/ipv4/netfilter/Makefile b/net/ipv4/netfilter/Makefile --- a/net/ipv4/netfilter/Makefile 2005-02-25 17:05:02.000000000 +0100 +++ b/net/ipv4/netfilter/Makefile 2005-02-25 17:18:50.000000000 +0100 @@ -30,6 +30,7 @@ obj-$(CONFIG_IP_NF_IPTABLES) += ip_table # the three instances of ip_tables obj-$(CONFIG_IP_NF_FILTER) += iptable_filter.o obj-$(CONFIG_IP_NF_MANGLE) += iptable_mangle.o +obj-$(CONFIG_IP_NF_VNET) += iptable_vnet.o obj-$(CONFIG_IP_NF_NAT) += iptable_nat.o obj-$(CONFIG_IP_NF_RAW) += iptable_raw.o @@ -39,6 +40,7 @@ obj-$(CONFIG_IP_NF_MATCH_LIMIT) += ipt_l obj-$(CONFIG_IP_NF_MATCH_HASHLIMIT) += ipt_hashlimit.o obj-$(CONFIG_IP_NF_MATCH_SCTP) += ipt_sctp.o obj-$(CONFIG_IP_NF_MATCH_MARK) += ipt_mark.o +obj-$(CONFIG_IP_NF_MATCH_VNET) += ipt_vnet.o obj-$(CONFIG_IP_NF_MATCH_MAC) += ipt_mac.o obj-$(CONFIG_IP_NF_MATCH_IPRANGE) += ipt_iprange.o obj-$(CONFIG_IP_NF_MATCH_PKTTYPE) += ipt_pkttype.o @@ -66,6 +68,7 @@ obj-$(CONFIG_IP_NF_TARGET_TOS) += ipt_TO obj-$(CONFIG_IP_NF_TARGET_ECN) += ipt_ECN.o obj-$(CONFIG_IP_NF_TARGET_DSCP) += ipt_DSCP.o obj-$(CONFIG_IP_NF_TARGET_MARK) += ipt_MARK.o +obj-$(CONFIG_IP_NF_TARGET_VNET) += ipt_VNET.o obj-$(CONFIG_IP_NF_TARGET_MASQUERADE) += ipt_MASQUERADE.o obj-$(CONFIG_IP_NF_TARGET_REDIRECT) += ipt_REDIRECT.o obj-$(CONFIG_IP_NF_TARGET_NETMAP) += ipt_NETMAP.o diff -NurpP --minimal a/net/ipv4/raw.c b/net/ipv4/raw.c --- a/net/ipv4/raw.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/raw.c 2005-02-25 17:18:50.000000000 +0100 @@ -79,6 +79,8 @@ #include #include #include +#include +#include struct hlist_head raw_v4_htable[RAWV4_HTABLE_SIZE]; DEFINE_RWLOCK(raw_v4_lock); @@ -102,30 +104,9 @@ static void raw_v4_unhash(struct sock *s write_unlock_bh(&raw_v4_lock); } - -/* - * Check if a given address matches for a socket - * - * nxi: the socket's nx_info if any - * addr: to be 
verified address - * saddr/baddr: socket addresses - */ -static inline int raw_addr_match ( - struct nx_info *nxi, - uint32_t addr, - uint32_t saddr, - uint32_t baddr) -{ - if (addr && (saddr == addr || baddr == addr)) - return 1; - if (!saddr) - return addr_in_nx_info(nxi, addr); - return 0; -} - struct sock *__raw_v4_lookup(struct sock *sk, unsigned short num, unsigned long raddr, unsigned long laddr, - int dif) + int dif, nfxid_t nfxid) { struct hlist_node *node; @@ -134,8 +115,8 @@ struct sock *__raw_v4_lookup(struct sock if (inet->num == num && !(inet->daddr && inet->daddr != raddr) && - raw_addr_match(sk->sk_nx_info, laddr, - inet->rcv_saddr, inet->rcv_saddr2) && + vx_sk_check(sk, nfxid) && + !(inet->rcv_saddr && inet->rcv_saddr != laddr) && !(sk->sk_bound_dev_if && sk->sk_bound_dev_if != dif)) goto found; /* gotcha */ } @@ -183,7 +164,7 @@ void raw_v4_input(struct sk_buff *skb, s goto out; sk = __raw_v4_lookup(__sk_head(head), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, skb->nfxid); while (sk) { if (iph->protocol != IPPROTO_ICMP || !icmp_filter(sk, skb)) { @@ -195,7 +176,7 @@ void raw_v4_input(struct sk_buff *skb, s } sk = __raw_v4_lookup(sk_next(sk), iph->protocol, iph->saddr, iph->daddr, - skb->dev->ifindex); + skb->dev->ifindex, skb->nfxid); } out: read_unlock(&raw_v4_lock); @@ -309,6 +290,7 @@ static int raw_send_hdrinc(struct sock * skb->priority = sk->sk_priority; skb->dst = dst_clone(&rt->u.dst); + vx_tag_output_skb(sk, skb); skb->nh.iph = iph = (struct iphdr *)skb_put(skb, length); @@ -330,10 +312,6 @@ static int raw_send_hdrinc(struct sock * iph->check = ip_fast_csum((unsigned char *)iph, iph->ihl); } - err = -EPERM; - if (!vx_check(0, VX_ADMIN) && !capable(CAP_NET_RAW) - && (!addr_in_nx_info(sk->sk_nx_info, iph->saddr))) - goto error; err = NF_HOOK(PF_INET, NF_IP_LOCAL_OUT, skb, NULL, rt->u.dst.dev, dst_output); @@ -409,6 +387,9 @@ static int raw_sendmsg(struct kiocb *ioc u8 tos; int err; + 
vxdprintk(VXD_CBIT(ngnet, 1), + "raw_sendmsg(%p[#%u])", sk, sk->sk_xid); + err = -EMSGSIZE; if (len < 0 || len > 0xFFFF) goto out; @@ -502,16 +483,11 @@ static int raw_sendmsg(struct kiocb *ioc .tos = tos } }, .proto = inet->hdrincl ? IPPROTO_RAW : sk->sk_protocol, + .nfxid = sk->sk_xid, }; if (!inet->hdrincl) raw_probe_proto_opt(&fl, msg); - if (sk->sk_nx_info) { - err = ip_find_src(sk->sk_nx_info, &rt, &fl); - - if (err) - goto done; - } err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); } if (err) @@ -576,7 +552,7 @@ static int raw_bind(struct sock *sk, str if (sk->sk_state != TCP_CLOSE || addr_len < sizeof(struct sockaddr_in)) goto out; - chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr); + chk_addr_ret = inet_addr_type(addr->sin_addr.s_addr, sk->sk_xid); ret = -EADDRNOTAVAIL; if (addr->sin_addr.s_addr && chk_addr_ret != RTN_LOCAL && chk_addr_ret != RTN_MULTICAST && chk_addr_ret != RTN_BROADCAST) diff -NurpP --minimal a/net/ipv4/route.c b/net/ipv4/route.c --- a/net/ipv4/route.c 2005-02-25 17:05:03.000000000 +0100 +++ b/net/ipv4/route.c 2005-02-25 17:18:50.000000000 +0100 @@ -90,6 +90,7 @@ #include #include #include +#include #include #include #include @@ -769,9 +770,10 @@ out: return 0; static inline int compare_keys(struct flowi *fl1, struct flowi *fl2) { - return memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, sizeof(fl1->nl_u.ip4_u)) == 0 && - fl1->oif == fl2->oif && - fl1->iif == fl2->iif; + return fl1->nfxid == fl2->nfxid && + memcmp(&fl1->nl_u.ip4_u, &fl2->nl_u.ip4_u, + sizeof(fl1->nl_u.ip4_u)) == 0 && + fl1->oif == fl2->oif && fl1->iif == fl2->iif; } static int rt_intern_hash(unsigned hash, struct rtable *rt, struct rtable **rp) @@ -998,7 +1000,7 @@ void ip_rt_redirect(u32 old_gw, u32 dadd if (IN_DEV_SEC_REDIRECTS(in_dev) && ip_fib_check_default(new_gw, dev)) goto reject_redirect; } else { - if (inet_addr_type(new_gw) != RTN_UNICAST) + if (inet_addr_type(new_gw, dev->nfxid) != RTN_UNICAST) goto reject_redirect; } @@ -1348,6 +1350,8 
@@ static void ipv4_dst_ifdown(struct dst_e { struct rtable *rt = (struct rtable *) dst; struct in_device *idev = rt->idev; + + /* FIXME block ifdown for vserver lo ? */ if (dev != &loopback_dev && idev && idev->dev == dev) { struct in_device *loopback_idev = in_dev_get(&loopback_dev); if (loopback_idev) { @@ -1560,7 +1564,8 @@ static int ip_route_input_slow(struct sk .fwmark = skb->nfmark #endif } }, - .iif = dev->ifindex }; + .iif = dev->ifindex, + .nfxid = dev->nfxid }; unsigned flags = 0; u32 itag = 0; struct rtable * rth; @@ -1571,6 +1576,10 @@ static int ip_route_input_slow(struct sk /* IP on this device is disabled. */ + vxdprintk(VXD_CBIT(ngnet, 1), + "ip_route_input_slow(%p[#%u],%p[%s,#%u])", + skb, skb->nfxid, dev, dev->name, dev->nfxid); + if (!in_dev) goto out; @@ -1595,6 +1604,7 @@ static int ip_route_input_slow(struct sk if (BADCLASS(daddr) || ZERONET(daddr) || LOOPBACK(daddr)) goto martian_destination; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_input_slow() -1-"); /* * Now we are ready to route packet. 
*/ @@ -1604,6 +1614,7 @@ static int ip_route_input_slow(struct sk goto no_route; } free_res = 1; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_input_slow() -2-"); RT_CACHE_STAT_INC(in_slow_tot); @@ -1612,9 +1623,13 @@ static int ip_route_input_slow(struct sk if (res.type == RTN_LOCAL) { int result; + struct net_device *loopback = __vx_loopback_dev(dev->nfxid); + result = fib_validate_source(saddr, daddr, tos, - loopback_dev.ifindex, + loopback->ifindex, dev, &spec_dst, &itag); + vxdprintk(VXD_CBIT(ngnet, 0), + "ip_route_input_slow() -3-"); if (result < 0) goto martian_source; if (result) @@ -1703,7 +1718,8 @@ done: in_dev_put(out_dev); if (free_res) fib_res_put(&res); -out: return err; +out: vxdprintk(VXD_CBIT(ngnet, 1), "ip_route_input_slow() = %d", err); + return err; brd_input: if (skb->protocol != htons(ETH_P_IP)) @@ -1822,6 +1838,10 @@ int ip_route_input(struct sk_buff *skb, unsigned hash; int iif = dev->ifindex; + vxdprintk(VXD_CBIT(ngnet, 1), + "ip_route_input(%p[#%u],%p[%s,#%u])", + skb, skb->nfxid, dev, dev->name, dev->nfxid); + tos &= IPTOS_RT_MASK; hash = rt_hash_code(daddr, saddr ^ (iif << 5), tos); @@ -1889,6 +1909,7 @@ int ip_route_input(struct sk_buff *skb, static int ip_route_output_slow(struct rtable **rp, const struct flowi *oldflp) { u32 tos = oldflp->fl4_tos & (IPTOS_RT_MASK | RTO_ONLINK); + struct net_device *loopback = __vx_loopback_dev(oldflp->nfxid); struct flowi fl = { .nl_u = { .ip4_u = { .daddr = oldflp->fl4_dst, .saddr = oldflp->fl4_src, @@ -1897,11 +1918,12 @@ static int ip_route_output_slow(struct r RT_SCOPE_LINK : RT_SCOPE_UNIVERSE), #ifdef CONFIG_IP_ROUTE_FWMARK - .fwmark = oldflp->fl4_fwmark + .fwmark = oldflp->fl4_fwmark, #endif } }, - .iif = loopback_dev.ifindex, - .oif = oldflp->oif }; + .iif = loopback->ifindex, + .oif = oldflp->oif, + .nfxid = oldflp->nfxid, }; struct fib_result res; unsigned flags = 0; struct rtable *rth; @@ -1911,6 +1933,7 @@ static int ip_route_output_slow(struct r int free_res = 0; int err; + 
vxdprintk(VXD_CBIT(ngnet, 1), "ip_route_output_slow()"); res.fi = NULL; #ifdef CONFIG_IP_MULTIPLE_TABLES res.r = NULL; @@ -1924,7 +1947,7 @@ static int ip_route_output_slow(struct r goto out; /* It is equivalent to inet_addr_type(saddr) == RTN_LOCAL */ - dev_out = ip_dev_find(oldflp->fl4_src); + dev_out = ip_dev_find(oldflp->fl4_src, oldflp->nfxid); if (dev_out == NULL) goto out; @@ -1960,6 +1983,7 @@ static int ip_route_output_slow(struct r dev_put(dev_out); dev_out = NULL; } + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -1-"); if (oldflp->oif) { dev_out = dev_get_by_index(oldflp->oif); err = -ENODEV; @@ -1985,20 +2009,23 @@ static int ip_route_output_slow(struct r RT_SCOPE_HOST); } } + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -2-"); if (!fl.fl4_dst) { fl.fl4_dst = fl.fl4_src; if (!fl.fl4_dst) fl.fl4_dst = fl.fl4_src = htonl(INADDR_LOOPBACK); + /* FIXME probably needs the vservers lo here */ if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = loopback; dev_hold(dev_out); - fl.oif = loopback_dev.ifindex; + fl.oif = loopback->ifindex; res.type = RTN_LOCAL; flags |= RTCF_LOCAL; goto make_route; } + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -3-"); if (fib_lookup(&fl, &res)) { res.fi = NULL; @@ -2033,13 +2060,15 @@ static int ip_route_output_slow(struct r goto out; } free_res = 1; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -4-"); if (res.type == RTN_LOCAL) { if (!fl.fl4_src) fl.fl4_src = fl.fl4_dst; + /* FIXME probably needs the vservers lo */ if (dev_out) dev_put(dev_out); - dev_out = &loopback_dev; + dev_out = loopback; dev_hold(dev_out); fl.oif = dev_out->ifindex; if (res.fi) @@ -2067,9 +2096,11 @@ static int ip_route_output_slow(struct r fl.oif = dev_out->ifindex; make_route: + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -4.0-"); if (LOOPBACK(fl.fl4_src) && !(dev_out->flags&IFF_LOOPBACK)) goto e_inval; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -4.1-"); if (fl.fl4_dst == 
0xFFFFFFFF) res.type = RTN_BROADCAST; else if (MULTICAST(fl.fl4_dst)) @@ -2077,13 +2108,17 @@ make_route: else if (BADCLASS(fl.fl4_dst) || ZERONET(fl.fl4_dst)) goto e_inval; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -4.2-"); if (dev_out->flags & IFF_LOOPBACK) flags |= RTCF_LOCAL; in_dev = in_dev_get(dev_out); + /* printk("dev_out = %p[%s,#%d], in_dev = %p\n", + dev_out, dev_out->name, dev_out->nfxid, in_dev); */ if (!in_dev) goto e_inval; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -4.3-"); if (res.type == RTN_BROADCAST) { flags |= RTCF_BROADCAST | RTCF_LOCAL; if (res.fi) { @@ -2108,6 +2143,7 @@ make_route: if (!rth) goto e_nobufs; + vxdprintk(VXD_CBIT(ngnet, 0), "ip_route_output_slow() -4.4-"); atomic_set(&rth->u.dst.__refcnt, 1); rth->u.dst.flags= DST_HOST; if (in_dev->cnf.no_xfrm) @@ -2121,6 +2157,7 @@ make_route: #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark= oldflp->fl4_fwmark; #endif + rth->fl.nfxid = oldflp->nfxid; rth->rt_dst = fl.fl4_dst; rth->rt_src = fl.fl4_src; rth->rt_iif = oldflp->oif ? 
: dev_out->ifindex; @@ -2169,7 +2206,8 @@ done: dev_put(dev_out); if (in_dev) in_dev_put(in_dev); -out: return err; +out: vxdprintk(VXD_CBIT(ngnet, 1), "ip_route_output_slow() = %d", err); + return err; e_inval: err = -EINVAL; @@ -2184,6 +2222,7 @@ int __ip_route_output_key(struct rtable unsigned hash; struct rtable *rth; + vxdprintk(VXD_CBIT(ngnet, 1), "__ip_route_output_key(#%d)", flp->nfxid); hash = rt_hash_code(flp->fl4_dst, flp->fl4_src ^ (flp->oif << 5), flp->fl4_tos); rcu_read_lock_bh(); @@ -2196,6 +2235,7 @@ int __ip_route_output_key(struct rtable #ifdef CONFIG_IP_ROUTE_FWMARK rth->fl.fl4_fwmark == flp->fl4_fwmark && #endif + rth->fl.nfxid == flp->nfxid && !((rth->fl.fl4_tos ^ flp->fl4_tos) & (IPTOS_RT_MASK | RTO_ONLINK))) { rth->u.dst.lastuse = jiffies; @@ -2217,6 +2257,7 @@ int ip_route_output_flow(struct rtable * { int err; + vxdprintk(VXD_CBIT(ngnet, 1), "ip_route_output_flow()"); if ((err = __ip_route_output_key(rp, flp)) != 0) return err; @@ -2233,6 +2274,7 @@ int ip_route_output_flow(struct rtable * int ip_route_output_key(struct rtable **rp, struct flowi *flp) { + vxdprintk(VXD_CBIT(ngnet, 1), "ip_route_output_key()"); return ip_route_output_flow(rp, flp, NULL, 0); } diff -NurpP --minimal a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c --- a/net/ipv4/tcp_ipv4.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/tcp_ipv4.c 2005-02-25 17:18:50.000000000 +0100 @@ -74,6 +74,9 @@ #include #include #include + +#include +#include #include extern int sysctl_ip_dynaddr; @@ -182,20 +185,23 @@ void tcp_bind_hash(struct sock *sk, stru static inline int tcp_bind_conflict(struct sock *sk, struct tcp_bind_bucket *tb) { + const u32 sk_rcv_saddr = tcp_v4_rcv_saddr(sk); struct sock *sk2; struct hlist_node *node; int reuse = sk->sk_reuse; sk_for_each_bound(sk2, node, &tb->owners) { if (sk != sk2 && + (sk->sk_xid == sk2->sk_xid) && !tcp_v6_ipv6only(sk2) && (!sk->sk_bound_dev_if || !sk2->sk_bound_dev_if || sk->sk_bound_dev_if == sk2->sk_bound_dev_if)) { if (!reuse || 
!sk2->sk_reuse || sk2->sk_state == TCP_LISTEN) { - if (nx_addr_conflict(sk->sk_nx_info, - tcp_v4_rcv_saddr(sk), sk2)) + const u32 sk2_rcv_saddr = tcp_v4_rcv_saddr(sk2); + if (!sk2_rcv_saddr || !sk_rcv_saddr || + sk2_rcv_saddr == sk_rcv_saddr) break; } } @@ -404,26 +410,6 @@ void tcp_unhash(struct sock *sk) wake_up(&tcp_lhash_wait); } - -/* - * Check if a given address matches for a tcp socket - * - * nxi: the socket's nx_info if any - * addr: to be verified address - * saddr: socket addresses - */ -static inline int tcp_addr_match ( - struct nx_info *nxi, - uint32_t addr, - uint32_t saddr) -{ - if (addr && (saddr == addr)) - return 1; - if (!saddr) - return addr_in_nx_info(nxi, addr); - return 0; -} - /* Don't inline this cruft. Here are some nice properties to * exploit here. The BSD API does not allow a listening TCP * to specify the remote port nor the remote address for the @@ -431,64 +417,88 @@ static inline int tcp_addr_match ( * during the search since they can never be otherwise. */ static struct sock *__tcp_v4_lookup_listener(struct hlist_head *head, u32 daddr, - unsigned short hnum, int dif) + unsigned short hnum, int dif, nfxid_t nfxid) { struct sock *result = NULL, *sk; struct hlist_node *node; int score, hiscore; + vxdprintk(VXD_CBIT(ngnet, 9), + "__tcp_v4_lookup_listener(#%u) %u.%u.%u.%u:%u", + nfxid, NIPQUAD(daddr), hnum); + hiscore=-1; sk_for_each(sk, node, head) { struct inet_sock *inet = inet_sk(sk); + if (!vx_sk_check(sk, nfxid)) + continue; if (inet->num == hnum && !ipv6_only_sock(sk)) { __u32 rcv_saddr = inet->rcv_saddr; score = (sk->sk_family == PF_INET ? 
1 : 0); - if (tcp_addr_match(sk->sk_nx_info, daddr, rcv_saddr)) + if (rcv_saddr) { + if (rcv_saddr != daddr) + continue; score+=2; - else - continue; + } if (sk->sk_bound_dev_if) { if (sk->sk_bound_dev_if != dif) continue; score+=2; } - if (score == 5) - return sk; + if (score == 5) { + result = sk; + goto out; + } if (score > hiscore) { hiscore = score; result = sk; } } } +out: + vxdprintk(VXD_CBIT(ngnet, 9), + "__tcp_v4_lookup_listener(#%u) = %p[#%u]", + nfxid, result, result?result->sk_xid:0); return result; } /* Optimize the common listener case. */ static inline struct sock *tcp_v4_lookup_listener(u32 daddr, - unsigned short hnum, int dif) + unsigned short hnum, int dif, nfxid_t nfxid) { struct sock *sk = NULL; struct hlist_head *head; + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_lookup_listener(#%u) %u.%u.%u.%u:%u", + nfxid, NIPQUAD(daddr), hnum); + read_lock(&tcp_lhash_lock); + /* FIXME maybe better hash for ngnet/nfxid */ head = &tcp_listening_hash[tcp_lhashfn(hnum)]; if (!hlist_empty(head)) { struct inet_sock *inet = inet_sk((sk = __sk_head(head))); + if (!vx_sk_check(sk, nfxid)) + goto no_sherry; if (inet->num == hnum && !sk->sk_node.next && + (!inet->rcv_saddr || inet->rcv_saddr == daddr) && (sk->sk_family == PF_INET || !ipv6_only_sock(sk)) && - tcp_addr_match(sk->sk_nx_info, daddr, inet->rcv_saddr) && !sk->sk_bound_dev_if) goto sherry_cache; - sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif); + no_sherry: + sk = __tcp_v4_lookup_listener(head, daddr, hnum, dif, nfxid); } if (sk) { sherry_cache: sock_hold(sk); } read_unlock(&tcp_lhash_lock); + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_lookup_listener(#%u) = %p[#%u]", + nfxid, sk, sk?sk->sk_xid:0); return sk; } @@ -500,7 +510,7 @@ sherry_cache: static inline struct sock *__tcp_v4_lookup_established(u32 saddr, u16 sport, u32 daddr, u16 hnum, - int dif) + int dif, nfxid_t nfxid) { struct tcp_ehash_bucket *head; TCP_V4_ADDR_COOKIE(acookie, saddr, daddr) @@ -513,19 +523,31 @@ static inline struct sock 
*__tcp_v4_look int hash = tcp_hashfn(daddr, hnum, saddr, sport); head = &tcp_ehash[hash]; read_lock(&head->lock); + + vxdprintk(VXD_CBIT(ngnet, 9), + "__tcp_v4_lookup_established(#%u) %u.%u.%u.%u:%u <- %u.%u.%u.%u:%u", + nfxid, NIPQUAD(daddr), hnum, NIPQUAD(saddr), sport); + sk_for_each(sk, node, &head->chain) { + if (!vx_sk_check(sk, nfxid)) + continue; if (TCP_IPV4_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; /* You sunk my battleship! */ } /* Must check for a TIME_WAIT'er before going to listener hash. */ sk_for_each(sk, node, &(head + tcp_ehash_size)->chain) { + if (!vx_sk_check(sk, nfxid)) + continue; if (TCP_IPV4_TW_MATCH(sk, acookie, saddr, daddr, ports, dif)) goto hit; } sk = NULL; out: read_unlock(&head->lock); + vxdprintk(VXD_CBIT(ngnet, 9), + "__tcp_v4_lookup_established(#%u) = %p[#%u]", + nfxid, sk, sk?sk->sk_xid:0); return sk; hit: sock_hold(sk); @@ -533,21 +555,33 @@ hit: } static inline struct sock *__tcp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 hnum, int dif) + u32 daddr, u16 hnum, + int dif, nfxid_t nfxid) { - struct sock *sk = __tcp_v4_lookup_established(saddr, sport, - daddr, hnum, dif); + struct sock *sk; - return sk ? 
: tcp_v4_lookup_listener(daddr, hnum, dif); + vxdprintk(VXD_CBIT(ngnet, 8), + "__tcp_v4_lookup(#%u) %u.%u.%u.%u:%u <- %u.%u.%u.%u:%u", + nfxid, NIPQUAD(daddr), hnum, NIPQUAD(saddr), sport); + + sk = __tcp_v4_lookup_established(saddr, sport, + daddr, hnum, dif, nfxid); + if (!sk) + sk = tcp_v4_lookup_listener(daddr, hnum, dif, nfxid); + + vxdprintk(VXD_CBIT(ngnet, 8), + "__tcp_v4_lookup(#%u) = %p[#%u]", + nfxid, sk, sk?sk->sk_xid:0); + return sk; } inline struct sock *tcp_v4_lookup(u32 saddr, u16 sport, u32 daddr, - u16 dport, int dif) + u16 dport, int dif, nfxid_t nfxid) { struct sock *sk; local_bh_disable(); - sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif); + sk = __tcp_v4_lookup(saddr, sport, daddr, ntohs(dport), dif, nfxid); local_bh_enable(); return sk; @@ -585,6 +619,8 @@ static int __tcp_v4_check_established(st sk_for_each(sk2, node, &(head + tcp_ehash_size)->chain) { tw = (struct tcp_tw_bucket *)sk2; + if (!vx_sk_check(sk, vx_sk_xid(sk2))) + continue; if (TCP_IPV4_TW_MATCH(sk2, acookie, saddr, daddr, ports, dif)) { struct tcp_sock *tp = tcp_sk(sk); @@ -621,6 +657,8 @@ static int __tcp_v4_check_established(st /* And established part... 
*/ sk_for_each(sk2, node, &head->chain) { + if (!vx_sk_check(sk, vx_sk_xid(sk2))) + continue; if (TCP_IPV4_MATCH(sk2, acookie, saddr, daddr, ports, dif)) goto not_unique; } @@ -769,6 +807,9 @@ int tcp_v4_connect(struct sock *sk, stru int tmp; int err; + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_connect(%p[#%u])", sk, sk->sk_xid); + if (addr_len < sizeof(struct sockaddr_in)) return -EINVAL; @@ -1011,7 +1052,7 @@ void tcp_v4_err(struct sk_buff *skb, u32 } sk = tcp_v4_lookup(iph->daddr, th->dest, iph->saddr, - th->source, tcp_v4_iif(skb)); + th->source, tcp_v4_iif(skb), skb->nfxid); if (!sk) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; @@ -1291,7 +1332,11 @@ static struct dst_entry* tcp_v4_route_re .proto = IPPROTO_TCP, .uli_u = { .ports = { .sport = inet_sk(sk)->sport, - .dport = req->rmt_port } } }; + .dport = req->rmt_port } }, + .nfxid = sk->sk_xid }; + + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_route_req(%p[#%u],%p)", sk, sk->sk_xid, req); if (ip_route_output_flow(&rt, &fl, sk, 0)) { IP_INC_STATS_BH(IPSTATS_MIB_OUTNOROUTES); @@ -1423,6 +1468,10 @@ int tcp_v4_conn_request(struct sock *sk, #define want_cookie 0 /* Argh, why doesn't gcc optimize this :( */ #endif + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_conn_request(%p[#%u],%p[#%u])", + sk, sk->sk_xid, skb, skb->nfxid); + /* Never answer to SYNs send to broadcast or multicast */ if (((struct rtable *)skb->dst)->rt_flags & (RTCF_BROADCAST | RTCF_MULTICAST)) @@ -1571,6 +1620,10 @@ struct sock *tcp_v4_syn_recv_sock(struct struct tcp_sock *newtp; struct sock *newsk; + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_syn_recv_sock(%p[#%u],%p[#%u],%p,%p)", + sk, sk->sk_xid, skb, skb->nfxid, req, dst); + if (sk_acceptq_is_full(sk)) goto exit_overflow; @@ -1629,11 +1682,9 @@ static struct sock *tcp_v4_hnd_req(struc if (req) return tcp_check_req(sk, skb, req, prev); - nsk = __tcp_v4_lookup_established(skb->nh.iph->saddr, - th->source, - skb->nh.iph->daddr, - ntohs(th->dest), - tcp_v4_iif(skb)); + nsk = 
__tcp_v4_lookup_established(skb->nh.iph->saddr, th->source, + skb->nh.iph->daddr, ntohs(th->dest), + tcp_v4_iif(skb), skb->nfxid); if (nsk) { if (nsk->sk_state != TCP_TIME_WAIT) { @@ -1688,6 +1739,10 @@ static int tcp_v4_checksum_init(struct s */ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb) { + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_do_rcv(%p[#%u],%p[#%u])", + sk, sk->sk_xid, skb, skb->nfxid); + if (sk->sk_state == TCP_ESTABLISHED) { /* Fast path */ TCP_CHECK_TIMER(sk); if (tcp_rcv_established(sk, skb, skb->h.th, skb->len)) @@ -1743,6 +1798,9 @@ int tcp_v4_rcv(struct sk_buff *skb) struct sock *sk; int ret; + vxdprintk(VXD_CBIT(ngnet, 8), + "tcp_v4_rcv(%p[#%u])", skb, skb->nfxid); + if (skb->pkt_type != PACKET_HOST) goto discard_it; @@ -1778,7 +1836,7 @@ int tcp_v4_rcv(struct sk_buff *skb) sk = __tcp_v4_lookup(skb->nh.iph->saddr, th->source, skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + tcp_v4_iif(skb), skb->nfxid); if (!sk) goto no_tcp_socket; @@ -1844,7 +1902,8 @@ do_time_wait: case TCP_TW_SYN: { struct sock *sk2 = tcp_v4_lookup_listener(skb->nh.iph->daddr, ntohs(th->dest), - tcp_v4_iif(skb)); + tcp_v4_iif(skb), + skb->nfxid); if (sk2) { tcp_tw_deschedule((struct tcp_tw_bucket *)sk); tcp_tw_put((struct tcp_tw_bucket *)sk); diff -NurpP --minimal a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c --- a/net/ipv4/tcp_minisocks.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/tcp_minisocks.c 2005-02-25 17:18:50.000000000 +0100 @@ -31,6 +31,7 @@ #include #include +#include #ifdef CONFIG_SYSCTL #define SYNC_INIT 0 /* let the user enable it */ @@ -369,8 +370,6 @@ void tcp_time_wait(struct sock *sk, int tw->tw_xid = sk->sk_xid; tw->tw_vx_info = NULL; - tw->tw_nid = sk->sk_nid; - tw->tw_nx_info = NULL; #if defined(CONFIG_IPV6) || defined(CONFIG_IPV6_MODULE) if (tw->tw_family == PF_INET6) { @@ -699,6 +698,10 @@ struct sock *tcp_create_openreq_child(st * slabcache (i.e. is it TCPv4 or v6?) 
-acme */ struct sock *newsk = sk_alloc(PF_INET, GFP_ATOMIC, 0, sk->sk_prot->slab); + vxdprintk(VXD_CBIT(ngnet, 9), + "tcp_create_openreq_child(%p[#%u],%p,%p[#%u])", + sk, sk->sk_xid, req, skb, skb->nfxid); + if(newsk != NULL) { struct tcp_sock *newtp; struct sk_filter *filter; @@ -708,7 +711,6 @@ struct sock *tcp_create_openreq_child(st /* SANITY */ sock_vx_init(newsk); - sock_nx_init(newsk); sk_node_init(&newsk->sk_node); tcp_sk(newsk)->bind_hash = NULL; @@ -811,8 +813,6 @@ struct sock *tcp_create_openreq_child(st set_vx_info(&newsk->sk_vx_info, sk->sk_vx_info); newsk->sk_xid = sk->sk_xid; vx_sock_inc(newsk); - set_nx_info(&newsk->sk_nx_info, sk->sk_nx_info); - newsk->sk_nid = sk->sk_nid; #ifdef INET_REFCNT_DEBUG atomic_inc(&inet_sock_nr); #endif @@ -1069,6 +1069,11 @@ int tcp_child_process(struct sock *paren int ret = 0; int state = child->sk_state; + vxdprintk(VXD_CBIT(ngnet, 9), + "tcp_child_process(%p[#%u],%p[#%u],%p[#%u])", + parent, parent->sk_xid, child, child->sk_xid, + skb, skb->nfxid); + if (!sock_owned_by_user(child)) { ret = tcp_rcv_state_process(child, skb, skb->h.th, skb->len); diff -NurpP --minimal a/net/ipv4/udp.c b/net/ipv4/udp.c --- a/net/ipv4/udp.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv4/udp.c 2005-02-25 17:18:50.000000000 +0100 @@ -107,6 +107,8 @@ #include #include #include +#include +#include /* * Snmp MIB for the UDP layer @@ -174,12 +176,14 @@ gotit: struct inet_sock *inet2 = inet_sk(sk2); if (inet2->num == snum && - sk2 != sk && !ipv6_only_sock(sk2) && + sk2 != sk && + !ipv6_only_sock(sk2) && (!sk2->sk_bound_dev_if || !sk->sk_bound_dev_if || sk2->sk_bound_dev_if == sk->sk_bound_dev_if) && - nx_addr_conflict(sk->sk_nx_info, - tcp_v4_rcv_saddr(sk), sk2) && + (!inet2->rcv_saddr || + !inet->rcv_saddr || + inet2->rcv_saddr == inet->rcv_saddr) && (!sk2->sk_reuse || !sk->sk_reuse)) goto fail; } @@ -214,22 +218,12 @@ static void udp_v4_unhash(struct sock *s write_unlock_bh(&udp_hash_lock); } -static inline int udp_in_list(struct nx_info 
*nx_info, u32 addr) -{ - int n = nx_info->nbipv4; - int i; - - for (i=0; i<n; i++) - if (nx_info->ipv4[i] == addr) - return 1; - return 0; -} - /* UDP is nearly always wildcards out the wazoo, it makes no sense to try * harder than this. -DaveM */ static struct sock *udp_v4_lookup_longway(u32 saddr, u16 sport, - u32 daddr, u16 dport, int dif) + u32 daddr, u16 dport, + int dif, nfxid_t nfxid) { struct sock *sk, *result = NULL; struct hlist_node *node; @@ -239,17 +233,14 @@ static struct sock *udp_v4_lookup_longwa sk_for_each(sk, node, &udp_hash[hnum & (UDP_HTABLE_SIZE - 1)]) { struct inet_sock *inet = inet_sk(sk); + if (!vx_sk_check(sk, nfxid)) + continue; if (inet->num == hnum && !ipv6_only_sock(sk)) { int score = (sk->sk_family == PF_INET ? 1 : 0); if (inet->rcv_saddr) { if (inet->rcv_saddr != daddr) continue; score+=2; - } else if (sk->sk_nx_info) { - if (udp_in_list(sk->sk_nx_info, daddr)) - score+=2; - else - continue; } if (inet->daddr) { if (inet->daddr != saddr) @@ -279,12 +270,13 @@ static struct sock *udp_v4_lookup_longwa } static __inline__ struct sock *udp_v4_lookup(u32 saddr, u16 sport, - u32 daddr, u16 dport, int dif) + u32 daddr, u16 dport, + int dif, nfxid_t nfxid) { struct sock *sk; read_lock(&udp_hash_lock); - sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif); + sk = udp_v4_lookup_longway(saddr, sport, daddr, dport, dif, nfxid); if (sk) sock_hold(sk); read_unlock(&udp_hash_lock); @@ -306,8 +298,7 @@ static inline struct sock *udp_v4_mcast_ if (inet->num != hnum || (inet->daddr && inet->daddr != rmt_addr) || (inet->dport != rmt_port && inet->dport) || - (inet->rcv_saddr && inet->rcv_saddr != loc_addr && - inet->rcv_saddr2 && inet->rcv_saddr2 != loc_addr) || + (inet->rcv_saddr && inet->rcv_saddr != loc_addr) || ipv6_only_sock(s) || (s->sk_bound_dev_if && s->sk_bound_dev_if != dif)) continue; @@ -342,7 +333,8 @@ void udp_err(struct sk_buff *skb, u32 in int harderr; int err; - sk = udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, skb->dev->ifindex); + sk 
= udp_v4_lookup(iph->daddr, uh->dest, iph->saddr, uh->source, + skb->dev->ifindex, skb->nfxid); if (sk == NULL) { ICMP_INC_STATS_BH(ICMP_MIB_INERRORS); return; /* No socket for error */ @@ -510,6 +502,9 @@ int udp_sendmsg(struct kiocb *iocb, stru int err; int corkreq = up->corkflag || msg->msg_flags&MSG_MORE; + vxdprintk(VXD_CBIT(ngnet, 1), + "udp_sendmsg(%p[#%u])", sk, sk->sk_xid); + if (len > 0xFFFF) return -EMSGSIZE; @@ -615,16 +610,9 @@ int udp_sendmsg(struct kiocb *iocb, stru .proto = IPPROTO_UDP, .uli_u = { .ports = { .sport = inet->sport, - .dport = dport } } }; - struct nx_info *nxi = sk->sk_nx_info; + .dport = dport } }, + .nfxid = sk->sk_xid }; - if (nxi) { - err = ip_find_src(nxi, &rt, &fl); - if (err) - goto out; - if (daddr == IPI_LOOPBACK && !vx_check(0, VX_ADMIN)) - daddr = fl.fl4_dst = nxi->ipv4[0]; - } err = ip_route_output_flow(&rt, &fl, sk, !(msg->msg_flags&MSG_DONTWAIT)); if (err) goto out; @@ -1170,7 +1158,8 @@ int udp_rcv(struct sk_buff *skb) if(rt->rt_flags & (RTCF_BROADCAST|RTCF_MULTICAST)) return udp_v4_mcast_deliver(skb, uh, saddr, daddr); - sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, skb->dev->ifindex); + sk = udp_v4_lookup(saddr, uh->source, daddr, uh->dest, + skb->dev->ifindex, skb->nfxid); if (sk != NULL) { int ret = udp_queue_rcv_skb(sk, skb); diff -NurpP --minimal a/net/ipv6/addrconf.c b/net/ipv6/addrconf.c --- a/net/ipv6/addrconf.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/ipv6/addrconf.c 2005-02-25 17:18:50.000000000 +0100 @@ -2684,10 +2684,6 @@ static int inet6_dump_addr(struct sk_buf struct ifmcaddr6 *ifmca; struct ifacaddr6 *ifaca; - /* no ipv6 inside a vserver for now */ - if (skb->sk && skb->sk->sk_vx_info) - return skb->len; - s_idx = cb->args[0]; s_ip_idx = ip_idx = cb->args[1]; read_lock(&dev_base_lock); @@ -2907,10 +2903,6 @@ static int inet6_dump_ifinfo(struct sk_b struct net_device *dev; struct inet6_dev *idev; - /* no ipv6 inside a vserver for now */ - if (skb->sk && skb->sk->sk_vx_info) - return 
skb->len; - read_lock(&dev_base_lock); for (dev=dev_base, idx=0; dev; dev = dev->next, idx++) { if (idx < s_idx) diff -NurpP --minimal a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c --- a/net/netlink/af_netlink.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/netlink/af_netlink.c 2005-02-25 17:18:50.000000000 +0100 @@ -55,6 +55,7 @@ #include #include #include +#include #define Nprintk(a...) @@ -650,7 +651,8 @@ int netlink_sendskb(struct sock *sk, str return len; } #endif - + /* FIXME move it up to origin? */ + vx_tag_netlink_skb(sk, skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, len); sock_put(sk); @@ -723,6 +725,8 @@ static __inline__ int netlink_broadcast_ if (atomic_read(&sk->sk_rmem_alloc) <= sk->sk_rcvbuf && !test_bit(0, &nlk->state)) { skb_set_owner_r(skb, sk); + /* FIXME move it up to origin? */ + vx_tag_netlink_skb(sk, skb); skb_queue_tail(&sk->sk_receive_queue, skb); sk->sk_data_ready(sk, skb->len); return atomic_read(&sk->sk_rmem_alloc) > sk->sk_rcvbuf; @@ -1096,6 +1100,7 @@ static int netlink_dump(struct sock *sk) if (!skb) return -ENOBUFS; + vx_tag_netlink_skb(sk, skb); spin_lock(&nlk->cb_lock); cb = nlk->cb; diff -NurpP --minimal a/net/socket.c b/net/socket.c --- a/net/socket.c 2005-02-25 17:05:42.000000000 +0100 +++ b/net/socket.c 2005-02-25 17:18:50.000000000 +0100 @@ -551,9 +551,8 @@ static inline int __sock_sendmsg(struct vx_sock_fail(sock->sk, size); } vxdprintk(VXD_CBIT(net, 7), - "__sock_sendmsg: %p[%p,%p,%p;%d]:%d/%d", + "__sock_sendmsg: %p[%p,%p;#%d]:%d/%d", sock, sock->sk, - (sock->sk)?sock->sk->sk_nx_info:0, (sock->sk)?sock->sk->sk_vx_info:0, (sock->sk)?sock->sk->sk_xid:0, (unsigned int)size, len); @@ -612,9 +611,8 @@ static inline int __sock_recvmsg(struct if ((len >= 0) && sock->sk) vx_sock_recv(sock->sk, len); vxdprintk(VXD_CBIT(net, 7), - "__sock_recvmsg: %p[%p,%p,%p;%d]:%d/%d", + "__sock_recvmsg: %p[%p,%p;#%d]:%d/%d", sock, sock->sk, - (sock->sk)?sock->sk->sk_nx_info:0, (sock->sk)?sock->sk->sk_vx_info:0, 
(sock->sk)?sock->sk->sk_xid:0, (unsigned int)size, len); @@ -1106,10 +1104,6 @@ static int __sock_create(int family, int if (type < 0 || type >= SOCK_MAX) return -EINVAL; - /* disable IPv6 inside vservers for now */ - if (family == PF_INET6 && !vx_check(0, VX_ADMIN)) - return -EAFNOSUPPORT; - /* Compatibility. This uglymoron is moved from INET layer to here to avoid @@ -1188,6 +1182,10 @@ static int __sock_create(int family, int *res = sock; security_socket_post_create(sock, family, type, protocol, kern); +#if 0 + printk("__sock_create(%d,%d,%d,,%d) %p[#%d]\n", + family, type, protocol, kern, sock->sk, sock->sk->sk_xid); +#endif out: net_family_read_unlock(); return err; diff -NurpP --minimal a/net/xfrm/xfrm_policy.c b/net/xfrm/xfrm_policy.c --- a/net/xfrm/xfrm_policy.c 2005-02-25 17:05:04.000000000 +0100 +++ b/net/xfrm/xfrm_policy.c 2005-02-25 17:18:50.000000000 +0100 @@ -735,6 +735,10 @@ int xfrm_lookup(struct dst_entry **dst_p int err; u32 genid; u16 family = dst_orig->ops->family; + + vxdprintk(VXD_CBIT(ngnet, 1), + "xfrm_lookup(%p[#%u]) #%d (%p[#%u])", + dst_p, (*dst_p)->fl.nfxid, fl->nfxid, sk, sk ? sk->sk_xid : 0); restart: genid = atomic_read(&flow_cache_genid); policy = NULL;