Howdy!
Here is a pleasant observation: Snabb Switch's recent evolution has been bringing our internal software design more closely into line with the end-to-end principle.
The end-to-end principle says to put complexity at the edges of a network and keep the bits in the middle simple. Prime example: the Internet has scaled by putting a lot of complexity in the endpoints (TCP, HTTP, SSL) but keeping the intermediate hops simple (Ethernet, IP, GRE). This has allowed applications and infrastructure to evolve independently, and indeed the Internet has scaled up remarkably well in all respects.
Our app network has recently been pushing complexity to the edges too. The programming model for an app is almost exactly like the programming model for a real device: you have some links that carry a stream of packets, and each packet is a blob of binary data that you can process however you want, and there is really not much more to it than that.
This is unusual in networking stacks. It is more common for packets to be big data structures with things like mutexes, reference counts, cached protocol headers, extra reserved space, checksum validity information, multiple data buffers chained together, and so on. This information can be useful, but if every app in the network has to carefully maintain it then it can also be complex and expensive. My reflection is that making all code that deals with packets worry about these details is contrary to the end-to-end principle, like making every internet switch and router deal with TCP/HTTP/SSL just because some endpoints use those protocols.
End of reflection :). But if you are feeling curious then below is a little summary of how our packet structure has evolved and a comparison with a more traditional one.
This is what our packet structure looks like today (simple, spartan, SIMD-friendly):
// A packet is nothing more than a flat byte buffer plus a length.
struct packet {
unsigned char data[PACKET_PAYLOAD_SIZE]; // raw packet bytes (fixed-size backing store)
uint16_t length; // number of valid bytes in data
};
Can't be much simpler than this, can it? (Indeed, I won't be shocked if we find a good reason to extend this again in the future, but that is not the current trend.)
Our previous one was also quite simple, but included checksum metadata ("checksum is already done" flag and "checksum needs to be done before transmission" flag). That made it possible for endpoint apps to communicate information to each other (e.g. between the Intel10G app and the Virtio-net app) but at the expense of every app on the path needing to deal with this (e.g. when encapsulating/decapsulating packets):
// Previous packet struct: the flat buffer plus checksum-offload metadata
// (flags, csum_start, csum_offset) that every app on the path had to
// preserve and update, e.g. when encapsulating/decapsulating.
struct packet {
unsigned char data[PACKET_PAYLOAD_SIZE]; // raw packet bytes
uint16_t length; // data payload length
uint16_t flags; // see packet_flags enum below
uint16_t csum_start; // position where checksum starts
uint16_t csum_offset; // offset (after start) to store checksum
};
// Bit flags stored in packet.flags: checksum state communicated
// between endpoint apps (e.g. Intel10G and Virtio-net).
enum packet_flags {
PACKET_NEEDS_CSUM = 1, // Layer-4 checksum needs to be computed
PACKET_CSUM_VALID = 2 // checksums are known to be correct
};
... and this one was already much simpler than our previous version, which allowed multiple buffers to be chained together and kept track of where memory was allocated (e.g. within a particular virtual machine):
struct buffer; // forward declaration; full definition not shown here
// Records where a buffer's memory came from, so it can be returned to
// its owner (e.g. a virtio ring inside a virtual machine) when freed.
struct buffer_origin {
enum buffer_origin_type {
BUFFER_ORIGIN_UNKNOWN = 0,
BUFFER_ORIGIN_VIRTIO = 1
// NUMA...
} type;
// Per-origin-type details; only the virtio variant existed at this point.
union buffer_origin_info {
struct buffer_origin_info_virtio {
int16_t device_id;
int16_t ring_id;
int16_t header_id;
char *header_pointer; // virtual address in this process
uint32_t total_size; // how many bytes in all buffers
} virtio;
} info;
};
// A packet_iovec describes a portion of a buffer.
struct packet_iovec {
struct buffer *buffer; // the buffer this slice refers to
uint32_t offset; // start of the slice within the buffer
uint32_t length; // number of bytes in the slice
};
// Offload metadata carried alongside the packet: checksum and
// generic-segmentation-offload (GSO) parameters.
struct packet_info {
uint8_t flags; // see below
uint8_t gso_flags; // see below
uint16_t hdr_len; // ethernet + ip + tcp/udp header length
uint16_t gso_size; // bytes of post-header payload per segment
uint16_t csum_start; // position where checksum starts
uint16_t csum_offset; // offset (after start) to store checksum
};
// Multi-buffer packet: a reference-counted chain of buffer slices
// (iovecs) plus offload metadata.
struct packet {
int32_t refcount; // shared-ownership count for the packet
int32_t color; // NOTE(review): meaning not shown here -- presumably a tracing/ownership tag
struct packet_info info; // checksum/GSO offload metadata (see above)
int niovecs; // number of valid entries in iovecs[]
int length; // total payload bytes across all iovecs
struct packet_iovec iovecs[PACKET_IOVEC_MAX]; // buffer slices making up the payload
} __attribute__ ((aligned(64))); // 64-byte alignment -- presumably cache-line sized; confirm target arch
... which was pretty complex but still considerably more spartan than a genuine traditional one like the Linux struct sk_buff:
/* Linux's struct sk_buff, quoted verbatim from the kernel for comparison.
 * Every field below is per-packet metadata that generic kernel networking
 * code may need to read, preserve, or update as a packet traverses layers.
 * NOTE(review): field semantics are the kernel's, not documented here --
 * see the kernel's skbuff.h for authoritative descriptions. */
struct sk_buff {
/* These two members must be first. */
struct sk_buff *next;
struct sk_buff *prev;
union {
ktime_t tstamp;
struct skb_mstamp skb_mstamp;
};
struct sock *sk;
struct net_device *dev;
/*
* This is the control buffer. It is free to use for every
* layer. Please put your private variables there. If you
* want to keep them across layers you have to do a skb_clone()
* first. This is owned by whoever has the skb queued ATM.
*/
char cb[48] __aligned(8);
unsigned long _skb_refdst;
void (*destructor)(struct sk_buff *skb);
#ifdef CONFIG_XFRM
struct sec_path *sp;
#endif
#if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
struct nf_conntrack *nfct;
#endif
#if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
struct nf_bridge_info *nf_bridge;
#endif
unsigned int len,
data_len;
__u16 mac_len,
hdr_len;
/* Following fields are _not_ copied in __copy_skb_header()
* Note that queue_mapping is here mostly to fill a hole.
*/
kmemcheck_bitfield_begin(flags1);
__u16 queue_mapping;
__u8 cloned:1,
nohdr:1,
fclone:2,
peeked:1,
head_frag:1,
xmit_more:1;
/* one bit hole */
kmemcheck_bitfield_end(flags1);
/* fields enclosed in headers_start/headers_end are copied
* using a single memcpy() in __copy_skb_header()
*/
/* private: */
__u32 headers_start[0];
/* public: */
/* if you move pkt_type around you also must adapt those constants */
#ifdef __BIG_ENDIAN_BITFIELD
#define PKT_TYPE_MAX (7 << 5)
#else
#define PKT_TYPE_MAX 7
#endif
#define PKT_TYPE_OFFSET() offsetof(struct sk_buff, __pkt_type_offset)
__u8 __pkt_type_offset[0];
__u8 pkt_type:3;
__u8 pfmemalloc:1;
__u8 ignore_df:1;
__u8 nfctinfo:3;
__u8 nf_trace:1;
__u8 ip_summed:2;
__u8 ooo_okay:1;
__u8 l4_hash:1;
__u8 sw_hash:1;
__u8 wifi_acked_valid:1;
__u8 wifi_acked:1;
__u8 no_fcs:1;
/* Indicates the inner headers are valid in the skbuff. */
__u8 encapsulation:1;
__u8 encap_hdr_csum:1;
__u8 csum_valid:1;
__u8 csum_complete_sw:1;
__u8 csum_level:2;
__u8 csum_bad:1;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
__u8 ndisc_nodetype:2;
#endif
__u8 ipvs_property:1;
__u8 inner_protocol_type:1;
/* 4 or 6 bit hole */
#ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */
#ifdef CONFIG_NET_CLS_ACT
__u16 tc_verd; /* traffic control verdict */
#endif
#endif
union {
__wsum csum;
struct {
__u16 csum_start;
__u16 csum_offset;
};
};
__u32 priority;
int skb_iif;
__u32 hash;
__be16 vlan_proto;
__u16 vlan_tci;
#ifdef CONFIG_NET_RX_BUSY_POLL
unsigned int napi_id;
#endif
#ifdef CONFIG_NETWORK_SECMARK
__u32 secmark;
#endif
union {
__u32 mark;
__u32 dropcount;
__u32 reserved_tailroom;
};
union {
__be16 inner_protocol;
__u8 inner_ipproto;
};
__u16 inner_transport_header;
__u16 inner_network_header;
__u16 inner_mac_header;
__be16 protocol;
__u16 transport_header;
__u16 network_header;
__u16 mac_header;
/* private: */
__u32 headers_end[0];
/* public: */
/* These elements must be at the end, see alloc_skb() for details. */
sk_buff_data_t tail;
sk_buff_data_t end;
unsigned char *head,
*data;
unsigned int truesize;
atomic_t users;
};