I’m working with a Linux TUN device (MTU 65535) where I write jumbo TCP packets (10KB+), and my goal is for the kernel (via software GSO) or the NIC (via TSO if available) to segment them into MTU-sized chunks before sending out through eth0 (MTU 1500). I understand this involves creating the TUN interface with IFF_VNET_HDR, enabling GSO via TUNSETOFFLOAD, and letting the kernel networking stack handle segmentation rather than doing it in user space. However, I’m struggling to get the connection established when GSO is enabled, even though the same setup works fine when GSO is disabled. Has anyone successfully achieved TCP segmentation this way, and could share a minimal working example or guidance (some docs would be greatly appreciated as well)?
That is the code that I use to setup my TUN device with GSO enabled:
package tun
import (
"fmt"
"os"
"syscall"
"unsafe"
)
const (
IFF_TUN = 0x0001
IFF_NO_PI = 0x1000
IFF_VNET_HDR = 0x4000
TUNSETIFF = 0x400454ca
TUNSETVNETHDRSZ = 0x400454d8
TUNSETOFFLOAD = 0x400454d0
TUN_OFFLOAD_CSUM = 0x01
TUN_OFFLOAD_GSO = 0x02
)
type ifreq struct {
Name [16]byte
Flags uint16
_ [22]byte
}
func CreateTUN(name string) (*os.File, error) {
f, err := os.OpenFile("/dev/net/tun", os.O_RDWR, 0)
if err != nil {
return nil, fmt.Errorf("failed to open /dev/net/tun: %v", err)
}
var ifr ifreq
copy(ifr.Name[:], name)
ifr.Flags = IFF_TUN | IFF_NO_PI | IFF_VNET_HDR
_, _, errno := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), uintptr(TUNSETIFF), uintptr(unsafe.Pointer(&ifr)))
if errno != 0 {
return nil, fmt.Errorf("TUNSETIFF failed: %v", errno)
}
if err := EnableGSO(f.Fd()); err != nil {
return nil, err
}
return f, nil
}
func EnableGSO(tunFileFd uintptr) error {
hdrSize := uint32(12) // virtio-net header size
if _, _, errno := syscall.Syscall(
syscall.SYS_IOCTL, tunFileFd,
uintptr(TUNSETVNETHDRSZ),
uintptr(unsafe.Pointer(&hdrSize)),
); errno != 0 {
return fmt.Errorf("TUNSETVNETHDRSZ failed: %v", errno)
}
if _, _, errno := syscall.Syscall(
syscall.SYS_IOCTL, tunFileFd,
uintptr(TUNSETOFFLOAD),
uintptr(TUN_OFFLOAD_GSO|TUN_OFFLOAD_CSUM),
); errno != 0 {
return fmt.Errorf("TUNSETOFFLOAD failed: %v", errno)
}
fmt.Printf("Enabled GSO+CSUM with virtio-net headers on TUN (fd=%d)\n", tunFileFd)
return nil
}
And that is the piece that adds vnet_header to a packet before it is written into the TUN:
package main
import (
"encoding/binary"
"errors"
)
const (
VIRTIO_NET_HDR_F_NEEDS_CSUM = 1
VIRTIO_NET_HDR_GSO_NONE = 0
VIRTIO_NET_HDR_GSO_TCPV4 = 1
VIRTIO_NET_HDR_GSO_TCPV6 = 4
TCP_CHECKSUM_OFFSET = 16
)
type VirtioNetHdr struct {
Flags uint8
GSOType uint8
HdrLen uint16
GSOSize uint16
CSumStart uint16
CSumOffset uint16
}
func PrependVnetHeader(pkt []byte, mtu int) ([]byte, error) {
if len(pkt) < 1 {
return nil, errors.New("empty packet")
}
ipVersion := (pkt[0] & 0xF0) >> 4
var hdr VirtioNetHdr
switch ipVersion {
case 4: // IPv4
if len(pkt) < 20 {
return nil, errors.New("invalid IPv4 packet")
}
if pkt[9] != 6 { // Only TCP
break
}
ipHeaderLen := int(pkt[0]&0x0F) * 4
tcpLen := len(pkt) - ipHeaderLen
hdr.Flags = VIRTIO_NET_HDR_F_NEEDS_CSUM
hdr.HdrLen = uint16(ipHeaderLen + 20)
hdr.CSumStart = uint16(ipHeaderLen)
hdr.CSumOffset = TCP_CHECKSUM_OFFSET
if tcpLen > mtu {
hdr.GSOType = VIRTIO_NET_HDR_GSO_TCPV4
hdr.GSOSize = uint16(mtu)
} else {
hdr.GSOType = VIRTIO_NET_HDR_GSO_NONE
}
case 6: // IPv6
if len(pkt) < 40 {
return nil, errors.New("invalid IPv6 packet")
}
if pkt[6] != 6 { // Only TCP
break
}
ipHeaderLen := 40
tcpLen := len(pkt) - ipHeaderLen
hdr.Flags = VIRTIO_NET_HDR_F_NEEDS_CSUM
hdr.HdrLen = uint16(ipHeaderLen + 20)
hdr.CSumStart = uint16(ipHeaderLen)
hdr.CSumOffset = TCP_CHECKSUM_OFFSET
if tcpLen > mtu {
hdr.GSOType = VIRTIO_NET_HDR_GSO_TCPV6
hdr.GSOSize = uint16(mtu)
} else {
hdr.GSOType = VIRTIO_NET_HDR_GSO_NONE
}
}
// Serialize header + packet
buf := make([]byte, 12+len(pkt))
buf[0] = hdr.Flags
buf[1] = hdr.GSOType
binary.LittleEndian.PutUint16(buf[2:4], hdr.HdrLen)
binary.LittleEndian.PutUint16(buf[4:6], hdr.GSOSize)
binary.LittleEndian.PutUint16(buf[6:8], hdr.CSumStart)
binary.LittleEndian.PutUint16(buf[8:10], hdr.CSumOffset)
// buf[10:12] reserved = 0
copy(buf[12:], pkt)
return buf, nil
}
Here are my pcaps on eth0 when I try to test it:
root@2458c936acdb:/# tcpdump -i eth0 port 80 -nn -vv -xx
tcpdump: listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
20:57:13.100414 IP (tos 0x0, ttl 63, id 60084, offset 0, flags [DF], proto TCP (6), length 60)
172.17.0.2.56018 > 150.171.28.10.80: Flags [S], cksum 0xafdf (correct), seq 3841106198, win 64240, options [mss 63900,sackOK,TS val 2498122859 ecr 0,nop,wscale 7], length 0
0x0000: 0242 fe82 3318 32e1 1aa3 4686 0800 4500
0x0010: 003c eab4 4000 3f06 f23e ac11 0002 96ab
0x0020: 1c0a dad2 0050 e4f2 a116 0000 0000 a002
0x0030: faf0 afdf 0000 0204 f99c 0402 080a 94e6
0x0040: 546b 0000 0000 0103 0307
20:57:13.103550 IP (tos 0x0, ttl 110, id 3013, offset 0, flags [DF], proto TCP (6), length 52)
150.171.28.10.80 > 172.17.0.2.56018: Flags [S.], cksum 0x870d (correct), seq 3328401895, ack 3841106199, win 65535, options [mss 1382,nop,wscale 8,nop,nop,sackOK], length 0
0x0000: 32e1 1aa3 4686 0242 fe82 3318 0800 4500
0x0010: 0034 0bc5 4000 6e06 a236 96ab 1c0a ac11
0x0020: 0002 0050 dad2 c663 61e7 e4f2 a117 8012
0x0030: ffff 870d 0000 0204 0566 0103 0308 0101
0x0040: 0402
20:57:13.492844 IP (tos 0x0, ttl 110, id 3014, offset 0, flags [DF], proto TCP (6), length 52)
150.171.28.10.80 > 172.17.0.2.56018: Flags [S.], cksum 0x870d (correct), seq 3328401895, ack 3841106199, win 65535, options [mss 1382,nop,wscale 8,nop,nop,sackOK], length 0
0x0000: 32e1 1aa3 4686 0242 fe82 3318 0800 4500
0x0010: 0034 0bc6 4000 6e06 a235 96ab 1c0a ac11
0x0020: 0002 0050 dad2 c663 61e7 e4f2 a117 8012
0x0030: ffff 870d 0000 0204 0566 0103 0308 0101
0x0040: 0402
20:57:14.115474 IP (tos 0x0, ttl 63, id 60085, offset 0, flags [DF], proto TCP (6), length 60)
172.17.0.2.56018 > 150.171.28.10.80: Flags [S], cksum 0xabe3 (correct), seq 3841106198, win 64240, options [mss 63900,sackOK,TS val 2498123879 ecr 0,nop,wscale 7], length 0
0x0000: 0242 fe82 3318 32e1 1aa3 4686 0800 4500
0x0010: 003c eab5 4000 3f06 f23d ac11 0002 96ab
0x0020: 1c0a dad2 0050 e4f2 a116 0000 0000 a002
0x0030: faf0 abe3 0000 0204 f99c 0402 080a 94e6
0x0040: 5867 0000 0000 0103 0307
20:57:14.267406 IP (tos 0x0, ttl 110, id 3015, offset 0, flags [DF], proto TCP (6), length 48)
150.171.28.10.80 > 172.17.0.2.56018: Flags [S.], cksum 0x9b1c (correct), seq 3328401895, ack 3841106199, win 65535, options [mss 1382,nop,nop,sackOK], length 0
0x0000: 32e1 1aa3 4686 0242 fe82 3318 0800 4500
0x0010: 0030 0bc7 4000 6e06 a238 96ab 1c0a ac11
0x0020: 0002 0050 dad2 c663 61e7 e4f2 a117 7012
0x0030: ffff 9b1c 0000 0204 0566 0101 0402
20:57:15.140409 IP (tos 0x0, ttl 63, id 60086, offset 0, flags [DF], proto TCP (6), length 60)
172.17.0.2.56018 > 150.171.28.10.80: Flags [S], cksum 0xa7e3 (correct), seq 3841106198, win 64240, options [mss 63900,sackOK,TS val 2498124903 ecr 0,nop,wscale 7], length 0
0x0000: 0242 fe82 3318 32e1 1aa3 4686 0800 4500
0x0010: 003c eab6 4000 3f06 f23c ac11 0002 96ab
0x0020: 1c0a dad2 0050 e4f2 a116 0000 0000 a002
0x0030: faf0 a7e3 0000 0204 f99c 0402 080a 94e6
0x0040: 5c67 0000 0000 0103 0307
20:57:15.792324 IP (tos 0x0, ttl 63, id 0, offset 0, flags [DF], proto TCP (6), length 40)
100.64.0.5.56018 > 150.171.28.10.80: Flags [R], cksum 0xbdc3 (correct), seq 0, win 0, length 0
0x0000: 0242 fe82 3318 32e1 1aa3 4686 0800 4500
0x0010: 0028 0000 4000 3f06 24d6 6440 0005 96ab
0x0020: 1c0a dad2 0050 0000 0000 0000 0000 5004
0x0030: 0000 bdc3 0000
20:57:15.820568 IP (tos 0x0, ttl 111, id 3016, offset 0, flags [DF], proto TCP (6), length 40)
150.171.28.10.80 > 172.17.0.2.56018: Flags [R], cksum 0xc79e (correct), seq 3328401896, win 0, length 0
0x0000: 32e1 1aa3 4686 0242 fe82 3318 0800 4500
0x0010: 0028 0bc8 4000 6f06 a13f 96ab 1c0a ac11
0x0020: 0002 0050 dad2 c663 61e8 e4f2 a117 5004
Seems like it sends out some corrupted SYN packet that never gets delivered.