1

I’m working with a Linux TUN device (MTU 65535) where I write jumbo TCP packets (10KB+), and my goal is for the kernel (via software GSO) or the NIC (via TSO if available) to segment them into MTU-sized chunks before sending out through eth0 (MTU 1500). I understand this involves creating the TUN interface with IFF_VNET_HDR, enabling GSO via TUNSETOFFLOAD, and letting the kernel networking stack handle segmentation rather than doing it in user space. However, I’m struggling to get the connection established when GSO is enabled, even though the same setup works fine when GSO is disabled. Has anyone successfully achieved TCP segmentation this way, and could share a minimal working example or guidance (some docs would be greatly appreciated as well)?

That is the code that I use to setup my TUN device with GSO enabled:

package tun

import (
    "fmt"
    "os"
    "syscall"
    "unsafe"
)

const (
    IFF_TUN          = 0x0001
    IFF_NO_PI        = 0x1000
    IFF_VNET_HDR     = 0x4000
    TUNSETIFF        = 0x400454ca
    TUNSETVNETHDRSZ  = 0x400454d8
    TUNSETOFFLOAD    = 0x400454d0

    TUN_OFFLOAD_CSUM = 0x01
    TUN_OFFLOAD_GSO  = 0x02
)

type ifreq struct {
    Name  [16]byte
    Flags uint16
    _     [22]byte
}

func CreateTUN(name string) (*os.File, error) {
    f, err := os.OpenFile("/dev/net/tun", os.O_RDWR, 0)
    if err != nil {
        return nil, fmt.Errorf("failed to open /dev/net/tun: %v", err)
    }

    var ifr ifreq
    copy(ifr.Name[:], name)
    ifr.Flags = IFF_TUN | IFF_NO_PI | IFF_VNET_HDR

    _, _, errno := syscall.Syscall(syscall.SYS_IOCTL, f.Fd(), uintptr(TUNSETIFF), uintptr(unsafe.Pointer(&ifr)))
    if errno != 0 {
        return nil, fmt.Errorf("TUNSETIFF failed: %v", errno)
    }

    if err := EnableGSO(f.Fd()); err != nil {
        return nil, err
    }

    return f, nil
}

func EnableGSO(tunFileFd uintptr) error {
    hdrSize := uint32(12) // virtio-net header size
    if _, _, errno := syscall.Syscall(
        syscall.SYS_IOCTL, tunFileFd,
        uintptr(TUNSETVNETHDRSZ),
        uintptr(unsafe.Pointer(&hdrSize)),
    ); errno != 0 {
        return fmt.Errorf("TUNSETVNETHDRSZ failed: %v", errno)
    }

    if _, _, errno := syscall.Syscall(
        syscall.SYS_IOCTL, tunFileFd,
        uintptr(TUNSETOFFLOAD),
        uintptr(TUN_OFFLOAD_GSO|TUN_OFFLOAD_CSUM),
    ); errno != 0 {
        return fmt.Errorf("TUNSETOFFLOAD failed: %v", errno)
    }

    fmt.Printf("Enabled GSO+CSUM with virtio-net headers on TUN (fd=%d)\n", tunFileFd)
    return nil
}

And that is the piece that adds vnet_header to a packet before it is written into the TUN:

package main

import (
    "encoding/binary"
    "errors"
)

const (
    VIRTIO_NET_HDR_F_NEEDS_CSUM = 1
    VIRTIO_NET_HDR_GSO_NONE     = 0
    VIRTIO_NET_HDR_GSO_TCPV4    = 1
    VIRTIO_NET_HDR_GSO_TCPV6    = 4
    TCP_CHECKSUM_OFFSET         = 16
)

type VirtioNetHdr struct {
    Flags     uint8
    GSOType   uint8
    HdrLen    uint16
    GSOSize   uint16
    CSumStart uint16
    CSumOffset uint16
}

func PrependVnetHeader(pkt []byte, mtu int) ([]byte, error) {
    if len(pkt) < 1 {
        return nil, errors.New("empty packet")
    }

    ipVersion := (pkt[0] & 0xF0) >> 4
    var hdr VirtioNetHdr

    switch ipVersion {
    case 4: // IPv4
        if len(pkt) < 20 {
            return nil, errors.New("invalid IPv4 packet")
        }
        if pkt[9] != 6 { // Only TCP
            break
        }

        ipHeaderLen := int(pkt[0]&0x0F) * 4
        tcpLen := len(pkt) - ipHeaderLen

        hdr.Flags = VIRTIO_NET_HDR_F_NEEDS_CSUM
        hdr.HdrLen = uint16(ipHeaderLen + 20)
        hdr.CSumStart = uint16(ipHeaderLen)
        hdr.CSumOffset = TCP_CHECKSUM_OFFSET

        if tcpLen > mtu {
            hdr.GSOType = VIRTIO_NET_HDR_GSO_TCPV4
            hdr.GSOSize = uint16(mtu)
        } else {
            hdr.GSOType = VIRTIO_NET_HDR_GSO_NONE
        }

    case 6: // IPv6
        if len(pkt) < 40 {
            return nil, errors.New("invalid IPv6 packet")
        }
        if pkt[6] != 6 { // Only TCP
            break
        }

        ipHeaderLen := 40
        tcpLen := len(pkt) - ipHeaderLen

        hdr.Flags = VIRTIO_NET_HDR_F_NEEDS_CSUM
        hdr.HdrLen = uint16(ipHeaderLen + 20)
        hdr.CSumStart = uint16(ipHeaderLen)
        hdr.CSumOffset = TCP_CHECKSUM_OFFSET

        if tcpLen > mtu {
            hdr.GSOType = VIRTIO_NET_HDR_GSO_TCPV6
            hdr.GSOSize = uint16(mtu)
        } else {
            hdr.GSOType = VIRTIO_NET_HDR_GSO_NONE
        }
    }

    // Serialize header + packet
    buf := make([]byte, 12+len(pkt))
    buf[0] = hdr.Flags
    buf[1] = hdr.GSOType
    binary.LittleEndian.PutUint16(buf[2:4], hdr.HdrLen)
    binary.LittleEndian.PutUint16(buf[4:6], hdr.GSOSize)
    binary.LittleEndian.PutUint16(buf[6:8], hdr.CSumStart)
    binary.LittleEndian.PutUint16(buf[8:10], hdr.CSumOffset)
    // buf[10:12] reserved = 0
    copy(buf[12:], pkt)

    return buf, nil
}

Here are my pcaps on eth0 when I try to test it:

root@2458c936acdb:/# tcpdump -i eth0 port 80 -nn -vv -xx
tcpdump: listening on eth0, link-type EN10MB (Ethernet), snapshot length 262144 bytes
20:57:13.100414 IP (tos 0x0, ttl 63, id 60084, offset 0, flags [DF], proto TCP (6), length 60)
    172.17.0.2.56018 > 150.171.28.10.80: Flags [S], cksum 0xafdf (correct), seq 3841106198, win 64240, options [mss 63900,sackOK,TS val 2498122859 ecr 0,nop,wscale 7], length 0
        0x0000:  0242 fe82 3318 32e1 1aa3 4686 0800 4500
        0x0010:  003c eab4 4000 3f06 f23e ac11 0002 96ab
        0x0020:  1c0a dad2 0050 e4f2 a116 0000 0000 a002
        0x0030:  faf0 afdf 0000 0204 f99c 0402 080a 94e6
        0x0040:  546b 0000 0000 0103 0307
20:57:13.103550 IP (tos 0x0, ttl 110, id 3013, offset 0, flags [DF], proto TCP (6), length 52)
    150.171.28.10.80 > 172.17.0.2.56018: Flags [S.], cksum 0x870d (correct), seq 3328401895, ack 3841106199, win 65535, options [mss 1382,nop,wscale 8,nop,nop,sackOK], length 0
        0x0000:  32e1 1aa3 4686 0242 fe82 3318 0800 4500
        0x0010:  0034 0bc5 4000 6e06 a236 96ab 1c0a ac11
        0x0020:  0002 0050 dad2 c663 61e7 e4f2 a117 8012
        0x0030:  ffff 870d 0000 0204 0566 0103 0308 0101
        0x0040:  0402
20:57:13.492844 IP (tos 0x0, ttl 110, id 3014, offset 0, flags [DF], proto TCP (6), length 52)
    150.171.28.10.80 > 172.17.0.2.56018: Flags [S.], cksum 0x870d (correct), seq 3328401895, ack 3841106199, win 65535, options [mss 1382,nop,wscale 8,nop,nop,sackOK], length 0
        0x0000:  32e1 1aa3 4686 0242 fe82 3318 0800 4500
        0x0010:  0034 0bc6 4000 6e06 a235 96ab 1c0a ac11
        0x0020:  0002 0050 dad2 c663 61e7 e4f2 a117 8012
        0x0030:  ffff 870d 0000 0204 0566 0103 0308 0101
        0x0040:  0402
20:57:14.115474 IP (tos 0x0, ttl 63, id 60085, offset 0, flags [DF], proto TCP (6), length 60)
    172.17.0.2.56018 > 150.171.28.10.80: Flags [S], cksum 0xabe3 (correct), seq 3841106198, win 64240, options [mss 63900,sackOK,TS val 2498123879 ecr 0,nop,wscale 7], length 0
        0x0000:  0242 fe82 3318 32e1 1aa3 4686 0800 4500
        0x0010:  003c eab5 4000 3f06 f23d ac11 0002 96ab
        0x0020:  1c0a dad2 0050 e4f2 a116 0000 0000 a002
        0x0030:  faf0 abe3 0000 0204 f99c 0402 080a 94e6
        0x0040:  5867 0000 0000 0103 0307
20:57:14.267406 IP (tos 0x0, ttl 110, id 3015, offset 0, flags [DF], proto TCP (6), length 48)
    150.171.28.10.80 > 172.17.0.2.56018: Flags [S.], cksum 0x9b1c (correct), seq 3328401895, ack 3841106199, win 65535, options [mss 1382,nop,nop,sackOK], length 0
        0x0000:  32e1 1aa3 4686 0242 fe82 3318 0800 4500
        0x0010:  0030 0bc7 4000 6e06 a238 96ab 1c0a ac11
        0x0020:  0002 0050 dad2 c663 61e7 e4f2 a117 7012
        0x0030:  ffff 9b1c 0000 0204 0566 0101 0402
20:57:15.140409 IP (tos 0x0, ttl 63, id 60086, offset 0, flags [DF], proto TCP (6), length 60)
    172.17.0.2.56018 > 150.171.28.10.80: Flags [S], cksum 0xa7e3 (correct), seq 3841106198, win 64240, options [mss 63900,sackOK,TS val 2498124903 ecr 0,nop,wscale 7], length 0
        0x0000:  0242 fe82 3318 32e1 1aa3 4686 0800 4500
        0x0010:  003c eab6 4000 3f06 f23c ac11 0002 96ab
        0x0020:  1c0a dad2 0050 e4f2 a116 0000 0000 a002
        0x0030:  faf0 a7e3 0000 0204 f99c 0402 080a 94e6
        0x0040:  5c67 0000 0000 0103 0307
20:57:15.792324 IP (tos 0x0, ttl 63, id 0, offset 0, flags [DF], proto TCP (6), length 40)
    100.64.0.5.56018 > 150.171.28.10.80: Flags [R], cksum 0xbdc3 (correct), seq 0, win 0, length 0
        0x0000:  0242 fe82 3318 32e1 1aa3 4686 0800 4500
        0x0010:  0028 0000 4000 3f06 24d6 6440 0005 96ab
        0x0020:  1c0a dad2 0050 0000 0000 0000 0000 5004
        0x0030:  0000 bdc3 0000
20:57:15.820568 IP (tos 0x0, ttl 111, id 3016, offset 0, flags [DF], proto TCP (6), length 40)
    150.171.28.10.80 > 172.17.0.2.56018: Flags [R], cksum 0xc79e (correct), seq 3328401896, win 0, length 0
        0x0000:  32e1 1aa3 4686 0242 fe82 3318 0800 4500
        0x0010:  0028 0bc8 4000 6f06 a13f 96ab 1c0a ac11
        0x0020:  0002 0050 dad2 c663 61e8 e4f2 a117 5004    

Seems like it sends out some corrupted SYN packet that never gets delivered.

0

Start asking to get answers

Find the answer to your question by asking.

Ask question

Explore related questions

See similar questions with these tags.