Linux内核socket系统调用源码分析

发布时间 2023-12-31 17:30:38作者: 划水的猫

一、环境说明

内核版本:Linux 3.10

内核源码地址:https://elixir.bootlin.com/linux/v3.10/source (包含各个版本内核源码,且网页可全局搜索函数)

二、应用层-socket()函数

应用层调用 socket() 创建套接字,返回一个整型的文件描述符。

/* family:   the protocol family (also called the protocol domain).
 * type:     the socket type.
 * protocol: the constant of a specific protocol; may be set to 0.
 * return:   an integer file descriptor; a return value of -1 means failure.
 */
#include <sys/socket.h>
// socket(int domain/family, int type, int protocol)
int socket_fd = socket(AF_INET, SOCK_STREAM, 0);

三、BSD Socket层-sys_socketcall()函数

网络栈专用操作函数集的总入口函数,主要负责对请求进行分发,根据call的取值调用具体的底层函数进行处理:

// file: net/socket.c
/*
 * socketcall multiplexer (excerpt).
 * Switches on `call` and forwards to the matching sys_* handler; the
 * handler's result is stored in err and returned.
 * Lines marked "......" are elided in this excerpt (the setup of a, a0 and
 * a1 is not shown here).
 */
SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
{
    ......
    switch (call) {
    case SYS_SOCKET:
        err = sys_socket(a0, a1, a[2]);
        break;
    case SYS_BIND:
        err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_CONNECT:
        err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
        break;
    case SYS_LISTEN:
        err = sys_listen(a0, a1);
        break;
    case SYS_ACCEPT:
        /* accept is routed to sys_accept4 with its last argument set to 0 */
        err = sys_accept4(a0, (struct sockaddr __user *)a1, (int __user *)a[2], 0);
        break;
    ......

    }
    return err;
}

四、INET Socket层-sys_socket()函数

sys_socket()主要包含两个部分:sock_create和sock_map_fd

// file: net/socket.c
/*
 * socket system call (excerpt).
 * Step 1: sock_create() builds the socket.
 * Step 2: sock_map_fd() associates it with a file and yields the descriptor.
 * Returns the value from sock_map_fd() on success, or a negative error.
 * NOTE(review): flags is not assigned in the visible lines; presumably it is
 * set in the elided validation code.
 */
SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
{
    int retval;
    struct socket *sock;
    int flags;

    ...... // argument validation code omitted
    retval = sock_create(family, type, protocol, &sock); // create the socket
    if (retval < 0)
        goto out;

    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK)); // associate the socket with a file
    if (retval < 0)
        goto out_release;

out:
    /* It may be already another descriptor 8) Not kernel problem. */
    return retval;

out_release:
    /* mapping to a descriptor failed: release the socket created above */
    sock_release(sock);
    return retval;
}

4.1 sock_create()函数

// file: net/socket.c
/*
 * Thin wrapper around __sock_create(): passes the current task's network
 * namespace (current->nsproxy->net_ns) and 0 for the kern argument, and
 * returns whatever __sock_create() returns.
 */
int sock_create(int family, int type, int protocol, struct socket **res)
{
    return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
}
EXPORT_SYMBOL(sock_create);

/*
 * Core socket creation routine (excerpt; "......" marks elided code).
 * Visible steps: run the security hook, allocate a struct socket, record
 * its type, look up the protocol family in net_families[] and call that
 * family's create() method.
 */
int __sock_create(struct net *net, int family, int type, int protocol, struct socket **res, int kern)
{
    int err;
    struct socket *sock;
    const struct net_proto_family *pf;
    
    ......

    err = security_socket_create(family, type, protocol, kern); // security check (SELinux-related); skipped in this walkthrough
    if (err)
        return err;

    /*
     *    Allocate the socket and allow the family to set things up. if
     *    the protocol is 0, the family is instructed to select an appropriate
     *    default.
     */
    sock = sock_alloc(); // allocate the struct socket
    if (!sock) {
        net_warn_ratelimited("socket: no more sockets\n");
        return -ENFILE;    /* Not exactly a match, but its the closest posix thing */
    }

    sock->type = type; // record the socket type

    ......

    rcu_read_lock();
    pf = rcu_dereference(net_families[family]); // fetch the registered entry for this protocol family
    ......
    rcu_read_unlock();
    err = pf->create(net, sock, protocol, kern); // call the family's create method
    if (err < 0)
        goto out_module_put;

    ......
}

4.1.1 pf->create()函数

pf由net_families[]数组获得:

// file: include/linux/socket.h
#define AF_MAX      41  /* For now.. */
#define PF_INET        AF_INET

// file: include/uapi/linux/net.h
/* NPROTO is an alias for AF_MAX and sizes the net_families[] table below. */
#define NPROTO      AF_MAX

// file: net/socket.c
/* Table of net_proto_family pointers with NPROTO slots. */
static const struct net_proto_family __rcu *net_families[NPROTO] __read_mostly;

net_families[]数组中PF_INET(即AF_INET)对应的表项,是在inet_init()函数中通过sock_register()注册的:

// file: net/ipv4/af_inet.c
/*
 * Family descriptor for PF_INET: its create callback is inet_create().
 */
static const struct net_proto_family inet_family_ops = {
    .family = PF_INET,
    .create = inet_create,
    .owner    = THIS_MODULE,
};

/*
 * inet_init() (excerpt): registers inet_family_ops through sock_register().
 * The return value of sock_register() is explicitly discarded.
 */
static int __init inet_init(void)
{
    ......
    (void)sock_register(&inet_family_ops);
    ......
}

// file: net/socket.c
/*
 * sock_register() (excerpt): publishes a protocol family descriptor in
 * net_families[] at the index given by ops->family.
 */
int sock_register(const struct net_proto_family *ops)
{
    ......
    // net_families[] holds the per-family descriptors, indexed by the family field
    rcu_assign_pointer(net_families[ops->family], ops);
    ......
}

因此,pf->create()最终调用的是inet_create()函数。

4.1.2 inet_create()函数

inet_create()主要完成以下工作:
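设置socket的状态为SS_UNCONNECTED;
根据socket的type和protocol,在inetsw[]数组对应的链表中查找匹配的inet_protosw项,获取对应协议的接口操作集信息;
将匹配到的操作集(answer->ops)赋值给socket的ops成员;
分配并初始化sock结构(sk_alloc()、sock_init_data()、sk->sk_prot->init());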

// file: net/ipv4/af_inet.c
/*
 * create() callback of the PF_INET family (excerpt; "......" marks elided
 * code).
 * Visible steps:
 *   - set sock->state to SS_UNCONNECTED;
 *   - walk inetsw[sock->type] for an entry matching `protocol`;
 *   - copy the matched entry's ops onto the socket;
 *   - allocate a struct sock with sk_alloc() and initialize it;
 *   - call the protocol's init hook if it has one.
 * Returns 0 on success or a negative error code.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol, int kern)
{
    struct sock *sk;
    struct inet_protosw *answer;
    struct inet_sock *inet;
    struct proto *answer_prot;
    unsigned char answer_flags;
    char answer_no_check;
    int try_loading_module = 0;
    int err;

    if (unlikely(!inet_ehash_secret))
        if (sock->type != SOCK_RAW && sock->type != SOCK_DGRAM)
            build_ehash_secret();

    sock->state = SS_UNCONNECTED; // set the socket state

    /* Look for the requested type/protocol pair. */
lookup_protocol:
    err = -ESOCKTNOSUPPORT;
    rcu_read_lock();
    // Search the inetsw[sock->type] list for the entry matching the requested protocol, to obtain that protocol's operation sets
    list_for_each_entry_rcu(answer, &inetsw[sock->type], list) { // sock->type is SOCK_STREAM for the example user-space call

        err = 0;
        /* Check the non-wild match. */
        if (protocol == answer->protocol) { // taken when socket() was called with protocol 6, i.e. TCP
            if (protocol != IPPROTO_IP)
                break;
        } else { // reached when protocol differs from the entry, e.g. protocol 0 as in the example user-space call
            /* Check for the two wild cases. */
            if (IPPROTO_IP == protocol) { // IPPROTO_IP is defined by the kernel as 0
                protocol = answer->protocol; // adopt the entry's protocol, so passing either 0 or 6 to socket() works
                break;
            }
            if (IPPROTO_IP == answer->protocol)
                break;
        }
        err = -EPROTONOSUPPORT;
    }
    // After the loop, answer's prot and ops are those of the matched inetsw[SOCK_STREAM] entry

    ......

    sock->ops = answer->ops; // install the matched entry's operation set on the socket created earlier
    answer_prot = answer->prot;
    answer_no_check = answer->no_check;
    answer_flags = answer->flags;
    rcu_read_unlock();

    WARN_ON(answer_prot->slab == NULL);

    err = -ENOBUFS;
    sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot); // allocate the struct sock
    if (sk == NULL)
        goto out;

    err = 0;
    sk->sk_no_check = answer_no_check;
    if (INET_PROTOSW_REUSE & answer_flags)
        sk->sk_reuse = SK_CAN_REUSE;

    inet = inet_sk(sk); // type conversion to inet_sock; its fields are initialized below
    inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

    inet->nodefrag = 0;

    if (SOCK_RAW == sock->type) {
        inet->inet_num = protocol;
        if (IPPROTO_RAW == protocol)
            inet->hdrincl = 1;
    }

    if (ipv4_config.no_pmtu_disc)
        inet->pmtudisc = IP_PMTUDISC_DONT;
    else
        inet->pmtudisc = IP_PMTUDISC_WANT;

    inet->inet_id = 0;

    sock_init_data(sock, sk); // initialize sk and link it with sock

    sk->sk_destruct       = inet_sock_destruct;
    sk->sk_protocol       = protocol;
    sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

    inet->uc_ttl    = -1;
    inet->mc_loop    = 1;
    inet->mc_ttl    = 1;
    inet->mc_all    = 1;
    inet->mc_index    = 0;
    inet->mc_list    = NULL;
    inet->rcv_tos    = 0;

    sk_refcnt_debug_inc(sk);

    if (inet->inet_num) {
        /* It assumes that any protocol which allows
         * the user to assign a number at socket
         * creation time automatically
         * shares.
         */
        inet->inet_sport = htons(inet->inet_num);
        /* Add to protocol hash chains. */
        sk->sk_prot->hash(sk);
    }

    // Protocol-specific initialization; per the original note this includes setting up the socket's connection timers
    if (sk->sk_prot->init) {
        err = sk->sk_prot->init(sk);
        if (err)
            sk_common_release(sk);
    }
out:
    return err;
out_rcu_unlock:
    rcu_read_unlock();
    goto out;
}
4.1.2.1 inetsw[]数组

inetsw[]数组存放的是各个sock_type的信息,它也是在inet_init()函数中初始化:

// file:net/ipv4/af_inet.c
/* One list head per socket type (SOCK_MAX entries). */
static struct list_head inetsw[SOCK_MAX];

/*
 * inet_init() (excerpt): initializes every list head in inetsw[] and then
 * registers each element of inetsw_array[] via inet_register_protosw().
 */
static int __init inet_init(void)
{
    ......
    /* Register the socket-side information for inet_create. */
    for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)
        INIT_LIST_HEAD(r);

    for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)
        inet_register_protosw(q);
    ......
}

其中inetsw_array[]存放的就是具体的每种sock的信息,包括操作函数,协议号等,其中prot和ops两个成员是比较重要的,后续很多操作依赖于这两个成员:

// file: net/ipv4/af_inet.c
/*
 * Built-in (type, protocol) entries. Each one names the struct proto
 * (.prot) and the socket-level operation set (.ops) used for that pair.
 */
static struct inet_protosw inetsw_array[] =
{
    /* SOCK_STREAM / TCP */
    {
        .type =       SOCK_STREAM,
        .protocol =   IPPROTO_TCP,
        .prot =       &tcp_prot,
        .ops =        &inet_stream_ops,
        .no_check =   0,
        .flags =      INET_PROTOSW_PERMANENT |
                  INET_PROTOSW_ICSK,
    },

    /* SOCK_DGRAM / UDP */
    {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_UDP,
        .prot =       &udp_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_PERMANENT,
       },

       /* SOCK_DGRAM / ICMP (ping_prot) */
       {
        .type =       SOCK_DGRAM,
        .protocol =   IPPROTO_ICMP,
        .prot =       &ping_prot,
        .ops =        &inet_dgram_ops,
        .no_check =   UDP_CSUM_DEFAULT,
        .flags =      INET_PROTOSW_REUSE,
       },

       /* SOCK_RAW with the wildcard protocol */
       {
           .type =       SOCK_RAW,
           .protocol =   IPPROTO_IP,    /* wild card */
           .prot =       &raw_prot,
           .ops =        &inet_sockraw_ops,
           .no_check =   UDP_CSUM_DEFAULT,
           .flags =      INET_PROTOSW_REUSE,
       }
};

通过inet_register_protosw()函数,将上述inetsw_array[]里的元素,按照type字段挂在inetsw[]数组的链表上。

/*
 * inet_register_protosw() (excerpt): links the entry p into a list with
 * list_add_rcu(). The computation of last_perm is in the elided code.
 */
void inet_register_protosw(struct inet_protosw *p)
{
    ......
    list_add_rcu(&p->list, last_perm); // added to the list at inetsw[type], selected by the entry's type value
    ......
}
4.1.2.2 sock初始化

sock_init_data()函数,将之前分配的struct socket和struct sock联系在一起:

// file: net/core/sock.c
/*
 * sock_init_data() (excerpt): fills in default fields of sk and links sk
 * and sock to each other (sk_set_socket() and sock->sk).
 */
void sock_init_data(struct socket *sock, struct sock *sk)
{
    ......
    sk->sk_send_head    =    NULL;

    init_timer(&sk->sk_timer);

    sk->sk_allocation    =    GFP_KERNEL;
    sk->sk_rcvbuf        =    sysctl_rmem_default;
    sk->sk_sndbuf        =    sysctl_wmem_default;
    sk->sk_state        =    TCP_CLOSE;
    /* sk -> sock direction of the link */
    sk_set_socket(sk, sock);
    ......
    /* sock -> sk direction of the link */
    sock->sk    =    sk;
    ......
}

// file: include/net/sock.h
/*
 * Points sk->sk_socket at sock, after calling sk_tx_queue_clear() on sk.
 */
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
    sk_tx_queue_clear(sk);
    sk->sk_socket = sock;
}

sk->sk_prot->init(),对于TCP协议,这个init成员指向的是tcp_v4_init_sock()函数:

// file: net/ipv4/tcp_ipv4.c
/*
 * struct proto instance for TCP (excerpt). Its .init hook is
 * tcp_v4_init_sock().
 */
struct proto tcp_prot = {
    .name            = "TCP",
    .owner            = THIS_MODULE,
    .close            = tcp_close,
    .connect        = tcp_v4_connect,
    .disconnect        = tcp_disconnect,
    .accept            = inet_csk_accept,
    .ioctl            = tcp_ioctl,
    .init            = tcp_v4_init_sock,
    ......
}

// file: net/ipv4/tcp_ipv4.c
/*
 * .init hook of tcp_prot (excerpt): calls tcp_init_sock() and then sets
 * icsk_af_ops to ipv4_specific.
 */
static int tcp_v4_init_sock(struct sock *sk)
{
    struct inet_connection_sock *icsk = inet_csk(sk);

    tcp_init_sock(sk);

    icsk->icsk_af_ops = &ipv4_specific;
    ......
}

4.2 sock_map_fd()函数

这个函数主要有两个部分:
创建file文件结构,fd文件描述符;
将file文件结构和fd文件描述符关联,同时将上一步返回的socket也一起绑定,形成一个完整的逻辑;

// file: net/socket.c
/*
 * Obtains a file descriptor and a struct file for sock and installs the
 * file under that descriptor.
 * Returns the descriptor on success. On failure returns the negative value
 * from get_unused_fd_flags(), or PTR_ERR() of the failed file allocation
 * after giving the descriptor back with put_unused_fd().
 */
static int sock_map_fd(struct socket *sock, int flags)
{
    struct file *newfile;
    int fd = get_unused_fd_flags(flags); // get an unused file descriptor
    if (unlikely(fd < 0))
        return fd;

    newfile = sock_alloc_file(sock, flags, NULL); // allocate the struct file
    if (likely(!IS_ERR(newfile))) {
        fd_install(fd, newfile);
        return fd;
    }

    put_unused_fd(fd);
    return PTR_ERR(newfile);
}

// file: net/socket.c
/*
 * sock_alloc_file() (excerpt): links sock and file in both directions
 * (sock->file and file->private_data), sets the file flags and returns the
 * file. The creation of `file` is in the elided code.
 */
struct file *sock_alloc_file(struct socket *sock, int flags, const char *dname)
{
    ......
    sock->file = file;
    /* always O_RDWR; only O_NONBLOCK is carried over from flags */
    file->f_flags = O_RDWR | (flags & O_NONBLOCK);
    file->private_data = sock; // link the file to the socket
    return file;
}

4.3 总结一下

socket系统调用的操作:
首先在内核中创建一个socket_alloc类型的对象和一个tcp_sock类型的对象,其中socket_alloc对象中的socket与tcp_sock对象中的sock相互绑定,socket_alloc对象中的inode与file类型对象绑定。
然后将分配的文件描述符fd和file对象关联,最后将这个文件描述符fd返回给用户使用。
经过这一连串操作,用户只要使用fd,内核就能根据这个fd进行网络连接管理的各种操作。

fd与内核sock各个结构的关系,如下图所示: