小公司研发总监,既当司令也当兵!
分类: linux
2015-05-20 10:33:02
/*
 * Socket-layer (struct proto_ops) operation table for AF_INET stream
 * sockets. Entries are grouped by purpose; designated initializers make the
 * ordering irrelevant to the resulting object.
 */
const struct proto_ops inet_stream_ops = {
	/* identity */
	.family		= pf_inet,
	.owner		= this_module,

	/* connection lifecycle */
	.bind		= inet_bind,
	.connect	= inet_stream_connect,
	.listen		= inet_listen,
	.accept		= inet_accept,
	.shutdown	= inet_shutdown,
	.release	= inet_release,

	/* data transfer and readiness */
	.sendmsg	= inet_sendmsg,
	.recvmsg	= inet_recvmsg,
	.sendpage	= inet_sendpage,
	.splice_read	= tcp_splice_read,
	.poll		= tcp_poll,

	/* naming and control */
	.getname	= inet_getname,
	.ioctl		= inet_ioctl,
	.setsockopt	= sock_common_setsockopt,
	.getsockopt	= sock_common_getsockopt,

	/* generic sock_no_* handlers (presumably "not supported" stubs) */
	.socketpair	= sock_no_socketpair,
	.mmap		= sock_no_mmap,

#ifdef config_compat
	/* compat-mode variants, built only when config_compat is defined */
	.compat_setsockopt = compat_sock_common_setsockopt,
	.compat_getsockopt = compat_sock_common_getsockopt,
	.compat_ioctl	   = inet_compat_ioctl,
#endif
};
struct proto tcp_prot = {
.name = "tcp",
.owner = this_module,
.close = tcp_close,
.connect = tcp_v4_connect,
.disconnect = tcp_disconnect,
.accept = inet_csk_accept,
.ioctl = tcp_ioctl,
.init = tcp_v4_init_sock,
.destroy = tcp_v4_destroy_sock,
.shutdown = tcp_shutdown,
.setsockopt = tcp_setsockopt,
.getsockopt = tcp_getsockopt,
.recvmsg = tcp_recvmsg,
.sendmsg = tcp_sendmsg,
.sendpage = tcp_sendpage,
.backlog_rcv = tcp_v4_do_rcv,
.hash = inet_hash,
.unhash = inet_unhash,
.get_port = inet_csk_get_port,
.enter_memory_pressure = tcp_enter_memory_pressure,
.sockets_allocated = &tcp_sockets_allocated,
.orphan_count = &tcp_orphan_count,
.memory_allocated = &tcp_memory_allocated,
.memory_pressure = &tcp_memory_pressure,
.sysctl_mem = sysctl_tcp_mem,
.sysctl_wmem = sysctl_tcp_wmem,
.sysctl_rmem = sysctl_tcp_rmem,
.max_header = max_tcp_header,
.obj_size = sizeof(struct tcp_sock),
.slab_flags = slab_destroy_by_rcu,
.twsk_prot = &tcp_timewait_sock_ops,
.rsk_prot = &tcp_request_sock_ops,
.h.hashinfo = &tcp_hashinfo,
.no_autobind = true,
#ifdef config_compat
.compat_setsockopt = compat_tcp_setsockopt,
.compat_getsockopt = compat_tcp_getsockopt,
#endif
};
/*
 * Built-in AF_INET protocol switch entries. Each entry ties a socket type and
 * protocol number to its struct proto (.prot) and struct proto_ops (.ops).
 * The element order is kept: TCP, UDP, then the raw wildcard entry.
 */
static struct inet_protosw inetsw_array[] = {
	{	/* stream sockets: TCP */
		.type		= sock_stream,
		.protocol	= ipproto_tcp,
		.prot		= &tcp_prot,
		.ops		= &inet_stream_ops,
		.no_check	= 0,
		.flags		= inet_protosw_permanent | inet_protosw_icsk,
	},
	{	/* datagram sockets: UDP */
		.type		= sock_dgram,
		.protocol	= ipproto_udp,
		.prot		= &udp_prot,
		.ops		= &inet_dgram_ops,
		.no_check	= udp_csum_default,
		.flags		= inet_protosw_permanent,
	},
	{	/* raw sockets: ipproto_ip acts as the wild card protocol */
		.type		= sock_raw,
		.protocol	= ipproto_ip,
		.prot		= &raw_prot,
		.ops		= &inet_sockraw_ops,
		.no_check	= udp_csum_default,
		.flags		= inet_protosw_reuse,
	}
};
然后在inet_init()中将inetsw_array注册到inetsw中:
/*
 * Abridged excerpt of inet_init(): reset every list head in inetsw[] (one per
 * socket type), then register each built-in entry of inetsw_array so that
 * inet_create() can look it up. Elided parts of the function are marked with
 * dots; the declarations of r and q live in the elided part.
 */
static int __init inet_init(void)
{
......
	/* register the socket-side information for inet_create. */
	/* The iterators must advance; without the increment the loops never end. */
	for (r = &inetsw[0]; r < &inetsw[sock_max]; ++r)
		init_list_head(r);

	for (q = inetsw_array; q < &inetsw_array[inetsw_array_len]; ++q)
		inet_register_protosw(q);
.......
}
#include <sys/socket.h>
int socket(int family, int type, int protocol)
返回值说明:返回非负描述字——成功,返回-1——失败
/*
 * socket(2) entry point: validate the flag bits carried in 'type', create the
 * socket, and bind it to a new file descriptor.
 * Returns the new descriptor (>= 0) or a negative error code.
 */
syscall_define3(socket, int, family, int, type, int, protocol)
{
	struct socket *sock;
	int flags = type & ~sock_type_mask;	/* bits outside the type mask */
	int fd;

	/* Only the close-on-exec and non-blocking flags are accepted. */
	if (flags & ~(sock_cloexec | sock_nonblock))
		return -einval;
	type &= sock_type_mask;

	/* Translate sock_nonblock to o_nonblock where the two values differ. */
	if (sock_nonblock != o_nonblock && (flags & sock_nonblock))
		flags = (flags & ~sock_nonblock) | o_nonblock;

	/* Create the socket instance (and its underlying sock). */
	fd = sock_create(family, type, protocol, &sock);
	if (fd < 0)
		return fd;

	/* Attach the socket to a file and obtain the descriptor for it. */
	fd = sock_map_fd(sock, flags & (o_cloexec | o_nonblock));
	if (fd < 0)
		sock_release(sock);	/* mapping failed: drop the new socket */

	return fd;
}
sock_create为__sock_create的包裹函数:
/*
 * Allocate a struct socket and hand it to the address family's ->create()
 * hook for family/protocol specific setup. On success the new socket is
 * stored in *res and 0 is returned; failures yield a negative error code.
 * NOTE: abridged excerpt - the elided parts (marked inline) hold the argument
 * checks and the out_release / out_module_put labels used by the gotos.
 */
int __sock_create(struct net *net, int family, int type, int protocol,
struct socket **res, int kern)
{
int err;
struct socket *sock;
const struct net_proto_family *pf;
…… 略 ……..
/*
* allocate the socket and allow the family to set things up. if
* the protocol is 0, the family is instructed to select
* an appropriate default.
*/
sock = sock_alloc();
if (!sock) {
if (net_ratelimit())
printk(kern_warning "socket: no more sockets\n");
return -enfile; /* not exactly a match, but its the
closest posix thing */
}
sock->type = type;
rcu_read_lock();
// Look up the registered address family; its entry carries the family's create() hook.
pf = rcu_dereference(net_families[family]);
err = -eafnosupport;
if (!pf)
goto out_release;
/*
* we will call the ->create function, that possibly is in a loadable
* module, so we have to bump that loadable module refcnt first.
*/
// Note: this is the family-level create(); per the original note it dispatches
// further by protocol (e.g. PPPoE under PPPoX) - confirm against the family code.
err = pf->create(net, sock, protocol, kern);
if (err < 0)
goto out_module_put;
…略…
*res = sock;
return 0;
}
net_families中, af_inet对应的结构数据如下:
/*
 * Address-family descriptor for pf_inet: it names inet_create() as the
 * family's socket-creation hook (invoked through pf->create).
 */
static const struct net_proto_family inet_family_ops = {
	.owner	= this_module,
	.family	= pf_inet,
	.create	= inet_create,
};
/*
 * AF_INET create hook: find the inet_protosw entry matching the socket type
 * and protocol, install its ops on the socket, allocate the struct sock with
 * the entry's proto, and run the proto's init hook if it has one.
 * NOTE: abridged excerpt - elided parts are marked inline, including the
 * 'out' label and the final return.
 */
static int inet_create(struct net *net, struct socket *sock, int protocol,
int kern)
{
struct sock *sk;
struct inet_protosw *answer;
struct inet_sock *inet;
struct proto *answer_prot;
unsigned char answer_flags;
char answer_no_check;
int try_loading_module = 0;
int err;
… 略…
// Walk the entries registered for this socket type until one matches.
list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {
err = 0;
/* check the non-wild match. */
if (protocol == answer->protocol) {
if (protocol != ipproto_ip)
break;
} else {
/* check for the two wild cases. */
if (ipproto_ip == protocol) {
protocol = answer->protocol;
break;
}
if (ipproto_ip == answer->protocol)
break;
}
err = -eprotonosupport;
}
… 略…
// answer now points at the matching inet_protosw entry from inetsw[sock->type].
// Install that entry's ops table as the socket's ops.
sock->ops = answer->ops;
answer_prot = answer->prot;
answer_no_check = answer->no_check;
answer_flags = answer->flags;
rcu_read_unlock();
// Allocate the struct sock, passing in the entry's proto (answer->prot).
sk = sk_alloc(net, pf_inet, gfp_kernel, answer_prot);
if (sk == null)
goto out;
… 略, 对sk进行一些检查和初始化…
// Per the original note: initialise sk's send/receive/error queues and hook up
// the state-change / data-ready callbacks.
sock_init_data(sock, sk);
if (sk->sk_prot->init) {
err = sk->sk_prot->init(sk);// protocol-specific initialisation of sk
// (for TCP the registered init hook is tcp_v4_init_sock)
if (err)
sk_common_release(sk);
}
}
#include <sys/socket.h>
int bind(int sockfd, const struct sockaddr *serveraddr, socklen_t len)
返回0——成功, -1——出错
/*
 * bind(2) entry point: resolve the descriptor to its socket, copy the
 * caller's address into kernel space, run the security hook, then dispatch
 * to the socket's ops->bind (inet_bind for the AF_INET stream ops).
 * Returns 0 on success or a negative error code.
 */
syscall_define3(bind, int, fd, struct sockaddr __user *, umyaddr, int, addrlen)
{
	struct sockaddr_storage address;
	struct sockaddr *kaddr = (struct sockaddr *)&address;
	struct socket *sock;
	int err, fput_needed;

	/* Look the socket up by descriptor; err is filled in on failure. */
	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	/* Copy the user-space address into the kernel buffer. */
	err = move_addr_to_kernel(umyaddr, addrlen, kaddr);
	if (err < 0)
		goto put;

	err = security_socket_bind(sock, kaddr, addrlen);
	if (err)
		goto put;

	/* Hand over to the bind handler registered on the socket. */
	err = sock->ops->bind(sock, kaddr, addrlen);
put:
	fput_light(sock->file, fput_needed);
	return err;
}
// AF_INET bind handler. If the sock's proto registered its own bind function,
// delegate to it; otherwise validate the requested address and port and record
// them in the sock. Returns 0 on success or a negative error code.
int inet_bind(struct socket *sock, struct sockaddr *uaddr, int addr_len)
{
struct sockaddr_in *addr = (struct sockaddr_in *)uaddr;
struct sock *sk = sock->sk;
struct inet_sock *inet = inet_sk(sk);
unsigned short snum;
int chk_addr_ret;
int err;
/* if the socket has its own bind function then use it. (raw) */
if (sk->sk_prot->bind) {
// The proto provides a bind hook: use it and skip the generic path below.
err = sk->sk_prot->bind(sk, uaddr, addr_len);
goto out;
}
err = -einval;
if (addr_len < sizeof(struct sockaddr_in))
goto out;
chk_addr_ret = inet_addr_type(sock_net(sk), addr->sin_addr.s_addr);
/* not specified by any standard per-se, however it breaks too
* many applications when removed. it is unfortunate since
* allowing applications to make a non-local bind solves
* several problems with systems using dynamic addressing.
* (ie. your servers still start up even if your isdn link
* is temporarily down)
*/
err = -eaddrnotavail;
if (!sysctl_ip_nonlocal_bind &&
!(inet->freebind || inet->transparent) &&
addr->sin_addr.s_addr != htonl(inaddr_any) &&
chk_addr_ret != rtn_local &&
chk_addr_ret != rtn_multicast &&
chk_addr_ret != rtn_broadcast)
goto out;
snum = ntohs(addr->sin_port);
err = -eacces;
// Non-zero ports below prot_sock require cap_net_bind_service.
if (snum && snum < prot_sock && !capable(cap_net_bind_service))
goto out;
/* we keep a pair of addresses. rcv_saddr is the one
* used by hash lookups, and saddr is used for transmit.
*
* in the bsd api these are the same except where it
* would be illegal to use them (multicast/broadcast) in
* which case the sending device address is used.
*/
lock_sock(sk);
/* check these errors (active socket, double bind). */
err = -einval;
if (sk->sk_state != tcp_close || inet->inet_num)
goto out_release_sock;
inet->inet_rcv_saddr = inet->inet_saddr = addr->sin_addr.s_addr;
if (chk_addr_ret == rtn_multicast || chk_addr_ret == rtn_broadcast)
inet->inet_saddr = 0; /* use device */
/* make sure we are allowed to bind here. */
if (sk->sk_prot->get_port(sk, snum)) {
// Port acquisition failed: undo the address assignment made above.
inet->inet_saddr = inet->inet_rcv_saddr = 0;
err = -eaddrinuse;
goto out_release_sock;
}
// Record which parts of the local endpoint were explicitly chosen by the caller.
if (inet->inet_rcv_saddr)
sk->sk_userlocks |= sock_bindaddr_lock;
if (snum)
sk->sk_userlocks |= sock_bindport_lock;
inet->inet_sport = htons(inet->inet_num);
inet->inet_daddr = 0;
inet->inet_dport = 0;
sk_dst_reset(sk);
err = 0;
out_release_sock:
release_sock(sk);
out:
return err;
}
#include <sys/socket.h>
int listen(int sockfd, int backlog)
返回0——成功, -1——失败
listen()函数仅由tcp服务器调用,它做两件事情:
(1) 当socket函数创建一个套接口时,默认为一个主动套接口。listen函数把一个未连接的套接口转换为一个被动套接口,并指示内核应该接受指向该套接口的连接请求。同时,将tcp的状态由closed变更为listen状态。
(2) 指定内核应该为相应套接口排队的最大连接个数(由backlog指定)。
/*
 * listen(2) entry point: clamp the requested backlog to the per-namespace
 * somaxconn sysctl, run the security hook, then dispatch to the socket's
 * ops->listen (inet_listen for the AF_INET stream ops).
 * Returns 0 on success or a negative error code.
 */
syscall_define2(listen, int, fd, int, backlog)
{
	struct socket *sock;
	int err, fput_needed;
	int limit;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	/* The unsigned compare also clamps negative backlog values. */
	limit = sock_net(sock->sk)->core.sysctl_somaxconn;
	if ((unsigned)backlog > limit)
		backlog = limit;

	err = security_socket_listen(sock, backlog);
	if (err == 0)
		/* Call the listen handler installed when the socket was created. */
		err = sock->ops->listen(sock, backlog);

	fput_light(sock->file, fput_needed);
	return err;
}
/*
 * Move an unconnected stream socket into the listening state, or only adjust
 * the backlog when it is already listening.
 * Returns 0 on success, -einval for a socket in the wrong state or of the
 * wrong type, or the error reported by inet_csk_listen_start().
 */
int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = sock->sk;
	unsigned char prev_state;
	int err = -einval;

	lock_sock(sk);

	if (sock->state != ss_unconnected || sock->type != sock_stream)
		goto unlock;

	/* Only the close and listen states may proceed. */
	prev_state = sk->sk_state;
	if (((1 << prev_state) & (tcpf_close | tcpf_listen)) == 0)
		goto unlock;

	/* An already-listening socket skips the start-up step. */
	if (prev_state != tcp_listen) {
		err = inet_csk_listen_start(sk, backlog);
		if (err)
			goto unlock;
	}

	sk->sk_max_ack_backlog = backlog;
	err = 0;
unlock:
	release_sock(sk);
	return err;
}
/*
 * Put a connection-oriented sock into the listen state: allocate its accept
 * queue, reset the backlog counters, claim the local port and hash the sock.
 * Returns 0 on success, the reqsk_queue_alloc() error, or -eaddrinuse when
 * the port cannot be obtained.
 */
int inet_csk_listen_start(struct sock *sk, const int nr_table_entries)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct inet_sock *inet = inet_sk(sk);
	int rc;

	/* Allocate queue space for the listening sock. */
	rc = reqsk_queue_alloc(&icsk->icsk_accept_queue, nr_table_entries);
	if (rc != 0)
		return rc;

	sk->sk_max_ack_backlog = 0;
	sk->sk_ack_backlog = 0;
	inet_csk_delack_init(sk);

	/*
	 * The state flips to listen before get_port() has validated the port.
	 * Per the original note this window is harmless because the sock is
	 * only hashed once validation has succeeded.
	 */
	sk->sk_state = tcp_listen;
	if (sk->sk_prot->get_port(sk, inet->inet_num)) {
		/* Port unavailable: roll the state back and free the queue. */
		sk->sk_state = tcp_close;
		__reqsk_queue_destroy(&icsk->icsk_accept_queue);
		return -eaddrinuse;
	}

	inet->inet_sport = htons(inet->inet_num);
	/* Reset the cached dst entry. */
	sk_dst_reset(sk);
	/* Insert the sock through the proto's hash hook (inet_hash for TCP). */
	sk->sk_prot->hash(sk);
	return 0;
}
#include <sys/socket.h>
int accept(int sockfd, struct sockaddr *cliaddr, socklen_t *addrlen)
返回非负描述字——成功, -1 ——失败
参数cliaddr和addrlen用于返回客户端地址信息。
/*
 * for accept, we attempt to create a new socket, set up the link
 * with the client, wake up the client, then return the new
 * connected fd. we collect the address of the connector in kernel
 * space and move it to user at the very end. this is unclean because
 * we open the socket then return an error.
 *
 * 1003.1g adds the ability to recvmsg() to query connection pending
 * status to recvmsg. we need to add that support in a way that's
 * clean when we restructure accept also.
 *
 * Returns the new descriptor on success or a negative error code.
 */
syscall_define4(accept4, int, fd, struct sockaddr __user *, upeer_sockaddr,
int __user *, upeer_addrlen, int, flags)
{
struct socket *sock, *newsock;
struct file *newfile;
int err, len, newfd, fput_needed;
struct sockaddr_storage address;
// Only the close-on-exec and non-blocking flags are accepted.
if (flags & ~(sock_cloexec | sock_nonblock))
return -einval;
// Translate sock_nonblock to o_nonblock where the two values differ.
if (sock_nonblock != o_nonblock && (flags & sock_nonblock))
flags = (flags & ~sock_nonblock) | o_nonblock;
sock = sockfd_lookup_light(fd, &err, &fput_needed);
if (!sock)
goto out;
err = -enfile;
// Allocate a fresh socket for the accepted connection.
newsock = sock_alloc();
if (!newsock)
goto out_put;
// The new socket inherits the listener's type and ops table.
newsock->type = sock->type;
newsock->ops = sock->ops;
/*
* we don't need try_module_get here, as the listening socket (sock)
* has the protocol module (sock->ops->owner) held.
*/
__module_get(newsock->ops->owner);
// Create the file and reserve a descriptor for the new socket.
newfd = sock_alloc_file(newsock, &newfile, flags);
if (unlikely(newfd < 0)) {
err = newfd;
sock_release(newsock);
goto out_put;
}
err = security_socket_accept(sock, newsock);
if (err)
goto out_fd;
// Call the socket-level accept handler (inet_accept for the AF_INET stream ops).
err = sock->ops->accept(sock, newsock, sock->file->f_flags);
if (err < 0)
goto out_fd;
if (upeer_sockaddr) {
// Fetch the address of the newly connected client.
// NOTE(review): the literal 2 presumably selects the peer address - confirm against getname().
if (newsock->ops->getname(newsock, (struct sockaddr *)&address,
&len, 2) < 0) {
err = -econnaborted;
goto out_fd;
}
// Copy the client address back to user space.
err = move_addr_to_user((struct sockaddr *)&address,
len, upeer_sockaddr, upeer_addrlen);
if (err < 0)
goto out_fd;
}
/* file flags are not inherited via accept() unlike another oses. */
// Publish the descriptor only after every step above has succeeded.
fd_install(newfd, newfile);
err = newfd;
out_put:
fput_light(sock->file, fput_needed);
out:
return err;
out_fd:
// Failure after the file was created: drop the file and give the descriptor back.
fput(newfile);
put_unused_fd(newfd);
goto out_put;
}
/*
 * Accept a pending connection on a listening AF_INET socket and attach the
 * resulting sock to 'newsock'.
 * Returns 0 on success; otherwise the error reported by the proto's accept
 * hook (defaulting to -einval when the hook leaves it untouched).
 */
int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *listener = sock->sk;
	struct sock *child;
	int err = -einval;

	/* Proto-level accept hook; tcp_prot registers inet_csk_accept here. */
	child = listener->sk_prot->accept(listener, flags, &err);
	if (!child)
		return err;

	lock_sock(child);

	warn_on(!((1 << child->sk_state) &
		  (tcpf_established | tcpf_close_wait | tcpf_close)));

	/* Attach the accepted sock to the new socket. */
	sock_graft(child, newsock);

	/* The new socket starts out in the connected state. */
	newsock->state = ss_connected;

	release_sock(child);
	return 0;
}
/*
 * Dequeue the next outstanding connection from a listening sock.
 * Returns the child sock on success. On failure returns null and stores the
 * reason in *err: -einval if sk is not listening, -eagain if nothing is
 * pending and the receive timeout is zero, or the error from waiting.
 * *err is written only on failure.
 */
struct sock *inet_csk_accept(struct sock *sk, int flags, int *err)
{
	struct inet_connection_sock *icsk = inet_csk(sk);
	struct sock *child = null;
	int rc = -einval;

	lock_sock(sk);

	/* The sock must be listening before anything can be accepted. */
	if (sk->sk_state != tcp_listen)
		goto fail;

	/* Nothing queued yet: either bail out or wait for a connection. */
	if (reqsk_queue_empty(&icsk->icsk_accept_queue)) {
		long timeo = sock_rcvtimeo(sk, flags & o_nonblock);

		/* A zero timeout means the caller must not sleep. */
		if (!timeo) {
			rc = -eagain;
			goto fail;
		}

		rc = inet_csk_wait_for_connect(sk, timeo);
		if (rc)
			goto fail;
	}

	/*
	 * Take the head entry off the accept queue (per the original note,
	 * a connection whose three-way handshake has completed).
	 */
	child = reqsk_queue_get_child(&icsk->icsk_accept_queue, sk);
	warn_on(child->sk_state == tcp_syn_recv);
	goto done;

fail:
	*err = rc;
done:
	release_sock(sk);
	return child;
}
#include <sys/socket.h>
int connect(int sockfd, const struct sockaddr *servaddr, socklen_t addrlen)
返回0——成功, -1——失败
/*
 * connect(2) entry point. The server address arrives in user space, so it is
 * copied into a kernel buffer, passed through the security hook, and handed
 * to the socket's ops->connect (inet_stream_connect for the AF_INET stream
 * ops) together with the file's flags.
 *
 * Notes carried over from the original header:
 *  - for 1003.1g, clean support for a bind to af_unspec (to break bindings)
 *    still needs to be added;
 *  - 1003.1g draft 6.3 is broken with respect to ax.25/netrom and other
 *    seqpacket protocols that take time to connect(), as it does not include
 *    the -einprogress status for such sockets.
 *
 * Returns 0 on success or a negative error code.
 */
syscall_define3(connect, int, fd, struct sockaddr __user *, uservaddr,
		int, addrlen)
{
	struct sockaddr_storage address;
	struct sockaddr *kaddr = (struct sockaddr *)&address;
	struct socket *sock;
	int err, fput_needed;

	sock = sockfd_lookup_light(fd, &err, &fput_needed);
	if (!sock)
		return err;

	err = move_addr_to_kernel(uservaddr, addrlen, kaddr);
	if (err < 0)
		goto put;

	err = security_socket_connect(sock, kaddr, addrlen);
	if (!err)
		/* Connect handler registered on the socket. */
		err = sock->ops->connect(sock, kaddr, addrlen,
					 sock->file->f_flags);
put:
	fput_light(sock->file, fput_needed);
	return err;
}
inet_stream_connect =》tcp_v4_connect
/*
 * this will initiate an outgoing connection.
 *
 * Validates the destination address, resolves a route, fills in the sock's
 * addresses and ports, moves the sock to syn-sent, picks a source port and
 * hashes the sock, then calls tcp_connect().
 * Returns 0 on success or a negative error code; on failures after the state
 * change the sock is put back into the close state.
 */
int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
{
struct inet_sock *inet = inet_sk(sk);
struct tcp_sock *tp = tcp_sk(sk);
struct sockaddr_in *usin = (struct sockaddr_in *)uaddr;
struct rtable *rt;
__be32 daddr, nexthop;
int tmp;
int err;
if (addr_len < sizeof(struct sockaddr_in))
return -einval;
if (usin->sin_family != af_inet)
return -eafnosupport;
nexthop = daddr = usin->sin_addr.s_addr;
/* With the srr option set, route towards the option's faddr instead. */
if (inet->opt && inet->opt->srr) {
if (!daddr)
return -einval;
nexthop = inet->opt->faddr;
}
/* Call ip_route_connect() to find a suitable route and store it in rt.
 * Per the original note the lookup has two stages: first the next-hop IP
 * address (from the route cache or the FIB), then the concrete neighbour
 * (from the neigh_table). */
tmp = ip_route_connect(&rt, nexthop, inet->inet_saddr,
rt_conn_flags(sk), sk->sk_bound_dev_if,
ipproto_tcp,
inet->inet_sport, usin->sin_port, sk, 1);
if (tmp < 0) {
if (tmp == -enetunreach)
ip_inc_stats_bh(sock_net(sk), ipstats_mib_outnoroutes);
return tmp;
}
/* Multicast and broadcast routes are rejected. */
if (rt->rt_flags & (rtcf_multicast | rtcf_broadcast)) {
ip_rt_put(rt);
return -enetunreach;
}
if (!inet->opt || !inet->opt->srr)
daddr = rt->rt_dst;
/* No source address chosen yet: take the one from the route. */
if (!inet->inet_saddr)
inet->inet_saddr = rt->rt_src;
inet->inet_rcv_saddr = inet->inet_saddr;
if (tp->rx_opt.ts_recent_stamp && inet->inet_daddr != daddr) {
/* reset inherited state */
tp->rx_opt.ts_recent = 0;
tp->rx_opt.ts_recent_stamp = 0;
tp->write_seq = 0;
}
if (tcp_death_row.sysctl_tw_recycle &&
!tp->rx_opt.ts_recent_stamp && rt->rt_dst == daddr) {
struct inet_peer *peer = rt_get_peer(rt);
/*
* vj's idea. we save last timestamp seen from
* the destination in peer table, when entering state
* time-wait * and initialize rx_opt.ts_recent from it,
* when trying new connection.
*/
if (peer) {
inet_peer_refcheck(peer);
if ((u32)get_seconds() - peer->tcp_ts_stamp <= tcp_paws_msl) {
tp->rx_opt.ts_recent_stamp = peer->tcp_ts_stamp;
tp->rx_opt.ts_recent = peer->tcp_ts;
}
}
}
inet->inet_dport = usin->sin_port;
inet->inet_daddr = daddr;
inet_csk(sk)->icsk_ext_hdr_len = 0;
if (inet->opt)
inet_csk(sk)->icsk_ext_hdr_len = inet->opt->optlen;
tp->rx_opt.mss_clamp = tcp_mss_default;
/* socket identity is still unknown (sport may be zero).
* however we set state to syn-sent and not releasing socket
* lock select source port, enter ourselves into the hash tables and
* complete initialization after this.
*/
tcp_set_state(sk, tcp_syn_sent);
err = inet_hash_connect(&tcp_death_row, sk);
if (err)
goto failure;
err = ip_route_newports(&rt, ipproto_tcp,
inet->inet_sport, inet->inet_dport, sk);
if (err)
goto failure;
/* ok, now commit destination to socket. */
sk->sk_gso_type = skb_gso_tcpv4;
sk_setup_caps(sk, &rt->dst);
/* Pick an initial sequence number only if none is set yet. */
if (!tp->write_seq)
tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
inet->inet_daddr,
inet->inet_sport,
usin->sin_port);
inet->inet_id = tp->write_seq ^ jiffies;
err = tcp_connect(sk);
/* rt is cleared before the error check, so the failure path below calls
 * ip_rt_put() with null for a tcp_connect() error. */
rt = null;
if (err)
goto failure;
return 0;
failure:
/*
* this unhashes the socket and releases the local port,
* if necessary.
*/
tcp_set_state(sk, tcp_close);
ip_rt_put(rt);
sk->sk_route_caps = 0;
inet->inet_dport = 0;
return err;
}