Linux系统调用

时间：2021-06-29 15:58:17 收藏：0 阅读：0

Linux系统调用

原文：https://www.cnblogs.com/ycw0923/p/12913925.html

一.为何要有系统调用

Unix系统的运行状态分为用户态和内核态：在用户态下，程序不能直接访问内核数据结构或内核程序，只有在内核态下才可以访问。需要内核服务的进程通过系统调用这一特殊机制发出请求；每个系统调用都带有一组用于标识进程请求的参数，并通过执行特定的CPU指令完成从用户态到内核态的切换。

二.系统调用过程

32位系统中，通过int $0x80指令触发系统调用。其中EAX寄存器用于传递系统调用号，参数按顺序赋值给EBX、ECX、EDX、ESI、EDI、EBP这6个寄存器。

64位系统则使用syscall指令来触发系统调用，用RAX寄存器传递系统调用号，RDI、RSI、RDX、R10、R8、R9这6个寄存器依次传递参数（普通函数调用约定中的第4个参数寄存器RCX在这里换成了R10，因为syscall指令会用RCX保存返回地址、用R11保存RFLAGS）。

下面以64位系统中的42号，connect系统调用作为例子

connect是socket网络通信中的函数，客户端用它与服务端建立连接。connect接受三个参数，分别是客户端套接字的文件描述符、指向sockaddr结构体的指针，以及该地址结构体的长度（IPv4下为sizeof(struct sockaddr_in)，即16字节）。若连接成功返回0，否则返回-1。

下面是客户端的源代码

#include <sys/socket.h>
#include <sys/types.h>
#include <netdb.h>
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
#include "rio.h"
#define MAXLINE 100

/* Connects to the given host and port; yields the connected socket descriptor, or -1 on failure. */
int open_clientfd(char*,char*);

/*
 * Echo client entry point.
 *
 * Usage: <prog> <host> <port>
 *
 * Connects to host:port, then for every line read from stdin sends the line
 * to the server and prints the line the server sends back. Exits with status
 * 1 on a usage error or when the connection cannot be established, and with
 * status 0 once stdin reaches EOF or the server stops answering.
 */
int main(int argc,char** argv){
    int clientfd;
    char *host,*port,buf[MAXLINE];
    rio_t rio;

    if(argc != 3){
        fprintf(stderr,"usage: %s <host> <port>\n",argv[0]);
        exit(1);
    }
    host = argv[1];
    port = argv[2];

    /* open_clientfd reports failure with -1; never hand that to the rio layer. */
    clientfd = open_clientfd(host,port);
    if(clientfd < 0){
        fprintf(stderr,"failed to connect to %s:%s\n",host,port);
        exit(1);
    }

    rio_readinitb(&rio,clientfd);
    while(fgets(buf,MAXLINE,stdin)!=NULL){
        rio_writen(clientfd,buf,strlen(buf));
        /*
         * Stop instead of printing a stale buffer when nothing was read.
         * NOTE(review): assumes rio_readlineb returns 0 on EOF and a negative
         * value on error (CSAPP convention) - confirm against rio.h.
         */
        if(rio_readlineb(&rio,buf,MAXLINE) <= 0)
            break;
        fputs(buf,stdout);
    }
    close(clientfd);
    exit(0);
}

/*
 * Open a TCP connection to hostname:port.
 *
 * hostname: host name or address string handed to getaddrinfo.
 * port:     numeric service string (AI_NUMERICSERV rejects service names).
 *
 * Tries every address returned by getaddrinfo in order until one connects.
 * Returns the connected socket descriptor, or -1 when name resolution fails
 * or no candidate address could be connected.
 */
int open_clientfd(char* hostname,char* port){
    int clientfd = -1;
    struct addrinfo hints,*listp = NULL,*p;

    memset(&hints,0,sizeof hints);
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_NUMERICSERV | AI_ADDRCONFIG;

    /* On failure listp is not filled in, so it must not be walked or freed. */
    if(getaddrinfo(hostname,port,&hints,&listp) != 0)
        return -1;

    /* getaddrinfo returns a list of candidate addresses; try each in turn. */
    for(p=listp;p;p=p->ai_next){
        clientfd = socket(p->ai_family,p->ai_socktype,p->ai_protocol);
        if(clientfd < 0)
            continue;
        /* Arguments: socket descriptor, address structure, address length. */
        if(connect(clientfd,p->ai_addr,p->ai_addrlen) == 0)
            break;              /* connected */
        close(clientfd);        /* failed, try the next candidate */
    }
    freeaddrinfo(listp);

    /* p is NULL when the loop ran off the end without connecting. */
    return p ? clientfd : -1;
}

服务端是采用基于I/O多路复用的并发事件驱动服务器，基于select函数

#include <sys/socket.h>
#include <sys/types.h>
#include <sys/select.h>
#include <netdb.h>
#include <stdlib.h>
#include <stdio.h>
#include <memory.h>
#include <unistd.h>
#include <errno.h>
#include "rio.h"

#define LISTENQ 1024
#define MAXLINE 100

/* Bookkeeping for the select-based server: the watched descriptors plus one slot per connected client. */
typedef struct{
    int maxfd;                      /* largest descriptor in read_set; select is called with maxfd+1 */
    fd_set read_set;                /* every descriptor being watched (listening socket and clients) */
    fd_set ready_set;               /* copy of read_set handed to select, which overwrites it */
    int nready;                     /* value returned by select, counted down as descriptors are handled */
    int maxi;                       /* highest index of clientfd[] that has been used */
    int clientfd[FD_SETSIZE];       /* client descriptors; -1 marks a free slot */
    rio_t clientrio[FD_SETSIZE];    /* rio read state for the client in the same slot */
}pool;

/* Running total of bytes received from all clients (updated in check_clients). */
int bytes_cnt = 0;

/* Forward declarations. echo and command are defined below but are not called from the code shown here. */
int open_listenfd(char*);
void echo(int);
void command();
void init_pool(int,pool*);
void add_client(int,pool*);
void check_clients(pool*);

/*
 * Echo server entry point.
 *
 * Usage: <prog> <port>
 *
 * Listens on the given port and multiplexes the listening socket and all
 * connected clients with select. New connections are registered in the pool;
 * readable clients are serviced by check_clients. Exits with status 1 on a
 * usage error, when the listening socket cannot be created, or when select
 * fails for a reason other than EINTR. Otherwise it never returns.
 */
int main(int argc,char** argv){
    int listenfd,connfd;
    socklen_t clientlen;
    struct sockaddr_storage clientaddr;
    char client_hostname[MAXLINE]; char client_port[MAXLINE];
    static pool client_pool;    /* static: too large to put on the stack comfortably */

    if(argc != 2){
        fprintf(stderr,"usage: %s <port>\n",argv[0]);
        exit(1);
    }

    listenfd = open_listenfd(argv[1]);
    if(listenfd < 0){
        fprintf(stderr,"failed to listen on port %s\n",argv[1]);
        exit(1);
    }
    init_pool(listenfd,&client_pool);

    while(1){
        /* select overwrites its set argument, so always work on a fresh copy. */
        client_pool.ready_set = client_pool.read_set;
        client_pool.nready = select(client_pool.maxfd+1,&client_pool.ready_set,NULL,NULL,NULL);
        if(client_pool.nready < 0){
            if(errno == EINTR)
                continue;       /* interrupted by a signal: just retry */
            perror("select");
            exit(1);
        }

        if(FD_ISSET(listenfd,&client_pool.ready_set)){
            clientlen = sizeof(struct sockaddr_storage);
            connfd = accept(listenfd,(struct sockaddr *)&clientaddr,&clientlen);
            if(connfd < 0){
                perror("accept");
                client_pool.nready--;   /* the listening descriptor has been handled */
            }
            else if(connfd >= FD_SETSIZE){
                /* FD_SET is undefined for descriptors outside [0, FD_SETSIZE). */
                fprintf(stderr,"descriptor %d too large for select, dropping connection\n",connfd);
                close(connfd);
                client_pool.nready--;
            }
            else{
                add_client(connfd,&client_pool);    /* also counts down nready */
                /* Only print the peer name when the lookup filled the buffers. */
                if(getnameinfo((struct sockaddr *)&clientaddr,clientlen,client_hostname,MAXLINE,client_port,MAXLINE,0) == 0)
                    printf("连接到:(%s,%s)\n",client_hostname,client_port);
            }
        }
        check_clients(&client_pool);
    }
}

/*
 * Create a listening TCP socket bound to the given port.
 *
 * port: numeric service string (AI_NUMERICSERV rejects service names).
 *
 * Walks the wildcard addresses returned by getaddrinfo (AI_PASSIVE) and binds
 * to the first one that works, with SO_REUSEADDR set on the socket.
 * Returns the listening descriptor, or -1 when resolution, bind, or listen
 * fails.
 */
int open_listenfd(char* port){
    int listenfd = -1; int optval = 1;
    struct addrinfo hints,*listp = NULL,*p;

    memset(&hints,0,sizeof hints);
    hints.ai_socktype = SOCK_STREAM;
    hints.ai_flags = AI_PASSIVE | AI_ADDRCONFIG | AI_NUMERICSERV;

    /* On failure listp is not filled in, so it must not be walked or freed. */
    if(getaddrinfo(NULL,port,&hints,&listp) != 0)
        return -1;

    for(p=listp;p;p=p->ai_next){
        listenfd = socket(p->ai_family,p->ai_socktype,p->ai_protocol);
        if(listenfd < 0)
            continue;
        /* Allow the port to be rebound right after a restart. */
        setsockopt(listenfd,SOL_SOCKET,SO_REUSEADDR,(const void*)&optval,sizeof(int));
        if(bind(listenfd,p->ai_addr,p->ai_addrlen)==0)
            break;
        close(listenfd);
    }
    freeaddrinfo(listp);
    if(!p) return -1;   /* no address could be bound */

    /* Bound successfully; start listening. LISTENQ is the pending-connection backlog. */
    if(listen(listenfd,LISTENQ)<0){
        close(listenfd);
        return -1;
    }
    return listenfd;
}

/*
 * Echo every line received on connfd back to the sender until the peer
 * closes the connection, logging each line and its length to stdout.
 *
 * connfd: connected socket descriptor.
 */
void echo(int connfd){
    /*
     * Signed on purpose: with an unsigned type a negative error return would
     * turn into a huge length and be passed to rio_writen.
     * NOTE(review): assumes rio_readlineb returns 0 on EOF and a negative
     * value on error (CSAPP convention) - confirm against rio.h.
     */
    ssize_t n;
    char buf[MAXLINE];
    rio_t rio;

    rio_readinitb(&rio,connfd);
    while((n = rio_readlineb(&rio,buf,MAXLINE)) > 0){
        printf("服务器接受到: %d 字节\n",(int)n);
        printf("%s\n",buf);
        rio_writen(connfd,buf,(size_t)n);
    }
}

/*
 * Read one line from standard input and write it back to standard output.
 * Terminates the process with status 0 when no line can be read.
 */
void command(){
    char line[MAXLINE];

    if(fgets(line,MAXLINE,stdin) == NULL){
        exit(0);
    }
    fputs(line,stdout);
}

/*
 * Reset the pool so that it watches only the listening descriptor and has
 * no clients registered.
 *
 * listenfd: listening socket descriptor.
 * p:        pool to initialise.
 */
void init_pool(int listenfd,pool* p){
    int slot = 0;

    /* The watched set starts out holding just the listening descriptor. */
    FD_ZERO(&p->read_set);
    FD_SET(listenfd,&p->read_set);
    p->maxfd = listenfd;

    /* No slot is in use yet: mark every entry free. */
    p->maxi = -1;
    while(slot < FD_SETSIZE){
        p->clientfd[slot] = -1;
        slot++;
    }
}

/*
 * Register a newly accepted connection in the pool.
 *
 * connfd: descriptor returned by accept.
 * p:      pool to add the client to.
 *
 * Counts down p->nready (the listening descriptor has been handled), stores
 * connfd in the first free slot, initialises its rio state, and adds it to
 * the watched set. A descriptor that cannot be tracked - out of range for
 * fd_set, or no free slot left - is closed so it does not leak.
 */
void add_client(int connfd,pool* p){
    int i;
    p->nready--;

    /* FD_SET is only defined for descriptors in [0, FD_SETSIZE). */
    if(connfd < 0 || connfd >= FD_SETSIZE){
        fprintf(stderr,"add_client error: descriptor %d out of range\n",connfd);
        if(connfd >= 0)
            close(connfd);
        return;
    }

    for(i=0;i<FD_SETSIZE;i++){
        if(p->clientfd[i]<0){
            p->clientfd[i] = connfd;
            rio_readinitb(&p->clientrio[i],connfd);

            FD_SET(connfd,&p->read_set);
            /* Keep the select bound and the scan bound up to date. */
            if(connfd > p->maxfd)
                p->maxfd = connfd;
            if(i > p->maxi)
                p->maxi = i;
            break;
        }
    }
    if(i == FD_SETSIZE){
        printf("add_client error: 客户端过多");
        close(connfd);  /* nobody else holds this descriptor; drop the connection */
    }
}

/*
 * Service every client whose descriptor select reported as readable.
 *
 * p: pool whose ready_set and nready were just filled in by select.
 *
 * For each ready client one line is read and echoed back, and the global
 * bytes_cnt is updated. When the read yields no data the connection is
 * closed and its slot is released.
 */
void check_clients(pool* p){
    int i,connfd;
    ssize_t n;
    char buf[MAXLINE];
    rio_t *rio;

    for(i=0;i<=p->maxi && p->nready>0;i++){
        connfd = p->clientfd[i];
        /* Free slots hold -1; descriptor 0 is a valid client descriptor. */
        if(connfd < 0 || !FD_ISSET(connfd,&p->ready_set))
            continue;

        /*
         * Work on the slot's own rio state. Reading through a by-value copy
         * would throw away whatever was buffered beyond the first line.
         */
        rio = &p->clientrio[i];
        p->nready--;

        /*
         * NOTE(review): assumes rio_readlineb returns 0 on EOF and a negative
         * value on error (CSAPP convention) - confirm against rio.h.
         */
        n = rio_readlineb(rio,buf,MAXLINE);
        if(n > 0){
            bytes_cnt += (int)n;
            printf("服务器收到 %d (总共%d) 字节 在 文件描述符%d ",(int)n,bytes_cnt,connfd);
            rio_writen(connfd,buf,(size_t)n);
            printf("内容:%s\n",buf);
        }
        else{
            /* EOF or read error: drop the client and free its slot. */
            close(connfd);
            FD_CLR(connfd,&p->read_set);
            p->clientfd[i] = -1;
        }
    }
}

修改connect函数，以汇编指令的形式进入系统调用

通过gdb查看connect函数传参用到的寄存器

技术图片

其中connect的系统调用号为0x2a

技术图片

        /*
         * Issue connect(2) directly with the x86-64 `syscall` instruction.
         *
         * The operands are bound straight to the registers the kernel reads,
         * so the compiler knows which registers are in use and no manual
         * moves are needed:
         *   rax = system call number (0x2a = connect), also the return value
         *   rdi = socket descriptor, rsi = address pointer, rdx = address length
         * `syscall` itself overwrites rcx and r11, and the kernel reads the
         * sockaddr from memory, hence the clobber list.
         *
         * Unlike the libc wrapper, the raw system call returns a negative
         * errno value on failure rather than -1, so test `ret < 0`.
         */
        asm volatile(
                     "syscall"
                     :"=a"(ret)
                     :"a"(0x2a),"D"(clientfd),"S"(p->ai_addr),"d"(p->ai_addrlen)
                     :"rcx","r11","memory"
        );

测试是否通过汇编正常调用connect函数，服务端监听45678端口

技术图片

客户端试图连接到45678端口

技术图片

看来是可以正常触发的，其中50962是客户端进程的端口号

接下来重新静态编译客户端程序：gcc clis.c -o ciis -static。如果不是静态编译，程序在qemu下不能正常运行，会提示 not found（根文件系统中缺少所需的动态链接库）。

然后重新打包系统根目录rootfs

打开qemu，通过gdb在entry_SYSCALL_64处打断点

技术图片

进入home目录后，执行./ciis localhost 1256

由于每次按下键盘都会触发一个中断，每个中断都会进入断点，所以调试的过程非常慢

技术图片

进入entry_SYSCALL_64后，会把寄存器的值保存到内核栈上的pt_regs结构体中

ENTRY(entry_SYSCALL_64)
    UNWIND_HINT_EMPTY
    /*
     * Interrupts are off on entry.
     * We do not frame this tiny irq-off block with TRACE_IRQS_OFF/ON,
     * it is too small to ever cause noticeable irq latency.
     */

    swapgs
    /* tss.sp2 is scratch space. */
    movq    %rsp, PER_CPU_VAR(cpu_tss_rw + TSS_sp2)
    SWITCH_TO_KERNEL_CR3 scratch_reg=%rsp
    movq    PER_CPU_VAR(cpu_current_top_of_stack), %rsp

    /* Construct struct pt_regs on stack */
    pushq    $__USER_DS                /* pt_regs->ss */
    pushq    PER_CPU_VAR(cpu_tss_rw + TSS_sp2)    /* pt_regs->sp */
    pushq    %r11                    /* pt_regs->flags */
    pushq    $__USER_CS                /* pt_regs->cs */
    pushq    %rcx                    /* pt_regs->ip */
GLOBAL(entry_SYSCALL_64_after_hwframe)
    pushq    %rax                    /* pt_regs->orig_ax */

进入entry_SYSCALL_64后，会把当前各寄存器的值依次压栈，保存到pt_regs结构体中

继续单步执行，会执行到call do_syscall_64，即调用do_syscall_64函数

技术图片

do_syscall_64定义在arch/x86/entry/common.c中

技术图片

regs->ax = sys_call_table[nr](regs);

这句会查找对应的系统调用号，然后传入regs结构体，regs中保存着各个寄存器的值，之后会把调用返回值传给ax寄存器

最后会执行sysret指令恢复堆栈

技术图片

USERGS_SYSRET64是一个宏，展开后先用 swapgs 指令把内核 GS 换回用户 GS，再用 sysret 指令从系统调用处理中返回用户态

至此，一段系统调用结束

总结

操作系统对于中断处理流程一般为：

关中断：CPU关闭中断响应，即不再接受其它外部中断请求
保存断点：将发生中断处的指令地址压入堆栈，以使中断处理完后能正确地返回。
识别中断源：CPU识别中断的来源，确定中断类型号，从而找到相应的中断服务程序的入口地址。
保护现场：将与中断处理有关的寄存器（中断服务程序中要使用的寄存器）以及标志寄存器的内容压入堆栈。
执行中断服务程序：转到中断服务程序入口开始执行，可在适当时刻重新开放中断，以便允许响应较高优先级的外部中断。
恢复现场并返回：把“保护现场”时压入堆栈的信息弹回原寄存器，然后执行中断返回指令（IRET），从而返回主程序继续运行。

在内核初始化时，会执行trap_init函数，把中断向量表拷贝到指定位置，syscall_64.c中定义着系统调用表sys_call_table，在cpu_init时完成初始化。执行int 0x80时，硬件找到在中断描述符表中的表项，在自动切换到内核栈 (tss.ss0 : tss.esp0) 后根据中断描述符的 segment selector 在 GDT / LDT 中找到对应的段描述符，从段描述符拿到段的基址，加载到 cs ，将 offset 加载到 eip。最后硬件将 ss / sp / eflags / cs / ip / error code 依次压到内核栈。返回时，iret 将先前压栈的 ss / sp / eflags / cs / ip 弹出，恢复用户态调用时的寄存器上下文。

syscall则是64位系统中，为了加速系统调用通过引入新的 MSR 来存放内核态的代码和栈的段号和偏移量，从而实现快速跳转。

技术图片