c语言下载http并提取http body-凯发app官方网站

在"c语言写soap"文章中提到用c下载soap,其实也就是http内容。
地址：http://blog.chinaunix.net/u1/54401/showart.php?id=2310264

不过后来发现有个问题：有些 web server（比如 lighttpd）不使用 Content-Length，而是在 HTTP 头和包体之间直接写出长度（应为 chunked 分块传输编码）。这样分析起来稍有困难。后来参考 httpfetch 代码，解决了这个问题，做到和常用浏览器兼容。
下载就采用那个文章上的方案，分析采用httpfetch,很简单，一个函数(其中调用几个工具函数)：

/* seconds to wait in select() for socket data; a negative value disables
 *    the timeout so the call can block indefinitely */
int timeout = 2;
/* custom User-Agent value; when NULL the default agent string is sent */
char *useragent = NULL;
/* Referer value to send; NULL means no Referer header is sent */
char *referer = NULL;
int hideuseragent = 0;              /* non-zero: send no User-Agent header */
int hidereferer = 1;                /* non-zero: send no Referer header */
/* # of redirects to follow; a negative value means no limit */
static int followredirects = default_redirects;
extern const char *http_errlist[];    /* array of http fetcher error messages */
extern char convertederror[128];    /* buffer to used when errors contain %d */
/* category of the most recent error; http_strerror() uses it to pick the
 *    message source */
static int errorsource = 0;
/* index into http_errlist, meaningful when errorsource is fetcher_error */
static int http_errno = 0;
static int errorint = 0;            /* when the error message has a %d in it,
                                    *    this variable is inserted */

    /*
    * actually downloads the page, registering a hit (donation)
    *    if the filebuf passed in is null, the url is downloaded and then
    *    freed; otherwise the necessary space is allocated for filebuf.
    *    returns size of download on success, -1 on error is set,
    */

int http_fetch(const char *url_tmp, char **filebuf)
    {
    fd_set rfds;
    struct timeval tv;
    char headerbuf[header_buf_size];
    char *tmp, *url, *pagebuf, *requestbuf = null, *host, *charindex;
    int sock, bytesread = 0, contentlength = -1, bufsize = request_buf_size;
    int i,
        ret = -1,
        tempsize,
        selectret,
        found = 0,    /* for redirects */
        redirectsfollowed = 0;

    if(url_tmp == null)
        {
        errorsource = fetcher_error;
        http_errno = hf_nullurl;
        return -1;
        }

    /* copy the url passed in into a buffer we can work with, change, etc. */
    url = (char*)malloc(strlen(url_tmp) 1);
    if(url == null)
        {
        errorsource = errno;
        return -1;
        }
    strncpy(url, url_tmp, strlen(url_tmp) 1);

    /* this loop allows us to follow redirects if need be. an afterthought,
    * added to provide this basic functionality. will hopefully be designed
    * better in 2.x.x ;) */
/*    while(!found &&
        (followredirects < 0 || redirectsfollowed < followredirects) )
*/ do
        {
        /* seek to the file path portion of the url */
        charindex = strstr(url, "://");
        if(charindex != null)
            {
            /* url contains a protocol field */
            charindex = strlen("://");
            host = charindex;
            charindex = strchr(charindex, '/');
            }
        else
            {
            host = (char *)url;
            charindex = strchr(url, '/');
            }

        /* compose a request string */
        requestbuf = (char*)malloc(bufsize);
        if(requestbuf == null)
            {
            free(url);
            errorsource = errno;
            return -1;
            }
        requestbuf[0] = 0;

        if(charindex == null)
            {
            /* the url has no '/' in it, assume the user is making a root-level
            *    request */
            tempsize = strlen("get /") strlen(http_version) 2;
            if(_checkbufsize(&requestbuf, &bufsize, tempsize) ||
                snprintf(requestbuf, bufsize, "get / %s\r\n", http_version) < 0)
                {
                free(url);
                free(requestbuf);
                errorsource = errno;
                return -1;
                }
            }
        else
            {
            tempsize = strlen("get ") strlen(charindex)
              strlen(http_version) 4;
             /* 4 is for ' ', '\r', '\n', and null */

            if(_checkbufsize(&requestbuf, &bufsize, tempsize) ||
                    snprintf(requestbuf, bufsize, "get %s %s\r\n",
                    charindex, http_version) < 0)
                {
                free(url);
                free(requestbuf);
                errorsource = errno;
                return -1;
                }
            }

        /* null out the end of the hostname if need be */
        if(charindex != null)
            *charindex = 0;

        /* use host: even though 1.0 doesn't specify it. some servers
        *    won't play nice if we don't send host, and it shouldn't
        *    hurt anything */
        ret = bufsize - strlen(requestbuf); /* space left in buffer */
        tempsize = (int)strlen("host: ") (int)strlen(host) 3;
        /* 3 for "\r\n\0" */
        if(_checkbufsize(&requestbuf, &bufsize, tempsize 128))
            {
            free(url);
            free(requestbuf);
            errorsource = errno;
            return -1;
            }
        strcat(requestbuf, "host: ");
        strcat(requestbuf, host);
        strcat(requestbuf, "\r\n");

        if(!hidereferer && referer != null)    /* no default referer */
            {
            tempsize = (int)strlen("referer: ") (int)strlen(referer) 3;
               /* 3 is for '\r', '\n', and null */
            if(_checkbufsize(&requestbuf, &bufsize, tempsize))
                {
                free(url);
                free(requestbuf);
                errorsource = errno;
                return -1;
                }
            strcat(requestbuf, "referer: ");
            strcat(requestbuf, referer);
            strcat(requestbuf, "\r\n");
            }

        if(!hideuseragent && useragent == null)
            {
            tempsize = (int)strlen("user-agent: ")
                (int)strlen(default_user_agent) (int)strlen(http_version) 4;
               /* 4 is for '\', '\r', '\n', and null */
            if(_checkbufsize(&requestbuf, &bufsize, tempsize))
                {
                free(url);
                free(requestbuf);
                errorsource = errno;
                return -1;
                }
            strcat(requestbuf, "user-agent: ");
            strcat(requestbuf, default_user_agent);
            strcat(requestbuf, "/");
            strcat(requestbuf, http_version);
            strcat(requestbuf, "\r\n");
            }
        else if(!hideuseragent)
            {
            tempsize = (int)strlen("user-agent: ") (int)strlen(useragent) 3;
               /* 3 is for '\r', '\n', and null */
            if(_checkbufsize(&requestbuf, &bufsize, tempsize))
                {
                free(url);
                free(requestbuf);
                errorsource = errno;
                return -1;
                }
            strcat(requestbuf, "user-agent: ");
            strcat(requestbuf, useragent);
            strcat(requestbuf, "\r\n");
            }

        tempsize = (int)strlen("connection: close\r\n\r\n");
        if(_checkbufsize(&requestbuf, &bufsize, tempsize))
            {
            free(url);
            free(requestbuf);
            errorsource = errno;
            return -1;
            }
        strcat(requestbuf, "connection: close\r\n\r\n");

        /* now free any excess memory allocated to the buffer */
        tmp = (char*)realloc(requestbuf, strlen(requestbuf) 1);
        if(tmp == null)
            {
            free(url);
            free(requestbuf);
            errorsource = errno;
            return -1;
            }
        requestbuf = tmp;

        sock = makesocket(host);        /* errorsource set within makesocket */
        if(sock == -1) { free(url); free(requestbuf); return -1;}

        free(url);
        url = null;

        if(write(sock, requestbuf, strlen(requestbuf)) == -1)
            {
            close(sock);
            free(requestbuf);
            errorsource = errno;
            return -1;
            }

        free(requestbuf);
        requestbuf = null;

        /* grab enough of the response to get the metadata */
        ret = _http_read_header(sock, headerbuf);    /* errorsource set within */
        if(ret < 0) { close(sock); return -1; }

        /* get the return code */
        charindex = strstr(headerbuf, "http/");
        if(charindex == null)
            {
            close(sock);
            errorsource = fetcher_error;
            http_errno = hf_freturncode;
            return -1;
            }
        while(*charindex != ' ')
            charindex ;
        charindex ;

        ret = sscanf(charindex, "%d", &i);
        if(ret != 1)
            {
            close(sock);
            errorsource = fetcher_error;
            http_errno = hf_creturncode;
            return -1;
            }
        if(i<200 || i>307)
            {
            close(sock);
            errorint = i;    /* status code, to be inserted in error string */
            errorsource = fetcher_error;
            http_errno = hf_statuscode;
            return -1;
            }

        /* if a redirect, repeat operation until final url is found or we
        * redirect followredirects times. note the case sensitive "location",
        * should probably be made more robust in the future (without relying
        * on the non-standard strcasecmp()).
        * this bit mostly by dean wilder, tweaked by me */
        if(i >= 300)
            {
            redirectsfollowed ;

            /* pick up redirect url, allocate new url, and repeat process */
            charindex = strstr(headerbuf, "location:");
            if(!charindex)
                {
                close(sock);
                errorint = i; /* status code, to be inserted in error string */
                errorsource = fetcher_error;
                http_errno = hf_cantredirect;
                return -1;
                }
            charindex = strlen("location:");
            /* skip any whitespace... */
            while(*charindex != '\0' && isspace(*charindex))
                charindex ;
            if(*charindex == '\0')
                {
                close(sock);
                errorint = i; /* status code, to be inserted in error string */
                errorsource = fetcher_error;
                http_errno = hf_cantredirect;
                return -1;
                }

            i = strcspn(charindex, " \r\n");
            if(i > 0)
                {
                url = (char *)malloc(i 1);
                strncpy(url, charindex, i);
                url[i] = '\0';
                }
            else
                /* found 'location:' but contains no url! we'll handle it as
                 * 'found', hopefully the resulting document will give the user
                 * a hint as to what happened. */
                found = 1;
            }
        else
            found = 1;
        }

        while(!found &&
                (followredirects < 0 || redirectsfollowed <= followredirects) );

    if(url) /* redirection code may malloc this, then exceed followredirects */
        {
        free(url);
        url = null;
        }

    if(redirectsfollowed >= followredirects && !found)
        {
        close(sock);
        errorint = followredirects; /* to be inserted in error string */
        errorsource = fetcher_error;
        http_errno = hf_maxredirects;
        return -1;
        }

    /*
    * parse out about how big the data segment is.
    *    note that under current http standards (1.1 and prior), the
    *    content-length field is not guaranteed to be accurate or even present.
    *    i just use it here so i can allocate a ballpark amount of memory.
    *
    * note that some servers use different capitalization
    */
    charindex = strstr(headerbuf, "content-length:");
    if(charindex == null)
        charindex = strstr(headerbuf, "content-length:");

    if(charindex != null)
        {
        ret = sscanf(charindex strlen("content-length: "), "%d",
            &contentlength);
        if(ret < 1)
            {
            close(sock);
            errorsource = fetcher_error;
            http_errno = hf_contentlen;
            return -1;
            }
        }

    /* allocate enough memory to hold the page */
    if(contentlength == -1)
        contentlength = default_page_buf_size;

    pagebuf = (char *)malloc(contentlength);
    if(pagebuf == null)
        {
        close(sock);
        errorsource = errno;
        return -1;
        }

    /* begin reading the body of the file */
    while(ret > 0)
        {
        fd_zero(&rfds);
        fd_set(sock, &rfds);
        tv.tv_sec = timeout;
        tv.tv_usec = 0;

        if(timeout >= 0)
            selectret = select(sock 1, &rfds, null, null, &tv);
        else        /* no timeout, can block indefinately */
            selectret = select(sock 1, &rfds, null, null, null);

        if(selectret == 0)
            {
            errorsource = fetcher_error;
            http_errno = hf_datatimeout;
            errorint = timeout;
            close(sock);
            free(pagebuf);
            return -1;
            }
        else if(selectret == -1)
            {
            close(sock);
            free(pagebuf);
            errorsource = errno;
            return -1;
            }

        ret = read(sock, pagebuf bytesread, contentlength);
        if(ret == -1)
            {
            close(sock);
            free(pagebuf);
            errorsource = errno;
            return -1;
            }

        bytesread = ret;

        if(ret > 0)
            {
            /* to be tolerant of inaccurate content-length fields, we'll
            *    allocate another read-sized chunk to make sure we have
            *    enough room.
            */
            tmp = (char *)realloc(pagebuf, bytesread contentlength);
            if(tmp == null)
                {
                close(sock);
                free(pagebuf);
                errorsource = errno;
                return -1;
                }
            pagebuf = tmp;
            }
        }

    /*
    * the download buffer is too large. trim off the safety padding.
     * note that we add one null byte to the end of the data, as it may not
     * already be null terminated and we can't be sure what type of data it
     * is or what the caller will do with it.
    */
    tmp = (char *)realloc(pagebuf, bytesread 1);
        /* tmp shouldn't be null, since we're _shrinking_ the buffer,
        *    and if it did fail, we could go on with the too-large buffer,
        *    but something would definately be wrong, so we'll just give
        *    an error message */
    if(tmp == null)
        {
        close(sock);
        free(pagebuf);
        errorsource = errno;
        return -1;
        }
    pagebuf = tmp;
    pagebuf[bytesread] = '\0'; /* null terminate the data */

    if(filebuf == null)    /* they just wanted us to "hit" the url */
        free(pagebuf);
    else
        *filebuf = pagebuf;

    close(sock);
    return bytesread;
    }

/*
 * makes sure the NUL-terminated string buffer *buf, whose capacity is
 *    *bufsize bytes, has more than `more` bytes free after the string and
 *    its terminator. when it does not, the buffer is grown by more + 1
 *    bytes with realloc() and *buf / *bufsize are updated.
 *    returns 0 on success, -1 if realloc() fails (*buf and *bufsize are
 *    then left unchanged and the old buffer is still valid).
 */
int _checkbufsize(char **buf, int *bufsize, int more)
    {
    char *tmp;
    int roomleft = *bufsize - (int)(strlen(*buf) + 1);

    if(roomleft > more)
        return 0;

    tmp = realloc(*buf, *bufsize + more + 1);
    if(tmp == NULL)
        return -1;

    *buf = tmp;
    *bufsize += more + 1;    /* record the capacity actually allocated */
    return 0;
    }
/*
 * opens a tcp connection to `host`, given as "name" or "name:port"; when
 *    no port is present port_number is used.
 *    returns the connected socket descriptor, or -1 on failure with
 *    errorsource set. the host string is not modified.
 */
int makesocket(const char *host)
    {
    int sock;                                        /* socket descriptor */
    struct sockaddr_in sa;                            /* socket address */
    struct hostent *hp;                                /* host entity */
    int port = port_number;
    const char *colon;
    char *hostname;
    size_t hostlen;

    /* check for port number specified in url */
    colon = strchr(host, ':');
    if(colon != NULL)
        {
        port = atoi(colon + 1);
        hostlen = (size_t)(colon - host);
        }
    else
        hostlen = strlen(host);

    /* resolve a private copy of the name part, so the caller's (const)
    *    string does not have to be cut at the ':' */
    hostname = malloc(hostlen + 1);
    if(hostname == NULL) { errorsource = errno; return -1; }
    memcpy(hostname, host, hostlen);
    hostname[hostlen] = '\0';

    hp = gethostbyname(hostname);
    free(hostname);
    if(hp == NULL) { errorsource = h_errno; return -1; }

    /* sa only has room for an ipv4 address; refuse anything larger rather
    *    than overrun it */
    if(hp->h_length < 0 || (size_t)hp->h_length > sizeof(sa.sin_addr))
        {
        errno = EAFNOSUPPORT;
        errorsource = errno;
        return -1;
        }

    /* copy host address from hostent to (server) socket address */
    memset(&sa, 0, sizeof(sa));
    memcpy(&sa.sin_addr, hp->h_addr_list[0], (size_t)hp->h_length);
    sa.sin_family = hp->h_addrtype;        /* address family of the result */
    sa.sin_port = htons((unsigned short)port);    /* put portnum into sockaddr */

    sock = socket(hp->h_addrtype, SOCK_STREAM, 0);
    if(sock == -1) { errorsource = errno; return -1; }

    if(connect(sock, (struct sockaddr *)&sa, sizeof(sa)) == -1)
        {
        errorsource = errno;    /* saved before close() can change errno */
        close(sock);
        return -1;
        }

    return sock;
    }

/*
 * reads the response header from `sock` one byte at a time into
 *    `headerptr`, which must have room for header_buf_size bytes.
 *    reading stops after the empty line that ends the header (two line
 *    feeds in a row, carriage returns ignored), when the peer closes the
 *    connection, or when the buffer is full. the stored text is
 *    NUL-terminated with the trailing line breaks removed.
 *    returns the number of bytes read from the socket, or -1 on error or
 *    timeout with errorsource set.
 */
int _http_read_header(int sock, char *headerptr)
    {
    fd_set rfds;
    struct timeval tv;
    char *start = headerptr;
    int bytesread = 0, newlines = 0, ret, selectret;

    /* stop one byte short of the buffer size so the terminator always fits */
    while(newlines != 2 && bytesread < header_buf_size - 1)
        {
        FD_ZERO(&rfds);
        FD_SET(sock, &rfds);
        tv.tv_sec = timeout;
        tv.tv_usec = 0;

        /* a negative timeout means block indefinitely */
        selectret = select(sock + 1, &rfds, NULL, NULL,
            timeout >= 0 ? &tv : NULL);

        if(selectret == 0)
            {
            errorsource = fetcher_error;
            http_errno = hf_headtimeout;
            errorint = timeout;
            return -1;
            }
        else if(selectret == -1) { errorsource = errno; return -1; }

        ret = read(sock, headerptr, 1);
        if(ret == -1) { errorsource = errno; return -1; }
        if(ret == 0)            /* peer closed: keep what was received */
            break;
        bytesread++;

        if(*headerptr == '\n')            /* lf is the separator */
            newlines++;
        else if(*headerptr != '\r')        /* cr neither counts nor resets */
            newlines = 0;

        headerptr++;
        }

    /* snip the trailing line breaks, never stepping in front of the buffer */
    while(headerptr > start &&
            (headerptr[-1] == '\r' || headerptr[-1] == '\n'))
        headerptr--;
    *headerptr = '\0';
    return bytesread;
    }

/*
 * returns a message describing the most recent error recorded in
 *    errorsource / http_errno. messages whose template contains "%d" are
 *    formatted with errorint into the shared convertederror buffer, so that
 *    result is overwritten by the next such call.
 *
 *    NOTE(review): errorsource is compared with the current errno and
 *    h_errno values below; these look like they were meant to be fixed
 *    source-tag constants -- confirm against the project header.
 *    NOTE(review): have_hstrerror is tested in lower case; a configure
 *    script would normally define an upper-case name -- confirm.
 */
const char *http_strerror()
    {
    if(errorsource == errno)
        return strerror(errno);
    else if(errorsource == h_errno)
#ifdef have_hstrerror
        return hstrerror(h_errno);
#else
        return http_errlist[hf_herror];
#endif
    else if(errorsource == fetcher_error)
        {
        const char *originalerror = http_errlist[http_errno];
        const char *stringindex = strstr(originalerror, "%d");

        if(stringindex == NULL)
            return originalerror;

        /* the error string has a %d in it: emit the text before it, then
        *    errorint, then the text after it. snprintf keeps the result
        *    inside convertederror even for long templates */
        snprintf(convertederror, sizeof(convertederror), "%.*s%d%s",
            (int)(stringindex - originalerror), originalerror,
            errorint, stringindex + 2);

        return convertederror;
        }

    return http_errlist[hf_metaerror];    /* should never happen */
    }

阅读(4716) | 评论(0) | 转发(0) |

上一篇：mac os 10.5 硬盘安装升级

下一篇：live555 media server文件播放与读内存播放

给主人留下些什么吧！~~

| | | | |

感谢所有关心和支持过chinaunix的朋友们