在"c语言写soap"文章中提到用c下载soap,其实也就是http内容。
地址:http://blog.chinaunix.net/u1/54401/showart.php?id=2310264
不过后来发现有个问题,有些 web server,比如 lighttpd,不使用 Content-Length 头,而是采用分块传输编码(Transfer-Encoding: chunked),在 HTTP 头和包体之间直接写出数据块的长度。这样分析起来稍有困难。后来参考 httpfetch 代码,解决了这个问题,做到和常用浏览器兼容。
下载就采用那个文章上的方案,分析采用httpfetch,很简单,一个函数(其中调用几个工具函数):
/* Seconds to wait for socket data in select(); the readers in this file
 * treat a negative value as "block indefinitely". */
int timeout = 2;
/* Custom User-Agent value; when NULL (and not hidden) http_fetch sends
 * default_user_agent "/" http_version instead. */
char *useragent = NULL;
/* Referer value; there is no default, so nothing is sent while it is NULL. */
char *referer = NULL;
int hideuseragent = 0; /* non-zero: omit the User-Agent header */
int hidereferer = 1;   /* non-zero: omit the Referer header */
static int followredirects = default_redirects; /* # of redirects to follow;
                                                 * negative means no limit */
extern const char *http_errlist[]; /* array of http fetcher error messages */
extern char convertederror[128];   /* buffer used when errors contain %d */
static int errorsource = 0; /* which subsystem produced the last error */
static int http_errno = 0;  /* index into http_errlist for fetcher errors */
static int errorint = 0;    /* when the error message has a %d in it,
                             * this variable is inserted */
/*
* actually downloads the page, registering a hit (donation)
* if the filebuf passed in is null, the url is downloaded and then
* freed; otherwise the necessary space is allocated for filebuf.
* returns size of download on success, -1 on error is set,
*/
int http_fetch(const char *url_tmp, char **filebuf)
{
fd_set rfds;
struct timeval tv;
char headerbuf[header_buf_size];
char *tmp, *url, *pagebuf, *requestbuf = null, *host, *charindex;
int sock, bytesread = 0, contentlength = -1, bufsize = request_buf_size;
int i,
ret = -1,
tempsize,
selectret,
found = 0, /* for redirects */
redirectsfollowed = 0;
if(url_tmp == null)
{
errorsource = fetcher_error;
http_errno = hf_nullurl;
return -1;
}
/* copy the url passed in into a buffer we can work with, change, etc. */
url = (char*)malloc(strlen(url_tmp) 1);
if(url == null)
{
errorsource = errno;
return -1;
}
strncpy(url, url_tmp, strlen(url_tmp) 1);
/* this loop allows us to follow redirects if need be. an afterthought,
* added to provide this basic functionality. will hopefully be designed
* better in 2.x.x ;) */
/* while(!found &&
(followredirects < 0 || redirectsfollowed < followredirects) )
*/ do
{
/* seek to the file path portion of the url */
charindex = strstr(url, "://");
if(charindex != null)
{
/* url contains a protocol field */
charindex = strlen("://");
host = charindex;
charindex = strchr(charindex, '/');
}
else
{
host = (char *)url;
charindex = strchr(url, '/');
}
/* compose a request string */
requestbuf = (char*)malloc(bufsize);
if(requestbuf == null)
{
free(url);
errorsource = errno;
return -1;
}
requestbuf[0] = 0;
if(charindex == null)
{
/* the url has no '/' in it, assume the user is making a root-level
* request */
tempsize = strlen("get /") strlen(http_version) 2;
if(_checkbufsize(&requestbuf, &bufsize, tempsize) ||
snprintf(requestbuf, bufsize, "get / %s\r\n", http_version) < 0)
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
}
else
{
tempsize = strlen("get ") strlen(charindex)
strlen(http_version) 4;
/* 4 is for ' ', '\r', '\n', and null */
if(_checkbufsize(&requestbuf, &bufsize, tempsize) ||
snprintf(requestbuf, bufsize, "get %s %s\r\n",
charindex, http_version) < 0)
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
}
/* null out the end of the hostname if need be */
if(charindex != null)
*charindex = 0;
/* use host: even though 1.0 doesn't specify it. some servers
* won't play nice if we don't send host, and it shouldn't
* hurt anything */
ret = bufsize - strlen(requestbuf); /* space left in buffer */
tempsize = (int)strlen("host: ") (int)strlen(host) 3;
/* 3 for "\r\n\0" */
if(_checkbufsize(&requestbuf, &bufsize, tempsize 128))
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
strcat(requestbuf, "host: ");
strcat(requestbuf, host);
strcat(requestbuf, "\r\n");
if(!hidereferer && referer != null) /* no default referer */
{
tempsize = (int)strlen("referer: ") (int)strlen(referer) 3;
/* 3 is for '\r', '\n', and null */
if(_checkbufsize(&requestbuf, &bufsize, tempsize))
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
strcat(requestbuf, "referer: ");
strcat(requestbuf, referer);
strcat(requestbuf, "\r\n");
}
if(!hideuseragent && useragent == null)
{
tempsize = (int)strlen("user-agent: ")
(int)strlen(default_user_agent) (int)strlen(http_version) 4;
/* 4 is for '\', '\r', '\n', and null */
if(_checkbufsize(&requestbuf, &bufsize, tempsize))
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
strcat(requestbuf, "user-agent: ");
strcat(requestbuf, default_user_agent);
strcat(requestbuf, "/");
strcat(requestbuf, http_version);
strcat(requestbuf, "\r\n");
}
else if(!hideuseragent)
{
tempsize = (int)strlen("user-agent: ") (int)strlen(useragent) 3;
/* 3 is for '\r', '\n', and null */
if(_checkbufsize(&requestbuf, &bufsize, tempsize))
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
strcat(requestbuf, "user-agent: ");
strcat(requestbuf, useragent);
strcat(requestbuf, "\r\n");
}
tempsize = (int)strlen("connection: close\r\n\r\n");
if(_checkbufsize(&requestbuf, &bufsize, tempsize))
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
strcat(requestbuf, "connection: close\r\n\r\n");
/* now free any excess memory allocated to the buffer */
tmp = (char*)realloc(requestbuf, strlen(requestbuf) 1);
if(tmp == null)
{
free(url);
free(requestbuf);
errorsource = errno;
return -1;
}
requestbuf = tmp;
sock = makesocket(host); /* errorsource set within makesocket */
if(sock == -1) { free(url); free(requestbuf); return -1;}
free(url);
url = null;
if(write(sock, requestbuf, strlen(requestbuf)) == -1)
{
close(sock);
free(requestbuf);
errorsource = errno;
return -1;
}
free(requestbuf);
requestbuf = null;
/* grab enough of the response to get the metadata */
ret = _http_read_header(sock, headerbuf); /* errorsource set within */
if(ret < 0) { close(sock); return -1; }
/* get the return code */
charindex = strstr(headerbuf, "http/");
if(charindex == null)
{
close(sock);
errorsource = fetcher_error;
http_errno = hf_freturncode;
return -1;
}
while(*charindex != ' ')
charindex ;
charindex ;
ret = sscanf(charindex, "%d", &i);
if(ret != 1)
{
close(sock);
errorsource = fetcher_error;
http_errno = hf_creturncode;
return -1;
}
if(i<200 || i>307)
{
close(sock);
errorint = i; /* status code, to be inserted in error string */
errorsource = fetcher_error;
http_errno = hf_statuscode;
return -1;
}
/* if a redirect, repeat operation until final url is found or we
* redirect followredirects times. note the case sensitive "location",
* should probably be made more robust in the future (without relying
* on the non-standard strcasecmp()).
* this bit mostly by dean wilder, tweaked by me */
if(i >= 300)
{
redirectsfollowed ;
/* pick up redirect url, allocate new url, and repeat process */
charindex = strstr(headerbuf, "location:");
if(!charindex)
{
close(sock);
errorint = i; /* status code, to be inserted in error string */
errorsource = fetcher_error;
http_errno = hf_cantredirect;
return -1;
}
charindex = strlen("location:");
/* skip any whitespace... */
while(*charindex != '\0' && isspace(*charindex))
charindex ;
if(*charindex == '\0')
{
close(sock);
errorint = i; /* status code, to be inserted in error string */
errorsource = fetcher_error;
http_errno = hf_cantredirect;
return -1;
}
i = strcspn(charindex, " \r\n");
if(i > 0)
{
url = (char *)malloc(i 1);
strncpy(url, charindex, i);
url[i] = '\0';
}
else
/* found 'location:' but contains no url! we'll handle it as
* 'found', hopefully the resulting document will give the user
* a hint as to what happened. */
found = 1;
}
else
found = 1;
}
while(!found &&
(followredirects < 0 || redirectsfollowed <= followredirects) );
if(url) /* redirection code may malloc this, then exceed followredirects */
{
free(url);
url = null;
}
if(redirectsfollowed >= followredirects && !found)
{
close(sock);
errorint = followredirects; /* to be inserted in error string */
errorsource = fetcher_error;
http_errno = hf_maxredirects;
return -1;
}
/*
* parse out about how big the data segment is.
* note that under current http standards (1.1 and prior), the
* content-length field is not guaranteed to be accurate or even present.
* i just use it here so i can allocate a ballpark amount of memory.
*
* note that some servers use different capitalization
*/
charindex = strstr(headerbuf, "content-length:");
if(charindex == null)
charindex = strstr(headerbuf, "content-length:");
if(charindex != null)
{
ret = sscanf(charindex strlen("content-length: "), "%d",
&contentlength);
if(ret < 1)
{
close(sock);
errorsource = fetcher_error;
http_errno = hf_contentlen;
return -1;
}
}
/* allocate enough memory to hold the page */
if(contentlength == -1)
contentlength = default_page_buf_size;
pagebuf = (char *)malloc(contentlength);
if(pagebuf == null)
{
close(sock);
errorsource = errno;
return -1;
}
/* begin reading the body of the file */
while(ret > 0)
{
fd_zero(&rfds);
fd_set(sock, &rfds);
tv.tv_sec = timeout;
tv.tv_usec = 0;
if(timeout >= 0)
selectret = select(sock 1, &rfds, null, null, &tv);
else /* no timeout, can block indefinately */
selectret = select(sock 1, &rfds, null, null, null);
if(selectret == 0)
{
errorsource = fetcher_error;
http_errno = hf_datatimeout;
errorint = timeout;
close(sock);
free(pagebuf);
return -1;
}
else if(selectret == -1)
{
close(sock);
free(pagebuf);
errorsource = errno;
return -1;
}
ret = read(sock, pagebuf bytesread, contentlength);
if(ret == -1)
{
close(sock);
free(pagebuf);
errorsource = errno;
return -1;
}
bytesread = ret;
if(ret > 0)
{
/* to be tolerant of inaccurate content-length fields, we'll
* allocate another read-sized chunk to make sure we have
* enough room.
*/
tmp = (char *)realloc(pagebuf, bytesread contentlength);
if(tmp == null)
{
close(sock);
free(pagebuf);
errorsource = errno;
return -1;
}
pagebuf = tmp;
}
}
/*
* the download buffer is too large. trim off the safety padding.
* note that we add one null byte to the end of the data, as it may not
* already be null terminated and we can't be sure what type of data it
* is or what the caller will do with it.
*/
tmp = (char *)realloc(pagebuf, bytesread 1);
/* tmp shouldn't be null, since we're _shrinking_ the buffer,
* and if it did fail, we could go on with the too-large buffer,
* but something would definately be wrong, so we'll just give
* an error message */
if(tmp == null)
{
close(sock);
free(pagebuf);
errorsource = errno;
return -1;
}
pagebuf = tmp;
pagebuf[bytesread] = '\0'; /* null terminate the data */
if(filebuf == null) /* they just wanted us to "hit" the url */
free(pagebuf);
else
*filebuf = pagebuf;
close(sock);
return bytesread;
}
/*
 * Make sure the string buffer *buf can take `more` additional bytes.
 *
 * buf:     address of a malloc()ed buffer holding a NUL-terminated string.
 * bufsize: address of the buffer's current allocated size in bytes.
 * more:    number of bytes the caller is about to append.
 *
 * If the space left after the current string and its terminator is greater
 * than `more`, nothing changes. Otherwise the buffer is grown by more + 1
 * bytes with realloc() and *buf / *bufsize are updated.
 *
 * Returns 0 on success, -1 if realloc() fails (in which case *buf and
 * *bufsize are left untouched and the old buffer is still valid).
 */
int _checkbufsize(char **buf, int *bufsize, int more)
{
    char *tmp;
    int roomleft = *bufsize - (int)(strlen(*buf) + 1);

    if(roomleft > more)
        return 0;

    tmp = (char *)realloc(*buf, *bufsize + more + 1);
    if(tmp == NULL)
        return -1;

    *buf = tmp;
    *bufsize += more + 1;
    return 0;
}
/*
 * Resolve `host` and open a connected TCP socket to it.
 *
 * host: host name, optionally followed by ":port". When a port is present
 *       the ':' is overwritten with a NUL in place, so the string must live
 *       in writable storage even though the parameter is const-qualified.
 *       Without a port, port_number is used.
 *
 * Returns the connected socket descriptor, or -1 on failure with
 * errorsource set from h_errno (lookup failure) or errno (socket/connect
 * failure). No descriptor is left open on failure.
 */
int makesocket(const char *host)
{
    int sock;               /* socket descriptor */
    struct sockaddr_in sa;  /* socket address */
    struct hostent *hp;     /* host entity */
    int ret;
    int port;
    char *p;

    /* check for port number specified in url */
    p = strchr(host, ':');
    if(p)
    {
        port = atoi(p + 1);
        *p = '\0';
    }
    else
        port = port_number;

    hp = gethostbyname(host);
    if(hp == NULL) { errorsource = h_errno; return -1; }

    /* start from a fully zeroed address so no field is left indeterminate */
    memset(&sa, 0, sizeof(sa));
    /* copy host address from hostent to (server) socket address */
    memcpy((char *)&sa.sin_addr, (char *)hp->h_addr, hp->h_length);
    sa.sin_family = hp->h_addrtype; /* address family reported by the lookup */
    sa.sin_port = htons(port);      /* put portnum into sockaddr */

    sock = socket(hp->h_addrtype, SOCK_STREAM, 0);
    if(sock == -1) { errorsource = errno; return -1; }

    ret = connect(sock, (struct sockaddr *)&sa, sizeof(sa));
    if(ret == -1)
    {
        errorsource = errno; /* capture before close() can change errno */
        close(sock);         /* don't leak the descriptor on failure */
        return -1;
    }

    return sock;
}
/*
 * Read an HTTP response header from `sock` into `headerptr`, one byte at a
 * time.
 *
 * Reading stops after two consecutive line feeds (carriage returns are
 * stored but do not reset the count), once header_buf_size bytes have been
 * stored, or when read() reports end of stream. The last three stored bytes
 * (the trailing line terminators in the normal case) are then cut off and
 * the text is NUL-terminated.
 *
 * sock:      connected socket to read from.
 * headerptr: destination buffer with room for header_buf_size bytes.
 *
 * Returns the number of bytes read, or -1 on failure: on a select() timeout
 * errorsource/http_errno/errorint are set to the header-timeout error, on a
 * select()/read() error errorsource is set from errno.
 */
int _http_read_header(int sock, char *headerptr)
{
    fd_set rfds;
    struct timeval tv;
    char *start = headerptr;
    int bytesread = 0, newlines = 0, ret, selectret;

    while(newlines != 2 && bytesread != header_buf_size)
    {
        FD_ZERO(&rfds);
        FD_SET(sock, &rfds);
        tv.tv_sec = timeout;
        tv.tv_usec = 0;

        if(timeout >= 0)
            selectret = select(sock + 1, &rfds, NULL, NULL, &tv);
        else /* no timeout, can block indefinitely */
            selectret = select(sock + 1, &rfds, NULL, NULL, NULL);

        if(selectret == 0)
        {
            errorsource = fetcher_error;
            http_errno = hf_headtimeout;
            errorint = timeout;
            return -1;
        }
        else if(selectret == -1) { errorsource = errno; return -1; }

        ret = read(sock, headerptr, 1);
        if(ret == -1) { errorsource = errno; return -1; }
        if(ret == 0)
            break; /* end of stream: no byte was stored, stop reading */

        bytesread++;

        if(*headerptr == '\r') /* ignore cr */
        {
            /* basically do nothing special, just don't set newlines
             * to 0 */
            headerptr++;
            continue;
        }
        else if(*headerptr == '\n') /* lf is the separator */
            newlines++;
        else
            newlines = 0;

        headerptr++;
    }

    /* snip the trailing lf's, but never step back past the buffer start
     * when fewer than three bytes were stored */
    if(headerptr - start >= 3)
        headerptr -= 3;
    else
        headerptr = start;
    *headerptr = '\0';

    return bytesread;
}
/*
 * Return a human-readable message for the most recent error recorded in
 * errorsource / http_errno / errorint.
 *
 * For fetcher errors whose message template contains "%d", errorint is
 * substituted and the result is built in the shared convertederror buffer,
 * so the returned pointer is overwritten by the next such call. Other
 * results point at strerror()/hstrerror() output or at an http_errlist entry.
 * The caller must not free the returned string.
 *
 * NOTE(review): errorsource is compared against the live errno / h_errno
 * values, so the first two branches only match while those still hold the
 * value recorded at failure time — confirm whether dedicated source tags
 * were intended.
 */
const char *http_strerror()
{
    if(errorsource == errno)
        return strerror(errno);
    else if(errorsource == h_errno)
#ifdef have_hstrerror
        return hstrerror(h_errno);
#else
        return http_errlist[hf_herror];
#endif
    else if(errorsource == fetcher_error)
    {
        const char *originalerror = http_errlist[http_errno];
        const char *stringindex = strstr(originalerror, "%d");

        if(stringindex == NULL)
            return originalerror;

        /* the error string has a %d in it: emit the text before it, then
         * errorint, then the text after it. snprintf bounds the write to
         * the buffer, truncating an over-long message rather than
         * overflowing. */
        snprintf(convertederror, sizeof(convertederror), "%.*s%d%s",
                 (int)(stringindex - originalerror), originalerror,
                 errorint,
                 stringindex + 2 /* skip past the %d */);
        return convertederror;
    }

    return http_errlist[hf_metaerror]; /* should never happen */
}
阅读(4716) | 评论(0) | 转发(0) |