一个网页多线程下载程序
平台:linux,unix
语言:c
程序如下:
/* spider.h
 * Shared declarations for the multi-threaded web page downloader.
 */
#ifndef SPIDER_H
#define SPIDER_H

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <sys/wait.h>
#include <signal.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define PORT 80          /* default HTTP port */
#define MAXDATASIZE 4096 /* receive buffer size */
/* HTTP/1.0 request template: first %s = path, second %s = host.
 * Header field names use '-' ("User-Agent"), never '_'. */
#define GET_CMD "GET %s HTTP/1.0\r\nHost:%s\r\nAccept:*/*\r\nUser-Agent:myAgent\r\nConnection:Close\r\n\r\n"
#define URL_HEADER "http://"
#define MAX_PATH 1024    /* max host / path / filename length */
#define MAXCONN 10       /* max worker threads allowed */
#define MAXURLS 1024     /* max urls parsed from the list file */

/* NOTE(review): these are *definitions* in a header.  That works while
 * only spider.c includes it, but a second translation unit would cause
 * duplicate-symbol link errors (then switch to extern declarations). */
pthread_mutex_t th_lock = PTHREAD_MUTEX_INITIALIZER; /* guards url_num and conns */
int conns[MAXCONN] = {0}; /* per-thread busy flags */
int url_cnt = 0;          /* total number of urls parsed */
int url_num = 0;          /* next url index to hand to a worker */
char * urls[MAXURLS];     /* pointers into the list-file buffer, one per line */

char * read_list(char *filename, int *file_size);
void setup(pthread_attr_t *attrp);
int thread_state(int *idle_cnt);
void* get_webpage(void *);

/* one url plus bookkeeping for a single worker (kept for compatibility) */
struct th_url {
    char * url;
    int th_index;
    int url_index;
};
/* the shared work list: array of url strings and its length */
struct url_arr {
    char **url;
    int len;
};

#endif /* SPIDER_H */
/* spider.c
* multi-thread downloading webpages.
*
*/
#include "spider.h"
int main(int argc, char *argv[]) {
// struct stat statbuf; /* file information */
size_t file_size; /* file size */
int i;
// int url_cnt;
// int url_index;
int th_index, idle_th;
// struct th_url st_url[MAXCONN];
struct url_arr th_url_arr;
// static int conns[MAXCONN] = {0}; /* threads counter */
pthread_t spiders[MAXCONN]; /* threads */
pthread_attr_t attr;
// FILE *fp;
char *filename = "urllist.txt";
char *buffer, *p;
char *url;
// char *urls[MAXURLS];
printf("Reading data!...\n");
/* read urllist */
if((buffer = read_list(filename, &file_size))<0) {
printf("Error reading file!\n");
exit(1);
}
printf("Parsing date....\n");
/* parse \n to \0 */
i = 0;
url_cnt = 0;
urls[0] = buffer;
while(i<file_size - 1) {
if(buffer[i] == '\n') {
buffer[i] = '\0';
++url_cnt;
if(url_cnt >= MAXURLS) {
break;
}
urls[url_cnt] = &buffer[i+1];
}
++i;
}
buffer[file_size - 1] = '\0';
++url_cnt;
th_url_arr.url = urls;
th_url_arr.len = url_cnt;
// buffer[0] = buffer[0]=='\n'?'\0':buffer[0];
/*if(buffer[0] == '\n') {
buffer[0] = '\0';
++url_cnt;
}*/
p = buffer;
printf("urls count: %d\n", url_cnt);
for(i = 0; i<url_cnt; i++) {
printf("%s\n", p);
p += strlen(p) + 1;
}
setup(&attr); /* initialize threads */
url = buffer;
// url_index = 0;
i = MAXCONN + 1;
th_index = 0;
/* download webpages */
while(--i) {
if((MAXCONN - i) >= url_cnt) {
break;
}
// printf("Thread: %d created!\n", MAXCONN-i);
// if((th_index = thread_state(NULL)) < 0) { /* max threads */
/*if(url_index >= url_cnt) {
break;
}*/
// sleep(1); /* wait 1 second */
// continue;
//}
/* start a thread to download webpage */
// while((url_index < url_cnt) && (*url == '\0')) {
// ++url;
// ++url_index;
// }
/*while(*url == ' ') {
++url;
}
if(url_index >= url_cnt) {
th_index = thread_state(&idle_th);
if(idle_th >= MAXCONN) {
break;
}
sleep(1);
continue;
}
st_url[th_index].url = url;
st_url[th_index].th_index = th_index;
st_url[th_index].url_index = url_index;
pthread_mutex_lock(&th_lock);
conns[th_index] = 1;
pthread_mutex_unlock(&th_lock);
printf("In thread %d\nURL is: %s\nurl_index: %d\n", th_index, url, url_index);
*/
pthread_create(&spiders[th_index], NULL/*&attr*/, get_webpage,(void *) &th_url_arr);
++th_index;
/* create thread */
// sleep(1);
//
// url += strlen(url) + 1;
// ++url_index;
}
i = 0;
while(i < MAXCONN) {
if(i >= url_cnt) {
break;
}
if(pthread_join(spiders[i], NULL) == 0) {
// printf("Thread %d exit success!\n", i);
}
else {
// printf("Thread %d exit error!\n", i);
}
++i;
}
// sleep(5);
/* free buffer */
free(buffer);
return EXIT_SUCCESS;
}
/* read urllist from file */
/* Read the whole file `filename` into a freshly malloc'd buffer.
 * On success returns the buffer (caller must free) and stores the byte
 * count in *file_size; on any error prints a message and returns NULL.
 * Note: the buffer is NOT NUL-terminated — it is exactly *file_size bytes.
 */
char* read_list(char *filename, int *file_size) {
    struct stat statbuf; /* file metadata, used only for st_size */
    FILE *fp;
    char *buffer;

    if ((fp = fopen(filename, "r")) == NULL) {
        printf("Error! Can't open file: %s!\n", filename);
        return NULL;
    }
    if (stat(filename, &statbuf) < 0) {
        printf("Error! Can't get status of file!\n");
        fclose(fp);
        return NULL;
    }
    *file_size = (int)statbuf.st_size;
    if ((buffer = malloc(*file_size)) == NULL) {
        printf("Error! Can't allocate buffer!\n");
        fclose(fp);
        return NULL;
    }
    /* a short read means the file changed between stat() and here,
     * or an I/O error occurred — either way the buffer is unusable */
    if (fread(buffer, 1, (size_t)*file_size, fp) != (size_t)*file_size) {
        printf("Error! Can't read file: %s!\n", filename);
        free(buffer);
        fclose(fp);
        return NULL;
    }
    fclose(fp);
    return buffer;
}
/* initialize the status variables and set
* the thread attribute to detached
*/
/* Prepare the thread attribute object: initialize *attrp and mark it
 * detached.  (main() currently passes NULL to pthread_create, so this
 * attribute object is prepared but never applied.)
 * Return values are checked so an initialization failure is visible
 * instead of silently leaving *attrp undefined.
 */
void setup(pthread_attr_t *attrp) {
    if (pthread_attr_init(attrp) != 0) {
        printf("Error! Can't init thread attribute!\n");
        return;
    }
    pthread_attr_setdetachstate(attrp, PTHREAD_CREATE_DETACHED);
}
/* thread status
* return the min idle thread num
*/
/* Scan the conns[] busy flags under the lock.
 * Returns the lowest idle slot index (-1 when every slot is busy) and,
 * when idle_cnt is non-NULL, stores the total number of idle slots there.
 */
int thread_state(int *idle_cnt) {
    int slot = -1;   /* lowest idle index found so far */
    int idle = 0;    /* how many slots are idle */
    int k;

    pthread_mutex_lock(&th_lock);
    for (k = 0; k < MAXCONN; ++k) {
        if (conns[k] != 0)
            continue;            /* busy slot: skip */
        if (slot < 0)
            slot = k;            /* remember the first idle slot only */
        ++idle;
    }
    pthread_mutex_unlock(&th_lock);

    if (idle_cnt != NULL)
        *idle_cnt = idle;
    return slot;
}
/* Worker thread body.  `url` is really a struct url_arr* shared by all
 * workers.  Each iteration atomically claims the next url index from the
 * global url_num counter, parses host / optional :port / path out of the
 * url, issues an HTTP/1.0 GET, and writes the response body (header
 * stripped) to "<index>.html".  Returns NULL when no urls remain.
 */
void* get_webpage(void* url)
{
    int sockfd, numbytes;
    int url_len, url_index, port;
    int i, j;
    int is_pagehead;                 /* 1 while the HTTP header is being skipped */
    char buf[MAXDATASIZE];
    char filename[MAX_PATH];
    struct hostent *he;
    struct sockaddr_in serv_addr;
    struct url_arr *th_url_arr = (struct url_arr *)url;
    FILE *fout;
    char *p, *p_url;
    char daemon_name[MAX_PATH], url_path[MAX_PATH];
    char cmd_line[MAX_PATH];
    char *url_head = "http://";
    char ch_port[6];                 /* "65535" + '\0' is the longest valid port */

    while (1) {
        /* atomically claim the next url index from the shared counter */
        pthread_mutex_lock(&th_lock);
        url_index = url_num;
        ++url_num;
        pthread_mutex_unlock(&th_lock);
        if (url_index >= url_cnt) {
            break;                   /* work list exhausted */
        }
        printf("url_index: %d\n", url_index);
        p = th_url_arr->url[url_index];
        /* strip a leading "http://" scheme if present */
        if ((p_url = strstr(p, url_head)) != NULL) {
            printf("URL is: %s\n", p);
            if (p_url == p) {
                printf("url_head: %s exist!\n", url_head);
                p += strlen(url_head);
            }
        }
        /* host part: everything up to the first '/' or ':' (bounded) */
        url_len = strlen(p);
        for (i = 0; i < url_len && i < MAX_PATH - 1; ++i) {
            daemon_name[i] = p[i];
            if (p[i] == '/' || p[i] == ':') {
                break;
            }
        }
        daemon_name[i] = '\0';
        puts(daemon_name);
        p += i;
        /* optional ":port" — scan all digits but store at most 5 */
        port = PORT;
        if (*p == ':') {
            ++p;
            i = 0;
            while (*p != '/' && *p != '\0') {
                if (i < (int)sizeof(ch_port) - 1) {
                    ch_port[i++] = *p;
                }
                ++p;
            }
            ch_port[i] = '\0';
            port = atoi(ch_port);
        }
        /* path part, with ' ' escaped as "%20"; bounded so the 3-byte
         * escape plus terminator can never overrun url_path */
        url_len = strlen(p);
        j = 0;
        for (i = 0; i < url_len && j < MAX_PATH - 4; ++i, ++j) {
            url_path[j] = p[i];
            if (url_path[j] == ' ') {
                url_path[j++] = '%';
                url_path[j++] = '2';
                url_path[j] = '0';
            }
        }
        url_path[j] = '\0';
        puts(url_path);
        if (*url_path == '\0') {     /* empty path means "/" */
            snprintf(url_path, sizeof(url_path), "/");
        }
        snprintf(cmd_line, sizeof(cmd_line), GET_CMD, url_path, daemon_name);
        snprintf(filename, sizeof(filename), "%d.html", url_index);
        /* NOTE(review): gethostbyname() is not reentrant and is unsafe to
         * call from several threads at once; getaddrinfo() is the
         * thread-safe replacement — TODO migrate. */
        if ((he = gethostbyname(daemon_name)) == NULL) {
            herror("gethostbyname");
            continue;                /* skip this url, claim the next one */
        }
        printf("Host name : %s\n", he->h_name);
        printf("Host IP : %s\n", inet_ntoa(*(struct in_addr*)(he->h_addr)));
        printf("File name : %s\n", filename);
        if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
            perror("socket");
            continue;
        }
        memset(&serv_addr, 0, sizeof(serv_addr));
        serv_addr.sin_family = AF_INET;
        serv_addr.sin_port = htons(port);
        memcpy(&serv_addr.sin_addr, he->h_addr, he->h_length);
        if (connect(sockfd, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
            perror("connect");
            close(sockfd);
            continue;
        }
        if (send(sockfd, cmd_line, strlen(cmd_line), 0) < 0) {
            perror("send");
            close(sockfd);
            continue;
        }
        if ((fout = fopen(filename, "w")) == NULL) {
            perror("fopen");
            close(sockfd);
            continue;
        }
        is_pagehead = 1;
        /* MAXDATASIZE - 1 leaves room for the '\0' written below; the old
         * full-size recv overflowed buf by one byte on a full read */
        while ((numbytes = recv(sockfd, buf, MAXDATASIZE - 1, 0)) > 0) {
            buf[numbytes] = '\0';    /* keeps strstr() below in bounds */
            p = buf;
            if (is_pagehead) {       /* drop everything through the blank line */
                is_pagehead = 0;
                if ((p = strstr(buf, "\r\n\r\n")) != NULL) {
                    p += strlen("\r\n\r\n");
                } else {
                    p = buf;
                }
                /* NOTE(review): a header that spans two recv() chunks is
                 * not handled; the first chunk is assumed to contain the
                 * "\r\n\r\n" terminator — TODO buffer across chunks. */
            }
            /* write the remaining bytes of the chunk, not strlen(p):
             * binary bodies may contain NUL bytes */
            fwrite(p, 1, (size_t)(numbytes - (p - buf)), fout);
        }
        printf("End writen to file: %s\tSuccess download: %s\n", filename, th_url_arr->url[url_index]);
        close(sockfd);
        fclose(fout);
    } /* end while(1) */
    return 0;
}
下面是 urllist 文件,其格式为每行一个 URL 链接地址,示例如下,将其保存为 urllist.txt:
http://www.baidu.com/s?wd=%C9%F9%B3%A1&cl=3
www.baidu.com
www.baidu.com:80
http://www.google.com:80/
http://www.sina.com.cn
http://sports.sina.com.cn/g/
http://59.66.122.77/ftpsoft/pub/EBooks/incomingEBooks/EE&COMM/%20%20by shmilytan/
http://59.66.122.77/ftpsoft/pub/EBooks/incomingEBooks/EE&COMM/
http://mujiebule.nease.net/shengchang/1.htm
http://www.12soundfield.com/
http://www.yesky.com/249/1763249.shtml
http://www.gxyx.cn/newyx/News_Show2.asp?NewsID=933
http://www.ecgoogle.com/zhanjiang/projectdesign/Info_View.asp?ContentID=117
No comments:
Post a Comment