一个网页多线程下载程序
平台:linux,unix
语言:c
程序如下:
/* spider.h
 * Shared declarations for the multi-threaded web page downloader.
 */
#ifndef SPIDER_H
#define SPIDER_H

#include <pthread.h>
#include <stdio.h>
#include <time.h>
#include <stdlib.h>
#include <string.h>
#include <sys/stat.h>
#include <sys/types.h>
#include <fcntl.h>
#include <limits.h>
#include <unistd.h>
#include <sys/wait.h>
#include <signal.h>
#include <netdb.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <arpa/inet.h>

#define PORT 80          /* default HTTP port */
#define MAXDATASIZE 4096 /* receive buffer size */
/* HTTP/1.0 request template: first %s = path, second %s = host.
 * Header field names use '-' ("User-Agent"), never '_'. */
#define GET_CMD "GET %s HTTP/1.0\r\nHost:%s\r\nAccept:*/*\r\nUser-Agent:myAgent\r\nConnection:Close\r\n\r\n"
#define URL_HEADER "http://"
#define MAX_PATH 1024    /* max host / path / filename length */
#define MAXCONN 10       /* max worker threads allowed */
#define MAXURLS 1024     /* max urls parsed from the list file */

/* NOTE(review): these are *definitions* in a header.  That works while
 * only spider.c includes it, but a second translation unit would cause
 * duplicate-symbol link errors (then switch to extern declarations). */
pthread_mutex_t th_lock = PTHREAD_MUTEX_INITIALIZER; /* guards url_num and conns */
int conns[MAXCONN] = {0}; /* per-thread busy flags */
int url_cnt = 0;          /* total number of urls parsed */
int url_num = 0;          /* next url index to hand to a worker */
char * urls[MAXURLS];     /* pointers into the list-file buffer, one per line */

char * read_list(char *filename, int *file_size);
void setup(pthread_attr_t *attrp);
int thread_state(int *idle_cnt);
void* get_webpage(void *);

/* one url plus bookkeeping for a single worker (kept for compatibility) */
struct th_url {
    char * url;
    int th_index;
    int url_index;
};
/* the shared work list: array of url strings and its length */
struct url_arr {
    char **url;
    int len;
};

#endif /* SPIDER_H */
/* spider.c
* multi-thread downloading webpages.
*
*/
#include "spider.h"
int main(int argc, char *argv[]) {
// struct stat statbuf; /* file information */
size_t file_size; /* file size */
int i;
// int url_cnt;
// int url_index;
int th_index, idle_th;
// struct th_url st_url[MAXCONN];
struct url_arr th_url_arr;
// static int conns[MAXCONN] = {0}; /* threads counter */
pthread_t spiders[MAXCONN]; /* threads */
pthread_attr_t attr;
// FILE *fp;
char *filename = "urllist.txt";
char *buffer, *p;
char *url;
// char *urls[MAXURLS];
printf("Reading data!...\n");
/* read urllist */
if((buffer = read_list(filename, &file_size))<0) {
printf("Error reading file!\n");
exit(1);
}
printf("Parsing date....\n");
/* parse \n to \0 */
i = 0;
url_cnt = 0;
urls[0] = buffer;
while(i<file_size - 1) {
if(buffer[i] == '\n') {
buffer[i] = '\0';
++url_cnt;
if(url_cnt >= MAXURLS) {
break;
}
urls[url_cnt] = &buffer[i+1];
}
++i;
}
buffer[file_size - 1] = '\0';
++url_cnt;
th_url_arr.url = urls;
th_url_arr.len = url_cnt;
// buffer[0] = buffer[0]=='\n'?'\0':buffer[0];
/*if(buffer[0] == '\n') {
buffer[0] = '\0';
++url_cnt;
}*/
p = buffer;
printf("urls count: %d\n", url_cnt);
for(i = 0; i<url_cnt; i++) {
printf("%s\n", p);
p += strlen(p) + 1;
}
setup(&attr); /* initialize threads */
url = buffer;
// url_index = 0;
i = MAXCONN + 1;
th_index = 0;
/* download webpages */
while(--i) {
if((MAXCONN - i) >= url_cnt) {
break;
}
// printf("Thread: %d created!\n", MAXCONN-i);
// if((th_index = thread_state(NULL)) < 0) { /* max threads */
/*if(url_index >= url_cnt) {
break;
}*/
// sleep(1); /* wait 1 second */
// continue;
//}
/* start a thread to download webpage */
// while((url_index < url_cnt) && (*url == '\0')) {
// ++url;
// ++url_index;
// }
/*while(*url == ' ') {
++url;
}
if(url_index >= url_cnt) {
th_index = thread_state(&idle_th);
if(idle_th >= MAXCONN) {
break;
}
sleep(1);
continue;
}
st_url[th_index].url = url;
st_url[th_index].th_index = th_index;
st_url[th_index].url_index = url_index;
pthread_mutex_lock(&th_lock);
conns[th_index] = 1;
pthread_mutex_unlock(&th_lock);
printf("In thread %d\nURL is: %s\nurl_index: %d\n", th_index, url, url_index);
*/
pthread_create(&spiders[th_index], NULL/*&attr*/, get_webpage,(void *) &th_url_arr);
++th_index;
/* create thread */
// sleep(1);
//
// url += strlen(url) + 1;
// ++url_index;
}
i = 0;
while(i < MAXCONN) {
if(i >= url_cnt) {
break;
}
if(pthread_join(spiders[i], NULL) == 0) {
// printf("Thread %d exit success!\n", i);
}
else {
// printf("Thread %d exit error!\n", i);
}
++i;
}
// sleep(5);
/* free buffer */
free(buffer);
return EXIT_SUCCESS;
}
/* read urllist from file */
/* Read the whole file `filename` into a freshly malloc'd buffer.
 * On success returns the buffer (caller must free) and stores the byte
 * count in *file_size; on any error prints a message and returns NULL.
 * Note: the buffer is NOT NUL-terminated — it is exactly *file_size bytes.
 */
char* read_list(char *filename, int *file_size) {
    struct stat statbuf; /* file metadata, used only for st_size */
    FILE *fp;
    char *buffer;

    if ((fp = fopen(filename, "r")) == NULL) {
        printf("Error! Can't open file: %s!\n", filename);
        return NULL;
    }
    if (stat(filename, &statbuf) < 0) {
        printf("Error! Can't get status of file!\n");
        fclose(fp);
        return NULL;
    }
    *file_size = (int)statbuf.st_size;
    if ((buffer = malloc(*file_size)) == NULL) {
        printf("Error! Can't allocate buffer!\n");
        fclose(fp);
        return NULL;
    }
    /* a short read means the file changed between stat() and here,
     * or an I/O error occurred — either way the buffer is unusable */
    if (fread(buffer, 1, (size_t)*file_size, fp) != (size_t)*file_size) {
        printf("Error! Can't read file: %s!\n", filename);
        free(buffer);
        fclose(fp);
        return NULL;
    }
    fclose(fp);
    return buffer;
}
/* initialize the status variables and set
* the thread attribute to detached
*/
/* Prepare the thread attribute object: initialize *attrp and mark it
 * detached.  (main() currently passes NULL to pthread_create, so this
 * attribute object is prepared but never applied.)
 * Return values are checked so an initialization failure is visible
 * instead of silently leaving *attrp undefined.
 */
void setup(pthread_attr_t *attrp) {
    if (pthread_attr_init(attrp) != 0) {
        printf("Error! Can't init thread attribute!\n");
        return;
    }
    pthread_attr_setdetachstate(attrp, PTHREAD_CREATE_DETACHED);
}
/* thread status
* return the min idle thread num
*/
/* Scan the conns[] busy flags under the lock.
 * Returns the lowest idle slot index (-1 when every slot is busy) and,
 * when idle_cnt is non-NULL, stores the total number of idle slots there.
 */
int thread_state(int *idle_cnt) {
    int slot = -1;   /* lowest idle index found so far */
    int idle = 0;    /* how many slots are idle */
    int k;

    pthread_mutex_lock(&th_lock);
    for (k = 0; k < MAXCONN; ++k) {
        if (conns[k] != 0)
            continue;            /* busy slot: skip */
        if (slot < 0)
            slot = k;            /* remember the first idle slot only */
        ++idle;
    }
    pthread_mutex_unlock(&th_lock);

    if (idle_cnt != NULL)
        *idle_cnt = idle;
    return slot;
}
/* Worker thread body.  `url` is really a struct url_arr* shared by all
 * workers.  Each iteration atomically claims the next url index from the
 * global url_num counter, parses host / optional :port / path out of the
 * url, issues an HTTP/1.0 GET, and writes the response body (header
 * stripped) to "<index>.html".  Returns NULL when no urls remain.
 */
void* get_webpage(void* url)
{
    int sockfd, numbytes;
    int url_len, url_index, port;
    int i, j;
    int is_pagehead;                 /* 1 while the HTTP header is being skipped */
    char buf[MAXDATASIZE];
    char filename[MAX_PATH];
    struct hostent *he;
    struct sockaddr_in serv_addr;
    struct url_arr *th_url_arr = (struct url_arr *)url;
    FILE *fout;
    char *p, *p_url;
    char daemon_name[MAX_PATH], url_path[MAX_PATH];
    char cmd_line[MAX_PATH];
    char *url_head = "http://";
    char ch_port[6];                 /* "65535" + '\0' is the longest valid port */

    while (1) {
        /* atomically claim the next url index from the shared counter */
        pthread_mutex_lock(&th_lock);
        url_index = url_num;
        ++url_num;
        pthread_mutex_unlock(&th_lock);
        if (url_index >= url_cnt) {
            break;                   /* work list exhausted */
        }
        printf("url_index: %d\n", url_index);
        p = th_url_arr->url[url_index];
        /* strip a leading "http://" scheme if present */
        if ((p_url = strstr(p, url_head)) != NULL) {
            printf("URL is: %s\n", p);
            if (p_url == p) {
                printf("url_head: %s exist!\n", url_head);
                p += strlen(url_head);
            }
        }
        /* host part: everything up to the first '/' or ':' (bounded) */
        url_len = strlen(p);
        for (i = 0; i < url_len && i < MAX_PATH - 1; ++i) {
            daemon_name[i] = p[i];
            if (p[i] == '/' || p[i] == ':') {
                break;
            }
        }
        daemon_name[i] = '\0';
        puts(daemon_name);
        p += i;
        /* optional ":port" — scan all digits but store at most 5 */
        port = PORT;
        if (*p == ':') {
            ++p;
            i = 0;
            while (*p != '/' && *p != '\0') {
                if (i < (int)sizeof(ch_port) - 1) {
                    ch_port[i++] = *p;
                }
                ++p;
            }
            ch_port[i] = '\0';
            port = atoi(ch_port);
        }
        /* path part, with ' ' escaped as "%20"; bounded so the 3-byte
         * escape plus terminator can never overrun url_path */
        url_len = strlen(p);
        j = 0;
        for (i = 0; i < url_len && j < MAX_PATH - 4; ++i, ++j) {
            url_path[j] = p[i];
            if (url_path[j] == ' ') {
                url_path[j++] = '%';
                url_path[j++] = '2';
                url_path[j] = '0';
            }
        }
        url_path[j] = '\0';
        puts(url_path);
        if (*url_path == '\0') {     /* empty path means "/" */
            snprintf(url_path, sizeof(url_path), "/");
        }
        snprintf(cmd_line, sizeof(cmd_line), GET_CMD, url_path, daemon_name);
        snprintf(filename, sizeof(filename), "%d.html", url_index);
        /* NOTE(review): gethostbyname() is not reentrant and is unsafe to
         * call from several threads at once; getaddrinfo() is the
         * thread-safe replacement — TODO migrate. */
        if ((he = gethostbyname(daemon_name)) == NULL) {
            herror("gethostbyname");
            continue;                /* skip this url, claim the next one */
        }
        printf("Host name : %s\n", he->h_name);
        printf("Host IP : %s\n", inet_ntoa(*(struct in_addr*)(he->h_addr)));
        printf("File name : %s\n", filename);
        if ((sockfd = socket(AF_INET, SOCK_STREAM, 0)) < 0) {
            perror("socket");
            continue;
        }
        memset(&serv_addr, 0, sizeof(serv_addr));
        serv_addr.sin_family = AF_INET;
        serv_addr.sin_port = htons(port);
        memcpy(&serv_addr.sin_addr, he->h_addr, he->h_length);
        if (connect(sockfd, (struct sockaddr*)&serv_addr, sizeof(serv_addr)) < 0) {
            perror("connect");
            close(sockfd);
            continue;
        }
        if (send(sockfd, cmd_line, strlen(cmd_line), 0) < 0) {
            perror("send");
            close(sockfd);
            continue;
        }
        if ((fout = fopen(filename, "w")) == NULL) {
            perror("fopen");
            close(sockfd);
            continue;
        }
        is_pagehead = 1;
        /* MAXDATASIZE - 1 leaves room for the '\0' written below; the old
         * full-size recv overflowed buf by one byte on a full read */
        while ((numbytes = recv(sockfd, buf, MAXDATASIZE - 1, 0)) > 0) {
            buf[numbytes] = '\0';    /* keeps strstr() below in bounds */
            p = buf;
            if (is_pagehead) {       /* drop everything through the blank line */
                is_pagehead = 0;
                if ((p = strstr(buf, "\r\n\r\n")) != NULL) {
                    p += strlen("\r\n\r\n");
                } else {
                    p = buf;
                }
                /* NOTE(review): a header that spans two recv() chunks is
                 * not handled; the first chunk is assumed to contain the
                 * "\r\n\r\n" terminator — TODO buffer across chunks. */
            }
            /* write the remaining bytes of the chunk, not strlen(p):
             * binary bodies may contain NUL bytes */
            fwrite(p, 1, (size_t)(numbytes - (p - buf)), fout);
        }
        printf("End writen to file: %s\tSuccess download: %s\n", filename, th_url_arr->url[url_index]);
        close(sockfd);
        fclose(fout);
    } /* end while(1) */
    return 0;
}
下面是 urllist 文件,其格式为每行一个 URL 链接地址,示例如下,将其保存为 urllist.txt:
http://www.baidu.com/s?wd=%C9%F9%B3%A1&cl=3
www.baidu.com
www.baidu.com:80
http://www.google.com:80/
http://www.sina.com.cn
http://sports.sina.com.cn/g/
http://59.66.122.77/ftpsoft/pub/EBooks/incomingEBooks/EE&COMM/%20%20by shmilytan/
http://59.66.122.77/ftpsoft/pub/EBooks/incomingEBooks/EE&COMM/
http://mujiebule.nease.net/shengchang/1.htm
http://www.12soundfield.com/
http://www.yesky.com/249/1763249.shtml
http://www.gxyx.cn/newyx/News_Show2.asp?NewsID=933
http://www.ecgoogle.com/zhanjiang/projectdesign/Info_View.asp?ContentID=117
No comments:
Post a Comment