Monday, October 17, 2005

自己写的一个网页多线程下载程序

一个网页多线程下载程序

 

平台:Linux/Unix

语言:c

 

程序如下:

/* spider.h

 */

#include <pthread.h>

#include <stdio.h>

#include <time.h>

#include <stdlib.h>

#include <string.h>

#include <sys/stat.h>

#include <sys/types.h>

#include <fcntl.h>

#include <limits.h>

 

// #include <pthread.h>

#include <unistd.h>

 

#include<sys/wait.h>

#include<signal.h>

#include<netdb.h>

#include<sys/socket.h>

#include<netinet/in.h>

#include<arpa/inet.h>

 

#define PORT 80                               /* default HTTP port */

#define MAXDATASIZE 4096                      /* max receive buffer size */

/* HTTP/1.0 GET request template: first %s = path, second %s = host.
 * Fixed: the request header field is "User-Agent" (hyphen), not
 * "User_Agent", and the agent value had a trailing typo ("myAgentr"). */
#define GET_CMD     "GET %s HTTP/1.0\r\nHost:%s\r\nAccept:*/*\r\nUser-Agent:myAgent\r\nConnection:Close\r\n\r\n"

#define URL_HEADER "http://"

#define MAX_PATH      1024                    /* size of name/path/command buffers */

#define MAXCONN 10                            /* max worker threads allowed */

#define MAXURLS 1024                          /* max urls accepted from the list file */

pthread_mutex_t th_lock = PTHREAD_MUTEX_INITIALIZER;    /* guards conns[] and url_num */

int conns[MAXCONN] = {0};    /* per-thread busy flags (only used by thread_state) */

int url_cnt = 0;             /* total number of urls parsed from the list */

int url_num = 0;             /* next url index to hand out to a worker */

char * urls[MAXURLS];        /* pointers into the list buffer, one per url line */

 

/* read the whole url list file into one malloc'd buffer; caller frees */
char * read_list(char *filename, int *file_size);

/* initialize *attrp and set the detached thread attribute */
void setup(pthread_attr_t *attrp);

/* return the lowest idle thread slot (or -1), optionally counting idle slots */
int thread_state(int *idle_cnt);

/* worker thread body: fetch urls from the shared table until exhausted */
void*  get_webpage(void *);

 

/* One url assigned to one thread slot.
 * NOTE(review): appears unused except in commented-out dispatch code. */
struct th_url {

       char * url;          /* url string to fetch */

       int th_index;        /* slot index into conns[] */

       int url_index;       /* position in the global url list */

};

 

/* Shared url table handed (as void*) to every worker thread. */
struct url_arr {

       char **url;          /* array of NUL-terminated url strings */

       int len;             /* number of entries in url[] */

};

 

/* spider.c

 * multi-thread downloading webpages.

 *

 */

 

#include "spider.h"

 

int main(int argc, char *argv[]) {

       // struct stat statbuf;       /* file information */

       size_t file_size;             /* file size */

      

       int i;

       // int url_cnt;

       // int url_index;

    int th_index, idle_th;

      // struct th_url st_url[MAXCONN];

       struct url_arr th_url_arr;

      

       // static int conns[MAXCONN] = {0};  /* threads counter */

       pthread_t spiders[MAXCONN];                  /* threads */

       pthread_attr_t attr;

      

       // FILE *fp;

       char *filename = "urllist.txt";

       char *buffer, *p;

       char *url;

       // char *urls[MAXURLS];

      

       printf("Reading data!...\n");

       /* read urllist */

       if((buffer = read_list(filename, &file_size))<0) {

              printf("Error reading file!\n");

              exit(1);

       }

      

       printf("Parsing date....\n");

      

       /* parse \n to \0 */

       i = 0;

       url_cnt = 0;

       urls[0] = buffer;

       while(i<file_size - 1) {

              if(buffer[i] == '\n') {

                     buffer[i] = '\0';

                     ++url_cnt;

                     if(url_cnt >= MAXURLS) {

                            break;

                     }

                     urls[url_cnt] = &buffer[i+1];

              }

              ++i;

       }

       buffer[file_size - 1] = '\0';

       ++url_cnt;

      

       th_url_arr.url = urls;

       th_url_arr.len = url_cnt;

      

       // buffer[0] = buffer[0]=='\n'?'\0':buffer[0];

       /*if(buffer[0] == '\n') {

              buffer[0] = '\0';

              ++url_cnt;

       }*/

      

       p = buffer;

       printf("urls count: %d\n", url_cnt);

      

       for(i = 0; i<url_cnt; i++) {

              printf("%s\n", p);

              p += strlen(p) + 1;

       }

      

       setup(&attr);   /* initialize threads */

      

       url = buffer;

       // url_index = 0;

      

       i = MAXCONN + 1;

       th_index = 0;

      

       /* download webpages */

       while(--i) {

              if((MAXCONN - i) >= url_cnt) {

                     break;

              }

              // printf("Thread: %d created!\n", MAXCONN-i);

              // if((th_index = thread_state(NULL)) < 0) { /* max threads */

                     /*if(url_index >= url_cnt) {

                            break;

                     }*/

              //     sleep(1);  /* wait 1 second */

              //     continue;

              //}

              /* start a thread to download webpage */

              // while((url_index < url_cnt) && (*url == '\0')) {

              //     ++url;

              //     ++url_index;

              // }

              /*while(*url == ' ') {

                     ++url;

              }

             

              if(url_index >= url_cnt) {

                     th_index = thread_state(&idle_th);

                     if(idle_th >= MAXCONN) {

                            break;

                     }

                     sleep(1);

                     continue;

              }

             

              st_url[th_index].url = url;

              st_url[th_index].th_index = th_index;

              st_url[th_index].url_index = url_index;

             

              pthread_mutex_lock(&th_lock);

              conns[th_index] = 1;

              pthread_mutex_unlock(&th_lock);

             

              printf("In thread %d\nURL is: %s\nurl_index: %d\n", th_index, url, url_index);

              */

             

              pthread_create(&spiders[th_index], NULL/*&attr*/, get_webpage,(void *) &th_url_arr);      

              ++th_index;

              /* create thread */  

             

              // sleep(1);     

              //

              // url += strlen(url) + 1;

              // ++url_index;

       }

      

       i = 0;

       while(i < MAXCONN) {

              if(i >= url_cnt) {

                     break;

              }

              if(pthread_join(spiders[i], NULL) == 0) {

                     // printf("Thread %d exit success!\n", i);

              }

              else {

                     // printf("Thread %d exit error!\n", i);

              }

              ++i;

       }

      

       // sleep(5);

      

       /* free buffer */

       free(buffer);

       return EXIT_SUCCESS;

}

 

/* read urllist from file */

/* Read the whole url list file into a freshly malloc'd buffer.
 * filename:  path of the list file.
 * file_size: out parameter, set to the file size in bytes on success.
 * Returns the buffer (caller must free) or NULL on any error.
 * The buffer is NOT NUL-terminated; callers rely on *file_size. */
char* read_list(char *filename, int *file_size) {
       FILE *fp;
       char *buffer;
       long size;

       if ((fp = fopen(filename, "r")) == NULL) {
              printf("Error! Can't open file: %s!\n", filename);
              return NULL;
       }
       /* size the file through the open stream itself; the original
        * stat()ed the path again, racing against concurrent replacement */
       if (fseek(fp, 0L, SEEK_END) != 0 || (size = ftell(fp)) < 0) {
              printf("Error! Can't get status of file!\n");
              fclose(fp);
              return NULL;
       }
       rewind(fp);
       *file_size = (int)size;

       /* allocate at least 1 byte so an empty file is not reported as
        * an allocation failure; no cast on malloc in C */
       if ((buffer = malloc(size > 0 ? (size_t)size : 1)) == NULL) {
              printf("Error! Can't allocate buffer!\n");
              fclose(fp);
              return NULL;
       }
       /* a short read means I/O error or a file changing underneath us;
        * the original ignored fread's return value */
       if (fread(buffer, 1, (size_t)size, fp) != (size_t)size) {
              printf("Error! Can't read file: %s!\n", filename);
              free(buffer);
              fclose(fp);
              return NULL;
       }
       fclose(fp);
       return buffer;
}

 

/* initialize the status variables and set

 * the thread attribute to detached

 */

 

/* Prepare a thread-attribute object: initialize *attrp and mark it so
 * that threads created with it start in the detached state (their
 * resources are reclaimed automatically, no pthread_join required). */
void setup(pthread_attr_t *attrp) {
       (void) pthread_attr_init(attrp);
       (void) pthread_attr_setdetachstate(attrp, PTHREAD_CREATE_DETACHED);
}

 

/* thread status

 * return the min idle thread num

 */

/* Scan the thread-state table (under th_lock) and report idle workers.
 * idle_cnt: optional out parameter (may be NULL); receives the number
 *           of idle slots.
 * Returns the lowest idle slot index, or -1 when every slot is busy. */
int thread_state(int *idle_cnt) {
       int slot;
       int first_free = -1;
       int free_cnt = 0;

       pthread_mutex_lock(&th_lock);
       for (slot = 0; slot < MAXCONN; ++slot) {
              if (conns[slot] != 0) {
                     continue;              /* slot is busy */
              }
              if (first_free < 0) {
                     first_free = slot;     /* remember the lowest idle slot */
              }
              ++free_cnt;
       }
       pthread_mutex_unlock(&th_lock);

       if (idle_cnt != NULL) {
              *idle_cnt = free_cnt;
       }
       return first_free;
}

 

void* get_webpage(void* url)

{

       int sockfd,numbytes;

       int ret, url_len, url_index, port = PORT;

       int i, j;

       int is_pagehead;

       char buf[MAXDATASIZE];

       char filename[MAX_PATH];

      

       struct hostent *he;

       struct sockaddr_in serv_addr;

       struct th_url st_url;// = (struct th_url *)url;

       struct url_arr *th_url_arr = (struct url_arr *)(url);

      

       FILE *fin, *fout;

       char *p, *p_url;

    char buffer[1024];

       char daemon_name[MAX_PATH], url_path[MAX_PATH];

       char cmd_line[MAX_PATH];

       char *url_head = "http://";

       char ch_port[6];

      

       while(1) {

      

              pthread_mutex_lock(&th_lock);

       // st_url = *(struct th_url *) url;

       // conns[st_url.th_index] = 1;

       url_index = url_num;

       ++url_num;

      

       pthread_mutex_unlock(&th_lock);

      

       if(url_index >= url_cnt) {

              break;

       }

       printf("url_index: %d\n", url_index);

 

      

       p = th_url_arr->url[url_index];

 

      

       if((p_url = strstr(p, url_head)) != NULL) {

              printf("URL is: %s\n", p);

              if (p_url == p) {                        

                     /* URL starts with http:// */

                     printf("url_head: %s exist!\n", url_head);

                     p += strlen(url_head);

              }

       }

       i = 0;

       url_len = strlen(p); /* url length */

       while(i < url_len) {       /* get the host */

              if((daemon_name[i] = p[i]) == '/') {

                     break;

              }

              if(daemon_name[i] == ':') {

                     break;

              }

              ++i;       

       }

       daemon_name[i] = '\0';

       puts(daemon_name);

      

       p += i;

      

       port = PORT;

      

       if(*p == ':') {

              i = 0;

              ++p;

              while((ch_port[i] = *p) != '/' && *p != '\0') {

                     ++p;

                     ++i;

              }

              ch_port[i] = '\0';

              port = atoi(ch_port);

       }

      

       j = i = 0;

       url_len = strlen(p);

       while(i < url_len) {       /* get the url path */

              if((url_path[j] = p[i])=='\0') {

                     break;

              }

              if(url_path[j] == ' ') {

                     url_path[j++] = '%';

                     url_path[j++] = '2';

                     url_path[j] = '0';

              }

              ++i;

              ++j;

       }

       url_path[j] = '\0';

       puts(url_path);

      

       if(*url_path == '\0') {    /* path is null */

              snprintf(url_path, sizeof(url_path), "/");

       }

      

       /* GET command */

       snprintf(cmd_line, sizeof(cmd_line), GET_CMD, url_path, daemon_name);

       snprintf(filename, sizeof(filename), "%d.html", url_index);

      

       // snprintf(buffer, sizeof(buffer), "test!!");

       // printf("%s\n", buffer);

      

       /*if(argc!=2)

       {

              fprintf(stderr,"usage:clienthostname\n");

              exit(1);

       }*/

 

      

       if((he=gethostbyname(daemon_name))==NULL)

       {

              herror("gethostbyname");

              // pthread_mutex_lock(&th_lock);

              // conns[st_url.th_index] = 0;

              // pthread_mutex_unlock(&th_lock);

              // return 0;

              continue;

       }

       printf("Host name : %s\n", he->h_name);

       printf("Host IP         : %s\n", inet_ntoa(*(struct in_addr*)(he->h_addr)));

       printf("File name : %s\n", filename);

      

       sockfd = socket(AF_INET, SOCK_STREAM, 0);

      

       serv_addr.sin_family=AF_INET;

       serv_addr.sin_port=htons(port);

       // serv_addr.sin_addr = *((struct in_addr*)(he->h_addr));

       bcopy(he->h_addr, (struct sockaddr*)&serv_addr.sin_addr, he->h_length);

 

       bzero(&(serv_addr.sin_zero),8);

      

       if((ret=connect(sockfd,(struct sockaddr*)&serv_addr,sizeof(serv_addr)))<0)

       {

              // printf("connect to server error!\n");

              perror("connect");

              close(sockfd);

              /*pthread_mutex_lock(&th_lock);

              conns[st_url.th_index] = 0;

              pthread_mutex_unlock(&th_lock);

              return 0;

              */

              continue;

       }

       // numbytes =       

       send(sockfd, cmd_line, strlen(cmd_line),0);

       fout = fopen(filename, "w");

       is_pagehead = 1;

       while(1) {

              if((numbytes=recv(sockfd, buf, MAXDATASIZE,0))<=0)

              {

                     // perror("recv");

                     // exit(1);

                     break;

              }

              // printf("receive size: %d\n", numbytes);

              buf[numbytes]='\0';

              p = buf;

              if(is_pagehead) {    /* remove header information */

                     is_pagehead = 0;

                    

                     if((p=strstr(buf, "\r\n\r\n")) != NULL) {

                            p += strlen("\r\n\r\n");

                     }

                     else

                            p = buf;

              }

              // fputs(buf,fout);

              // printf("Write to file: %s\n", filename);

              fwrite(p,sizeof(char), strlen(p), fout);

              // printf("contents:\n%s\n",buf);

       }

       printf("End writen to file: %s\tSuccess download: %s\n", filename, th_url_arr->url[url_index]);

       close(sockfd);

       fclose(fout);

      

       }     /* end while(1) */

      

       /*

       pthread_mutex_lock(&th_lock);

       conns[st_url.th_index] = 0;

       pthread_mutex_unlock(&th_lock);

       */                     

       return 0;

}

 

下面是 urllist 文件的示例。文件格式为每行一个 url 链接地址,保存为 urllist.txt:

http://www.baidu.com/s?wd=%C9%F9%B3%A1&cl=3

www.baidu.com

www.baidu.com:80

http://www.google.com:80/

http://www.sina.com.cn

http://sports.sina.com.cn/g/2005-06-29/01071638335.shtml

http://59.66.122.77/ftpsoft/pub/EBooks/incomingEBooks/EE&COMM/%20%20by shmilytan/

http://59.66.122.77/ftpsoft/pub/EBooks/incomingEBooks/EE&COMM/

http://mujiebule.nease.net/shengchang/1.htm

http://www.12soundfield.com/

http://www.yesky.com/249/1763249.shtml

http://www.gxyx.cn/newyx/News_Show2.asp?NewsID=933

http://www.ecgoogle.com/zhanjiang/projectdesign/Info_View.asp?ContentID=117

No comments: