
Notes on Search Engines (Multi-threaded Web Crawling)




    In the previous post we downloaded web pages with a single thread. Today we build on that and fetch pages with multiple threads. The point of using multiple threads is not that the technique is particularly clever; it is that while one thread is blocked waiting on the network, the others can keep working, so more pages get downloaded per unit of time and overall efficiency improves at least somewhat.


    To get there, the original code needs a few small changes. First, pages have to be split into two groups: already visited and waiting to be visited. On one hand we download pages; on the other hand we use the links found in those pages to schedule new downloads. If the whole set of pages is treated as a queue, it has a front and an end. Each thread keeps reading the file at the end index, extracts the http addresses it contains, and appends the newly downloaded pages after the front index; the whole thing runs as an endless while loop. (Note that in this code end is the position being read from and front is the position being written to, as the two helpers below show.)

/* index to read html file */
static int get_read_index()
{
    int index;

    WaitForSingleObject(h_index, INFINITE);
    if(end == front)
    {
        ReleaseSemaphore(h_index, 1, NULL);
        return -1;
    }

    index = end ++;
    ReleaseSemaphore(h_index, 1, NULL);
    return index;
}

/* index to write html file */
static int get_write_index()
{
    int index;

    WaitForSingleObject(h_index, INFINITE);
    index = front ++;
    ReleaseSemaphore(h_index, 1, NULL);
    return index;
}
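    The helpers above guard the shared indices with a binary semaphore used as a mutex. For comparison, here is a minimal standalone sketch, not part of the original program, of the same two helpers guarded by a Win32 CRITICAL_SECTION, which is a lighter-weight in-process lock; the names with the _cs suffix are made up for illustration.

#include <windows.h>

static int end = 0;                  /* next page index to read */
static int front = 1;                /* next page index to write; 0.html is the seed page */
static CRITICAL_SECTION cs_index;    /* call InitializeCriticalSection(&cs_index) once at startup */

static int get_read_index_cs(void)
{
    int index = -1;

    EnterCriticalSection(&cs_index);
    if(end != front)
    {
        index = end ++;
    }
    LeaveCriticalSection(&cs_index);
    return index;                    /* -1 means the queue is currently empty */
}

static int get_write_index_cs(void)
{
    int index;

    EnterCriticalSection(&cs_index);
    index = front ++;
    LeaveCriticalSection(&cs_index);
    return index;
}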


    The helpers above handle the index bookkeeping. The other change we need is to turn the original one-shot download function into a while(1) thread loop. Each thread simply keeps reading an index from the queue, parsing the file, and pushing newly downloaded pages back into the queue; it is as simple as that.

/* download page and its linked pages */
DWORD WINAPI download_page_entry(LPVOID param)
{
    char* buffer;
    int size;
    char name[64];
    int index;

    while(1)
    {
        while( -1 == (index = get_read_index()))
        {
            Sleep(100);
        }

        memset(name, 0, 64);
        sprintf(name, "E:/download/%d.html", index);
        if(OK == get_file_content(name, &buffer, &size))
        {
            get_http_and_download(buffer);
            free(buffer);
        }
    }
}
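    The Sleep(100) polling above is simple but wakes each idle thread ten times a second even when the queue is empty. As an alternative sketch, not in the original code, an auto-reset Win32 event lets a writer wake exactly one sleeping reader; the handle name h_work and the helper names below are assumptions made for illustration (windows.h is assumed, as in the rest of the program).

static HANDLE h_work;   /* assumed name; auto-reset event, create it before the threads start */

static void init_work_event(void)
{
    h_work = CreateEvent(NULL, FALSE, FALSE, NULL);   /* auto-reset, initially unsignaled */
}

/* writer side: call right after get_write_index() has stored a new page */
static void signal_new_page(void)
{
    SetEvent(h_work);
}

/* reader side: would replace the Sleep(100) polling loop in download_page_entry */
static int wait_read_index(void)
{
    int index;

    while(-1 == (index = get_read_index()))
    {
        WaitForSingleObject(h_work, 100);   /* sleep until new work arrives, with a 100 ms safety timeout */
    }

    return index;
}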

    Those are the essential modifications. There are a few other things to watch out for: the very first page has to be downloaded before the worker threads start, the executable must be built with the Multi-threaded Debug DLL (/MDd) runtime option, and the program should be given a long soak test while you develop it. One optional robustness tweak is sketched just below; after that, the full source follows. As before, if you want to run the code, create a download directory on drive E: first (or let the sketch below create it for you); nothing else needs setting up.
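    The robustness tweak, offered only as a suggestion and not part of the original code: create the download directory at startup instead of requiring it to exist beforehand.

#include <windows.h>

static void ensure_download_dir(void)
{
    /* CreateDirectoryA fails harmlessly with ERROR_ALREADY_EXISTS if the directory is already there */
    CreateDirectoryA("E:/download", NULL);
}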

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <wininet.h>
#include <assert.h>

#ifdef ERROR
#undef ERROR
#endif

#define U8 unsigned char
#define U32 unsigned int
#define STATUS unsigned int

#define OK 0
#define ERROR (~0L)
#define MAX_BLOCK_SIZE 1024
#define MAX_DOMAIN_NAME_LENGTH 64
#define MAX_THREAD_NUMBER (8)

#pragma comment(lib, "wininet.lib")
static STATUS download_web_page(const char* url, const char* path);

static int end = 0;        /* index of the next downloaded page to read and parse */
static int front = 1;      /* index where the next downloaded page will be written; 0.html is the seed */
static HANDLE h_index;     /* binary semaphore protecting end and front */

/* index to read html file */
static int get_read_index()
{
    int index;

    WaitForSingleObject(h_index, INFINITE);
    if(end == front)
    {
        ReleaseSemaphore(h_index, 1, NULL);
        return -1;
    }

    index = end ++;
    ReleaseSemaphore(h_index, 1, NULL);
    return index;
}

/* index to write html file */
static int get_write_index()
{
    int index;

    WaitForSingleObject(h_index, INFINITE);
    index = front ++;
    ReleaseSemaphore(h_index, 1, NULL);
    return index;
}

/* get length of html file */
static int get_file_size(const char* path)
{
    HANDLE hFile;
    int size = 0;

    hFile = CreateFile(path, FILE_READ_EA, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
    if (hFile != INVALID_HANDLE_VALUE)
    {
        size = GetFileSize(hFile, NULL);
        CloseHandle(hFile);
    }

    return size;
}

/* get all data from html file */
static STATUS get_file_content(const char* path, void** pp_buffer, int* size)
{
    int length;
    char* buffer;
    FILE* hFile;

    if(NULL == path)
    {
        return ERROR;
    }

    if(NULL == pp_buffer)
    {
        return ERROR;
    }

    if(NULL == size)
    {
        return ERROR;
    }

    length = get_file_size(path);
    if(0 == length)
    {
        return ERROR;
    }

    buffer = (char*) malloc(length + 1);
    if(NULL == buffer)
    {
        return ERROR;
    }

    buffer[length] = '\0';
    hFile = fopen(path, "r+b");
    if(NULL == hFile)
    {
        free(buffer);
        return ERROR;
    }

    fread(buffer, 1, length, hFile);
    fclose(hFile);

    *pp_buffer = buffer;
    *size = length;
    return OK;
}

/* print an http link; mainly useful for debugging */
static void print_http_name(const char* buffer, int size)
{
    while(size --)
    {
        printf("%c", *buffer ++);
    }

    printf("\n");
}


static void download_linked_page(const char* url, int size)
{
    char* data;
    char name[64];

    print_http_name(url, size);
    data = (char*)malloc(size + 1);
    if(NULL == data)
    {
        return;
    }

    data[size] = '\0';
    memmove(data, url, size);
    memset(name, 0, 64);
    sprintf(name, "E:/download/%d.html", get_write_index());
    download_web_page(data, name);

    /* free data memory, which contained the http link */
    free(data);
}

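/* Note on the scanner below: it searches the page text for "http://", then walks forward
   until it hits a '"', '\'', ')' or '>' terminator; a candidate whose terminator is not
   found within MAX_DOMAIN_NAME_LENGTH characters after the prefix is discarded. Each
   accepted match is handed to download_linked_page(). */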
/* get http from html file, then download it by its name */
static void get_http_and_download(const char* buffer)
{
    const char* prev;
    const char* next;
    char letter;
    int count;

    if(NULL == buffer)
    {
        return;
    }

    next = buffer;
    while(1)
    {
        next = strstr(next, "http://");
        if(NULL == next)
        {
            break;
        }

        count = MAX_DOMAIN_NAME_LENGTH;
        prev = next;
        next += strlen("http://");

        while(1)
        {
            if(!count)
            {
                break;
            }

            count --;
            letter = *next;
            if('"' == letter || '\'' == letter || ')' == letter || '>' == letter)
            {
                break;
            }

            next ++;
        }

        if(count)
        {
            download_linked_page(prev, next - prev);
        }
    }
}

/* implement page download */
static STATUS download_web_page(const char* url, const char* path)
{
    U8 buffer[MAX_BLOCK_SIZE];
    U32 iNumber;
    FILE* hFile;
    HINTERNET hSession;
    HINTERNET hUrl;
    STATUS result;

    hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
    if(NULL == hSession)
    {
        return ERROR;
    }

    hUrl = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
    if(NULL == hUrl)
    {
        result = ERROR;
        goto error1;
    }

    hFile = fopen(path, "wb");
    if(NULL == hFile)
    {
        result = ERROR;
        goto error2;
    }

    iNumber = 1;
    while(iNumber > 0)
    {
        /* also stop on read failure, otherwise a network error could loop forever */
        if(!InternetReadFile(hUrl, buffer, MAX_BLOCK_SIZE - 1, &iNumber))
        {
            break;
        }

        fwrite(buffer, sizeof(char), iNumber, hFile);
    }

    fclose(hFile);
    result = OK;

error2:
    InternetCloseHandle(hUrl);

error1:
    InternetCloseHandle(hSession);

    return result;
}

/* download page and its linked pages */
DWORD WINAPI download_page_entry(LPVOID param)
{
    char* buffer;
    int size;
    char name[64];
    int index;

    while(1)
    {
        while( -1 == (index = get_read_index()))
        {
            Sleep(100);
        }

        memset(name, 0, 64);
        sprintf(name, "E:/download/%d.html", index);
        if(OK == get_file_content(name, &buffer, &size))
        {
            get_http_and_download(buffer);
            free(buffer);
        }
    }
}

/* entry of the program */
int main(int argc, char* argv[])
{
    int index;
    HANDLE h_download[MAX_THREAD_NUMBER];

    h_index = CreateSemaphore(NULL, 1, 1, NULL);
    if(NULL == h_index)
    {
        assert(0);
    }

    /* 0.html is just the start page */
    download_web_page("http://book.dangdang.com", "E:/download/0.html");
    for(index = 0; index < MAX_THREAD_NUMBER; index ++)
    {
        h_download[index] = CreateThread(NULL, 0, download_page_entry, 0, 0, NULL);
        if(NULL == h_download[index])
        {
            assert(0);
        }
    }

    WaitForMultipleObjects(MAX_THREAD_NUMBER, h_download, TRUE, INFINITE);
    CloseHandle(h_index);
    return 0;
}
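    For reference, assuming the listing is saved as crawler.c (the file name is my assumption), something along these lines should build it with the Visual C++ command-line tools; /MDd selects the Multi-threaded Debug DLL runtime mentioned above, and wininet.lib is pulled in by the #pragma comment line in the source:

cl /MDd crawler.c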

 



From: https://blog.51cto.com/feixiaoxing/5880800
