前面我们在谈到搜索引擎的时候,说到了网页下载、说到了分词、说到了多线程。但是,我们要清楚这一切的目的都是为了在网页中获得重要的信息。如何从网页或者从链接中获取信息是我们必须要解决的问题。一般来说,一个网页中包含了很多内容,比如说HTML文件结构、广告、图片等等。但是哪些才是我们真正需要的信息呢?
网络上面虽然网页很多,但是很多内容其实是没有价值的,特别是那些自动生成的网页。所以,我个人认为如果希望获得好的搜索效果,一定要按照自己的要求从特定的网站获取网页资源。比如说从chinahr获取job信息、从360buy获取商品信息、从大众点评获取吃喝玩乐的消息、从携程获取旅游信息、从大学网站获取高考和科研的信息等等。
所以,我的想法是从特定的网站获取结构化的信息,并且以此为基础做进一步的深化和理解。就拿当当网站来说,由于它的网页结构非常有特点,我们可以抓取每一件商品的名称、价格、厂家、生产日期,并以此分析价格的走势、与同类型商品的价格差别等等,如果能以此为基础开发成app就更好了。当然,万丈高楼平地起,我们今天就尝试一下如何利用网页的title标签提取商品的基本信息。
#include <assert.h>
#include <limits.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <windows.h>
#include <wininet.h>
/* Drop any ERROR macro already defined by the headers included above
 * (presumably one of the Windows headers - TODO confirm which), so that the
 * definition below does not clash with it. */
#ifdef ERROR
#undef ERROR
#endif
/* Short aliases for the integer types used in this file. */
#define U8 unsigned char
#define U32 unsigned int
/* Result type of the helpers below: OK on success, ERROR on failure. */
#define STATUS unsigned int
#define OK 0
/* NOTE(review): ~0L has type long while STATUS is unsigned int; a direct
 * comparison of a STATUS value against ERROR only matches where long and
 * unsigned int have the same width - confirm before porting. */
#define ERROR (~0L)
/* Size in bytes of the buffer used when downloading a page. */
#define MAX_BLOCK_SIZE 1024
/* Product page that main() downloads and parses. */
#define HTTP_NAME_ADDRESS "http://product.dangdang.com/main/product.aspx?product_id=22890909"
/* Linker directive requesting wininet.lib (the WinINet calls below need it). */
#pragma comment(lib, "wininet.lib")
/*
 * Print `size` bytes starting at `buffer` to stdout, followed by a newline.
 *
 * The data does not have to be NUL-terminated; exactly `size` bytes are
 * written. A NULL buffer or a non-positive size prints only the newline
 * instead of reading outside the buffer.
 */
static void show_file_content(char* buffer, int size)
{
    if (NULL != buffer && size > 0)
    {
        /* One bulk write instead of one printf() call per byte. */
        fwrite(buffer, sizeof(char), (size_t)size, stdout);
    }
    putchar('\n');
}
/*
 * Locate the text enclosed between two markers inside a NUL-terminated buffer.
 *
 * buffer    - NUL-terminated text to search.
 * start     - opening marker; its first occurrence in `buffer` is used.
 * end       - closing marker; its first occurrence after `start` is used.
 * pp_buffer - receives a pointer to the first byte after `start`. It points
 *             into `buffer`; nothing is copied and the span is not
 *             NUL-terminated.
 * size      - receives the number of bytes between the two markers.
 *
 * Returns OK when both markers are found, ERROR when any argument is NULL or
 * either marker is missing (the outputs are left untouched in that case).
 */
static STATUS find_pattern_content(char* buffer, char* start, char* end, char** pp_buffer, int* size)
{
    char* opening;
    char* content;
    char* closing;

    if(NULL == buffer || NULL == start || NULL == end || NULL == pp_buffer || NULL == size)
    {
        return ERROR;
    }

    opening = strstr(buffer, start);
    if(NULL == opening)
    {
        return ERROR;
    }

    /* The payload begins right after the opening marker. */
    content = opening + strlen(start);

    closing = strstr(content, end);
    if(NULL == closing)
    {
        return ERROR;
    }

    *pp_buffer = content;
    *size = closing - content;
    return OK;
}
/* get length of html file */
static int get_file_size(const char* path)
{
HANDLE hFile;
int size = 0;
hFile = CreateFile(path, FILE_READ_EA, FILE_SHARE_READ, 0, OPEN_EXISTING, 0, 0);
if (hFile != INVALID_HANDLE_VALUE)
{
size = GetFileSize(hFile, NULL);
CloseHandle(hFile);
}
return size;
}
/*
 * Read a whole file into a newly allocated, NUL-terminated buffer.
 *
 * path      - file to read.
 * pp_buffer - receives the malloc()ed buffer; the caller owns it and must
 *             free() it.
 * size      - receives the number of bytes actually read (the terminating
 *             NUL is not counted).
 *
 * Returns OK on success. Returns ERROR when an argument is NULL, the file is
 * empty or its size cannot be determined, allocation fails, or the file
 * cannot be opened or read; the outputs are left untouched in that case.
 */
static STATUS get_file_content(const char* path, void** pp_buffer, int* size)
{
    int length;
    size_t got;
    char* buffer;
    FILE* fp;

    if (NULL == path || NULL == pp_buffer || NULL == size)
    {
        return ERROR;
    }

    /* Reject negative values too, so a bad size never reaches malloc(). */
    length = get_file_size(path);
    if (length <= 0)
    {
        return ERROR;
    }

    buffer = malloc((size_t)length + 1);
    if (NULL == buffer)
    {
        return ERROR;
    }

    /* Read-only binary mode: the file is never written here, and "r+b"
     * would fail on read-only files. */
    fp = fopen(path, "rb");
    if (NULL == fp)
    {
        free(buffer);
        return ERROR;
    }

    got = fread(buffer, 1, (size_t)length, fp);
    if (0 == got || ferror(fp))
    {
        fclose(fp);
        free(buffer);
        return ERROR;
    }
    fclose(fp);

    /* Terminate after the bytes actually read, which may be fewer than
     * `length` if the file shrank between the size query and the read. */
    buffer[got] = '\0';

    *pp_buffer = buffer;
    *size = (int)got;
    return OK;
}
/* implement page download */
static STATUS download_web_page(const char* url, const char* path)
{
U8 buffer[MAX_BLOCK_SIZE];
U32 iNumber;
FILE* hFile;
HINTERNET hSession;
HINTERNET hUrl;
STATUS result;
hSession = InternetOpen("RookIE/1.0", INTERNET_OPEN_TYPE_PRECONFIG, NULL, NULL, 0);
if(NULL == hSession)
{
return ERROR;
}
hUrl = InternetOpenUrl(hSession, url, NULL, 0, INTERNET_FLAG_DONT_CACHE, 0);
if(NULL == hUrl)
{
result = ERROR;
goto error1;
}
hFile = fopen(path, "wb");
if(NULL == hFile)
{
result = ERROR;
goto error2;
}
iNumber = 1;
while(iNumber > 0)
{
InternetReadFile(hUrl, buffer, MAX_BLOCK_SIZE -1, &iNumber);
fwrite(buffer, sizeof(char), iNumber, hFile);
}
fclose(hFile);
result = OK;
error2:
InternetCloseHandle(hUrl);
error1:
InternetCloseHandle(hSession);
return result;
}
/*
 * Program entry point: download the product page, load it into memory and
 * print the text found between <title> and </title>.
 *
 * Returns 0 when the title was printed, 1 when the download, the file read
 * or the title lookup failed (a message is written to stderr).
 */
int main(int argc, char* argv[])
{
    /* Local copy of the downloaded page; 0.html is just the start page. */
    static const char page_path[] = "E:/0.html";
    void* raw = NULL;
    char* begin = NULL;
    int length = 0;
    int size = 0;
    int exit_code = 1;

    (void)argc;
    (void)argv;

    if (OK != download_web_page(HTTP_NAME_ADDRESS, page_path))
    {
        fprintf(stderr, "failed to download %s\n", HTTP_NAME_ADDRESS);
        return 1;
    }

    /* get_file_content() takes a void**, so receive the buffer through a
     * void* rather than passing the address of a char*. */
    if (OK != get_file_content(page_path, &raw, &size))
    {
        fprintf(stderr, "failed to read %s\n", page_path);
        return 1;
    }

    if (OK == find_pattern_content((char*)raw, "<title>", "</title>", &begin, &length))
    {
        show_file_content(begin, length);
        exit_code = 0;
    }
    else
    {
        fprintf(stderr, "no <title> element found in %s\n", page_path);
    }

    free(raw);
    return exit_code;
}
标签:return,title,buffer,信息提取,搜索引擎,char,ERROR,NULL,hFile From: https://blog.51cto.com/feixiaoxing/5880801