使用 python 没办法抓取 http://39.107.99.235:1008/ 这个站的交易信息。
于是就选了 puppeteer,它运行一个 headless 的 chrome 浏览器,我们可以模拟人工操作,去获取数据信息。
安装 puppeteer
首先我们建立一个叫 btcdata 的目录。
进入这个目录,用 npm init 初始化项目。
然后安装 puppeteer:
# Create the project directory and enter it.
mkdir btcdata
cd btcdata
# Initialise package.json, then add puppeteer as a dependency.
npm init
npm install --save puppeteer
打开一个网页
使用 puppeteer 打开一个浏览器,在开发中把 headless 设为 false,这时候它会打开一个浏览器窗口,我们从现在开始一步一步抓取我们的数据。
const puppeteer = require('puppeteer');
const remoteHost = 'http://39.107.99.235:1008';

/**
 * Launch a visible (non-headless) Chrome instance and open the login page.
 */
async function main() {
  const browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await page.goto(`${remoteHost}/market/login.php`);
}

// Report a failed launch or navigation instead of leaving the promise floating.
main().catch(function (err) {
  console.error(err);
  process.exitCode = 1;
});
模拟鼠标点击登陆
利用浏览器的审查元素,我们找到登陆按钮,然后模拟人工点击。
// Find the login form's submit button and click it the way a user would.
// Runs once the login page has been opened.
const loginBtn = await page.$('input[type=submit]');
await loginBtn.click();
抓取 websockets 数据包
利用浏览器的网络(Network)事件来捕获 websockets 数据包
/**
 * Log the WebSocket activity of a page through a Chrome DevTools Protocol
 * session.
 *
 * Creates a CDP session on the page's target, enables the Network domain and
 * prints one line for each WebSocket lifecycle or frame event.
 *
 * @param {object} page - Puppeteer page whose WebSocket traffic is logged.
 */
async function capture(page) {
  const session = await page.target().createCDPSession();
  await session.send('Network.enable');

  // Each entry pairs a CDP event name with the function that builds its log line.
  const logLines = [
    ['Network.webSocketCreated', (event) => `创建 WebSocket 连接:${event.url}`],
    ['Network.webSocketClosed', () => `WebSocket 连接关闭`],
    ['Network.webSocketFrameSent', (event) => `发送 WebSocket 消息:${event.response.payloadData}`],
    ['Network.webSocketFrameReceived', (event) => `收到 WebSocket 消息:${event.response.payloadData}`],
    ['Network.webSocketWillSendHandshakeRequest', () => `准备发送 WebSocket 握手消息`],
    ['Network.webSocketHandshakeResponseReceived', () => `接收到 WebSocket 握手消息`],
  ];

  for (const [eventName, buildLine] of logLines) {
    session.on(eventName, (event) => {
      console.log(buildLine(event));
    });
  }
}
// Add this after newPage() so that the new page's WebSocket events are logged.
await capture(page);
抓取普通请求数据包
使用页面的 response 事件获取请求信息
// Add this after newPage().
// page.on registers the listener synchronously, so there is nothing to await.
page.on('response', async response => {
  // Only the responses served by redis.php are of interest.
  if (!response.url().startsWith(`${remoteHost}/redis.php`)) {
    return;
  }
  try {
    const text = await response.text();
    console.log(text);
  } catch (err) {
    // Reading the body can fail; log it instead of leaving the rejection
    // of this async listener unhandled.
    console.error(err);
  }
});
完整应用脚本
const puppeteer = require('puppeteer');
const express = require('express');
const expressWs = require('express-ws');
const app = express();
// Enable WebSocket routes (`app.ws`) on the Express app.
expressWs(app);
// WebSocket clients connected to /ws; captured frames are sent to each of them.
let funcs = [];
// Puppeteer browser instance; stays null until start() launches one.
let browser = null;
// Base URL of the site being scraped.
const remoteHost = 'http://39.107.99.235:1008'
// Register every client that connects to /ws so that captured frames can be
// relayed to it, and log whatever the client sends.
app.ws('/ws', function (ws, req){
  funcs.push(ws);
  ws.on('message', function (msg) {
    console.log(msg);
  });
  // Forget the client as soon as it disconnects; otherwise a closed socket
  // stays in `funcs` until a send to it happens to throw.
  ws.on('close', function () {
    funcs = funcs.filter(function (client) {
      return client !== ws;
    });
  });
})
/**
 * Send `data` to every registered WebSocket client.
 *
 * A client whose `send` throws is logged with console.error and removed from
 * the `funcs` list; all other clients are kept in their original order.
 *
 * @param {*} data - Payload handed unchanged to each client's `send`.
 */
function forwardToWs(data) {
  const stillUsable = [];
  for (const client of funcs) {
    try {
      client.send(data);
      stillUsable.push(client);
    } catch (err) {
      console.error(err);
    }
  }
  funcs = stillUsable;
}
/**
 * Watch the WebSocket activity of a page through a Chrome DevTools Protocol
 * session.
 *
 * Every received frame's payload is passed to `callback`; the other
 * WebSocket events are only logged to the console.
 *
 * @param {object} page - Puppeteer page whose WebSocket traffic is observed.
 * @param {function} callback - Called with the payload of each received frame.
 */
async function capture(page, callback) {
  const session = await page.target().createCDPSession();
  await session.send('Network.enable');

  const log = (line) => () => {
    console.log(line);
  };

  // Listeners keyed by CDP event name, registered in this order.
  const listeners = {
    'Network.webSocketCreated': (event) => {
      console.log(`创建 WebSocket 连接:${event.url}`);
    },
    'Network.webSocketClosed': log(`WebSocket 连接关闭`),
    'Network.webSocketFrameSent': (event) => {
      console.log(`发送 WebSocket 消息:${event.response.payloadData}`);
    },
    // Received frames are handed to the caller instead of being logged.
    'Network.webSocketFrameReceived': (event) => {
      callback(event.response.payloadData);
    },
    'Network.webSocketWillSendHandshakeRequest': log(`准备发送 WebSocket 握手消息`),
    'Network.webSocketHandshakeResponseReceived': log(`接收到 WebSocket 握手消息`),
  };

  for (const eventName of Object.keys(listeners)) {
    session.on(eventName, listeners[eventName]);
  }
}
/**
 * Open one market page in a new tab and relay its WebSocket frames.
 *
 * Capturing is set up before the navigation starts.
 *
 * @param {object} browser - Browser in which the new page is opened.
 * @param {number|string} marketid - Value placed in the `marketid` query parameter.
 */
async function otherPage(browser, marketid) {
  const marketUrl = `${remoteHost}/market/market.php?marketid=${marketid}`;
  const tab = await browser.newPage();
  await capture(tab, forwardToWs);
  await tab.goto(marketUrl);
}
/**
 * Launch the browser, log in, and open one market page per market id.
 *
 * The launched browser is stored in the module-level `browser` variable.
 *
 * @param {Array<number|string>} [marketIds] - Market ids to open after
 *   logging in, in order. Defaults to the ids this script originally
 *   hard-coded.
 * @throws {Error} If the login page contains no submit button.
 */
async function start(marketIds = [2, 3, 9, 4, 7]) {
  browser = await puppeteer.launch({headless: false});
  const page = await browser.newPage();
  await capture(page, forwardToWs);
  await page.goto(`${remoteHost}/market/login.php`);
  const loginBtn = await page.$('input[type=submit]');
  if (!loginBtn) {
    // page.$ resolves to null when no element matches the selector; fail with
    // a message that names the problem instead of a TypeError on `.click`.
    throw new Error('login button (input[type=submit]) not found on the login page');
  }
  await loginBtn.click();
  // Open the market pages one at a time, in the listed order.
  for (const marketid of marketIds) {
    await otherPage(browser, marketid);
  }
}
// POST /start: launch the browser session and begin relaying market data.
app.post('/start', async function(req, res) {
  try {
    await start();
    res.json({'result': 'OK'});
  } catch (err) {
    // Express 4 does not catch rejections from async handlers; without this
    // the request would never be answered and the rejection would go unhandled.
    console.error(err);
    res.status(500).json({'result': 'ERROR', 'message': String(err)});
  }
});
// POST /stop: close the browser if one is running.
app.post('/stop', async function(req, res) {
  try {
    // `browser` is still null when /start has never been called.
    if (browser) {
      await browser.close();
      browser = null;
    }
    res.json({'result': 'OK'});
  } catch (err) {
    // Answer with an error instead of leaving the request hanging.
    console.error(err);
    res.status(500).json({'result': 'ERROR', 'message': String(err)});
  }
});
// POST /restart: close the running browser (if any) and start a fresh session.
app.post('/restart', async function(req, res) {
  try {
    // `browser` is still null when /start has never been called.
    if (browser) {
      await browser.close();
    }
    await start();
    res.json({'result': 'OK'});
  } catch (err) {
    // Answer with an error instead of leaving the request hanging.
    console.error(err);
    res.status(500).json({'result': 'ERROR', 'message': String(err)});
  }
});
// GET /fetch?code=...&type=...&interval=...
// Opens a temporary page for the requested chart and answers with the data
// extracted from the first redis.php response that page receives.
// `type` defaults to 'line' and `interval` to '1m'.
app.get('/fetch', async function(req, res) {
  const code = req.query.code;
  const type = req.query.type || 'line';
  const interval = req.query.interval || '1m';
  let page = null;
  // Only the first redis.php response may answer the request; sending a
  // second time would fail because the reply has already gone out.
  let answered = false;

  // Log the error and, if nothing has been sent yet, reply with HTTP 500.
  function fail(err) {
    console.error(err);
    if (!res.headersSent) {
      res.status(500).json({'result': 'ERROR', 'message': String(err)});
    }
  }

  // Close the temporary page without letting a close failure escape.
  async function closePage() {
    try {
      await page.close();
    } catch (err) {
      console.error(err);
    }
  }

  try {
    page = await browser.newPage();
    // page.on registers the listener synchronously; there is nothing to await.
    page.on('response', async response => {
      if (answered || !response.url().startsWith(`${remoteHost}/redis.php`)) {
        return;
      }
      answered = true;
      try {
        const text = await response.text();
        // Keep everything from the first '[[' up to, but not including, the
        // last character of the body.
        const idx = text.indexOf('[[');
        res.send(text.substring(idx, text.length - 1));
      } catch (err) {
        fail(err);
      } finally {
        await closePage();
      }
    });
    // The values come from the client, so escape them before building the URL.
    const query = `code=${encodeURIComponent(code)}` +
      `&type=${encodeURIComponent(type)}` +
      `&interval=${encodeURIComponent(interval)}`;
    await page.goto(`${remoteHost}/market/page.php?${query}`);
  } catch (err) {
    if (answered) {
      // The page is closed as soon as the data arrives, which may reject a
      // navigation that is still in flight; the request is already handled.
      return;
    }
    fail(err);
    if (page) {
      await closePage();
    }
  }
})
// listen() takes (port, host). With the arguments reversed, Node treats the
// non-numeric string as an IPC socket path rather than a host name.
app.listen(8080, '127.0.0.1');
作者:Lupino
链接:https://www.jianshu.com/p/deff4bb94927
来源:简书
著作权归作者所有。商业转载请联系作者获得授权,非商业转载请注明出处。
标签:function,const,await,抓取,实时,puppeteer,params,console,page From: https://blog.51cto.com/u_16036560/6155223