爬取网页
http模块
app.js:
var http = require('http');
var url = require('url');
/**
 * Fetch a page over HTTP and hand its complete body to a callback.
 *
 * The request is issued with http.get on the result of url.parse(link).
 * No 'error' listener is attached here, so request failures are not
 * reported through `cb`.
 *
 * @param {string} link - URL of the page to download.
 * @param {function(string): void} cb - Called once, when the response ends,
 *     with the accumulated body text.
 */
function spider(link, cb) {
  http.get(url.parse(link), function (res) {
    var body = '';
    // Decode at the stream level: converting each Buffer chunk to a string
    // on its own corrupts any multi-byte UTF-8 character that happens to be
    // split across two chunks.
    res.setEncoding('utf8');
    res.on('data', function (chunk) {
      body += chunk;
    });
    res.on('end', function () {
      cb(body);
    });
  });
}
// Command-line entry point. The crawl runs only when this file is executed
// directly (require.main === module), so requiring it from another module
// no longer fires a request against the empty default link.
var link = "";
if (require.main === module) {
  // The first CLI argument is the URL to fetch.
  link = process.argv[2];
  if (!link) {
    // No URL supplied: report usage instead of passing undefined to spider.
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  } else {
    spider(link, function (data) {
      console.log(data);
    });
  }
}
运行:
node app.js "http://www.baidu.com"
nodegrass模块
安装模块:
npm i nodegrass --save
app.js:
var ng = require('nodegrass');
// Command-line entry point: download the URL given as the first argument
// with nodegrass and print what the callback receives. The request is made
// only when this file is run directly, never when it is merely required.
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (!link) {
    // No URL supplied: report usage instead of requesting an undefined link.
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  } else {
    // NOTE(review): 'utf8' is presumably the response charset — confirm
    // against the nodegrass documentation.
    ng.get(link, function (data) {
      console.log(data);
    }, 'utf8');
  }
}
运行:
node app.js "http://www.baidu.com"
superagent模块
安装模块:
npm i superagent --save
app.js:
var superagent = require("superagent");
// Command-line entry point: request the URL given as the first argument
// with superagent and print the response object.
var link = "";
if (require.main === module) {
  // Store the argument in the declared `link` variable; assigning to an
  // undeclared `url` would create an implicit global and leave `link` unused.
  link = process.argv[2];
  if (!link) {
    // No URL supplied: report usage instead of requesting an undefined link.
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  } else {
    superagent
      .get(link)
      .end(function (err, res) {
        if (err) {
          // Surface the failure instead of silently printing the response.
          console.error(err);
          return;
        }
        console.log(res);
      });
  }
}
运行:
node app.js "http://www.baidu.com"
curl模块
安装模块:
npm i curl --save
app.js:
var curl = require("curl");
// Command-line entry point: request the URL given as the first argument
// with the curl module and dump everything the callback receives. The
// request is made only when this file is run directly.
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (!link) {
    // No URL supplied: report usage instead of requesting an undefined link.
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  } else {
    curl.get(link, function () {
      // Print the raw callback arguments to show what the module passes back.
      console.log(arguments);
    });
  }
}
运行:
node app.js "http://www.baidu.com"
解析网页
cheerio模块
以http模块方法为例:
安装模块:
npm i cheerio --save
app.js:
var http = require('http');
var url = require('url');
var cheerio = require("cheerio");
/**
 * Fetch a page over HTTP and hand its complete body to a callback.
 *
 * The request is issued with http.get on the result of url.parse(link).
 * No 'error' listener is attached here, so request failures are not
 * reported through `cb`.
 *
 * @param {string} link - URL of the page to download.
 * @param {function(string): void} cb - Called once, when the response ends,
 *     with the accumulated body text.
 */
function spider(link, cb) {
  http.get(url.parse(link), function (res) {
    var body = '';
    // Decode at the stream level: converting each Buffer chunk to a string
    // on its own corrupts any multi-byte UTF-8 character that happens to be
    // split across two chunks.
    res.setEncoding('utf8');
    res.on('data', function (chunk) {
      body += chunk;
    });
    res.on('end', function () {
      cb(body);
    });
  });
}
// Command-line entry point: download the URL given as the first argument,
// load the HTML into cheerio and print parts of it. The crawl runs only
// when this file is executed directly (require.main === module).
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (!link) {
    // No URL supplied: report usage instead of passing undefined to spider.
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  } else {
    spider(link, function (data) {
      var $ = cheerio.load(data);
      // Print the whole loaded document first, then only the inner HTML of
      // the element whose id is "lg".
      console.log($.html());
      console.log($("#lg").html());
    });
  }
}