使用node实现网页爬虫

爬取网页

http模块

app.js

var http = require('http');
var url = require('url');

/**
 * Download a page over HTTP and pass its body to a callback.
 *
 * @param {string} link - Absolute URL of the page to fetch.
 * @param {function(string): void} cb - Called once, with the complete
 *   response body, after the response stream has ended.
 */
function spider(link, cb) {
  http.get(url.parse(link), function(res) {
    var body = '';
    // Decode at the stream level: appending raw Buffer chunks to a string
    // converts each chunk separately and corrupts any multi-byte character
    // that happens to be split across two chunks.
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
      body += chunk;
    });
    res.on('end', function() {
      cb(body);
    });
  }).on('error', function(err) {
    // Without this listener a failed request (DNS failure, refused
    // connection, ...) is thrown as an uncaught 'error' event.
    console.error('spider: request for ' + link + ' failed: ' + err.message);
  });
}

// Command-line entry point: fetch the URL given as the first argument and
// print the response body. Nothing is fetched when this file is loaded with
// require() or when no URL was supplied.
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (link) {
    spider(link, function(data) {
      console.log(data);
    });
  } else {
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  }
}

运行:

node app.js "http://www.baidu.com"

nodegrass模块

安装模块

npm i nodegrass --save

app.js

var ng = require('nodegrass');

// Command-line entry point: fetch the URL given as the first argument with
// nodegrass and print what the callback receives. Nothing is fetched when
// this file is loaded with require() or when no URL was supplied.
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (link) {
    ng.get(link, function(data) {
      console.log(data);
    }, 'utf8');
  } else {
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  }
}

运行:

node app.js "http://www.baidu.com"

superagent模块

安装模块

npm i superagent --save

app.js

var superagent = require("superagent");

// Command-line entry point: fetch the URL given as the first argument with
// superagent and print the response object. Nothing is fetched when this
// file is loaded with require() or when no URL was supplied.
var link = "";
if (require.main === module) {
  // Store the argument in `link`; assigning to an undeclared `url` would
  // create an implicit global and leave `link` unused.
  link = process.argv[2];
  if (link) {
    superagent
      .get(link)
      .end(function (err, res) {
        if (err) {
          console.error('Request for ' + link + ' failed: ' + err.message);
        }
        // A response may still accompany an error, so print it when present.
        if (res) {
          console.log(res);
        }
      });
  } else {
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  }
}

运行:

node app.js "http://www.baidu.com"

curl模块

安装模块

npm i curl --save

app.js

var curl = require("curl");

// Command-line entry point: fetch the URL given as the first argument with
// the curl module and dump everything the callback receives. Nothing is
// fetched when this file is loaded with require() or when no URL was supplied.
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (link) {
    curl.get(link, function() {
      // Print the raw arguments object to show what the module passes back.
      console.log(arguments);
    });
  } else {
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  }
}

运行:

node app.js "http://www.baidu.com"

解析网页

cheerio模块

以http模块方法为例:

安装模块

npm i cheerio --save

app.js

var http = require('http');
var url = require('url');
var cheerio = require("cheerio");

/**
 * Download a page over HTTP and pass its body to a callback.
 *
 * @param {string} link - Absolute URL of the page to fetch.
 * @param {function(string): void} cb - Called once, with the complete
 *   response body, after the response stream has ended.
 */
function spider(link, cb) {
  http.get(url.parse(link), function(res) {
    var body = '';
    // Decode at the stream level: appending raw Buffer chunks to a string
    // converts each chunk separately and corrupts any multi-byte character
    // that happens to be split across two chunks.
    res.setEncoding('utf8');
    res.on('data', function(chunk) {
      body += chunk;
    });
    res.on('end', function() {
      cb(body);
    });
  }).on('error', function(err) {
    // Without this listener a failed request (DNS failure, refused
    // connection, ...) is thrown as an uncaught 'error' event.
    console.error('spider: request for ' + link + ' failed: ' + err.message);
  });
}

// Command-line entry point: fetch the URL given as the first argument, load
// the HTML into cheerio, then print the whole document followed by the inner
// HTML of the element with id "lg". Nothing is fetched when this file is
// loaded with require() or when no URL was supplied.
var link = "";
if (require.main === module) {
  link = process.argv[2];
  if (link) {
    spider(link, function(data) {
      var $ = cheerio.load(data);
      console.log($.html());
      console.log($("#lg").html());
    });
  } else {
    console.error('Usage: node app.js <url>');
    process.exitCode = 1;
  }
}