cheerio 是 Node.js 的 HTML 解析模块(常用于页面抓取):只需传入 HTML 字符串,就可以像 jQuery 一样获取和操作 DOM 结构信息:
// Load the cheerio module.
const cheerio = require('cheerio')
// Parse an HTML string; the returned `$` is then queried with CSS selectors.
const $ = cheerio.load('<h2 class="title">Hello world</h2>')
// Replace the text of the <h2> that has class "title".
$('h2.title').text('Hello there!')
// Add the class "welcome" to the <h2>.
$('h2').addClass('welcome')
// Serialize the document back to an HTML string.
$.html()
//=> <html><head></head><body><h2 class="title welcome">Hello there!</h2></body></html>
在命令终端中(空目录下)依次执行:
npm init(一直回车初始化项目)
npm install cheerio --save(安装依赖)
node index.js(启动爬取脚本)
index.js
// Load the built-in HTTPS, HTTP and file-system modules.
var https = require('https');
var http = require('http');
var fs = require('fs');
// Cheerio is a Node.js library that builds a DOM structure from an HTML fragment
// and then offers jQuery-like CSS selector queries on it.
var cheerio = require('cheerio');
// Ignore HTTPS certificate errors.
// NOTE(review): '0' turns off TLS certificate verification for every HTTPS
// request made by this process, which allows man-in-the-middle attacks.
// Confirm it is really required before using this outside a throwaway script.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
/**
 * Fetches one gank.io page, extracts the featured image and the link to the
 * next page, and hands both to saveImg.
 *
 * Supported call forms:
 *   getGank(fileName)           fetch the site root and save its image as fileName
 *   getGank(pageUrl, fileName)  fetch pageUrl and save its image as fileName
 *
 * The two-argument form is the one saveImg uses to continue the crawl.
 *
 * @param {string} fileName - Image file name (without extension) when called
 *   with one argument; the page URL when called with two arguments.
 * @param {string} [pageFileName] - Image file name (without extension) when a
 *   page URL is passed as the first argument.
 */
function getGank(fileName, pageFileName) {
    // Site origin: used for the first request and to resolve relative links.
    const root = 'https://gank.io';
    const hasPageUrl = pageFileName !== undefined;
    const pageUrl = new URL(hasPageUrl ? fileName : '/', root);
    const targetName = hasPageUrl ? pageFileName : fileName;

    // Request address, port and headers.
    const options = {
        hostname: pageUrl.hostname,
        port: 443,
        path: pageUrl.pathname + pageUrl.search,
        method: 'GET',
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        },
    };

    https.get(options, function (res) {
        let html = '';
        // Decode as UTF-8 so multi-byte characters split across chunks stay intact.
        res.setEncoding('utf8');
        // Collect the page body.
        res.on('data', function (chunk) {
            html += chunk;
        });
        // The whole page has been received.
        res.on('end', function () {
            const $ = cheerio.load(html);
            // These selectors have to be updated whenever the site layout changes.
            const imgSrc = $("div.container.content div.outlink img").attr("src");
            const href = $("div.container.content div.row div.six.columns").eq(1).find("a").attr("href");

            if (!href) {
                // Without a link to the next page the crawl cannot continue.
                console.log('获取数据出错!', 'next page link not found: ' + pageUrl.href);
                return;
            }

            const nextUrl = new URL(href, root).href;
            // Turn every "/" into "-" and drop the first "-" (e.g. "/2019/05/01" -> "2019-05-01").
            const nextFileName = href.replace(/\//g, "-").replace("-", "");

            if (!imgSrc) {
                // Some pages have a different DOM structure and expose no image: skip them.
                console.log('down fail:' + 'no image found on ' + pageUrl.href);
                getGank(nextUrl, nextFileName);
                return;
            }

            saveImg(imgSrc, targetName, nextUrl, nextFileName);
        });
    }).on('error', function (err) {
        console.log('获取数据出错!', err && err.message);
    });
}
/**
 * Downloads one image to ./image/<fileName>.jpg and then continues the crawl
 * with the next page. An "image" folder must already exist in the directory
 * that contains index.js.
 *
 * @param {string} url - Absolute http(s) URL of the image.
 * @param {string} fileName - File name (without extension) for the image.
 * @param {string} nextUrl - URL of the next page, passed on to getGank.
 * @param {string} nextFileName - File name for the next page's image, passed on to getGank.
 */
function saveImg(url, fileName, nextUrl, nextFileName) {
    // Decide by the URL scheme only; "https" appearing later in the URL must not
    // select the https module for a plain-http address.
    const protocol = /^https:/i.test(url) ? https : http;

    protocol.get(url, function (res) {
        // Keep the raw Buffers instead of building a binary string.
        const chunks = [];
        res.on("data", function (chunk) {
            chunks.push(chunk);
        });
        res.on("end", function () {
            fs.writeFile("./image/" + fileName + ".jpg", Buffer.concat(chunks), function (err) {
                if (err) {
                    console.log("down fail:" + err);
                } else {
                    console.log("down success:" + fileName);
                }
                // Move on to the next page whether or not this image was written.
                getGank(nextUrl, nextFileName);
            });
        });
    }).on("error", function (err) {
        // A failed download should not stop the crawl or crash the process.
        console.log("down fail:" + err);
        getGank(nextUrl, nextFileName);
    });
}
// Start the crawl; 'new' is the file name used for the first saved image.
getGank('new');
以上是通过分析网页的方式爬取图片。由于部分文章的 DOM 结构比较特殊,这种方式无法获取其中的图片;幸运的是,该网站也提供了可以直接获取图片的 API,因此也可以用以下方式获取,并且无需安装任何第三方依赖。
index.js (API方式获取)
// Built-in HTTPS, HTTP and file-system modules; no third-party dependency is needed.
var https = require('https');
var http = require('http');
var fs = require('fs');
// Target address of the crawler (the address itself is set in the request
// options inside getGank).
// Ignore HTTPS certificate errors.
// NOTE(review): '0' turns off TLS certificate verification for every HTTPS
// request made by this process, which allows man-in-the-middle attacks.
// Confirm it is really required before using this outside a throwaway script.
process.env.NODE_TLS_REJECT_UNAUTHORIZED = '0';
// List of image records: assigned by getGank from the API response and read by saveImg.
var data;
/**
 * Requests the image list from the gank.io API, stores the `results` array in
 * the module-level `data` variable, and starts downloading with saveImg(0).
 *
 * If the request fails or the response is not the expected JSON, an error is
 * logged and nothing is downloaded.
 */
function getGank() {
    const options = {
        hostname: 'gank.io',
        port: 443,
        path: '/api/data/%E7%A6%8F%E5%88%A9/1000/1',
        method: 'GET',
        headers: {
            'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/74.0.3729.131 Safari/537.36'
        },
    };

    https.get(options, function (res) {
        let body = '';
        // Decode as UTF-8 so multi-byte characters split across chunks stay intact.
        res.setEncoding('utf8');
        // Collect the response body. The parameter is named `chunk` so that it
        // does not shadow the module-level `data`.
        res.on('data', function (chunk) {
            body += chunk;
        });
        // The whole response has been received.
        res.on('end', function () {
            let results;
            try {
                results = JSON.parse(body).results;
            } catch (err) {
                // A non-JSON response (e.g. an HTML error page) must not crash the process.
                console.log('获取数据出错!', err.message);
                return;
            }
            if (!Array.isArray(results)) {
                console.log('获取数据出错!', 'unexpected response format');
                return;
            }
            data = results;
            console.log("图片总数:" + data.length);
            saveImg(0);
        });
    }).on('error', function (err) {
        console.log('获取数据出错!', err && err.message);
    });
}
/**
 * Downloads the image at data[index] into ./image/ and then processes the
 * next index, one image at a time. Images whose file already exists are
 * skipped. Stops once index reaches the end of `data`.
 *
 * @param {number} index - Position in the module-level `data` array.
 */
function saveImg(index) {
    if (index >= data.length) {
        return;
    }

    const item = data[index];
    const url = item.url;
    // ":" is replaced with "-" in the timestamp used as the file name
    // (presumably because ":" is not allowed in file names on some systems).
    const fileName = item.createdAt.replace(/:/g, "-");
    const filePath = "./image/" + fileName + ".jpg";
    const next = function () {
        saveImg(index + 1);
    };

    // fs.exists is deprecated; fs.access reports a missing file through its error argument.
    fs.access(filePath, fs.constants.F_OK, function (accessErr) {
        if (!accessErr) {
            // Already downloaded on an earlier run.
            next();
            return;
        }

        // Decide by the URL scheme only, not by "https" appearing anywhere in the URL.
        const protocol = /^https:/i.test(url) ? https : http;
        protocol.get(url, function (res) {
            // Keep the raw Buffers instead of building a binary string.
            const chunks = [];
            res.on("data", function (chunk) {
                chunks.push(chunk);
            });
            res.on("end", function () {
                fs.writeFile(filePath, Buffer.concat(chunks), function (err) {
                    if (err) {
                        console.log("down fail:" + err);
                    } else {
                        console.log("down success:" + fileName);
                    }
                    next();
                });
            });
        }).on("error", function (err) {
            // One unreachable image must not stop the remaining downloads.
            console.log("down fail:" + err);
            next();
        });
    });
}
// Entry point: make sure the ./image folder exists, then start fetching.
fs.exists("./image", function (hasImageDir) {
    console.log("当前目录是否存在image文件夹:" + hasImageDir);

    // The folder is already there: start right away.
    if (hasImageDir) {
        getGank();
        return;
    }

    // Otherwise create the folder first and start once that succeeded.
    fs.mkdir("./image", function (mkdirErr) {
        if (mkdirErr) {
            console.error(mkdirErr);
            return;
        }
        console.log("目录创建成功。");
        getGank();
    });
});