//抓取网络小说保存到芒果db实例 nodejs 21.14 MongoDB2.11 2023-12-18
//引入第三方库
let axios = require('axios');
let cheerio = require('cheerio');
const {
MongoClient
} = require('mongodb');
const url = 'mongodb://localhost:27017'; // 请根据实际情况修改MongoDB的URL
const dbName = 'xiaoshuo'; // 请替换为你的数据库名称
const client = new MongoClient(url, {
useNewUrlParser: true,
useUnifiedTopology: true
});
//要抓取的小说id
const shuidlist = {
'0': 54255,
'1': 80304,
'2': 54254,
'3': 80292,
'4':20032,
'5':12891,
'6':66513,
'7':104404,
'8':104454,
'9':20312
};
//设置insertOne插入数据函数 insertOne(要插入的数据, 数据库表名)
async function insertOne(data, dbm) {
try {
await client.connect();
const db = client.db(dbName);
const collection = db.collection(dbm);
const result = await collection.insertOne(data);
console.log('插入第' + data.id + '条数据ID为:', result.insertedId);
} catch (err) {
console.error('连接失败或插入数据失败:', err);
} finally {
await client.close();
}
}
//抓取小说封面,return小说章节链接,保存到芒果db
async function getlist(shuid) {
const url = 'https://www.paomov.com/txt' + shuid + '.shtml'; // 替换为你要抓取的小说网页地址
const chapterList = [];
const chapterContents = {};
await axios.get(url).then(res => {
const html = res.data;
const ok = cheerio.load(html);
const fenlei = ok('meta[property="og:novel:category"]').attr('content'); //分类
const biaoti = ok('meta[property="og:novel:book_name"]').attr('content'); //标题
const zuozhe = ok('meta[property="og:novel:author"]').attr('content'); //作者
const tupian = 'https://www.paomov.com' + ok('meta[property="og:image"]').attr('content'); //图片
const shijian = ok('meta[property="og:novel:update_time"]').attr('content'); //时间
const jianjie = ok('meta[property="og:description"]').attr('content'); //简介
const data = {
'feileiid': 1,
'fenlei': fenlei,
'id': shuid 1,
'biaoti': biaoti,
'zuozhe': zuozhe,
'tupian': tupian,
'shijian': shijian,
'jianjie': jianjie
};
insertOne(data, 'aaaa'); //保存封面信息到数据库
ok('div.box_con > div#list a').each((index, element) => {
const chapterTitle = ok(element).text();
const chapterUrl = ok(element).attr('href').replace('//www', 'https://www');
chapterList.push(chapterTitle);
chapterContents[index] = chapterUrl;
});
}).catch(error => {
console.error('抓取封面失败:', error);
process.exit()
})
return chapterContents;
}
//抓取小说章节标题和内容保存到芒果db
async function getcontent(chapterList, i, shuid) {
await axios.get(chapterList).then(res => {
const html = res.data;
const ok = cheerio.load(html);
const biaoti = ok('h1').text(); //章节标题
const neirong = ok('div#content').text().replace(/\s/g, ''); //章节内容
const data = {
'shuhao': shuid 1,
'id': i 1 + 1 * 1,
'biaoti': biaoti,
'neirong': neirong
};
insertOne(data, 'bbbb'); //保存章节内容到数据库
}).catch(error => {
console.error('抓取内容失败:', error);
process.exit();
})
}
//主函数抓取小说保存到数据库
async function run() {
const moshi = 0; //循环抓取模式,1为测试用
for (var key in shuidlist) {
const list = await getlist(shuidlist[key]);
if (moshi == 0) {
var shuid = shuidlist[key];
for (var i in list) {
//抓取全部
await getcontent(list[i], i, shuidlist[key]);
}
} else {
var shuid = shuidlist[key];
// console.log('小说抓取成功');
// console.log(shuid);
// console.log(list);
// return'1';
//抓取指定,测试用
for (var i = 0; i < 3; i++) {
await getcontent(list[i], i, shuid);
}
}
setTimeout(() => {
console.log('小说抓取成功');
}, 500)
}
setTimeout(() => {
console.log('全部小说抓取成功');
}, 1000)
}
run() //运行主函数
j***@qq.com
- 发布:2023-12-18 15:37
- 更新:2023-12-18 15:37
- 阅读:262