A Node.js crawler for Jianshu articles

A Node.js crawler that scrapes Jianshu articles and stores them in a MySQL database with Sequelize.


First, create a database named 'jianshu'.
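If you prefer doing that from code instead of a MySQL client, here is a minimal sketch using the mysql2 promise API; the driver choice and the utf8mb4 character set are my assumptions, and any MySQL client or GUI tool works just as well.

// Minimal sketch: create the jianshu database if it does not exist.
// Assumes the mysql2 driver and the same credentials as config.js below.
const mysql = require('mysql2/promise');

(async () => {
  const connection = await mysql.createConnection({
    host: 'localhost',
    port: 3306,
    user: 'root',
    password: 'root'
  });
  await connection.query('CREATE DATABASE IF NOT EXISTS jianshu DEFAULT CHARACTER SET utf8mb4');
  await connection.end();
})();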


Once that's done, the project ends up with roughly this structure: config.js, model.js, and jianshu.js.


config.js

const config = {
  database: 'jianshu',
  username: 'root',
  password: 'root',
  host: 'localhost',
  port: 3306
};

module.exports = config;

model.js

const config = require('./config');
const Sequelize = require('sequelize');

// Connect to MySQL with a small connection pool
const sequelize = new Sequelize(config.database, config.username, config.password, {
  host: config.host,
  dialect: 'mysql',
  pool: {
    max: 5,
    min: 0,
    idle: 30000
  }
});

// Article model
let Content = sequelize.define('Content', {
  id: {
    type: Sequelize.INTEGER,
    primaryKey: true,
    autoIncrement: true
  },
  author: Sequelize.STRING(255),
  avatar: Sequelize.STRING(255), // author avatar URL, written by the crawler below
  title: Sequelize.STRING(255),
  summary: Sequelize.TEXT,
  watchedCount: {
    type: Sequelize.BIGINT,
    defaultValue: 0
  },
  content: Sequelize.TEXT,
  realContentHref: Sequelize.STRING(100),
  shareTime: Sequelize.STRING(50),
  createdTime: Sequelize.BIGINT,
  updatedTime: Sequelize.BIGINT
}, {
  timestamps: false // createdTime/updatedTime are managed by the crawler itself
});

module.exports = Content;
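One thing to note: the crawler below only calls ContentModel.create(), which does not create the table, so the Contents table (Sequelize pluralizes the model name by default) has to exist before the first run. A minimal sketch for creating it from the model definition follows; the sync.js filename is just for illustration.

// sync.js - create the Contents table from the model definition.
// Run this once before starting the crawler.
const Content = require('./model');

Content.sync().then(() => {
  console.log('Contents table is ready');
}).catch((err) => {
  console.log('sync failed: ' + err);
});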

jianshu.js

const superagent = require('superagent');
const cheerio = require('cheerio');
const async = require('async');
const ContentModel = require('./model'); // database model

let jianshuUrl = 'http://www.jianshu.com';
let specialUrl = 'http://www.jianshu.com/recommendations/collections?utm_medium=index-collections&utm_source=desktop';

// Turn a relative path into an absolute Jianshu URL
let replaceSpecialUrl = (str) => {
  return `${jianshuUrl}${str}`;
};
// Trim leading and trailing whitespace
let replaceStr = (str) => {
  return str.replace(/(^\s+)|(\s+$)/g, '');
};

// Fetch the list of Jianshu collections
let getSpecialJson = async () => {
  console.log('<---------- start fetching Jianshu collections -------->');
  let specialJson = [];
  return new Promise((resolve, reject) => {
    superagent.get(specialUrl).end((err, res) => {
      if (err) {
        // bail out early on request errors instead of touching res
        console.log(err);
        return resolve(specialJson);
      }
      let $ = cheerio.load(res.text);
      let collectionWrap = $('#list-container .collection-wrap');
      collectionWrap.each((i, e) => {
        let specialAvatar = replaceSpecialUrl($(e).find('a:first-child img').attr('src'));
        let specialHref = replaceSpecialUrl($(e).find('a:first-child').attr('href'));
        let specialName = $(e).find('a:first-child .name').text();
        specialJson.push({
          specialAvatar: specialAvatar,
          specialHref: specialHref,
          specialName: specialName
        });
        console.log(`fetching collection: ${specialName}`);
      });
      resolve(specialJson);
    });
  });
};

// Fetch the article summaries of every collection
let getSpecialSummaryJson = async () => {
  console.log('<---------- start fetching collection data -------->');
  let specialJson = await getSpecialJson();
  let result = [];
  let concurrencyCount = 0;
  let getCount = 0;
  return new Promise((resolve, reject) => {
    // async.queue with a concurrency of 1: collections are crawled one by one
    let queue = async.queue((specialJson, callback) => {
      let delay = parseInt((Math.random() * 30000000) % 1000, 10);
      concurrencyCount++;
      superagent.get(specialJson.specialHref).end((err, res) => {
        if (err) {
          console.log(err);
          callback(null);
        } else {
          let $ = cheerio.load(res.text);
          let dataList = $('.note-list li');
          concurrencyCount--;
          dataList.each((i, e) => {
            let avatar = $(e).find('.content .avatar img').attr('src');
            let author = replaceStr($(e).find('.content .name').text());
            let title = $(e).find('.content .title').text();
            let summary = replaceStr($(e).find('.content .abstract').text());
            let realContentHref = replaceSpecialUrl($(e).find('.content .title').attr('href'));
            let shareTime = $(e).find('.content .time').attr('data-shared-at');
            let watchedCount = replaceStr($(e).find('.content .meta a:first-child').text());
            let pushData = {
              author: author,
              avatar: `http:${avatar}`,
              title: title,
              summary: summary,
              realContentHref: realContentHref,
              shareTime: shareTime,
              watchedCount: watchedCount
            };
            console.log(`fetching: ${title}  concurrency: ${concurrencyCount}  delay: ${delay}ms`);
            getCount++;
            result.push(pushData);
          });
          callback(null);
        }
      });
    }, 1);
    queue.drain = function () {
      resolve(result);
    };
    queue.push(specialJson);
  });
};

// Fetch the full text of every article
let getRealContentJson = async () => {
  let specialSummaryJson = await getSpecialSummaryJson();
  let concurrencyCount = 0;
  let realContentJson = [];
  return new Promise((resolve, reject) => {
    let queue = async.queue((specialSummaryJson, callback) => {
      let delay = parseInt((Math.random() * 30000000) % 1000, 10);
      concurrencyCount++;
      superagent.get(specialSummaryJson.realContentHref).end((err, res) => {
        if (err) {
          console.log(err);
          callback(null);
        } else {
          let $ = cheerio.load(res.text);
          let content = '';
          // concatenate every paragraph of the article body
          $('.show-content>p').each((i, e) => {
            content += $(e).text();
          });
          realContentJson.push({
            content: content,
            author: specialSummaryJson.author,
            summary: specialSummaryJson.summary,
            avatar: specialSummaryJson.avatar,
            watchedCount: specialSummaryJson.watchedCount,
            realContentHref: specialSummaryJson.realContentHref,
            shareTime: specialSummaryJson.shareTime,
            title: specialSummaryJson.title
          });
          callback(null);
        }
      });
    }, 1);
    queue.drain = () => {
      console.log('<--------------- article bodies fetched --------------->');
      resolve(realContentJson);
    };
    queue.push(specialSummaryJson);
  });
};

// Write every article into the database
let writeRealContent = async () => {
  let realContentJson = await getRealContentJson();
  let concurrencyCount = 0;
  let queue = async.queue((realContentJson, callback) => {
    let delay = parseInt((Math.random() * 30000000) % 1000, 10);
    concurrencyCount++;
    let DateNow = Date.now();
    ContentModel.create({
      author: realContentJson.author,
      avatar: realContentJson.avatar,
      title: realContentJson.title,
      summary: realContentJson.summary,
      content: realContentJson.content,
      watchedCount: realContentJson.watchedCount,
      shareTime: realContentJson.shareTime,
      realContentHref: realContentJson.realContentHref,
      createdTime: DateNow,
      updatedTime: DateNow
    }).then(function (result) {
      console.log(`wrote article "${result.title}" successfully!`);
    }).catch(function (err) {
      console.log('failed: ' + err);
    });
    // random delay before taking the next task, to avoid hammering the site
    setTimeout(() => {
      concurrencyCount--;
      callback(null);
    }, delay);
  }, 1);
  // callback when every queued task has finished
  queue.drain = () => {
    console.log('all articles written');
  };
  // push all tasks onto the queue
  queue.push(realContentJson);
};

writeRealContent();

Then run node jianshu from the command line to start the crawler (make sure superagent, cheerio, async, sequelize and a MySQL driver are installed first).

Finally, check the database to see whether the articles were written successfully.
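Besides opening the table in a MySQL client, you can read a few rows back through the same model. A minimal sketch follows; the Contents table name again comes from Sequelize's default pluralization of the Content model.

// Read the newest rows back to confirm the crawl wrote data.
const Content = require('./model');

Content.findAll({ limit: 5, order: [['createdTime', 'DESC']] }).then((rows) => {
  rows.forEach((row) => {
    console.log(row.title, row.author, row.watchedCount);
  });
});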
