300字范文 > 从零开始的爬虫记录

从零开始的爬虫记录

时间：2020-11-06 02:58:08

相关推荐

从零开始的爬虫记录

第一次接触爬虫，感觉还是有点小困难的。前几次看着教学视频还有点束手无策，后来通过上网搜索等方式逐渐熟悉了爬虫，总算是没这么恐惧了。

具体要求

1、选取3-5个代表性的新闻网站（比如新浪新闻、网易新闻等，或者某个垂直领域权威性的网站比如经济领域的雪球财经、东方财富等，或者体育领域的腾讯体育、虎扑体育等等）建立爬虫，针对不同网站的新闻页面进行分析，爬取出编码、标题、作者、时间、关键词、摘要、内容、来源等结构化信息，存储在数据库中。

2、建立网站提供对爬取内容的分项全文搜索，给出所查关键词的时间热度分析。

3、采用Node.JS实现网络爬虫

前期准备

使用IDE为Visual Studio Code

首先安装好Node.js，下载地址为 https://nodejs.org/en/download/

接着安装Node.js的几个模块库，如：Cheerio，Request，Iconv-lite等

具体代码

npm install xxx  // 其中 xxx 为具体模块名

具体过程

第一次使用F12看到网站背后是这样的，还是有点小惊奇。不过当时没截图，就今天登了一下网站顺手截了个图。

以中国新闻网为例的代码如下：

// Extraction expressions for a news page. Each string is JavaScript source
// that newsGet() runs through eval() with the cheerio handle `$` in scope.
// An empty string means "no selector configured"; newsGet() then falls back
// to a default value for that field.
var keywords_format = " $('meta[name=\"keywords\"]').eq(0).attr(\"content\")";
var title_format = "$('title').text()";
var date_format = "$('#pubtime_baidu').text()";
var author_format = "$('#editor_baidu').text()";
var content_format = "$('.left_zw').text()";
var desc_format = " $('meta[name=\"description\"]').eq(0).attr(\"content\")";
var source_format = "$('#source_baidu').text()";

// A link is treated as a news article only if its URL matches this pattern
// (/YYYY/MM-DD/<7 digits>.shtml).
var url_reg = /\/(\d{4})\/(\d{2})-(\d{2})\/(\d{7}).shtml/;

// Matches a date written either with one repeated separator (-, / or .)
// or in the Chinese form "YYYY年M月D日".
var regExp = /((\d{4}|\d{2})(\-|\/|\.)\d{1,2}\3\d{1,2})|(\d{4}年\d{1,2}月\d{1,2}日)/;

// Send a browser-like User-Agent so the site is less likely to block the crawler.
var headers = {
    'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/39.0.2171.65 Safari/537.36'
};

/**
 * Fetch a URL asynchronously through `myRequest`.
 *
 * NOTE(review): `myRequest` is not defined in this listing; presumably it is
 * the `request` npm module required elsewhere in the file - confirm.
 *
 * @param {string} url - Address to fetch.
 * @param {Function} callback - Passed straight to `myRequest`; invoked below
 *     as (err, res, body).
 */
function request(url, callback) {
    const options = {
        url: url,
        // null asks for the undecoded body so it can be converted with iconv
        // by the caller.
        encoding: null,
        //proxy: 'http://x.x.x.x:8080',
        headers: headers,
        timeout: 10000
    };
    myRequest(options, callback);
}

seedget();

/**
 * Turn an href found on the seed page into a full URL.
 *
 * @param {string} href - Raw value of the link's href attribute.
 * @param {string} baseUrl - URL of the seed page the link was found on.
 * @returns {string} The href itself when it is already absolute, otherwise
 *     the href prefixed with "http:" (protocol-relative links) or with the
 *     seed page's directory (everything else).
 */
function resolveNewsUrl(href, baseUrl) {
    // Accept https as well as http; previously an https link fell through to
    // the relative branch and had the seed directory prepended to it.
    if (/^https?:\/\//i.test(href)) return href;
    if (href.startsWith('//')) return 'http:' + href;
    return baseUrl.slice(0, baseUrl.lastIndexOf('/') + 1) + href;
}

/**
 * Normalise a scraped date string to "YYYY-MM-DD".
 *
 * Relies on `Date.prototype.toFormat`, which is not built in.
 * NOTE(review): presumably supplied by the `date-utils` package loaded
 * elsewhere in the file - confirm.
 *
 * @param {string} rawDate - Text in which to look for a date.
 * @param {string} fallback - Value returned when no valid date is found.
 * @returns {string} The formatted date, or `fallback`.
 */
function normalizePublishDate(rawDate, fallback) {
    const match = regExp.exec(rawDate);
    // Without this guard a page lacking a recognisable date crashed the
    // crawler on `null[0]`.
    if (!match) return fallback;
    const dashed = match[0].replace('年', '-').replace('月', '-').replace('日', '');
    const parsed = new Date(dashed);
    if (Number.isNaN(parsed.getTime())) return fallback;
    return parsed.toFormat("YYYY-MM-DD");
}

/**
 * Read the seed page, collect the links selected by `seedURL_format`, and
 * call newsGet() for every matching news URL that is not yet stored in the
 * `fetches` table.
 *
 * Reads `seedURL`, `seedURL_format`, `myEncoding`, `myIconv`, `myCheerio`
 * and `mysql`, none of which are defined in this listing.
 */
function seedget() {
    request(seedURL, function (err, res, body) {
        if (err) {
            // On a failed request there is no body to decode.
            console.log('Failed to fetch seed page ' + seedURL + ': ' + err);
            return;
        }
        // Convert the raw body to a string using the site's encoding.
        const html = myIconv.decode(body, myEncoding);
        const $ = myCheerio.load(html, { decodeEntities: true });

        let seedurl_news;
        try {
            // NOTE: eval runs the configured selector expression; the
            // expression must only ever come from trusted configuration.
            seedurl_news = eval(seedURL_format);
        } catch (e) {
            console.log('url列表所处的html块识别出错：' + e);
            // Nothing to iterate over when the selector itself failed.
            return;
        }

        // Visit every link selected on the seed page.
        seedurl_news.each(function (i, link) {
            let myURL = "";
            try {
                const href = $(link).attr("href");
                if (href == null) return;
                myURL = resolveNewsUrl(href, seedURL);
            } catch (e) {
                console.log('识别种子页面中的新闻链接出错：' + e);
            }

            // Skip anything that does not look like a news article URL.
            if (!url_reg.test(myURL)) return;

            const fetch_url_Sql = 'select url from fetches where url=?';
            const fetch_url_Sql_Params = [myURL];
            mysql.query(fetch_url_Sql, fetch_url_Sql_Params, function (qerr, vals, fields) {
                if (qerr) {
                    // Skip this URL rather than read `.length` of a missing
                    // result set.
                    console.log(qerr);
                    return;
                }
                if (vals.length > 0) {
                    console.log('URL duplicate!');
                } else {
                    newsGet(myURL);
                }
            });
        });
    });
}

/**
 * Fetch one news page, extract its fields with the *_format expressions and
 * insert the result into the `fetches` table.
 *
 * @param {string} myURL - URL of the news page.
 */
function newsGet(myURL) {
    request(myURL, function (err, res, body) {
        if (err) {
            console.log('Failed to fetch news page ' + myURL + ': ' + err);
            return;
        }
        // Convert the raw body to a string using the site's encoding.
        const html_news = myIconv.decode(body, myEncoding);
        const $ = myCheerio.load(html_news, { decodeEntities: true });
        // NOTE(review): `myhtml` is assigned without a declaration in this
        // listing; presumably it is declared elsewhere in the file - confirm.
        myhtml = html_news;

        console.log("转码读取成功:" + myURL);

        // Build the record by evaluating each configured format expression.
        // NOTE: eval is used on the configured expressions only; they must
        // never be taken from untrusted input.
        const record = {};
        const today = (new Date()).toFormat("YYYY-MM-DD");
        record.title = "";
        record.content = "";
        record.publish_date = today;
        record.url = myURL;
        record.source_name = source_name;
        record.source_encoding = myEncoding;
        record.crawltime = new Date();

        // Keywords fall back to the source name when no selector is configured.
        if (keywords_format == "") record.keywords = source_name;
        else record.keywords = eval(keywords_format);

        if (title_format == "") record.title = "";
        else record.title = eval(title_format);

        if (date_format != "") record.publish_date = eval(date_format);
        console.log('date: ' + record.publish_date);
        // Falls back to today's date when the page yields no parseable date.
        record.publish_date = normalizePublishDate(record.publish_date, today);

        // Author falls back to the source name when no selector is configured.
        if (author_format == "") record.author = source_name;
        else record.author = eval(author_format);

        // The first "\r\n<author>" occurrence is removed from the content.
        if (content_format == "") record.content = "";
        else record.content = eval(content_format).replace("\r\n" + record.author, "");

        if (source_format == "") record.source = record.source_name;
        else record.source = eval(source_format).replace("\r\n", "");

        // attr() yields undefined when the meta tag is absent, so default to
        // an empty string before calling replace().
        if (desc_format == "") record.desc = record.title;
        else record.desc = (eval(desc_format) || "").replace("\r\n", "");

        // `source` and `desc` are extracted above but are not among the
        // columns written by this INSERT.
        const fetchAddSql = 'INSERT INTO fetches(url,source_name,source_encoding,title,' +
            'keywords,author,publish_date,crawltime,content) VALUES(?,?,?,?,?,?,?,?,?)';
        const fetchAddSql_Params = [
            record.url,
            record.source_name,
            record.source_encoding,
            record.title,
            record.keywords,
            record.author,
            record.publish_date,
            record.crawltime.toFormat("YYYY-MM-DD HH24:MI:SS"),
            record.content
        ];

        // The original author notes that the url column of the table is
        // unique, so a duplicate URL is rejected by the database and
        // surfaces here as `qerr`.
        mysql.query(fetchAddSql, fetchAddSql_Params, function (qerr, vals, fields) {
            if (qerr) {
                console.log(qerr);
            }
        });
    });
}

以下是引入MySQL后添加的代码

第一次运行出现了这种情况

被浇了一盆冷水…

还好多次尝试后成功了

（前期截的成功的图片找不到了，于是今晚又跑了一下）

第一次成功的时候别提有多兴奋了