使用 cheerio | zKing的学习乐园

摘要

cheerio 的语法和 jQuery 差不多，可以与 node.js 的 http 模块搭配，用来爬取网页数据

# 献上文档

# 快速上手

cnpm install cheerio -S

const cheerio = require('cheerio');

// Load an HTML fragment; the returned `$` is queried with jQuery-style selectors.
const $ = cheerio.load('<h2 class="title">Hello world</h2>');

// Replace the text of the heading that carries the "title" class.
const titledHeading = $('h2.title');
titledHeading.text('Hello there!');

// Add an extra class to every <h2> in the document.
const allHeadings = $('h2');
allHeadings.addClass('welcome');

// Serialize the modified document back to an HTML string.
// NOTE(review): the output below is as given in the original write-up; some
// cheerio versions may wrap the fragment in <html>/<body> — confirm against
// the installed version.
$.html();
//=> <h2 class="title welcome">Hello there!</h2>

# 爬取数据

这里以爬取菜鸟教程的首页为例

const http = require('http')
const cheerio = require('cheerio');

let url = 'http://www.runoob.com/'
/**
 * Extract the tutorial listing from the page HTML, grouped by category.
 *
 * For each `.codelist-desktop` element found under `.main` and
 * `.middle-column-home`, the `<h2>` text becomes the category and every
 * `.item-top` element contributes one `{ title, content }` entry taken from
 * its `<h4>` and `<strong>` text.
 *
 * The result is printed with `console.log` and also returned to the caller.
 *
 * @param {string} html - Raw HTML of the page to parse.
 * @returns {Array<{category: string, temp: Array<{title: string, content: string}>}>}
 *   One record per category; the entry list is kept under the `temp` key.
 */
const parseInfo = (html) => {
  const $ = cheerio.load(html);
  const res = [];

  $('.main')
    .find('.middle-column-home')
    .find('.codelist-desktop')
    .each((_sectionIndex, section) => {
      const category = $(section).find('h2').text();
      // A fresh list per category, so records never share one mutable array
      // that has to be reset by hand after each push.
      const temp = [];

      $(section).find('.item-top').each((_itemIndex, item) => {
        temp.push({
          title: $(item).find('h4').text(),
          content: $(item).find('strong').text(),
        });
      });

      res.push({ category, temp });
    });

  console.log(res);
  return res;
};

// Download the page at `url` and pass the complete response body to parseInfo.
http.get(url, (res) => {
  // Anything other than 200 (for example a redirect) carries no page to parse.
  if (res.statusCode !== 200) {
    console.log('获取数据出错！', `HTTP ${res.statusCode}`);
    res.resume(); // consume the response data so its resources are released
    return;
  }

  // Decode as UTF-8 at the stream level: appending raw Buffer chunks to a
  // string converts each chunk separately, which corrupts a multi-byte
  // character that happens to be split across two chunks.
  res.setEncoding('utf8');

  let html = '';
  res.on('data', (chunk) => {
    html += chunk;
  });
  res.on('end', () => {
    parseInfo(html);
  });
}).on('error', (err) => {
  // Include the underlying reason instead of discarding the error object.
  console.log('获取数据出错！', err.message);
});

# 实战

在爬取数据的过程中，发现 http 模块无法爬取使用 HTTPS 协议的网页（需要改用 Node.js 内置的 https 模块或第三方请求库），所以最后使用了 npm 上的 request 模块。这次以爬取简书首页文章为例。

const cheerio = require('cheerio')
const request = require('request')

/**
 * Collect the article entries listed in the page HTML.
 *
 * Every `<li>` under `.note-list` yields one record holding the text of its
 * `.title`, `.abstract` and `.nickname` descendants.
 *
 * @param {string} html - Raw HTML of the page to parse.
 * @returns {Array<{title: string, abstract: string, nickname: string}>}
 *   One record per list item, in document order.
 */
const parseInfo = (html) => {
  const $ = cheerio.load(html);
  const articles = [];

  // Text content of whatever matches `selector` inside `scope`.
  const textWithin = (scope, selector) => scope.find(selector).text();

  $('.note-list').find('li').each((_position, listItem) => {
    const entry = $(listItem);
    const title = textWithin(entry, '.title');
    const abstract = textWithin(entry, '.abstract');
    const nickname = textWithin(entry, '.nickname');

    articles.push({ title, abstract, nickname });
  });

  return articles;
};

// Fetch the page over HTTPS, parse it with parseInfo and print the result.
request({
  url: 'https://www.jianshu.com/', // URL to request
  method: 'GET', // HTTP method
}, function (error, response, body) {
  // Report failures instead of silently doing nothing.
  if (error) {
    console.log('获取数据出错！', error.message);
    return;
  }
  if (response.statusCode !== 200) {
    console.log('获取数据出错！', `HTTP ${response.statusCode}`);
    return;
  }

  const result = parseInfo(body);
  console.log(result);
});

# 个人总结

因为有使用 jQuery 的经验，所以 cheerio 算是比较容易入门的。
跟 puppeteer 不同，cheerio 并不是通过调用浏览器的 API 来爬取数据，而是先通过 request 等模块获取 HTML 文档，再使用 cheerio 对其进行解析而已。