安装Scrapy

pip install Scrapy

创建项目

scrapy startproject tutorial

创建爬虫

在 tutorial/spiders 目录下创建 quotes_spider.py 文件,代码如下:

import scrapy


class QuotesSpider(scrapy.Spider):
    """Print the title and link of every post listed on the seed blog page."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per seed URL; responses are handled by ``parse``."""
        seed_urls = ('https://segmentfault.com/blog/sown',)
        for seed in seed_urls:
            yield scrapy.Request(url=seed, callback=self.parse)

    def parse(self, response):
        """Print the title text, then the href, of each list entry."""
        for entry in response.css('section.stream-list__item'):
            # Title first, link second.
            for selector in ('h2.title a::text', 'h2.title a::attr(href)'):
                print(entry.css(selector).extract_first())

启动前配置

settings.py 中添加:

# Present the crawler as a desktop Chrome browser.
USER_AGENT = (
    'Mozilla/5.0 (Windows NT 10.0; Win64; x64) '
    'AppleWebKit/537.36 (KHTML, like Gecko) '
    'Chrome/75.0.3770.142 Safari/537.36'
)
# Do not consult robots.txt before crawling.
ROBOTSTXT_OBEY = False

启动项目

scrapy crawl quotes

界面输出DEBUG、INFO的提示信息,还有抓取的文章标题和链接。一个最简单的初级爬虫,基本流程就已经跑通了。

抓取二级页面

quotes_spider.py:

import urllib

import scrapy


def parse_article(response):
    """Print the first ``article.article`` element extracted from the response."""
    print(response.css('article.article').extract_first())


class QuotesSpider(scrapy.Spider):
    """Crawl the blog list page and follow every post link to its article page."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per seed URL; responses are handled by ``parse``."""
        urls = [
            'https://segmentfault.com/blog/sown',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Print each post title and request the linked article page.

        Entries without a link are skipped: joining a missing href would
        resolve to the list page itself and re-request it.
        """
        for quote in response.css('section.stream-list__item'):
            print(quote.css('h2.title a::text').extract_first())
            href = quote.css('h2.title a::attr(href)').extract_first()
            if href is None:
                continue
            yield scrapy.Request(
                # response.urljoin avoids relying on ``urllib.parse`` having
                # been loaded as a side effect of a bare ``import urllib``.
                url=response.urljoin(href),
                callback=parse_article
            )

保存数据到MySQL

items.py

# -*- coding: utf-8 -*-
import scrapy


class ArticleItem(scrapy.Item):
    """Scraped article record with a title and a content field."""

    # Field holding the article title.
    title = scrapy.Field()
    # Field holding the article content.
    content = scrapy.Field()

pipelines.py

# -*- coding: utf-8 -*-
import pymysql as pymysql
from pymysql.cursors import DictCursor


class TutorialPipeline:
    """Pass-through pipeline that hands every item on unchanged."""

    def process_item(self, item, spider):
        """Return ``item`` exactly as received; ``spider`` is not used."""
        return item


class MySQLPipeline(object):
    """Item pipeline that inserts every item into the MySQL ``article`` table."""

    def __init__(self):
        """Open the database connection and a dict-style cursor."""
        # NOTE(review): host and credentials are hard-coded; consider reading
        # them from the project settings instead.
        self.connect = pymysql.connect(
            host='127.0.0.1',
            port=3306,
            db='spider',
            user='root',
            passwd='root',
            charset='utf8',
            use_unicode=True)
        self.cursor = self.connect.cursor(DictCursor)

    def process_item(self, item, spider):
        """Insert the item's title and content, commit, and return the item.

        Values are passed as query parameters, so the driver handles
        escaping.
        """
        self.cursor.execute(
            """insert into article(
            title, 
            content
            ) value (%s, %s)""",
            (
                item['title'],
                item['content']
            )
        )
        self.connect.commit()
        return item

    def close_spider(self, spider):
        """Release the cursor and the connection when the spider finishes.

        Without this hook the connection opened in ``__init__`` is never
        closed.
        """
        self.cursor.close()
        self.connect.close()

quotes_spider.py:

import urllib

import scrapy
from ..items import ArticleItem


class QuotesSpider(scrapy.Spider):
    """Crawl the blog list page and emit one ``ArticleItem`` per linked article."""

    name = "quotes"

    def start_requests(self):
        """Yield one request per seed URL; responses are handled by ``parse``."""
        urls = [
            'https://segmentfault.com/blog/sown',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Request every linked article, carrying its title along in ``meta``.

        Entries without a link are skipped: joining a missing href would
        resolve to the list page itself and re-request it.
        """
        for quote in response.css('section.stream-list__item'):
            title = quote.css('h2.title a::text').extract_first()
            href = quote.css('h2.title a::attr(href)').extract_first()
            if href is None:
                continue
            yield scrapy.Request(
                # response.urljoin avoids relying on ``urllib.parse`` having
                # been loaded as a side effect of a bare ``import urllib``.
                url=response.urljoin(href),
                callback=self.parse_article,
                meta={'title': title}
            )

    def parse_article(self, response):
        """Build an ``ArticleItem`` from the list-page title and the article markup."""
        title = response.meta['title']
        content = response.css('article.article').extract_first()
        item = ArticleItem()
        item['title'] = title
        item['content'] = content
        yield item

settings.py:

# Register the MySQL pipeline for scraped items, with order value 300.
ITEM_PIPELINES = {
    'tutorial.pipelines.MySQLPipeline': 300,
}

通过启动命令,传递 start_url 参数

quotes_spider.py:

import urllib

import scrapy
from ..items import ArticleItem


class QuotesSpider(scrapy.Spider):
    """Crawl the list page given by ``-a start_url=...`` and emit its articles."""

    name = "quotes"

    def __init__(self, start_url=None, *args, **kwargs):
        """Store the list-page URL passed on the command line."""
        super(QuotesSpider, self).__init__(*args, **kwargs)
        self.start_url = start_url

    def start_requests(self):
        """Request the site root first; ``parse`` then requests ``start_url``.

        Without a ``start_url`` the crawl would only fail later, inside
        ``parse``, after a wasted request. Report the problem and stop here.
        """
        if not self.start_url:
            self.logger.error(
                'start_url is required: '
                'scrapy crawl quotes -a start_url=<list page URL>'
            )
            return
        urls = [
            'https://segmentfault.com',
        ]
        for url in urls:
            yield scrapy.Request(url=url, callback=self.parse)

    def parse(self, response):
        """Request the configured list page and hand it to ``parse_list``."""
        yield scrapy.Request(
            self.start_url,
            callback=self.parse_list,
            meta={}
        )

    def parse_list(self, response):
        """Request every linked article, carrying its title along in ``meta``.

        Entries without a link are skipped: joining a missing href would
        resolve to the list page itself and re-request it.
        """
        for quote in response.css('section.stream-list__item'):
            title = quote.css('h2.title a::text').extract_first()
            href = quote.css('h2.title a::attr(href)').extract_first()
            if href is None:
                continue
            yield scrapy.Request(
                # response.urljoin avoids relying on ``urllib.parse`` having
                # been loaded as a side effect of a bare ``import urllib``.
                url=response.urljoin(href),
                callback=self.parse_article,
                meta={'title': title}
            )

    def parse_article(self, response):
        """Build an ``ArticleItem`` from the list-page title and the article markup."""
        title = response.meta['title']
        content = response.css('article.article').extract_first()
        item = ArticleItem()
        item['title'] = title
        item['content'] = content
        yield item

执行:
scrapy crawl quotes -a start_url=https://segmentfault.com/blog/sown

可能会遇到的一些问题

  1. 抓取的内容中存在 <br> ,导致本应是一段完整的文本被拆分成多个元素的 list
    使用 text=response.css('[id=content]::text').extract() 时,extract() 返回的始终是 list;如果内容中存在 <br>,文本会被 <br> 分割成多个文本节点,list 中就会有多个元素,而不是一整段文本。
    这个时候,就要根据实际情况决定是合并 list(例如 ''.join(text)),还是更改匹配规则。
  2. flask 中调用 scrapy,错误提示 “ValueError: signal only works in main thread”
    换成下面的调用方式

     # Pass the argument list without shell=True: on POSIX, shell=True with a
     # list runs only the first element as the command and drops the rest.
     subprocess.run(['scrapy', 'crawl', 'nmzsks', "-a", "year=" + year, "-a", "start_url=" + start_url])
  3. No module named ArticleItem.items
    引用时加上 ..

     from ..items import ArticleItem
  4. 中文抓取后乱码
    可能抓取的页面不是 utf-8 编码,scrapy 水土不服,用下面的方式转换一下

     content.encode('latin1').decode('gbk')
Scroll Up