meitulu图片抓取

python admin 11℃ 0评论

抓取代码:

# -*- coding: utf-8 -*-
import scrapy
from aidatacrawler.items import MeituluItem
import sys

# Python 2 idiom: reload ``sys`` so that ``setdefaultencoding`` is callable
# again, then make UTF-8 the interpreter's default string encoding.
# NOTE(review): ``reload`` is not a builtin on Python 3 - confirm the target
# interpreter before reusing this module.
reload(sys)
sys.setdefaultencoding("utf-8")

class MeituluSpider(scrapy.Spider):
    """Crawl meitulu.com tag listings and emit one item per gallery page.

    Flow: ``start_requests`` seeds one request per tag listing;
    ``parse_list`` follows every gallery link on a listing page plus the
    listing's next page; ``parse_info`` yields a ``MeituluItem`` carrying the
    image URLs found on the current gallery page and then follows the
    gallery's next page.
    """

    name = 'meitulu'
    allowed_domains = ['meitulu.com']
    # Proxy attached to every request through ``meta['proxy']``.
    proxy = 'http://127.0.0.1:1080'

    custom_settings = {
        # Credentials redacted: never keep real usernames/passwords in source,
        # not even in commented-out settings.
        #'MONGO_URI': 'mongodb://<user>:<password>@<host>:3717/admin?replicaSet=<replica-set>',
        #'MONGO_URI':'localhost:27017',
        #'MONGO_DB': 'meitulu',
        'IMAGES_STORE': 'data/meitulu',
        #'DOWNLOAD_DELAY': 3,
        'RETRY_TIMES': 2,
        'ITEM_PIPELINES': {
            #'aidatacrawler.pipelines.MeituluDownloadPipeline': 621,
            'aidatacrawler.pipelines.MeituluPipeline': 622,
        },
        'DOWNLOADER_MIDDLEWARES' : {
            'aidatacrawler.middlewares.ProxyMiddleware': 100,
        }
    }

    def start_requests(self):
        """Yield one listing request per tag URL.

        The tag slug (fifth ``/``-separated segment of the URL, e.g.
        ``nvshen``) is passed along as ``meta['cat']``.
        """
        urls = [
            'https://www.meitulu.com/t/nvshen/',
            'https://www.meitulu.com/t/jipin/',
            'https://www.meitulu.com/t/nenmo/',
            'https://www.meitulu.com/t/wangluohongren/',
            'https://www.meitulu.com/t/fengsuniang/',
            'https://www.meitulu.com/t/qizhi/',
            'https://www.meitulu.com/t/youwu/',
            'https://www.meitulu.com/t/baoru/',
            'https://www.meitulu.com/t/xinggan/',
            'https://www.meitulu.com/t/youhuo/',
            'https://www.meitulu.com/t/meixiong/',
            'https://www.meitulu.com/t/shaofu/',
            'https://www.meitulu.com/t/changtui/',
            'https://www.meitulu.com/t/mengmeizi/',
            'https://www.meitulu.com/t/loli/',
            'https://www.meitulu.com/t/keai/',
            'https://www.meitulu.com/t/huwai/',
            'https://www.meitulu.com/t/bijini/',
            'https://www.meitulu.com/t/qingchun/',
            'https://www.meitulu.com/t/weimei/',
            'https://www.meitulu.com/t/qingxin/',
        ]

        for url in urls:
            cat = url.split('/')[4]
            yield scrapy.Request(url, callback=self.parse_list, meta={'cat': cat, 'proxy': self.proxy})

    def _next_page_url(self, response):
        """Return the absolute URL of the next page, or ``None``.

        The second ``<a class="a1">`` href is used as the next-page link.
        ``None`` is returned when fewer than two such links exist (instead of
        raising ``IndexError``) or when the link resolves to the URL of the
        current request.
        """
        pages = response.xpath('//a[@class="a1"]/@href').extract()
        if len(pages) < 2:
            return None
        # urljoin leaves absolute hrefs untouched and resolves relative ones,
        # so listing and gallery pages can share this helper.
        url = response.urljoin(pages[1])
        if url == response.request.url:
            return None
        return url

    def parse_list(self, response):
        """Follow each gallery link on a listing page, then the next listing page."""
        cat = response.meta['cat']

        for link in response.xpath('//p[@class="p_title"]/a'):
            href = link.xpath('@href').extract_first()
            if not href:
                # An anchor without href cannot be followed; skip it rather
                # than aborting the whole page.
                continue
            title = (link.xpath('text()').extract_first() or '').strip()
            meta = {'cat': cat, 'title': title, 'proxy': self.proxy}
            yield scrapy.Request(response.urljoin(href), callback=self.parse_info, meta=meta)

        next_url = self._next_page_url(response)
        if next_url is not None:
            yield scrapy.Request(next_url, callback=self.parse_list, meta={'cat': cat, 'proxy': self.proxy})

    def parse_info(self, response):
        """Yield an item for the current gallery page, then follow its next page."""
        cat = response.meta['cat']
        title = response.meta['title']

        item = MeituluItem()
        item['url'] = response.request.url
        item['cat'] = cat
        item['title'] = title
        item['image_urls'] = response.xpath('//div[@class="content"]//img/@src').extract()

        yield item

        next_url = self._next_page_url(response)
        if next_url is not None:
            meta = {'cat': cat, 'title': title, 'proxy': self.proxy}
            yield scrapy.Request(next_url, callback=self.parse_info, meta=meta)

            

访问该站点需要科学上网(代理),因此通过下载中间件为每个请求设置本地代理:

class ProxyMiddleware(object):
    """Downloader middleware that points every request at a fixed local proxy."""

    def process_request(self, request, spider):
        """Set ``request.meta["proxy"]`` to the local proxy address.

        Any proxy value already present in the request's meta is overwritten.
        Nothing is returned.
        """
        proxy_address = 'http://127.0.0.1:1080'
        request.meta.update({"proxy": proxy_address})

使用 pipeline 下载并保存图片:

from scrapy import Request
from scrapy.exceptions import DropItem
from scrapy.pipelines.images import ImagesPipeline
class MeituluPipeline(ImagesPipeline):
    """Image pipeline that files downloads under ``<cat>/<gallery id>/``.

    Each image request carries the gallery page URL as ``referer`` and the
    item's ``cat``/``url`` in ``meta`` so ``file_path`` can build the
    destination directory.
    """

    # Base headers shared by all image requests; treated as read-only.
    headers = {
        'accept': 'image/webp,image/*,*/*;q=0.8',
        'accept-encoding': 'gzip, deflate, sdch, br',
        'accept-language': 'zh-CN,zh;q=0.8,en;q=0.6',
        'cookie': 'bid=yQdC/AzTaCw',
        'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/52.0.2743.116 Safari/537.36',
    }

    def get_media_requests(self, item, info):
        """Yield one download request per URL in ``item['image_urls']``."""
        # Build a per-item copy instead of writing the referer into the
        # class-level dict, which is shared by every item and every instance.
        headers = dict(self.headers, referer=item['url'])
        for image_url in item['image_urls']:
            meta = {'cat': item['cat'], 'url': item['url']}
            yield Request(image_url, headers=headers, meta=meta)

    def file_path(self, request, response=None, info=None):
        """Return the storage path with ``full`` swapped for ``<cat>/<gallery id>``.

        The gallery id is the fifth ``/``-separated segment of the gallery
        page URL with the extension and any ``_N`` suffix removed
        (presumably ``.../item/123_2.html`` -> ``123``; verify against the
        site's URL scheme).
        """
        path = super(MeituluPipeline, self).file_path(request, response, info)
        cat = request.meta['cat']
        picid = request.meta['url'].split('/')[4].split('.')[0].split('_')[0]
        # Replace only the first "full" so nothing later in the path can be
        # rewritten by accident.
        return path.replace("full", str(cat) + '/' + str(picid), 1)

    def item_completed(self, results, item, info):
        """Return the item, or drop it when no image downloaded successfully.

        Raises:
            DropItem: if none of the entries in ``results`` succeeded.
        """
        image_paths = [x['path'] for ok, x in results if ok]
        if not image_paths:
            raise DropItem("Item contains no images")
        return item

 

转载请注明:朋克网 » meitulu图片抓取

喜欢 (0)
发表我的评论
取消评论
表情

Hi,您需要填写昵称和邮箱!

  • 昵称 (必填)
  • 邮箱 (必填)
  • 网址