Implementing a Distributed Crawler

Features

A distributed crawler built on Scrapy_redis. It crawls magnet links from 80s (www.80s.tw) continuously and without fetching duplicate pages.

Overview

1. Written in Python.

2. Built on Scrapy_redis.

3. Uses Redis and MongoDB as data stores (Redis for the scheduling/dedup queues, MongoDB for the scraped items).

4. Parsing and matching use re + bs4 (BeautifulSoup).

5. URL deduplication via a Bloom filter stored in Redis.

6. Two spiders: one collects detail-page URLs, the other extracts the magnet links (a seeding sketch follows this list).
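
The two spiders hand work to each other through Redis lists. Below is a minimal sketch of how a crawl could be seeded, assuming a local Redis on the default port and the key names used by the spiders in this post:

# Seed the crawl: push the first list page onto urlSpider's queue.
# Assumes a local Redis on the default port; key names match the spiders below.
from redis import Redis

r = Redis(host='127.0.0.1', port=6379)

# urlSpider pops from 'myspider:url', walks the pagination, and pushes
# detail-page URLs to 'myspider:start_urls', which MovieSpider consumes.
r.lpush('myspider:url', 'http://www.80s.tw/movie/list')

# Inspect the two queues while the spiders are running.
print(r.llen('myspider:url'), r.llen('myspider:start_urls'))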

Main Code

MovieSpider.py
# -*- coding: utf-8 -*-
import re

import requests
from bs4 import BeautifulSoup
from redis import Redis
from scrapy_redis.spiders import RedisSpider

from Movie.items import MovieItem
from bloomfilterOnRedis import BloomFilter


class urlSpider(RedisSpider):
    """Walks the list pages and feeds detail-page URLs to the movie spider."""
    name = "urlSpider"
    redis_key = "myspider:url"

    def parse(self, response):
        soup = BeautifulSoup(response.text, 'lxml')
        url_nexts = soup.select('div.pager > a')
        url_next = ""
        flag = 0
        bf = BloomFilter()
        r = Redis()
        # Look for the "next page" link; fall back to the first list page if absent.
        for u in url_nexts:
            if u.get_text() == u"下一页":  # "next page"
                flag = 1
                url_next += u['href']
                break
        if flag == 0:
            new_url = 'http://www.80s.tw/movie/list'
        else:
            new_url = "http://www.80s.tw" + url_next
        # Requeue the next list page for this spider.
        r.lpush('myspider:url', new_url)
        # Fetch that list page and collect the detail-page URLs on it.
        web_data = requests.get(new_url).content
        soup = BeautifulSoup(web_data, 'lxml')
        urls = soup.select('ul.me1 > li > a')
        for url in urls:
            url_page = "http://www.80s.tw" + url['href']
            if bf.isContains(url_page):
                continue  # already seen, skip
            else:
                bf.insert(url_page)
                r.lpush('myspider:start_urls', url_page)


class MovieSpider(RedisSpider):
    """Extracts movie names and magnet links from detail pages."""
    name = "movie"
    redis_key = 'myspider:start_urls'

    def parse(self, response):
        items = []
        urls = re.findall(r'(<a rel="nofollow" href=")(.+)(" thunderrestitle)', response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        movie_names = soup.select('ul.dllist1 > li > span > span > a')
        for i in range(len(urls)):
            item = MovieItem()
            item['name'] = movie_names[i].get_text().strip()
            item['url'] = urls[i][1]
            items.append(item)
        return items
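
The spider imports MovieItem from Movie.items, but the item definition is not shown above. A minimal definition consistent with the two fields the spider sets (name and url) might look like this; it is an assumption inferred from the code, not the original file:

# Movie/items.py -- hypothetical definition inferred from the fields the spider sets.
import scrapy


class MovieItem(scrapy.Item):
    name = scrapy.Field()   # movie title
    url = scrapy.Field()    # magnet link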
bloomfilter.py
# -*- coding: utf-8 -*-
import redis
from hashlib import md5


class SimpleHash(object):
    """One hash function of the Bloom filter, parameterised by a seed."""

    def __init__(self, cap, seed):
        self.cap = cap
        self.seed = seed

    def hash(self, value):
        ret = 0
        for i in range(len(value)):
            ret += self.seed * ret + ord(value[i])
        return (self.cap - 1) & ret


class BloomFilter(object):
    """Bloom filter backed by Redis bitmaps, used for URL deduplication."""

    def __init__(self, host='localhost', port=6379, db=2, blockNum=1, key='bloomfilter'):
        self.server = redis.Redis(host=host, port=port, db=db)
        self.bit_size = 1 << 31  # size of each bitmap block
        self.seeds = [5, 7, 11, 13, 31, 37, 61]
        self.key = key
        self.blockNum = blockNum
        self.hashfunc = []
        for seed in self.seeds:
            self.hashfunc.append(SimpleHash(self.bit_size, seed))

    def isContains(self, str_input):
        """Return truthy if the URL has (probably) been seen before."""
        if not str_input:
            return False
        m5 = md5()
        m5.update(str_input)
        str_input = m5.hexdigest()
        ret = True
        name = self.key + str(int(str_input[0:2], 16) % self.blockNum)
        for f in self.hashfunc:
            loc = f.hash(str_input)
            ret = ret & self.server.getbit(name, loc)
        return ret

    def insert(self, str_input):
        """Mark the URL as seen by setting its bits in Redis."""
        m5 = md5()
        m5.update(str_input)
        str_input = m5.hexdigest()
        name = self.key + str(int(str_input[0:2], 16) % self.blockNum)
        for f in self.hashfunc:
            loc = f.hash(str_input)
            self.server.setbit(name, loc, 1)
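
A quick way to try the filter outside the spiders, assuming a local Redis (the filter writes to db 2 by default):

# Standalone check of the Redis-backed Bloom filter (assumes a local Redis, db 2).
from bloomfilterOnRedis import BloomFilter

bf = BloomFilter()
url = 'http://www.80s.tw/movie/1173'   # any URL string works as a key; this one is illustrative
print(bf.isContains(url))              # falsy on first sight
bf.insert(url)
print(bf.isContains(url))              # truthy after insertion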
pipelines.py
# -*- coding: utf-8 -*-

# Define your item pipelines here
#
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: http://doc.scrapy.org/en/latest/topics/item-pipeline.html
import pymongo
from scrapy.conf import settings


class MoviePipeline(object):
    def __init__(self):
        # Connect to MongoDB.
        self.client = pymongo.MongoClient(host=settings['MONGO_HOST'], port=settings['MONGO_PORT'])
        # Uncomment if the database requires authentication.
        # self.client.admin.authenticate(settings['MONGO_USER'], settings['MONGO_PSW'])
        self.db = self.client[settings['MONGO_DB']]      # database handle
        self.coll = self.db[settings['MONGO_COLL']]      # collection handle

    def process_item(self, item, spider):
        postItem = dict(item)
        self.coll.insert(postItem)
        return item
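
To confirm that items are landing in MongoDB, one can query the collection the pipeline writes to; the host, database, and collection names below are taken from settings.py:

# Peek at stored items (host/db/collection names match settings.py).
import pymongo

client = pymongo.MongoClient('127.0.0.1', 27017)
coll = client['Spider']['heartsong']
for doc in coll.find().limit(5):    # a few sample documents
    print(doc['name'], doc['url'])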
settings.py
# -*- coding: utf-8 -*-

BOT_NAME = 'Movie'

SPIDER_MODULES = ['Movie.spiders']
NEWSPIDER_MODULE = 'Movie.spiders'

ROBOTSTXT_OBEY = False

# ****************** scrapy_redis ******************
DUPEFILTER_CLASS = "scrapy_redis.dupefilter.RFPDupeFilter"
SCHEDULER = "scrapy_redis.scheduler.Scheduler"
SCHEDULER_PERSIST = True
SCHEDULER_QUEUE_CLASS = 'scrapy_redis.queue.SpiderPriorityQueue'
REDIS_URL = None            # can usually be omitted
REDIS_HOST = '127.0.0.1'    # or 'localhost', depending on your setup
REDIS_PORT = 6379

# ****************** MongoDB ******************
MONGO_HOST = "127.0.0.1"    # host IP
MONGO_PORT = 27017          # port
MONGO_DB = "Spider"         # database name
MONGO_COLL = "heartsong"    # collection name
# MONGO_USER = "zhangsan"
# MONGO_PSW = "123456"

ITEM_PIPELINES = {'Movie.pipelines.MoviePipeline': 100}
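
With these settings in place, each worker machine can run one or both spiders; they idle on their Redis keys until URLs are pushed. The usual way is simply scrapy crawl urlSpider and scrapy crawl movie in separate shells, but both can also be launched from a single script inside the project. The launcher below is a hypothetical sketch, not part of the original project:

# run.py -- hypothetical launcher placed in the Scrapy project root.
# Both spiders block on their Redis queues until seed URLs are pushed.
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings

process = CrawlerProcess(get_project_settings())
process.crawl('urlSpider')   # list-page spider
process.crawl('movie')       # detail-page / magnet-link spider
process.start()              # blocks until both spiders finish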