# -*- coding: utf-8 -*-
import scrapy
import re
import requests
from bs4 import BeautifulSoup
from Movie.items import MovieItem
from scrapy_redis.spiders import RedisSpider
from redis import Redis
import redis
from hashlib import md5
from bloomfilterOnRedis import BloomFilter
class urlSpider(RedisSpider):
    """Pagination walker: pushes the next listing page back onto its own
    Redis queue and seeds every movie detail URL (deduplicated through a
    Redis-backed Bloom filter) into the movie spider's start-URL queue."""
    name = "urlSpider"
    redis_key = "myspider:url"

    def parse(self, response):
        """Handle one listing page pulled from the `myspider:url` queue."""
        page = BeautifulSoup(response.text, 'lxml')
        dedup = BloomFilter()
        conn = Redis()

        # Look for the pager anchor whose text is "下一页" ("next page").
        next_href = None
        for anchor in page.select('div.pager > a'):
            if anchor.get_text() == u"下一页":
                next_href = anchor['href']
                break

        # No "next page" link -> restart from the first listing page.
        if next_href is None:
            follow_url = 'http://www.80s.tw/movie/list'
        else:
            follow_url = "http://www.80s.tw" + next_href
        conn.lpush('myspider:url', follow_url)

        # NOTE(review): synchronous fetch inside a scrapy callback blocks
        # the reactor; kept as-is to preserve the original behavior.
        listing = BeautifulSoup(requests.get(follow_url).content, 'lxml')
        for link in listing.select('ul.me1 > li > a'):
            movie_url = "http://www.80s.tw" + link['href']
            if not dedup.isContains(movie_url):
                dedup.insert(movie_url)
                conn.lpush('myspider:start_urls', movie_url)
class MovieSpider(RedisSpider):
    """Detail-page spider: pairs each download link with its display name
    and returns them as MovieItem objects."""
    name = "movie"
    redis_key = 'myspider:start_urls'

    def parse(self, response):
        """Extract (name, url) pairs from a movie detail page.

        Returns a list of MovieItem, one per download link found.

        Fix: the original iterated ``range(len(urls))`` and indexed
        ``movie_names[i]``, raising IndexError whenever the regex matched
        more links than there were name anchors; ``zip`` truncates to the
        shorter sequence instead.
        """
        # Each findall match is a 3-tuple of the capture groups; the
        # middle group (index 1) is the href value itself.
        link_matches = re.findall(
            r'(<a rel="nofollow" href=")(.+)(" thunderrestitle)',
            response.text)
        soup = BeautifulSoup(response.text, 'lxml')
        name_tags = soup.select('ul.dllist1 > li > span > span > a')

        items = []
        for match, name_tag in zip(link_matches, name_tags):
            item = MovieItem()
            item['name'] = name_tag.get_text().strip()
            item['url'] = match[1]
            items.append(item)
        return items