A small crawler for Toutiao jokes

At first I wanted to scrape the jokes straight from http://www.toutiao.com/ch/essay_joke/, but nothing came back. After asking 缘姐 for help I learned that the page is loaded dynamically, so scraping it would require selenium. Clicking into an individual joke, however, gives a page that can be scraped directly, and that page also carries the link to the next joke, which makes the crawler straightforward to write.
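For the record, scraping the dynamically loaded channel page would mean rendering it first, roughly as in the sketch below. This is a minimal illustration only: the link selector is a placeholder I made up, and the crawler below takes the static per-joke route instead.

# -*- coding:utf-8 -*-
# Sketch: render http://www.toutiao.com/ch/essay_joke/ with selenium so
# the JavaScript-inserted joke links exist in the DOM before parsing.
from selenium import webdriver
from bs4 import BeautifulSoup

driver = webdriver.Firefox()          # any installed webdriver works
driver.get('http://www.toutiao.com/ch/essay_joke/')
soup = BeautifulSoup(driver.page_source, 'lxml')  # rendered HTML, not the bare response
for a in soup.select('a'):            # placeholder selector; inspect the real page
    print a.get('href')
driver.quit()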

# -*- coding:utf-8 -*-
# Takes the starting joke URL and the number of jokes to crawl.
import requests
import codecs
from bs4 import BeautifulSoup

def get_nextUrls(soup):
    # The "next" button on a joke page links to the following joke.
    url_next = soup.select('a.right-btn')[0]
    return "http://www.toutiao.com" + url_next.get('href')

def get_content(soup):
    # Each joke page carries the poster's name and the joke body.
    user_name = soup.select('span.user-name')[0].get_text()
    body = soup.select('p')[0].get_text()
    return user_name + ':' + body

def main():
    url = raw_input('Starting URL: ')
    num = int(raw_input('Number of jokes to fetch: '))
    f = codecs.open('joke.txt', 'w', 'utf-8')
    for i in xrange(num):
        web_data = requests.get(url)
        web_data.encoding = 'utf-8'
        soup = BeautifulSoup(web_data.text, 'lxml')
        data = get_content(soup)
        url = get_nextUrls(soup)  # follow the chain to the next joke
        f.write(data + '\r\n\r\n')
    f.close()

if __name__ == '__main__':
    main()
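Since each run ends somewhere in the chain of jokes, one small extension is to persist the last URL so a later run can resume instead of asking for a fresh starting point. A sketch (joke_state.txt is a filename I made up):

def save_state(url, path='joke_state.txt'):
    # Remember where the chain left off.
    with open(path, 'w') as sf:
        sf.write(url)

def load_state(path='joke_state.txt'):
    # Return the saved URL, or None on the first run.
    try:
        with open(path) as sf:
            return sf.read().strip() or None
    except IOError:
        return None

main() would then call load_state() before prompting and save_state(url) after the loop.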

A version that crawls indefinitely until stopped manually

# -*- coding:utf-8 -*-
# Open-ended variant: keeps following "next" links until an error
# occurs or the user stops it with Ctrl+C.
import requests
import codecs
from bs4 import BeautifulSoup

def get_nextUrls(soup):
    url_next = soup.select('a.right-btn')[0]
    return "http://www.toutiao.com" + url_next.get('href')

def get_content(soup):
    user_name = soup.select('span.user-name')[0].get_text()
    body = soup.select('p')[0].get_text()
    return user_name + ':' + body

def main():
    url = raw_input("Starting URL: ")
    f = codecs.open('joke.txt', 'w', 'utf-8')
    x = 1
    while True:
        try:
            web_data = requests.get(url)
            web_data.encoding = 'utf-8'
            soup = BeautifulSoup(web_data.text, 'lxml')
            data = get_content(soup)
            url = get_nextUrls(soup)
            print "Crawled joke #%d" % x
            x += 1
            f.write(data + '\r\n\r\n')
        except:
            # The bare except also catches Ctrl+C, which is how the
            # crawl gets stopped manually.
            print "Stopped (error or Ctrl+C)"
            break
    f.close()

if __name__ == '__main__':
    main()
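A slightly cleaner way to express the same stop logic is to treat Ctrl+C as the expected exit and keep the broad handler only for real scraping failures. A minimal sketch of the loop, reusing get_content and get_nextUrls from above:

# Sketch: explicit manual-stop handling for the crawl loop above.
x = 1
try:
    while True:
        web_data = requests.get(url)
        web_data.encoding = 'utf-8'
        soup = BeautifulSoup(web_data.text, 'lxml')
        data = get_content(soup)
        url = get_nextUrls(soup)
        f.write(data + '\r\n\r\n')
        print "Crawled joke #%d" % x
        x += 1
except KeyboardInterrupt:
    print "Stopped by user after %d jokes" % (x - 1)
except Exception as e:
    print "Error: %s" % e   # a broken selector or a network failure
finally:
    f.close()               # the file is closed on every exit path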

Crawling Thunder (Xunlei) download links for movies from 80s.tw

# -*- coding:utf-8 -*-
# Walks the paginated movie list on 80s.tw and collects the Thunder
# (Xunlei) download link from each movie's detail page.
import requests
from bs4 import BeautifulSoup

x = 0  # running count of links fetched, for progress output

def get_content(soup):
    global x
    urls = soup.select('h3 > a')  # links to individual movie pages
    data = ''
    for url in urls:
        new_url = "http://www.80s.tw" + url.get('href')
        web_data = requests.get(new_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        down_url = soup.select('span.xunlei > a')
        for u in down_url:
            try:
                print "Fetching movie link #%d..." % x
                x += 1
                data += u.get('thunderrestitle').encode('utf-8') + ':' + u.get('href').encode('utf-8') + '\r\n'
            except:
                continue  # skip entries missing a title or link
    return data

def main():
    f = open('movie.txt', 'w')
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'
    }
    for i in range(1, 300):
        # Page 1 has no suffix; later pages use the -----p<N> pattern.
        if i == 1:
            url = "http://www.80s.tw/movie/list"
        else:
            url = "http://www.80s.tw/movie/list/-----p%d" % i
        web_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(web_data.text, 'lxml')
        f.write(get_content(soup))
    f.close()

if __name__ == '__main__':
    main()
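One robustness note: all 300 list requests run sequentially with no timeout, so a single stalled connection hangs the whole crawl. A small helper the requests.get calls above could be swapped for (standard requests parameters; the 3 retries and 10-second timeout are arbitrary choices):

def fetch(url, headers=None, retries=3):
    # Retry a few times with a timeout so one bad page can't stall the run.
    for attempt in range(retries):
        try:
            return requests.get(url, headers=headers, timeout=10)
        except requests.exceptions.RequestException:
            continue
    return None  # caller should skip this page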

A version with multithreading added:

# -*- coding:utf-8 -*-
# Same crawl as above, but each list page is handed to its own thread.
import requests
import threading
from bs4 import BeautifulSoup

x = 0
f = open('movie.txt', 'w')
lock = threading.Lock()  # serialize writes to the shared file

def get_content(soup):
    global x
    urls = soup.select('h3 > a')
    data = ''
    for url in urls:
        new_url = "http://www.80s.tw" + url.get('href')
        web_data = requests.get(new_url)
        soup = BeautifulSoup(web_data.text, 'lxml')
        down_url = soup.select('span.xunlei > a')
        for u in down_url:
            try:
                print "Fetching movie link #%d..." % x
                x += 1  # not synchronized, so the count is approximate
                data += u.get('thunderrestitle').encode('utf-8') + ':' + u.get('href').encode('utf-8') + '\r\n'
            except:
                continue
    with lock:
        f.write(data)

def main():
    headers = {
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64; rv:54.0) Gecko/20100101 Firefox/54.0'
    }
    threads = []
    for i in range(1, 300):
        if i == 1:
            url = "http://www.80s.tw/movie/list"
        else:
            url = "http://www.80s.tw/movie/list/-----p%d" % i
        web_data = requests.get(url, headers=headers)
        soup = BeautifulSoup(web_data.text, 'lxml')
        th = threading.Thread(target=get_content, args=(soup,))
        th.start()
        threads.append(th)
        print threading.activeCount()
    for th in threads:
        th.join()  # wait for every worker before closing the file
    f.close()

if __name__ == '__main__':
    main()
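For comparison, the same fan-out can be written with a bounded pool so at most a fixed number of pages are in flight at once. A sketch using the stdlib's thread-backed Pool from multiprocessing.dummy; the pool size of 8 is an arbitrary choice, and headers is assumed to be module-level here:

from multiprocessing.dummy import Pool  # thread pool with the Pool API

def crawl_page(i):
    # Build the list-page URL, fetch it, and hand the soup to get_content.
    if i == 1:
        url = "http://www.80s.tw/movie/list"
    else:
        url = "http://www.80s.tw/movie/list/-----p%d" % i
    web_data = requests.get(url, headers=headers)
    get_content(BeautifulSoup(web_data.text, 'lxml'))

pool = Pool(8)                       # at most 8 pages in flight at once
pool.map(crawl_page, range(1, 300))  # blocks until all pages are done
pool.close()
pool.join()
f.close()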